author    | iaz1607 <iaz1607@yandex-team.ru>            | 2022-02-10 16:45:37 +0300
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:45:37 +0300
commit    | e5437feb4ac2d2dc044e1090b9312dde5ef197e0 (patch)
tree      | f5a238c69dd20a1fa2092127a31b8aff25020f7d /contrib/libs/apache
parent    | f4945d0a44b8770f0801de3056aa41639b0b7bd2 (diff)
download  | ydb-e5437feb4ac2d2dc044e1090b9312dde5ef197e0.tar.gz
Restoring authorship annotation for <iaz1607@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/libs/apache')
70 files changed, 24120 insertions, 24120 deletions
diff --git a/contrib/libs/apache/orc/LICENSE b/contrib/libs/apache/orc/LICENSE index 70507878ee..d0a807872e 100644 --- a/contrib/libs/apache/orc/LICENSE +++ b/contrib/libs/apache/orc/LICENSE @@ -1,222 +1,222 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. 
Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability contains - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - -APACHE ORC SUBCOMPONENTS: - -The Apache ORC project contains subcomponents with separate copyright -notices and license terms. Your use of the source code for the these -subcomponents is subject to the terms and conditions of the following -licenses. - + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability contains + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +APACHE ORC SUBCOMPONENTS: + +The Apache ORC project contains subcomponents with separate copyright +notices and license terms. Your use of the source code for the these +subcomponents is subject to the terms and conditions of the following +licenses. + For protobuf: - + Copyright 2008 Google Inc. All rights reserved. - + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - + * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above @@ -226,7 +226,7 @@ For protobuf: * Neither the name of Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. - + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR diff --git a/contrib/libs/apache/orc/NOTICE b/contrib/libs/apache/orc/NOTICE index 4c79570fac..ddfede01f5 100644 --- a/contrib/libs/apache/orc/NOTICE +++ b/contrib/libs/apache/orc/NOTICE @@ -1,9 +1,9 @@ -Apache ORC +Apache ORC Copyright 2013 and onwards The Apache Software Foundation. - -This product includes software developed by The Apache Software -Foundation (http://www.apache.org/). - -This product includes software developed by Hewlett-Packard: -(c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P - + +This product includes software developed by The Apache Software +Foundation (http://www.apache.org/). + +This product includes software developed by Hewlett-Packard: +(c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P + diff --git a/contrib/libs/apache/orc/README.md b/contrib/libs/apache/orc/README.md index 0668ee07a5..f71e63b529 100644 --- a/contrib/libs/apache/orc/README.md +++ b/contrib/libs/apache/orc/README.md @@ -1,96 +1,96 @@ -# [Apache ORC](https://orc.apache.org/) - -ORC is a self-describing type-aware columnar file format designed for -Hadoop workloads. It is optimized for large streaming reads, but with -integrated support for finding required rows quickly. 
Storing data in -a columnar format lets the reader read, decompress, and process only -the values that are required for the current query. Because ORC files -are type-aware, the writer chooses the most appropriate encoding for -the type and builds an internal index as the file is written. -Predicate pushdown uses those indexes to determine which stripes in a -file need to be read for a particular query and the row indexes can -narrow the search to a particular set of 10,000 rows. ORC supports the -complete set of types in Hive, including the complex types: structs, -lists, maps, and unions. - -## ORC File Library - -This project includes both a Java library and a C++ library for reading and writing the _Optimized Row Columnar_ (ORC) file format. The C++ and Java libraries are completely independent of each other and will each read all versions of ORC files. But the C++ library only writes the original (Hive 0.11) version of ORC files, and will be extended in the future. - -Releases: -* Latest: <a href="http://orc.apache.org/releases">Apache ORC releases</a> -* Maven Central: <a href="http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.orc%22">![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.orc/orc/badge.svg)</a> -* Downloads: <a href="http://orc.apache.org/downloads">Apache ORC downloads</a> - -The current build status: -* Master branch <a href="https://travis-ci.org/apache/orc/branches"> -![master build status](https://travis-ci.org/apache/orc.svg?branch=master)</a> -* <a href="https://travis-ci.org/apache/orc/pull_requests">Pull Requests</a> - - -Bug tracking: <a href="http://orc.apache.org/bugs">Apache Jira</a> - - -The subdirectories are: -* c++ - the c++ reader and writer +# [Apache ORC](https://orc.apache.org/) + +ORC is a self-describing type-aware columnar file format designed for +Hadoop workloads. It is optimized for large streaming reads, but with +integrated support for finding required rows quickly. Storing data in +a columnar format lets the reader read, decompress, and process only +the values that are required for the current query. Because ORC files +are type-aware, the writer chooses the most appropriate encoding for +the type and builds an internal index as the file is written. +Predicate pushdown uses those indexes to determine which stripes in a +file need to be read for a particular query and the row indexes can +narrow the search to a particular set of 10,000 rows. ORC supports the +complete set of types in Hive, including the complex types: structs, +lists, maps, and unions. + +## ORC File Library + +This project includes both a Java library and a C++ library for reading and writing the _Optimized Row Columnar_ (ORC) file format. The C++ and Java libraries are completely independent of each other and will each read all versions of ORC files. But the C++ library only writes the original (Hive 0.11) version of ORC files, and will be extended in the future. 
+ +Releases: +* Latest: <a href="http://orc.apache.org/releases">Apache ORC releases</a> +* Maven Central: <a href="http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.orc%22">![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.orc/orc/badge.svg)</a> +* Downloads: <a href="http://orc.apache.org/downloads">Apache ORC downloads</a> + +The current build status: +* Master branch <a href="https://travis-ci.org/apache/orc/branches"> +![master build status](https://travis-ci.org/apache/orc.svg?branch=master)</a> +* <a href="https://travis-ci.org/apache/orc/pull_requests">Pull Requests</a> + + +Bug tracking: <a href="http://orc.apache.org/bugs">Apache Jira</a> + + +The subdirectories are: +* c++ - the c++ reader and writer * cmake_modules - the cmake modules -* docker - docker scripts to build and test on various linuxes -* examples - various ORC example files that are used to test compatibility -* java - the java reader and writer -* proto - the protocol buffer definition for the ORC metadata -* site - the website and documentation +* docker - docker scripts to build and test on various linuxes +* examples - various ORC example files that are used to test compatibility +* java - the java reader and writer +* proto - the protocol buffer definition for the ORC metadata +* site - the website and documentation * snap - the script to build [snaps](https://snapcraft.io/) of the ORC tools -* tools - the c++ tools for reading and inspecting ORC files - -### Building - +* tools - the c++ tools for reading and inspecting ORC files + +### Building + * Install java 1.8 or higher -* Install maven 3 or higher -* Install cmake - -To build a release version with debug information: -```shell -% mkdir build -% cd build -% cmake .. -% make package -% make test-out - -``` - -To build a debug version: -```shell -% mkdir build -% cd build -% cmake .. -DCMAKE_BUILD_TYPE=DEBUG -% make package -% make test-out - -``` - -To build a release version without debug information: -```shell -% mkdir build -% cd build -% cmake .. -DCMAKE_BUILD_TYPE=RELEASE -% make package -% make test-out - -``` - -To build only the Java library: -```shell -% cd java +* Install maven 3 or higher +* Install cmake + +To build a release version with debug information: +```shell +% mkdir build +% cd build +% cmake .. +% make package +% make test-out + +``` + +To build a debug version: +```shell +% mkdir build +% cd build +% cmake .. -DCMAKE_BUILD_TYPE=DEBUG +% make package +% make test-out + +``` + +To build a release version without debug information: +```shell +% mkdir build +% cd build +% cmake .. -DCMAKE_BUILD_TYPE=RELEASE +% make package +% make test-out + +``` + +To build only the Java library: +```shell +% cd java % ./mvnw package - -``` - -To build only the C++ library: -```shell -% mkdir build -% cd build -% cmake .. -DBUILD_JAVA=OFF -% make package -% make test-out - -``` + +``` + +To build only the C++ library: +```shell +% mkdir build +% cd build +% cmake .. -DBUILD_JAVA=OFF +% make package +% make test-out + +``` diff --git a/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh b/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh index 86c1288b62..42f0476f03 100644 --- a/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh +++ b/contrib/libs/apache/orc/c++/include/orc/BloomFilter.hh @@ -1,45 +1,45 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_BLOOMFILTER_HH -#define ORC_BLOOMFILTER_HH - -#include "orc/orc-config.hh" - -#include <memory> -#include <vector> - -namespace orc { - - class BloomFilter { - public: - virtual ~BloomFilter(); - - // test if the element exists in BloomFilter - virtual bool testBytes(const char * data, int64_t length) const = 0; - virtual bool testLong(int64_t data) const = 0; - virtual bool testDouble(double data) const = 0; - }; - - struct BloomFilterIndex { - std::vector<std::shared_ptr<BloomFilter>> entries; - }; - -}; - -#endif //ORC_BLOOMFILTER_HH +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_BLOOMFILTER_HH +#define ORC_BLOOMFILTER_HH + +#include "orc/orc-config.hh" + +#include <memory> +#include <vector> + +namespace orc { + + class BloomFilter { + public: + virtual ~BloomFilter(); + + // test if the element exists in BloomFilter + virtual bool testBytes(const char * data, int64_t length) const = 0; + virtual bool testLong(int64_t data) const = 0; + virtual bool testDouble(double data) const = 0; + }; + + struct BloomFilterIndex { + std::vector<std::shared_ptr<BloomFilter>> entries; + }; + +}; + +#endif //ORC_BLOOMFILTER_HH diff --git a/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh b/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh index aa19214738..349cabe025 100644 --- a/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh +++ b/contrib/libs/apache/orc/c++/include/orc/ColumnPrinter.hh @@ -1,51 +1,51 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
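The BloomFilter.hh header shown above exposes only a query interface: `testBytes`, `testLong`, and `testDouble` on an abstract `BloomFilter`, plus a `BloomFilterIndex` holding one filter entry per row group. The following is a minimal sketch of how calling code might consult such an index; the helper name `candidateRowGroups` and the row-group-skipping framing are illustrative assumptions, not part of the header.

```cpp
#include <cstdint>
#include <string>
#include <vector>

#include "orc/BloomFilter.hh"

// Hypothetical helper: return the positions of row groups whose bloom filter
// might contain `key`. A negative testBytes() result is definitive (the value
// is certainly absent), so those row groups could be skipped by a reader.
std::vector<size_t> candidateRowGroups(const orc::BloomFilterIndex& index,
                                       const std::string& key) {
  std::vector<size_t> candidates;
  for (size_t i = 0; i < index.entries.size(); ++i) {
    const auto& filter = index.entries[i];
    // A null entry means no filter was written; treat it as "maybe present".
    if (!filter || filter->testBytes(key.data(),
                                     static_cast<int64_t>(key.size()))) {
      candidates.push_back(i);
    }
  }
  return candidates;
}
```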
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_COLUMN_PRINTER_HH -#define ORC_COLUMN_PRINTER_HH - -#include "orc/orc-config.hh" -#include "orc/OrcFile.hh" -#include "orc/Vector.hh" - -#include <stdio.h> -#include <string> -#include <memory> -#include <string> -#include <vector> - -namespace orc { - - class ColumnPrinter { - protected: - std::string &buffer; - bool hasNulls ; - const char* notNull; - - public: - ColumnPrinter(std::string&); - virtual ~ColumnPrinter(); - virtual void printRow(uint64_t rowId) = 0; - // should be called once at the start of each batch of rows - virtual void reset(const ColumnVectorBatch& batch); - }; - - ORC_UNIQUE_PTR<ColumnPrinter> createColumnPrinter(std::string&, - const Type* type); -} -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_COLUMN_PRINTER_HH +#define ORC_COLUMN_PRINTER_HH + +#include "orc/orc-config.hh" +#include "orc/OrcFile.hh" +#include "orc/Vector.hh" + +#include <stdio.h> +#include <string> +#include <memory> +#include <string> +#include <vector> + +namespace orc { + + class ColumnPrinter { + protected: + std::string &buffer; + bool hasNulls ; + const char* notNull; + + public: + ColumnPrinter(std::string&); + virtual ~ColumnPrinter(); + virtual void printRow(uint64_t rowId) = 0; + // should be called once at the start of each batch of rows + virtual void reset(const ColumnVectorBatch& batch); + }; + + ORC_UNIQUE_PTR<ColumnPrinter> createColumnPrinter(std::string&, + const Type* type); +} +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Common.hh b/contrib/libs/apache/orc/c++/include/orc/Common.hh index 4aa4a85118..34dc0a118f 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Common.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Common.hh @@ -1,286 +1,286 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
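ColumnPrinter.hh above pairs a factory, `createColumnPrinter`, with a two-step protocol: call `reset()` once per batch of rows, then `printRow()` for each row, with output written into the `std::string` passed to the factory. Below is a minimal sketch of that call sequence, assuming a `Type` and an already-populated `ColumnVectorBatch` obtained elsewhere (for example from a reader); the helper name and the per-row buffer clearing are assumptions, not requirements stated by the header.

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

#include "orc/ColumnPrinter.hh"

// Hypothetical helper: print `numRows` rows of an already-read batch as text,
// one row per line, using a printer created for the batch's type.
void printBatch(const orc::Type& type,
                const orc::ColumnVectorBatch& batch,
                uint64_t numRows) {
  std::string line;
  auto printer = orc::createColumnPrinter(line, &type);
  printer->reset(batch);              // once at the start of each batch
  for (uint64_t row = 0; row < numRows; ++row) {
    line.clear();                     // assumption: reuse one buffer per row
    printer->printRow(row);           // writes the row's text into `line`
    std::printf("%s\n", line.c_str());
  }
}
```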
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_COMMON_HH -#define ORC_COMMON_HH - -#include "orc/Vector.hh" -#include "orc/Type.hh" -#include "orc/Exceptions.hh" - -#include <string> - -namespace orc { - - class FileVersion { - private: - uint32_t majorVersion; - uint32_t minorVersion; - public: - static const FileVersion& v_0_11(); - static const FileVersion& v_0_12(); - - FileVersion(uint32_t major, uint32_t minor) : - majorVersion(major), minorVersion(minor) { - } - - /** - * Get major version - */ - uint32_t getMajor() const { - return this->majorVersion; - } - - /** - * Get minor version - */ - uint32_t getMinor() const { - return this->minorVersion; - } - - bool operator == (const FileVersion & right) const { - return this->majorVersion == right.getMajor() && - this->minorVersion == right.getMinor(); - } - - bool operator != (const FileVersion & right) const { - return !(*this == right); - } - - std::string toString() const; - }; - - enum WriterId { - ORC_JAVA_WRITER = 0, - ORC_CPP_WRITER = 1, - PRESTO_WRITER = 2, +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_COMMON_HH +#define ORC_COMMON_HH + +#include "orc/Vector.hh" +#include "orc/Type.hh" +#include "orc/Exceptions.hh" + +#include <string> + +namespace orc { + + class FileVersion { + private: + uint32_t majorVersion; + uint32_t minorVersion; + public: + static const FileVersion& v_0_11(); + static const FileVersion& v_0_12(); + + FileVersion(uint32_t major, uint32_t minor) : + majorVersion(major), minorVersion(minor) { + } + + /** + * Get major version + */ + uint32_t getMajor() const { + return this->majorVersion; + } + + /** + * Get minor version + */ + uint32_t getMinor() const { + return this->minorVersion; + } + + bool operator == (const FileVersion & right) const { + return this->majorVersion == right.getMajor() && + this->minorVersion == right.getMinor(); + } + + bool operator != (const FileVersion & right) const { + return !(*this == right); + } + + std::string toString() const; + }; + + enum WriterId { + ORC_JAVA_WRITER = 0, + ORC_CPP_WRITER = 1, + PRESTO_WRITER = 2, SCRITCHLEY_GO = 3, TRINO_WRITER = 4, - UNKNOWN_WRITER = INT32_MAX - }; - + UNKNOWN_WRITER = INT32_MAX + }; + std::string writerIdToString(uint32_t id); - enum CompressionKind { - CompressionKind_NONE = 0, - CompressionKind_ZLIB = 1, - CompressionKind_SNAPPY = 2, - CompressionKind_LZO = 3, - CompressionKind_LZ4 = 4, - CompressionKind_ZSTD = 5, - CompressionKind_MAX = INT32_MAX - }; - - /** - * Get the name of the CompressionKind. - */ - std::string compressionKindToString(CompressionKind kind); - - enum WriterVersion { - WriterVersion_ORIGINAL = 0, - WriterVersion_HIVE_8732 = 1, - WriterVersion_HIVE_4243 = 2, - WriterVersion_HIVE_12055 = 3, - WriterVersion_HIVE_13083 = 4, - WriterVersion_ORC_101 = 5, - WriterVersion_ORC_135 = 6, + enum CompressionKind { + CompressionKind_NONE = 0, + CompressionKind_ZLIB = 1, + CompressionKind_SNAPPY = 2, + CompressionKind_LZO = 3, + CompressionKind_LZ4 = 4, + CompressionKind_ZSTD = 5, + CompressionKind_MAX = INT32_MAX + }; + + /** + * Get the name of the CompressionKind. + */ + std::string compressionKindToString(CompressionKind kind); + + enum WriterVersion { + WriterVersion_ORIGINAL = 0, + WriterVersion_HIVE_8732 = 1, + WriterVersion_HIVE_4243 = 2, + WriterVersion_HIVE_12055 = 3, + WriterVersion_HIVE_13083 = 4, + WriterVersion_ORC_101 = 5, + WriterVersion_ORC_135 = 6, WriterVersion_ORC_517 = 7, WriterVersion_ORC_203 = 8, WriterVersion_ORC_14 = 9, - WriterVersion_MAX = INT32_MAX - }; - - /** - * Get the name of the WriterVersion. - */ - std::string writerVersionToString(WriterVersion kind); - - enum StreamKind { - StreamKind_PRESENT = 0, - StreamKind_DATA = 1, - StreamKind_LENGTH = 2, - StreamKind_DICTIONARY_DATA = 3, - StreamKind_DICTIONARY_COUNT = 4, - StreamKind_SECONDARY = 5, - StreamKind_ROW_INDEX = 6, - StreamKind_BLOOM_FILTER = 7, - StreamKind_BLOOM_FILTER_UTF8 = 8 - }; - - /** - * Get the string representation of the StreamKind. 
- */ - std::string streamKindToString(StreamKind kind); - - class StreamInformation { - public: - virtual ~StreamInformation(); - - virtual StreamKind getKind() const = 0; - virtual uint64_t getColumnId() const = 0; - virtual uint64_t getOffset() const = 0; - virtual uint64_t getLength() const = 0; - }; - - enum ColumnEncodingKind { - ColumnEncodingKind_DIRECT = 0, - ColumnEncodingKind_DICTIONARY = 1, - ColumnEncodingKind_DIRECT_V2 = 2, - ColumnEncodingKind_DICTIONARY_V2 = 3 - }; - - std::string columnEncodingKindToString(ColumnEncodingKind kind); - - class StripeInformation { - public: - virtual ~StripeInformation(); - - /** - * Get the byte offset of the start of the stripe. - * @return the bytes from the start of the file - */ - virtual uint64_t getOffset() const = 0; - - /** - * Get the total length of the stripe in bytes. - * @return the number of bytes in the stripe - */ - virtual uint64_t getLength() const = 0; - - /** - * Get the length of the stripe's indexes. - * @return the number of bytes in the index - */ - virtual uint64_t getIndexLength() const = 0; - - /** - * Get the length of the stripe's data. - * @return the number of bytes in the stripe - */ - virtual uint64_t getDataLength()const = 0; - - /** - * Get the length of the stripe's tail section, which contains its index. - * @return the number of bytes in the tail - */ - virtual uint64_t getFooterLength() const = 0; - - /** - * Get the number of rows in the stripe. - * @return a count of the number of rows - */ - virtual uint64_t getNumberOfRows() const = 0; - - /** - * Get the number of streams in the stripe. - */ - virtual uint64_t getNumberOfStreams() const = 0; - - /** - * Get the StreamInformation for the given stream. - */ - virtual ORC_UNIQUE_PTR<StreamInformation> - getStreamInformation(uint64_t streamId) const = 0; - - /** - * Get the column encoding for the given column. - * @param colId the columnId - */ - virtual ColumnEncodingKind getColumnEncoding(uint64_t colId) const = 0; - - /** - * Get the dictionary size. - * @param colId the columnId - * @return the size of the dictionary or 0 if there isn't one - */ - virtual uint64_t getDictionarySize(uint64_t colId) const = 0; - - /** - * Get the writer timezone. - */ - virtual const std::string& getWriterTimezone() const = 0; - }; - - // Return true if val1 < val2; otherwise return false - template <typename T> - inline bool compare(T val1, T val2) { - return (val1 < val2); - } - - // Specialization for Decimal - template <> - inline bool compare(Decimal val1, Decimal val2) { - // compare integral parts - Int128 integral1 = scaleDownInt128ByPowerOfTen(val1.value, - val1.scale); - Int128 integral2 = scaleDownInt128ByPowerOfTen(val2.value, - val2.scale); - - if (integral1 < integral2) { - return true; - } else if (integral1 > integral2) { - return false; - } - - // integral parts are equal, continue comparing fractional parts - // unnecessary to check overflow here because the scaled number will not - // exceed original ones - bool overflow = false, positive = val1.value >= 0; - val1.value -= scaleUpInt128ByPowerOfTen(integral1, - val1.scale, - overflow); - val2.value -= scaleUpInt128ByPowerOfTen(integral2, - val2.scale, - overflow); - - int32_t diff = val1.scale - val2.scale; - if (diff > 0) { - val2.value = scaleUpInt128ByPowerOfTen(val2.value, - diff, - overflow); - if (overflow) { - return positive ? true : false; - } - } else { - val1.value = scaleUpInt128ByPowerOfTen(val1.value, - -diff, - overflow); - if (overflow) { - return positive ? 
false : true; - } - } - - if (val1.value < val2.value) { - return true; - } - return false; - } - - enum BloomFilterVersion { - // Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams to support - // both old and new readers. - ORIGINAL = 0, - // Only include the BLOOM_FILTER_UTF8 streams that consistently use UTF8. - // See ORC-101 - UTF8 = 1, - FUTURE = INT32_MAX - }; - -} - -#endif + WriterVersion_MAX = INT32_MAX + }; + + /** + * Get the name of the WriterVersion. + */ + std::string writerVersionToString(WriterVersion kind); + + enum StreamKind { + StreamKind_PRESENT = 0, + StreamKind_DATA = 1, + StreamKind_LENGTH = 2, + StreamKind_DICTIONARY_DATA = 3, + StreamKind_DICTIONARY_COUNT = 4, + StreamKind_SECONDARY = 5, + StreamKind_ROW_INDEX = 6, + StreamKind_BLOOM_FILTER = 7, + StreamKind_BLOOM_FILTER_UTF8 = 8 + }; + + /** + * Get the string representation of the StreamKind. + */ + std::string streamKindToString(StreamKind kind); + + class StreamInformation { + public: + virtual ~StreamInformation(); + + virtual StreamKind getKind() const = 0; + virtual uint64_t getColumnId() const = 0; + virtual uint64_t getOffset() const = 0; + virtual uint64_t getLength() const = 0; + }; + + enum ColumnEncodingKind { + ColumnEncodingKind_DIRECT = 0, + ColumnEncodingKind_DICTIONARY = 1, + ColumnEncodingKind_DIRECT_V2 = 2, + ColumnEncodingKind_DICTIONARY_V2 = 3 + }; + + std::string columnEncodingKindToString(ColumnEncodingKind kind); + + class StripeInformation { + public: + virtual ~StripeInformation(); + + /** + * Get the byte offset of the start of the stripe. + * @return the bytes from the start of the file + */ + virtual uint64_t getOffset() const = 0; + + /** + * Get the total length of the stripe in bytes. + * @return the number of bytes in the stripe + */ + virtual uint64_t getLength() const = 0; + + /** + * Get the length of the stripe's indexes. + * @return the number of bytes in the index + */ + virtual uint64_t getIndexLength() const = 0; + + /** + * Get the length of the stripe's data. + * @return the number of bytes in the stripe + */ + virtual uint64_t getDataLength()const = 0; + + /** + * Get the length of the stripe's tail section, which contains its index. + * @return the number of bytes in the tail + */ + virtual uint64_t getFooterLength() const = 0; + + /** + * Get the number of rows in the stripe. + * @return a count of the number of rows + */ + virtual uint64_t getNumberOfRows() const = 0; + + /** + * Get the number of streams in the stripe. + */ + virtual uint64_t getNumberOfStreams() const = 0; + + /** + * Get the StreamInformation for the given stream. + */ + virtual ORC_UNIQUE_PTR<StreamInformation> + getStreamInformation(uint64_t streamId) const = 0; + + /** + * Get the column encoding for the given column. + * @param colId the columnId + */ + virtual ColumnEncodingKind getColumnEncoding(uint64_t colId) const = 0; + + /** + * Get the dictionary size. + * @param colId the columnId + * @return the size of the dictionary or 0 if there isn't one + */ + virtual uint64_t getDictionarySize(uint64_t colId) const = 0; + + /** + * Get the writer timezone. 
+ */ + virtual const std::string& getWriterTimezone() const = 0; + }; + + // Return true if val1 < val2; otherwise return false + template <typename T> + inline bool compare(T val1, T val2) { + return (val1 < val2); + } + + // Specialization for Decimal + template <> + inline bool compare(Decimal val1, Decimal val2) { + // compare integral parts + Int128 integral1 = scaleDownInt128ByPowerOfTen(val1.value, + val1.scale); + Int128 integral2 = scaleDownInt128ByPowerOfTen(val2.value, + val2.scale); + + if (integral1 < integral2) { + return true; + } else if (integral1 > integral2) { + return false; + } + + // integral parts are equal, continue comparing fractional parts + // unnecessary to check overflow here because the scaled number will not + // exceed original ones + bool overflow = false, positive = val1.value >= 0; + val1.value -= scaleUpInt128ByPowerOfTen(integral1, + val1.scale, + overflow); + val2.value -= scaleUpInt128ByPowerOfTen(integral2, + val2.scale, + overflow); + + int32_t diff = val1.scale - val2.scale; + if (diff > 0) { + val2.value = scaleUpInt128ByPowerOfTen(val2.value, + diff, + overflow); + if (overflow) { + return positive ? true : false; + } + } else { + val1.value = scaleUpInt128ByPowerOfTen(val1.value, + -diff, + overflow); + if (overflow) { + return positive ? false : true; + } + } + + if (val1.value < val2.value) { + return true; + } + return false; + } + + enum BloomFilterVersion { + // Include both the BLOOM_FILTER and BLOOM_FILTER_UTF8 streams to support + // both old and new readers. + ORIGINAL = 0, + // Only include the BLOOM_FILTER_UTF8 streams that consistently use UTF8. + // See ORC-101 + UTF8 = 1, + FUTURE = INT32_MAX + }; + +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh b/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh index 9765d4fd6b..e991f9eecd 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Exceptions.hh @@ -1,60 +1,60 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
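Common.hh above is mostly file-level metadata: the `FileVersion` value type, the compression/writer/stream enums, stripe and stream descriptors, and `*ToString` helpers that turn the numeric codes stored in a file into readable names. A minimal sketch of those helpers in isolation; the specific values printed are arbitrary.

```cpp
#include <cstdint>
#include <iostream>

#include "orc/Common.hh"

int main() {
  // FileVersion is a simple (major, minor) pair with named constants for the
  // two ORC file-format versions.
  orc::FileVersion version(0, 12);
  std::cout << "file version: " << version.toString() << "\n";
  std::cout << "is 0.12:      " << (version == orc::FileVersion::v_0_12()) << "\n";

  // The string helpers map the enum codes carried in file metadata to names.
  std::cout << orc::compressionKindToString(orc::CompressionKind_ZSTD) << "\n";
  std::cout << orc::writerVersionToString(orc::WriterVersion_ORC_135) << "\n";
  std::cout << orc::writerIdToString(orc::ORC_CPP_WRITER) << "\n";
  return 0;
}
```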
- */ - -#ifndef ORC_EXCEPTIONS_HH -#define ORC_EXCEPTIONS_HH - -#include "orc/orc-config.hh" - -#include <stdexcept> -#include <string> - -namespace orc { - - class NotImplementedYet: public std::logic_error { - public: - explicit NotImplementedYet(const std::string& what_arg); - explicit NotImplementedYet(const char* what_arg); - virtual ~NotImplementedYet() ORC_NOEXCEPT; - NotImplementedYet(const NotImplementedYet&); - private: - NotImplementedYet& operator=(const NotImplementedYet&); - }; - - class ParseError: public std::runtime_error { - public: - explicit ParseError(const std::string& what_arg); - explicit ParseError(const char* what_arg); - virtual ~ParseError() ORC_NOEXCEPT; - ParseError(const ParseError&); - private: - ParseError& operator=(const ParseError&); - }; - - class InvalidArgument: public std::runtime_error { - public: - explicit InvalidArgument(const std::string& what_arg); - explicit InvalidArgument(const char* what_arg); - virtual ~InvalidArgument() ORC_NOEXCEPT; - InvalidArgument(const InvalidArgument&); - private: - InvalidArgument& operator=(const InvalidArgument&); - }; -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_EXCEPTIONS_HH +#define ORC_EXCEPTIONS_HH + +#include "orc/orc-config.hh" + +#include <stdexcept> +#include <string> + +namespace orc { + + class NotImplementedYet: public std::logic_error { + public: + explicit NotImplementedYet(const std::string& what_arg); + explicit NotImplementedYet(const char* what_arg); + virtual ~NotImplementedYet() ORC_NOEXCEPT; + NotImplementedYet(const NotImplementedYet&); + private: + NotImplementedYet& operator=(const NotImplementedYet&); + }; + + class ParseError: public std::runtime_error { + public: + explicit ParseError(const std::string& what_arg); + explicit ParseError(const char* what_arg); + virtual ~ParseError() ORC_NOEXCEPT; + ParseError(const ParseError&); + private: + ParseError& operator=(const ParseError&); + }; + + class InvalidArgument: public std::runtime_error { + public: + explicit InvalidArgument(const std::string& what_arg); + explicit InvalidArgument(const char* what_arg); + virtual ~InvalidArgument() ORC_NOEXCEPT; + InvalidArgument(const InvalidArgument&); + private: + InvalidArgument& operator=(const InvalidArgument&); + }; +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Int128.hh b/contrib/libs/apache/orc/c++/include/orc/Int128.hh index f86d8f08a6..63b84478c6 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Int128.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Int128.hh @@ -1,372 +1,372 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_INT_128 -#define ORC_INT_128 - -#include "orc/orc-config.hh" - -#include <stdexcept> -#include <string> - -namespace orc { - - /** - * Represents a signed 128-bit integer in two's complement. - * Calculations wrap around and overflow is ignored. - * - * For a discussion of the algorithms, look at Knuth's volume 2, - * Semi-numerical Algorithms section 4.3.1. - * - */ - class Int128 { - public: - Int128() { - highbits = 0; - lowbits = 0; - } - - /** - * Convert a signed 64 bit value into an Int128. - */ - Int128(int64_t right) { - if (right >= 0) { - highbits = 0; - lowbits = static_cast<uint64_t>(right); - } else { - highbits = -1; - lowbits = static_cast<uint64_t>(right); - } - } - - /** - * Create from the twos complement representation. - */ - Int128(int64_t high, uint64_t low) { - highbits = high; - lowbits = low; - } - - /** - * Parse the number from a base 10 string representation. - */ - explicit Int128(const std::string&); - - /** - * Maximum positive value allowed by the type. - */ - static Int128 maximumValue(); - - /** - * Minimum negative value allowed by the type. - */ - static Int128 minimumValue(); - - Int128& negate() { - lowbits = ~lowbits + 1; - highbits = ~highbits; - if (lowbits == 0) { - highbits += 1; - } - return *this; - } - - Int128& abs() { - if (highbits < 0) { - negate(); - } - return *this; - } - - Int128 abs() const { - Int128 value = *this; - value.abs(); - return value; - } - - Int128& invert() { - lowbits = ~lowbits; - highbits = ~highbits; - return *this; - } - - /** - * Add a number to this one. The result is truncated to 128 bits. - * @param right the number to add - * @return *this - */ - Int128& operator+=(const Int128 &right) { - uint64_t sum = lowbits + right.lowbits; - highbits += right.highbits; - if (sum < lowbits) { - highbits += 1; - } - lowbits = sum; - return *this; - } - - /** - * Subtract a number from this one. The result is truncated to 128 bits. - * @param right the number to subtract - * @return *this - */ - Int128& operator-=(const Int128 &right) { - uint64_t diff = lowbits - right.lowbits; - highbits -= right.highbits; - if (diff > lowbits) { - highbits -= 1; - } - lowbits = diff; - return *this; - } - - /** - * Multiply this number by a number. The result is truncated to 128 bits. - * @param right the number to multiply by - * @return *this - */ - Int128& operator*=(const Int128 &right); - - /** - * Divide this number by right and return the result. This operation is - * not destructive. - * - * The answer rounds to zero. Signs work like: - * 21 / 5 -> 4, 1 - * -21 / 5 -> -4, -1 - * 21 / -5 -> -4, 1 - * -21 / -5 -> 4, -1 - * @param right the number to divide by - * @param remainder the remainder after the division - */ - Int128 divide(const Int128 &right, Int128& remainder) const; - - /** - * Logical or between two Int128. 
- * @param right the number to or in - * @return *this - */ - Int128& operator|=(const Int128 &right) { - lowbits |= right.lowbits; - highbits |= right.highbits; - return *this; - } - - /** - * Logical and between two Int128. - * @param right the number to and in - * @return *this - */ - Int128& operator&=(const Int128 &right) { - lowbits &= right.lowbits; - highbits &= right.highbits; - return *this; - } - - /** - * Logical and between two Int128. - * @param right the number to and in - * @return logical and result - */ - Int128 operator&(const Int128 &right) { - Int128 value = *this; - value &= right; - return value; - } - - /** - * Shift left by the given number of bits. - * Values larger than 2**127 will shift into the sign bit. - */ - Int128& operator<<=(uint32_t bits) { - if (bits != 0) { - if (bits < 64) { - highbits <<= bits; - highbits |= (lowbits >> (64 - bits)); - lowbits <<= bits; - } else if (bits < 128) { - highbits = static_cast<int64_t>(lowbits) << (bits - 64); - lowbits = 0; - } else { - highbits = 0; - lowbits = 0; - } - } - return *this; - } - - /** - * Shift right by the given number of bits. Negative values will - * sign extend and fill with one bits. - */ - Int128& operator>>=(uint32_t bits) { - if (bits != 0) { - if (bits < 64) { - lowbits >>= bits; - lowbits |= static_cast<uint64_t>(highbits << (64 - bits)); - highbits = static_cast<int64_t> - (static_cast<uint64_t>(highbits) >> bits); - } else if (bits < 128) { - lowbits = static_cast<uint64_t>(highbits >> (bits - 64)); - highbits = highbits >= 0 ? 0 : -1l; - } else { - highbits = highbits >= 0 ? 0 : -1l; - lowbits = static_cast<uint64_t>(highbits); - } - } - return *this; - } - - bool operator==(const Int128& right) const { - return highbits == right.highbits && lowbits == right.lowbits; - } - - bool operator!=(const Int128& right) const { - return highbits != right.highbits || lowbits != right.lowbits; - } - - bool operator<(const Int128 &right) const { - if (highbits == right.highbits) { - return lowbits < right.lowbits; - } else { - return highbits < right.highbits; - } - } - - bool operator<=(const Int128 &right) const { - if (highbits == right.highbits) { - return lowbits <= right.lowbits; - } else { - return highbits <= right.highbits; - } - } - - bool operator>(const Int128 &right) const { - if (highbits == right.highbits) { - return lowbits > right.lowbits; - } else { - return highbits > right.highbits; - } - } - - bool operator>=(const Int128 &right) const { - if (highbits == right.highbits) { - return lowbits >= right.lowbits; - } else { - return highbits >= right.highbits; - } - } - - uint32_t hash() const { - return static_cast<uint32_t>(highbits >> 32) ^ - static_cast<uint32_t>(highbits) ^ - static_cast<uint32_t>(lowbits >> 32) ^ - static_cast<uint32_t>(lowbits); - } - - /** - * Does this value fit into a long? - */ - bool fitsInLong() const { - switch (highbits) { - case 0: - return 0 == (lowbits & LONG_SIGN_BIT); - case -1: - return 0 != (lowbits & LONG_SIGN_BIT); - default: - return false; - } - } - - /** - * Convert the value to a long and - */ - int64_t toLong() const { - if (fitsInLong()) { - return static_cast<int64_t>(lowbits); - } - throw std::range_error("Int128 too large to convert to long"); - } - - /** - * Return the base 10 string representation of the integer. - */ - std::string toString() const; - - /** - * Return the base 10 string representation with a decimal point, - * the given number of places after the decimal. 
- */ - std::string toDecimalString(int32_t scale=0) const; - - /** - * Return the base 16 string representation of the two's complement with - * a prefix of "0x". - * Int128(-1).toHexString() = "0xffffffffffffffffffffffffffffffff". - */ - std::string toHexString() const; - - /** - * Get the high bits of the twos complement representation of the number. - */ - int64_t getHighBits() { - return highbits; - } - - /** - * Get the low bits of the twos complement representation of the number. - */ - uint64_t getLowBits() { - return lowbits; - } - - /** - * Represent the absolute number as a list of uint32. - * Visible for testing only. - * @param array the array that is set to the value of the number - * @param wasNegative set to true if the original number was negative - * @return the number of elements that were set in the array (1 to 4) - */ - int64_t fillInArray(uint32_t* array, bool &wasNegative) const; - - private: - static const uint64_t LONG_SIGN_BIT = 0x8000000000000000u; - int64_t highbits; - uint64_t lowbits; - }; - - - /** - * Scales up an Int128 value - * @param value the Int128 value to scale - * @param power the scale offset. Result of a negative factor is undefined. - * @param overflow returns whether the result overflows or not - * @return the scaled value - */ - Int128 scaleUpInt128ByPowerOfTen(Int128 value, - int32_t power, - bool &overflow); - /** - * Scales down an Int128 value - * @param value the Int128 value to scale - * @param power the scale offset. Result of a negative factor is undefined. - * @return the scaled value - */ - Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power); -} -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_INT_128 +#define ORC_INT_128 + +#include "orc/orc-config.hh" + +#include <stdexcept> +#include <string> + +namespace orc { + + /** + * Represents a signed 128-bit integer in two's complement. + * Calculations wrap around and overflow is ignored. + * + * For a discussion of the algorithms, look at Knuth's volume 2, + * Semi-numerical Algorithms section 4.3.1. + * + */ + class Int128 { + public: + Int128() { + highbits = 0; + lowbits = 0; + } + + /** + * Convert a signed 64 bit value into an Int128. + */ + Int128(int64_t right) { + if (right >= 0) { + highbits = 0; + lowbits = static_cast<uint64_t>(right); + } else { + highbits = -1; + lowbits = static_cast<uint64_t>(right); + } + } + + /** + * Create from the twos complement representation. + */ + Int128(int64_t high, uint64_t low) { + highbits = high; + lowbits = low; + } + + /** + * Parse the number from a base 10 string representation. + */ + explicit Int128(const std::string&); + + /** + * Maximum positive value allowed by the type. 
+ */ + static Int128 maximumValue(); + + /** + * Minimum negative value allowed by the type. + */ + static Int128 minimumValue(); + + Int128& negate() { + lowbits = ~lowbits + 1; + highbits = ~highbits; + if (lowbits == 0) { + highbits += 1; + } + return *this; + } + + Int128& abs() { + if (highbits < 0) { + negate(); + } + return *this; + } + + Int128 abs() const { + Int128 value = *this; + value.abs(); + return value; + } + + Int128& invert() { + lowbits = ~lowbits; + highbits = ~highbits; + return *this; + } + + /** + * Add a number to this one. The result is truncated to 128 bits. + * @param right the number to add + * @return *this + */ + Int128& operator+=(const Int128 &right) { + uint64_t sum = lowbits + right.lowbits; + highbits += right.highbits; + if (sum < lowbits) { + highbits += 1; + } + lowbits = sum; + return *this; + } + + /** + * Subtract a number from this one. The result is truncated to 128 bits. + * @param right the number to subtract + * @return *this + */ + Int128& operator-=(const Int128 &right) { + uint64_t diff = lowbits - right.lowbits; + highbits -= right.highbits; + if (diff > lowbits) { + highbits -= 1; + } + lowbits = diff; + return *this; + } + + /** + * Multiply this number by a number. The result is truncated to 128 bits. + * @param right the number to multiply by + * @return *this + */ + Int128& operator*=(const Int128 &right); + + /** + * Divide this number by right and return the result. This operation is + * not destructive. + * + * The answer rounds to zero. Signs work like: + * 21 / 5 -> 4, 1 + * -21 / 5 -> -4, -1 + * 21 / -5 -> -4, 1 + * -21 / -5 -> 4, -1 + * @param right the number to divide by + * @param remainder the remainder after the division + */ + Int128 divide(const Int128 &right, Int128& remainder) const; + + /** + * Logical or between two Int128. + * @param right the number to or in + * @return *this + */ + Int128& operator|=(const Int128 &right) { + lowbits |= right.lowbits; + highbits |= right.highbits; + return *this; + } + + /** + * Logical and between two Int128. + * @param right the number to and in + * @return *this + */ + Int128& operator&=(const Int128 &right) { + lowbits &= right.lowbits; + highbits &= right.highbits; + return *this; + } + + /** + * Logical and between two Int128. + * @param right the number to and in + * @return logical and result + */ + Int128 operator&(const Int128 &right) { + Int128 value = *this; + value &= right; + return value; + } + + /** + * Shift left by the given number of bits. + * Values larger than 2**127 will shift into the sign bit. + */ + Int128& operator<<=(uint32_t bits) { + if (bits != 0) { + if (bits < 64) { + highbits <<= bits; + highbits |= (lowbits >> (64 - bits)); + lowbits <<= bits; + } else if (bits < 128) { + highbits = static_cast<int64_t>(lowbits) << (bits - 64); + lowbits = 0; + } else { + highbits = 0; + lowbits = 0; + } + } + return *this; + } + + /** + * Shift right by the given number of bits. Negative values will + * sign extend and fill with one bits. + */ + Int128& operator>>=(uint32_t bits) { + if (bits != 0) { + if (bits < 64) { + lowbits >>= bits; + lowbits |= static_cast<uint64_t>(highbits << (64 - bits)); + highbits = static_cast<int64_t> + (static_cast<uint64_t>(highbits) >> bits); + } else if (bits < 128) { + lowbits = static_cast<uint64_t>(highbits >> (bits - 64)); + highbits = highbits >= 0 ? 0 : -1l; + } else { + highbits = highbits >= 0 ? 
0 : -1l; + lowbits = static_cast<uint64_t>(highbits); + } + } + return *this; + } + + bool operator==(const Int128& right) const { + return highbits == right.highbits && lowbits == right.lowbits; + } + + bool operator!=(const Int128& right) const { + return highbits != right.highbits || lowbits != right.lowbits; + } + + bool operator<(const Int128 &right) const { + if (highbits == right.highbits) { + return lowbits < right.lowbits; + } else { + return highbits < right.highbits; + } + } + + bool operator<=(const Int128 &right) const { + if (highbits == right.highbits) { + return lowbits <= right.lowbits; + } else { + return highbits <= right.highbits; + } + } + + bool operator>(const Int128 &right) const { + if (highbits == right.highbits) { + return lowbits > right.lowbits; + } else { + return highbits > right.highbits; + } + } + + bool operator>=(const Int128 &right) const { + if (highbits == right.highbits) { + return lowbits >= right.lowbits; + } else { + return highbits >= right.highbits; + } + } + + uint32_t hash() const { + return static_cast<uint32_t>(highbits >> 32) ^ + static_cast<uint32_t>(highbits) ^ + static_cast<uint32_t>(lowbits >> 32) ^ + static_cast<uint32_t>(lowbits); + } + + /** + * Does this value fit into a long? + */ + bool fitsInLong() const { + switch (highbits) { + case 0: + return 0 == (lowbits & LONG_SIGN_BIT); + case -1: + return 0 != (lowbits & LONG_SIGN_BIT); + default: + return false; + } + } + + /** + * Convert the value to a long and + */ + int64_t toLong() const { + if (fitsInLong()) { + return static_cast<int64_t>(lowbits); + } + throw std::range_error("Int128 too large to convert to long"); + } + + /** + * Return the base 10 string representation of the integer. + */ + std::string toString() const; + + /** + * Return the base 10 string representation with a decimal point, + * the given number of places after the decimal. + */ + std::string toDecimalString(int32_t scale=0) const; + + /** + * Return the base 16 string representation of the two's complement with + * a prefix of "0x". + * Int128(-1).toHexString() = "0xffffffffffffffffffffffffffffffff". + */ + std::string toHexString() const; + + /** + * Get the high bits of the twos complement representation of the number. + */ + int64_t getHighBits() { + return highbits; + } + + /** + * Get the low bits of the twos complement representation of the number. + */ + uint64_t getLowBits() { + return lowbits; + } + + /** + * Represent the absolute number as a list of uint32. + * Visible for testing only. + * @param array the array that is set to the value of the number + * @param wasNegative set to true if the original number was negative + * @return the number of elements that were set in the array (1 to 4) + */ + int64_t fillInArray(uint32_t* array, bool &wasNegative) const; + + private: + static const uint64_t LONG_SIGN_BIT = 0x8000000000000000u; + int64_t highbits; + uint64_t lowbits; + }; + + + /** + * Scales up an Int128 value + * @param value the Int128 value to scale + * @param power the scale offset. Result of a negative factor is undefined. + * @param overflow returns whether the result overflows or not + * @return the scaled value + */ + Int128 scaleUpInt128ByPowerOfTen(Int128 value, + int32_t power, + bool &overflow); + /** + * Scales down an Int128 value + * @param value the Int128 value to scale + * @param power the scale offset. Result of a negative factor is undefined. 
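
The Int128 class declared above exposes plain arithmetic, shift and conversion operations. A minimal usage sketch, relying only on the declarations shown in this header (the ORC library supplies the implementations); the commented results follow the rounding rules documented for divide():

    #include "orc/Int128.hh"
    #include <iostream>

    int main() {
      orc::Int128 a(-21);                               // sign-extended from int64_t
      orc::Int128 b(5);
      orc::Int128 remainder;
      orc::Int128 quotient = a.divide(b, remainder);    // rounds toward zero: -4, remainder -1
      std::cout << quotient.toString() << " r " << remainder.toString() << "\n";

      orc::Int128 value("123456789012345678901234567890");  // parsed from a base-10 string
      value <<= 3;                                      // shift left; overflow simply wraps
      value += orc::Int128(1);
      std::cout << value.toHexString() << "\n";

      std::cout << orc::Int128(123456).toDecimalString(3) << "\n";  // prints 123.456
      return 0;
    }
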
+ * @return the scaled value + */ + Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power); +} +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh b/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh index 71d76c438a..a34651721f 100644 --- a/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh +++ b/contrib/libs/apache/orc/c++/include/orc/MemoryPool.hh @@ -1,150 +1,150 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MEMORYPOOL_HH_ -#define MEMORYPOOL_HH_ - -#include "orc/orc-config.hh" -#include "orc/Int128.hh" - -#include <memory> - -namespace orc { - - class MemoryPool { - public: - virtual ~MemoryPool(); - - virtual char* malloc(uint64_t size) = 0; - virtual void free(char* p) = 0; - }; - MemoryPool* getDefaultPool(); - - template <class T> - class DataBuffer { - private: - MemoryPool& memoryPool; - T* buf; - // current size - uint64_t currentSize; - // maximal capacity (actual allocated memory) - uint64_t currentCapacity; - - // not implemented - DataBuffer(DataBuffer& buffer); - DataBuffer& operator=(DataBuffer& buffer); - - public: - DataBuffer(MemoryPool& pool, uint64_t _size = 0); - - DataBuffer(DataBuffer<T>&& buffer) ORC_NOEXCEPT; - - virtual ~DataBuffer(); - - T* data() { - return buf; - } - - const T* data() const { - return buf; - } - - uint64_t size() { - return currentSize; - } - - uint64_t capacity() { - return currentCapacity; - } - - T& operator[](uint64_t i) { - return buf[i]; - } - - void reserve(uint64_t _size); - void resize(uint64_t _size); - }; - - // Specializations for char - - template <> - DataBuffer<char>::~DataBuffer(); - - template <> - void DataBuffer<char>::resize(uint64_t newSize); - - // Specializations for char* - - template <> - DataBuffer<char*>::~DataBuffer(); - - template <> - void DataBuffer<char*>::resize(uint64_t newSize); - - // Specializations for double - - template <> - DataBuffer<double>::~DataBuffer(); - - template <> - void DataBuffer<double>::resize(uint64_t newSize); - - // Specializations for int64_t - - template <> - DataBuffer<int64_t>::~DataBuffer(); - - template <> - void DataBuffer<int64_t>::resize(uint64_t newSize); - - // Specializations for uint64_t - - template <> - DataBuffer<uint64_t>::~DataBuffer(); - - template <> - void DataBuffer<uint64_t>::resize(uint64_t newSize); - - // Specializations for unsigned char - - template <> - DataBuffer<unsigned char>::~DataBuffer(); - - template <> - void DataBuffer<unsigned char>::resize(uint64_t newSize); - - #ifdef __clang__ - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wweak-template-vtables" - #endif - - extern template class DataBuffer<char>; - extern template class DataBuffer<char*>; - extern template class DataBuffer<double>; - extern template class 
DataBuffer<Int128>; - extern template class DataBuffer<int64_t>; - extern template class DataBuffer<uint64_t>; - extern template class DataBuffer<unsigned char>; - - #ifdef __clang__ - #pragma clang diagnostic pop - #endif -} // namespace orc - - -#endif /* MEMORYPOOL_HH_ */ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MEMORYPOOL_HH_ +#define MEMORYPOOL_HH_ + +#include "orc/orc-config.hh" +#include "orc/Int128.hh" + +#include <memory> + +namespace orc { + + class MemoryPool { + public: + virtual ~MemoryPool(); + + virtual char* malloc(uint64_t size) = 0; + virtual void free(char* p) = 0; + }; + MemoryPool* getDefaultPool(); + + template <class T> + class DataBuffer { + private: + MemoryPool& memoryPool; + T* buf; + // current size + uint64_t currentSize; + // maximal capacity (actual allocated memory) + uint64_t currentCapacity; + + // not implemented + DataBuffer(DataBuffer& buffer); + DataBuffer& operator=(DataBuffer& buffer); + + public: + DataBuffer(MemoryPool& pool, uint64_t _size = 0); + + DataBuffer(DataBuffer<T>&& buffer) ORC_NOEXCEPT; + + virtual ~DataBuffer(); + + T* data() { + return buf; + } + + const T* data() const { + return buf; + } + + uint64_t size() { + return currentSize; + } + + uint64_t capacity() { + return currentCapacity; + } + + T& operator[](uint64_t i) { + return buf[i]; + } + + void reserve(uint64_t _size); + void resize(uint64_t _size); + }; + + // Specializations for char + + template <> + DataBuffer<char>::~DataBuffer(); + + template <> + void DataBuffer<char>::resize(uint64_t newSize); + + // Specializations for char* + + template <> + DataBuffer<char*>::~DataBuffer(); + + template <> + void DataBuffer<char*>::resize(uint64_t newSize); + + // Specializations for double + + template <> + DataBuffer<double>::~DataBuffer(); + + template <> + void DataBuffer<double>::resize(uint64_t newSize); + + // Specializations for int64_t + + template <> + DataBuffer<int64_t>::~DataBuffer(); + + template <> + void DataBuffer<int64_t>::resize(uint64_t newSize); + + // Specializations for uint64_t + + template <> + DataBuffer<uint64_t>::~DataBuffer(); + + template <> + void DataBuffer<uint64_t>::resize(uint64_t newSize); + + // Specializations for unsigned char + + template <> + DataBuffer<unsigned char>::~DataBuffer(); + + template <> + void DataBuffer<unsigned char>::resize(uint64_t newSize); + + #ifdef __clang__ + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wweak-template-vtables" + #endif + + extern template class DataBuffer<char>; + extern template class DataBuffer<char*>; + extern template class DataBuffer<double>; + extern template class DataBuffer<Int128>; + extern template class DataBuffer<int64_t>; + extern template class DataBuffer<uint64_t>; + extern template class 
DataBuffer<unsigned char>; + + #ifdef __clang__ + #pragma clang diagnostic pop + #endif +} // namespace orc + + +#endif /* MEMORYPOOL_HH_ */ diff --git a/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh b/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh index c64853168a..541d725bfc 100644 --- a/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh +++ b/contrib/libs/apache/orc/c++/include/orc/OrcFile.hh @@ -1,148 +1,148 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_FILE_HH -#define ORC_FILE_HH - -#include <string> - -#include "orc/orc-config.hh" -#include "orc/Reader.hh" -#include "orc/Writer.hh" - -/** /file orc/OrcFile.hh - @brief The top level interface to ORC. -*/ - -namespace orc { - - /** - * An abstract interface for providing ORC readers a stream of bytes. - */ - class InputStream { - public: - virtual ~InputStream(); - - /** - * Get the total length of the file in bytes. - */ - virtual uint64_t getLength() const = 0; - - /** - * Get the natural size for reads. - * @return the number of bytes that should be read at once - */ - virtual uint64_t getNaturalReadSize() const = 0; - - /** - * Read length bytes from the file starting at offset into - * the buffer starting at buf. - * @param buf the starting position of a buffer. - * @param length the number of bytes to read. - * @param offset the position in the stream to read from. - */ - virtual void read(void* buf, - uint64_t length, - uint64_t offset) = 0; - - /** - * Get the name of the stream for error messages. - */ - virtual const std::string& getName() const = 0; - }; - - /** - * An abstract interface for providing ORC writer a stream of bytes. - */ - class OutputStream { - public: - virtual ~OutputStream(); - - /** - * Get the total length of bytes written. - */ - virtual uint64_t getLength() const = 0; - - /** - * Get the natural size for reads. - * @return the number of bytes that should be written at once - */ - virtual uint64_t getNaturalWriteSize() const = 0; - - /** - * Write/Append length bytes pointed by buf to the file stream - * @param buf the starting position of a buffer. - * @param length the number of bytes to write. - */ - virtual void write(const void* buf, size_t length) = 0; - - /** - * Get the name of the stream for error messages. - */ - virtual const std::string& getName() const = 0; - - /** - * Close the stream and flush any pending data to the disk. - */ - virtual void close() = 0; - }; - - /** - * Create a stream to a local file or HDFS file if path begins with "hdfs://" - * @param path the name of the file in the local file system or HDFS - */ - ORC_UNIQUE_PTR<InputStream> readFile(const std::string& path); - - /** - * Create a stream to a local file. 
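
DataBuffer, declared in the MemoryPool header above, is the typed growable buffer used throughout the reader and writer. A short sketch of driving it through the default pool; nothing here goes beyond the declarations shown:

    #include "orc/MemoryPool.hh"
    #include <cstdint>

    int main() {
      orc::MemoryPool* pool = orc::getDefaultPool();

      // Storage for the buffer is obtained from the pool.
      orc::DataBuffer<int64_t> values(*pool, 16);       // logical size 16
      for (uint64_t i = 0; i < values.size(); ++i) {
        values[i] = static_cast<int64_t>(i * i);
      }

      values.reserve(1024);       // grow the allocation ahead of time
      values.resize(8);           // adjust the logical size
      int64_t* raw = values.data();                     // raw pointer view
      return raw == nullptr ? 1 : 0;
    }
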
- * @param path the name of the file in the local file system - */ - ORC_UNIQUE_PTR<InputStream> readLocalFile(const std::string& path); - - /** - * Create a stream to an HDFS file. - * @param path the uri of the file in HDFS - */ - ORC_UNIQUE_PTR<InputStream> readHdfsFile(const std::string& path); - - /** - * Create a reader to read the ORC file. - * @param stream the stream to read - * @param options the options for reading the file - */ - ORC_UNIQUE_PTR<Reader> createReader(ORC_UNIQUE_PTR<InputStream> stream, - const ReaderOptions& options); - /** - * Create a stream to write to a local file. - * @param path the name of the file in the local file system - */ - ORC_UNIQUE_PTR<OutputStream> writeLocalFile(const std::string& path); - - /** - * Create a writer to write the ORC file. - * @param type the type of data to be written - * @param stream the stream to write to - * @param options the options for writing the file - */ - ORC_UNIQUE_PTR<Writer> createWriter( - const Type& type, - OutputStream* stream, - const WriterOptions& options); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_FILE_HH +#define ORC_FILE_HH + +#include <string> + +#include "orc/orc-config.hh" +#include "orc/Reader.hh" +#include "orc/Writer.hh" + +/** /file orc/OrcFile.hh + @brief The top level interface to ORC. +*/ + +namespace orc { + + /** + * An abstract interface for providing ORC readers a stream of bytes. + */ + class InputStream { + public: + virtual ~InputStream(); + + /** + * Get the total length of the file in bytes. + */ + virtual uint64_t getLength() const = 0; + + /** + * Get the natural size for reads. + * @return the number of bytes that should be read at once + */ + virtual uint64_t getNaturalReadSize() const = 0; + + /** + * Read length bytes from the file starting at offset into + * the buffer starting at buf. + * @param buf the starting position of a buffer. + * @param length the number of bytes to read. + * @param offset the position in the stream to read from. + */ + virtual void read(void* buf, + uint64_t length, + uint64_t offset) = 0; + + /** + * Get the name of the stream for error messages. + */ + virtual const std::string& getName() const = 0; + }; + + /** + * An abstract interface for providing ORC writer a stream of bytes. + */ + class OutputStream { + public: + virtual ~OutputStream(); + + /** + * Get the total length of bytes written. + */ + virtual uint64_t getLength() const = 0; + + /** + * Get the natural size for reads. + * @return the number of bytes that should be written at once + */ + virtual uint64_t getNaturalWriteSize() const = 0; + + /** + * Write/Append length bytes pointed by buf to the file stream + * @param buf the starting position of a buffer. 
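
InputStream and OutputStream above are the integration points an application provides when the built-in file helpers do not fit. A sketch of an in-memory InputStream, assuming nothing beyond the pure-virtual interface shown; the class name is illustrative and bounds checking is omitted for brevity:

    #include "orc/OrcFile.hh"
    #include <cstring>
    #include <string>
    #include <utility>

    // Serves ORC bytes from a std::string held in memory.
    class MemoryInputStream : public orc::InputStream {
    public:
      explicit MemoryInputStream(std::string data)
          : data_(std::move(data)), name_("<memory>") {}

      uint64_t getLength() const override { return data_.size(); }

      uint64_t getNaturalReadSize() const override { return 128 * 1024; }

      void read(void* buf, uint64_t length, uint64_t offset) override {
        // A production implementation should validate offset and length here.
        std::memcpy(buf, data_.data() + offset, length);
      }

      const std::string& getName() const override { return name_; }

    private:
      std::string data_;
      std::string name_;
    };

An instance can be handed to createReader() through an ORC_UNIQUE_PTR<InputStream>, exactly like the streams returned by readLocalFile().
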
+ * @param length the number of bytes to write. + */ + virtual void write(const void* buf, size_t length) = 0; + + /** + * Get the name of the stream for error messages. + */ + virtual const std::string& getName() const = 0; + + /** + * Close the stream and flush any pending data to the disk. + */ + virtual void close() = 0; + }; + + /** + * Create a stream to a local file or HDFS file if path begins with "hdfs://" + * @param path the name of the file in the local file system or HDFS + */ + ORC_UNIQUE_PTR<InputStream> readFile(const std::string& path); + + /** + * Create a stream to a local file. + * @param path the name of the file in the local file system + */ + ORC_UNIQUE_PTR<InputStream> readLocalFile(const std::string& path); + + /** + * Create a stream to an HDFS file. + * @param path the uri of the file in HDFS + */ + ORC_UNIQUE_PTR<InputStream> readHdfsFile(const std::string& path); + + /** + * Create a reader to read the ORC file. + * @param stream the stream to read + * @param options the options for reading the file + */ + ORC_UNIQUE_PTR<Reader> createReader(ORC_UNIQUE_PTR<InputStream> stream, + const ReaderOptions& options); + /** + * Create a stream to write to a local file. + * @param path the name of the file in the local file system + */ + ORC_UNIQUE_PTR<OutputStream> writeLocalFile(const std::string& path); + + /** + * Create a writer to write the ORC file. + * @param type the type of data to be written + * @param stream the stream to write to + * @param options the options for writing the file + */ + ORC_UNIQUE_PTR<Writer> createWriter( + const Type& type, + OutputStream* stream, + const WriterOptions& options); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Reader.hh b/contrib/libs/apache/orc/c++/include/orc/Reader.hh index 5d9a532c11..55c95557fc 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Reader.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Reader.hh @@ -1,550 +1,550 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_READER_HH -#define ORC_READER_HH - -#include "orc/BloomFilter.hh" -#include "orc/Common.hh" -#include "orc/orc-config.hh" -#include "orc/Statistics.hh" -#include "orc/Type.hh" -#include "orc/Vector.hh" - -#include <map> -#include <memory> -#include <set> -#include <string> -#include <vector> - -namespace orc { - - // classes that hold data members so we can maintain binary compatibility - struct ReaderOptionsPrivate; - struct RowReaderOptionsPrivate; - - /** - * Options for creating a Reader. 
- */ - class ReaderOptions { - private: - ORC_UNIQUE_PTR<ReaderOptionsPrivate> privateBits; - - public: - ReaderOptions(); - ReaderOptions(const ReaderOptions&); - ReaderOptions(ReaderOptions&); - ReaderOptions& operator=(const ReaderOptions&); - virtual ~ReaderOptions(); - - /** - * Set the stream to use for printing warning or error messages. - */ - ReaderOptions& setErrorStream(std::ostream& stream); - - /** - * Set a serialized copy of the file tail to be used when opening the file. - * - * When one process opens the file and other processes need to read - * the rows, we want to enable clients to just read the tail once. - * By passing the string returned by Reader.getSerializedFileTail(), to - * this function, the second reader will not need to read the file tail - * from disk. - * - * @param serialization the bytes of the serialized tail to use - */ - ReaderOptions& setSerializedFileTail(const std::string& serialization); - - /** - * Set the memory allocator. - */ - ReaderOptions& setMemoryPool(MemoryPool& pool); - - /** - * Set the location of the tail as defined by the logical length of the - * file. - */ - ReaderOptions& setTailLocation(uint64_t offset); - - /** - * Get the stream to write warnings or errors to. - */ - std::ostream* getErrorStream() const; - - /** - * Get the serialized file tail that the user passed in. - */ - std::string getSerializedFileTail() const; - - /** - * Get the desired tail location. - * @return if not set, return the maximum long. - */ - uint64_t getTailLocation() const; - - /** - * Get the memory allocator. - */ - MemoryPool* getMemoryPool() const; - }; - - /** - * Options for creating a RowReader. - */ - class RowReaderOptions { - private: - ORC_UNIQUE_PTR<RowReaderOptionsPrivate> privateBits; - - public: - RowReaderOptions(); - RowReaderOptions(const RowReaderOptions&); - RowReaderOptions(RowReaderOptions&); - RowReaderOptions& operator=(const RowReaderOptions&); - virtual ~RowReaderOptions(); - - /** - * For files that have structs as the top-level object, select the fields - * to read. The first field is 0, the second 1, and so on. By default, - * all columns are read. This option clears any previous setting of - * the selected columns. - * @param include a list of fields to read - * @return this - */ - RowReaderOptions& include(const std::list<uint64_t>& include); - - /** - * For files that have structs as the top-level object, select the fields - * to read by name. By default, all columns are read. This option clears - * any previous setting of the selected columns. - * @param include a list of fields to read - * @return this - */ - RowReaderOptions& include(const std::list<std::string>& include); - - /** - * Selects which type ids to read. The root type is always 0 and the - * rest of the types are labeled in a preorder traversal of the tree. - * The parent types are automatically selected, but the children are not. - * - * This option clears any previous setting of the selected columns or - * types. - * @param types a list of the type ids to read - * @return this - */ - RowReaderOptions& includeTypes(const std::list<uint64_t>& types); - - /** - * Set the section of the file to process. - * @param offset the starting byte offset - * @param length the number of bytes to read - * @return this - */ - RowReaderOptions& range(uint64_t offset, uint64_t length); - - /** - * For Hive 0.11 (and 0.12) decimals, the precision was unlimited - * and thus may overflow the 38 digits that is supported. 
If one - * of the Hive 0.11 decimals is too large, the reader may either convert - * the value to NULL or throw an exception. That choice is controlled - * by this setting. - * - * Defaults to true. - * - * @param shouldThrow should the reader throw a ParseError? - * @return returns *this - */ - RowReaderOptions& throwOnHive11DecimalOverflow(bool shouldThrow); - - /** - * For Hive 0.11 (and 0.12) written decimals, which have unlimited - * scale and precision, the reader forces the scale to a consistent - * number that is configured. This setting changes the scale that is - * forced upon these old decimals. See also throwOnHive11DecimalOverflow. - * - * Defaults to 6. - * - * @param forcedScale the scale that will be forced on Hive 0.11 decimals - * @return returns *this - */ - RowReaderOptions& forcedScaleOnHive11Decimal(int32_t forcedScale); - - /** - * Set enable encoding block mode. - * By enable encoding block mode, Row Reader will not decode - * dictionary encoded string vector, but instead return an index array with - * reference to corresponding dictionary. - */ - RowReaderOptions& setEnableLazyDecoding(bool enable); - - /** - * Should enable encoding block mode - */ - bool getEnableLazyDecoding() const; - - /** - * Were the field ids set? - */ - bool getIndexesSet() const; - - /** - * Were the type ids set? - */ - bool getTypeIdsSet() const; - - /** - * Get the list of selected field or type ids to read. - */ - const std::list<uint64_t>& getInclude() const; - - /** - * Were the include names set? - */ - bool getNamesSet() const; - - /** - * Get the list of selected columns to read. All children of the selected - * columns are also selected. - */ - const std::list<std::string>& getIncludeNames() const; - - /** - * Get the start of the range for the data being processed. - * @return if not set, return 0 - */ - uint64_t getOffset() const; - - /** - * Get the end of the range for the data being processed. - * @return if not set, return the maximum long - */ - uint64_t getLength() const; - - /** - * Should the reader throw a ParseError when a Hive 0.11 decimal is - * larger than the supported 38 digits of precision? Otherwise, the - * data item is replaced by a NULL. - */ - bool getThrowOnHive11DecimalOverflow() const; - - /** - * What scale should all Hive 0.11 decimals be normalized to? - */ - int32_t getForcedScaleOnHive11Decimal() const; - }; - - - class RowReader; - - /** - * The interface for reading ORC file meta-data and constructing RowReaders. - * This is an an abstract class that will be subclassed as necessary. - */ - class Reader { - public: - virtual ~Reader(); - - /** - * Get the format version of the file. Currently known values are: - * 0.11 and 0.12 - * @return the FileVersion object - */ - virtual FileVersion getFormatVersion() const = 0; - - /** - * Get the number of rows in the file. - * @return the number of rows - */ - virtual uint64_t getNumberOfRows() const = 0; - - /** +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_READER_HH +#define ORC_READER_HH + +#include "orc/BloomFilter.hh" +#include "orc/Common.hh" +#include "orc/orc-config.hh" +#include "orc/Statistics.hh" +#include "orc/Type.hh" +#include "orc/Vector.hh" + +#include <map> +#include <memory> +#include <set> +#include <string> +#include <vector> + +namespace orc { + + // classes that hold data members so we can maintain binary compatibility + struct ReaderOptionsPrivate; + struct RowReaderOptionsPrivate; + + /** + * Options for creating a Reader. + */ + class ReaderOptions { + private: + ORC_UNIQUE_PTR<ReaderOptionsPrivate> privateBits; + + public: + ReaderOptions(); + ReaderOptions(const ReaderOptions&); + ReaderOptions(ReaderOptions&); + ReaderOptions& operator=(const ReaderOptions&); + virtual ~ReaderOptions(); + + /** + * Set the stream to use for printing warning or error messages. + */ + ReaderOptions& setErrorStream(std::ostream& stream); + + /** + * Set a serialized copy of the file tail to be used when opening the file. + * + * When one process opens the file and other processes need to read + * the rows, we want to enable clients to just read the tail once. + * By passing the string returned by Reader.getSerializedFileTail(), to + * this function, the second reader will not need to read the file tail + * from disk. + * + * @param serialization the bytes of the serialized tail to use + */ + ReaderOptions& setSerializedFileTail(const std::string& serialization); + + /** + * Set the memory allocator. + */ + ReaderOptions& setMemoryPool(MemoryPool& pool); + + /** + * Set the location of the tail as defined by the logical length of the + * file. + */ + ReaderOptions& setTailLocation(uint64_t offset); + + /** + * Get the stream to write warnings or errors to. + */ + std::ostream* getErrorStream() const; + + /** + * Get the serialized file tail that the user passed in. + */ + std::string getSerializedFileTail() const; + + /** + * Get the desired tail location. + * @return if not set, return the maximum long. + */ + uint64_t getTailLocation() const; + + /** + * Get the memory allocator. + */ + MemoryPool* getMemoryPool() const; + }; + + /** + * Options for creating a RowReader. + */ + class RowReaderOptions { + private: + ORC_UNIQUE_PTR<RowReaderOptionsPrivate> privateBits; + + public: + RowReaderOptions(); + RowReaderOptions(const RowReaderOptions&); + RowReaderOptions(RowReaderOptions&); + RowReaderOptions& operator=(const RowReaderOptions&); + virtual ~RowReaderOptions(); + + /** + * For files that have structs as the top-level object, select the fields + * to read. The first field is 0, the second 1, and so on. By default, + * all columns are read. This option clears any previous setting of + * the selected columns. + * @param include a list of fields to read + * @return this + */ + RowReaderOptions& include(const std::list<uint64_t>& include); + + /** + * For files that have structs as the top-level object, select the fields + * to read by name. By default, all columns are read. This option clears + * any previous setting of the selected columns. 
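
RowReaderOptions, shown in full above, controls column projection and the byte range to scan. A small configuration sketch; the column names and the 64 MB range are illustrative values, not anything mandated by the API:

    #include "orc/Reader.hh"
    #include <list>
    #include <string>

    orc::RowReaderOptions makeOptions() {
      orc::RowReaderOptions options;

      // Project two top-level fields by name; their children are selected implicitly.
      options.include(std::list<std::string>{"col0", "col2"});

      // Process only the given section of the file (offset, length).
      options.range(0, 64 * 1024 * 1024);

      // Keep dictionary-encoded string columns as dictionary indices.
      options.setEnableLazyDecoding(true);
      return options;
    }
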
+ * @param include a list of fields to read + * @return this + */ + RowReaderOptions& include(const std::list<std::string>& include); + + /** + * Selects which type ids to read. The root type is always 0 and the + * rest of the types are labeled in a preorder traversal of the tree. + * The parent types are automatically selected, but the children are not. + * + * This option clears any previous setting of the selected columns or + * types. + * @param types a list of the type ids to read + * @return this + */ + RowReaderOptions& includeTypes(const std::list<uint64_t>& types); + + /** + * Set the section of the file to process. + * @param offset the starting byte offset + * @param length the number of bytes to read + * @return this + */ + RowReaderOptions& range(uint64_t offset, uint64_t length); + + /** + * For Hive 0.11 (and 0.12) decimals, the precision was unlimited + * and thus may overflow the 38 digits that is supported. If one + * of the Hive 0.11 decimals is too large, the reader may either convert + * the value to NULL or throw an exception. That choice is controlled + * by this setting. + * + * Defaults to true. + * + * @param shouldThrow should the reader throw a ParseError? + * @return returns *this + */ + RowReaderOptions& throwOnHive11DecimalOverflow(bool shouldThrow); + + /** + * For Hive 0.11 (and 0.12) written decimals, which have unlimited + * scale and precision, the reader forces the scale to a consistent + * number that is configured. This setting changes the scale that is + * forced upon these old decimals. See also throwOnHive11DecimalOverflow. + * + * Defaults to 6. + * + * @param forcedScale the scale that will be forced on Hive 0.11 decimals + * @return returns *this + */ + RowReaderOptions& forcedScaleOnHive11Decimal(int32_t forcedScale); + + /** + * Set enable encoding block mode. + * By enable encoding block mode, Row Reader will not decode + * dictionary encoded string vector, but instead return an index array with + * reference to corresponding dictionary. + */ + RowReaderOptions& setEnableLazyDecoding(bool enable); + + /** + * Should enable encoding block mode + */ + bool getEnableLazyDecoding() const; + + /** + * Were the field ids set? + */ + bool getIndexesSet() const; + + /** + * Were the type ids set? + */ + bool getTypeIdsSet() const; + + /** + * Get the list of selected field or type ids to read. + */ + const std::list<uint64_t>& getInclude() const; + + /** + * Were the include names set? + */ + bool getNamesSet() const; + + /** + * Get the list of selected columns to read. All children of the selected + * columns are also selected. + */ + const std::list<std::string>& getIncludeNames() const; + + /** + * Get the start of the range for the data being processed. + * @return if not set, return 0 + */ + uint64_t getOffset() const; + + /** + * Get the end of the range for the data being processed. + * @return if not set, return the maximum long + */ + uint64_t getLength() const; + + /** + * Should the reader throw a ParseError when a Hive 0.11 decimal is + * larger than the supported 38 digits of precision? Otherwise, the + * data item is replaced by a NULL. + */ + bool getThrowOnHive11DecimalOverflow() const; + + /** + * What scale should all Hive 0.11 decimals be normalized to? + */ + int32_t getForcedScaleOnHive11Decimal() const; + }; + + + class RowReader; + + /** + * The interface for reading ORC file meta-data and constructing RowReaders. + * This is an an abstract class that will be subclassed as necessary. 
+ */ + class Reader { + public: + virtual ~Reader(); + + /** + * Get the format version of the file. Currently known values are: + * 0.11 and 0.12 + * @return the FileVersion object + */ + virtual FileVersion getFormatVersion() const = 0; + + /** + * Get the number of rows in the file. + * @return the number of rows + */ + virtual uint64_t getNumberOfRows() const = 0; + + /** * Get the software instance and version that wrote this file. * @return a user-facing string that specifies the software version */ virtual std::string getSoftwareVersion() const = 0; /** - * Get the user metadata keys. - * @return the set of user metadata keys - */ - virtual std::list<std::string> getMetadataKeys() const = 0; - - /** - * Get a user metadata value. - * @param key a key given by the user - * @return the bytes associated with the given key - */ - virtual std::string getMetadataValue(const std::string& key) const = 0; - - /** - * Did the user set the given metadata value. - * @param key the key to check - * @return true if the metadata value was set - */ - virtual bool hasMetadataValue(const std::string& key) const = 0; - - /** - * Get the compression kind. - * @return the kind of compression in the file - */ - virtual CompressionKind getCompression() const = 0; - - /** - * Get the buffer size for the compression. - * @return number of bytes to buffer for the compression codec. - */ - virtual uint64_t getCompressionSize() const = 0; - - /** - * Get ID of writer that generated the file. - * @return UNKNOWN_WRITER if the writer ID is undefined - */ - virtual WriterId getWriterId() const = 0; - - /** - * Get the writer id value when getWriterId() returns an unknown writer. - * @return the integer value of the writer ID. - */ - virtual uint32_t getWriterIdValue() const = 0; - - /** - * Get the version of the writer. - * @return the version of the writer. - */ - virtual WriterVersion getWriterVersion() const = 0; - - /** - * Get the number of rows per an entry in the row index. - * @return the number of rows per an entry in the row index or 0 if there - * is no row index. - */ - virtual uint64_t getRowIndexStride() const = 0; - - /** - * Get the number of stripes in the file. - * @return the number of stripes - */ - virtual uint64_t getNumberOfStripes() const = 0; - - /** - * Get the information about a stripe. - * @param stripeIndex the index of the stripe (0 to N-1) to get information about - * @return the information about that stripe - */ - virtual ORC_UNIQUE_PTR<StripeInformation> - getStripe(uint64_t stripeIndex) const = 0; - - /** - * Get the number of stripe statistics in the file. - * @return the number of stripe statistics - */ - virtual uint64_t getNumberOfStripeStatistics() const = 0; - - /** - * Get the statistics about a stripe. - * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about - * @return the statistics about that stripe - */ - virtual ORC_UNIQUE_PTR<StripeStatistics> - getStripeStatistics(uint64_t stripeIndex) const = 0; - - /** - * Get the length of the data stripes in the file. - * @return the number of bytes in stripes - */ - virtual uint64_t getContentLength() const = 0; - - /** - * Get the length of the file stripe statistics. - * @return the number of compressed bytes in the file stripe statistics - */ - virtual uint64_t getStripeStatisticsLength() const = 0; - - /** - * Get the length of the file footer. 
- * @return the number of compressed bytes in the file footer - */ - virtual uint64_t getFileFooterLength() const = 0; - - /** - * Get the length of the file postscript. - * @return the number of bytes in the file postscript - */ - virtual uint64_t getFilePostscriptLength() const = 0; - - /** - * Get the total length of the file. - * @return the number of bytes in the file - */ - virtual uint64_t getFileLength() const = 0; - - /** - * Get the statistics about the columns in the file. - * @return the information about the column - */ - virtual ORC_UNIQUE_PTR<Statistics> getStatistics() const = 0; - - /** - * Get the statistics about a single column in the file. - * @param columnId id of the column - * @return the information about the column - */ - virtual ORC_UNIQUE_PTR<ColumnStatistics> - getColumnStatistics(uint32_t columnId) const = 0; - - /** - * Check if the file has correct column statistics. - */ - virtual bool hasCorrectStatistics() const = 0; - - /** - * Get the serialized file tail. - * Usefull if another reader of the same file wants to avoid re-reading - * the file tail. See ReaderOptions.setSerializedFileTail(). - * @return a string of bytes with the file tail - */ - virtual std::string getSerializedFileTail() const = 0; - - /** - * Get the type of the rows in the file. The top level is typically a - * struct. - * @return the root type - */ - virtual const Type& getType() const = 0; - - /** - * Create a RowReader based on this reader with the default options. - * @return a RowReader to read the rows - */ - virtual ORC_UNIQUE_PTR<RowReader> createRowReader() const = 0; - - /** - * Create a RowReader based on this reader. - * @param options RowReader Options - * @return a RowReader to read the rows - */ - virtual ORC_UNIQUE_PTR<RowReader> createRowReader(const RowReaderOptions& options) const = 0; - - /** - * Get the name of the input stream. - */ - virtual const std::string& getStreamName() const = 0; - - /** - * Estimate an upper bound on heap memory allocation by the Reader - * based on the information in the file footer. - * The bound is less tight if only few columns are read or compression is - * used. - */ - /** - * @param stripeIx index of the stripe to be read (if not specified, - * all stripes are considered). - * @return upper bound on memory use by all columns - */ - virtual uint64_t getMemoryUse(int stripeIx=-1) = 0; - - /** - * @param include Column Field Ids - * @param stripeIx index of the stripe to be read (if not specified, - * all stripes are considered). - * @return upper bound on memory use by selected columns - */ - virtual uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) = 0; - - /** - * @param names Column Names - * @param stripeIx index of the stripe to be read (if not specified, - * all stripes are considered). - * @return upper bound on memory use by selected columns - */ - virtual uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) = 0; - - /** - * @param include Column Type Ids - * @param stripeIx index of the stripe to be read (if not specified, - * all stripes are considered). - * @return upper bound on memory use by selected columns - */ - virtual uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) = 0; - - /** - * Get BloomFiters of all selected columns in the specified stripe - * @param stripeIndex index of the stripe to be read for bloom filters. 
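
The Reader interface above is purely about file-level metadata; rows are read through a separate RowReader. A sketch of opening a local file and printing a few of the documented properties; getType().toString() relies on orc/Type.hh, which is not part of this diff, and the function name is illustrative:

    #include "orc/OrcFile.hh"
    #include <iostream>
    #include <string>

    void describeOrcFile(const std::string& path) {
      orc::ReaderOptions options;                    // defaults are fine for inspection
      auto reader = orc::createReader(orc::readLocalFile(path), options);

      std::cout << "rows:            " << reader->getNumberOfRows() << "\n"
                << "stripes:         " << reader->getNumberOfStripes() << "\n"
                << "compression buf: " << reader->getCompressionSize() << " bytes\n"
                << "content length:  " << reader->getContentLength() << " bytes\n"
                << "schema:          " << reader->getType().toString() << "\n";
    }
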
- * @param included index of selected columns to return (if not specified, - * all columns that have bloom filters are considered). - * @return map of bloom filters with the key standing for the index of column. - */ - virtual std::map<uint32_t, BloomFilterIndex> - getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0; - }; - - /** - * The interface for reading rows in ORC files. - * This is an an abstract class that will be subclassed as necessary. - */ - class RowReader { - public: - virtual ~RowReader(); - /** - * Get the selected type of the rows in the file. The file's row type - * is projected down to just the selected columns. Thus, if the file's - * type is struct<col0:int,col1:double,col2:string> and the selected - * columns are "col0,col2" the selected type would be - * struct<col0:int,col2:string>. - * @return the root type - */ - virtual const Type& getSelectedType() const = 0; - - /** - * Get the selected columns of the file. - */ - virtual const std::vector<bool> getSelectedColumns() const = 0; - - /** - * Create a row batch for reading the selected columns of this file. - * @param size the number of rows to read - * @return a new ColumnVectorBatch to read into - */ - virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size - ) const = 0; - - /** - * Read the next row batch from the current position. - * Caller must look at numElements in the row batch to determine how - * many rows were read. - * @param data the row batch to read into. - * @return true if a non-zero number of rows were read or false if the - * end of the file was reached. - */ - virtual bool next(ColumnVectorBatch& data) = 0; - - /** - * Get the row number of the first row in the previously read batch. - * @return the row number of the previous batch. - */ - virtual uint64_t getRowNumber() const = 0; - - /** - * Seek to a given row. - * @param rowNumber the next row the reader should return - */ - virtual void seekToRow(uint64_t rowNumber) = 0; - - }; -} - -#endif + * Get the user metadata keys. + * @return the set of user metadata keys + */ + virtual std::list<std::string> getMetadataKeys() const = 0; + + /** + * Get a user metadata value. + * @param key a key given by the user + * @return the bytes associated with the given key + */ + virtual std::string getMetadataValue(const std::string& key) const = 0; + + /** + * Did the user set the given metadata value. + * @param key the key to check + * @return true if the metadata value was set + */ + virtual bool hasMetadataValue(const std::string& key) const = 0; + + /** + * Get the compression kind. + * @return the kind of compression in the file + */ + virtual CompressionKind getCompression() const = 0; + + /** + * Get the buffer size for the compression. + * @return number of bytes to buffer for the compression codec. + */ + virtual uint64_t getCompressionSize() const = 0; + + /** + * Get ID of writer that generated the file. + * @return UNKNOWN_WRITER if the writer ID is undefined + */ + virtual WriterId getWriterId() const = 0; + + /** + * Get the writer id value when getWriterId() returns an unknown writer. + * @return the integer value of the writer ID. + */ + virtual uint32_t getWriterIdValue() const = 0; + + /** + * Get the version of the writer. + * @return the version of the writer. + */ + virtual WriterVersion getWriterVersion() const = 0; + + /** + * Get the number of rows per an entry in the row index. + * @return the number of rows per an entry in the row index or 0 if there + * is no row index. 
+ */ + virtual uint64_t getRowIndexStride() const = 0; + + /** + * Get the number of stripes in the file. + * @return the number of stripes + */ + virtual uint64_t getNumberOfStripes() const = 0; + + /** + * Get the information about a stripe. + * @param stripeIndex the index of the stripe (0 to N-1) to get information about + * @return the information about that stripe + */ + virtual ORC_UNIQUE_PTR<StripeInformation> + getStripe(uint64_t stripeIndex) const = 0; + + /** + * Get the number of stripe statistics in the file. + * @return the number of stripe statistics + */ + virtual uint64_t getNumberOfStripeStatistics() const = 0; + + /** + * Get the statistics about a stripe. + * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about + * @return the statistics about that stripe + */ + virtual ORC_UNIQUE_PTR<StripeStatistics> + getStripeStatistics(uint64_t stripeIndex) const = 0; + + /** + * Get the length of the data stripes in the file. + * @return the number of bytes in stripes + */ + virtual uint64_t getContentLength() const = 0; + + /** + * Get the length of the file stripe statistics. + * @return the number of compressed bytes in the file stripe statistics + */ + virtual uint64_t getStripeStatisticsLength() const = 0; + + /** + * Get the length of the file footer. + * @return the number of compressed bytes in the file footer + */ + virtual uint64_t getFileFooterLength() const = 0; + + /** + * Get the length of the file postscript. + * @return the number of bytes in the file postscript + */ + virtual uint64_t getFilePostscriptLength() const = 0; + + /** + * Get the total length of the file. + * @return the number of bytes in the file + */ + virtual uint64_t getFileLength() const = 0; + + /** + * Get the statistics about the columns in the file. + * @return the information about the column + */ + virtual ORC_UNIQUE_PTR<Statistics> getStatistics() const = 0; + + /** + * Get the statistics about a single column in the file. + * @param columnId id of the column + * @return the information about the column + */ + virtual ORC_UNIQUE_PTR<ColumnStatistics> + getColumnStatistics(uint32_t columnId) const = 0; + + /** + * Check if the file has correct column statistics. + */ + virtual bool hasCorrectStatistics() const = 0; + + /** + * Get the serialized file tail. + * Usefull if another reader of the same file wants to avoid re-reading + * the file tail. See ReaderOptions.setSerializedFileTail(). + * @return a string of bytes with the file tail + */ + virtual std::string getSerializedFileTail() const = 0; + + /** + * Get the type of the rows in the file. The top level is typically a + * struct. + * @return the root type + */ + virtual const Type& getType() const = 0; + + /** + * Create a RowReader based on this reader with the default options. + * @return a RowReader to read the rows + */ + virtual ORC_UNIQUE_PTR<RowReader> createRowReader() const = 0; + + /** + * Create a RowReader based on this reader. + * @param options RowReader Options + * @return a RowReader to read the rows + */ + virtual ORC_UNIQUE_PTR<RowReader> createRowReader(const RowReaderOptions& options) const = 0; + + /** + * Get the name of the input stream. + */ + virtual const std::string& getStreamName() const = 0; + + /** + * Estimate an upper bound on heap memory allocation by the Reader + * based on the information in the file footer. + * The bound is less tight if only few columns are read or compression is + * used. 
+ */ + /** + * @param stripeIx index of the stripe to be read (if not specified, + * all stripes are considered). + * @return upper bound on memory use by all columns + */ + virtual uint64_t getMemoryUse(int stripeIx=-1) = 0; + + /** + * @param include Column Field Ids + * @param stripeIx index of the stripe to be read (if not specified, + * all stripes are considered). + * @return upper bound on memory use by selected columns + */ + virtual uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) = 0; + + /** + * @param names Column Names + * @param stripeIx index of the stripe to be read (if not specified, + * all stripes are considered). + * @return upper bound on memory use by selected columns + */ + virtual uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) = 0; + + /** + * @param include Column Type Ids + * @param stripeIx index of the stripe to be read (if not specified, + * all stripes are considered). + * @return upper bound on memory use by selected columns + */ + virtual uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) = 0; + + /** + * Get BloomFiters of all selected columns in the specified stripe + * @param stripeIndex index of the stripe to be read for bloom filters. + * @param included index of selected columns to return (if not specified, + * all columns that have bloom filters are considered). + * @return map of bloom filters with the key standing for the index of column. + */ + virtual std::map<uint32_t, BloomFilterIndex> + getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const = 0; + }; + + /** + * The interface for reading rows in ORC files. + * This is an an abstract class that will be subclassed as necessary. + */ + class RowReader { + public: + virtual ~RowReader(); + /** + * Get the selected type of the rows in the file. The file's row type + * is projected down to just the selected columns. Thus, if the file's + * type is struct<col0:int,col1:double,col2:string> and the selected + * columns are "col0,col2" the selected type would be + * struct<col0:int,col2:string>. + * @return the root type + */ + virtual const Type& getSelectedType() const = 0; + + /** + * Get the selected columns of the file. + */ + virtual const std::vector<bool> getSelectedColumns() const = 0; + + /** + * Create a row batch for reading the selected columns of this file. + * @param size the number of rows to read + * @return a new ColumnVectorBatch to read into + */ + virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size + ) const = 0; + + /** + * Read the next row batch from the current position. + * Caller must look at numElements in the row batch to determine how + * many rows were read. + * @param data the row batch to read into. + * @return true if a non-zero number of rows were read or false if the + * end of the file was reached. + */ + virtual bool next(ColumnVectorBatch& data) = 0; + + /** + * Get the row number of the first row in the previously read batch. + * @return the row number of the previous batch. + */ + virtual uint64_t getRowNumber() const = 0; + + /** + * Seek to a given row. 
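Putting the Reader and RowReader pieces together, a typical read loop looks roughly like the sketch below. The orc::readLocalFile()/orc::createReader() helpers, the RowReaderOptions::include() selector, and the column name "col0" are assumptions not reproduced in this hunk.

#include <cstdint>
#include <iostream>
#include <list>
#include <memory>
#include <string>
#include "orc/OrcFile.hh"

uint64_t countRows(const std::string& path) {
  std::unique_ptr<orc::Reader> reader =
      orc::createReader(orc::readLocalFile(path), orc::ReaderOptions());

  // Ask for a heap estimate for the projected column before reading.
  std::list<std::string> columns = {"col0"};
  std::cout << "estimated heap use: "
            << reader->getMemoryUseByName(columns) << " bytes\n";

  orc::RowReaderOptions rowOptions;
  rowOptions.include(columns);  // assumed column selector on RowReaderOptions
  std::unique_ptr<orc::RowReader> rowReader = reader->createRowReader(rowOptions);

  // next() fills the batch; numElements says how many rows actually arrived.
  std::unique_ptr<orc::ColumnVectorBatch> batch = rowReader->createRowBatch(1024);
  uint64_t rows = 0;
  while (rowReader->next(*batch)) {
    rows += batch->numElements;
  }
  return rows;
}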
+ * @param rowNumber the next row the reader should return + */ + virtual void seekToRow(uint64_t rowNumber) = 0; + + }; +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Statistics.hh b/contrib/libs/apache/orc/c++/include/orc/Statistics.hh index 1d4b0b6558..c7da63a542 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Statistics.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Statistics.hh @@ -1,400 +1,400 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_STATISTICS_HH -#define ORC_STATISTICS_HH - -#include "orc/orc-config.hh" -#include "orc/Type.hh" -#include "orc/Vector.hh" - -namespace orc { - - /** - * Statistics that are available for all types of columns. - */ - class ColumnStatistics { - public: - virtual ~ColumnStatistics(); - - /** - * Get the number of values in this column. It will differ from the number - * of rows because of NULL values. - * @return the number of values - */ - virtual uint64_t getNumberOfValues() const = 0; - - /** - * Check whether column has null value. - * @return true if has null value - */ - virtual bool hasNull() const = 0; - - /** - * Print out statistics of column if any. - */ - virtual std::string toString() const = 0; - }; - - /** - * Statistics for binary columns. - */ - class BinaryColumnStatistics: public ColumnStatistics { - public: - virtual ~BinaryColumnStatistics(); - - /** - * Check whether column has total length. - * @return true if has total length - */ - virtual bool hasTotalLength() const = 0; - - virtual uint64_t getTotalLength() const = 0; - }; - - /** - * Statistics for boolean columns. - */ - class BooleanColumnStatistics: public ColumnStatistics { - public: - virtual ~BooleanColumnStatistics(); - - /** - * Check whether column has true/false count. - * @return true if has true/false count - */ - virtual bool hasCount() const = 0; - - virtual uint64_t getFalseCount() const = 0; - virtual uint64_t getTrueCount() const = 0; - }; - - /** - * Statistics for date columns. - */ - class DateColumnStatistics: public ColumnStatistics { - public: - virtual ~DateColumnStatistics(); - - /** - * Check whether column has minimum. - * @return true if has minimum - */ - virtual bool hasMinimum() const = 0; - - /** - * Check whether column has maximum. - * @return true if has maximum - */ - virtual bool hasMaximum() const = 0; - - /** - * Get the minimum value for the column. - * @return minimum value - */ - virtual int32_t getMinimum() const = 0; - - /** - * Get the maximum value for the column. - * @return maximum value - */ - virtual int32_t getMaximum() const = 0; - }; - - /** - * Statistics for decimal columns. 
- */ - class DecimalColumnStatistics: public ColumnStatistics { - public: - virtual ~DecimalColumnStatistics(); - - /** - * Check whether column has minimum. - * @return true if has minimum - */ - virtual bool hasMinimum() const = 0; - - /** - * Check whether column has maximum. - * @return true if has maximum - */ - virtual bool hasMaximum() const = 0; - - /** - * Check whether column has sum. - * @return true if has sum - */ - virtual bool hasSum() const = 0; - - /** - * Get the minimum value for the column. - * @return minimum value - */ - virtual Decimal getMinimum() const = 0; - - /** - * Get the maximum value for the column. - * @return maximum value - */ - virtual Decimal getMaximum() const = 0; - - /** - * Get the sum for the column. - * @return sum of all the values - */ - virtual Decimal getSum() const = 0; - }; - - /** - * Statistics for float and double columns. - */ - class DoubleColumnStatistics: public ColumnStatistics { - public: - virtual ~DoubleColumnStatistics(); - - /** - * Check whether column has minimum. - * @return true if has minimum - */ - virtual bool hasMinimum() const = 0; - - /** - * Check whether column has maximum. - * @return true if has maximum - */ - virtual bool hasMaximum() const = 0; - - /** - * Check whether column has sum. - * @return true if has sum - */ - virtual bool hasSum() const = 0; - - /** - * Get the smallest value in the column. Only defined if getNumberOfValues - * is non-zero. - * @return the minimum - */ - virtual double getMinimum() const = 0; - - /** - * Get the largest value in the column. Only defined if getNumberOfValues - * is non-zero. - * @return the maximum - */ - virtual double getMaximum() const = 0; - - /** - * Get the sum of the values in the column. - * @return the sum - */ - virtual double getSum() const = 0; - }; - - /** - * Statistics for all of the integer columns, such as byte, short, int, and - * long. - */ - class IntegerColumnStatistics: public ColumnStatistics { - public: - virtual ~IntegerColumnStatistics(); - - /** - * Check whether column has minimum. - * @return true if has minimum - */ - virtual bool hasMinimum() const = 0; - - /** - * Check whether column has maximum. - * @return true if has maximum - */ - virtual bool hasMaximum() const = 0; - - /** - * Check whether column has sum. - * @return true if has sum - */ - virtual bool hasSum() const = 0; - - /** - * Get the smallest value in the column. Only defined if getNumberOfValues - * is non-zero. - * @return the minimum - */ - virtual int64_t getMinimum() const = 0; - - /** - * Get the largest value in the column. Only defined if getNumberOfValues - * is non-zero. - * @return the maximum - */ - virtual int64_t getMaximum() const = 0; - - /** - * Get the sum of the column. Only valid if isSumDefined returns true. - * @return the sum of the column - */ - virtual int64_t getSum() const = 0; - }; - - /** - * Statistics for string columns. - */ - class StringColumnStatistics: public ColumnStatistics { - public: - virtual ~StringColumnStatistics(); - - /** - * Check whether column has minimum. - * @return true if has minimum - */ - virtual bool hasMinimum() const = 0; - - /** - * Check whether column has maximum. - * @return true if has maximum - */ - virtual bool hasMaximum() const = 0; - - /** - * Check whether column has total length. - * @return true if has total length - */ - virtual bool hasTotalLength() const = 0; - - /** - * Get the minimum value for the column. 
- * @return minimum value - */ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_STATISTICS_HH +#define ORC_STATISTICS_HH + +#include "orc/orc-config.hh" +#include "orc/Type.hh" +#include "orc/Vector.hh" + +namespace orc { + + /** + * Statistics that are available for all types of columns. + */ + class ColumnStatistics { + public: + virtual ~ColumnStatistics(); + + /** + * Get the number of values in this column. It will differ from the number + * of rows because of NULL values. + * @return the number of values + */ + virtual uint64_t getNumberOfValues() const = 0; + + /** + * Check whether column has null value. + * @return true if has null value + */ + virtual bool hasNull() const = 0; + + /** + * Print out statistics of column if any. + */ + virtual std::string toString() const = 0; + }; + + /** + * Statistics for binary columns. + */ + class BinaryColumnStatistics: public ColumnStatistics { + public: + virtual ~BinaryColumnStatistics(); + + /** + * Check whether column has total length. + * @return true if has total length + */ + virtual bool hasTotalLength() const = 0; + + virtual uint64_t getTotalLength() const = 0; + }; + + /** + * Statistics for boolean columns. + */ + class BooleanColumnStatistics: public ColumnStatistics { + public: + virtual ~BooleanColumnStatistics(); + + /** + * Check whether column has true/false count. + * @return true if has true/false count + */ + virtual bool hasCount() const = 0; + + virtual uint64_t getFalseCount() const = 0; + virtual uint64_t getTrueCount() const = 0; + }; + + /** + * Statistics for date columns. + */ + class DateColumnStatistics: public ColumnStatistics { + public: + virtual ~DateColumnStatistics(); + + /** + * Check whether column has minimum. + * @return true if has minimum + */ + virtual bool hasMinimum() const = 0; + + /** + * Check whether column has maximum. + * @return true if has maximum + */ + virtual bool hasMaximum() const = 0; + + /** + * Get the minimum value for the column. + * @return minimum value + */ + virtual int32_t getMinimum() const = 0; + + /** + * Get the maximum value for the column. + * @return maximum value + */ + virtual int32_t getMaximum() const = 0; + }; + + /** + * Statistics for decimal columns. + */ + class DecimalColumnStatistics: public ColumnStatistics { + public: + virtual ~DecimalColumnStatistics(); + + /** + * Check whether column has minimum. + * @return true if has minimum + */ + virtual bool hasMinimum() const = 0; + + /** + * Check whether column has maximum. + * @return true if has maximum + */ + virtual bool hasMaximum() const = 0; + + /** + * Check whether column has sum. + * @return true if has sum + */ + virtual bool hasSum() const = 0; + + /** + * Get the minimum value for the column. 
+ * @return minimum value + */ + virtual Decimal getMinimum() const = 0; + + /** + * Get the maximum value for the column. + * @return maximum value + */ + virtual Decimal getMaximum() const = 0; + + /** + * Get the sum for the column. + * @return sum of all the values + */ + virtual Decimal getSum() const = 0; + }; + + /** + * Statistics for float and double columns. + */ + class DoubleColumnStatistics: public ColumnStatistics { + public: + virtual ~DoubleColumnStatistics(); + + /** + * Check whether column has minimum. + * @return true if has minimum + */ + virtual bool hasMinimum() const = 0; + + /** + * Check whether column has maximum. + * @return true if has maximum + */ + virtual bool hasMaximum() const = 0; + + /** + * Check whether column has sum. + * @return true if has sum + */ + virtual bool hasSum() const = 0; + + /** + * Get the smallest value in the column. Only defined if getNumberOfValues + * is non-zero. + * @return the minimum + */ + virtual double getMinimum() const = 0; + + /** + * Get the largest value in the column. Only defined if getNumberOfValues + * is non-zero. + * @return the maximum + */ + virtual double getMaximum() const = 0; + + /** + * Get the sum of the values in the column. + * @return the sum + */ + virtual double getSum() const = 0; + }; + + /** + * Statistics for all of the integer columns, such as byte, short, int, and + * long. + */ + class IntegerColumnStatistics: public ColumnStatistics { + public: + virtual ~IntegerColumnStatistics(); + + /** + * Check whether column has minimum. + * @return true if has minimum + */ + virtual bool hasMinimum() const = 0; + + /** + * Check whether column has maximum. + * @return true if has maximum + */ + virtual bool hasMaximum() const = 0; + + /** + * Check whether column has sum. + * @return true if has sum + */ + virtual bool hasSum() const = 0; + + /** + * Get the smallest value in the column. Only defined if getNumberOfValues + * is non-zero. + * @return the minimum + */ + virtual int64_t getMinimum() const = 0; + + /** + * Get the largest value in the column. Only defined if getNumberOfValues + * is non-zero. + * @return the maximum + */ + virtual int64_t getMaximum() const = 0; + + /** + * Get the sum of the column. Only valid if isSumDefined returns true. + * @return the sum of the column + */ + virtual int64_t getSum() const = 0; + }; + + /** + * Statistics for string columns. + */ + class StringColumnStatistics: public ColumnStatistics { + public: + virtual ~StringColumnStatistics(); + + /** + * Check whether column has minimum. + * @return true if has minimum + */ + virtual bool hasMinimum() const = 0; + + /** + * Check whether column has maximum. + * @return true if has maximum + */ + virtual bool hasMaximum() const = 0; + + /** + * Check whether column has total length. + * @return true if has total length + */ + virtual bool hasTotalLength() const = 0; + + /** + * Get the minimum value for the column. + * @return minimum value + */ virtual const std::string & getMinimum() const = 0; - - /** - * Get the maximum value for the column. - * @return maximum value - */ + + /** + * Get the maximum value for the column. + * @return maximum value + */ virtual const std::string & getMaximum() const = 0; - - /** - * Get the total length of all values. - * @return total length of all the values - */ - virtual uint64_t getTotalLength() const = 0; - }; - - /** - * Statistics for timestamp columns. 
- */ - class TimestampColumnStatistics: public ColumnStatistics { - public: - virtual ~TimestampColumnStatistics(); - - /** - * Check whether column minimum. - * @return true if has minimum - */ - virtual bool hasMinimum() const = 0; - - /** - * Check whether column maximum. - * @return true if has maximum - */ - virtual bool hasMaximum() const = 0; - - /** - * Get the minimum value for the column. - * @return minimum value - */ - virtual int64_t getMinimum() const = 0; - - /** - * Get the maximum value for the column. - * @return maximum value - */ - virtual int64_t getMaximum() const = 0; - - /** - * Check whether column has a lowerBound. - * @return true if column has a lowerBound - */ - virtual bool hasLowerBound() const = 0; - - /** - * Check whether column has an upperBound. - * @return true if column has an upperBound - */ - virtual bool hasUpperBound() const = 0; - - /** - * Get the lowerBound value for the column. - * @return lowerBound value - */ - virtual int64_t getLowerBound() const = 0; - - /** - * Get the upperBound value for the column. - * @return upperBound value - */ - virtual int64_t getUpperBound() const = 0; - - - }; - - class Statistics { - public: - virtual ~Statistics(); - - /** - * Get the statistics of the given column. - * @param colId id of the column - * @return one column's statistics - */ - virtual const ColumnStatistics* getColumnStatistics(uint32_t colId - ) const = 0; - - /** - * Get the number of columns. - * @return the number of columns - */ - virtual uint32_t getNumberOfColumns() const = 0; - }; - - class StripeStatistics : public Statistics { - public: - virtual ~StripeStatistics(); - - /** - * Get the statistics of a given RowIndex entry in a given column. - * @param columnId id of the column - * @param rowIndexId RowIndex entry id - * @return statistics of the given RowIndex entry - */ - virtual const ColumnStatistics* - getRowIndexStatistics( - uint32_t columnId, uint32_t rowIndexId) const = 0; - - /** - * Get the number of RowIndex statistics in a given column. - * @param columnId id of the column - * @return the number of RowIndex statistics - */ - virtual uint32_t getNumberOfRowIndexStats(uint32_t columnId) const = 0; - }; -} - -#endif + + /** + * Get the total length of all values. + * @return total length of all the values + */ + virtual uint64_t getTotalLength() const = 0; + }; + + /** + * Statistics for timestamp columns. + */ + class TimestampColumnStatistics: public ColumnStatistics { + public: + virtual ~TimestampColumnStatistics(); + + /** + * Check whether column minimum. + * @return true if has minimum + */ + virtual bool hasMinimum() const = 0; + + /** + * Check whether column maximum. + * @return true if has maximum + */ + virtual bool hasMaximum() const = 0; + + /** + * Get the minimum value for the column. + * @return minimum value + */ + virtual int64_t getMinimum() const = 0; + + /** + * Get the maximum value for the column. + * @return maximum value + */ + virtual int64_t getMaximum() const = 0; + + /** + * Check whether column has a lowerBound. + * @return true if column has a lowerBound + */ + virtual bool hasLowerBound() const = 0; + + /** + * Check whether column has an upperBound. + * @return true if column has an upperBound + */ + virtual bool hasUpperBound() const = 0; + + /** + * Get the lowerBound value for the column. + * @return lowerBound value + */ + virtual int64_t getLowerBound() const = 0; + + /** + * Get the upperBound value for the column. 
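As a usage sketch for the statistics hierarchy above: callers typically fetch the base ColumnStatistics and then dynamic_cast to the concrete subclass, guarding every accessor with its has*() check. The Reader reference and column id are supplied by the caller.

#include <cstdint>
#include <iostream>
#include <memory>
#include "orc/Reader.hh"
#include "orc/Statistics.hh"

void printIntColumnRange(const orc::Reader& reader, uint32_t columnId) {
  std::unique_ptr<orc::ColumnStatistics> stats =
      reader.getColumnStatistics(columnId);

  // Available on every ColumnStatistics.
  std::cout << "values: " << stats->getNumberOfValues()
            << " hasNull: " << std::boolalpha << stats->hasNull() << "\n";

  // Narrow to IntegerColumnStatistics for min/max/sum.
  if (const auto* intStats =
          dynamic_cast<const orc::IntegerColumnStatistics*>(stats.get())) {
    if (intStats->hasMinimum() && intStats->hasMaximum()) {
      std::cout << "min: " << intStats->getMinimum()
                << " max: " << intStats->getMaximum() << "\n";
    }
    if (intStats->hasSum()) {
      std::cout << "sum: " << intStats->getSum() << "\n";
    }
  }
}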
+ * @return upperBound value + */ + virtual int64_t getUpperBound() const = 0; + + + }; + + class Statistics { + public: + virtual ~Statistics(); + + /** + * Get the statistics of the given column. + * @param colId id of the column + * @return one column's statistics + */ + virtual const ColumnStatistics* getColumnStatistics(uint32_t colId + ) const = 0; + + /** + * Get the number of columns. + * @return the number of columns + */ + virtual uint32_t getNumberOfColumns() const = 0; + }; + + class StripeStatistics : public Statistics { + public: + virtual ~StripeStatistics(); + + /** + * Get the statistics of a given RowIndex entry in a given column. + * @param columnId id of the column + * @param rowIndexId RowIndex entry id + * @return statistics of the given RowIndex entry + */ + virtual const ColumnStatistics* + getRowIndexStatistics( + uint32_t columnId, uint32_t rowIndexId) const = 0; + + /** + * Get the number of RowIndex statistics in a given column. + * @param columnId id of the column + * @return the number of RowIndex statistics + */ + virtual uint32_t getNumberOfRowIndexStats(uint32_t columnId) const = 0; + }; +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Type.hh b/contrib/libs/apache/orc/c++/include/orc/Type.hh index c0cbf2d671..ba0f87e9b2 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Type.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Type.hh @@ -1,111 +1,111 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_TYPE_HH -#define ORC_TYPE_HH - -#include "orc/orc-config.hh" -#include "orc/Vector.hh" -#include "MemoryPool.hh" - -namespace orc { - - enum TypeKind { - BOOLEAN = 0, - BYTE = 1, - SHORT = 2, - INT = 3, - LONG = 4, - FLOAT = 5, - DOUBLE = 6, - STRING = 7, - BINARY = 8, - TIMESTAMP = 9, - LIST = 10, - MAP = 11, - STRUCT = 12, - UNION = 13, - DECIMAL = 14, - DATE = 15, - VARCHAR = 16, - CHAR = 17 - }; - - class Type { - public: - virtual ~Type(); - virtual uint64_t getColumnId() const = 0; - virtual uint64_t getMaximumColumnId() const = 0; - virtual TypeKind getKind() const = 0; - virtual uint64_t getSubtypeCount() const = 0; - virtual const Type* getSubtype(uint64_t childId) const = 0; - virtual const std::string& getFieldName(uint64_t childId) const = 0; - virtual uint64_t getMaximumLength() const = 0; - virtual uint64_t getPrecision() const = 0; - virtual uint64_t getScale() const = 0; - virtual std::string toString() const = 0; - - /** - * Create a row batch for this type. - */ - virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size, - MemoryPool& pool, - bool encoded = false - ) const = 0; - - /** - * Add a new field to a struct type. 
- * @param fieldName the name of the new field - * @param fieldType the type of the new field - * @return a reference to the struct type - */ - virtual Type* addStructField(const std::string& fieldName, - ORC_UNIQUE_PTR<Type> fieldType) = 0; - - /** - * Add a new child to a union type. - * @param fieldType the type of the new field - * @return a reference to the union type - */ - virtual Type* addUnionChild(ORC_UNIQUE_PTR<Type> fieldType) = 0; - - /** - * Build a Type object from string text representation. - */ - static ORC_UNIQUE_PTR<Type> buildTypeFromString(const std::string& input); - }; - - const int64_t DEFAULT_DECIMAL_SCALE = 18; - const int64_t DEFAULT_DECIMAL_PRECISION = 38; - - ORC_UNIQUE_PTR<Type> createPrimitiveType(TypeKind kind); - ORC_UNIQUE_PTR<Type> createCharType(TypeKind kind, - uint64_t maxLength); - ORC_UNIQUE_PTR<Type> - createDecimalType(uint64_t precision= - DEFAULT_DECIMAL_PRECISION, - uint64_t scale=DEFAULT_DECIMAL_SCALE); - - ORC_UNIQUE_PTR<Type> createStructType(); - ORC_UNIQUE_PTR<Type> createListType(ORC_UNIQUE_PTR<Type> elements); - ORC_UNIQUE_PTR<Type> createMapType(ORC_UNIQUE_PTR<Type> key, - ORC_UNIQUE_PTR<Type> value); - ORC_UNIQUE_PTR<Type> createUnionType(); - -} -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_TYPE_HH +#define ORC_TYPE_HH + +#include "orc/orc-config.hh" +#include "orc/Vector.hh" +#include "MemoryPool.hh" + +namespace orc { + + enum TypeKind { + BOOLEAN = 0, + BYTE = 1, + SHORT = 2, + INT = 3, + LONG = 4, + FLOAT = 5, + DOUBLE = 6, + STRING = 7, + BINARY = 8, + TIMESTAMP = 9, + LIST = 10, + MAP = 11, + STRUCT = 12, + UNION = 13, + DECIMAL = 14, + DATE = 15, + VARCHAR = 16, + CHAR = 17 + }; + + class Type { + public: + virtual ~Type(); + virtual uint64_t getColumnId() const = 0; + virtual uint64_t getMaximumColumnId() const = 0; + virtual TypeKind getKind() const = 0; + virtual uint64_t getSubtypeCount() const = 0; + virtual const Type* getSubtype(uint64_t childId) const = 0; + virtual const std::string& getFieldName(uint64_t childId) const = 0; + virtual uint64_t getMaximumLength() const = 0; + virtual uint64_t getPrecision() const = 0; + virtual uint64_t getScale() const = 0; + virtual std::string toString() const = 0; + + /** + * Create a row batch for this type. + */ + virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size, + MemoryPool& pool, + bool encoded = false + ) const = 0; + + /** + * Add a new field to a struct type. + * @param fieldName the name of the new field + * @param fieldType the type of the new field + * @return a reference to the struct type + */ + virtual Type* addStructField(const std::string& fieldName, + ORC_UNIQUE_PTR<Type> fieldType) = 0; + + /** + * Add a new child to a union type. 
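The factory functions and Type::addStructField/buildTypeFromString above support two equivalent ways of building a schema; the sketch below shows both. The field names and the exact string spelling (which mirrors Type::toString() output) are illustrative assumptions.

#include <iostream>
#include <memory>
#include "orc/Type.hh"

void buildSchemas() {
  // Programmatic construction via the factory functions.
  std::unique_ptr<orc::Type> schema = orc::createStructType();
  schema->addStructField("id", orc::createPrimitiveType(orc::LONG));
  schema->addStructField("name", orc::createPrimitiveType(orc::STRING));
  schema->addStructField("price", orc::createDecimalType(38, 10));
  schema->addStructField(
      "tags", orc::createListType(orc::createPrimitiveType(orc::STRING)));

  // The same schema built from its textual form.
  std::unique_ptr<orc::Type> parsed = orc::Type::buildTypeFromString(
      "struct<id:bigint,name:string,price:decimal(38,10),tags:array<string>>");

  std::cout << schema->toString() << "\n" << parsed->toString() << "\n";
}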
+ * @param fieldType the type of the new field + * @return a reference to the union type + */ + virtual Type* addUnionChild(ORC_UNIQUE_PTR<Type> fieldType) = 0; + + /** + * Build a Type object from string text representation. + */ + static ORC_UNIQUE_PTR<Type> buildTypeFromString(const std::string& input); + }; + + const int64_t DEFAULT_DECIMAL_SCALE = 18; + const int64_t DEFAULT_DECIMAL_PRECISION = 38; + + ORC_UNIQUE_PTR<Type> createPrimitiveType(TypeKind kind); + ORC_UNIQUE_PTR<Type> createCharType(TypeKind kind, + uint64_t maxLength); + ORC_UNIQUE_PTR<Type> + createDecimalType(uint64_t precision= + DEFAULT_DECIMAL_PRECISION, + uint64_t scale=DEFAULT_DECIMAL_SCALE); + + ORC_UNIQUE_PTR<Type> createStructType(); + ORC_UNIQUE_PTR<Type> createListType(ORC_UNIQUE_PTR<Type> elements); + ORC_UNIQUE_PTR<Type> createMapType(ORC_UNIQUE_PTR<Type> key, + ORC_UNIQUE_PTR<Type> value); + ORC_UNIQUE_PTR<Type> createUnionType(); + +} +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Vector.hh b/contrib/libs/apache/orc/c++/include/orc/Vector.hh index 629c0b7f6b..97bba1ef83 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Vector.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Vector.hh @@ -1,326 +1,326 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_VECTOR_HH -#define ORC_VECTOR_HH - -#include "orc/orc-config.hh" -#include "MemoryPool.hh" -#include "Int128.hh" - -#include <list> -#include <memory> -#include <cstring> -#include <vector> -#include <stdexcept> -#include <cstdlib> -#include <iostream> - -namespace orc { - - /** - * The base class for each of the column vectors. This class handles - * the generic attributes such as number of elements, capacity, and - * notNull vector. - */ - struct ColumnVectorBatch { - ColumnVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~ColumnVectorBatch(); - - // the number of slots available - uint64_t capacity; - // the number of current occupied slots - uint64_t numElements; - // an array of capacity length marking non-null values - DataBuffer<char> notNull; - // whether there are any null values - bool hasNulls; - // whether the vector batch is encoded - bool isEncoded; - - // custom memory pool - MemoryPool& memoryPool; - - /** - * Generate a description of this vector as a string. - */ - virtual std::string toString() const = 0; - - /** - * Change the number of slots to at least the given capacity. - * This function is not recursive into subtypes. - */ - virtual void resize(uint64_t capacity); - - /** - * Empties the vector from all its elements, recursively. - * Do not alter the current capacity. - */ - virtual void clear(); - - /** - * Heap memory used by the batch. 
- */ - virtual uint64_t getMemoryUsage(); - - /** - * Check whether the batch length varies depending on data. - */ - virtual bool hasVariableLength(); - - private: - ColumnVectorBatch(const ColumnVectorBatch&); - ColumnVectorBatch& operator=(const ColumnVectorBatch&); - }; - - struct LongVectorBatch: public ColumnVectorBatch { - LongVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~LongVectorBatch(); - - DataBuffer<int64_t> data; - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - }; - - struct DoubleVectorBatch: public ColumnVectorBatch { - DoubleVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~DoubleVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - - DataBuffer<double> data; - }; - - struct StringVectorBatch: public ColumnVectorBatch { - StringVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~StringVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - - // pointers to the start of each string - DataBuffer<char*> data; - // the length of each string - DataBuffer<int64_t> length; - // string blob - DataBuffer<char> blob; - }; - - struct StringDictionary { - StringDictionary(MemoryPool& pool); - DataBuffer<char> dictionaryBlob; - - // Offset for each dictionary key entry. - DataBuffer<int64_t> dictionaryOffset; - - void getValueByIndex(int64_t index, char*& valPtr, int64_t& length) { - if (index < 0 || static_cast<uint64_t>(index) >= dictionaryOffset.size()) { - throw std::out_of_range("index out of range."); - } - - int64_t* offsetPtr = dictionaryOffset.data(); - - valPtr = dictionaryBlob.data() + offsetPtr[index]; - length = offsetPtr[index + 1] - offsetPtr[index]; - } - }; - - /** - * Include a index array with reference to corresponding dictionary. - * User first obtain index from index array and retrieve string pointer - * and length by calling getValueByIndex() from dictionary. - */ - struct EncodedStringVectorBatch : public StringVectorBatch { - EncodedStringVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~EncodedStringVectorBatch(); - std::string toString() const; - std::shared_ptr<StringDictionary> dictionary; - - // index for dictionary entry - DataBuffer<int64_t> index; - }; - - struct StructVectorBatch: public ColumnVectorBatch { - StructVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~StructVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - bool hasVariableLength(); - - std::vector<ColumnVectorBatch*> fields; - }; - - struct ListVectorBatch: public ColumnVectorBatch { - ListVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~ListVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - bool hasVariableLength(); - - /** - * The offset of the first element of each list. - * The length of list i is offsets[i+1] - offsets[i]. 
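A hedged sketch of consuming one of these batches after RowReader::next(), for a hypothetical struct<id:bigint,name:string> schema: the root batch is a StructVectorBatch whose fields line up with the selected columns, and notNull is only meaningful when hasNulls is set.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include "orc/Vector.hh"

void printIdNameBatch(orc::ColumnVectorBatch& batch) {
  auto& root = dynamic_cast<orc::StructVectorBatch&>(batch);
  auto& ids = dynamic_cast<orc::LongVectorBatch&>(*root.fields[0]);
  auto& names = dynamic_cast<orc::StringVectorBatch&>(*root.fields[1]);

  for (uint64_t row = 0; row < root.numElements; ++row) {
    // Skip rows that are null at the struct level (field-level nulls would
    // need the same check on ids/names).
    if (root.hasNulls && !root.notNull[row]) {
      std::cout << "<null row>\n";
      continue;
    }
    std::string name(names.data[row],
                     static_cast<std::size_t>(names.length[row]));
    std::cout << ids.data[row] << " " << name << "\n";
  }
}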
- */ - DataBuffer<int64_t> offsets; - - // the concatenated elements - ORC_UNIQUE_PTR<ColumnVectorBatch> elements; - }; - - struct MapVectorBatch: public ColumnVectorBatch { - MapVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~MapVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - bool hasVariableLength(); - - /** - * The offset of the first element of each map. - * The size of map i is offsets[i+1] - offsets[i]. - */ - DataBuffer<int64_t> offsets; - - // the concatenated keys - ORC_UNIQUE_PTR<ColumnVectorBatch> keys; - // the concatenated elements - ORC_UNIQUE_PTR<ColumnVectorBatch> elements; - }; - - struct UnionVectorBatch: public ColumnVectorBatch { - UnionVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~UnionVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - bool hasVariableLength(); - - /** - * For each value, which element of children has the value. - */ - DataBuffer<unsigned char> tags; - - /** - * For each value, the index inside of the child ColumnVectorBatch. - */ - DataBuffer<uint64_t> offsets; - - // the sub-columns - std::vector<ColumnVectorBatch*> children; - }; - - struct Decimal { - Decimal(const Int128& value, int32_t scale); - explicit Decimal(const std::string& value); - Decimal(); - - std::string toString() const; - Int128 value; - int32_t scale; - }; - - struct Decimal64VectorBatch: public ColumnVectorBatch { - Decimal64VectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~Decimal64VectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - - // total number of digits - int32_t precision; - // the number of places after the decimal - int32_t scale; - - // the numeric values - DataBuffer<int64_t> values; - - protected: - /** - * Contains the scales that were read from the file. Should NOT be - * used. - */ - DataBuffer<int64_t> readScales; - friend class Decimal64ColumnReader; - friend class Decimal64ColumnWriter; - }; - - struct Decimal128VectorBatch: public ColumnVectorBatch { - Decimal128VectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~Decimal128VectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - - // total number of digits - int32_t precision; - // the number of places after the decimal - int32_t scale; - - // the numeric values - DataBuffer<Int128> values; - - protected: - /** - * Contains the scales that were read from the file. Should NOT be - * used. - */ - DataBuffer<int64_t> readScales; - friend class Decimal128ColumnReader; - friend class DecimalHive11ColumnReader; - friend class Decimal128ColumnWriter; - }; - - /** - * A column vector batch for storing timestamp values. - * The timestamps are stored split into the time_t value (seconds since - * 1 Jan 1970 00:00:00) and the nanoseconds within the time_t value. - */ - struct TimestampVectorBatch: public ColumnVectorBatch { - TimestampVectorBatch(uint64_t capacity, MemoryPool& pool); - virtual ~TimestampVectorBatch(); - std::string toString() const; - void resize(uint64_t capacity); - void clear(); - uint64_t getMemoryUsage(); - - // the number of seconds past 1 Jan 1970 00:00 UTC (aka time_t) - // Note that we always assume data is in GMT timezone; therefore it is - // user's responsibility to convert wall clock time in local timezone - // to GMT. 
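Following the note above about timestamps being stored as GMT seconds plus nanoseconds, a minimal fill routine might look like this; the caller is assumed to have already converted local wall-clock values to UTC.

#include <cstdint>
#include <vector>
#include "orc/Vector.hh"

void fillTimestamps(orc::TimestampVectorBatch& batch,
                    const std::vector<int64_t>& secondsUtc,
                    const std::vector<int64_t>& nanos) {
  // resize() only guarantees capacity; numElements records the valid rows.
  batch.resize(secondsUtc.size());
  for (uint64_t i = 0; i < secondsUtc.size(); ++i) {
    batch.data[i] = secondsUtc[i];     // whole seconds since 1970-01-01 UTC
    batch.nanoseconds[i] = nanos[i];   // fractional part of the same instant
  }
  batch.numElements = secondsUtc.size();
  batch.hasNulls = false;
}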
- DataBuffer<int64_t> data; - - // the nanoseconds of each value - DataBuffer<int64_t> nanoseconds; - }; - -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_VECTOR_HH +#define ORC_VECTOR_HH + +#include "orc/orc-config.hh" +#include "MemoryPool.hh" +#include "Int128.hh" + +#include <list> +#include <memory> +#include <cstring> +#include <vector> +#include <stdexcept> +#include <cstdlib> +#include <iostream> + +namespace orc { + + /** + * The base class for each of the column vectors. This class handles + * the generic attributes such as number of elements, capacity, and + * notNull vector. + */ + struct ColumnVectorBatch { + ColumnVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~ColumnVectorBatch(); + + // the number of slots available + uint64_t capacity; + // the number of current occupied slots + uint64_t numElements; + // an array of capacity length marking non-null values + DataBuffer<char> notNull; + // whether there are any null values + bool hasNulls; + // whether the vector batch is encoded + bool isEncoded; + + // custom memory pool + MemoryPool& memoryPool; + + /** + * Generate a description of this vector as a string. + */ + virtual std::string toString() const = 0; + + /** + * Change the number of slots to at least the given capacity. + * This function is not recursive into subtypes. + */ + virtual void resize(uint64_t capacity); + + /** + * Empties the vector from all its elements, recursively. + * Do not alter the current capacity. + */ + virtual void clear(); + + /** + * Heap memory used by the batch. + */ + virtual uint64_t getMemoryUsage(); + + /** + * Check whether the batch length varies depending on data. 
+ */ + virtual bool hasVariableLength(); + + private: + ColumnVectorBatch(const ColumnVectorBatch&); + ColumnVectorBatch& operator=(const ColumnVectorBatch&); + }; + + struct LongVectorBatch: public ColumnVectorBatch { + LongVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~LongVectorBatch(); + + DataBuffer<int64_t> data; + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + }; + + struct DoubleVectorBatch: public ColumnVectorBatch { + DoubleVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~DoubleVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + + DataBuffer<double> data; + }; + + struct StringVectorBatch: public ColumnVectorBatch { + StringVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~StringVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + + // pointers to the start of each string + DataBuffer<char*> data; + // the length of each string + DataBuffer<int64_t> length; + // string blob + DataBuffer<char> blob; + }; + + struct StringDictionary { + StringDictionary(MemoryPool& pool); + DataBuffer<char> dictionaryBlob; + + // Offset for each dictionary key entry. + DataBuffer<int64_t> dictionaryOffset; + + void getValueByIndex(int64_t index, char*& valPtr, int64_t& length) { + if (index < 0 || static_cast<uint64_t>(index) >= dictionaryOffset.size()) { + throw std::out_of_range("index out of range."); + } + + int64_t* offsetPtr = dictionaryOffset.data(); + + valPtr = dictionaryBlob.data() + offsetPtr[index]; + length = offsetPtr[index + 1] - offsetPtr[index]; + } + }; + + /** + * Include a index array with reference to corresponding dictionary. + * User first obtain index from index array and retrieve string pointer + * and length by calling getValueByIndex() from dictionary. + */ + struct EncodedStringVectorBatch : public StringVectorBatch { + EncodedStringVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~EncodedStringVectorBatch(); + std::string toString() const; + std::shared_ptr<StringDictionary> dictionary; + + // index for dictionary entry + DataBuffer<int64_t> index; + }; + + struct StructVectorBatch: public ColumnVectorBatch { + StructVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~StructVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + bool hasVariableLength(); + + std::vector<ColumnVectorBatch*> fields; + }; + + struct ListVectorBatch: public ColumnVectorBatch { + ListVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~ListVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + bool hasVariableLength(); + + /** + * The offset of the first element of each list. + * The length of list i is offsets[i+1] - offsets[i]. + */ + DataBuffer<int64_t> offsets; + + // the concatenated elements + ORC_UNIQUE_PTR<ColumnVectorBatch> elements; + }; + + struct MapVectorBatch: public ColumnVectorBatch { + MapVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~MapVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + bool hasVariableLength(); + + /** + * The offset of the first element of each map. + * The size of map i is offsets[i+1] - offsets[i]. 
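For the dictionary-encoded variant described above, each row's index is resolved through the shared StringDictionary. A sketch, assuming the row batch was created with encoded = true so the reader hands back an EncodedStringVectorBatch:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include "orc/Vector.hh"

void printEncodedStrings(orc::EncodedStringVectorBatch& batch) {
  for (uint64_t row = 0; row < batch.numElements; ++row) {
    if (batch.hasNulls && !batch.notNull[row]) {
      std::cout << "<null>\n";
      continue;
    }
    char* value = nullptr;
    int64_t length = 0;
    // Look the row's dictionary index up in the shared dictionary blob.
    batch.dictionary->getValueByIndex(batch.index[row], value, length);
    std::cout << std::string(value, static_cast<std::size_t>(length)) << "\n";
  }
}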
+ */ + DataBuffer<int64_t> offsets; + + // the concatenated keys + ORC_UNIQUE_PTR<ColumnVectorBatch> keys; + // the concatenated elements + ORC_UNIQUE_PTR<ColumnVectorBatch> elements; + }; + + struct UnionVectorBatch: public ColumnVectorBatch { + UnionVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~UnionVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + bool hasVariableLength(); + + /** + * For each value, which element of children has the value. + */ + DataBuffer<unsigned char> tags; + + /** + * For each value, the index inside of the child ColumnVectorBatch. + */ + DataBuffer<uint64_t> offsets; + + // the sub-columns + std::vector<ColumnVectorBatch*> children; + }; + + struct Decimal { + Decimal(const Int128& value, int32_t scale); + explicit Decimal(const std::string& value); + Decimal(); + + std::string toString() const; + Int128 value; + int32_t scale; + }; + + struct Decimal64VectorBatch: public ColumnVectorBatch { + Decimal64VectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~Decimal64VectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + + // total number of digits + int32_t precision; + // the number of places after the decimal + int32_t scale; + + // the numeric values + DataBuffer<int64_t> values; + + protected: + /** + * Contains the scales that were read from the file. Should NOT be + * used. + */ + DataBuffer<int64_t> readScales; + friend class Decimal64ColumnReader; + friend class Decimal64ColumnWriter; + }; + + struct Decimal128VectorBatch: public ColumnVectorBatch { + Decimal128VectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~Decimal128VectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + + // total number of digits + int32_t precision; + // the number of places after the decimal + int32_t scale; + + // the numeric values + DataBuffer<Int128> values; + + protected: + /** + * Contains the scales that were read from the file. Should NOT be + * used. + */ + DataBuffer<int64_t> readScales; + friend class Decimal128ColumnReader; + friend class DecimalHive11ColumnReader; + friend class Decimal128ColumnWriter; + }; + + /** + * A column vector batch for storing timestamp values. + * The timestamps are stored split into the time_t value (seconds since + * 1 Jan 1970 00:00:00) and the nanoseconds within the time_t value. + */ + struct TimestampVectorBatch: public ColumnVectorBatch { + TimestampVectorBatch(uint64_t capacity, MemoryPool& pool); + virtual ~TimestampVectorBatch(); + std::string toString() const; + void resize(uint64_t capacity); + void clear(); + uint64_t getMemoryUsage(); + + // the number of seconds past 1 Jan 1970 00:00 UTC (aka time_t) + // Note that we always assume data is in GMT timezone; therefore it is + // user's responsibility to convert wall clock time in local timezone + // to GMT. + DataBuffer<int64_t> data; + + // the nanoseconds of each value + DataBuffer<int64_t> nanoseconds; + }; + +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/Writer.hh b/contrib/libs/apache/orc/c++/include/orc/Writer.hh index 5b333861b1..2588d62151 100644 --- a/contrib/libs/apache/orc/c++/include/orc/Writer.hh +++ b/contrib/libs/apache/orc/c++/include/orc/Writer.hh @@ -1,252 +1,252 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_WRITER_HH -#define ORC_WRITER_HH - -#include "orc/Common.hh" -#include "orc/orc-config.hh" -#include "orc/Type.hh" -#include "orc/Vector.hh" - -#include <memory> -#include <set> -#include <string> -#include <vector> - -namespace orc { - - // classes that hold data members so we can maintain binary compatibility - struct WriterOptionsPrivate; - - enum CompressionStrategy { - CompressionStrategy_SPEED = 0, - CompressionStrategy_COMPRESSION - }; - - enum RleVersion { - RleVersion_1 = 0, - RleVersion_2 = 1 - }; - - class Timezone; - - /** - * Options for creating a Writer. - */ - class WriterOptions { - private: - ORC_UNIQUE_PTR<WriterOptionsPrivate> privateBits; - - public: - WriterOptions(); - WriterOptions(const WriterOptions&); - WriterOptions(WriterOptions&); - WriterOptions& operator=(const WriterOptions&); - virtual ~WriterOptions(); - - /** - * Set the strip size. - */ - WriterOptions& setStripeSize(uint64_t size); - - /** - * Get the strip size. - * @return if not set, return default value. - */ - uint64_t getStripeSize() const; - - /** - * Set the data compression block size. - */ - WriterOptions& setCompressionBlockSize(uint64_t size); - - /** - * Get the data compression block size. - * @return if not set, return default value. - */ - uint64_t getCompressionBlockSize() const; - - /** - * Set row index stride (the number of rows per an entry in the row index). Use value 0 to disable row index. - */ - WriterOptions& setRowIndexStride(uint64_t stride); - - /** - * Get the row index stride (the number of rows per an entry in the row index). - * @return if not set, return default value. - */ - uint64_t getRowIndexStride() const; - - /** - * Set the dictionary key size threshold. - * 0 to disable dictionary encoding. - * 1 to always enable dictionary encoding. - */ - WriterOptions& setDictionaryKeySizeThreshold(double val); - - /** - * Get the dictionary key size threshold. - */ - double getDictionaryKeySizeThreshold() const; - - /** - * Set Orc file version - */ - WriterOptions& setFileVersion(const FileVersion& version); - - /** - * Get Orc file version - */ - FileVersion getFileVersion() const; - - /** - * Set compression kind. - */ - WriterOptions& setCompression(CompressionKind comp); - - /** - * Get the compression kind. - * @return if not set, return default value which is ZLIB. - */ - CompressionKind getCompression() const; - - /** - * Set the compression strategy. - */ - WriterOptions& setCompressionStrategy(CompressionStrategy strategy); - - /** - * Get the compression strategy. - * @return if not set, return default value which is speed. - */ - CompressionStrategy getCompressionStrategy() const; - - /** - * Get if the bitpacking should be aligned. 
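A hedged configuration sketch using only the setters declared in this header; the concrete values and the Bloom-filter column ids are arbitrary examples rather than recommended defaults, and CompressionKind_ZLIB is assumed to come from orc/Common.hh.

#include <cstdint>
#include <set>
#include "orc/Writer.hh"

orc::WriterOptions makeWriterOptions() {
  orc::WriterOptions options;
  options.setStripeSize(64 * 1024 * 1024)       // target stripe size in bytes
         .setCompressionBlockSize(256 * 1024)   // compression buffer per chunk
         .setCompression(orc::CompressionKind_ZLIB)
         .setCompressionStrategy(orc::CompressionStrategy_SPEED)
         .setRowIndexStride(10000)              // 0 disables the row index
         .setDictionaryKeySizeThreshold(0.8)
         .setPaddingTolerance(0.0);

  // Enable Bloom filters for two (hypothetical) column ids.
  options.setColumnsUseBloomFilter(std::set<uint64_t>{1, 3});
  options.setBloomFilterFPP(0.05);
  return options;
}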
- * @return true if should be aligned, return false otherwise - */ - bool getAlignedBitpacking() const; - - /** - * Set the padding tolerance. - */ - WriterOptions& setPaddingTolerance(double tolerance); - - /** - * Get the padding tolerance. - * @return if not set, return default value which is zero. - */ - double getPaddingTolerance() const; - - /** - * Set the memory pool. - */ - WriterOptions& setMemoryPool(MemoryPool * memoryPool); - - /** - * Get the memory pool. - * @return if not set, return default memory pool. - */ - MemoryPool * getMemoryPool() const; - - /** - * Set the error stream. - */ - WriterOptions& setErrorStream(std::ostream& errStream); - - /** - * Get the error stream. - * @return if not set, return std::err. - */ - std::ostream * getErrorStream() const; - - /** - * Get the RLE version. - */ - RleVersion getRleVersion() const; - - /** - * Get whether or not to write row group index - * @return if not set, the default is false - */ - bool getEnableIndex() const; - - /** - * Get whether or not to enable dictionary encoding - * @return if not set, the default is false - */ - bool getEnableDictionary() const; - - /** - * Set columns that use BloomFilter - */ - WriterOptions& setColumnsUseBloomFilter(const std::set<uint64_t>& columns); - - /** - * Get whether this column uses BloomFilter - */ - bool isColumnUseBloomFilter(uint64_t column) const; - - /** - * Set false positive probability of BloomFilter - */ - WriterOptions& setBloomFilterFPP(double fpp); - - /** - * Get false positive probability of BloomFilter - */ - double getBloomFilterFPP() const; - - /** - * Get version of BloomFilter - */ - BloomFilterVersion getBloomFilterVersion() const; - }; - - class Writer { - public: - virtual ~Writer(); - - /** - * Create a row batch for writing the columns into this file. - * @param size the number of rows to write. - * @return a new ColumnVectorBatch to write into. - */ - virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size - ) const = 0; - - /** - * Add a row batch into current writer. - * @param rowsToAdd the row batch data to write. - */ - virtual void add(ColumnVectorBatch& rowsToAdd) = 0; - - /** - * Close the writer and flush any pending data to the output stream. - */ - virtual void close() = 0; - - /** - * Add user metadata to the writer. - */ - virtual void addUserMetadata(const std::string name, const std::string value) = 0; - }; -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_WRITER_HH +#define ORC_WRITER_HH + +#include "orc/Common.hh" +#include "orc/orc-config.hh" +#include "orc/Type.hh" +#include "orc/Vector.hh" + +#include <memory> +#include <set> +#include <string> +#include <vector> + +namespace orc { + + // classes that hold data members so we can maintain binary compatibility + struct WriterOptionsPrivate; + + enum CompressionStrategy { + CompressionStrategy_SPEED = 0, + CompressionStrategy_COMPRESSION + }; + + enum RleVersion { + RleVersion_1 = 0, + RleVersion_2 = 1 + }; + + class Timezone; + + /** + * Options for creating a Writer. + */ + class WriterOptions { + private: + ORC_UNIQUE_PTR<WriterOptionsPrivate> privateBits; + + public: + WriterOptions(); + WriterOptions(const WriterOptions&); + WriterOptions(WriterOptions&); + WriterOptions& operator=(const WriterOptions&); + virtual ~WriterOptions(); + + /** + * Set the strip size. + */ + WriterOptions& setStripeSize(uint64_t size); + + /** + * Get the strip size. + * @return if not set, return default value. + */ + uint64_t getStripeSize() const; + + /** + * Set the data compression block size. + */ + WriterOptions& setCompressionBlockSize(uint64_t size); + + /** + * Get the data compression block size. + * @return if not set, return default value. + */ + uint64_t getCompressionBlockSize() const; + + /** + * Set row index stride (the number of rows per an entry in the row index). Use value 0 to disable row index. + */ + WriterOptions& setRowIndexStride(uint64_t stride); + + /** + * Get the row index stride (the number of rows per an entry in the row index). + * @return if not set, return default value. + */ + uint64_t getRowIndexStride() const; + + /** + * Set the dictionary key size threshold. + * 0 to disable dictionary encoding. + * 1 to always enable dictionary encoding. + */ + WriterOptions& setDictionaryKeySizeThreshold(double val); + + /** + * Get the dictionary key size threshold. + */ + double getDictionaryKeySizeThreshold() const; + + /** + * Set Orc file version + */ + WriterOptions& setFileVersion(const FileVersion& version); + + /** + * Get Orc file version + */ + FileVersion getFileVersion() const; + + /** + * Set compression kind. + */ + WriterOptions& setCompression(CompressionKind comp); + + /** + * Get the compression kind. + * @return if not set, return default value which is ZLIB. + */ + CompressionKind getCompression() const; + + /** + * Set the compression strategy. + */ + WriterOptions& setCompressionStrategy(CompressionStrategy strategy); + + /** + * Get the compression strategy. + * @return if not set, return default value which is speed. + */ + CompressionStrategy getCompressionStrategy() const; + + /** + * Get if the bitpacking should be aligned. + * @return true if should be aligned, return false otherwise + */ + bool getAlignedBitpacking() const; + + /** + * Set the padding tolerance. + */ + WriterOptions& setPaddingTolerance(double tolerance); + + /** + * Get the padding tolerance. + * @return if not set, return default value which is zero. + */ + double getPaddingTolerance() const; + + /** + * Set the memory pool. + */ + WriterOptions& setMemoryPool(MemoryPool * memoryPool); + + /** + * Get the memory pool. + * @return if not set, return default memory pool. + */ + MemoryPool * getMemoryPool() const; + + /** + * Set the error stream. + */ + WriterOptions& setErrorStream(std::ostream& errStream); + + /** + * Get the error stream. + * @return if not set, return std::err. + */ + std::ostream * getErrorStream() const; + + /** + * Get the RLE version. 
+ */ + RleVersion getRleVersion() const; + + /** + * Get whether or not to write row group index + * @return if not set, the default is false + */ + bool getEnableIndex() const; + + /** + * Get whether or not to enable dictionary encoding + * @return if not set, the default is false + */ + bool getEnableDictionary() const; + + /** + * Set columns that use BloomFilter + */ + WriterOptions& setColumnsUseBloomFilter(const std::set<uint64_t>& columns); + + /** + * Get whether this column uses BloomFilter + */ + bool isColumnUseBloomFilter(uint64_t column) const; + + /** + * Set false positive probability of BloomFilter + */ + WriterOptions& setBloomFilterFPP(double fpp); + + /** + * Get false positive probability of BloomFilter + */ + double getBloomFilterFPP() const; + + /** + * Get version of BloomFilter + */ + BloomFilterVersion getBloomFilterVersion() const; + }; + + class Writer { + public: + virtual ~Writer(); + + /** + * Create a row batch for writing the columns into this file. + * @param size the number of rows to write. + * @return a new ColumnVectorBatch to write into. + */ + virtual ORC_UNIQUE_PTR<ColumnVectorBatch> createRowBatch(uint64_t size + ) const = 0; + + /** + * Add a row batch into current writer. + * @param rowsToAdd the row batch data to write. + */ + virtual void add(ColumnVectorBatch& rowsToAdd) = 0; + + /** + * Close the writer and flush any pending data to the output stream. + */ + virtual void close() = 0; + + /** + * Add user metadata to the writer. + */ + virtual void addUserMetadata(const std::string name, const std::string value) = 0; + }; +} + +#endif diff --git a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh index 18bbbd78e1..d06d892b41 100644 --- a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh +++ b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh @@ -1,78 +1,78 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_CONFIG_HH -#define ORC_CONFIG_HH - +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
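The Writer.hh interface shown above (WriterOptions plus the abstract Writer) is normally driven through factory helpers declared elsewhere in the library (orc/OrcFile.hh and orc/Type.hh) that are not part of this excerpt. A minimal usage sketch follows, assuming those helpers (createWriter, writeLocalFile, Type::buildTypeFromString) exist with roughly these signatures; treat it as illustrative, not as the library's documented example.

#include <cstdint>
#include <memory>
#include "orc/OrcFile.hh"

void writeExampleFile() {
  // Sketch only: one int64 column, one 1024-row batch. The factory helpers
  // used here are assumed from the wider library, not from this header.
  std::unique_ptr<orc::Type> schema(
      orc::Type::buildTypeFromString("struct<x:bigint>"));
  std::unique_ptr<orc::OutputStream> out = orc::writeLocalFile("example.orc");

  orc::WriterOptions options;
  options.setStripeSize(64 * 1024 * 1024)     // "strip size" above means stripe size
         .setCompressionBlockSize(64 * 1024)
         .setCompression(orc::CompressionKind_ZLIB)
         .setRowIndexStride(10000);

  std::unique_ptr<orc::Writer> writer =
      orc::createWriter(*schema, out.get(), options);

  std::unique_ptr<orc::ColumnVectorBatch> batch = writer->createRowBatch(1024);
  auto& root = dynamic_cast<orc::StructVectorBatch&>(*batch);
  auto& col  = dynamic_cast<orc::LongVectorBatch&>(*root.fields[0]);
  for (uint64_t i = 0; i < 1024; ++i) {
    col.data[i] = static_cast<int64_t>(i);
  }
  root.numElements = col.numElements = 1024;

  writer->add(*batch);   // can be called once per batch, repeatedly
  writer->close();       // flushes pending data and writes the file footer
}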
+ */ + +#ifndef ORC_CONFIG_HH +#define ORC_CONFIG_HH + #define ORC_VERSION "1.6.12" - -#define ORC_CXX_HAS_CSTDINT -#define ORC_CXX_HAS_INITIALIZER_LIST -#define ORC_CXX_HAS_NOEXCEPT -#define ORC_CXX_HAS_NULLPTR -#define ORC_CXX_HAS_OVERRIDE -#define ORC_CXX_HAS_UNIQUE_PTR - -#ifdef ORC_CXX_HAS_CSTDINT - #include <cstdint> -#else - #include <stdint.h> -#endif - -#ifdef ORC_CXX_HAS_NOEXCEPT - #define ORC_NOEXCEPT noexcept -#else - #define ORC_NOEXCEPT throw () -#endif - -#ifdef ORC_CXX_HAS_NULLPTR - #define ORC_NULLPTR nullptr -#else - namespace orc { - class nullptr_t { - public: - template<class T> - operator T*() const { - return 0; - } - - template<class C, class T> - operator T C::*() const { - return 0; - } - private: - void operator&() const; // whose address can't be taken - }; - const nullptr_t nullptr = {}; - } - #define ORC_NULLPTR orc::nullptr -#endif - -#ifdef ORC_CXX_HAS_OVERRIDE - #define ORC_OVERRIDE override -#else - #define ORC_OVERRIDE -#endif - -#ifdef ORC_CXX_HAS_UNIQUE_PTR - #define ORC_UNIQUE_PTR std::unique_ptr -#else - #define ORC_UNIQUE_PTR std::auto_ptr - namespace std { - template<typename T> - inline T move(T& x) { return x; } - } -#endif - -#endif + +#define ORC_CXX_HAS_CSTDINT +#define ORC_CXX_HAS_INITIALIZER_LIST +#define ORC_CXX_HAS_NOEXCEPT +#define ORC_CXX_HAS_NULLPTR +#define ORC_CXX_HAS_OVERRIDE +#define ORC_CXX_HAS_UNIQUE_PTR + +#ifdef ORC_CXX_HAS_CSTDINT + #include <cstdint> +#else + #include <stdint.h> +#endif + +#ifdef ORC_CXX_HAS_NOEXCEPT + #define ORC_NOEXCEPT noexcept +#else + #define ORC_NOEXCEPT throw () +#endif + +#ifdef ORC_CXX_HAS_NULLPTR + #define ORC_NULLPTR nullptr +#else + namespace orc { + class nullptr_t { + public: + template<class T> + operator T*() const { + return 0; + } + + template<class C, class T> + operator T C::*() const { + return 0; + } + private: + void operator&() const; // whose address can't be taken + }; + const nullptr_t nullptr = {}; + } + #define ORC_NULLPTR orc::nullptr +#endif + +#ifdef ORC_CXX_HAS_OVERRIDE + #define ORC_OVERRIDE override +#else + #define ORC_OVERRIDE +#endif + +#ifdef ORC_CXX_HAS_UNIQUE_PTR + #define ORC_UNIQUE_PTR std::unique_ptr +#else + #define ORC_UNIQUE_PTR std::auto_ptr + namespace std { + template<typename T> + inline T move(T& x) { return x; } + } +#endif + +#endif diff --git a/contrib/libs/apache/orc/c++/src/Adaptor.cc b/contrib/libs/apache/orc/c++/src/Adaptor.cc index bf3a3e181b..f402d65adf 100644 --- a/contrib/libs/apache/orc/c++/src/Adaptor.cc +++ b/contrib/libs/apache/orc/c++/src/Adaptor.cc @@ -1,88 +1,88 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
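orc-config.hh above is the generated configuration header: in this build every ORC_CXX_HAS_* feature is switched on, so the fallbacks (std::auto_ptr, the emulated nullptr, throw() instead of noexcept) are dead code. The small hypothetical consumer below shows why the macros are spelled this way; the Greeter types are invented purely for illustration.

#include "orc/orc-config.hh"   // ORC_UNIQUE_PTR, ORC_NOEXCEPT, ORC_OVERRIDE, ORC_NULLPTR

// Hypothetical code written against the portability macros so that it would
// also compile on a pre-C++11 toolchain, where the fallback definitions above
// take effect instead of the native keywords.
struct GreeterBase {
  virtual ~GreeterBase() {}
  virtual const char* name() const ORC_NOEXCEPT = 0;
};

struct Greeter : public GreeterBase {
  const char* name() const ORC_NOEXCEPT ORC_OVERRIDE { return "orc"; }
};

ORC_UNIQUE_PTR<GreeterBase> makeGreeter() {
  // std::unique_ptr when ORC_CXX_HAS_UNIQUE_PTR is defined, std::auto_ptr
  // (plus the no-op std::move shim above) otherwise.
  return ORC_UNIQUE_PTR<GreeterBase>(new Greeter());
}

bool isUnset(const GreeterBase* p) {
  return p == ORC_NULLPTR;   // real nullptr here, the emulated orc::nullptr otherwise
}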
-*/ - -#include "Adaptor.hh" -#include <sstream> -#include <iomanip> - -#ifndef HAS_STOLL -namespace std { - int64_t std::stoll(std::string str) { - int64_t val = 0; - stringstream ss; - ss << str; - ss >> val; - return val; - } -} -#endif - -#ifndef HAS_STRPTIME -char* strptime(const char* s, const char* f, struct tm* tm) { - std::istringstream input(s); - input.imbue(std::locale(setlocale(LC_ALL, nullptr))); - input >> std::get_time(tm, f); - if (input.fail()) return nullptr; - return (char*)(s + input.tellg()); -} -#endif - -#ifndef HAS_PREAD - #ifdef _WIN32 -#include <Windows.h> -#include <io.h> -ssize_t pread(int fd, void* buf, size_t size, off_t offset) { - auto handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd)); - - OVERLAPPED ol; - memset(&ol, 0, sizeof(OVERLAPPED)); - ol.Offset = offset; - - DWORD rt; - if (!ReadFile(handle, buf, static_cast<DWORD>(size), &rt, &ol)) { - errno = GetLastError(); - return -1; - } - return static_cast<ssize_t>(rt); -} - #else - #error("pread() undefined: unknown environment") - #endif -#endif - -namespace orc { -#ifdef HAS_DOUBLE_TO_STRING - std::string to_string(double val) { - return std::to_string(val); - } -#else - std::string to_string(double val) { - return std::to_string(static_cast<long double>(val)); - } -#endif - -#ifdef HAS_INT64_TO_STRING - std::string to_string(int64_t val) { - return std::to_string(val); - } -#else - std::string to_string(int64_t val) { - return std::to_string(static_cast<long long int>(val)); - } -#endif -} +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include "Adaptor.hh" +#include <sstream> +#include <iomanip> + +#ifndef HAS_STOLL +namespace std { + int64_t std::stoll(std::string str) { + int64_t val = 0; + stringstream ss; + ss << str; + ss >> val; + return val; + } +} +#endif + +#ifndef HAS_STRPTIME +char* strptime(const char* s, const char* f, struct tm* tm) { + std::istringstream input(s); + input.imbue(std::locale(setlocale(LC_ALL, nullptr))); + input >> std::get_time(tm, f); + if (input.fail()) return nullptr; + return (char*)(s + input.tellg()); +} +#endif + +#ifndef HAS_PREAD + #ifdef _WIN32 +#include <Windows.h> +#include <io.h> +ssize_t pread(int fd, void* buf, size_t size, off_t offset) { + auto handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd)); + + OVERLAPPED ol; + memset(&ol, 0, sizeof(OVERLAPPED)); + ol.Offset = offset; + + DWORD rt; + if (!ReadFile(handle, buf, static_cast<DWORD>(size), &rt, &ol)) { + errno = GetLastError(); + return -1; + } + return static_cast<ssize_t>(rt); +} + #else + #error("pread() undefined: unknown environment") + #endif +#endif + +namespace orc { +#ifdef HAS_DOUBLE_TO_STRING + std::string to_string(double val) { + return std::to_string(val); + } +#else + std::string to_string(double val) { + return std::to_string(static_cast<long double>(val)); + } +#endif + +#ifdef HAS_INT64_TO_STRING + std::string to_string(int64_t val) { + return std::to_string(val); + } +#else + std::string to_string(int64_t val) { + return std::to_string(static_cast<long long int>(val)); + } +#endif +} diff --git a/contrib/libs/apache/orc/c++/src/Adaptor.hh b/contrib/libs/apache/orc/c++/src/Adaptor.hh index a91b9c894d..2d6be71faa 100644 --- a/contrib/libs/apache/orc/c++/src/Adaptor.hh +++ b/contrib/libs/apache/orc/c++/src/Adaptor.hh @@ -1,175 +1,175 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ADAPTER_HH -#define ADAPTER_HH - -/* #undef INT64_IS_LL */ -#define HAS_CONSTEXPR +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
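Adaptor.cc above only compiles each helper when the configure step reported it missing (no HAS_STOLL, HAS_STRPTIME or HAS_PREAD). The strptime replacement is simply std::get_time over an istringstream. Below is a local, renamed copy of that fallback plus one call with an invented sample string, so it can be tried without colliding with a platform strptime; the copied body behaves exactly like the code above, quirks included.

#include <clocale>
#include <cstring>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <locale>
#include <sstream>

// Local copy of the std::get_time based fallback above, renamed so it never
// clashes with a real strptime declared by <time.h>.
char* orcStrptimeFallback(const char* s, const char* f, struct tm* tm) {
  std::istringstream input(s);
  input.imbue(std::locale(setlocale(LC_ALL, nullptr)));
  input >> std::get_time(tm, f);
  if (input.fail()) return nullptr;
  return (char*)(s + input.tellg());
}

int main() {
  struct tm parsed;
  std::memset(&parsed, 0, sizeof(parsed));
  const char* text = "2015-01-01 12:34:56 UTC";   // sample value, made up
  const char* rest = orcStrptimeFallback(text, "%Y-%m-%d %H:%M:%S", &parsed);
  if (rest == nullptr) {
    std::cerr << "parse failed\n";
    return 1;
  }
  // tm_year is years since 1900, tm_mon is zero-based.
  std::cout << (parsed.tm_year + 1900) << "-" << (parsed.tm_mon + 1) << "-"
            << parsed.tm_mday << ", rest: '" << rest << "'\n";
  return 0;
}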
+ */ + +#ifndef ADAPTER_HH +#define ADAPTER_HH + +/* #undef INT64_IS_LL */ +#define HAS_CONSTEXPR #ifndef _MSC_VER -#define HAS_PREAD -#endif -#define HAS_STRPTIME -#define HAS_STOLL -#define HAS_DIAGNOSTIC_PUSH -#define HAS_DOUBLE_TO_STRING -#define HAS_INT64_TO_STRING -#define HAS_PRE_1970 +#define HAS_PREAD +#endif +#define HAS_STRPTIME +#define HAS_STOLL +#define HAS_DIAGNOSTIC_PUSH +#define HAS_DOUBLE_TO_STRING +#define HAS_INT64_TO_STRING +#define HAS_PRE_1970 #define HAS_POST_2038 -#define HAS_STD_ISNAN -#define HAS_STD_MUTEX +#define HAS_STD_ISNAN +#define HAS_STD_MUTEX #ifndef _MSC_VER #define HAS_BUILTIN_OVERFLOW_CHECK #endif -/* #undef NEEDS_REDUNDANT_MOVE */ -/* #undef NEEDS_Z_PREFIX */ - -#include "orc/orc-config.hh" -#include <string> - -#ifdef _MSC_VER -#include <BaseTsd.h> -typedef SSIZE_T ssize_t; -#define timegm(tm) _mkgmtime(tm) -#define gmtime_r(timep, result) (gmtime_s(result, timep) ? NULL : result) -#define asctime_r(tm, buf) (asctime_s(buf, 26, tm) ? NULL : buf) -#endif - -#ifndef HAS_STOLL - // A poor man's stoll that converts str to a long long int base 10 - namespace std { - int64_t stoll(std::string str); - } -#endif - -#ifndef HAS_STRPTIME - char* strptime(const char* buf, const char* format, struct tm* tm); -#endif - -#ifndef HAS_PREAD - ssize_t pread(int fd, void* buf, size_t count, off_t offset); -#endif - -#ifdef INT64_IS_LL - #define INT64_FORMAT_STRING "ll" -#else - #define INT64_FORMAT_STRING "l" -#endif - -#ifndef ORC_CXX_HAS_NOEXCEPT - #define noexcept ORC_NOEXCEPT -#endif - -#ifndef ORC_CXX_HAS_OVERRIDE - #define override ORC_OVERRIDE -#endif - -#ifdef HAS_DIAGNOSTIC_PUSH - #ifdef __clang__ - #define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") - #define DIAGNOSTIC_POP _Pragma("clang diagnostic pop") - #elif defined(__GNUC__) - #define DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") - #define DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") - #elif defined(_MSC_VER) - #define DIAGNOSTIC_PUSH __pragma(warning(push)) - #define DIAGNOSTIC_POP __pragma(warning(pop)) - #else - #error("Unknown compiler") - #endif -#else - #define DIAGNOSTIC_PUSH - #define DIAGNOSTIC_POP -#endif - -#define PRAGMA(TXT) _Pragma(#TXT) - - #define DIAGNOSTIC_IGNORE(XXX) - -#ifndef ORC_CXX_HAS_UNIQUE_PTR - #define unique_ptr auto_ptr -#endif - -#ifndef UINT32_MAX - #define UINT32_MAX 0xffffffff -#endif - -#ifndef INT64_MAX - #define INT64_MAX 0x7fffffffffffffff -#endif - -#ifndef INT64_MIN - #define INT64_MIN (-0x7fffffffffffffff - 1) -#endif - -#define GTEST_LANG_CXX11 0 - -#ifdef NEEDS_REDUNDANT_MOVE - #define REDUNDANT_MOVE(XXX) std::move(XXX) -#else - #define REDUNDANT_MOVE(XXX) XXX -#endif - -#ifndef HAS_STD_ISNAN - #include <math.h> - #define std::isnan(XXX) isnan(XXX) -#else - #include <cmath> -#endif - -#ifndef HAS_STD_MUTEX - #include <pthread.h> - namespace orc { - /** - * Lock guard for pthread_mutex_t object using RAII - * The Lock is automatically release when exiting current scope. 
- */ - class LockORC { - public: - explicit LockORC(pthread_mutex_t& mutex) : mutex_ref_(mutex) { - pthread_mutex_lock(&mutex_ref_); - } - ~LockORC() { pthread_mutex_unlock(&mutex_ref_); } - private: - // no default constructor - LockORC(); - // prohibit copying - LockORC(const LockORC&); - LockORC& operator=(const LockORC&); - - pthread_mutex_t& mutex_ref_; - }; - } - #define std::mutex pthread_mutex_t - #define std::lock_guard<std::mutex> LockORC -#else - #include <mutex> -#endif - -#ifdef NEEDS_Z_PREFIX -#define Z_PREFIX 1 -#endif - -namespace orc { - std::string to_string(double val); - std::string to_string(int64_t val); -} - +/* #undef NEEDS_REDUNDANT_MOVE */ +/* #undef NEEDS_Z_PREFIX */ + +#include "orc/orc-config.hh" +#include <string> + +#ifdef _MSC_VER +#include <BaseTsd.h> +typedef SSIZE_T ssize_t; +#define timegm(tm) _mkgmtime(tm) +#define gmtime_r(timep, result) (gmtime_s(result, timep) ? NULL : result) +#define asctime_r(tm, buf) (asctime_s(buf, 26, tm) ? NULL : buf) +#endif + +#ifndef HAS_STOLL + // A poor man's stoll that converts str to a long long int base 10 + namespace std { + int64_t stoll(std::string str); + } +#endif + +#ifndef HAS_STRPTIME + char* strptime(const char* buf, const char* format, struct tm* tm); +#endif + +#ifndef HAS_PREAD + ssize_t pread(int fd, void* buf, size_t count, off_t offset); +#endif + +#ifdef INT64_IS_LL + #define INT64_FORMAT_STRING "ll" +#else + #define INT64_FORMAT_STRING "l" +#endif + +#ifndef ORC_CXX_HAS_NOEXCEPT + #define noexcept ORC_NOEXCEPT +#endif + +#ifndef ORC_CXX_HAS_OVERRIDE + #define override ORC_OVERRIDE +#endif + +#ifdef HAS_DIAGNOSTIC_PUSH + #ifdef __clang__ + #define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") + #define DIAGNOSTIC_POP _Pragma("clang diagnostic pop") + #elif defined(__GNUC__) + #define DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") + #define DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") + #elif defined(_MSC_VER) + #define DIAGNOSTIC_PUSH __pragma(warning(push)) + #define DIAGNOSTIC_POP __pragma(warning(pop)) + #else + #error("Unknown compiler") + #endif +#else + #define DIAGNOSTIC_PUSH + #define DIAGNOSTIC_POP +#endif + +#define PRAGMA(TXT) _Pragma(#TXT) + + #define DIAGNOSTIC_IGNORE(XXX) + +#ifndef ORC_CXX_HAS_UNIQUE_PTR + #define unique_ptr auto_ptr +#endif + +#ifndef UINT32_MAX + #define UINT32_MAX 0xffffffff +#endif + +#ifndef INT64_MAX + #define INT64_MAX 0x7fffffffffffffff +#endif + +#ifndef INT64_MIN + #define INT64_MIN (-0x7fffffffffffffff - 1) +#endif + +#define GTEST_LANG_CXX11 0 + +#ifdef NEEDS_REDUNDANT_MOVE + #define REDUNDANT_MOVE(XXX) std::move(XXX) +#else + #define REDUNDANT_MOVE(XXX) XXX +#endif + +#ifndef HAS_STD_ISNAN + #include <math.h> + #define std::isnan(XXX) isnan(XXX) +#else + #include <cmath> +#endif + +#ifndef HAS_STD_MUTEX + #include <pthread.h> + namespace orc { + /** + * Lock guard for pthread_mutex_t object using RAII + * The Lock is automatically release when exiting current scope. 
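When HAS_STD_MUTEX is not defined, the header falls back to the pthread-based scope guard declared just below instead of std::lock_guard. The self-contained restatement that follows (POSIX only, like the fallback itself) uses an invented mutex and counter to show what the guard buys; it duplicates the tiny class so the snippet compiles on its own.

#include <pthread.h>

// Same idea as the LockORC fallback below: lock in the constructor,
// unlock in the destructor when the scope ends, even on early return.
class ScopedPthreadLock {
 public:
  explicit ScopedPthreadLock(pthread_mutex_t& mutex) : mutex_ref_(mutex) {
    pthread_mutex_lock(&mutex_ref_);
  }
  ~ScopedPthreadLock() { pthread_mutex_unlock(&mutex_ref_); }
 private:
  ScopedPthreadLock(const ScopedPthreadLock&);            // non-copyable
  ScopedPthreadLock& operator=(const ScopedPthreadLock&);
  pthread_mutex_t& mutex_ref_;
};

pthread_mutex_t gMutex = PTHREAD_MUTEX_INITIALIZER;   // invented example state
long gCounter = 0;

void increment() {
  ScopedPthreadLock guard(gMutex);   // locked here
  ++gCounter;
}                                    // unlocked when guard goes out of scope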
+ */ + class LockORC { + public: + explicit LockORC(pthread_mutex_t& mutex) : mutex_ref_(mutex) { + pthread_mutex_lock(&mutex_ref_); + } + ~LockORC() { pthread_mutex_unlock(&mutex_ref_); } + private: + // no default constructor + LockORC(); + // prohibit copying + LockORC(const LockORC&); + LockORC& operator=(const LockORC&); + + pthread_mutex_t& mutex_ref_; + }; + } + #define std::mutex pthread_mutex_t + #define std::lock_guard<std::mutex> LockORC +#else + #include <mutex> +#endif + +#ifdef NEEDS_Z_PREFIX +#define Z_PREFIX 1 +#endif + +namespace orc { + std::string to_string(double val); + std::string to_string(int64_t val); +} + #ifdef HAS_BUILTIN_OVERFLOW_CHECK #define multiplyExact !__builtin_mul_overflow #define addExact !__builtin_add_overflow @@ -204,8 +204,8 @@ namespace orc { } #endif -#ifndef HAS_CONSTEXPR -#define constexpr const -#endif - -#endif /* ADAPTER_HH */ +#ifndef HAS_CONSTEXPR +#define constexpr const +#endif + +#endif /* ADAPTER_HH */ diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.cc b/contrib/libs/apache/orc/c++/src/BloomFilter.cc index 8a1f1880e7..8ec0acda8c 100644 --- a/contrib/libs/apache/orc/c++/src/BloomFilter.cc +++ b/contrib/libs/apache/orc/c++/src/BloomFilter.cc @@ -1,328 +1,328 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
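Still in Adaptor.hh: under HAS_BUILTIN_OVERFLOW_CHECK, multiplyExact and addExact expand to the negated GCC/Clang builtins, so a call reads as "true means the exact result fits and was stored". A minimal sketch of that calling convention (needs GCC 5+ or a recent Clang; the operand values are made up):

#include <cstdint>
#include <iostream>

// Same mapping as the HAS_BUILTIN_OVERFLOW_CHECK branch above:
// __builtin_mul_overflow returns true on overflow, so the negation
// yields "true when the exact result fits".
#define multiplyExact !__builtin_mul_overflow
#define addExact      !__builtin_add_overflow

int main() {
  int64_t result = 0;
  if (multiplyExact(INT64_C(1) << 62, INT64_C(2), &result)) {
    std::cout << "fits: " << result << "\n";
  } else {
    std::cout << "int64 overflow detected\n";   // this branch is taken
  }
  if (addExact(INT64_C(40), INT64_C(2), &result)) {
    std::cout << "sum: " << result << "\n";     // prints 42
  }
  return 0;
}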
- */ - -#include "BloomFilter.hh" -#include "Murmur3.hh" - -namespace orc { - - constexpr uint64_t BITS_OF_LONG = 64; - constexpr uint8_t SHIFT_6_BITS = 6; - constexpr uint8_t SHIFT_3_BITS = 3; - - static bool isLittleEndian() { - static union { uint32_t i; char c[4]; } num = { 0x01020304 }; - return num.c[0] == 4; - } - - /** - * Implementation of BitSet - */ - BitSet::BitSet(uint64_t numBits) { - mData.resize(static_cast<size_t>(ceil( - static_cast<double>(numBits) / BITS_OF_LONG)), 0); - } - - BitSet::BitSet(const uint64_t * bits, uint64_t numBits) { - // caller should make sure numBits is multiple of 64 - mData.resize(numBits >> SHIFT_6_BITS, 0); - memcpy(mData.data(), bits, numBits >> SHIFT_3_BITS); - } - - void BitSet::set(uint64_t index) { - mData[index >> SHIFT_6_BITS] |= (1ULL << (index % BITS_OF_LONG)); - } - - bool BitSet::get(uint64_t index) { - return (mData[index >> SHIFT_6_BITS] & (1ULL << (index % BITS_OF_LONG))) != 0; - } - - uint64_t BitSet::bitSize() { - return mData.size() << SHIFT_6_BITS; - } - - void BitSet::merge(const BitSet& other) { - if (mData.size() != other.mData.size()) { - std::stringstream ss; - ss << "BitSet must be of equal length (" - << mData.size() << " != " << other.mData.size() << ")"; - throw std::logic_error(ss.str()); - } - - for (size_t i = 0; i != mData.size(); i++) { - mData[i] |= other.mData[i]; - } - } - - void BitSet::clear() { - memset(mData.data(), 0, sizeof(uint64_t) * mData.size()); - } - - const uint64_t * BitSet::getData() const { - return mData.data(); - } - - bool BitSet::operator==(const BitSet& other) const { - return mData == other.mData; - } - - /** - * Helper functions - */ - void checkArgument(bool expression, const std::string& message) { - if (!expression) { - throw std::logic_error(message); - } - } - - int32_t optimalNumOfHashFunctions(uint64_t expectedEntries, uint64_t numBits) { - double n = static_cast<double>(expectedEntries); - return std::max<int32_t>(1, static_cast<int32_t>( - std::round(static_cast<double>(numBits) / n * std::log(2.0)))); - } - - int32_t optimalNumOfBits(uint64_t expectedEntries, double fpp) { - double n = static_cast<double>(expectedEntries); - return static_cast<int32_t>(-n * std::log(fpp) / (std::log(2.0) * std::log(2.0))); - } - - // We use the trick mentioned in "Less Hashing, Same Performance: - // Building a Better Bloom Filter" by Kirsch et.al. 
From abstract - // 'only two hash functions are necessary to effectively implement - // a Bloom filter without any loss in the asymptotic false positive - // probability' - // Lets split up 64-bit hashcode into two 32-bit hash codes and employ - // the technique mentioned in the above paper - inline uint64_t getBytesHash(const char * data, int64_t length) { - if (data == nullptr) { - return Murmur3::NULL_HASHCODE; - } - - return Murmur3::hash64(reinterpret_cast<const uint8_t *>(data), - static_cast<uint32_t>(length)); - } - - /** - * Implementation of BloomFilter - */ - BloomFilterImpl::BloomFilterImpl(uint64_t expectedEntries, double fpp) { - checkArgument(expectedEntries > 0, - "expectedEntries should be > 0"); - checkArgument(fpp > 0.0 && fpp < 1.0, - "False positive probability should be > 0.0 & < 1.0"); - - uint64_t nb = static_cast<uint64_t>(optimalNumOfBits(expectedEntries, fpp)); - // make 'mNumBits' multiple of 64 - mNumBits = nb + (BITS_OF_LONG - (nb % BITS_OF_LONG)); - mNumHashFunctions = optimalNumOfHashFunctions(expectedEntries, mNumBits); - mBitSet.reset(new BitSet(mNumBits)); - } - - void BloomFilterImpl::addBytes(const char * data, int64_t length) { - uint64_t hash64 = getBytesHash(data, length); +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "BloomFilter.hh" +#include "Murmur3.hh" + +namespace orc { + + constexpr uint64_t BITS_OF_LONG = 64; + constexpr uint8_t SHIFT_6_BITS = 6; + constexpr uint8_t SHIFT_3_BITS = 3; + + static bool isLittleEndian() { + static union { uint32_t i; char c[4]; } num = { 0x01020304 }; + return num.c[0] == 4; + } + + /** + * Implementation of BitSet + */ + BitSet::BitSet(uint64_t numBits) { + mData.resize(static_cast<size_t>(ceil( + static_cast<double>(numBits) / BITS_OF_LONG)), 0); + } + + BitSet::BitSet(const uint64_t * bits, uint64_t numBits) { + // caller should make sure numBits is multiple of 64 + mData.resize(numBits >> SHIFT_6_BITS, 0); + memcpy(mData.data(), bits, numBits >> SHIFT_3_BITS); + } + + void BitSet::set(uint64_t index) { + mData[index >> SHIFT_6_BITS] |= (1ULL << (index % BITS_OF_LONG)); + } + + bool BitSet::get(uint64_t index) { + return (mData[index >> SHIFT_6_BITS] & (1ULL << (index % BITS_OF_LONG))) != 0; + } + + uint64_t BitSet::bitSize() { + return mData.size() << SHIFT_6_BITS; + } + + void BitSet::merge(const BitSet& other) { + if (mData.size() != other.mData.size()) { + std::stringstream ss; + ss << "BitSet must be of equal length (" + << mData.size() << " != " << other.mData.size() << ")"; + throw std::logic_error(ss.str()); + } + + for (size_t i = 0; i != mData.size(); i++) { + mData[i] |= other.mData[i]; + } + } + + void BitSet::clear() { + memset(mData.data(), 0, sizeof(uint64_t) * mData.size()); + } + + const uint64_t * BitSet::getData() const { + return mData.data(); + } + + bool BitSet::operator==(const BitSet& other) const { + return mData == other.mData; + } + + /** + * Helper functions + */ + void checkArgument(bool expression, const std::string& message) { + if (!expression) { + throw std::logic_error(message); + } + } + + int32_t optimalNumOfHashFunctions(uint64_t expectedEntries, uint64_t numBits) { + double n = static_cast<double>(expectedEntries); + return std::max<int32_t>(1, static_cast<int32_t>( + std::round(static_cast<double>(numBits) / n * std::log(2.0)))); + } + + int32_t optimalNumOfBits(uint64_t expectedEntries, double fpp) { + double n = static_cast<double>(expectedEntries); + return static_cast<int32_t>(-n * std::log(fpp) / (std::log(2.0) * std::log(2.0))); + } + + // We use the trick mentioned in "Less Hashing, Same Performance: + // Building a Better Bloom Filter" by Kirsch et.al. 
From abstract + // 'only two hash functions are necessary to effectively implement + // a Bloom filter without any loss in the asymptotic false positive + // probability' + // Lets split up 64-bit hashcode into two 32-bit hash codes and employ + // the technique mentioned in the above paper + inline uint64_t getBytesHash(const char * data, int64_t length) { + if (data == nullptr) { + return Murmur3::NULL_HASHCODE; + } + + return Murmur3::hash64(reinterpret_cast<const uint8_t *>(data), + static_cast<uint32_t>(length)); + } + + /** + * Implementation of BloomFilter + */ + BloomFilterImpl::BloomFilterImpl(uint64_t expectedEntries, double fpp) { + checkArgument(expectedEntries > 0, + "expectedEntries should be > 0"); + checkArgument(fpp > 0.0 && fpp < 1.0, + "False positive probability should be > 0.0 & < 1.0"); + + uint64_t nb = static_cast<uint64_t>(optimalNumOfBits(expectedEntries, fpp)); + // make 'mNumBits' multiple of 64 + mNumBits = nb + (BITS_OF_LONG - (nb % BITS_OF_LONG)); + mNumHashFunctions = optimalNumOfHashFunctions(expectedEntries, mNumBits); + mBitSet.reset(new BitSet(mNumBits)); + } + + void BloomFilterImpl::addBytes(const char * data, int64_t length) { + uint64_t hash64 = getBytesHash(data, length); addHash(static_cast<int64_t>(hash64)); - } - - void BloomFilterImpl::addLong(int64_t data) { + } + + void BloomFilterImpl::addLong(int64_t data) { addHash(getLongHash(data)); - } - - bool BloomFilterImpl::testBytes(const char * data, int64_t length) const { - uint64_t hash64 = getBytesHash(data, length); + } + + bool BloomFilterImpl::testBytes(const char * data, int64_t length) const { + uint64_t hash64 = getBytesHash(data, length); return testHash(static_cast<int64_t>(hash64)); - } - - bool BloomFilterImpl::testLong(int64_t data) const { + } + + bool BloomFilterImpl::testLong(int64_t data) const { return testHash(getLongHash(data)); - } - - uint64_t BloomFilterImpl::sizeInBytes() const { - return getBitSize() >> SHIFT_3_BITS; - } - - uint64_t BloomFilterImpl::getBitSize() const { - return mBitSet->bitSize(); - } - - int32_t BloomFilterImpl::getNumHashFunctions() const { - return mNumHashFunctions; - } - - DIAGNOSTIC_PUSH - -#if defined(__clang__) - DIAGNOSTIC_IGNORE("-Wundefined-reinterpret-cast") -#endif - -#if defined(__GNUC__) - DIAGNOSTIC_IGNORE("-Wstrict-aliasing") -#endif - - // caller should make sure input proto::BloomFilter is valid since - // no check will be performed in the following constructor - BloomFilterImpl::BloomFilterImpl(const proto::BloomFilter& bloomFilter) { - mNumHashFunctions = static_cast<int32_t>(bloomFilter.numhashfunctions()); - - const std::string& bitsetStr = bloomFilter.utf8bitset(); - mNumBits = bitsetStr.size() << SHIFT_3_BITS; - checkArgument(mNumBits % BITS_OF_LONG == 0, "numBits should be multiple of 64!"); - - const uint64_t * bitset = reinterpret_cast<const uint64_t *>(bitsetStr.data()); - if (isLittleEndian()) { - mBitSet.reset(new BitSet(bitset, mNumBits)); - } else { - std::vector<uint64_t> longs(mNumBits >> SHIFT_6_BITS); - for (size_t i = 0; i != longs.size(); ++i) { - // convert little-endian to big-endian - const uint64_t src = bitset[i]; - uint64_t& dst = longs[i]; - for (size_t bit = 0; bit != 64; bit += 8) { - dst |= (((src & (0xFFu << bit)) >> bit) << (56 - bit)); - } - } - - mBitSet.reset(new BitSet(longs.data(), mNumBits)); - } - } - - void BloomFilterImpl::addDouble(double data) { - addLong(reinterpret_cast<int64_t&>(data)); - } - - bool BloomFilterImpl::testDouble(double data) const{ - return 
testLong(reinterpret_cast<int64_t&>(data)); - } - - DIAGNOSTIC_POP - + } + + uint64_t BloomFilterImpl::sizeInBytes() const { + return getBitSize() >> SHIFT_3_BITS; + } + + uint64_t BloomFilterImpl::getBitSize() const { + return mBitSet->bitSize(); + } + + int32_t BloomFilterImpl::getNumHashFunctions() const { + return mNumHashFunctions; + } + + DIAGNOSTIC_PUSH + +#if defined(__clang__) + DIAGNOSTIC_IGNORE("-Wundefined-reinterpret-cast") +#endif + +#if defined(__GNUC__) + DIAGNOSTIC_IGNORE("-Wstrict-aliasing") +#endif + + // caller should make sure input proto::BloomFilter is valid since + // no check will be performed in the following constructor + BloomFilterImpl::BloomFilterImpl(const proto::BloomFilter& bloomFilter) { + mNumHashFunctions = static_cast<int32_t>(bloomFilter.numhashfunctions()); + + const std::string& bitsetStr = bloomFilter.utf8bitset(); + mNumBits = bitsetStr.size() << SHIFT_3_BITS; + checkArgument(mNumBits % BITS_OF_LONG == 0, "numBits should be multiple of 64!"); + + const uint64_t * bitset = reinterpret_cast<const uint64_t *>(bitsetStr.data()); + if (isLittleEndian()) { + mBitSet.reset(new BitSet(bitset, mNumBits)); + } else { + std::vector<uint64_t> longs(mNumBits >> SHIFT_6_BITS); + for (size_t i = 0; i != longs.size(); ++i) { + // convert little-endian to big-endian + const uint64_t src = bitset[i]; + uint64_t& dst = longs[i]; + for (size_t bit = 0; bit != 64; bit += 8) { + dst |= (((src & (0xFFu << bit)) >> bit) << (56 - bit)); + } + } + + mBitSet.reset(new BitSet(longs.data(), mNumBits)); + } + } + + void BloomFilterImpl::addDouble(double data) { + addLong(reinterpret_cast<int64_t&>(data)); + } + + bool BloomFilterImpl::testDouble(double data) const{ + return testLong(reinterpret_cast<int64_t&>(data)); + } + + DIAGNOSTIC_POP + void BloomFilterImpl::addHash(int64_t hash64) { - int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff); + int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff); // In Java codes, we use "hash64 >>> 32" which is an unsigned shift op. // So we cast hash64 to uint64_t here for an unsigned right shift. int32_t hash2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32); - - for (int32_t i = 1; i <= mNumHashFunctions; ++i) { - int32_t combinedHash = hash1 + i * hash2; - // hashcode should be positive, flip all the bits if it's negative - if (combinedHash < 0) { - combinedHash = ~combinedHash; - } - uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits; - mBitSet->set(pos); - } - } - + + for (int32_t i = 1; i <= mNumHashFunctions; ++i) { + int32_t combinedHash = hash1 + i * hash2; + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits; + mBitSet->set(pos); + } + } + bool BloomFilterImpl::testHash(int64_t hash64) const{ - int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff); + int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff); // In Java codes, we use "hash64 >>> 32" which is an unsigned shift op. // So we cast hash64 to uint64_t here for an unsigned right shift. 
int32_t hash2 = static_cast<int32_t>(static_cast<uint64_t>(hash64) >> 32); - - for (int32_t i = 1; i <= mNumHashFunctions; ++i) { - int32_t combinedHash = hash1 + i * hash2; - // hashcode should be positive, flip all the bits if it's negative - if (combinedHash < 0) { - combinedHash = ~combinedHash; - } - uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits; - if (!mBitSet->get(pos)) { - return false; - } - } - return true; - } - - void BloomFilterImpl::merge(const BloomFilterImpl& other) { - if (mNumBits != other.mNumBits || mNumHashFunctions != other.mNumHashFunctions) { - std::stringstream ss; - ss << "BloomFilters are not compatible for merging: " - << "this: numBits:" << mNumBits - << ",numHashFunctions:" << mNumHashFunctions - << ", that: numBits:" << other.mNumBits - << ",numHashFunctions:" << other.mNumHashFunctions; - throw std::logic_error(ss.str()); - } - - mBitSet->merge(*other.mBitSet); - } - - void BloomFilterImpl::reset() { - mBitSet->clear(); - } - - void BloomFilterImpl::serialize(proto::BloomFilter& bloomFilter) const { - bloomFilter.set_numhashfunctions(static_cast<uint32_t>(mNumHashFunctions)); - - // According to ORC standard, the encoding is a sequence of bytes with - // a little endian encoding in the utf8bitset field. - if (isLittleEndian()) { - // bytes are already organized in little endian; thus no conversion needed - const char * bitset = reinterpret_cast<const char *>(mBitSet->getData()); - bloomFilter.set_utf8bitset(bitset, sizeInBytes()); - } else { - std::vector<uint64_t> bitset(sizeInBytes() / sizeof(uint64_t), 0); - const uint64_t * longs = mBitSet->getData(); - for (size_t i = 0; i != bitset.size(); ++i) { - uint64_t& dst = bitset[i]; - const uint64_t src = longs[i]; - // convert big-endian to little-endian - for (size_t bit = 0; bit != 64; bit += 8) { - dst |= (((src & (0xFFu << bit)) >> bit) << (56 - bit)); - } - } - bloomFilter.set_utf8bitset(bitset.data(), sizeInBytes()); - } - } - - bool BloomFilterImpl::operator==(const BloomFilterImpl& other) const { - return mNumBits == other.mNumBits && - mNumHashFunctions == other.mNumHashFunctions && - *mBitSet == *other.mBitSet; - } - - BloomFilter::~BloomFilter() { - // PASS - } - - std::unique_ptr<BloomFilter> BloomFilterUTF8Utils::deserialize( - const proto::Stream_Kind& streamKind, - const proto::ColumnEncoding& encoding, - const proto::BloomFilter& bloomFilter) { - - std::unique_ptr<BloomFilter> ret(nullptr); - - // only BLOOM_FILTER_UTF8 is supported - if (streamKind != proto::Stream_Kind_BLOOM_FILTER_UTF8) { - return ret; - } - - // make sure we don't use unknown encodings or original timestamp encodings - if (!encoding.has_bloomencoding() || encoding.bloomencoding() != 1) { - return ret; - } - - // make sure all required fields exist - if (!bloomFilter.has_numhashfunctions() || !bloomFilter.has_utf8bitset()) { - return ret; - } - - ret.reset(new BloomFilterImpl(bloomFilter)); - return ret; - } - -} + + for (int32_t i = 1; i <= mNumHashFunctions; ++i) { + int32_t combinedHash = hash1 + i * hash2; + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + uint64_t pos = static_cast<uint64_t>(combinedHash) % mNumBits; + if (!mBitSet->get(pos)) { + return false; + } + } + return true; + } + + void BloomFilterImpl::merge(const BloomFilterImpl& other) { + if (mNumBits != other.mNumBits || mNumHashFunctions != other.mNumHashFunctions) { + std::stringstream ss; + ss << "BloomFilters are not compatible for merging: " + << "this: 
numBits:" << mNumBits + << ",numHashFunctions:" << mNumHashFunctions + << ", that: numBits:" << other.mNumBits + << ",numHashFunctions:" << other.mNumHashFunctions; + throw std::logic_error(ss.str()); + } + + mBitSet->merge(*other.mBitSet); + } + + void BloomFilterImpl::reset() { + mBitSet->clear(); + } + + void BloomFilterImpl::serialize(proto::BloomFilter& bloomFilter) const { + bloomFilter.set_numhashfunctions(static_cast<uint32_t>(mNumHashFunctions)); + + // According to ORC standard, the encoding is a sequence of bytes with + // a little endian encoding in the utf8bitset field. + if (isLittleEndian()) { + // bytes are already organized in little endian; thus no conversion needed + const char * bitset = reinterpret_cast<const char *>(mBitSet->getData()); + bloomFilter.set_utf8bitset(bitset, sizeInBytes()); + } else { + std::vector<uint64_t> bitset(sizeInBytes() / sizeof(uint64_t), 0); + const uint64_t * longs = mBitSet->getData(); + for (size_t i = 0; i != bitset.size(); ++i) { + uint64_t& dst = bitset[i]; + const uint64_t src = longs[i]; + // convert big-endian to little-endian + for (size_t bit = 0; bit != 64; bit += 8) { + dst |= (((src & (0xFFu << bit)) >> bit) << (56 - bit)); + } + } + bloomFilter.set_utf8bitset(bitset.data(), sizeInBytes()); + } + } + + bool BloomFilterImpl::operator==(const BloomFilterImpl& other) const { + return mNumBits == other.mNumBits && + mNumHashFunctions == other.mNumHashFunctions && + *mBitSet == *other.mBitSet; + } + + BloomFilter::~BloomFilter() { + // PASS + } + + std::unique_ptr<BloomFilter> BloomFilterUTF8Utils::deserialize( + const proto::Stream_Kind& streamKind, + const proto::ColumnEncoding& encoding, + const proto::BloomFilter& bloomFilter) { + + std::unique_ptr<BloomFilter> ret(nullptr); + + // only BLOOM_FILTER_UTF8 is supported + if (streamKind != proto::Stream_Kind_BLOOM_FILTER_UTF8) { + return ret; + } + + // make sure we don't use unknown encodings or original timestamp encodings + if (!encoding.has_bloomencoding() || encoding.bloomencoding() != 1) { + return ret; + } + + // make sure all required fields exist + if (!bloomFilter.has_numhashfunctions() || !bloomFilter.has_utf8bitset()) { + return ret; + } + + ret.reset(new BloomFilterImpl(bloomFilter)); + return ret; + } + +} diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.hh b/contrib/libs/apache/orc/c++/src/BloomFilter.hh index cf18a46fd9..ab2006bdae 100644 --- a/contrib/libs/apache/orc/c++/src/BloomFilter.hh +++ b/contrib/libs/apache/orc/c++/src/BloomFilter.hh @@ -1,197 +1,197 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_BLOOMFILTER_IMPL_HH -#define ORC_BLOOMFILTER_IMPL_HH - -#include "orc/BloomFilter.hh" -#include "wrap/orc-proto-wrapper.hh" - -#include <cmath> -#include <sstream> -#include <vector> - -namespace orc { - - /** - * Bare metal bit set implementation. For performance reasons, this implementation does not check - * for index bounds nor expand the bit set size if the specified index is greater than the size. - */ - class BitSet { - public: - /** - * Creates an empty BitSet - * - * @param numBits - number of bits used - */ - BitSet(uint64_t numBits); - - /** - * Creates BitSet from serialized uint64_t buffer - * - * @param bits - serialized uint64_t buffer of bitset - * @param numBits - number of bits used - */ - BitSet(const uint64_t * bits, uint64_t numBits); - - /** - * Sets the bit at specified index. - * - * @param index - position - */ - void set(uint64_t index); - - /** - * Returns true if the bit is set in the specified index. - * - * @param index - position - * @return - value at the bit position - */ - bool get(uint64_t index); - - /** - * Number of bits - */ - uint64_t bitSize(); - - /** - * Combines the two BitSets using bitwise OR. - */ - void merge(const BitSet& other); - - /** - * Clears the bit set. - */ - void clear(); - - /** - * Gets underlying raw data - */ - const uint64_t * getData() const; - - /** - * Compares two BitSets - */ - bool operator==(const BitSet& other) const; - - private: - std::vector<uint64_t> mData; - }; - - /** - * BloomFilter is a probabilistic data structure for set membership check. - * BloomFilters are highly space efficient when compared to using a HashSet. - * Because of the probabilistic nature of bloom filter false positive (element - * not present in bloom filter but test() says true) are possible but false - * negatives are not possible (if element is present then test() will never - * say false). The false positive probability is configurable (default: 5%) - * depending on which storage requirement may increase or decrease. Lower the - * false positive probability greater is the space requirement. - * - * Bloom filters are sensitive to number of elements that will be inserted in - * the bloom filter. During the creation of bloom filter expected number of - * entries must be specified. If the number of insertions exceed the specified - * initial number of entries then false positive probability will increase - * accordingly. - * - * Internally, this implementation of bloom filter uses Murmur3 fast - * non-cryptographic hash algorithm. Although Murmur2 is slightly faster than - * Murmur3 in Java, it suffers from hash collisions for specific sequence of - * repeating bytes. Check the following link for more info - * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw - * - * Note that this class is here for backwards compatibility, because it uses - * the JVM default character set for strings. All new users should - * BloomFilterUtf8, which always uses UTF8 for the encoding. 
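The sizing helpers earlier in BloomFilter.cc (optimalNumOfBits, optimalNumOfHashFunctions) apply the standard formulas m = -n*ln(p)/(ln 2)^2 and k = max(1, round(m/n * ln 2)). With the default false positive probability of 5% mentioned in the comment above, that works out to roughly 6.24 bits per expected entry and 4 hash functions; the small program below reproduces those numbers for an invented n of 10,000 entries.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t expectedEntries = 10000;   // invented example value
  const double fpp = 0.05;                  // DEFAULT_FPP documented above

  const double n = static_cast<double>(expectedEntries);
  const double bits = -n * std::log(fpp) / (std::log(2.0) * std::log(2.0));
  const int32_t k = std::max<int32_t>(
      1, static_cast<int32_t>(std::round(bits / n * std::log(2.0))));

  std::cout << "bits ~ " << static_cast<uint64_t>(bits)    // ~62352
            << ", bits/entry ~ " << bits / n               // ~6.24
            << ", hash functions = " << k << "\n";         // 4
  return 0;
}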
- */ - class BloomFilterImpl : public BloomFilter { - public: - /** - * Creates an empty BloomFilter - * - * @param expectedEntries - number of entries it will hold - * @param fpp - false positive probability - */ - BloomFilterImpl(uint64_t expectedEntries, double fpp=DEFAULT_FPP); - - /** - * Creates a BloomFilter by deserializing the proto-buf version - * - * caller should make sure input proto::BloomFilter is valid - */ - BloomFilterImpl(const proto::BloomFilter& bloomFilter); - - /** - * Adds a new element to the BloomFilter - */ - void addBytes(const char * data, int64_t length); - void addLong(int64_t data); - void addDouble(double data); - - /** - * Test if the element exists in BloomFilter - */ - bool testBytes(const char * data, int64_t length) const override; - bool testLong(int64_t data) const override; - bool testDouble(double data) const override; - - uint64_t sizeInBytes() const; - uint64_t getBitSize() const; - int32_t getNumHashFunctions() const; - - void merge(const BloomFilterImpl& other); - - void reset(); - - bool operator==(const BloomFilterImpl& other) const; - - private: - friend struct BloomFilterUTF8Utils; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_BLOOMFILTER_IMPL_HH +#define ORC_BLOOMFILTER_IMPL_HH + +#include "orc/BloomFilter.hh" +#include "wrap/orc-proto-wrapper.hh" + +#include <cmath> +#include <sstream> +#include <vector> + +namespace orc { + + /** + * Bare metal bit set implementation. For performance reasons, this implementation does not check + * for index bounds nor expand the bit set size if the specified index is greater than the size. + */ + class BitSet { + public: + /** + * Creates an empty BitSet + * + * @param numBits - number of bits used + */ + BitSet(uint64_t numBits); + + /** + * Creates BitSet from serialized uint64_t buffer + * + * @param bits - serialized uint64_t buffer of bitset + * @param numBits - number of bits used + */ + BitSet(const uint64_t * bits, uint64_t numBits); + + /** + * Sets the bit at specified index. + * + * @param index - position + */ + void set(uint64_t index); + + /** + * Returns true if the bit is set in the specified index. + * + * @param index - position + * @return - value at the bit position + */ + bool get(uint64_t index); + + /** + * Number of bits + */ + uint64_t bitSize(); + + /** + * Combines the two BitSets using bitwise OR. + */ + void merge(const BitSet& other); + + /** + * Clears the bit set. + */ + void clear(); + + /** + * Gets underlying raw data + */ + const uint64_t * getData() const; + + /** + * Compares two BitSets + */ + bool operator==(const BitSet& other) const; + + private: + std::vector<uint64_t> mData; + }; + + /** + * BloomFilter is a probabilistic data structure for set membership check. 
+ * BloomFilters are highly space efficient when compared to using a HashSet. + * Because of the probabilistic nature of bloom filter false positive (element + * not present in bloom filter but test() says true) are possible but false + * negatives are not possible (if element is present then test() will never + * say false). The false positive probability is configurable (default: 5%) + * depending on which storage requirement may increase or decrease. Lower the + * false positive probability greater is the space requirement. + * + * Bloom filters are sensitive to number of elements that will be inserted in + * the bloom filter. During the creation of bloom filter expected number of + * entries must be specified. If the number of insertions exceed the specified + * initial number of entries then false positive probability will increase + * accordingly. + * + * Internally, this implementation of bloom filter uses Murmur3 fast + * non-cryptographic hash algorithm. Although Murmur2 is slightly faster than + * Murmur3 in Java, it suffers from hash collisions for specific sequence of + * repeating bytes. Check the following link for more info + * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw + * + * Note that this class is here for backwards compatibility, because it uses + * the JVM default character set for strings. All new users should + * BloomFilterUtf8, which always uses UTF8 for the encoding. + */ + class BloomFilterImpl : public BloomFilter { + public: + /** + * Creates an empty BloomFilter + * + * @param expectedEntries - number of entries it will hold + * @param fpp - false positive probability + */ + BloomFilterImpl(uint64_t expectedEntries, double fpp=DEFAULT_FPP); + + /** + * Creates a BloomFilter by deserializing the proto-buf version + * + * caller should make sure input proto::BloomFilter is valid + */ + BloomFilterImpl(const proto::BloomFilter& bloomFilter); + + /** + * Adds a new element to the BloomFilter + */ + void addBytes(const char * data, int64_t length); + void addLong(int64_t data); + void addDouble(double data); + + /** + * Test if the element exists in BloomFilter + */ + bool testBytes(const char * data, int64_t length) const override; + bool testLong(int64_t data) const override; + bool testDouble(double data) const override; + + uint64_t sizeInBytes() const; + uint64_t getBitSize() const; + int32_t getNumHashFunctions() const; + + void merge(const BloomFilterImpl& other); + + void reset(); + + bool operator==(const BloomFilterImpl& other) const; + + private: + friend struct BloomFilterUTF8Utils; friend class TestBloomFilter_testBloomFilterBasicOperations_Test; - - // compute k hash values from hash64 and set bits + + // compute k hash values from hash64 and set bits void addHash(int64_t hash64); - - // compute k hash values from hash64 and check bits + + // compute k hash values from hash64 and check bits bool testHash(int64_t hash64) const; - - void serialize(proto::BloomFilter& bloomFilter) const; - - private: - static constexpr double DEFAULT_FPP = 0.05; - uint64_t mNumBits; - int32_t mNumHashFunctions; - std::unique_ptr<BitSet> mBitSet; - }; - - struct BloomFilterUTF8Utils { - // serialize BloomFilter in protobuf - static void serialize(const BloomFilterImpl& in, proto::BloomFilter& out) { - in.serialize(out); - } - - // deserialize BloomFilter from protobuf - static std::unique_ptr<BloomFilter> - deserialize(const proto::Stream_Kind& streamKind, - const proto::ColumnEncoding& columnEncoding, - const proto::BloomFilter& bloomFilter); - }; - + + void 
serialize(proto::BloomFilter& bloomFilter) const; + + private: + static constexpr double DEFAULT_FPP = 0.05; + uint64_t mNumBits; + int32_t mNumHashFunctions; + std::unique_ptr<BitSet> mBitSet; + }; + + struct BloomFilterUTF8Utils { + // serialize BloomFilter in protobuf + static void serialize(const BloomFilterImpl& in, proto::BloomFilter& out) { + in.serialize(out); + } + + // deserialize BloomFilter from protobuf + static std::unique_ptr<BloomFilter> + deserialize(const proto::Stream_Kind& streamKind, + const proto::ColumnEncoding& columnEncoding, + const proto::BloomFilter& bloomFilter); + }; + // Thomas Wang's integer hash function // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm // Put this in header file so tests can use it as well. @@ -205,6 +205,6 @@ namespace orc { key = key + (key << 31); return key; } -} - -#endif //ORC_BLOOMFILTER_IMPL_HH +} + +#endif //ORC_BLOOMFILTER_IMPL_HH diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.cc b/contrib/libs/apache/orc/c++/src/ByteRLE.cc index ee1a4575dc..30f5148b7c 100644 --- a/contrib/libs/apache/orc/c++/src/ByteRLE.cc +++ b/contrib/libs/apache/orc/c++/src/ByteRLE.cc @@ -1,626 +1,626 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <algorithm> -#include <iostream> -#include <string.h> -#include <utility> - -#include "ByteRLE.hh" -#include "orc/Exceptions.hh" - -namespace orc { - - const int MINIMUM_REPEAT = 3; - const int MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT; - const int MAX_LITERAL_SIZE = 128; - - ByteRleEncoder::~ByteRleEncoder() { - // PASS - } - - class ByteRleEncoderImpl : public ByteRleEncoder { - public: - ByteRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output); - virtual ~ByteRleEncoderImpl() override; - - /** - * Encode the next batch of values. - * @param data to be encoded - * @param numValues the number of values to be encoded - * @param notNull If the pointer is null, all values are read. If the - * pointer is not null, positions that are false are skipped. - */ - virtual void add(const char* data, uint64_t numValues, - const char* notNull) override; - - /** - * Get size of buffer used so far. - */ - virtual uint64_t getBufferSize() const override; - - /** - * Flush underlying BufferedOutputStream. 
- */ - virtual uint64_t flush() override; - - virtual void recordPosition(PositionRecorder* recorder) const override; - - protected: - std::unique_ptr<BufferedOutputStream> outputStream; - char* literals; - int numLiterals; - bool repeat; - int tailRunLength; - int bufferPosition; - int bufferLength; - char* buffer; - - void writeByte(char c); - void writeValues(); - void write(char c); - }; - - ByteRleEncoderImpl::ByteRleEncoderImpl( - std::unique_ptr<BufferedOutputStream> output) - : outputStream(std::move(output)) { - literals = new char[MAX_LITERAL_SIZE]; - numLiterals = 0; - tailRunLength = 0; - repeat = false; - bufferPosition = 0; - bufferLength = 0; - buffer = nullptr; - } - - ByteRleEncoderImpl::~ByteRleEncoderImpl() { - // PASS - delete [] literals; - } - - void ByteRleEncoderImpl::writeByte(char c) { - if (bufferPosition == bufferLength) { - int addedSize = 0; - if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) { - throw std::bad_alloc(); - } - bufferPosition = 0; - bufferLength = addedSize; - } - buffer[bufferPosition++] = c; - } - - void ByteRleEncoderImpl::add( - const char* data, - uint64_t numValues, - const char* notNull) { - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - write(data[i]); - } - } - } - - void ByteRleEncoderImpl::writeValues() { - if (numLiterals != 0) { - if (repeat) { - writeByte( - static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT))); - writeByte(literals[0]); - } else { - writeByte(static_cast<char>(-numLiterals)); - for (int i = 0; i < numLiterals; ++i) { - writeByte(literals[i]); - } - } - repeat = false; - tailRunLength = 0; - numLiterals = 0; - } - } - - uint64_t ByteRleEncoderImpl::flush() { - writeValues(); - outputStream->BackUp(bufferLength - bufferPosition); - uint64_t dataSize = outputStream->flush(); - bufferLength = bufferPosition = 0; - return dataSize; - } - - void ByteRleEncoderImpl::write(char value) { - if (numLiterals == 0) { - literals[numLiterals++] = value; - tailRunLength = 1; - } else if (repeat) { - if (value == literals[0]) { - numLiterals += 1; - if (numLiterals == MAXIMUM_REPEAT) { - writeValues(); - } - } else { - writeValues(); - literals[numLiterals++] = value; - tailRunLength = 1; - } - } else { - if (value == literals[numLiterals - 1]) { - tailRunLength += 1; - } else { - tailRunLength = 1; - } - if (tailRunLength == MINIMUM_REPEAT) { - if (numLiterals + 1 == MINIMUM_REPEAT) { - repeat = true; - numLiterals += 1; - } else { - numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1); - writeValues(); - literals[0] = value; - repeat = true; - numLiterals = MINIMUM_REPEAT; - } - } else { - literals[numLiterals++] = value; - if (numLiterals == MAX_LITERAL_SIZE) { - writeValues(); - } - } - } - } - - uint64_t ByteRleEncoderImpl::getBufferSize() const { - return outputStream->getSize(); - } - - void ByteRleEncoderImpl::recordPosition(PositionRecorder *recorder) const { - uint64_t flushedSize = outputStream->getSize(); - uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition); - if (outputStream->isCompressed()) { - // start of the compression chunk in the stream - recorder->add(flushedSize); - // number of decompressed bytes that need to be consumed - recorder->add(unflushedSize); - } else { - flushedSize -= static_cast<uint64_t>(bufferLength); - // byte offset of the RLE run’s start location - recorder->add(flushedSize + unflushedSize); - } - recorder->add(static_cast<uint64_t>(numLiterals)); - } - - std::unique_ptr<ByteRleEncoder> createByteRleEncoder - 
(std::unique_ptr<BufferedOutputStream> output) { - return std::unique_ptr<ByteRleEncoder>(new ByteRleEncoderImpl - (std::move(output))); - } - - class BooleanRleEncoderImpl : public ByteRleEncoderImpl { - public: - BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output); - virtual ~BooleanRleEncoderImpl() override; - - /** - * Encode the next batch of values - * @param data to be encoded - * @param numValues the number of values to be encoded - * @param notNull If the pointer is null, all values are read. If the - * pointer is not null, positions that are false are skipped. - */ - virtual void add(const char* data, uint64_t numValues, - const char* notNull) override; - - /** - * Flushing underlying BufferedOutputStream - */ - virtual uint64_t flush() override; - - virtual void recordPosition(PositionRecorder* recorder) const override; - - private: - int bitsRemained; - char current; - - }; - - BooleanRleEncoderImpl::BooleanRleEncoderImpl( - std::unique_ptr<BufferedOutputStream> output) - : ByteRleEncoderImpl(std::move(output)) { - bitsRemained = 8; - current = static_cast<char>(0); - } - - BooleanRleEncoderImpl::~BooleanRleEncoderImpl() { - // PASS - } - - void BooleanRleEncoderImpl::add( - const char* data, - uint64_t numValues, - const char* notNull) { - for (uint64_t i = 0; i < numValues; ++i) { - if (bitsRemained == 0) { - write(current); - current = static_cast<char>(0); - bitsRemained = 8; - } - if (!notNull || notNull[i]) { - if (!data || data[i]) { - current = - static_cast<char>(current | (0x80 >> (8 - bitsRemained))); - } - --bitsRemained; - } - } - if (bitsRemained == 0) { - write(current); - current = static_cast<char>(0); - bitsRemained = 8; - } - } - - uint64_t BooleanRleEncoderImpl::flush() { - if (bitsRemained != 8) { - write(current); - } - bitsRemained = 8; - current = static_cast<char>(0); - return ByteRleEncoderImpl::flush(); - } - - void BooleanRleEncoderImpl::recordPosition(PositionRecorder* recorder) const { - ByteRleEncoderImpl::recordPosition(recorder); - recorder->add(static_cast<uint64_t>(8 - bitsRemained)); - } - - std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder - (std::unique_ptr<BufferedOutputStream> output) { - BooleanRleEncoderImpl* encoder = - new BooleanRleEncoderImpl(std::move(output)) ; - return std::unique_ptr<ByteRleEncoder>( - reinterpret_cast<ByteRleEncoder*>(encoder)); - } - - ByteRleDecoder::~ByteRleDecoder() { - // PASS - } - - class ByteRleDecoderImpl: public ByteRleDecoder { - public: - ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input); - - virtual ~ByteRleDecoderImpl(); - - /** - * Seek to a particular spot. - */ - virtual void seek(PositionProvider&); - - /** - * Seek over a given number of values. - */ - virtual void skip(uint64_t numValues); - - /** - * Read a number of values into the batch. 
- */ - virtual void next(char* data, uint64_t numValues, char* notNull); - - protected: - inline void nextBuffer(); - inline signed char readByte(); - inline void readHeader(); - - std::unique_ptr<SeekableInputStream> inputStream; - size_t remainingValues; - char value; - const char* bufferStart; - const char* bufferEnd; - bool repeating; - }; - - void ByteRleDecoderImpl::nextBuffer() { - int bufferLength; - const void* bufferPointer; - bool result = inputStream->Next(&bufferPointer, &bufferLength); - if (!result) { - throw ParseError("bad read in nextBuffer"); - } - bufferStart = static_cast<const char*>(bufferPointer); - bufferEnd = bufferStart + bufferLength; - } - - signed char ByteRleDecoderImpl::readByte() { - if (bufferStart == bufferEnd) { - nextBuffer(); - } - return *(bufferStart++); - } - - void ByteRleDecoderImpl::readHeader() { - signed char ch = readByte(); - if (ch < 0) { - remainingValues = static_cast<size_t>(-ch); - repeating = false; - } else { - remainingValues = static_cast<size_t>(ch) + MINIMUM_REPEAT; - repeating = true; - value = readByte(); - } - } - - ByteRleDecoderImpl::ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> - input) { - inputStream = std::move(input); - repeating = false; - remainingValues = 0; - value = 0; - bufferStart = nullptr; - bufferEnd = nullptr; - } - - ByteRleDecoderImpl::~ByteRleDecoderImpl() { - // PASS - } - - void ByteRleDecoderImpl::seek(PositionProvider& location) { - // move the input stream - inputStream->seek(location); - // force a re-read from the stream - bufferEnd = bufferStart; - // read a new header - readHeader(); - // skip ahead the given number of records - ByteRleDecoderImpl::skip(location.next()); - } - - void ByteRleDecoderImpl::skip(uint64_t numValues) { - while (numValues > 0) { - if (remainingValues == 0) { - readHeader(); - } - size_t count = std::min(static_cast<size_t>(numValues), remainingValues); - remainingValues -= count; - numValues -= count; - // for literals we need to skip over count bytes, which may involve - // reading from the underlying stream - if (!repeating) { - size_t consumedBytes = count; - while (consumedBytes > 0) { - if (bufferStart == bufferEnd) { - nextBuffer(); - } - size_t skipSize = std::min(static_cast<size_t>(consumedBytes), - static_cast<size_t>(bufferEnd - - bufferStart)); - bufferStart += skipSize; - consumedBytes -= skipSize; - } - } - } - } - - void ByteRleDecoderImpl::next(char* data, uint64_t numValues, - char* notNull) { - uint64_t position = 0; - // skip over null values - while (notNull && position < numValues && !notNull[position]) { - position += 1; - } - while (position < numValues) { - // if we are out of values, read more - if (remainingValues == 0) { - readHeader(); - } - // how many do we read out of this block? 
- size_t count = std::min(static_cast<size_t>(numValues - position), - remainingValues); - uint64_t consumed = 0; - if (repeating) { - if (notNull) { - for(uint64_t i=0; i < count; ++i) { - if (notNull[position + i]) { - data[position + i] = value; - consumed += 1; - } - } - } else { - memset(data + position, value, count); - consumed = count; - } - } else { - if (notNull) { - for(uint64_t i=0; i < count; ++i) { - if (notNull[position + i]) { - data[position + i] = readByte(); - consumed += 1; - } - } - } else { - uint64_t i = 0; - while (i < count) { - if (bufferStart == bufferEnd) { - nextBuffer(); - } - uint64_t copyBytes = - std::min(static_cast<uint64_t>(count - i), - static_cast<uint64_t>(bufferEnd - bufferStart)); - memcpy(data + position + i, bufferStart, copyBytes); - bufferStart += copyBytes; - i += copyBytes; - } - consumed = count; - } - } - remainingValues -= consumed; - position += count; - // skip over any null values - while (notNull && position < numValues && !notNull[position]) { - position += 1; - } - } - } - - std::unique_ptr<ByteRleDecoder> createByteRleDecoder - (std::unique_ptr<SeekableInputStream> input) { - return std::unique_ptr<ByteRleDecoder>(new ByteRleDecoderImpl - (std::move(input))); - } - - class BooleanRleDecoderImpl: public ByteRleDecoderImpl { - public: - BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input); - - virtual ~BooleanRleDecoderImpl(); - - /** - * Seek to a particular spot. - */ - virtual void seek(PositionProvider&); - - /** - * Seek over a given number of values. - */ - virtual void skip(uint64_t numValues); - - /** - * Read a number of values into the batch. - */ - virtual void next(char* data, uint64_t numValues, char* notNull); - - protected: - size_t remainingBits; - char lastByte; - }; - - BooleanRleDecoderImpl::BooleanRleDecoderImpl - (std::unique_ptr<SeekableInputStream> input - ): ByteRleDecoderImpl(std::move(input)) { - remainingBits = 0; - lastByte = 0; - } - - BooleanRleDecoderImpl::~BooleanRleDecoderImpl() { - // PASS - } - - void BooleanRleDecoderImpl::seek(PositionProvider& location) { - ByteRleDecoderImpl::seek(location); - uint64_t consumed = location.next(); - remainingBits = 0; - if (consumed > 8) { - throw ParseError("bad position"); - } - if (consumed != 0) { - remainingBits = 8 - consumed; - ByteRleDecoderImpl::next(&lastByte, 1, nullptr); - } - } - - void BooleanRleDecoderImpl::skip(uint64_t numValues) { - if (numValues <= remainingBits) { - remainingBits -= numValues; - } else { - numValues -= remainingBits; - uint64_t bytesSkipped = numValues / 8; - ByteRleDecoderImpl::skip(bytesSkipped); - if (numValues % 8 != 0) { - ByteRleDecoderImpl::next(&lastByte, 1, nullptr); - remainingBits = 8 - (numValues % 8); - } else { - remainingBits = 0; - } - } - } - - void BooleanRleDecoderImpl::next(char* data, uint64_t numValues, - char* notNull) { - // next spot to fill in - uint64_t position = 0; - - // use up any remaining bits - if (notNull) { - while(remainingBits > 0 && position < numValues) { - if (notNull[position]) { - remainingBits -= 1; - data[position] = (static_cast<unsigned char>(lastByte) >> - remainingBits) & 0x1; - } else { - data[position] = 0; - } - position += 1; - } - } else { - while(remainingBits > 0 && position < numValues) { - remainingBits -= 1; - data[position++] = (static_cast<unsigned char>(lastByte) >> - remainingBits) & 0x1; - } - } - - // count the number of nonNulls remaining - uint64_t nonNulls = numValues - position; - if (notNull) { - for(uint64_t i=position; i < numValues; ++i) { - if 
(!notNull[i]) { - nonNulls -= 1; - } - } - } - - // fill in the remaining values - if (nonNulls == 0) { - while (position < numValues) { - data[position++] = 0; - } - } else if (position < numValues) { - // read the new bytes into the array - uint64_t bytesRead = (nonNulls + 7) / 8; - ByteRleDecoderImpl::next(data + position, bytesRead, nullptr); - lastByte = data[position + bytesRead - 1]; - remainingBits = bytesRead * 8 - nonNulls; - // expand the array backwards so that we don't clobber the data - uint64_t bitsLeft = bytesRead * 8 - remainingBits; - if (notNull) { - for(int64_t i=static_cast<int64_t>(numValues) - 1; - i >= static_cast<int64_t>(position); --i) { - if (notNull[i]) { - uint64_t shiftPosn = (-bitsLeft) % 8; - data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1; - bitsLeft -= 1; - } else { - data[i] = 0; - } - } - } else { - for(int64_t i=static_cast<int64_t>(numValues) - 1; - i >= static_cast<int64_t>(position); --i, --bitsLeft) { - uint64_t shiftPosn = (-bitsLeft) % 8; - data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1; - } - } - } - } - - std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder - (std::unique_ptr<SeekableInputStream> input) { - BooleanRleDecoderImpl* decoder = - new BooleanRleDecoderImpl(std::move(input)); - return std::unique_ptr<ByteRleDecoder>( - reinterpret_cast<ByteRleDecoder*>(decoder)); - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <algorithm> +#include <iostream> +#include <string.h> +#include <utility> + +#include "ByteRLE.hh" +#include "orc/Exceptions.hh" + +namespace orc { + + const int MINIMUM_REPEAT = 3; + const int MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT; + const int MAX_LITERAL_SIZE = 128; + + ByteRleEncoder::~ByteRleEncoder() { + // PASS + } + + class ByteRleEncoderImpl : public ByteRleEncoder { + public: + ByteRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output); + virtual ~ByteRleEncoderImpl() override; + + /** + * Encode the next batch of values. + * @param data to be encoded + * @param numValues the number of values to be encoded + * @param notNull If the pointer is null, all values are read. If the + * pointer is not null, positions that are false are skipped. + */ + virtual void add(const char* data, uint64_t numValues, + const char* notNull) override; + + /** + * Get size of buffer used so far. + */ + virtual uint64_t getBufferSize() const override; + + /** + * Flush underlying BufferedOutputStream. 
+ */ + virtual uint64_t flush() override; + + virtual void recordPosition(PositionRecorder* recorder) const override; + + protected: + std::unique_ptr<BufferedOutputStream> outputStream; + char* literals; + int numLiterals; + bool repeat; + int tailRunLength; + int bufferPosition; + int bufferLength; + char* buffer; + + void writeByte(char c); + void writeValues(); + void write(char c); + }; + + ByteRleEncoderImpl::ByteRleEncoderImpl( + std::unique_ptr<BufferedOutputStream> output) + : outputStream(std::move(output)) { + literals = new char[MAX_LITERAL_SIZE]; + numLiterals = 0; + tailRunLength = 0; + repeat = false; + bufferPosition = 0; + bufferLength = 0; + buffer = nullptr; + } + + ByteRleEncoderImpl::~ByteRleEncoderImpl() { + // PASS + delete [] literals; + } + + void ByteRleEncoderImpl::writeByte(char c) { + if (bufferPosition == bufferLength) { + int addedSize = 0; + if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) { + throw std::bad_alloc(); + } + bufferPosition = 0; + bufferLength = addedSize; + } + buffer[bufferPosition++] = c; + } + + void ByteRleEncoderImpl::add( + const char* data, + uint64_t numValues, + const char* notNull) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + write(data[i]); + } + } + } + + void ByteRleEncoderImpl::writeValues() { + if (numLiterals != 0) { + if (repeat) { + writeByte( + static_cast<char>(numLiterals - static_cast<int>(MINIMUM_REPEAT))); + writeByte(literals[0]); + } else { + writeByte(static_cast<char>(-numLiterals)); + for (int i = 0; i < numLiterals; ++i) { + writeByte(literals[i]); + } + } + repeat = false; + tailRunLength = 0; + numLiterals = 0; + } + } + + uint64_t ByteRleEncoderImpl::flush() { + writeValues(); + outputStream->BackUp(bufferLength - bufferPosition); + uint64_t dataSize = outputStream->flush(); + bufferLength = bufferPosition = 0; + return dataSize; + } + + void ByteRleEncoderImpl::write(char value) { + if (numLiterals == 0) { + literals[numLiterals++] = value; + tailRunLength = 1; + } else if (repeat) { + if (value == literals[0]) { + numLiterals += 1; + if (numLiterals == MAXIMUM_REPEAT) { + writeValues(); + } + } else { + writeValues(); + literals[numLiterals++] = value; + tailRunLength = 1; + } + } else { + if (value == literals[numLiterals - 1]) { + tailRunLength += 1; + } else { + tailRunLength = 1; + } + if (tailRunLength == MINIMUM_REPEAT) { + if (numLiterals + 1 == MINIMUM_REPEAT) { + repeat = true; + numLiterals += 1; + } else { + numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1); + writeValues(); + literals[0] = value; + repeat = true; + numLiterals = MINIMUM_REPEAT; + } + } else { + literals[numLiterals++] = value; + if (numLiterals == MAX_LITERAL_SIZE) { + writeValues(); + } + } + } + } + + uint64_t ByteRleEncoderImpl::getBufferSize() const { + return outputStream->getSize(); + } + + void ByteRleEncoderImpl::recordPosition(PositionRecorder *recorder) const { + uint64_t flushedSize = outputStream->getSize(); + uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition); + if (outputStream->isCompressed()) { + // start of the compression chunk in the stream + recorder->add(flushedSize); + // number of decompressed bytes that need to be consumed + recorder->add(unflushedSize); + } else { + flushedSize -= static_cast<uint64_t>(bufferLength); + // byte offset of the RLE run’s start location + recorder->add(flushedSize + unflushedSize); + } + recorder->add(static_cast<uint64_t>(numLiterals)); + } + + std::unique_ptr<ByteRleEncoder> createByteRleEncoder + 
(std::unique_ptr<BufferedOutputStream> output) { + return std::unique_ptr<ByteRleEncoder>(new ByteRleEncoderImpl + (std::move(output))); + } + + class BooleanRleEncoderImpl : public ByteRleEncoderImpl { + public: + BooleanRleEncoderImpl(std::unique_ptr<BufferedOutputStream> output); + virtual ~BooleanRleEncoderImpl() override; + + /** + * Encode the next batch of values + * @param data to be encoded + * @param numValues the number of values to be encoded + * @param notNull If the pointer is null, all values are read. If the + * pointer is not null, positions that are false are skipped. + */ + virtual void add(const char* data, uint64_t numValues, + const char* notNull) override; + + /** + * Flushing underlying BufferedOutputStream + */ + virtual uint64_t flush() override; + + virtual void recordPosition(PositionRecorder* recorder) const override; + + private: + int bitsRemained; + char current; + + }; + + BooleanRleEncoderImpl::BooleanRleEncoderImpl( + std::unique_ptr<BufferedOutputStream> output) + : ByteRleEncoderImpl(std::move(output)) { + bitsRemained = 8; + current = static_cast<char>(0); + } + + BooleanRleEncoderImpl::~BooleanRleEncoderImpl() { + // PASS + } + + void BooleanRleEncoderImpl::add( + const char* data, + uint64_t numValues, + const char* notNull) { + for (uint64_t i = 0; i < numValues; ++i) { + if (bitsRemained == 0) { + write(current); + current = static_cast<char>(0); + bitsRemained = 8; + } + if (!notNull || notNull[i]) { + if (!data || data[i]) { + current = + static_cast<char>(current | (0x80 >> (8 - bitsRemained))); + } + --bitsRemained; + } + } + if (bitsRemained == 0) { + write(current); + current = static_cast<char>(0); + bitsRemained = 8; + } + } + + uint64_t BooleanRleEncoderImpl::flush() { + if (bitsRemained != 8) { + write(current); + } + bitsRemained = 8; + current = static_cast<char>(0); + return ByteRleEncoderImpl::flush(); + } + + void BooleanRleEncoderImpl::recordPosition(PositionRecorder* recorder) const { + ByteRleEncoderImpl::recordPosition(recorder); + recorder->add(static_cast<uint64_t>(8 - bitsRemained)); + } + + std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder + (std::unique_ptr<BufferedOutputStream> output) { + BooleanRleEncoderImpl* encoder = + new BooleanRleEncoderImpl(std::move(output)) ; + return std::unique_ptr<ByteRleEncoder>( + reinterpret_cast<ByteRleEncoder*>(encoder)); + } + + ByteRleDecoder::~ByteRleDecoder() { + // PASS + } + + class ByteRleDecoderImpl: public ByteRleDecoder { + public: + ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input); + + virtual ~ByteRleDecoderImpl(); + + /** + * Seek to a particular spot. + */ + virtual void seek(PositionProvider&); + + /** + * Seek over a given number of values. + */ + virtual void skip(uint64_t numValues); + + /** + * Read a number of values into the batch. 
+ */ + virtual void next(char* data, uint64_t numValues, char* notNull); + + protected: + inline void nextBuffer(); + inline signed char readByte(); + inline void readHeader(); + + std::unique_ptr<SeekableInputStream> inputStream; + size_t remainingValues; + char value; + const char* bufferStart; + const char* bufferEnd; + bool repeating; + }; + + void ByteRleDecoderImpl::nextBuffer() { + int bufferLength; + const void* bufferPointer; + bool result = inputStream->Next(&bufferPointer, &bufferLength); + if (!result) { + throw ParseError("bad read in nextBuffer"); + } + bufferStart = static_cast<const char*>(bufferPointer); + bufferEnd = bufferStart + bufferLength; + } + + signed char ByteRleDecoderImpl::readByte() { + if (bufferStart == bufferEnd) { + nextBuffer(); + } + return *(bufferStart++); + } + + void ByteRleDecoderImpl::readHeader() { + signed char ch = readByte(); + if (ch < 0) { + remainingValues = static_cast<size_t>(-ch); + repeating = false; + } else { + remainingValues = static_cast<size_t>(ch) + MINIMUM_REPEAT; + repeating = true; + value = readByte(); + } + } + + ByteRleDecoderImpl::ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> + input) { + inputStream = std::move(input); + repeating = false; + remainingValues = 0; + value = 0; + bufferStart = nullptr; + bufferEnd = nullptr; + } + + ByteRleDecoderImpl::~ByteRleDecoderImpl() { + // PASS + } + + void ByteRleDecoderImpl::seek(PositionProvider& location) { + // move the input stream + inputStream->seek(location); + // force a re-read from the stream + bufferEnd = bufferStart; + // read a new header + readHeader(); + // skip ahead the given number of records + ByteRleDecoderImpl::skip(location.next()); + } + + void ByteRleDecoderImpl::skip(uint64_t numValues) { + while (numValues > 0) { + if (remainingValues == 0) { + readHeader(); + } + size_t count = std::min(static_cast<size_t>(numValues), remainingValues); + remainingValues -= count; + numValues -= count; + // for literals we need to skip over count bytes, which may involve + // reading from the underlying stream + if (!repeating) { + size_t consumedBytes = count; + while (consumedBytes > 0) { + if (bufferStart == bufferEnd) { + nextBuffer(); + } + size_t skipSize = std::min(static_cast<size_t>(consumedBytes), + static_cast<size_t>(bufferEnd - + bufferStart)); + bufferStart += skipSize; + consumedBytes -= skipSize; + } + } + } + } + + void ByteRleDecoderImpl::next(char* data, uint64_t numValues, + char* notNull) { + uint64_t position = 0; + // skip over null values + while (notNull && position < numValues && !notNull[position]) { + position += 1; + } + while (position < numValues) { + // if we are out of values, read more + if (remainingValues == 0) { + readHeader(); + } + // how many do we read out of this block? 
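      // Decode note: readHeader() sets up the current run. A non-negative
      // control byte h denotes a repeated run of (h + MINIMUM_REPEAT) copies
      // of the byte that follows it, so h = 0 means 3 repeats and h = 127
      // means 130. A negative control byte denotes a literal run of -h bytes
      // taken verbatim from the stream (h = -5 is followed by 5 literal
      // values). The block below consumes at most remainingValues values
      // from the current run before the next header is read.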
+ size_t count = std::min(static_cast<size_t>(numValues - position), + remainingValues); + uint64_t consumed = 0; + if (repeating) { + if (notNull) { + for(uint64_t i=0; i < count; ++i) { + if (notNull[position + i]) { + data[position + i] = value; + consumed += 1; + } + } + } else { + memset(data + position, value, count); + consumed = count; + } + } else { + if (notNull) { + for(uint64_t i=0; i < count; ++i) { + if (notNull[position + i]) { + data[position + i] = readByte(); + consumed += 1; + } + } + } else { + uint64_t i = 0; + while (i < count) { + if (bufferStart == bufferEnd) { + nextBuffer(); + } + uint64_t copyBytes = + std::min(static_cast<uint64_t>(count - i), + static_cast<uint64_t>(bufferEnd - bufferStart)); + memcpy(data + position + i, bufferStart, copyBytes); + bufferStart += copyBytes; + i += copyBytes; + } + consumed = count; + } + } + remainingValues -= consumed; + position += count; + // skip over any null values + while (notNull && position < numValues && !notNull[position]) { + position += 1; + } + } + } + + std::unique_ptr<ByteRleDecoder> createByteRleDecoder + (std::unique_ptr<SeekableInputStream> input) { + return std::unique_ptr<ByteRleDecoder>(new ByteRleDecoderImpl + (std::move(input))); + } + + class BooleanRleDecoderImpl: public ByteRleDecoderImpl { + public: + BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input); + + virtual ~BooleanRleDecoderImpl(); + + /** + * Seek to a particular spot. + */ + virtual void seek(PositionProvider&); + + /** + * Seek over a given number of values. + */ + virtual void skip(uint64_t numValues); + + /** + * Read a number of values into the batch. + */ + virtual void next(char* data, uint64_t numValues, char* notNull); + + protected: + size_t remainingBits; + char lastByte; + }; + + BooleanRleDecoderImpl::BooleanRleDecoderImpl + (std::unique_ptr<SeekableInputStream> input + ): ByteRleDecoderImpl(std::move(input)) { + remainingBits = 0; + lastByte = 0; + } + + BooleanRleDecoderImpl::~BooleanRleDecoderImpl() { + // PASS + } + + void BooleanRleDecoderImpl::seek(PositionProvider& location) { + ByteRleDecoderImpl::seek(location); + uint64_t consumed = location.next(); + remainingBits = 0; + if (consumed > 8) { + throw ParseError("bad position"); + } + if (consumed != 0) { + remainingBits = 8 - consumed; + ByteRleDecoderImpl::next(&lastByte, 1, nullptr); + } + } + + void BooleanRleDecoderImpl::skip(uint64_t numValues) { + if (numValues <= remainingBits) { + remainingBits -= numValues; + } else { + numValues -= remainingBits; + uint64_t bytesSkipped = numValues / 8; + ByteRleDecoderImpl::skip(bytesSkipped); + if (numValues % 8 != 0) { + ByteRleDecoderImpl::next(&lastByte, 1, nullptr); + remainingBits = 8 - (numValues % 8); + } else { + remainingBits = 0; + } + } + } + + void BooleanRleDecoderImpl::next(char* data, uint64_t numValues, + char* notNull) { + // next spot to fill in + uint64_t position = 0; + + // use up any remaining bits + if (notNull) { + while(remainingBits > 0 && position < numValues) { + if (notNull[position]) { + remainingBits -= 1; + data[position] = (static_cast<unsigned char>(lastByte) >> + remainingBits) & 0x1; + } else { + data[position] = 0; + } + position += 1; + } + } else { + while(remainingBits > 0 && position < numValues) { + remainingBits -= 1; + data[position++] = (static_cast<unsigned char>(lastByte) >> + remainingBits) & 0x1; + } + } + + // count the number of nonNulls remaining + uint64_t nonNulls = numValues - position; + if (notNull) { + for(uint64_t i=position; i < numValues; ++i) { + if 
(!notNull[i]) { + nonNulls -= 1; + } + } + } + + // fill in the remaining values + if (nonNulls == 0) { + while (position < numValues) { + data[position++] = 0; + } + } else if (position < numValues) { + // read the new bytes into the array + uint64_t bytesRead = (nonNulls + 7) / 8; + ByteRleDecoderImpl::next(data + position, bytesRead, nullptr); + lastByte = data[position + bytesRead - 1]; + remainingBits = bytesRead * 8 - nonNulls; + // expand the array backwards so that we don't clobber the data + uint64_t bitsLeft = bytesRead * 8 - remainingBits; + if (notNull) { + for(int64_t i=static_cast<int64_t>(numValues) - 1; + i >= static_cast<int64_t>(position); --i) { + if (notNull[i]) { + uint64_t shiftPosn = (-bitsLeft) % 8; + data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1; + bitsLeft -= 1; + } else { + data[i] = 0; + } + } + } else { + for(int64_t i=static_cast<int64_t>(numValues) - 1; + i >= static_cast<int64_t>(position); --i, --bitsLeft) { + uint64_t shiftPosn = (-bitsLeft) % 8; + data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1; + } + } + } + } + + std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder + (std::unique_ptr<SeekableInputStream> input) { + BooleanRleDecoderImpl* decoder = + new BooleanRleDecoderImpl(std::move(input)); + return std::unique_ptr<ByteRleDecoder>( + reinterpret_cast<ByteRleDecoder*>(decoder)); + } +} diff --git a/contrib/libs/apache/orc/c++/src/ByteRLE.hh b/contrib/libs/apache/orc/c++/src/ByteRLE.hh index 71ca579cd3..b799675aee 100644 --- a/contrib/libs/apache/orc/c++/src/ByteRLE.hh +++ b/contrib/libs/apache/orc/c++/src/ByteRLE.hh @@ -1,117 +1,117 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_BYTE_RLE_HH -#define ORC_BYTE_RLE_HH - -#include <memory> - -#include "io/InputStream.hh" -#include "io/OutputStream.hh" - -namespace orc { - - class ByteRleEncoder { - public: - virtual ~ByteRleEncoder(); - - /** - * Encode the next batch of values - * @param data to be encoded - * @param numValues the number of values to be encoded - * @param notNull If the pointer is null, all values are read. If the - * pointer is not null, positions that are false are skipped. - */ - virtual void add(const char* data, uint64_t numValues, - const char* notNull) = 0; - - /** - * Get size of buffer used so far. - */ - virtual uint64_t getBufferSize() const = 0; - - /** - * Flushing underlying output stream - */ - virtual uint64_t flush() = 0; - - /** - * record current position - * @param recorder use the recorder to record current positions - */ - virtual void recordPosition(PositionRecorder* recorder) const = 0; - }; - - class ByteRleDecoder { - public: - virtual ~ByteRleDecoder(); - - /** - * Seek to a particular spot. 
- */ - virtual void seek(PositionProvider&) = 0; - - /** - * Seek over a given number of values. - */ - virtual void skip(uint64_t numValues) = 0; - - /** - * Read a number of values into the batch. - * @param data the array to read into - * @param numValues the number of values to read - * @param notNull If the pointer is null, all values are read. If the - * pointer is not null, positions that are false are skipped. - */ - virtual void next(char* data, uint64_t numValues, char* notNull) = 0; - }; - - /** - * Create a byte RLE encoder. - * @param output the output stream to write to - */ - std::unique_ptr<ByteRleEncoder> createByteRleEncoder - (std::unique_ptr<BufferedOutputStream> output); - - /** - * Create a boolean RLE encoder. - * @param output the output stream to write to - */ - std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder - (std::unique_ptr<BufferedOutputStream> output); - - /** - * Create a byte RLE decoder. - * @param input the input stream to read from - */ - std::unique_ptr<ByteRleDecoder> createByteRleDecoder - (std::unique_ptr<SeekableInputStream> input); - - /** - * Create a boolean RLE decoder. - * - * Unlike the other RLE decoders, the boolean decoder sets the data to 0 - * if the value is masked by notNull. This is required for the notNull stream - * processing to properly apply multiple masks from nested types. - * @param input the input stream to read from - */ - std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder - (std::unique_ptr<SeekableInputStream> input); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_BYTE_RLE_HH +#define ORC_BYTE_RLE_HH + +#include <memory> + +#include "io/InputStream.hh" +#include "io/OutputStream.hh" + +namespace orc { + + class ByteRleEncoder { + public: + virtual ~ByteRleEncoder(); + + /** + * Encode the next batch of values + * @param data to be encoded + * @param numValues the number of values to be encoded + * @param notNull If the pointer is null, all values are read. If the + * pointer is not null, positions that are false are skipped. + */ + virtual void add(const char* data, uint64_t numValues, + const char* notNull) = 0; + + /** + * Get size of buffer used so far. + */ + virtual uint64_t getBufferSize() const = 0; + + /** + * Flushing underlying output stream + */ + virtual uint64_t flush() = 0; + + /** + * record current position + * @param recorder use the recorder to record current positions + */ + virtual void recordPosition(PositionRecorder* recorder) const = 0; + }; + + class ByteRleDecoder { + public: + virtual ~ByteRleDecoder(); + + /** + * Seek to a particular spot. + */ + virtual void seek(PositionProvider&) = 0; + + /** + * Seek over a given number of values. 
+ */ + virtual void skip(uint64_t numValues) = 0; + + /** + * Read a number of values into the batch. + * @param data the array to read into + * @param numValues the number of values to read + * @param notNull If the pointer is null, all values are read. If the + * pointer is not null, positions that are false are skipped. + */ + virtual void next(char* data, uint64_t numValues, char* notNull) = 0; + }; + + /** + * Create a byte RLE encoder. + * @param output the output stream to write to + */ + std::unique_ptr<ByteRleEncoder> createByteRleEncoder + (std::unique_ptr<BufferedOutputStream> output); + + /** + * Create a boolean RLE encoder. + * @param output the output stream to write to + */ + std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder + (std::unique_ptr<BufferedOutputStream> output); + + /** + * Create a byte RLE decoder. + * @param input the input stream to read from + */ + std::unique_ptr<ByteRleDecoder> createByteRleDecoder + (std::unique_ptr<SeekableInputStream> input); + + /** + * Create a boolean RLE decoder. + * + * Unlike the other RLE decoders, the boolean decoder sets the data to 0 + * if the value is masked by notNull. This is required for the notNull stream + * processing to properly apply multiple masks from nested types. + * @param input the input stream to read from + */ + std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder + (std::unique_ptr<SeekableInputStream> input); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc index b4b5860cad..91c2904038 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc +++ b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc @@ -1,747 +1,747 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "orc/ColumnPrinter.hh" -#include "orc/orc-config.hh" - -#include "Adaptor.hh" - -#include <limits> -#include <sstream> -#include <stdexcept> -#include <time.h> -#include <typeinfo> - -#ifdef __clang__ - #pragma clang diagnostic ignored "-Wformat-security" -#endif - -namespace orc { - - class VoidColumnPrinter: public ColumnPrinter { - public: - VoidColumnPrinter(std::string&); - ~VoidColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class BooleanColumnPrinter: public ColumnPrinter { - private: - const int64_t* data; - public: - BooleanColumnPrinter(std::string&); - ~BooleanColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class LongColumnPrinter: public ColumnPrinter { - private: - const int64_t* data; - public: - LongColumnPrinter(std::string&); - ~LongColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class DoubleColumnPrinter: public ColumnPrinter { - private: - const double* data; - const bool isFloat; - - public: - DoubleColumnPrinter(std::string&, const Type& type); - virtual ~DoubleColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class TimestampColumnPrinter: public ColumnPrinter { - private: - const int64_t* seconds; - const int64_t* nanoseconds; - - public: - TimestampColumnPrinter(std::string&); - ~TimestampColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class DateColumnPrinter: public ColumnPrinter { - private: - const int64_t* data; - - public: - DateColumnPrinter(std::string&); - ~DateColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class Decimal64ColumnPrinter: public ColumnPrinter { - private: - const int64_t* data; - int32_t scale; - public: - Decimal64ColumnPrinter(std::string&); - ~Decimal64ColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class Decimal128ColumnPrinter: public ColumnPrinter { - private: - const Int128* data; - int32_t scale; - public: - Decimal128ColumnPrinter(std::string&); - ~Decimal128ColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class StringColumnPrinter: public ColumnPrinter { - private: - const char* const * start; - const int64_t* length; - public: - StringColumnPrinter(std::string&); - virtual ~StringColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class BinaryColumnPrinter: public ColumnPrinter { - private: - const char* const * start; - const int64_t* length; - public: - BinaryColumnPrinter(std::string&); - virtual ~BinaryColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class ListColumnPrinter: public ColumnPrinter { - private: - const int64_t* offsets; - std::unique_ptr<ColumnPrinter> elementPrinter; - - public: - ListColumnPrinter(std::string&, const Type& type); - virtual ~ListColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) 
override; - }; - - class MapColumnPrinter: public ColumnPrinter { - private: - const int64_t* offsets; - std::unique_ptr<ColumnPrinter> keyPrinter; - std::unique_ptr<ColumnPrinter> elementPrinter; - - public: - MapColumnPrinter(std::string&, const Type& type); - virtual ~MapColumnPrinter() override {} - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class UnionColumnPrinter: public ColumnPrinter { - private: - const unsigned char *tags; - const uint64_t* offsets; - std::vector<ColumnPrinter*> fieldPrinter; - - public: - UnionColumnPrinter(std::string&, const Type& type); - virtual ~UnionColumnPrinter() override; - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - class StructColumnPrinter: public ColumnPrinter { - private: - std::vector<ColumnPrinter*> fieldPrinter; - std::vector<std::string> fieldNames; - public: - StructColumnPrinter(std::string&, const Type& type); - virtual ~StructColumnPrinter() override; - void printRow(uint64_t rowId) override; - void reset(const ColumnVectorBatch& batch) override; - }; - - void writeChar(std::string& file, char ch) { - file += ch; - } - - void writeString(std::string& file, const char *ptr) { - size_t len = strlen(ptr); - file.append(ptr, len); - } - - ColumnPrinter::ColumnPrinter(std::string& _buffer - ): buffer(_buffer) { - notNull = nullptr; - hasNulls = false; - } - - ColumnPrinter::~ColumnPrinter() { - // PASS - } - - void ColumnPrinter::reset(const ColumnVectorBatch& batch) { - hasNulls = batch.hasNulls; - if (hasNulls) { - notNull = batch.notNull.data(); - } else { - notNull = nullptr ; - } - } - - std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer, - const Type* type) { - ColumnPrinter *result = nullptr; - if (type == nullptr) { - result = new VoidColumnPrinter(buffer); - } else { - switch(static_cast<int64_t>(type->getKind())) { - case BOOLEAN: - result = new BooleanColumnPrinter(buffer); - break; - - case BYTE: - case SHORT: - case INT: - case LONG: - result = new LongColumnPrinter(buffer); - break; - - case FLOAT: - case DOUBLE: - result = new DoubleColumnPrinter(buffer, *type); - break; - - case STRING: - case VARCHAR : - case CHAR: - result = new StringColumnPrinter(buffer); - break; - - case BINARY: - result = new BinaryColumnPrinter(buffer); - break; - - case TIMESTAMP: - result = new TimestampColumnPrinter(buffer); - break; - - case LIST: - result = new ListColumnPrinter(buffer, *type); - break; - - case MAP: - result = new MapColumnPrinter(buffer, *type); - break; - - case STRUCT: - result = new StructColumnPrinter(buffer, *type); - break; - - case DECIMAL: - if (type->getPrecision() == 0 || type->getPrecision() > 18) { - result = new Decimal128ColumnPrinter(buffer); - } else { - result = new Decimal64ColumnPrinter(buffer); - } - break; - - case DATE: - result = new DateColumnPrinter(buffer); - break; - - case UNION: - result = new UnionColumnPrinter(buffer, *type); - break; - - default: - throw std::logic_error("unknown batch type"); - } - } - return std::unique_ptr<ColumnPrinter>(result); - } - - VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer) { - // PASS - } - - void VoidColumnPrinter::reset(const ColumnVectorBatch&) { - // PASS - } - - void VoidColumnPrinter::printRow(uint64_t) { - writeString(buffer, "null"); - } - - LongColumnPrinter::LongColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr) { - // PASS - } - - void 
LongColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); - } - - void LongColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d", - static_cast<int64_t >(data[rowId])); - writeString(buffer, numBuffer); - } - } - - DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer), - data(nullptr), - isFloat(type.getKind() == FLOAT){ - // PASS - } - - void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - data = dynamic_cast<const DoubleVectorBatch&>(batch).data.data(); - } - - void DoubleColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), isFloat ? "%.7g" : "%.14g", - data[rowId]); - writeString(buffer, numBuffer); - } - } - - Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr), - scale(0) { - // PASS - } - - void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - data = dynamic_cast<const Decimal64VectorBatch&>(batch).values.data(); - scale = dynamic_cast<const Decimal64VectorBatch&>(batch).scale; - } - - std::string toDecimalString(int64_t value, int32_t scale) { - std::stringstream buffer; - if (scale == 0) { - buffer << value; - return buffer.str(); - } - std::string sign = ""; - if (value < 0) { - sign = "-"; - value = -value; - } - buffer << value; - std::string str = buffer.str(); - int32_t len = static_cast<int32_t>(str.length()); - if (len > scale) { - return sign + str.substr(0, static_cast<size_t>(len - scale)) + "." + - str.substr(static_cast<size_t>(len - scale), - static_cast<size_t>(scale)); - } else if (len == scale) { - return sign + "0." 
+ str; - } else { - std::string result = sign + "0."; - for(int32_t i=0; i < scale - len; ++i) { - result += "0"; - } - return result + str; - } - } - - void Decimal64ColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeString(buffer, toDecimalString(data[rowId], scale).c_str()); - } - } - - Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr), - scale(0) { - // PASS - } - - void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data(); - scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale; - } - - void Decimal128ColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeString(buffer, data[rowId].toDecimalString(scale).c_str()); - } - } - - StringColumnPrinter::StringColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - start(nullptr), - length(nullptr) { - // PASS - } - - void StringColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - start = dynamic_cast<const StringVectorBatch&>(batch).data.data(); - length = dynamic_cast<const StringVectorBatch&>(batch).length.data(); - } - - void StringColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeChar(buffer, '"'); - for(int64_t i=0; i < length[rowId]; ++i) { - char ch = static_cast<char>(start[rowId][i]); - switch (ch) { - case '\\': - writeString(buffer, "\\\\"); - break; - case '\b': - writeString(buffer, "\\b"); - break; - case '\f': - writeString(buffer, "\\f"); - break; - case '\n': - writeString(buffer, "\\n"); - break; - case '\r': - writeString(buffer, "\\r"); - break; - case '\t': - writeString(buffer, "\\t"); - break; - case '"': - writeString(buffer, "\\\""); - break; - default: - writeChar(buffer, ch); - break; - } - } - writeChar(buffer, '"'); - } - } - - ListColumnPrinter::ListColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer), - offsets(nullptr) { - elementPrinter = createColumnPrinter(buffer, type.getSubtype(0)); - } - - void ListColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - offsets = dynamic_cast<const ListVectorBatch&>(batch).offsets.data(); - elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch). 
- elements); - } - - void ListColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeChar(buffer, '['); - for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) { - if (i != offsets[rowId]) { - writeString(buffer, ", "); - } - elementPrinter->printRow(static_cast<uint64_t>(i)); - } - writeChar(buffer, ']'); - } - } - - MapColumnPrinter::MapColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer), - offsets(nullptr) { - keyPrinter = createColumnPrinter(buffer, type.getSubtype(0)); - elementPrinter = createColumnPrinter(buffer, type.getSubtype(1)); - } - - void MapColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - const MapVectorBatch& myBatch = dynamic_cast<const MapVectorBatch&>(batch); - offsets = myBatch.offsets.data(); - keyPrinter->reset(*myBatch.keys); - elementPrinter->reset(*myBatch.elements); - } - - void MapColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeChar(buffer, '['); - for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) { - if (i != offsets[rowId]) { - writeString(buffer, ", "); - } - writeString(buffer, "{\"key\": "); - keyPrinter->printRow(static_cast<uint64_t>(i)); - writeString(buffer, ", \"value\": "); - elementPrinter->printRow(static_cast<uint64_t>(i)); - writeChar(buffer, '}'); - } - writeChar(buffer, ']'); - } - } - - UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer), - tags(nullptr), - offsets(nullptr) { - for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { - fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i)) - .release()); - } - } - - UnionColumnPrinter::~UnionColumnPrinter() { - for (size_t i = 0; i < fieldPrinter.size(); i++) { - delete fieldPrinter[i]; - } - } - - void UnionColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - const UnionVectorBatch& unionBatch = - dynamic_cast<const UnionVectorBatch&>(batch); - tags = unionBatch.tags.data(); - offsets = unionBatch.offsets.data(); - for(size_t i=0; i < fieldPrinter.size(); ++i) { - fieldPrinter[i]->reset(*(unionBatch.children[i])); - } - } - - void UnionColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeString(buffer, "{\"tag\": "); - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d", - static_cast<int64_t>(tags[rowId])); - writeString(buffer, numBuffer); - writeString(buffer, ", \"value\": "); - fieldPrinter[tags[rowId]]->printRow(offsets[rowId]); - writeChar(buffer, '}'); - } - } - - StructColumnPrinter::StructColumnPrinter(std::string& _buffer, - const Type& type - ): ColumnPrinter(_buffer) { - for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { - fieldNames.push_back(type.getFieldName(i)); - fieldPrinter.push_back(createColumnPrinter(buffer, - type.getSubtype(i)) - .release()); - } - } - - StructColumnPrinter::~StructColumnPrinter() { - for (size_t i = 0; i < fieldPrinter.size(); i++) { - delete fieldPrinter[i]; - } - } - - void StructColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - const StructVectorBatch& structBatch = - dynamic_cast<const StructVectorBatch&>(batch); - for(size_t i=0; i < fieldPrinter.size(); ++i) { - fieldPrinter[i]->reset(*(structBatch.fields[i])); - } - } - - void 
StructColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeChar(buffer, '{'); - for(unsigned int i=0; i < fieldPrinter.size(); ++i) { - if (i != 0) { - writeString(buffer, ", "); - } - writeChar(buffer, '"'); - writeString(buffer, fieldNames[i].c_str()); - writeString(buffer, "\": "); - fieldPrinter[i]->printRow(rowId); - } - writeChar(buffer, '}'); - } - } - - DateColumnPrinter::DateColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr) { - // PASS - } - - void DateColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - const time_t timeValue = data[rowId] * 24 * 60 * 60; - struct tm tmValue; - gmtime_r(&timeValue, &tmValue); - char timeBuffer[11]; - strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d", &tmValue); - writeChar(buffer, '"'); - writeString(buffer, timeBuffer); - writeChar(buffer, '"'); - } - } - - void DateColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); - } - - BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - data(nullptr) { - // PASS - } - - void BooleanColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeString(buffer, (data[rowId] ? "true" : "false")); - } - } - - void BooleanColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); - } - - BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - start(nullptr), - length(nullptr) { - // PASS - } - - void BinaryColumnPrinter::printRow(uint64_t rowId) { - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - writeChar(buffer, '['); - for(int64_t i=0; i < length[rowId]; ++i) { - if (i != 0) { - writeString(buffer, ", "); - } - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), "%d", - (static_cast<const int>(start[rowId][i]) & 0xff)); - writeString(buffer, numBuffer); - } - writeChar(buffer, ']'); - } - } - - void BinaryColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - start = dynamic_cast<const StringVectorBatch&>(batch).data.data(); - length = dynamic_cast<const StringVectorBatch&>(batch).length.data(); - } - - TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer - ): ColumnPrinter(_buffer), - seconds(nullptr), - nanoseconds(nullptr) { - // PASS - } - - void TimestampColumnPrinter::printRow(uint64_t rowId) { - const int64_t NANO_DIGITS = 9; - if (hasNulls && !notNull[rowId]) { - writeString(buffer, "null"); - } else { - int64_t nanos = nanoseconds[rowId]; - time_t secs = static_cast<time_t>(seconds[rowId]); - struct tm tmValue; - gmtime_r(&secs, &tmValue); - char timeBuffer[20]; - strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - writeChar(buffer, '"'); - writeString(buffer, timeBuffer); - writeChar(buffer, '.'); - // remove trailing zeros off the back of the nanos value. 
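      // The fractional part is printed with trailing zeros stripped: the loop
      // below divides nanos by 10 while it ends in zero, counting the removed
      // digits, and the snprintf afterwards left-pads what remains to
      // (NANO_DIGITS - zeroDigits) digits. For example, nanos = 123450000
      // prints as ".12345" and nanos = 0 prints as ".0".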
- int64_t zeroDigits = 0; - if (nanos == 0) { - zeroDigits = 8; - } else { - while (nanos % 10 == 0) { - nanos /= 10; - zeroDigits += 1; - } - } - char numBuffer[64]; - snprintf(numBuffer, sizeof(numBuffer), - "%0*" INT64_FORMAT_STRING "d\"", - static_cast<int>(NANO_DIGITS - zeroDigits), - static_cast<int64_t >(nanos)); - writeString(buffer, numBuffer); - } - } - - void TimestampColumnPrinter::reset(const ColumnVectorBatch& batch) { - ColumnPrinter::reset(batch); - const TimestampVectorBatch& ts = - dynamic_cast<const TimestampVectorBatch&>(batch); - seconds = ts.data.data(); - nanoseconds = ts.nanoseconds.data(); - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/ColumnPrinter.hh" +#include "orc/orc-config.hh" + +#include "Adaptor.hh" + +#include <limits> +#include <sstream> +#include <stdexcept> +#include <time.h> +#include <typeinfo> + +#ifdef __clang__ + #pragma clang diagnostic ignored "-Wformat-security" +#endif + +namespace orc { + + class VoidColumnPrinter: public ColumnPrinter { + public: + VoidColumnPrinter(std::string&); + ~VoidColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class BooleanColumnPrinter: public ColumnPrinter { + private: + const int64_t* data; + public: + BooleanColumnPrinter(std::string&); + ~BooleanColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class LongColumnPrinter: public ColumnPrinter { + private: + const int64_t* data; + public: + LongColumnPrinter(std::string&); + ~LongColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class DoubleColumnPrinter: public ColumnPrinter { + private: + const double* data; + const bool isFloat; + + public: + DoubleColumnPrinter(std::string&, const Type& type); + virtual ~DoubleColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class TimestampColumnPrinter: public ColumnPrinter { + private: + const int64_t* seconds; + const int64_t* nanoseconds; + + public: + TimestampColumnPrinter(std::string&); + ~TimestampColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class DateColumnPrinter: public ColumnPrinter { + private: + const int64_t* data; + + public: + DateColumnPrinter(std::string&); + ~DateColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class Decimal64ColumnPrinter: public ColumnPrinter { + private: + const int64_t* data; + int32_t scale; + public: + 
Decimal64ColumnPrinter(std::string&); + ~Decimal64ColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class Decimal128ColumnPrinter: public ColumnPrinter { + private: + const Int128* data; + int32_t scale; + public: + Decimal128ColumnPrinter(std::string&); + ~Decimal128ColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class StringColumnPrinter: public ColumnPrinter { + private: + const char* const * start; + const int64_t* length; + public: + StringColumnPrinter(std::string&); + virtual ~StringColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class BinaryColumnPrinter: public ColumnPrinter { + private: + const char* const * start; + const int64_t* length; + public: + BinaryColumnPrinter(std::string&); + virtual ~BinaryColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class ListColumnPrinter: public ColumnPrinter { + private: + const int64_t* offsets; + std::unique_ptr<ColumnPrinter> elementPrinter; + + public: + ListColumnPrinter(std::string&, const Type& type); + virtual ~ListColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class MapColumnPrinter: public ColumnPrinter { + private: + const int64_t* offsets; + std::unique_ptr<ColumnPrinter> keyPrinter; + std::unique_ptr<ColumnPrinter> elementPrinter; + + public: + MapColumnPrinter(std::string&, const Type& type); + virtual ~MapColumnPrinter() override {} + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class UnionColumnPrinter: public ColumnPrinter { + private: + const unsigned char *tags; + const uint64_t* offsets; + std::vector<ColumnPrinter*> fieldPrinter; + + public: + UnionColumnPrinter(std::string&, const Type& type); + virtual ~UnionColumnPrinter() override; + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + class StructColumnPrinter: public ColumnPrinter { + private: + std::vector<ColumnPrinter*> fieldPrinter; + std::vector<std::string> fieldNames; + public: + StructColumnPrinter(std::string&, const Type& type); + virtual ~StructColumnPrinter() override; + void printRow(uint64_t rowId) override; + void reset(const ColumnVectorBatch& batch) override; + }; + + void writeChar(std::string& file, char ch) { + file += ch; + } + + void writeString(std::string& file, const char *ptr) { + size_t len = strlen(ptr); + file.append(ptr, len); + } + + ColumnPrinter::ColumnPrinter(std::string& _buffer + ): buffer(_buffer) { + notNull = nullptr; + hasNulls = false; + } + + ColumnPrinter::~ColumnPrinter() { + // PASS + } + + void ColumnPrinter::reset(const ColumnVectorBatch& batch) { + hasNulls = batch.hasNulls; + if (hasNulls) { + notNull = batch.notNull.data(); + } else { + notNull = nullptr ; + } + } + + std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string& buffer, + const Type* type) { + ColumnPrinter *result = nullptr; + if (type == nullptr) { + result = new VoidColumnPrinter(buffer); + } else { + switch(static_cast<int64_t>(type->getKind())) { + case BOOLEAN: + result = new BooleanColumnPrinter(buffer); + break; + + case BYTE: + case SHORT: + case INT: + case LONG: + result = new LongColumnPrinter(buffer); + 
break; + + case FLOAT: + case DOUBLE: + result = new DoubleColumnPrinter(buffer, *type); + break; + + case STRING: + case VARCHAR : + case CHAR: + result = new StringColumnPrinter(buffer); + break; + + case BINARY: + result = new BinaryColumnPrinter(buffer); + break; + + case TIMESTAMP: + result = new TimestampColumnPrinter(buffer); + break; + + case LIST: + result = new ListColumnPrinter(buffer, *type); + break; + + case MAP: + result = new MapColumnPrinter(buffer, *type); + break; + + case STRUCT: + result = new StructColumnPrinter(buffer, *type); + break; + + case DECIMAL: + if (type->getPrecision() == 0 || type->getPrecision() > 18) { + result = new Decimal128ColumnPrinter(buffer); + } else { + result = new Decimal64ColumnPrinter(buffer); + } + break; + + case DATE: + result = new DateColumnPrinter(buffer); + break; + + case UNION: + result = new UnionColumnPrinter(buffer, *type); + break; + + default: + throw std::logic_error("unknown batch type"); + } + } + return std::unique_ptr<ColumnPrinter>(result); + } + + VoidColumnPrinter::VoidColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer) { + // PASS + } + + void VoidColumnPrinter::reset(const ColumnVectorBatch&) { + // PASS + } + + void VoidColumnPrinter::printRow(uint64_t) { + writeString(buffer, "null"); + } + + LongColumnPrinter::LongColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + data(nullptr) { + // PASS + } + + void LongColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); + } + + void LongColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + char numBuffer[64]; + snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d", + static_cast<int64_t >(data[rowId])); + writeString(buffer, numBuffer); + } + } + + DoubleColumnPrinter::DoubleColumnPrinter(std::string& _buffer, + const Type& type + ): ColumnPrinter(_buffer), + data(nullptr), + isFloat(type.getKind() == FLOAT){ + // PASS + } + + void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + data = dynamic_cast<const DoubleVectorBatch&>(batch).data.data(); + } + + void DoubleColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + char numBuffer[64]; + snprintf(numBuffer, sizeof(numBuffer), isFloat ? "%.7g" : "%.14g", + data[rowId]); + writeString(buffer, numBuffer); + } + } + + Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + data(nullptr), + scale(0) { + // PASS + } + + void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + data = dynamic_cast<const Decimal64VectorBatch&>(batch).values.data(); + scale = dynamic_cast<const Decimal64VectorBatch&>(batch).scale; + } + + std::string toDecimalString(int64_t value, int32_t scale) { + std::stringstream buffer; + if (scale == 0) { + buffer << value; + return buffer.str(); + } + std::string sign = ""; + if (value < 0) { + sign = "-"; + value = -value; + } + buffer << value; + std::string str = buffer.str(); + int32_t len = static_cast<int32_t>(str.length()); + if (len > scale) { + return sign + str.substr(0, static_cast<size_t>(len - scale)) + "." + + str.substr(static_cast<size_t>(len - scale), + static_cast<size_t>(scale)); + } else if (len == scale) { + return sign + "0." 
+ str; + } else { + std::string result = sign + "0."; + for(int32_t i=0; i < scale - len; ++i) { + result += "0"; + } + return result + str; + } + } + + void Decimal64ColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeString(buffer, toDecimalString(data[rowId], scale).c_str()); + } + } + + Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + data(nullptr), + scale(0) { + // PASS + } + + void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + data = dynamic_cast<const Decimal128VectorBatch&>(batch).values.data(); + scale = dynamic_cast<const Decimal128VectorBatch&>(batch).scale; + } + + void Decimal128ColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeString(buffer, data[rowId].toDecimalString(scale).c_str()); + } + } + + StringColumnPrinter::StringColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + start(nullptr), + length(nullptr) { + // PASS + } + + void StringColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + start = dynamic_cast<const StringVectorBatch&>(batch).data.data(); + length = dynamic_cast<const StringVectorBatch&>(batch).length.data(); + } + + void StringColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeChar(buffer, '"'); + for(int64_t i=0; i < length[rowId]; ++i) { + char ch = static_cast<char>(start[rowId][i]); + switch (ch) { + case '\\': + writeString(buffer, "\\\\"); + break; + case '\b': + writeString(buffer, "\\b"); + break; + case '\f': + writeString(buffer, "\\f"); + break; + case '\n': + writeString(buffer, "\\n"); + break; + case '\r': + writeString(buffer, "\\r"); + break; + case '\t': + writeString(buffer, "\\t"); + break; + case '"': + writeString(buffer, "\\\""); + break; + default: + writeChar(buffer, ch); + break; + } + } + writeChar(buffer, '"'); + } + } + + ListColumnPrinter::ListColumnPrinter(std::string& _buffer, + const Type& type + ): ColumnPrinter(_buffer), + offsets(nullptr) { + elementPrinter = createColumnPrinter(buffer, type.getSubtype(0)); + } + + void ListColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + offsets = dynamic_cast<const ListVectorBatch&>(batch).offsets.data(); + elementPrinter->reset(*dynamic_cast<const ListVectorBatch&>(batch). 
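// ---- editorial sketch, not part of the diff --------------------------------
// toDecimalString() above turns an unscaled integer into its decimal form by
// inserting the point `scale` digits from the right and zero-padding when the
// value has fewer digits than the scale.  A few worked examples wrapped in a
// hypothetical check; the helper is file-local, so this assumes it is visible
// in the same translation unit:
#include <assert.h>
#include <string>

void checkDecimalFormatting() {
  assert(orc::toDecimalString(12345, 0) == "12345");   // scale 0: verbatim
  assert(orc::toDecimalString(12345, 2) == "123.45");  // point inserted
  assert(orc::toDecimalString(99, 2)    == "0.99");    // digits == scale
  assert(orc::toDecimalString(-5, 3)    == "-0.005");  // sign kept, zero-padded
}
// ---- end of sketch ----------------------------------------------------------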
+ elements); + } + + void ListColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeChar(buffer, '['); + for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) { + if (i != offsets[rowId]) { + writeString(buffer, ", "); + } + elementPrinter->printRow(static_cast<uint64_t>(i)); + } + writeChar(buffer, ']'); + } + } + + MapColumnPrinter::MapColumnPrinter(std::string& _buffer, + const Type& type + ): ColumnPrinter(_buffer), + offsets(nullptr) { + keyPrinter = createColumnPrinter(buffer, type.getSubtype(0)); + elementPrinter = createColumnPrinter(buffer, type.getSubtype(1)); + } + + void MapColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + const MapVectorBatch& myBatch = dynamic_cast<const MapVectorBatch&>(batch); + offsets = myBatch.offsets.data(); + keyPrinter->reset(*myBatch.keys); + elementPrinter->reset(*myBatch.elements); + } + + void MapColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeChar(buffer, '['); + for(int64_t i=offsets[rowId]; i < offsets[rowId+1]; ++i) { + if (i != offsets[rowId]) { + writeString(buffer, ", "); + } + writeString(buffer, "{\"key\": "); + keyPrinter->printRow(static_cast<uint64_t>(i)); + writeString(buffer, ", \"value\": "); + elementPrinter->printRow(static_cast<uint64_t>(i)); + writeChar(buffer, '}'); + } + writeChar(buffer, ']'); + } + } + + UnionColumnPrinter::UnionColumnPrinter(std::string& _buffer, + const Type& type + ): ColumnPrinter(_buffer), + tags(nullptr), + offsets(nullptr) { + for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { + fieldPrinter.push_back(createColumnPrinter(buffer, type.getSubtype(i)) + .release()); + } + } + + UnionColumnPrinter::~UnionColumnPrinter() { + for (size_t i = 0; i < fieldPrinter.size(); i++) { + delete fieldPrinter[i]; + } + } + + void UnionColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + const UnionVectorBatch& unionBatch = + dynamic_cast<const UnionVectorBatch&>(batch); + tags = unionBatch.tags.data(); + offsets = unionBatch.offsets.data(); + for(size_t i=0; i < fieldPrinter.size(); ++i) { + fieldPrinter[i]->reset(*(unionBatch.children[i])); + } + } + + void UnionColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeString(buffer, "{\"tag\": "); + char numBuffer[64]; + snprintf(numBuffer, sizeof(numBuffer), "%" INT64_FORMAT_STRING "d", + static_cast<int64_t>(tags[rowId])); + writeString(buffer, numBuffer); + writeString(buffer, ", \"value\": "); + fieldPrinter[tags[rowId]]->printRow(offsets[rowId]); + writeChar(buffer, '}'); + } + } + + StructColumnPrinter::StructColumnPrinter(std::string& _buffer, + const Type& type + ): ColumnPrinter(_buffer) { + for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { + fieldNames.push_back(type.getFieldName(i)); + fieldPrinter.push_back(createColumnPrinter(buffer, + type.getSubtype(i)) + .release()); + } + } + + StructColumnPrinter::~StructColumnPrinter() { + for (size_t i = 0; i < fieldPrinter.size(); i++) { + delete fieldPrinter[i]; + } + } + + void StructColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + const StructVectorBatch& structBatch = + dynamic_cast<const StructVectorBatch&>(batch); + for(size_t i=0; i < fieldPrinter.size(); ++i) { + fieldPrinter[i]->reset(*(structBatch.fields[i])); + } + } + + void 
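// ---- editorial sketch, not part of the diff --------------------------------
// The list and map printers above never copy child data: the children of row
// r are simply the positions offsets[r] .. offsets[r+1]-1 of the element (and
// key) printers.  A standalone, hypothetical illustration of that indexing,
// with the child printer output reduced to ready-made strings:
#include <stdint.h>
#include <string>
#include <vector>

std::string printListRow(const std::vector<int64_t>& offsets,
                         const std::vector<std::string>& childJson,
                         uint64_t rowId) {
  const size_t r = static_cast<size_t>(rowId);
  std::string out = "[";
  for (int64_t i = offsets[r]; i < offsets[r + 1]; ++i) {
    if (i != offsets[r]) {
      out += ", ";
    }
    out += childJson[static_cast<size_t>(i)];   // child value for slot i
  }
  out += "]";
  return out;
}
// With offsets {0, 2, 2, 5}: row 0 holds children 0..1, row 1 is empty,
// row 2 holds children 2..4.
// ---- end of sketch ----------------------------------------------------------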
StructColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeChar(buffer, '{'); + for(unsigned int i=0; i < fieldPrinter.size(); ++i) { + if (i != 0) { + writeString(buffer, ", "); + } + writeChar(buffer, '"'); + writeString(buffer, fieldNames[i].c_str()); + writeString(buffer, "\": "); + fieldPrinter[i]->printRow(rowId); + } + writeChar(buffer, '}'); + } + } + + DateColumnPrinter::DateColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + data(nullptr) { + // PASS + } + + void DateColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + const time_t timeValue = data[rowId] * 24 * 60 * 60; + struct tm tmValue; + gmtime_r(&timeValue, &tmValue); + char timeBuffer[11]; + strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d", &tmValue); + writeChar(buffer, '"'); + writeString(buffer, timeBuffer); + writeChar(buffer, '"'); + } + } + + void DateColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); + } + + BooleanColumnPrinter::BooleanColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + data(nullptr) { + // PASS + } + + void BooleanColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeString(buffer, (data[rowId] ? "true" : "false")); + } + } + + void BooleanColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + data = dynamic_cast<const LongVectorBatch&>(batch).data.data(); + } + + BinaryColumnPrinter::BinaryColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + start(nullptr), + length(nullptr) { + // PASS + } + + void BinaryColumnPrinter::printRow(uint64_t rowId) { + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + writeChar(buffer, '['); + for(int64_t i=0; i < length[rowId]; ++i) { + if (i != 0) { + writeString(buffer, ", "); + } + char numBuffer[64]; + snprintf(numBuffer, sizeof(numBuffer), "%d", + (static_cast<const int>(start[rowId][i]) & 0xff)); + writeString(buffer, numBuffer); + } + writeChar(buffer, ']'); + } + } + + void BinaryColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + start = dynamic_cast<const StringVectorBatch&>(batch).data.data(); + length = dynamic_cast<const StringVectorBatch&>(batch).length.data(); + } + + TimestampColumnPrinter::TimestampColumnPrinter(std::string& _buffer + ): ColumnPrinter(_buffer), + seconds(nullptr), + nanoseconds(nullptr) { + // PASS + } + + void TimestampColumnPrinter::printRow(uint64_t rowId) { + const int64_t NANO_DIGITS = 9; + if (hasNulls && !notNull[rowId]) { + writeString(buffer, "null"); + } else { + int64_t nanos = nanoseconds[rowId]; + time_t secs = static_cast<time_t>(seconds[rowId]); + struct tm tmValue; + gmtime_r(&secs, &tmValue); + char timeBuffer[20]; + strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); + writeChar(buffer, '"'); + writeString(buffer, timeBuffer); + writeChar(buffer, '.'); + // remove trailing zeros off the back of the nanos value. 
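// ---- editorial sketch, not part of the diff --------------------------------
// DateColumnPrinter above treats the column value as days since the Unix
// epoch: it scales to seconds, breaks the value down in UTC with gmtime_r and
// formats "%Y-%m-%d" (10 characters plus the terminator, hence the 11-byte
// buffer).  A standalone, hypothetical sketch of the same conversion
// (gmtime_r is POSIX):
#include <stdint.h>
#include <string>
#include <time.h>

std::string daysSinceEpochToIso(int64_t days) {
  const time_t seconds = static_cast<time_t>(days) * 24 * 60 * 60;
  struct tm utc;
  gmtime_r(&seconds, &utc);
  char buf[11];
  strftime(buf, sizeof(buf), "%Y-%m-%d", &utc);
  return std::string(buf);    // 0 -> "1970-01-01", 18262 -> "2020-01-01"
}
// ---- end of sketch ----------------------------------------------------------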
+ int64_t zeroDigits = 0; + if (nanos == 0) { + zeroDigits = 8; + } else { + while (nanos % 10 == 0) { + nanos /= 10; + zeroDigits += 1; + } + } + char numBuffer[64]; + snprintf(numBuffer, sizeof(numBuffer), + "%0*" INT64_FORMAT_STRING "d\"", + static_cast<int>(NANO_DIGITS - zeroDigits), + static_cast<int64_t >(nanos)); + writeString(buffer, numBuffer); + } + } + + void TimestampColumnPrinter::reset(const ColumnVectorBatch& batch) { + ColumnPrinter::reset(batch); + const TimestampVectorBatch& ts = + dynamic_cast<const TimestampVectorBatch&>(batch); + seconds = ts.data.data(); + nanoseconds = ts.nanoseconds.data(); + } +} diff --git a/contrib/libs/apache/orc/c++/src/ColumnReader.cc b/contrib/libs/apache/orc/c++/src/ColumnReader.cc index 8cf660be11..aa891f5074 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnReader.cc +++ b/contrib/libs/apache/orc/c++/src/ColumnReader.cc @@ -1,1836 +1,1836 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/Int128.hh" - -#include "Adaptor.hh" -#include "ByteRLE.hh" -#include "ColumnReader.hh" -#include "orc/Exceptions.hh" -#include "RLE.hh" - -#include <math.h> -#include <iostream> - -namespace orc { - - StripeStreams::~StripeStreams() { - // PASS - } - - inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) { - switch (static_cast<int64_t>(kind)) { - case proto::ColumnEncoding_Kind_DIRECT: - case proto::ColumnEncoding_Kind_DICTIONARY: - return RleVersion_1; - case proto::ColumnEncoding_Kind_DIRECT_V2: - case proto::ColumnEncoding_Kind_DICTIONARY_V2: - return RleVersion_2; - default: - throw ParseError("Unknown encoding in convertRleVersion"); - } - } - - ColumnReader::ColumnReader(const Type& type, - StripeStreams& stripe - ): columnId(type.getColumnId()), - memoryPool(stripe.getMemoryPool()) { - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true); - if (stream.get()) { - notNullDecoder = createBooleanRleDecoder(std::move(stream)); - } - } - - ColumnReader::~ColumnReader() { - // PASS - } - - uint64_t ColumnReader::skip(uint64_t numValues) { - ByteRleDecoder* decoder = notNullDecoder.get(); - if (decoder) { - // page through the values that we want to skip - // and count how many are non-null - const size_t MAX_BUFFER_SIZE = 32768; - size_t bufferSize = std::min(MAX_BUFFER_SIZE, - static_cast<size_t>(numValues)); - char buffer[MAX_BUFFER_SIZE]; - uint64_t remaining = numValues; - while (remaining > 0) { - uint64_t chunkSize = - std::min(remaining, - static_cast<uint64_t>(bufferSize)); - decoder->next(buffer, chunkSize, nullptr); - remaining -= chunkSize; - for(uint64_t i=0; i < chunkSize; ++i) { - if (!buffer[i]) { - numValues -= 1; - } - } - } - } - return numValues; - } - - void 
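// ---- editorial sketch, not part of the diff --------------------------------
// The fractional part printed just above is the nanosecond value with its
// trailing zeros stripped: the zeros are counted and the remaining digits are
// left-padded to a width of 9 - zeroDigits, with nanos == 0 special-cased so
// that a single "0" is emitted.  A standalone, hypothetical sketch of that
// formatting step:
#include <stdint.h>
#include <stdio.h>
#include <string>

std::string formatTrimmedNanos(int64_t nanos) {
  int64_t zeroDigits = 0;
  if (nanos == 0) {
    zeroDigits = 8;               // keep exactly one digit: ".0"
  } else {
    while (nanos % 10 == 0) {
      nanos /= 10;
      zeroDigits += 1;
    }
  }
  char buf[32];
  snprintf(buf, sizeof(buf), "%0*lld",
           static_cast<int>(9 - zeroDigits),
           static_cast<long long>(nanos));
  return std::string(buf);        // 123000000 -> "123", 1 -> "000000001"
}
// ---- end of sketch ----------------------------------------------------------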
ColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* incomingMask) { - if (numValues > rowBatch.capacity) { - rowBatch.resize(numValues); - } - rowBatch.numElements = numValues; - ByteRleDecoder* decoder = notNullDecoder.get(); - if (decoder) { - char* notNullArray = rowBatch.notNull.data(); - decoder->next(notNullArray, numValues, incomingMask); - // check to see if there are nulls in this batch - for(uint64_t i=0; i < numValues; ++i) { - if (!notNullArray[i]) { - rowBatch.hasNulls = true; - return; - } - } - } else if (incomingMask) { - // If we don't have a notNull stream, copy the incomingMask - rowBatch.hasNulls = true; - memcpy(rowBatch.notNull.data(), incomingMask, numValues); - return; - } - rowBatch.hasNulls = false; - } - - void ColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - if (notNullDecoder.get()) { - notNullDecoder->seek(positions.at(columnId)); - } - } - - /** - * Expand an array of bytes in place to the corresponding array of longs. - * Has to work backwards so that they data isn't clobbered during the - * expansion. - * @param buffer the array of chars and array of longs that need to be - * expanded - * @param numValues the number of bytes to convert to longs - */ - void expandBytesToLongs(int64_t* buffer, uint64_t numValues) { - for(size_t i=numValues - 1; i < numValues; --i) { - buffer[i] = reinterpret_cast<char *>(buffer)[i]; - } - } - - class BooleanColumnReader: public ColumnReader { - private: - std::unique_ptr<orc::ByteRleDecoder> rle; - - public: - BooleanColumnReader(const Type& type, StripeStreams& stipe); - ~BooleanColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - BooleanColumnReader::BooleanColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe){ - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("DATA stream not found in Boolean column"); - rle = createBooleanRleDecoder(std::move(stream)); - } - - BooleanColumnReader::~BooleanColumnReader() { - // PASS - } - - uint64_t BooleanColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - rle->skip(numValues); - return numValues; - } - - void BooleanColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - // Since the byte rle places the output in a char* instead of long*, - // we cheat here and use the long* and then expand it in a second pass. - int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data(); - rle->next(reinterpret_cast<char*>(ptr), - numValues, rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr); - expandBytesToLongs(ptr, numValues); - } - - void BooleanColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - } - - class ByteColumnReader: public ColumnReader { - private: - std::unique_ptr<orc::ByteRleDecoder> rle; - - public: - ByteColumnReader(const Type& type, StripeStreams& stipe); - ~ByteColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - ByteColumnReader::ByteColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe){ - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("DATA stream not found in Byte column"); - rle = createByteRleDecoder(std::move(stream)); - } - - ByteColumnReader::~ByteColumnReader() { - // PASS - } - - uint64_t ByteColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - rle->skip(numValues); - return numValues; - } - - void ByteColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - // Since the byte rle places the output in a char* instead of long*, - // we cheat here and use the long* and then expand it in a second pass. - int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data(); - rle->next(reinterpret_cast<char*>(ptr), - numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); - expandBytesToLongs(ptr, numValues); - } - - void ByteColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - } - - class IntegerColumnReader: public ColumnReader { - protected: - std::unique_ptr<orc::RleDecoder> rle; - - public: - IntegerColumnReader(const Type& type, StripeStreams& stripe); - ~IntegerColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - IntegerColumnReader::IntegerColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("DATA stream not found in Integer column"); - rle = createRleDecoder(std::move(stream), true, vers, memoryPool); - } - - IntegerColumnReader::~IntegerColumnReader() { - // PASS - } - - uint64_t IntegerColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - rle->skip(numValues); - return numValues; - } - - void IntegerColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - rle->next(dynamic_cast<LongVectorBatch&>(rowBatch).data.data(), - numValues, rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr); - } - - void IntegerColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - } - - class TimestampColumnReader: public ColumnReader { - private: - std::unique_ptr<orc::RleDecoder> secondsRle; - std::unique_ptr<orc::RleDecoder> nanoRle; - const Timezone& writerTimezone; - const int64_t epochOffset; - - public: - TimestampColumnReader(const Type& type, StripeStreams& stripe); - ~TimestampColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - - TimestampColumnReader::TimestampColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe), - writerTimezone(stripe.getWriterTimezone()), - epochOffset(writerTimezone.getEpoch()) { - RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("DATA stream not found in Timestamp column"); - secondsRle = createRleDecoder(std::move(stream), true, vers, memoryPool); - stream = stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true); - if (stream == nullptr) - throw ParseError("SECONDARY stream not found in Timestamp column"); - nanoRle = createRleDecoder(std::move(stream), false, vers, memoryPool); - } - - TimestampColumnReader::~TimestampColumnReader() { - // PASS - } - - uint64_t TimestampColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - secondsRle->skip(numValues); - nanoRle->skip(numValues); - return numValues; - } - - void TimestampColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - TimestampVectorBatch& timestampBatch = - dynamic_cast<TimestampVectorBatch&>(rowBatch); - int64_t *secsBuffer = timestampBatch.data.data(); - secondsRle->next(secsBuffer, numValues, notNull); - int64_t *nanoBuffer = timestampBatch.nanoseconds.data(); - nanoRle->next(nanoBuffer, numValues, notNull); - - // Construct the values - for(uint64_t i=0; i < numValues; i++) { - if (notNull == nullptr || notNull[i]) { - uint64_t zeros = nanoBuffer[i] & 0x7; - nanoBuffer[i] >>= 3; - if (zeros != 0) { - for(uint64_t j = 0; j <= zeros; ++j) { - nanoBuffer[i] *= 10; - } - } - int64_t writerTime = secsBuffer[i] + epochOffset; - secsBuffer[i] = writerTimezone.convertToUTC(writerTime); +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/Int128.hh" + +#include "Adaptor.hh" +#include "ByteRLE.hh" +#include "ColumnReader.hh" +#include "orc/Exceptions.hh" +#include "RLE.hh" + +#include <math.h> +#include <iostream> + +namespace orc { + + StripeStreams::~StripeStreams() { + // PASS + } + + inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) { + switch (static_cast<int64_t>(kind)) { + case proto::ColumnEncoding_Kind_DIRECT: + case proto::ColumnEncoding_Kind_DICTIONARY: + return RleVersion_1; + case proto::ColumnEncoding_Kind_DIRECT_V2: + case proto::ColumnEncoding_Kind_DICTIONARY_V2: + return RleVersion_2; + default: + throw ParseError("Unknown encoding in convertRleVersion"); + } + } + + ColumnReader::ColumnReader(const Type& type, + StripeStreams& stripe + ): columnId(type.getColumnId()), + memoryPool(stripe.getMemoryPool()) { + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true); + if (stream.get()) { + notNullDecoder = createBooleanRleDecoder(std::move(stream)); + } + } + + ColumnReader::~ColumnReader() { + // PASS + } + + uint64_t ColumnReader::skip(uint64_t numValues) { + ByteRleDecoder* decoder = notNullDecoder.get(); + if (decoder) { + // page through the values that we want to skip + // and count how many are non-null + const size_t MAX_BUFFER_SIZE = 32768; + size_t bufferSize = std::min(MAX_BUFFER_SIZE, + static_cast<size_t>(numValues)); + char buffer[MAX_BUFFER_SIZE]; + uint64_t remaining = numValues; + while (remaining > 0) { + uint64_t chunkSize = + std::min(remaining, + static_cast<uint64_t>(bufferSize)); + decoder->next(buffer, chunkSize, nullptr); + remaining -= chunkSize; + for(uint64_t i=0; i < chunkSize; ++i) { + if (!buffer[i]) { + numValues -= 1; + } + } + } + } + return numValues; + } + + void ColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* incomingMask) { + if (numValues > rowBatch.capacity) { + rowBatch.resize(numValues); + } + rowBatch.numElements = numValues; + ByteRleDecoder* decoder = notNullDecoder.get(); + if (decoder) { + char* notNullArray = rowBatch.notNull.data(); + decoder->next(notNullArray, numValues, incomingMask); + // check to see if there are nulls in this batch + for(uint64_t i=0; i < numValues; ++i) { + if (!notNullArray[i]) { + rowBatch.hasNulls = true; + return; + } + } + } else if (incomingMask) { + // If we don't have a notNull stream, copy the incomingMask + rowBatch.hasNulls = true; + memcpy(rowBatch.notNull.data(), incomingMask, numValues); + return; + } + rowBatch.hasNulls = false; + } + + void ColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + if (notNullDecoder.get()) { + notNullDecoder->seek(positions.at(columnId)); + } + } + + /** + * Expand an array of bytes in place to the corresponding array of longs. + * Has to work backwards so that they data isn't clobbered during the + * expansion. 
+ * @param buffer the array of chars and array of longs that need to be + * expanded + * @param numValues the number of bytes to convert to longs + */ + void expandBytesToLongs(int64_t* buffer, uint64_t numValues) { + for(size_t i=numValues - 1; i < numValues; --i) { + buffer[i] = reinterpret_cast<char *>(buffer)[i]; + } + } + + class BooleanColumnReader: public ColumnReader { + private: + std::unique_ptr<orc::ByteRleDecoder> rle; + + public: + BooleanColumnReader(const Type& type, StripeStreams& stipe); + ~BooleanColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + + BooleanColumnReader::BooleanColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe){ + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (stream == nullptr) + throw ParseError("DATA stream not found in Boolean column"); + rle = createBooleanRleDecoder(std::move(stream)); + } + + BooleanColumnReader::~BooleanColumnReader() { + // PASS + } + + uint64_t BooleanColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + rle->skip(numValues); + return numValues; + } + + void BooleanColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + // Since the byte rle places the output in a char* instead of long*, + // we cheat here and use the long* and then expand it in a second pass. + int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data(); + rle->next(reinterpret_cast<char*>(ptr), + numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); + expandBytesToLongs(ptr, numValues); + } + + void BooleanColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + } + + class ByteColumnReader: public ColumnReader { + private: + std::unique_ptr<orc::ByteRleDecoder> rle; + + public: + ByteColumnReader(const Type& type, StripeStreams& stipe); + ~ByteColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + + ByteColumnReader::ByteColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe){ + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (stream == nullptr) + throw ParseError("DATA stream not found in Byte column"); + rle = createByteRleDecoder(std::move(stream)); + } + + ByteColumnReader::~ByteColumnReader() { + // PASS + } + + uint64_t ByteColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + rle->skip(numValues); + return numValues; + } + + void ByteColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + // Since the byte rle places the output in a char* instead of long*, + // we cheat here and use the long* and then expand it in a second pass. 
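// ---- editorial sketch, not part of the diff --------------------------------
// expandBytesToLongs() above widens a buffer in place: the byte RLE decoder
// packed one char per value into the front of the int64_t array, and the loop
// walks backwards so the 8-byte slot written for index i never clobbers a
// byte that a smaller index still needs; the unsigned index wrapping below
// zero is what terminates the loop.  A standalone, hypothetical sketch:
#include <stddef.h>
#include <stdint.h>
#include <vector>

void widenBytesInPlace(std::vector<int64_t>& values, uint64_t numValues) {
  char* bytes = reinterpret_cast<char*>(values.data());
  for (size_t i = numValues - 1; i < numValues; --i) {
    values[i] = bytes[i];   // any byte this write covers was already read
  }
}
// Usage: decode numValues chars into the start of values.data(), then call
// widenBytesInPlace(values, numValues) to get one int64_t per value.
// ---- end of sketch ----------------------------------------------------------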
+ int64_t *ptr = dynamic_cast<LongVectorBatch&>(rowBatch).data.data(); + rle->next(reinterpret_cast<char*>(ptr), + numValues, rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr); + expandBytesToLongs(ptr, numValues); + } + + void ByteColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + } + + class IntegerColumnReader: public ColumnReader { + protected: + std::unique_ptr<orc::RleDecoder> rle; + + public: + IntegerColumnReader(const Type& type, StripeStreams& stripe); + ~IntegerColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + + IntegerColumnReader::IntegerColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (stream == nullptr) + throw ParseError("DATA stream not found in Integer column"); + rle = createRleDecoder(std::move(stream), true, vers, memoryPool); + } + + IntegerColumnReader::~IntegerColumnReader() { + // PASS + } + + uint64_t IntegerColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + rle->skip(numValues); + return numValues; + } + + void IntegerColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + rle->next(dynamic_cast<LongVectorBatch&>(rowBatch).data.data(), + numValues, rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr); + } + + void IntegerColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + } + + class TimestampColumnReader: public ColumnReader { + private: + std::unique_ptr<orc::RleDecoder> secondsRle; + std::unique_ptr<orc::RleDecoder> nanoRle; + const Timezone& writerTimezone; + const int64_t epochOffset; + + public: + TimestampColumnReader(const Type& type, StripeStreams& stripe); + ~TimestampColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + + + TimestampColumnReader::TimestampColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe), + writerTimezone(stripe.getWriterTimezone()), + epochOffset(writerTimezone.getEpoch()) { + RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (stream == nullptr) + throw ParseError("DATA stream not found in Timestamp column"); + secondsRle = createRleDecoder(std::move(stream), true, vers, memoryPool); + stream = stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true); + if (stream == nullptr) + throw ParseError("SECONDARY stream not found in Timestamp column"); + nanoRle = createRleDecoder(std::move(stream), false, vers, memoryPool); + } + + TimestampColumnReader::~TimestampColumnReader() { + // PASS + } + + uint64_t TimestampColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + secondsRle->skip(numValues); + nanoRle->skip(numValues); + return numValues; + } + + void TimestampColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; + TimestampVectorBatch& timestampBatch = + dynamic_cast<TimestampVectorBatch&>(rowBatch); + int64_t *secsBuffer = timestampBatch.data.data(); + secondsRle->next(secsBuffer, numValues, notNull); + int64_t *nanoBuffer = timestampBatch.nanoseconds.data(); + nanoRle->next(nanoBuffer, numValues, notNull); + + // Construct the values + for(uint64_t i=0; i < numValues; i++) { + if (notNull == nullptr || notNull[i]) { + uint64_t zeros = nanoBuffer[i] & 0x7; + nanoBuffer[i] >>= 3; + if (zeros != 0) { + for(uint64_t j = 0; j <= zeros; ++j) { + nanoBuffer[i] *= 10; + } + } + int64_t writerTime = secsBuffer[i] + epochOffset; + secsBuffer[i] = writerTimezone.convertToUTC(writerTime); if (secsBuffer[i] < 0 && nanoBuffer[i] > 999999) { - secsBuffer[i] -= 1; - } - } - } - } - - void TimestampColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - secondsRle->seek(positions.at(columnId)); - nanoRle->seek(positions.at(columnId)); - } - - class DoubleColumnReader: public ColumnReader { - public: - DoubleColumnReader(const Type& type, StripeStreams& stripe); - ~DoubleColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - - private: - std::unique_ptr<SeekableInputStream> inputStream; - TypeKind columnKind; - const uint64_t bytesPerValue ; - const char *bufferPointer; - const char *bufferEnd; - - unsigned char readByte() { - if (bufferPointer == bufferEnd) { - int length; - if (!inputStream->Next - (reinterpret_cast<const void**>(&bufferPointer), &length)) { - throw ParseError("bad read in DoubleColumnReader::next()"); - } - bufferEnd = bufferPointer + length; - } - return static_cast<unsigned char>(*(bufferPointer++)); - } - - double readDouble() { - int64_t bits = 0; - for (uint64_t i=0; i < 8; i++) { - bits |= static_cast<int64_t>(readByte()) << (i*8); - } - double *result = reinterpret_cast<double*>(&bits); - return *result; - } - - double readFloat() { - int32_t bits = 0; - for (uint64_t i=0; i < 4; i++) { - bits |= readByte() << (i*8); - } - float *result = reinterpret_cast<float*>(&bits); - return static_cast<double>(*result); - } - }; - - DoubleColumnReader::DoubleColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe), - columnKind(type.getKind()), - bytesPerValue((type.getKind() == - FLOAT) ? 4 : 8), - bufferPointer(nullptr), - bufferEnd(nullptr) { - inputStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (inputStream == nullptr) - throw ParseError("DATA stream not found in Double column"); - } - - DoubleColumnReader::~DoubleColumnReader() { - // PASS - } - - uint64_t DoubleColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - - if (static_cast<size_t>(bufferEnd - bufferPointer) >= - bytesPerValue * numValues) { - bufferPointer += bytesPerValue * numValues; - } else { - size_t sizeToSkip = bytesPerValue * numValues - - static_cast<size_t>(bufferEnd - bufferPointer); - const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max()); - while (sizeToSkip != 0) { - size_t step = sizeToSkip > cap ? 
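// ---- editorial sketch, not part of the diff --------------------------------
// The SECONDARY stream decoded above stores nanoseconds with trailing zeros
// compressed away: the low three bits carry a code z, the remaining bits the
// significant digits, and when z is non-zero the loop multiplies ten back in
// z + 1 times.  A standalone, hypothetical sketch of that decode step:
#include <stdint.h>

int64_t decodeTimestampNanos(int64_t encoded) {
  const uint64_t zeros = static_cast<uint64_t>(encoded) & 0x7;
  int64_t nanos = encoded >> 3;
  if (zeros != 0) {
    for (uint64_t j = 0; j <= zeros; ++j) {
      nanos *= 10;            // restore z + 1 stripped zeros
    }
  }
  return nanos;               // (123 << 3) | 5  ->  123000000
}
// ---- end of sketch ----------------------------------------------------------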
cap : sizeToSkip; - inputStream->Skip(static_cast<int>(step)); - sizeToSkip -= step; - } - bufferEnd = nullptr; - bufferPointer = nullptr; - } - - return numValues; - } - - void DoubleColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - // update the notNull from the parent class - notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - double* outArray = dynamic_cast<DoubleVectorBatch&>(rowBatch).data.data(); - - if (columnKind == FLOAT) { - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - outArray[i] = readFloat(); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - outArray[i] = readFloat(); - } - } - } else { - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - outArray[i] = readDouble(); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - outArray[i] = readDouble(); - } - } - } - } - - void readFully(char* buffer, int64_t bufferSize, SeekableInputStream* stream) { - int64_t posn = 0; - while (posn < bufferSize) { - const void* chunk; - int length; - if (!stream->Next(&chunk, &length)) { - throw ParseError("bad read in readFully"); - } - if (posn + length > bufferSize) { - throw ParseError("Corrupt dictionary blob in StringDictionaryColumn"); - } - memcpy(buffer + posn, chunk, static_cast<size_t>(length)); - posn += length; - } - } - - void DoubleColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - inputStream->seek(positions.at(columnId)); - } - - class StringDictionaryColumnReader: public ColumnReader { - private: - std::shared_ptr<StringDictionary> dictionary; - std::unique_ptr<RleDecoder> rle; - - public: - StringDictionaryColumnReader(const Type& type, StripeStreams& stipe); - ~StringDictionaryColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - StringDictionaryColumnReader::StringDictionaryColumnReader - (const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe), - dictionary(new StringDictionary(stripe.getMemoryPool())) { - RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId) - .kind()); - uint32_t dictSize = stripe.getEncoding(columnId).dictionarysize(); - rle = createRleDecoder(stripe.getStream(columnId, - proto::Stream_Kind_DATA, - true), - false, rleVersion, memoryPool); - std::unique_ptr<RleDecoder> lengthDecoder = - createRleDecoder(stripe.getStream(columnId, - proto::Stream_Kind_LENGTH, - false), - false, rleVersion, memoryPool); - dictionary->dictionaryOffset.resize(dictSize + 1); - int64_t* lengthArray = dictionary->dictionaryOffset.data(); - lengthDecoder->next(lengthArray + 1, dictSize, nullptr); - lengthArray[0] = 0; - for(uint32_t i = 1; i < dictSize + 1; ++i) { - lengthArray[i] += lengthArray[i - 1]; - } - dictionary->dictionaryBlob.resize( - static_cast<uint64_t>(lengthArray[dictSize])); - std::unique_ptr<SeekableInputStream> blobStream = - stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false); - readFully( - dictionary->dictionaryBlob.data(), - lengthArray[dictSize], - blobStream.get()); - } - - 
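// ---- editorial sketch, not part of the diff --------------------------------
// The dictionary built in the constructor above is one contiguous blob plus
// an offset array: the LENGTH stream yields one length per entry and a
// running sum turns those lengths into dictionaryOffset[0..dictSize], so that
// entry i is the byte range [offset[i], offset[i+1]) of the blob.  A
// standalone, hypothetical sketch of the prefix-sum step:
#include <stdint.h>
#include <vector>

std::vector<int64_t> lengthsToOffsets(const std::vector<int64_t>& lengths) {
  std::vector<int64_t> offsets(lengths.size() + 1, 0);
  for (size_t i = 0; i < lengths.size(); ++i) {
    offsets[i + 1] = offsets[i] + lengths[i];   // running total of entry sizes
  }
  return offsets;                               // {3, 1, 4} -> {0, 3, 4, 8}
}
// ---- end of sketch ----------------------------------------------------------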
StringDictionaryColumnReader::~StringDictionaryColumnReader() { - // PASS - } - - uint64_t StringDictionaryColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - rle->skip(numValues); - return numValues; - } - - void StringDictionaryColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - // update the notNull from the parent class - notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch); - char *blob = dictionary->dictionaryBlob.data(); - int64_t *dictionaryOffsets = dictionary->dictionaryOffset.data(); - char **outputStarts = byteBatch.data.data(); - int64_t *outputLengths = byteBatch.length.data(); - rle->next(outputLengths, numValues, notNull); - uint64_t dictionaryCount = dictionary->dictionaryOffset.size() - 1; - if (notNull) { - for(uint64_t i=0; i < numValues; ++i) { - if (notNull[i]) { - int64_t entry = outputLengths[i]; - if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount ) { - throw ParseError("Entry index out of range in StringDictionaryColumn"); - } - outputStarts[i] = blob + dictionaryOffsets[entry]; - outputLengths[i] = dictionaryOffsets[entry+1] - - dictionaryOffsets[entry]; - } - } - } else { - for(uint64_t i=0; i < numValues; ++i) { - int64_t entry = outputLengths[i]; - if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount) { - throw ParseError("Entry index out of range in StringDictionaryColumn"); - } - outputStarts[i] = blob + dictionaryOffsets[entry]; - outputLengths[i] = dictionaryOffsets[entry+1] - - dictionaryOffsets[entry]; - } - } - } - - void StringDictionaryColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - rowBatch.isEncoded = true; - - EncodedStringVectorBatch& batch = dynamic_cast<EncodedStringVectorBatch&>(rowBatch); - batch.dictionary = this->dictionary; - - // Length buffer is reused to save dictionary entry ids - rle->next(batch.index.data(), numValues, notNull); - } - - void StringDictionaryColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - } - - - class StringDirectColumnReader: public ColumnReader { - private: - std::unique_ptr<RleDecoder> lengthRle; - std::unique_ptr<SeekableInputStream> blobStream; - const char *lastBuffer; - size_t lastBufferLength; - - /** - * Compute the total length of the values. 
- * @param lengths the array of lengths - * @param notNull the array of notNull flags - * @param numValues the lengths of the arrays - * @return the total number of bytes for the non-null values - */ - size_t computeSize(const int64_t *lengths, const char *notNull, - uint64_t numValues); - - public: - StringDirectColumnReader(const Type& type, StripeStreams& stipe); - ~StringDirectColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - - StringDirectColumnReader::StringDirectColumnReader - (const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId) - .kind()); - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); - if (stream == nullptr) - throw ParseError("LENGTH stream not found in StringDirectColumn"); - lengthRle = createRleDecoder( - std::move(stream), false, rleVersion, memoryPool); - blobStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (blobStream == nullptr) - throw ParseError("DATA stream not found in StringDirectColumn"); - lastBuffer = nullptr; - lastBufferLength = 0; - } - - StringDirectColumnReader::~StringDirectColumnReader() { - // PASS - } - - uint64_t StringDirectColumnReader::skip(uint64_t numValues) { - const size_t BUFFER_SIZE = 1024; - numValues = ColumnReader::skip(numValues); - int64_t buffer[BUFFER_SIZE]; - uint64_t done = 0; - size_t totalBytes = 0; - // read the lengths, so we know haw many bytes to skip - while (done < numValues) { - uint64_t step = std::min(BUFFER_SIZE, - static_cast<size_t>(numValues - done)); - lengthRle->next(buffer, step, nullptr); - totalBytes += computeSize(buffer, nullptr, step); - done += step; - } - if (totalBytes <= lastBufferLength) { - // subtract the needed bytes from the ones left over - lastBufferLength -= totalBytes; - lastBuffer += totalBytes; - } else { - // move the stream forward after accounting for the buffered bytes - totalBytes -= lastBufferLength; - const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max()); - while (totalBytes != 0) { - size_t step = totalBytes > cap ? cap : totalBytes; - blobStream->Skip(static_cast<int>(step)); - totalBytes -= step; - } - lastBufferLength = 0; - lastBuffer = nullptr; - } - return numValues; - } - - size_t StringDirectColumnReader::computeSize(const int64_t* lengths, - const char* notNull, - uint64_t numValues) { - size_t totalLength = 0; - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - totalLength += static_cast<size_t>(lengths[i]); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - totalLength += static_cast<size_t>(lengths[i]); - } - } - return totalLength; - } - - void StringDirectColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - // update the notNull from the parent class - notNull = rowBatch.hasNulls ? 
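// ---- editorial sketch, not part of the diff --------------------------------
// Both DoubleColumnReader::skip() and StringDirectColumnReader::skip() above
// may need to advance the stream by more bytes than Skip(int) can express, so
// the skip is broken into steps capped at std::numeric_limits<int>::max().
// A standalone sketch against a hypothetical stream type with the same
// Skip(int) member:
#include <stddef.h>
#include <limits>

template <typename Stream>
void skipBytes(Stream& stream, size_t sizeToSkip) {
  const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max());
  while (sizeToSkip != 0) {
    const size_t step = sizeToSkip > cap ? cap : sizeToSkip;
    stream.Skip(static_cast<int>(step));   // Skip takes an int, hence the cap
    sizeToSkip -= step;
  }
}
// ---- end of sketch ----------------------------------------------------------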
rowBatch.notNull.data() : nullptr; - StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch); - char **startPtr = byteBatch.data.data(); - int64_t *lengthPtr = byteBatch.length.data(); - - // read the length vector - lengthRle->next(lengthPtr, numValues, notNull); - - // figure out the total length of data we need from the blob stream - const size_t totalLength = computeSize(lengthPtr, notNull, numValues); - - // Load data from the blob stream into our buffer until we have enough - // to get the rest directly out of the stream's buffer. - size_t bytesBuffered = 0; - byteBatch.blob.resize(totalLength); - char *ptr= byteBatch.blob.data(); - while (bytesBuffered + lastBufferLength < totalLength) { - memcpy(ptr + bytesBuffered, lastBuffer, lastBufferLength); - bytesBuffered += lastBufferLength; - const void* readBuffer; - int readLength; - if (!blobStream->Next(&readBuffer, &readLength)) { - throw ParseError("failed to read in StringDirectColumnReader.next"); - } - lastBuffer = static_cast<const char*>(readBuffer); - lastBufferLength = static_cast<size_t>(readLength); - } - - if (bytesBuffered < totalLength) { - size_t moreBytes = totalLength - bytesBuffered; - memcpy(ptr + bytesBuffered, lastBuffer, moreBytes); - lastBuffer += moreBytes; - lastBufferLength -= moreBytes; - } - - size_t filledSlots = 0; - ptr = byteBatch.blob.data(); - if (notNull) { - while (filledSlots < numValues) { - if (notNull[filledSlots]) { - startPtr[filledSlots] = const_cast<char*>(ptr); - ptr += lengthPtr[filledSlots]; - } - filledSlots += 1; - } - } else { - while (filledSlots < numValues) { - startPtr[filledSlots] = const_cast<char*>(ptr); - ptr += lengthPtr[filledSlots]; - filledSlots += 1; - } - } - } - - void StringDirectColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - blobStream->seek(positions.at(columnId)); - lengthRle->seek(positions.at(columnId)); - } - - class StructColumnReader: public ColumnReader { - private: - std::vector<ColumnReader*> children; - - public: - StructColumnReader(const Type& type, StripeStreams& stipe); - ~StructColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - - private: - template<bool encoded> - void nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull); - }; - - StructColumnReader::StructColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - // count the number of selected sub-columns - const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); - switch (static_cast<int64_t>(stripe.getEncoding(columnId).kind())) { - case proto::ColumnEncoding_Kind_DIRECT: - for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { - const Type& child = *type.getSubtype(i); - if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) { - children.push_back(buildReader(child, stripe).release()); - } - } - break; - case proto::ColumnEncoding_Kind_DIRECT_V2: - case proto::ColumnEncoding_Kind_DICTIONARY: - case proto::ColumnEncoding_Kind_DICTIONARY_V2: - default: - throw ParseError("Unknown encoding for StructColumnReader"); - } - } - - StructColumnReader::~StructColumnReader() { - for (size_t i=0; 
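// ---- editorial sketch, not part of the diff --------------------------------
// StringDirectColumnReader::next() above copies exactly the bytes this batch
// needs into byteBatch.blob and then hands every row a pointer into that
// blob; the lengths decoded from the LENGTH stream double as the slice sizes.
// A standalone, hypothetical sketch of the slicing pass (null handling
// omitted):
#include <stdint.h>
#include <vector>

void sliceBlob(char* blob,
               const std::vector<int64_t>& lengths,
               std::vector<char*>& starts) {
  char* ptr = blob;
  starts.resize(lengths.size());
  for (size_t i = 0; i < lengths.size(); ++i) {
    starts[i] = ptr;      // row i starts where row i-1 ended
    ptr += lengths[i];    // no copy: the strings stay in the shared blob
  }
}
// ---- end of sketch ----------------------------------------------------------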
i<children.size(); i++) { - delete children[i]; - } - } - - uint64_t StructColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - for(std::vector<ColumnReader*>::iterator ptr=children.begin(); ptr != children.end(); ++ptr) { - (*ptr)->skip(numValues); - } - return numValues; - } - - void StructColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - nextInternal<false>(rowBatch, numValues, notNull); - } - - void StructColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - nextInternal<true>(rowBatch, numValues, notNull); - } - - template<bool encoded> - void StructColumnReader::nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - uint64_t i=0; - notNull = rowBatch.hasNulls? rowBatch.notNull.data() : nullptr; - for(std::vector<ColumnReader*>::iterator ptr=children.begin(); - ptr != children.end(); ++ptr, ++i) { - if (encoded) { - (*ptr)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), - numValues, notNull); - } else { - (*ptr)->next(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), - numValues, notNull); - } - } - } - - void StructColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - - for(std::vector<ColumnReader*>::iterator ptr = children.begin(); - ptr != children.end(); - ++ptr) { - (*ptr)->seekToRowGroup(positions); - } - } - - class ListColumnReader: public ColumnReader { - private: - std::unique_ptr<ColumnReader> child; - std::unique_ptr<RleDecoder> rle; - - public: - ListColumnReader(const Type& type, StripeStreams& stipe); - ~ListColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - - private: - template<bool encoded> - void nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull); - }; - - ListColumnReader::ListColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - // count the number of selected sub-columns - const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); - RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); - if (stream == nullptr) - throw ParseError("LENGTH stream not found in List column"); - rle = createRleDecoder(std::move(stream), false, vers, memoryPool); - const Type& childType = *type.getSubtype(0); - if (selectedColumns[static_cast<uint64_t>(childType.getColumnId())]) { - child = buildReader(childType, stripe); - } - } - - ListColumnReader::~ListColumnReader() { - // PASS - } - - uint64_t ListColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - ColumnReader *childReader = child.get(); - if (childReader) { - const uint64_t BUFFER_SIZE = 1024; - int64_t buffer[BUFFER_SIZE]; - uint64_t childrenElements = 0; - uint64_t lengthsRead = 0; - while (lengthsRead < numValues) { - uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); - rle->next(buffer, chunk, nullptr); - for(size_t i=0; i < chunk; ++i) { - 
childrenElements += static_cast<size_t>(buffer[i]); - } - lengthsRead += chunk; - } - childReader->skip(childrenElements); - } else { - rle->skip(numValues); - } - return numValues; - } - - void ListColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - nextInternal<false>(rowBatch, numValues, notNull); - } - - void ListColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - nextInternal<true>(rowBatch, numValues, notNull); - } - - template<bool encoded> - void ListColumnReader::nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - ListVectorBatch &listBatch = dynamic_cast<ListVectorBatch&>(rowBatch); - int64_t* offsets = listBatch.offsets.data(); - notNull = listBatch.hasNulls ? listBatch.notNull.data() : nullptr; - rle->next(offsets, numValues, notNull); - uint64_t totalChildren = 0; - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - uint64_t tmp = static_cast<uint64_t>(offsets[i]); - offsets[i] = static_cast<int64_t>(totalChildren); - totalChildren += tmp; - } else { - offsets[i] = static_cast<int64_t>(totalChildren); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - uint64_t tmp = static_cast<uint64_t>(offsets[i]); - offsets[i] = static_cast<int64_t>(totalChildren); - totalChildren += tmp; - } - } - offsets[numValues] = static_cast<int64_t>(totalChildren); - ColumnReader *childReader = child.get(); - if (childReader) { - if (encoded) { - childReader->nextEncoded(*(listBatch.elements.get()), totalChildren, nullptr); - } else { - childReader->next(*(listBatch.elements.get()), totalChildren, nullptr); - } - } - } - - void ListColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - if (child.get()) { - child->seekToRowGroup(positions); - } - } - - class MapColumnReader: public ColumnReader { - private: - std::unique_ptr<ColumnReader> keyReader; - std::unique_ptr<ColumnReader> elementReader; - std::unique_ptr<RleDecoder> rle; - - public: - MapColumnReader(const Type& type, StripeStreams& stipe); - ~MapColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - - private: - template<bool encoded> - void nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull); - }; - - MapColumnReader::MapColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - // Determine if the key and/or value columns are selected - const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); - RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); - if (stream == nullptr) - throw ParseError("LENGTH stream not found in Map column"); - rle = createRleDecoder(std::move(stream), false, vers, memoryPool); - const Type& keyType = *type.getSubtype(0); - if (selectedColumns[static_cast<uint64_t>(keyType.getColumnId())]) { - keyReader = buildReader(keyType, stripe); - } - const Type& elementType = *type.getSubtype(1); - if 
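// ---- editorial sketch, not part of the diff --------------------------------
// ListColumnReader::nextInternal() above reuses the offsets buffer: the RLE
// decoder writes per-row lengths into it and the loop rewrites it in place
// into cumulative child offsets, giving null rows a zero-length range; the
// final slot holds the total child count passed to the child reader.  A
// standalone, hypothetical sketch of that rewrite (offsets must hold
// numValues + 1 slots):
#include <stdint.h>
#include <vector>

uint64_t lengthsToChildOffsets(std::vector<int64_t>& offsets,
                               const char* notNull,
                               uint64_t numValues) {
  uint64_t total = 0;
  for (uint64_t i = 0; i < numValues; ++i) {
    const bool present = (notNull == nullptr) || notNull[i];
    const uint64_t len = present ? static_cast<uint64_t>(offsets[i]) : 0;
    offsets[i] = static_cast<int64_t>(total);   // start of row i's children
    total += len;
  }
  offsets[numValues] = static_cast<int64_t>(total);
  return total;                                 // child rows to read next
}
// Example: lengths {2, <null>, 3} become offsets {0, 2, 2, 5} and total 5.
// ---- end of sketch ----------------------------------------------------------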
(selectedColumns[static_cast<uint64_t>(elementType.getColumnId())]) { - elementReader = buildReader(elementType, stripe); - } - } - - MapColumnReader::~MapColumnReader() { - // PASS - } - - uint64_t MapColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - ColumnReader *rawKeyReader = keyReader.get(); - ColumnReader *rawElementReader = elementReader.get(); - if (rawKeyReader || rawElementReader) { - const uint64_t BUFFER_SIZE = 1024; - int64_t buffer[BUFFER_SIZE]; - uint64_t childrenElements = 0; - uint64_t lengthsRead = 0; - while (lengthsRead < numValues) { - uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); - rle->next(buffer, chunk, nullptr); - for(size_t i=0; i < chunk; ++i) { - childrenElements += static_cast<size_t>(buffer[i]); - } - lengthsRead += chunk; - } - if (rawKeyReader) { - rawKeyReader->skip(childrenElements); - } - if (rawElementReader) { - rawElementReader->skip(childrenElements); - } - } else { - rle->skip(numValues); - } - return numValues; - } - - void MapColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) - { - nextInternal<false>(rowBatch, numValues, notNull); - } - - void MapColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) - { - nextInternal<true>(rowBatch, numValues, notNull); - } - - template<bool encoded> - void MapColumnReader::nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - MapVectorBatch &mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch); - int64_t* offsets = mapBatch.offsets.data(); - notNull = mapBatch.hasNulls ? mapBatch.notNull.data() : nullptr; - rle->next(offsets, numValues, notNull); - uint64_t totalChildren = 0; - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - uint64_t tmp = static_cast<uint64_t>(offsets[i]); - offsets[i] = static_cast<int64_t>(totalChildren); - totalChildren += tmp; - } else { - offsets[i] = static_cast<int64_t>(totalChildren); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - uint64_t tmp = static_cast<uint64_t>(offsets[i]); - offsets[i] = static_cast<int64_t>(totalChildren); - totalChildren += tmp; - } - } - offsets[numValues] = static_cast<int64_t>(totalChildren); - ColumnReader *rawKeyReader = keyReader.get(); - if (rawKeyReader) { - if (encoded) { - rawKeyReader->nextEncoded(*(mapBatch.keys.get()), totalChildren, nullptr); - } else { - rawKeyReader->next(*(mapBatch.keys.get()), totalChildren, nullptr); - } - } - ColumnReader *rawElementReader = elementReader.get(); - if (rawElementReader) { - if (encoded) { - rawElementReader->nextEncoded(*(mapBatch.elements.get()), totalChildren, nullptr); - } else { - rawElementReader->next(*(mapBatch.elements.get()), totalChildren, nullptr); - } - } - } - - void MapColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - if (keyReader.get()) { - keyReader->seekToRowGroup(positions); - } - if (elementReader.get()) { - elementReader->seekToRowGroup(positions); - } - } - - class UnionColumnReader: public ColumnReader { - private: - std::unique_ptr<ByteRleDecoder> rle; - std::vector<ColumnReader*> childrenReader; - std::vector<int64_t> childrenCounts; - uint64_t numChildren; - - public: - UnionColumnReader(const Type& type, StripeStreams& stipe); - ~UnionColumnReader() override; - - uint64_t skip(uint64_t numValues) override; 
- - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - - private: - template<bool encoded> - void nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull); - }; - - UnionColumnReader::UnionColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - numChildren = type.getSubtypeCount(); - childrenReader.resize(numChildren); - childrenCounts.resize(numChildren); - - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (stream == nullptr) - throw ParseError("LENGTH stream not found in Union column"); - rle = createByteRleDecoder(std::move(stream)); - // figure out which types are selected - const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); - for(unsigned int i=0; i < numChildren; ++i) { - const Type &child = *type.getSubtype(i); - if (selectedColumns[static_cast<size_t>(child.getColumnId())]) { - childrenReader[i] = buildReader(child, stripe).release(); - } - } - } - - UnionColumnReader::~UnionColumnReader() { - for(std::vector<ColumnReader*>::iterator itr = childrenReader.begin(); - itr != childrenReader.end(); ++itr) { - delete *itr; - } - } - - uint64_t UnionColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - const uint64_t BUFFER_SIZE = 1024; - char buffer[BUFFER_SIZE]; - uint64_t lengthsRead = 0; - int64_t *counts = childrenCounts.data(); - memset(counts, 0, sizeof(int64_t) * numChildren); - while (lengthsRead < numValues) { - uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); - rle->next(buffer, chunk, nullptr); - for(size_t i=0; i < chunk; ++i) { - counts[static_cast<size_t>(buffer[i])] += 1; - } - lengthsRead += chunk; - } - for(size_t i=0; i < numChildren; ++i) { - if (counts[i] != 0 && childrenReader[i] != nullptr) { - childrenReader[i]->skip(static_cast<uint64_t>(counts[i])); - } - } - return numValues; - } - - void UnionColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - nextInternal<false>(rowBatch, numValues, notNull); - } - - void UnionColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - nextInternal<true>(rowBatch, numValues, notNull); - } - - template<bool encoded> - void UnionColumnReader::nextInternal(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - UnionVectorBatch &unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch); - uint64_t* offsets = unionBatch.offsets.data(); - int64_t* counts = childrenCounts.data(); - memset(counts, 0, sizeof(int64_t) * numChildren); - unsigned char* tags = unionBatch.tags.data(); - notNull = unionBatch.hasNulls ? 
unionBatch.notNull.data() : nullptr; - rle->next(reinterpret_cast<char *>(tags), numValues, notNull); - // set the offsets for each row - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - offsets[i] = - static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - offsets[i] = - static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++); - } - } - // read the right number of each child column - for(size_t i=0; i < numChildren; ++i) { - if (childrenReader[i] != nullptr) { - if (encoded) { - childrenReader[i]->nextEncoded(*(unionBatch.children[i]), - static_cast<uint64_t>(counts[i]), nullptr); - } else { - childrenReader[i]->next(*(unionBatch.children[i]), - static_cast<uint64_t>(counts[i]), nullptr); - } - } - } - } - - void UnionColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - rle->seek(positions.at(columnId)); - for(size_t i = 0; i < numChildren; ++i) { - if (childrenReader[i] != nullptr) { - childrenReader[i]->seekToRowGroup(positions); - } - } - } - - /** - * Destructively convert the number from zigzag encoding to the - * natural signed representation. - */ - void unZigZagInt128(Int128& value) { - bool needsNegate = value.getLowBits() & 1; - value >>= 1; - if (needsNegate) { - value.negate(); - value -= 1; - } - } - - class Decimal64ColumnReader: public ColumnReader { - public: - static const uint32_t MAX_PRECISION_64 = 18; - static const uint32_t MAX_PRECISION_128 = 38; - static const int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1]; - - protected: - std::unique_ptr<SeekableInputStream> valueStream; - int32_t precision; - int32_t scale; - const char* buffer; - const char* bufferEnd; - - std::unique_ptr<RleDecoder> scaleDecoder; - - /** - * Read the valueStream for more bytes. 
- */ - void readBuffer() { - while (buffer == bufferEnd) { - int length; - if (!valueStream->Next(reinterpret_cast<const void**>(&buffer), - &length)) { - throw ParseError("Read past end of stream in Decimal64ColumnReader "+ - valueStream->getName()); - } - bufferEnd = buffer + length; - } - } - - void readInt64(int64_t& value, int32_t currentScale) { - value = 0; - size_t offset = 0; - while (true) { - readBuffer(); - unsigned char ch = static_cast<unsigned char>(*(buffer++)); - value |= static_cast<uint64_t>(ch & 0x7f) << offset; - offset += 7; - if (!(ch & 0x80)) { - break; - } - } - value = unZigZag(static_cast<uint64_t>(value)); - if (scale > currentScale && - static_cast<uint64_t>(scale - currentScale) <= MAX_PRECISION_64) { - value *= POWERS_OF_TEN[scale - currentScale]; - } else if (scale < currentScale && - static_cast<uint64_t>(currentScale - scale) <= MAX_PRECISION_64) { - value /= POWERS_OF_TEN[currentScale - scale]; - } else if (scale != currentScale) { - throw ParseError("Decimal scale out of range"); - } - } - - public: - Decimal64ColumnReader(const Type& type, StripeStreams& stipe); - ~Decimal64ColumnReader() override; - - uint64_t skip(uint64_t numValues) override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) override; - }; - const uint32_t Decimal64ColumnReader::MAX_PRECISION_64; - const uint32_t Decimal64ColumnReader::MAX_PRECISION_128; - const int64_t Decimal64ColumnReader::POWERS_OF_TEN[MAX_PRECISION_64 + 1]= - {1, - 10, - 100, - 1000, - 10000, - 100000, - 1000000, - 10000000, - 100000000, - 1000000000, - 10000000000, - 100000000000, - 1000000000000, - 10000000000000, - 100000000000000, - 1000000000000000, - 10000000000000000, - 100000000000000000, - 1000000000000000000}; - - Decimal64ColumnReader::Decimal64ColumnReader(const Type& type, - StripeStreams& stripe - ): ColumnReader(type, stripe) { - scale = static_cast<int32_t>(type.getScale()); - precision = static_cast<int32_t>(type.getPrecision()); - valueStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); - if (valueStream == nullptr) - throw ParseError("DATA stream not found in Decimal64Column"); - buffer = nullptr; - bufferEnd = nullptr; - RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); - std::unique_ptr<SeekableInputStream> stream = - stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true); - if (stream == nullptr) - throw ParseError("SECONDARY stream not found in Decimal64Column"); - scaleDecoder = createRleDecoder(std::move(stream), true, vers, memoryPool); - } - - Decimal64ColumnReader::~Decimal64ColumnReader() { - // PASS - } - - uint64_t Decimal64ColumnReader::skip(uint64_t numValues) { - numValues = ColumnReader::skip(numValues); - uint64_t skipped = 0; - while (skipped < numValues) { - readBuffer(); - if (!(0x80 & *(buffer++))) { - skipped += 1; - } - } - scaleDecoder->skip(numValues); - return numValues; - } - - void Decimal64ColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; - Decimal64VectorBatch &batch = - dynamic_cast<Decimal64VectorBatch&>(rowBatch); - int64_t* values = batch.values.data(); - // read the next group of scales - int64_t* scaleBuffer = batch.readScales.data(); - scaleDecoder->next(scaleBuffer, numValues, notNull); - batch.precision = precision; - batch.scale = scale; - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - readInt64(values[i], static_cast<int32_t>(scaleBuffer[i])); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - readInt64(values[i], static_cast<int32_t>(scaleBuffer[i])); - } - } - } - - void scaleInt128(Int128& value, uint32_t scale, uint32_t currentScale) { - if (scale > currentScale) { - while(scale > currentScale) { - uint32_t scaleAdjust = - std::min(Decimal64ColumnReader::MAX_PRECISION_64, - scale - currentScale); - value *= Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust]; - currentScale += scaleAdjust; - } - } else if (scale < currentScale) { - Int128 remainder; - while(currentScale > scale) { - uint32_t scaleAdjust = - std::min(Decimal64ColumnReader::MAX_PRECISION_64, - currentScale - scale); - value = value.divide(Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust], - remainder); - currentScale -= scaleAdjust; - } - } - } - - void Decimal64ColumnReader::seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions) { - ColumnReader::seekToRowGroup(positions); - valueStream->seek(positions.at(columnId)); - scaleDecoder->seek(positions.at(columnId)); - } - - class Decimal128ColumnReader: public Decimal64ColumnReader { - public: - Decimal128ColumnReader(const Type& type, StripeStreams& stipe); - ~Decimal128ColumnReader() override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - - private: - void readInt128(Int128& value, int32_t currentScale) { - value = 0; - Int128 work; - uint32_t offset = 0; - while (true) { - readBuffer(); - unsigned char ch = static_cast<unsigned char>(*(buffer++)); - work = ch & 0x7f; - work <<= offset; - value |= work; - offset += 7; - if (!(ch & 0x80)) { - break; - } - } - unZigZagInt128(value); - scaleInt128(value, static_cast<uint32_t>(scale), - static_cast<uint32_t>(currentScale)); - } - }; - - Decimal128ColumnReader::Decimal128ColumnReader - (const Type& type, - StripeStreams& stripe - ): Decimal64ColumnReader(type, stripe) { - // PASS - } - - Decimal128ColumnReader::~Decimal128ColumnReader() { - // PASS - } - - void Decimal128ColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - Decimal128VectorBatch &batch = - dynamic_cast<Decimal128VectorBatch&>(rowBatch); - Int128* values = batch.values.data(); - // read the next group of scales - int64_t* scaleBuffer = batch.readScales.data(); - scaleDecoder->next(scaleBuffer, numValues, notNull); - batch.precision = precision; - batch.scale = scale; - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - readInt128(values[i], static_cast<int32_t>(scaleBuffer[i])); - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - readInt128(values[i], static_cast<int32_t>(scaleBuffer[i])); - } - } - } - - class DecimalHive11ColumnReader: public Decimal64ColumnReader { - private: - bool throwOnOverflow; - std::ostream* errorStream; - - /** - * Read an Int128 from the stream and correct it to the desired scale. 
- */ - bool readInt128(Int128& value, int32_t currentScale) { - // -/+ 99999999999999999999999999999999999999 - static const Int128 MIN_VALUE(-0x4b3b4ca85a86c47b, 0xf675ddc000000001); - static const Int128 MAX_VALUE( 0x4b3b4ca85a86c47a, 0x098a223fffffffff); - - value = 0; - Int128 work; - uint32_t offset = 0; - bool result = true; - while (true) { - readBuffer(); - unsigned char ch = static_cast<unsigned char>(*(buffer++)); - work = ch & 0x7f; - // If we have read more than 128 bits, we flag the error, but keep - // reading bytes so the stream isn't thrown off. - if (offset > 128 || (offset == 126 && work > 3)) { - result = false; - } - work <<= offset; - value |= work; - offset += 7; - if (!(ch & 0x80)) { - break; - } - } - - if (!result) { - return result; - } - unZigZagInt128(value); - scaleInt128(value, static_cast<uint32_t>(scale), - static_cast<uint32_t>(currentScale)); - return value >= MIN_VALUE && value <= MAX_VALUE; - } - - public: - DecimalHive11ColumnReader(const Type& type, StripeStreams& stipe); - ~DecimalHive11ColumnReader() override; - - void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) override; - }; - - DecimalHive11ColumnReader::DecimalHive11ColumnReader - (const Type& type, - StripeStreams& stripe - ): Decimal64ColumnReader(type, stripe) { - scale = stripe.getForcedScaleOnHive11Decimal(); - throwOnOverflow = stripe.getThrowOnHive11DecimalOverflow(); - errorStream = stripe.getErrorStream(); - } - - DecimalHive11ColumnReader::~DecimalHive11ColumnReader() { - // PASS - } - - void DecimalHive11ColumnReader::next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char *notNull) { - ColumnReader::next(rowBatch, numValues, notNull); - notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; - Decimal128VectorBatch &batch = - dynamic_cast<Decimal128VectorBatch&>(rowBatch); - Int128* values = batch.values.data(); - // read the next group of scales - int64_t* scaleBuffer = batch.readScales.data(); - - scaleDecoder->next(scaleBuffer, numValues, notNull); - - batch.precision = precision; - batch.scale = scale; - if (notNull) { - for(size_t i=0; i < numValues; ++i) { - if (notNull[i]) { - if (!readInt128(values[i], - static_cast<int32_t>(scaleBuffer[i]))) { - if (throwOnOverflow) { - throw ParseError("Hive 0.11 decimal was more than 38 digits."); - } else { - *errorStream << "Warning: " - << "Hive 0.11 decimal with more than 38 digits " - << "replaced by NULL.\n"; - notNull[i] = false; - } - } - } - } - } else { - for(size_t i=0; i < numValues; ++i) { - if (!readInt128(values[i], - static_cast<int32_t>(scaleBuffer[i]))) { - if (throwOnOverflow) { - throw ParseError("Hive 0.11 decimal was more than 38 digits."); - } else { - *errorStream << "Warning: " - << "Hive 0.11 decimal with more than 38 digits " - << "replaced by NULL.\n"; - batch.hasNulls = true; - batch.notNull[i] = false; - } - } - } - } - } - - /** - * Create a reader for the given stripe. 
- */ - std::unique_ptr<ColumnReader> buildReader(const Type& type, - StripeStreams& stripe) { - switch (static_cast<int64_t>(type.getKind())) { - case DATE: - case INT: - case LONG: - case SHORT: - return std::unique_ptr<ColumnReader>( - new IntegerColumnReader(type, stripe)); - case BINARY: - case CHAR: - case STRING: - case VARCHAR: - switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())){ - case proto::ColumnEncoding_Kind_DICTIONARY: - case proto::ColumnEncoding_Kind_DICTIONARY_V2: - return std::unique_ptr<ColumnReader>( - new StringDictionaryColumnReader(type, stripe)); - case proto::ColumnEncoding_Kind_DIRECT: - case proto::ColumnEncoding_Kind_DIRECT_V2: - return std::unique_ptr<ColumnReader>( - new StringDirectColumnReader(type, stripe)); - default: - throw NotImplementedYet("buildReader unhandled string encoding"); - } - - case BOOLEAN: - return std::unique_ptr<ColumnReader>( - new BooleanColumnReader(type, stripe)); - - case BYTE: - return std::unique_ptr<ColumnReader>( - new ByteColumnReader(type, stripe)); - - case LIST: - return std::unique_ptr<ColumnReader>( - new ListColumnReader(type, stripe)); - - case MAP: - return std::unique_ptr<ColumnReader>( - new MapColumnReader(type, stripe)); - - case UNION: - return std::unique_ptr<ColumnReader>( - new UnionColumnReader(type, stripe)); - - case STRUCT: - return std::unique_ptr<ColumnReader>( - new StructColumnReader(type, stripe)); - - case FLOAT: - case DOUBLE: - return std::unique_ptr<ColumnReader>( - new DoubleColumnReader(type, stripe)); - - case TIMESTAMP: - return std::unique_ptr<ColumnReader> - (new TimestampColumnReader(type, stripe)); - - case DECIMAL: - // is this a Hive 0.11 or 0.12 file? - if (type.getPrecision() == 0) { - return std::unique_ptr<ColumnReader> - (new DecimalHive11ColumnReader(type, stripe)); - - // can we represent the values using int64_t? 
- } else if (type.getPrecision() <= - Decimal64ColumnReader::MAX_PRECISION_64) { - return std::unique_ptr<ColumnReader> - (new Decimal64ColumnReader(type, stripe)); - - // otherwise we use the Int128 implementation - } else { - return std::unique_ptr<ColumnReader> - (new Decimal128ColumnReader(type, stripe)); - } - - default: - throw NotImplementedYet("buildReader unhandled type"); - } - } - -} + secsBuffer[i] -= 1; + } + } + } + } + + void TimestampColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + secondsRle->seek(positions.at(columnId)); + nanoRle->seek(positions.at(columnId)); + } + + class DoubleColumnReader: public ColumnReader { + public: + DoubleColumnReader(const Type& type, StripeStreams& stripe); + ~DoubleColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + + private: + std::unique_ptr<SeekableInputStream> inputStream; + TypeKind columnKind; + const uint64_t bytesPerValue ; + const char *bufferPointer; + const char *bufferEnd; + + unsigned char readByte() { + if (bufferPointer == bufferEnd) { + int length; + if (!inputStream->Next + (reinterpret_cast<const void**>(&bufferPointer), &length)) { + throw ParseError("bad read in DoubleColumnReader::next()"); + } + bufferEnd = bufferPointer + length; + } + return static_cast<unsigned char>(*(bufferPointer++)); + } + + double readDouble() { + int64_t bits = 0; + for (uint64_t i=0; i < 8; i++) { + bits |= static_cast<int64_t>(readByte()) << (i*8); + } + double *result = reinterpret_cast<double*>(&bits); + return *result; + } + + double readFloat() { + int32_t bits = 0; + for (uint64_t i=0; i < 4; i++) { + bits |= readByte() << (i*8); + } + float *result = reinterpret_cast<float*>(&bits); + return static_cast<double>(*result); + } + }; + + DoubleColumnReader::DoubleColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe), + columnKind(type.getKind()), + bytesPerValue((type.getKind() == + FLOAT) ? 4 : 8), + bufferPointer(nullptr), + bufferEnd(nullptr) { + inputStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (inputStream == nullptr) + throw ParseError("DATA stream not found in Double column"); + } + + DoubleColumnReader::~DoubleColumnReader() { + // PASS + } + + uint64_t DoubleColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + + if (static_cast<size_t>(bufferEnd - bufferPointer) >= + bytesPerValue * numValues) { + bufferPointer += bytesPerValue * numValues; + } else { + size_t sizeToSkip = bytesPerValue * numValues - + static_cast<size_t>(bufferEnd - bufferPointer); + const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max()); + while (sizeToSkip != 0) { + size_t step = sizeToSkip > cap ? cap : sizeToSkip; + inputStream->Skip(static_cast<int>(step)); + sizeToSkip -= step; + } + bufferEnd = nullptr; + bufferPointer = nullptr; + } + + return numValues; + } + + void DoubleColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + // update the notNull from the parent class + notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; + double* outArray = dynamic_cast<DoubleVectorBatch&>(rowBatch).data.data(); + + if (columnKind == FLOAT) { + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + outArray[i] = readFloat(); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + outArray[i] = readFloat(); + } + } + } else { + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + outArray[i] = readDouble(); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + outArray[i] = readDouble(); + } + } + } + } + + void readFully(char* buffer, int64_t bufferSize, SeekableInputStream* stream) { + int64_t posn = 0; + while (posn < bufferSize) { + const void* chunk; + int length; + if (!stream->Next(&chunk, &length)) { + throw ParseError("bad read in readFully"); + } + if (posn + length > bufferSize) { + throw ParseError("Corrupt dictionary blob in StringDictionaryColumn"); + } + memcpy(buffer + posn, chunk, static_cast<size_t>(length)); + posn += length; + } + } + + void DoubleColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + inputStream->seek(positions.at(columnId)); + } + + class StringDictionaryColumnReader: public ColumnReader { + private: + std::shared_ptr<StringDictionary> dictionary; + std::unique_ptr<RleDecoder> rle; + + public: + StringDictionaryColumnReader(const Type& type, StripeStreams& stipe); + ~StringDictionaryColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + + StringDictionaryColumnReader::StringDictionaryColumnReader + (const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe), + dictionary(new StringDictionary(stripe.getMemoryPool())) { + RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId) + .kind()); + uint32_t dictSize = stripe.getEncoding(columnId).dictionarysize(); + rle = createRleDecoder(stripe.getStream(columnId, + proto::Stream_Kind_DATA, + true), + false, rleVersion, memoryPool); + std::unique_ptr<RleDecoder> lengthDecoder = + createRleDecoder(stripe.getStream(columnId, + proto::Stream_Kind_LENGTH, + false), + false, rleVersion, memoryPool); + dictionary->dictionaryOffset.resize(dictSize + 1); + int64_t* lengthArray = dictionary->dictionaryOffset.data(); + lengthDecoder->next(lengthArray + 1, dictSize, nullptr); + lengthArray[0] = 0; + for(uint32_t i = 1; i < dictSize + 1; ++i) { + lengthArray[i] += lengthArray[i - 1]; + } + dictionary->dictionaryBlob.resize( + static_cast<uint64_t>(lengthArray[dictSize])); + std::unique_ptr<SeekableInputStream> blobStream = + stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false); + readFully( + dictionary->dictionaryBlob.data(), + lengthArray[dictSize], + blobStream.get()); + } + + StringDictionaryColumnReader::~StringDictionaryColumnReader() { + // PASS + } + + uint64_t StringDictionaryColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + rle->skip(numValues); + return numValues; + } + + void StringDictionaryColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + // update the notNull from the parent class + 
notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; + StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch); + char *blob = dictionary->dictionaryBlob.data(); + int64_t *dictionaryOffsets = dictionary->dictionaryOffset.data(); + char **outputStarts = byteBatch.data.data(); + int64_t *outputLengths = byteBatch.length.data(); + rle->next(outputLengths, numValues, notNull); + uint64_t dictionaryCount = dictionary->dictionaryOffset.size() - 1; + if (notNull) { + for(uint64_t i=0; i < numValues; ++i) { + if (notNull[i]) { + int64_t entry = outputLengths[i]; + if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount ) { + throw ParseError("Entry index out of range in StringDictionaryColumn"); + } + outputStarts[i] = blob + dictionaryOffsets[entry]; + outputLengths[i] = dictionaryOffsets[entry+1] - + dictionaryOffsets[entry]; + } + } + } else { + for(uint64_t i=0; i < numValues; ++i) { + int64_t entry = outputLengths[i]; + if (entry < 0 || static_cast<uint64_t>(entry) >= dictionaryCount) { + throw ParseError("Entry index out of range in StringDictionaryColumn"); + } + outputStarts[i] = blob + dictionaryOffsets[entry]; + outputLengths[i] = dictionaryOffsets[entry+1] - + dictionaryOffsets[entry]; + } + } + } + + void StringDictionaryColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; + rowBatch.isEncoded = true; + + EncodedStringVectorBatch& batch = dynamic_cast<EncodedStringVectorBatch&>(rowBatch); + batch.dictionary = this->dictionary; + + // Length buffer is reused to save dictionary entry ids + rle->next(batch.index.data(), numValues, notNull); + } + + void StringDictionaryColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + } + + + class StringDirectColumnReader: public ColumnReader { + private: + std::unique_ptr<RleDecoder> lengthRle; + std::unique_ptr<SeekableInputStream> blobStream; + const char *lastBuffer; + size_t lastBufferLength; + + /** + * Compute the total length of the values. 
+ * @param lengths the array of lengths + * @param notNull the array of notNull flags + * @param numValues the lengths of the arrays + * @return the total number of bytes for the non-null values + */ + size_t computeSize(const int64_t *lengths, const char *notNull, + uint64_t numValues); + + public: + StringDirectColumnReader(const Type& type, StripeStreams& stipe); + ~StringDirectColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + + StringDirectColumnReader::StringDirectColumnReader + (const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + RleVersion rleVersion = convertRleVersion(stripe.getEncoding(columnId) + .kind()); + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); + if (stream == nullptr) + throw ParseError("LENGTH stream not found in StringDirectColumn"); + lengthRle = createRleDecoder( + std::move(stream), false, rleVersion, memoryPool); + blobStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (blobStream == nullptr) + throw ParseError("DATA stream not found in StringDirectColumn"); + lastBuffer = nullptr; + lastBufferLength = 0; + } + + StringDirectColumnReader::~StringDirectColumnReader() { + // PASS + } + + uint64_t StringDirectColumnReader::skip(uint64_t numValues) { + const size_t BUFFER_SIZE = 1024; + numValues = ColumnReader::skip(numValues); + int64_t buffer[BUFFER_SIZE]; + uint64_t done = 0; + size_t totalBytes = 0; + // read the lengths, so we know haw many bytes to skip + while (done < numValues) { + uint64_t step = std::min(BUFFER_SIZE, + static_cast<size_t>(numValues - done)); + lengthRle->next(buffer, step, nullptr); + totalBytes += computeSize(buffer, nullptr, step); + done += step; + } + if (totalBytes <= lastBufferLength) { + // subtract the needed bytes from the ones left over + lastBufferLength -= totalBytes; + lastBuffer += totalBytes; + } else { + // move the stream forward after accounting for the buffered bytes + totalBytes -= lastBufferLength; + const size_t cap = static_cast<size_t>(std::numeric_limits<int>::max()); + while (totalBytes != 0) { + size_t step = totalBytes > cap ? cap : totalBytes; + blobStream->Skip(static_cast<int>(step)); + totalBytes -= step; + } + lastBufferLength = 0; + lastBuffer = nullptr; + } + return numValues; + } + + size_t StringDirectColumnReader::computeSize(const int64_t* lengths, + const char* notNull, + uint64_t numValues) { + size_t totalLength = 0; + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + totalLength += static_cast<size_t>(lengths[i]); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + totalLength += static_cast<size_t>(lengths[i]); + } + } + return totalLength; + } + + void StringDirectColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + // update the notNull from the parent class + notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; + StringVectorBatch& byteBatch = dynamic_cast<StringVectorBatch&>(rowBatch); + char **startPtr = byteBatch.data.data(); + int64_t *lengthPtr = byteBatch.length.data(); + + // read the length vector + lengthRle->next(lengthPtr, numValues, notNull); + + // figure out the total length of data we need from the blob stream + const size_t totalLength = computeSize(lengthPtr, notNull, numValues); + + // Load data from the blob stream into our buffer until we have enough + // to get the rest directly out of the stream's buffer. + size_t bytesBuffered = 0; + byteBatch.blob.resize(totalLength); + char *ptr= byteBatch.blob.data(); + while (bytesBuffered + lastBufferLength < totalLength) { + memcpy(ptr + bytesBuffered, lastBuffer, lastBufferLength); + bytesBuffered += lastBufferLength; + const void* readBuffer; + int readLength; + if (!blobStream->Next(&readBuffer, &readLength)) { + throw ParseError("failed to read in StringDirectColumnReader.next"); + } + lastBuffer = static_cast<const char*>(readBuffer); + lastBufferLength = static_cast<size_t>(readLength); + } + + if (bytesBuffered < totalLength) { + size_t moreBytes = totalLength - bytesBuffered; + memcpy(ptr + bytesBuffered, lastBuffer, moreBytes); + lastBuffer += moreBytes; + lastBufferLength -= moreBytes; + } + + size_t filledSlots = 0; + ptr = byteBatch.blob.data(); + if (notNull) { + while (filledSlots < numValues) { + if (notNull[filledSlots]) { + startPtr[filledSlots] = const_cast<char*>(ptr); + ptr += lengthPtr[filledSlots]; + } + filledSlots += 1; + } + } else { + while (filledSlots < numValues) { + startPtr[filledSlots] = const_cast<char*>(ptr); + ptr += lengthPtr[filledSlots]; + filledSlots += 1; + } + } + } + + void StringDirectColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + blobStream->seek(positions.at(columnId)); + lengthRle->seek(positions.at(columnId)); + } + + class StructColumnReader: public ColumnReader { + private: + std::vector<ColumnReader*> children; + + public: + StructColumnReader(const Type& type, StripeStreams& stipe); + ~StructColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + + private: + template<bool encoded> + void nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull); + }; + + StructColumnReader::StructColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + // count the number of selected sub-columns + const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); + switch (static_cast<int64_t>(stripe.getEncoding(columnId).kind())) { + case proto::ColumnEncoding_Kind_DIRECT: + for(unsigned int i=0; i < type.getSubtypeCount(); ++i) { + const Type& child = *type.getSubtype(i); + if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) { + children.push_back(buildReader(child, stripe).release()); + } + } + break; + case proto::ColumnEncoding_Kind_DIRECT_V2: + case proto::ColumnEncoding_Kind_DICTIONARY: + case proto::ColumnEncoding_Kind_DICTIONARY_V2: + default: + throw ParseError("Unknown encoding for StructColumnReader"); + } + } + + StructColumnReader::~StructColumnReader() { + for (size_t i=0; 
i<children.size(); i++) { + delete children[i]; + } + } + + uint64_t StructColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + for(std::vector<ColumnReader*>::iterator ptr=children.begin(); ptr != children.end(); ++ptr) { + (*ptr)->skip(numValues); + } + return numValues; + } + + void StructColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + nextInternal<false>(rowBatch, numValues, notNull); + } + + void StructColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + nextInternal<true>(rowBatch, numValues, notNull); + } + + template<bool encoded> + void StructColumnReader::nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + uint64_t i=0; + notNull = rowBatch.hasNulls? rowBatch.notNull.data() : nullptr; + for(std::vector<ColumnReader*>::iterator ptr=children.begin(); + ptr != children.end(); ++ptr, ++i) { + if (encoded) { + (*ptr)->nextEncoded(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), + numValues, notNull); + } else { + (*ptr)->next(*(dynamic_cast<StructVectorBatch&>(rowBatch).fields[i]), + numValues, notNull); + } + } + } + + void StructColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + + for(std::vector<ColumnReader*>::iterator ptr = children.begin(); + ptr != children.end(); + ++ptr) { + (*ptr)->seekToRowGroup(positions); + } + } + + class ListColumnReader: public ColumnReader { + private: + std::unique_ptr<ColumnReader> child; + std::unique_ptr<RleDecoder> rle; + + public: + ListColumnReader(const Type& type, StripeStreams& stipe); + ~ListColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + + private: + template<bool encoded> + void nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull); + }; + + ListColumnReader::ListColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + // count the number of selected sub-columns + const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); + RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); + if (stream == nullptr) + throw ParseError("LENGTH stream not found in List column"); + rle = createRleDecoder(std::move(stream), false, vers, memoryPool); + const Type& childType = *type.getSubtype(0); + if (selectedColumns[static_cast<uint64_t>(childType.getColumnId())]) { + child = buildReader(childType, stripe); + } + } + + ListColumnReader::~ListColumnReader() { + // PASS + } + + uint64_t ListColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + ColumnReader *childReader = child.get(); + if (childReader) { + const uint64_t BUFFER_SIZE = 1024; + int64_t buffer[BUFFER_SIZE]; + uint64_t childrenElements = 0; + uint64_t lengthsRead = 0; + while (lengthsRead < numValues) { + uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); + rle->next(buffer, chunk, nullptr); + for(size_t i=0; i < chunk; ++i) { + 
childrenElements += static_cast<size_t>(buffer[i]); + } + lengthsRead += chunk; + } + childReader->skip(childrenElements); + } else { + rle->skip(numValues); + } + return numValues; + } + + void ListColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + nextInternal<false>(rowBatch, numValues, notNull); + } + + void ListColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + nextInternal<true>(rowBatch, numValues, notNull); + } + + template<bool encoded> + void ListColumnReader::nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + ListVectorBatch &listBatch = dynamic_cast<ListVectorBatch&>(rowBatch); + int64_t* offsets = listBatch.offsets.data(); + notNull = listBatch.hasNulls ? listBatch.notNull.data() : nullptr; + rle->next(offsets, numValues, notNull); + uint64_t totalChildren = 0; + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + uint64_t tmp = static_cast<uint64_t>(offsets[i]); + offsets[i] = static_cast<int64_t>(totalChildren); + totalChildren += tmp; + } else { + offsets[i] = static_cast<int64_t>(totalChildren); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + uint64_t tmp = static_cast<uint64_t>(offsets[i]); + offsets[i] = static_cast<int64_t>(totalChildren); + totalChildren += tmp; + } + } + offsets[numValues] = static_cast<int64_t>(totalChildren); + ColumnReader *childReader = child.get(); + if (childReader) { + if (encoded) { + childReader->nextEncoded(*(listBatch.elements.get()), totalChildren, nullptr); + } else { + childReader->next(*(listBatch.elements.get()), totalChildren, nullptr); + } + } + } + + void ListColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + if (child.get()) { + child->seekToRowGroup(positions); + } + } + + class MapColumnReader: public ColumnReader { + private: + std::unique_ptr<ColumnReader> keyReader; + std::unique_ptr<ColumnReader> elementReader; + std::unique_ptr<RleDecoder> rle; + + public: + MapColumnReader(const Type& type, StripeStreams& stipe); + ~MapColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + + private: + template<bool encoded> + void nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull); + }; + + MapColumnReader::MapColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + // Determine if the key and/or value columns are selected + const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); + RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true); + if (stream == nullptr) + throw ParseError("LENGTH stream not found in Map column"); + rle = createRleDecoder(std::move(stream), false, vers, memoryPool); + const Type& keyType = *type.getSubtype(0); + if (selectedColumns[static_cast<uint64_t>(keyType.getColumnId())]) { + keyReader = buildReader(keyType, stripe); + } + const Type& elementType = *type.getSubtype(1); + if 
(selectedColumns[static_cast<uint64_t>(elementType.getColumnId())]) { + elementReader = buildReader(elementType, stripe); + } + } + + MapColumnReader::~MapColumnReader() { + // PASS + } + + uint64_t MapColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + ColumnReader *rawKeyReader = keyReader.get(); + ColumnReader *rawElementReader = elementReader.get(); + if (rawKeyReader || rawElementReader) { + const uint64_t BUFFER_SIZE = 1024; + int64_t buffer[BUFFER_SIZE]; + uint64_t childrenElements = 0; + uint64_t lengthsRead = 0; + while (lengthsRead < numValues) { + uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); + rle->next(buffer, chunk, nullptr); + for(size_t i=0; i < chunk; ++i) { + childrenElements += static_cast<size_t>(buffer[i]); + } + lengthsRead += chunk; + } + if (rawKeyReader) { + rawKeyReader->skip(childrenElements); + } + if (rawElementReader) { + rawElementReader->skip(childrenElements); + } + } else { + rle->skip(numValues); + } + return numValues; + } + + void MapColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) + { + nextInternal<false>(rowBatch, numValues, notNull); + } + + void MapColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) + { + nextInternal<true>(rowBatch, numValues, notNull); + } + + template<bool encoded> + void MapColumnReader::nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + MapVectorBatch &mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch); + int64_t* offsets = mapBatch.offsets.data(); + notNull = mapBatch.hasNulls ? mapBatch.notNull.data() : nullptr; + rle->next(offsets, numValues, notNull); + uint64_t totalChildren = 0; + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + uint64_t tmp = static_cast<uint64_t>(offsets[i]); + offsets[i] = static_cast<int64_t>(totalChildren); + totalChildren += tmp; + } else { + offsets[i] = static_cast<int64_t>(totalChildren); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + uint64_t tmp = static_cast<uint64_t>(offsets[i]); + offsets[i] = static_cast<int64_t>(totalChildren); + totalChildren += tmp; + } + } + offsets[numValues] = static_cast<int64_t>(totalChildren); + ColumnReader *rawKeyReader = keyReader.get(); + if (rawKeyReader) { + if (encoded) { + rawKeyReader->nextEncoded(*(mapBatch.keys.get()), totalChildren, nullptr); + } else { + rawKeyReader->next(*(mapBatch.keys.get()), totalChildren, nullptr); + } + } + ColumnReader *rawElementReader = elementReader.get(); + if (rawElementReader) { + if (encoded) { + rawElementReader->nextEncoded(*(mapBatch.elements.get()), totalChildren, nullptr); + } else { + rawElementReader->next(*(mapBatch.elements.get()), totalChildren, nullptr); + } + } + } + + void MapColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + if (keyReader.get()) { + keyReader->seekToRowGroup(positions); + } + if (elementReader.get()) { + elementReader->seekToRowGroup(positions); + } + } + + class UnionColumnReader: public ColumnReader { + private: + std::unique_ptr<ByteRleDecoder> rle; + std::vector<ColumnReader*> childrenReader; + std::vector<int64_t> childrenCounts; + uint64_t numChildren; + + public: + UnionColumnReader(const Type& type, StripeStreams& stipe); + ~UnionColumnReader() override; + + uint64_t skip(uint64_t numValues) override; 
+ + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + + private: + template<bool encoded> + void nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull); + }; + + UnionColumnReader::UnionColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + numChildren = type.getSubtypeCount(); + childrenReader.resize(numChildren); + childrenCounts.resize(numChildren); + + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (stream == nullptr) + throw ParseError("LENGTH stream not found in Union column"); + rle = createByteRleDecoder(std::move(stream)); + // figure out which types are selected + const std::vector<bool> selectedColumns = stripe.getSelectedColumns(); + for(unsigned int i=0; i < numChildren; ++i) { + const Type &child = *type.getSubtype(i); + if (selectedColumns[static_cast<size_t>(child.getColumnId())]) { + childrenReader[i] = buildReader(child, stripe).release(); + } + } + } + + UnionColumnReader::~UnionColumnReader() { + for(std::vector<ColumnReader*>::iterator itr = childrenReader.begin(); + itr != childrenReader.end(); ++itr) { + delete *itr; + } + } + + uint64_t UnionColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + const uint64_t BUFFER_SIZE = 1024; + char buffer[BUFFER_SIZE]; + uint64_t lengthsRead = 0; + int64_t *counts = childrenCounts.data(); + memset(counts, 0, sizeof(int64_t) * numChildren); + while (lengthsRead < numValues) { + uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE); + rle->next(buffer, chunk, nullptr); + for(size_t i=0; i < chunk; ++i) { + counts[static_cast<size_t>(buffer[i])] += 1; + } + lengthsRead += chunk; + } + for(size_t i=0; i < numChildren; ++i) { + if (counts[i] != 0 && childrenReader[i] != nullptr) { + childrenReader[i]->skip(static_cast<uint64_t>(counts[i])); + } + } + return numValues; + } + + void UnionColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + nextInternal<false>(rowBatch, numValues, notNull); + } + + void UnionColumnReader::nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + nextInternal<true>(rowBatch, numValues, notNull); + } + + template<bool encoded> + void UnionColumnReader::nextInternal(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + UnionVectorBatch &unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch); + uint64_t* offsets = unionBatch.offsets.data(); + int64_t* counts = childrenCounts.data(); + memset(counts, 0, sizeof(int64_t) * numChildren); + unsigned char* tags = unionBatch.tags.data(); + notNull = unionBatch.hasNulls ? 
unionBatch.notNull.data() : nullptr; + rle->next(reinterpret_cast<char *>(tags), numValues, notNull); + // set the offsets for each row + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + offsets[i] = + static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + offsets[i] = + static_cast<uint64_t>(counts[static_cast<size_t>(tags[i])]++); + } + } + // read the right number of each child column + for(size_t i=0; i < numChildren; ++i) { + if (childrenReader[i] != nullptr) { + if (encoded) { + childrenReader[i]->nextEncoded(*(unionBatch.children[i]), + static_cast<uint64_t>(counts[i]), nullptr); + } else { + childrenReader[i]->next(*(unionBatch.children[i]), + static_cast<uint64_t>(counts[i]), nullptr); + } + } + } + } + + void UnionColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + rle->seek(positions.at(columnId)); + for(size_t i = 0; i < numChildren; ++i) { + if (childrenReader[i] != nullptr) { + childrenReader[i]->seekToRowGroup(positions); + } + } + } + + /** + * Destructively convert the number from zigzag encoding to the + * natural signed representation. + */ + void unZigZagInt128(Int128& value) { + bool needsNegate = value.getLowBits() & 1; + value >>= 1; + if (needsNegate) { + value.negate(); + value -= 1; + } + } + + class Decimal64ColumnReader: public ColumnReader { + public: + static const uint32_t MAX_PRECISION_64 = 18; + static const uint32_t MAX_PRECISION_128 = 38; + static const int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1]; + + protected: + std::unique_ptr<SeekableInputStream> valueStream; + int32_t precision; + int32_t scale; + const char* buffer; + const char* bufferEnd; + + std::unique_ptr<RleDecoder> scaleDecoder; + + /** + * Read the valueStream for more bytes. 
+ */ + void readBuffer() { + while (buffer == bufferEnd) { + int length; + if (!valueStream->Next(reinterpret_cast<const void**>(&buffer), + &length)) { + throw ParseError("Read past end of stream in Decimal64ColumnReader "+ + valueStream->getName()); + } + bufferEnd = buffer + length; + } + } + + void readInt64(int64_t& value, int32_t currentScale) { + value = 0; + size_t offset = 0; + while (true) { + readBuffer(); + unsigned char ch = static_cast<unsigned char>(*(buffer++)); + value |= static_cast<uint64_t>(ch & 0x7f) << offset; + offset += 7; + if (!(ch & 0x80)) { + break; + } + } + value = unZigZag(static_cast<uint64_t>(value)); + if (scale > currentScale && + static_cast<uint64_t>(scale - currentScale) <= MAX_PRECISION_64) { + value *= POWERS_OF_TEN[scale - currentScale]; + } else if (scale < currentScale && + static_cast<uint64_t>(currentScale - scale) <= MAX_PRECISION_64) { + value /= POWERS_OF_TEN[currentScale - scale]; + } else if (scale != currentScale) { + throw ParseError("Decimal scale out of range"); + } + } + + public: + Decimal64ColumnReader(const Type& type, StripeStreams& stipe); + ~Decimal64ColumnReader() override; + + uint64_t skip(uint64_t numValues) override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) override; + }; + const uint32_t Decimal64ColumnReader::MAX_PRECISION_64; + const uint32_t Decimal64ColumnReader::MAX_PRECISION_128; + const int64_t Decimal64ColumnReader::POWERS_OF_TEN[MAX_PRECISION_64 + 1]= + {1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000, + 100000000000, + 1000000000000, + 10000000000000, + 100000000000000, + 1000000000000000, + 10000000000000000, + 100000000000000000, + 1000000000000000000}; + + Decimal64ColumnReader::Decimal64ColumnReader(const Type& type, + StripeStreams& stripe + ): ColumnReader(type, stripe) { + scale = static_cast<int32_t>(type.getScale()); + precision = static_cast<int32_t>(type.getPrecision()); + valueStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true); + if (valueStream == nullptr) + throw ParseError("DATA stream not found in Decimal64Column"); + buffer = nullptr; + bufferEnd = nullptr; + RleVersion vers = convertRleVersion(stripe.getEncoding(columnId).kind()); + std::unique_ptr<SeekableInputStream> stream = + stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true); + if (stream == nullptr) + throw ParseError("SECONDARY stream not found in Decimal64Column"); + scaleDecoder = createRleDecoder(std::move(stream), true, vers, memoryPool); + } + + Decimal64ColumnReader::~Decimal64ColumnReader() { + // PASS + } + + uint64_t Decimal64ColumnReader::skip(uint64_t numValues) { + numValues = ColumnReader::skip(numValues); + uint64_t skipped = 0; + while (skipped < numValues) { + readBuffer(); + if (!(0x80 & *(buffer++))) { + skipped += 1; + } + } + scaleDecoder->skip(numValues); + return numValues; + } + + void Decimal64ColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + notNull = rowBatch.hasNulls ? 
rowBatch.notNull.data() : nullptr; + Decimal64VectorBatch &batch = + dynamic_cast<Decimal64VectorBatch&>(rowBatch); + int64_t* values = batch.values.data(); + // read the next group of scales + int64_t* scaleBuffer = batch.readScales.data(); + scaleDecoder->next(scaleBuffer, numValues, notNull); + batch.precision = precision; + batch.scale = scale; + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + readInt64(values[i], static_cast<int32_t>(scaleBuffer[i])); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + readInt64(values[i], static_cast<int32_t>(scaleBuffer[i])); + } + } + } + + void scaleInt128(Int128& value, uint32_t scale, uint32_t currentScale) { + if (scale > currentScale) { + while(scale > currentScale) { + uint32_t scaleAdjust = + std::min(Decimal64ColumnReader::MAX_PRECISION_64, + scale - currentScale); + value *= Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust]; + currentScale += scaleAdjust; + } + } else if (scale < currentScale) { + Int128 remainder; + while(currentScale > scale) { + uint32_t scaleAdjust = + std::min(Decimal64ColumnReader::MAX_PRECISION_64, + currentScale - scale); + value = value.divide(Decimal64ColumnReader::POWERS_OF_TEN[scaleAdjust], + remainder); + currentScale -= scaleAdjust; + } + } + } + + void Decimal64ColumnReader::seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions) { + ColumnReader::seekToRowGroup(positions); + valueStream->seek(positions.at(columnId)); + scaleDecoder->seek(positions.at(columnId)); + } + + class Decimal128ColumnReader: public Decimal64ColumnReader { + public: + Decimal128ColumnReader(const Type& type, StripeStreams& stipe); + ~Decimal128ColumnReader() override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + + private: + void readInt128(Int128& value, int32_t currentScale) { + value = 0; + Int128 work; + uint32_t offset = 0; + while (true) { + readBuffer(); + unsigned char ch = static_cast<unsigned char>(*(buffer++)); + work = ch & 0x7f; + work <<= offset; + value |= work; + offset += 7; + if (!(ch & 0x80)) { + break; + } + } + unZigZagInt128(value); + scaleInt128(value, static_cast<uint32_t>(scale), + static_cast<uint32_t>(currentScale)); + } + }; + + Decimal128ColumnReader::Decimal128ColumnReader + (const Type& type, + StripeStreams& stripe + ): Decimal64ColumnReader(type, stripe) { + // PASS + } + + Decimal128ColumnReader::~Decimal128ColumnReader() { + // PASS + } + + void Decimal128ColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; + Decimal128VectorBatch &batch = + dynamic_cast<Decimal128VectorBatch&>(rowBatch); + Int128* values = batch.values.data(); + // read the next group of scales + int64_t* scaleBuffer = batch.readScales.data(); + scaleDecoder->next(scaleBuffer, numValues, notNull); + batch.precision = precision; + batch.scale = scale; + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + readInt128(values[i], static_cast<int32_t>(scaleBuffer[i])); + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + readInt128(values[i], static_cast<int32_t>(scaleBuffer[i])); + } + } + } + + class DecimalHive11ColumnReader: public Decimal64ColumnReader { + private: + bool throwOnOverflow; + std::ostream* errorStream; + + /** + * Read an Int128 from the stream and correct it to the desired scale. 
+ */ + bool readInt128(Int128& value, int32_t currentScale) { + // -/+ 99999999999999999999999999999999999999 + static const Int128 MIN_VALUE(-0x4b3b4ca85a86c47b, 0xf675ddc000000001); + static const Int128 MAX_VALUE( 0x4b3b4ca85a86c47a, 0x098a223fffffffff); + + value = 0; + Int128 work; + uint32_t offset = 0; + bool result = true; + while (true) { + readBuffer(); + unsigned char ch = static_cast<unsigned char>(*(buffer++)); + work = ch & 0x7f; + // If we have read more than 128 bits, we flag the error, but keep + // reading bytes so the stream isn't thrown off. + if (offset > 128 || (offset == 126 && work > 3)) { + result = false; + } + work <<= offset; + value |= work; + offset += 7; + if (!(ch & 0x80)) { + break; + } + } + + if (!result) { + return result; + } + unZigZagInt128(value); + scaleInt128(value, static_cast<uint32_t>(scale), + static_cast<uint32_t>(currentScale)); + return value >= MIN_VALUE && value <= MAX_VALUE; + } + + public: + DecimalHive11ColumnReader(const Type& type, StripeStreams& stipe); + ~DecimalHive11ColumnReader() override; + + void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) override; + }; + + DecimalHive11ColumnReader::DecimalHive11ColumnReader + (const Type& type, + StripeStreams& stripe + ): Decimal64ColumnReader(type, stripe) { + scale = stripe.getForcedScaleOnHive11Decimal(); + throwOnOverflow = stripe.getThrowOnHive11DecimalOverflow(); + errorStream = stripe.getErrorStream(); + } + + DecimalHive11ColumnReader::~DecimalHive11ColumnReader() { + // PASS + } + + void DecimalHive11ColumnReader::next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char *notNull) { + ColumnReader::next(rowBatch, numValues, notNull); + notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr; + Decimal128VectorBatch &batch = + dynamic_cast<Decimal128VectorBatch&>(rowBatch); + Int128* values = batch.values.data(); + // read the next group of scales + int64_t* scaleBuffer = batch.readScales.data(); + + scaleDecoder->next(scaleBuffer, numValues, notNull); + + batch.precision = precision; + batch.scale = scale; + if (notNull) { + for(size_t i=0; i < numValues; ++i) { + if (notNull[i]) { + if (!readInt128(values[i], + static_cast<int32_t>(scaleBuffer[i]))) { + if (throwOnOverflow) { + throw ParseError("Hive 0.11 decimal was more than 38 digits."); + } else { + *errorStream << "Warning: " + << "Hive 0.11 decimal with more than 38 digits " + << "replaced by NULL.\n"; + notNull[i] = false; + } + } + } + } + } else { + for(size_t i=0; i < numValues; ++i) { + if (!readInt128(values[i], + static_cast<int32_t>(scaleBuffer[i]))) { + if (throwOnOverflow) { + throw ParseError("Hive 0.11 decimal was more than 38 digits."); + } else { + *errorStream << "Warning: " + << "Hive 0.11 decimal with more than 38 digits " + << "replaced by NULL.\n"; + batch.hasNulls = true; + batch.notNull[i] = false; + } + } + } + } + } + + /** + * Create a reader for the given stripe. 
+ */ + std::unique_ptr<ColumnReader> buildReader(const Type& type, + StripeStreams& stripe) { + switch (static_cast<int64_t>(type.getKind())) { + case DATE: + case INT: + case LONG: + case SHORT: + return std::unique_ptr<ColumnReader>( + new IntegerColumnReader(type, stripe)); + case BINARY: + case CHAR: + case STRING: + case VARCHAR: + switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())){ + case proto::ColumnEncoding_Kind_DICTIONARY: + case proto::ColumnEncoding_Kind_DICTIONARY_V2: + return std::unique_ptr<ColumnReader>( + new StringDictionaryColumnReader(type, stripe)); + case proto::ColumnEncoding_Kind_DIRECT: + case proto::ColumnEncoding_Kind_DIRECT_V2: + return std::unique_ptr<ColumnReader>( + new StringDirectColumnReader(type, stripe)); + default: + throw NotImplementedYet("buildReader unhandled string encoding"); + } + + case BOOLEAN: + return std::unique_ptr<ColumnReader>( + new BooleanColumnReader(type, stripe)); + + case BYTE: + return std::unique_ptr<ColumnReader>( + new ByteColumnReader(type, stripe)); + + case LIST: + return std::unique_ptr<ColumnReader>( + new ListColumnReader(type, stripe)); + + case MAP: + return std::unique_ptr<ColumnReader>( + new MapColumnReader(type, stripe)); + + case UNION: + return std::unique_ptr<ColumnReader>( + new UnionColumnReader(type, stripe)); + + case STRUCT: + return std::unique_ptr<ColumnReader>( + new StructColumnReader(type, stripe)); + + case FLOAT: + case DOUBLE: + return std::unique_ptr<ColumnReader>( + new DoubleColumnReader(type, stripe)); + + case TIMESTAMP: + return std::unique_ptr<ColumnReader> + (new TimestampColumnReader(type, stripe)); + + case DECIMAL: + // is this a Hive 0.11 or 0.12 file? + if (type.getPrecision() == 0) { + return std::unique_ptr<ColumnReader> + (new DecimalHive11ColumnReader(type, stripe)); + + // can we represent the values using int64_t? + } else if (type.getPrecision() <= + Decimal64ColumnReader::MAX_PRECISION_64) { + return std::unique_ptr<ColumnReader> + (new Decimal64ColumnReader(type, stripe)); + + // otherwise we use the Int128 implementation + } else { + return std::unique_ptr<ColumnReader> + (new Decimal128ColumnReader(type, stripe)); + } + + default: + throw NotImplementedYet("buildReader unhandled type"); + } + } + +} diff --git a/contrib/libs/apache/orc/c++/src/ColumnReader.hh b/contrib/libs/apache/orc/c++/src/ColumnReader.hh index 0c64e5b80f..5023cdfab5 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnReader.hh +++ b/contrib/libs/apache/orc/c++/src/ColumnReader.hh @@ -1,156 +1,156 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_COLUMN_READER_HH -#define ORC_COLUMN_READER_HH - -#include <unordered_map> - -#include "orc/Vector.hh" - -#include "ByteRLE.hh" -#include "Compression.hh" -#include "Timezone.hh" -#include "wrap/orc-proto-wrapper.hh" - -namespace orc { - - class StripeStreams { - public: - virtual ~StripeStreams(); - - /** - * Get the array of booleans for which columns are selected. - * @return the address of an array which contains true at the index of - * each columnId is selected. - */ - virtual const std::vector<bool> getSelectedColumns() const = 0; - - /** - * Get the encoding for the given column for this stripe. - */ - virtual proto::ColumnEncoding getEncoding(uint64_t columnId) const = 0; - - /** - * Get the stream for the given column/kind in this stripe. - * @param columnId the id of the column - * @param kind the kind of the stream - * @param shouldStream should the reading page the stream in - * @return the new stream - */ - virtual std::unique_ptr<SeekableInputStream> - getStream(uint64_t columnId, - proto::Stream_Kind kind, - bool shouldStream) const = 0; - - /** - * Get the memory pool for this reader. - */ - virtual MemoryPool& getMemoryPool() const = 0; - - /** - * Get the writer's timezone, so that we can convert their dates correctly. - */ - virtual const Timezone& getWriterTimezone() const = 0; - - /** - * Get the error stream. - * @return a pointer to the stream that should get error messages - */ - virtual std::ostream* getErrorStream() const = 0; - - /** - * Should the reader throw when the scale overflows when reading Hive 0.11 - * decimals. - * @return true if it should throw - */ - virtual bool getThrowOnHive11DecimalOverflow() const = 0; - - /** - * What is the scale forced on the Hive 0.11 decimals? - * @return the number of scale digits - */ - virtual int32_t getForcedScaleOnHive11Decimal() const = 0; - }; - - /** - * The interface for reading ORC data types. - */ - class ColumnReader { - protected: - std::unique_ptr<ByteRleDecoder> notNullDecoder; - uint64_t columnId; - MemoryPool& memoryPool; - - public: - ColumnReader(const Type& type, StripeStreams& stipe); - - virtual ~ColumnReader(); - - /** - * Skip number of specified rows. - * @param numValues the number of values to skip - * @return the number of non-null values skipped - */ - virtual uint64_t skip(uint64_t numValues); - - /** - * Read the next group of values into this rowBatch. - * @param rowBatch the memory to read into. - * @param numValues the number of values to read - * @param notNull if null, all values are not null. Otherwise, it is - * a mask (with at least numValues bytes) for which values to - * set. - */ - virtual void next(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull); - - /** - * Read the next group of values without decoding - * @param rowBatch the memory to read into. - * @param numValues the number of values to read - * @param notNull if null, all values are not null. Otherwise, it is - * a mask (with at least numValues bytes) for which values to - * set. - */ - virtual void nextEncoded(ColumnVectorBatch& rowBatch, - uint64_t numValues, - char* notNull) - { - rowBatch.isEncoded = false; - next(rowBatch, numValues, notNull); - } - - /** - * Seek to beginning of a row group in the current stripe - * @param positions a list of PositionProviders storing the positions - */ - virtual void seekToRowGroup( - std::unordered_map<uint64_t, PositionProvider>& positions); - - }; - - /** - * Create a reader for the given stripe. 
- */ - std::unique_ptr<ColumnReader> buildReader(const Type& type, - StripeStreams& stripe); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_COLUMN_READER_HH +#define ORC_COLUMN_READER_HH + +#include <unordered_map> + +#include "orc/Vector.hh" + +#include "ByteRLE.hh" +#include "Compression.hh" +#include "Timezone.hh" +#include "wrap/orc-proto-wrapper.hh" + +namespace orc { + + class StripeStreams { + public: + virtual ~StripeStreams(); + + /** + * Get the array of booleans for which columns are selected. + * @return the address of an array which contains true at the index of + * each columnId is selected. + */ + virtual const std::vector<bool> getSelectedColumns() const = 0; + + /** + * Get the encoding for the given column for this stripe. + */ + virtual proto::ColumnEncoding getEncoding(uint64_t columnId) const = 0; + + /** + * Get the stream for the given column/kind in this stripe. + * @param columnId the id of the column + * @param kind the kind of the stream + * @param shouldStream should the reading page the stream in + * @return the new stream + */ + virtual std::unique_ptr<SeekableInputStream> + getStream(uint64_t columnId, + proto::Stream_Kind kind, + bool shouldStream) const = 0; + + /** + * Get the memory pool for this reader. + */ + virtual MemoryPool& getMemoryPool() const = 0; + + /** + * Get the writer's timezone, so that we can convert their dates correctly. + */ + virtual const Timezone& getWriterTimezone() const = 0; + + /** + * Get the error stream. + * @return a pointer to the stream that should get error messages + */ + virtual std::ostream* getErrorStream() const = 0; + + /** + * Should the reader throw when the scale overflows when reading Hive 0.11 + * decimals. + * @return true if it should throw + */ + virtual bool getThrowOnHive11DecimalOverflow() const = 0; + + /** + * What is the scale forced on the Hive 0.11 decimals? + * @return the number of scale digits + */ + virtual int32_t getForcedScaleOnHive11Decimal() const = 0; + }; + + /** + * The interface for reading ORC data types. + */ + class ColumnReader { + protected: + std::unique_ptr<ByteRleDecoder> notNullDecoder; + uint64_t columnId; + MemoryPool& memoryPool; + + public: + ColumnReader(const Type& type, StripeStreams& stipe); + + virtual ~ColumnReader(); + + /** + * Skip number of specified rows. + * @param numValues the number of values to skip + * @return the number of non-null values skipped + */ + virtual uint64_t skip(uint64_t numValues); + + /** + * Read the next group of values into this rowBatch. + * @param rowBatch the memory to read into. + * @param numValues the number of values to read + * @param notNull if null, all values are not null. 
Otherwise, it is + * a mask (with at least numValues bytes) for which values to + * set. + */ + virtual void next(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull); + + /** + * Read the next group of values without decoding + * @param rowBatch the memory to read into. + * @param numValues the number of values to read + * @param notNull if null, all values are not null. Otherwise, it is + * a mask (with at least numValues bytes) for which values to + * set. + */ + virtual void nextEncoded(ColumnVectorBatch& rowBatch, + uint64_t numValues, + char* notNull) + { + rowBatch.isEncoded = false; + next(rowBatch, numValues, notNull); + } + + /** + * Seek to beginning of a row group in the current stripe + * @param positions a list of PositionProviders storing the positions + */ + virtual void seekToRowGroup( + std::unordered_map<uint64_t, PositionProvider>& positions); + + }; + + /** + * Create a reader for the given stripe. + */ + std::unique_ptr<ColumnReader> buildReader(const Type& type, + StripeStreams& stripe); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc index 1408a15457..8d4d00cc61 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc +++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc @@ -1,3013 +1,3013 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/Int128.hh" -#include "orc/Writer.hh" - -#include "ByteRLE.hh" -#include "ColumnWriter.hh" -#include "RLE.hh" -#include "Statistics.hh" -#include "Timezone.hh" - -namespace orc { - StreamsFactory::~StreamsFactory() { - //PASS - } - - class StreamsFactoryImpl : public StreamsFactory { - public: - StreamsFactoryImpl( - const WriterOptions& writerOptions, - OutputStream* outputStream) : - options(writerOptions), - outStream(outputStream) { - } - - virtual std::unique_ptr<BufferedOutputStream> - createStream(proto::Stream_Kind kind) const override; - private: - const WriterOptions& options; - OutputStream* outStream; - }; - - std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream( - proto::Stream_Kind) const { - // In the future, we can decide compression strategy and modifier - // based on stream kind. 
But for now we just use the setting from - // WriterOption - return createCompressor( - options.getCompression(), - outStream, - options.getCompressionStrategy(), - // BufferedOutputStream initial capacity - 1 * 1024 * 1024, - options.getCompressionBlockSize(), - *options.getMemoryPool()); - } - - std::unique_ptr<StreamsFactory> createStreamsFactory( - const WriterOptions& options, - OutputStream* outStream) { - return std::unique_ptr<StreamsFactory>( - new StreamsFactoryImpl(options, outStream)); - } - - RowIndexPositionRecorder::~RowIndexPositionRecorder() { - // PASS - } - - proto::ColumnEncoding_Kind RleVersionMapper(RleVersion rleVersion) - { - switch (rleVersion) - { - case RleVersion_1: - return proto::ColumnEncoding_Kind_DIRECT; - case RleVersion_2: - return proto::ColumnEncoding_Kind_DIRECT_V2; - default: - throw InvalidArgument("Invalid param"); - } - } - - ColumnWriter::ColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - columnId(type.getColumnId()), - colIndexStatistics(), - colStripeStatistics(), - colFileStatistics(), - enableIndex(options.getEnableIndex()), - rowIndex(), - rowIndexEntry(), - rowIndexPosition(), - enableBloomFilter(false), - memPool(*options.getMemoryPool()), - indexStream(), - bloomFilterStream() { - - std::unique_ptr<BufferedOutputStream> presentStream = - factory.createStream(proto::Stream_Kind_PRESENT); - notNullEncoder = createBooleanRleEncoder(std::move(presentStream)); - - colIndexStatistics = createColumnStatistics(type); - colStripeStatistics = createColumnStatistics(type); - colFileStatistics = createColumnStatistics(type); - - if (enableIndex) { - rowIndex = std::unique_ptr<proto::RowIndex>(new proto::RowIndex()); - rowIndexEntry = - std::unique_ptr<proto::RowIndexEntry>(new proto::RowIndexEntry()); - rowIndexPosition = std::unique_ptr<RowIndexPositionRecorder>( - new RowIndexPositionRecorder(*rowIndexEntry)); - indexStream = - factory.createStream(proto::Stream_Kind_ROW_INDEX); - - // BloomFilters for non-UTF8 strings and non-UTC timestamps are not supported - if (options.isColumnUseBloomFilter(columnId) - && options.getBloomFilterVersion() == BloomFilterVersion::UTF8) { - enableBloomFilter = true; - bloomFilter.reset(new BloomFilterImpl( - options.getRowIndexStride(), options.getBloomFilterFPP())); - bloomFilterIndex.reset(new proto::BloomFilterIndex()); - bloomFilterStream = factory.createStream(proto::Stream_Kind_BLOOM_FILTER_UTF8); - } - } - } - - ColumnWriter::~ColumnWriter() { - // PASS - } - - void ColumnWriter::add(ColumnVectorBatch& batch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - notNullEncoder->add(batch.notNull.data() + offset, numValues, incomingMask); - } - - void ColumnWriter::flush(std::vector<proto::Stream>& streams) { - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_PRESENT); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(notNullEncoder->flush()); - streams.push_back(stream); - } - - uint64_t ColumnWriter::getEstimatedSize() const { - return notNullEncoder->getBufferSize(); - } - - void ColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - getProtoBufStatistics(stats, colStripeStatistics.get()); - } - - void ColumnWriter::mergeStripeStatsIntoFileStats() { - colFileStatistics->merge(*colStripeStatistics); - colStripeStatistics->reset(); - } - - void ColumnWriter::mergeRowGroupStatsIntoStripeStats() { - colStripeStatistics->merge(*colIndexStatistics); - 
colIndexStatistics->reset(); - } - - void ColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - getProtoBufStatistics(stats, colFileStatistics.get()); - } - - void ColumnWriter::createRowIndexEntry() { - proto::ColumnStatistics *indexStats = rowIndexEntry->mutable_statistics(); - colIndexStatistics->toProtoBuf(*indexStats); - - *rowIndex->add_entry() = *rowIndexEntry; - - rowIndexEntry->clear_positions(); - rowIndexEntry->clear_statistics(); - - colStripeStatistics->merge(*colIndexStatistics); - colIndexStatistics->reset(); - - addBloomFilterEntry(); - - recordPosition(); - } - - void ColumnWriter::addBloomFilterEntry() { - if (enableBloomFilter) { - BloomFilterUTF8Utils::serialize(*bloomFilter, *bloomFilterIndex->add_bloomfilter()); - bloomFilter->reset(); - } - } - - void ColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { - // write row index to output stream - rowIndex->SerializeToZeroCopyStream(indexStream.get()); - - // construct row index stream - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_ROW_INDEX); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(indexStream->flush()); - streams.push_back(stream); - - // write BLOOM_FILTER_UTF8 stream - if (enableBloomFilter) { - if (!bloomFilterIndex->SerializeToZeroCopyStream(bloomFilterStream.get())) { - throw std::logic_error("Failed to write bloom filter stream."); - } - stream.set_kind(proto::Stream_Kind_BLOOM_FILTER_UTF8); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(bloomFilterStream->flush()); - streams.push_back(stream); - } - } - - void ColumnWriter::recordPosition() const { - notNullEncoder->recordPosition(rowIndexPosition.get()); - } - - void ColumnWriter::reset() { - if (enableIndex) { - // clear row index - rowIndex->clear_entry(); - rowIndexEntry->clear_positions(); - rowIndexEntry->clear_statistics(); - - // write current positions - recordPosition(); - } - - if (enableBloomFilter) { - bloomFilter->reset(); - bloomFilterIndex->clear_bloomfilter(); - } - } - - void ColumnWriter::writeDictionary() { - // PASS - } - - class StructColumnWriter : public ColumnWriter { - public: - StructColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - ~StructColumnWriter() override; - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void mergeStripeStatsIntoFileStats() override; - - virtual void mergeRowGroupStatsIntoStripeStats() override; - - virtual void createRowIndexEntry() override; - - virtual void writeIndex( - std::vector<proto::Stream> &streams) const override; - - virtual void writeDictionary() override; - - virtual void reset() override; - - private: - std::vector<ColumnWriter *> children; - }; - - StructColumnWriter::StructColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options) { - for(unsigned int i = 0; i < type.getSubtypeCount(); ++i) { - const Type& child = 
*type.getSubtype(i); - children.push_back(buildWriter(child, factory, options).release()); - } - - if (enableIndex) { - recordPosition(); - } - } - - StructColumnWriter::~StructColumnWriter() { - for (uint32_t i = 0; i < children.size(); ++i) { - delete children[i]; - } - } - - void StructColumnWriter::add( - ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const StructVectorBatch* structBatch = - dynamic_cast<const StructVectorBatch *>(&rowBatch); - if (structBatch == nullptr) { - throw InvalidArgument("Failed to cast to StructVectorBatch"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - const char* notNull = structBatch->hasNulls ? - structBatch->notNull.data() + offset : nullptr; - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->add(*structBatch->fields[i], offset, numValues, notNull); - } - - // update stats - if (!notNull) { - colIndexStatistics->increase(numValues); - } else { - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull[i]) { - ++count; - } - } - colIndexStatistics->increase(count); - if (count < numValues) { - colIndexStatistics->setHasNull(true); - } - } - } - - void StructColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->flush(streams); - } - } - - void StructColumnWriter::writeIndex( - std::vector<proto::Stream> &streams) const { - ColumnWriter::writeIndex(streams); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->writeIndex(streams); - } - } - - uint64_t StructColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - for (uint32_t i = 0; i < children.size(); ++i) { - size += children[i]->getEstimatedSize(); - } - return size; - } - - void StructColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); - encodings.push_back(encoding); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getColumnEncoding(encodings); - } - } - - void StructColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getStripeStatistics(stats); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getStripeStatistics(stats); - } - } - - void StructColumnWriter::mergeStripeStatsIntoFileStats() { - ColumnWriter::mergeStripeStatsIntoFileStats(); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->mergeStripeStatsIntoFileStats(); - } - } - - void StructColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getFileStatistics(stats); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getFileStatistics(stats); - } - } - - void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() { - ColumnWriter::mergeRowGroupStatsIntoStripeStats(); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->mergeRowGroupStatsIntoStripeStats(); - } - } - - void StructColumnWriter::createRowIndexEntry() { - ColumnWriter::createRowIndexEntry(); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->createRowIndexEntry(); - } - } - - void StructColumnWriter::reset() { - ColumnWriter::reset(); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->reset(); - } - } - - void 
StructColumnWriter::writeDictionary() { - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->writeDictionary(); - } - } - - class IntegerColumnWriter : public ColumnWriter { - public: - IntegerColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void recordPosition() const override; - - protected: - std::unique_ptr<RleEncoder> rleEncoder; - - private: - RleVersion rleVersion; - }; - - IntegerColumnWriter::IntegerColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()) { - std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); - rleEncoder = createRleEncoder( - std::move(dataStream), - true, - rleVersion, - memPool, - options.getAlignedBitpacking()); - - if (enableIndex) { - recordPosition(); - } - } - - void IntegerColumnWriter::add( - ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const LongVectorBatch* longBatch = - dynamic_cast<const LongVectorBatch*>(&rowBatch); - if (longBatch == nullptr) { - throw InvalidArgument("Failed to cast to LongVectorBatch"); - } - IntegerColumnStatisticsImpl* intStats = - dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get()); - if (intStats == nullptr) { - throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - const int64_t* data = longBatch->data.data() + offset; - const char* notNull = longBatch->hasNulls ? 
- longBatch->notNull.data() + offset : nullptr; - - rleEncoder->add(data, numValues, notNull); - - // update stats - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull == nullptr || notNull[i]) { - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(data[i]); - } - intStats->update(data[i], 1); - } - } - intStats->increase(count); - if (count < numValues) { - intStats->setHasNull(true); - } - } - - void IntegerColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_DATA); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(rleEncoder->flush()); - streams.push_back(stream); - } - - uint64_t IntegerColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += rleEncoder->getBufferSize(); - return size; - } - - void IntegerColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - } - - void IntegerColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - rleEncoder->recordPosition(rowIndexPosition.get()); - } - - class ByteColumnWriter : public ColumnWriter { - public: - ByteColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void recordPosition() const override; - - private: - std::unique_ptr<ByteRleEncoder> byteRleEncoder; - }; - - ByteColumnWriter::ByteColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options) { - std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); - byteRleEncoder = createByteRleEncoder(std::move(dataStream)); - - if (enableIndex) { - recordPosition(); - } - } - - void ByteColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch); - if (byteBatch == nullptr) { - throw InvalidArgument("Failed to cast to LongVectorBatch"); - } - IntegerColumnStatisticsImpl* intStats = - dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get()); - if (intStats == nullptr) { - throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - int64_t* data = byteBatch->data.data() + offset; - const char* notNull = byteBatch->hasNulls ? 
- byteBatch->notNull.data() + offset : nullptr; - - char* byteData = reinterpret_cast<char*>(data); - for (uint64_t i = 0; i < numValues; ++i) { - byteData[i] = static_cast<char>(data[i]); - } - byteRleEncoder->add(byteData, numValues, notNull); - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull == nullptr || notNull[i]) { - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(data[i]); - } - intStats->update(static_cast<int64_t>(byteData[i]), 1); - } - } - intStats->increase(count); - if (count < numValues) { - intStats->setHasNull(true); - } - } - - void ByteColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_DATA); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(byteRleEncoder->flush()); - streams.push_back(stream); - } - - uint64_t ByteColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += byteRleEncoder->getBufferSize(); - return size; - } - - void ByteColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - } - - void ByteColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - byteRleEncoder->recordPosition(rowIndexPosition.get()); - } - - class BooleanColumnWriter : public ColumnWriter { - public: - BooleanColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void recordPosition() const override; - - private: - std::unique_ptr<ByteRleEncoder> rleEncoder; - }; - - BooleanColumnWriter::BooleanColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options) { - std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); - rleEncoder = createBooleanRleEncoder(std::move(dataStream)); - - if (enableIndex) { - recordPosition(); - } - } - - void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch); - if (byteBatch == nullptr) { - throw InvalidArgument("Failed to cast to LongVectorBatch"); - } - BooleanColumnStatisticsImpl* boolStats = - dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get()); - if (boolStats == nullptr) { - throw InvalidArgument("Failed to cast to BooleanColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - int64_t* data = byteBatch->data.data() + offset; - const char* notNull = byteBatch->hasNulls ? 
- byteBatch->notNull.data() + offset : nullptr; - - char* byteData = reinterpret_cast<char*>(data); - for (uint64_t i = 0; i < numValues; ++i) { - byteData[i] = static_cast<char>(data[i]); - } - rleEncoder->add(byteData, numValues, notNull); - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull == nullptr || notNull[i]) { - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(data[i]); - } - boolStats->update(byteData[i] != 0, 1); - } - } - boolStats->increase(count); - if (count < numValues) { - boolStats->setHasNull(true); - } - } - - void BooleanColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_DATA); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(rleEncoder->flush()); - streams.push_back(stream); - } - - uint64_t BooleanColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += rleEncoder->getBufferSize(); - return size; - } - - void BooleanColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - } - - void BooleanColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - rleEncoder->recordPosition(rowIndexPosition.get()); - } - - class DoubleColumnWriter : public ColumnWriter { - public: - DoubleColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options, - bool isFloat); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void recordPosition() const override; - - private: - bool isFloat; - std::unique_ptr<AppendOnlyBufferedStream> dataStream; - DataBuffer<char> buffer; - }; - - DoubleColumnWriter::DoubleColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options, - bool isFloatType) : - ColumnWriter(type, factory, options), - isFloat(isFloatType), - buffer(*options.getMemoryPool()) { - dataStream.reset(new AppendOnlyBufferedStream( - factory.createStream(proto::Stream_Kind_DATA))); - buffer.resize(isFloat ? 4 : 8); - - if (enableIndex) { - recordPosition(); - } - } - - // Floating point types are stored using IEEE 754 floating point bit layout. - // Float columns use 4 bytes per value and double columns use 8 bytes. 
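  // ---- Editorial sketch (not part of this commit or of upstream ORC) ----
  // The template that follows serializes the raw IEEE 754 bit pattern
  // least-significant byte first. As a hedged, self-contained illustration, an
  // equivalent for doubles using std::memcpy (hypothetical name encodeDoubleLE)
  // could read:

  #include <cstddef>
  #include <cstdint>
  #include <cstring>

  inline void encodeDoubleLE(double input, char* output) {
    int64_t bits;
    std::memcpy(&bits, &input, sizeof(bits));                    // view the IEEE 754 bits
    for (std::size_t i = 0; i < sizeof(bits); ++i) {
      output[i] = static_cast<char>((bits >> (8 * i)) & 0xff);   // emit LSB first
    }
  }
  // For example, encodeDoubleLE(1.0, buf) fills buf with 00 00 00 00 00 00 F0 3F,
  // since 1.0 has the bit pattern 0x3FF0000000000000.
  // ---- End editorial sketch ----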
- template <typename FLOAT_TYPE, typename INTEGER_TYPE> - inline void encodeFloatNum(FLOAT_TYPE input, char* output) { - INTEGER_TYPE* intBits = reinterpret_cast<INTEGER_TYPE*>(&input); - for (size_t i = 0; i < sizeof(INTEGER_TYPE); ++i) { - output[i] = static_cast<char>(((*intBits) >> (8 * i)) & 0xff); - } - } - - void DoubleColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const DoubleVectorBatch* dblBatch = - dynamic_cast<const DoubleVectorBatch*>(&rowBatch); - if (dblBatch == nullptr) { - throw InvalidArgument("Failed to cast to DoubleVectorBatch"); - } - DoubleColumnStatisticsImpl* doubleStats = - dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get()); - if (doubleStats == nullptr) { - throw InvalidArgument("Failed to cast to DoubleColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - const double* doubleData = dblBatch->data.data() + offset; - const char* notNull = dblBatch->hasNulls ? - dblBatch->notNull.data() + offset : nullptr; - - size_t bytes = isFloat ? 4 : 8; - char* data = buffer.data(); - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - if (isFloat) { - encodeFloatNum<float, int32_t>(static_cast<float>(doubleData[i]), data); - } else { - encodeFloatNum<double, int64_t>(doubleData[i], data); - } - dataStream->write(data, bytes); - ++count; - if (enableBloomFilter) { - bloomFilter->addDouble(doubleData[i]); - } - doubleStats->update(doubleData[i]); - } - } - doubleStats->increase(count); - if (count < numValues) { - doubleStats->setHasNull(true); - } - } - - void DoubleColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_DATA); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(dataStream->flush()); - streams.push_back(stream); - } - - uint64_t DoubleColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += dataStream->getSize(); - return size; - } - - void DoubleColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - } - - void DoubleColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - dataStream->recordPosition(rowIndexPosition.get()); - } - - /** - * Implementation of increasing sorted string dictionary - */ - class SortedStringDictionary { - public: - struct DictEntry { - DictEntry(const char * str, size_t len):data(str),length(len) {} - const char * data; - size_t length; - }; - - SortedStringDictionary():totalLength(0) {} - - // insert a new string into dictionary, return its insertion order - size_t insert(const char * data, size_t len); - - // write dictionary data & length to output buffer - void flush(AppendOnlyBufferedStream * dataStream, - RleEncoder * lengthEncoder) const; - - // reorder input index buffer from insertion order to dictionary order - void reorder(std::vector<int64_t>& idxBuffer) const; - - // get dict entries in insertion order - void getEntriesInInsertionOrder(std::vector<const DictEntry *>&) const; - - // return count of entries - size_t size() const; - - // return total length of strings in the dictioanry - 
uint64_t length() const; - - void clear(); - - private: - struct LessThan { - bool operator()(const DictEntry& left, const DictEntry& right) const { - int ret = memcmp(left.data, right.data, std::min(left.length, right.length)); - if (ret != 0) { - return ret < 0; - } - return left.length < right.length; - } - }; - - std::map<DictEntry, size_t, LessThan> dict; - std::vector<std::vector<char>> data; - uint64_t totalLength; - - // use friend class here to avoid being bothered by const function calls - friend class StringColumnWriter; - friend class CharColumnWriter; - friend class VarCharColumnWriter; - // store indexes of insertion order in the dictionary for not-null rows - std::vector<int64_t> idxInDictBuffer; - }; - - // insert a new string into dictionary, return its insertion order - size_t SortedStringDictionary::insert(const char * str, size_t len) { - auto ret = dict.insert({DictEntry(str, len), dict.size()}); - if (ret.second) { - // make a copy to internal storage - data.push_back(std::vector<char>(len)); - memcpy(data.back().data(), str, len); - // update dictionary entry to link pointer to internal storage - DictEntry * entry = const_cast<DictEntry *>(&(ret.first->first)); - entry->data = data.back().data(); - totalLength += len; - } - return ret.first->second; - } - - // write dictionary data & length to output buffer - void SortedStringDictionary::flush(AppendOnlyBufferedStream * dataStream, - RleEncoder * lengthEncoder) const { - for (auto it = dict.cbegin(); it != dict.cend(); ++it) { - dataStream->write(it->first.data, it->first.length); - lengthEncoder->write(static_cast<int64_t>(it->first.length)); - } - } - - /** - * Reorder input index buffer from insertion order to dictionary order - * - * We require this function because string values are buffered by indexes - * in their insertion order. Until the entire dictionary is complete can - * we get their sorted indexes in the dictionary in that ORC specification - * demands dictionary should be ordered. Therefore this function transforms - * the indexes from insertion order to dictionary value order for final - * output. 
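   *
   * (Editorial illustration, not part of this commit: suppose "banana", "apple"
   * and "cherry" are inserted in that order, receiving insertion indexes 0, 1, 2.
   * Walking the sorted dictionary gives apple -> 0, banana -> 1, cherry -> 2, so
   * the mapping from insertion order to dictionary order is [1, 0, 2], and an
   * index buffer of [0, 1, 2, 0] is rewritten to [1, 0, 2, 1].)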
- */ - void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const { - // iterate the dictionary to get mapping from insertion order to value order - std::vector<size_t> mapping(dict.size()); - size_t dictIdx = 0; - for (auto it = dict.cbegin(); it != dict.cend(); ++it) { - mapping[it->second] = dictIdx++; - } - - // do the transformation - for (size_t i = 0; i != idxBuffer.size(); ++i) { - idxBuffer[i] = static_cast<int64_t>( - mapping[static_cast<size_t>(idxBuffer[i])]); - } - } - - // get dict entries in insertion order - void SortedStringDictionary::getEntriesInInsertionOrder( - std::vector<const DictEntry *>& entries) const { - entries.resize(dict.size()); - for (auto it = dict.cbegin(); it != dict.cend(); ++it) { - entries[it->second] = &(it->first); - } - } - - // return count of entries - size_t SortedStringDictionary::size() const { - return dict.size(); - } - - // return total length of strings in the dictioanry - uint64_t SortedStringDictionary::length() const { - return totalLength; - } - - void SortedStringDictionary::clear() { - totalLength = 0; - data.clear(); - dict.clear(); - } - - class StringColumnWriter : public ColumnWriter { - public: - StringColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void recordPosition() const override; - - virtual void createRowIndexEntry() override; - - virtual void writeDictionary() override; - - virtual void reset() override; - - private: - /** - * dictionary related functions - */ - bool checkDictionaryKeyRatio(); - void createDirectStreams(); - void createDictStreams(); - void deleteDictStreams(); - void fallbackToDirectEncoding(); - - protected: - RleVersion rleVersion; - bool useCompression; - const StreamsFactory& streamsFactory; - bool alignedBitPacking; - - // direct encoding streams - std::unique_ptr<RleEncoder> directLengthEncoder; - std::unique_ptr<AppendOnlyBufferedStream> directDataStream; - - // dictionary encoding streams - std::unique_ptr<RleEncoder> dictDataEncoder; - std::unique_ptr<RleEncoder> dictLengthEncoder; - std::unique_ptr<AppendOnlyBufferedStream> dictStream; - - /** - * dictionary related variables - */ - SortedStringDictionary dictionary; - // whether or not dictionary checking is done - bool doneDictionaryCheck; - // whether or not it should be used - bool useDictionary; - // keys in the dictionary should not exceed this ratio - double dictSizeThreshold; - - // record start row of each row group; null rows are skipped - mutable std::vector<size_t> startOfRowGroups; - }; - - StringColumnWriter::StringColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()), - useCompression(options.getCompression() != CompressionKind_NONE), - streamsFactory(factory), - alignedBitPacking(options.getAlignedBitpacking()), - doneDictionaryCheck(false), - useDictionary(options.getEnableDictionary()), - dictSizeThreshold(options.getDictionaryKeySizeThreshold()){ - if (type.getKind() == TypeKind::BINARY) { - useDictionary = false; - doneDictionaryCheck = true; - } - - if (useDictionary) { - 
createDictStreams(); - } else { - doneDictionaryCheck = true; - createDirectStreams(); - } - - if (enableIndex) { - recordPosition(); - } - } - - void StringColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const StringVectorBatch* stringBatch = - dynamic_cast<const StringVectorBatch*>(&rowBatch); - if (stringBatch == nullptr) { - throw InvalidArgument("Failed to cast to StringVectorBatch"); - } - - StringColumnStatisticsImpl* strStats = - dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get()); - if (strStats == nullptr) { - throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - char *const * data = stringBatch->data.data() + offset; - const int64_t* length = stringBatch->length.data() + offset; - const char* notNull = stringBatch->hasNulls ? - stringBatch->notNull.data() + offset : nullptr; - - if (!useDictionary){ - directLengthEncoder->add(length, numValues, notNull); - } - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - const size_t len = static_cast<size_t>(length[i]); - if (useDictionary) { - size_t index = dictionary.insert(data[i], len); - dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); - } else { - directDataStream->write(data[i], len); - } - if (enableBloomFilter) { - bloomFilter->addBytes(data[i], static_cast<int64_t>(len)); - } - strStats->update(data[i], len); - ++count; - } - } - strStats->increase(count); - if (count < numValues) { - strStats->setHasNull(true); - } - } - - void StringColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - if (useDictionary) { - proto::Stream data; - data.set_kind(proto::Stream_Kind_DATA); - data.set_column(static_cast<uint32_t>(columnId)); - data.set_length(dictDataEncoder->flush()); - streams.push_back(data); - - proto::Stream dict; - dict.set_kind(proto::Stream_Kind_DICTIONARY_DATA); - dict.set_column(static_cast<uint32_t>(columnId)); - dict.set_length(dictStream->flush()); - streams.push_back(dict); - - proto::Stream length; - length.set_kind(proto::Stream_Kind_LENGTH); - length.set_column(static_cast<uint32_t>(columnId)); - length.set_length(dictLengthEncoder->flush()); - streams.push_back(length); - } else { - proto::Stream length; - length.set_kind(proto::Stream_Kind_LENGTH); - length.set_column(static_cast<uint32_t>(columnId)); - length.set_length(directLengthEncoder->flush()); - streams.push_back(length); - - proto::Stream data; - data.set_kind(proto::Stream_Kind_DATA); - data.set_column(static_cast<uint32_t>(columnId)); - data.set_length(directDataStream->flush()); - streams.push_back(data); - } - } - - uint64_t StringColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - if (!useDictionary) { - size += directLengthEncoder->getBufferSize(); - size += directDataStream->getSize(); - } else { - size += dictionary.length(); - size += dictionary.size() * sizeof(int32_t); - size += dictionary.idxInDictBuffer.size() * sizeof(int32_t); - if (useCompression) { - size /= 3; // estimated ratio is 3:1 - } - } - return size; - } - - void StringColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - if (!useDictionary) { - encoding.set_kind(rleVersion == RleVersion_1 ? 
- proto::ColumnEncoding_Kind_DIRECT : - proto::ColumnEncoding_Kind_DIRECT_V2); - } else { - encoding.set_kind(rleVersion == RleVersion_1 ? - proto::ColumnEncoding_Kind_DICTIONARY : - proto::ColumnEncoding_Kind_DICTIONARY_V2); - } - encoding.set_dictionarysize(static_cast<uint32_t>(dictionary.size())); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - } - - void StringColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - if (!useDictionary) { - directDataStream->recordPosition(rowIndexPosition.get()); - directLengthEncoder->recordPosition(rowIndexPosition.get()); - } else { - if (enableIndex) { - startOfRowGroups.push_back(dictionary.idxInDictBuffer.size()); - } - } - } - - bool StringColumnWriter::checkDictionaryKeyRatio() { - if (!doneDictionaryCheck) { - useDictionary = dictionary.size() <= static_cast<size_t>( - static_cast<double>(dictionary.idxInDictBuffer.size()) * dictSizeThreshold); - doneDictionaryCheck = true; - } - - return useDictionary; - } - - void StringColumnWriter::createRowIndexEntry() { - if (useDictionary && !doneDictionaryCheck) { - if (!checkDictionaryKeyRatio()) { - fallbackToDirectEncoding(); - } - } - ColumnWriter::createRowIndexEntry(); - } - - void StringColumnWriter::reset() { - ColumnWriter::reset(); - - dictionary.clear(); - dictionary.idxInDictBuffer.resize(0); - startOfRowGroups.clear(); - startOfRowGroups.push_back(0); - } - - void StringColumnWriter::createDirectStreams() { - std::unique_ptr<BufferedOutputStream> directLengthStream = - streamsFactory.createStream(proto::Stream_Kind_LENGTH); - directLengthEncoder = createRleEncoder(std::move(directLengthStream), - false, - rleVersion, - memPool, - alignedBitPacking); - directDataStream.reset(new AppendOnlyBufferedStream( - streamsFactory.createStream(proto::Stream_Kind_DATA))); - } - - void StringColumnWriter::createDictStreams() { - std::unique_ptr<BufferedOutputStream> dictDataStream = - streamsFactory.createStream(proto::Stream_Kind_DATA); - dictDataEncoder = createRleEncoder(std::move(dictDataStream), - false, - rleVersion, - memPool, - alignedBitPacking); - std::unique_ptr<BufferedOutputStream> dictLengthStream = - streamsFactory.createStream(proto::Stream_Kind_LENGTH); - dictLengthEncoder = createRleEncoder(std::move(dictLengthStream), - false, - rleVersion, - memPool, - alignedBitPacking); - dictStream.reset(new AppendOnlyBufferedStream( - streamsFactory.createStream(proto::Stream_Kind_DICTIONARY_DATA))); - } - - void StringColumnWriter::deleteDictStreams() { - dictDataEncoder.reset(nullptr); - dictLengthEncoder.reset(nullptr); - dictStream.reset(nullptr); - - dictionary.clear(); - dictionary.idxInDictBuffer.clear(); - startOfRowGroups.clear(); - } - - void StringColumnWriter::writeDictionary() { - if (useDictionary && !doneDictionaryCheck) { - // when index is disabled, dictionary check happens while writing 1st stripe - if (!checkDictionaryKeyRatio()) { - fallbackToDirectEncoding(); - return; - } - } - - if (useDictionary) { - // flush dictionary data & length streams - dictionary.flush(dictStream.get(), dictLengthEncoder.get()); - - // convert index from insertion order to dictionary order - dictionary.reorder(dictionary.idxInDictBuffer); - - // write data sequences - int64_t * data = dictionary.idxInDictBuffer.data(); - if (enableIndex) { - size_t prevOffset = 0; - for (size_t i = 0; i < startOfRowGroups.size(); ++i) { - // write sequences in batch for a row group stride - size_t offset = 
startOfRowGroups[i]; - dictDataEncoder->add(data + prevOffset, offset - prevOffset, nullptr); - - // update index positions - int rowGroupId = static_cast<int>(i); - proto::RowIndexEntry* indexEntry = - (rowGroupId < rowIndex->entry_size()) ? - rowIndex->mutable_entry(rowGroupId) : rowIndexEntry.get(); - - // add positions for direct streams - RowIndexPositionRecorder recorder(*indexEntry); - dictDataEncoder->recordPosition(&recorder); - - prevOffset = offset; - } - - dictDataEncoder->add(data + prevOffset, - dictionary.idxInDictBuffer.size() - prevOffset, - nullptr); - } else { - dictDataEncoder->add(data, dictionary.idxInDictBuffer.size(), nullptr); - } - } - } - - void StringColumnWriter::fallbackToDirectEncoding() { - createDirectStreams(); - - if (enableIndex) { - // fallback happens at the 1st row group; - // simply complete positions for direct streams - proto::RowIndexEntry * indexEntry = rowIndexEntry.get(); - RowIndexPositionRecorder recorder(*indexEntry); - directDataStream->recordPosition(&recorder); - directLengthEncoder->recordPosition(&recorder); - } - - // get dictionary entries in insertion order - std::vector<const SortedStringDictionary::DictEntry *> entries; - dictionary.getEntriesInInsertionOrder(entries); - - // store each length of the data into a vector - const SortedStringDictionary::DictEntry * dictEntry = nullptr; - for (uint64_t i = 0; i != dictionary.idxInDictBuffer.size(); ++i) { - // write one row data in direct encoding - dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer[i])]; - directDataStream->write(dictEntry->data, dictEntry->length); - directLengthEncoder->write(static_cast<int64_t>(dictEntry->length)); - } - - deleteDictStreams(); - } - - struct Utf8Utils { - /** - * Counts how many utf-8 chars of the input data - */ - static uint64_t charLength(const char * data, uint64_t length) { - uint64_t chars = 0; - for (uint64_t i = 0; i < length; i++) { - if (isUtfStartByte(data[i])) { - chars++; - } - } - return chars; - } - - /** - * Return the number of bytes required to read at most maxCharLength - * characters in full from a utf-8 encoded byte array provided - * by data. This does not validate utf-8 data, but - * operates correctly on already valid utf-8 data. - * - * @param maxCharLength number of characters required - * @param data the bytes of UTF-8 - * @param length the length of data to truncate - */ - static uint64_t truncateBytesTo(uint64_t maxCharLength, - const char * data, - uint64_t length) { - uint64_t chars = 0; - if (length <= maxCharLength) { - return length; - } - for (uint64_t i = 0; i < length; i++) { - if (isUtfStartByte(data[i])) { - chars++; - } - if (chars > maxCharLength) { - return i; - } - } - // everything fits - return length; - } - - /** - * Checks if b is the first byte of a UTF-8 character. - */ - inline static bool isUtfStartByte(char b) { - return (b & 0xC0) != 0x80; - } - - /** - * Find the start of the last character that ends in the current string. 
- * @param text the bytes of the utf-8 - * @param from the first byte location - * @param until the last byte location - * @return the index of the last character - */ - static uint64_t findLastCharacter(const char * text, uint64_t from, uint64_t until) { - uint64_t posn = until; - /* we don't expect characters more than 5 bytes */ - while (posn >= from) { - if (isUtfStartByte(text[posn])) { - return posn; - } - posn -= 1; - } - /* beginning of a valid char not found */ - throw std::logic_error( - "Could not truncate string, beginning of a valid char not found"); - } - }; - - class CharColumnWriter : public StringColumnWriter { - public: - CharColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - StringColumnWriter(type, factory, options), - maxLength(type.getMaximumLength()), - padBuffer(*options.getMemoryPool()) { - // utf-8 is currently 4 bytes long, but it could be up to 6 - padBuffer.resize(maxLength * 6); - } - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - private: - uint64_t maxLength; - DataBuffer<char> padBuffer; - }; - - void CharColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); - if (charsBatch == nullptr) { - throw InvalidArgument("Failed to cast to StringVectorBatch"); - } - - StringColumnStatisticsImpl* strStats = - dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get()); - if (strStats == nullptr) { - throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - char** data = charsBatch->data.data() + offset; - int64_t* length = charsBatch->length.data() + offset; - const char* notNull = charsBatch->hasNulls ? 
- charsBatch->notNull.data() + offset : nullptr; - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - const char * charData = nullptr; - uint64_t originLength = static_cast<uint64_t>(length[i]); - uint64_t charLength = Utf8Utils::charLength(data[i], originLength); - if (charLength >= maxLength) { - charData = data[i]; - length[i] = static_cast<int64_t>( - Utf8Utils::truncateBytesTo(maxLength, data[i], originLength)); - } else { - charData = padBuffer.data(); - // the padding is exactly 1 byte per char - length[i] = length[i] + static_cast<int64_t>(maxLength - charLength); - memcpy(padBuffer.data(), data[i], originLength); - memset(padBuffer.data() + originLength, - ' ', - static_cast<size_t>(length[i]) - originLength); - } - - if (useDictionary) { - size_t index = dictionary.insert(charData, static_cast<size_t>(length[i])); - dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); - } else { - directDataStream->write(charData, static_cast<size_t>(length[i])); - } - - if (enableBloomFilter) { - bloomFilter->addBytes(data[i], length[i]); - } - strStats->update(charData, static_cast<size_t>(length[i])); - ++count; - } - } - - if (!useDictionary) { - directLengthEncoder->add(length, numValues, notNull); - } - - strStats->increase(count); - if (count < numValues) { - strStats->setHasNull(true); - } - } - - class VarCharColumnWriter : public StringColumnWriter { - public: - VarCharColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - StringColumnWriter(type, factory, options), - maxLength(type.getMaximumLength()) { - // PASS - } - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - private: - uint64_t maxLength; - }; - - void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); - if (charsBatch == nullptr) { - throw InvalidArgument("Failed to cast to StringVectorBatch"); - } - - StringColumnStatisticsImpl* strStats = - dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get()); - if (strStats == nullptr) { - throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - char* const* data = charsBatch->data.data() + offset; - int64_t* length = charsBatch->length.data() + offset; - const char* notNull = charsBatch->hasNulls ? 
- charsBatch->notNull.data() + offset : nullptr; - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - uint64_t itemLength = Utf8Utils::truncateBytesTo( - maxLength, data[i], static_cast<uint64_t>(length[i])); - length[i] = static_cast<int64_t>(itemLength); - - if (useDictionary) { - size_t index = dictionary.insert(data[i], static_cast<size_t>(length[i])); - dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); - } else { - directDataStream->write(data[i], static_cast<size_t>(length[i])); - } - - if (enableBloomFilter) { - bloomFilter->addBytes(data[i], length[i]); - } - strStats->update(data[i], static_cast<size_t>(length[i])); - ++count; - } - } - - if (!useDictionary) { - directLengthEncoder->add(length, numValues, notNull); - } - - strStats->increase(count); - if (count < numValues) { - strStats->setHasNull(true); - } - } - - class BinaryColumnWriter : public StringColumnWriter { - public: - BinaryColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - StringColumnWriter(type, factory, options) { - // PASS - } - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - }; - - void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - StringVectorBatch* binBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); - if (binBatch == nullptr) { - throw InvalidArgument("Failed to cast to StringVectorBatch"); - } - - BinaryColumnStatisticsImpl* binStats = - dynamic_cast<BinaryColumnStatisticsImpl*>(colIndexStatistics.get()); - if (binStats == nullptr) { - throw InvalidArgument("Failed to cast to BinaryColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - char** data = binBatch->data.data() + offset; - int64_t* length = binBatch->length.data() + offset; - const char* notNull = binBatch->hasNulls ? 
- binBatch->notNull.data() + offset : nullptr; - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - uint64_t unsignedLength = static_cast<uint64_t>(length[i]); - if (!notNull || notNull[i]) { - directDataStream->write(data[i], unsignedLength); - - binStats->update(unsignedLength); - ++count; - } - } - directLengthEncoder->add(length, numValues, notNull); - binStats->increase(count); - if (count < numValues) { - binStats->setHasNull(true); - } - } - - class TimestampColumnWriter : public ColumnWriter { - public: - TimestampColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void recordPosition() const override; - - protected: - std::unique_ptr<RleEncoder> secRleEncoder, nanoRleEncoder; - - private: - RleVersion rleVersion; - const Timezone& timezone; - }; - - TimestampColumnWriter::TimestampColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()), - timezone(getTimezoneByName("GMT")){ - std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); - std::unique_ptr<BufferedOutputStream> secondaryStream = - factory.createStream(proto::Stream_Kind_SECONDARY); - secRleEncoder = createRleEncoder(std::move(dataStream), - true, - rleVersion, - memPool, - options.getAlignedBitpacking()); - nanoRleEncoder = createRleEncoder(std::move(secondaryStream), - false, - rleVersion, - memPool, - options.getAlignedBitpacking()); - - if (enableIndex) { - recordPosition(); - } - } - - // Because the number of nanoseconds often has a large number of trailing zeros, - // the number has trailing decimal zero digits removed and the last three bits - // are used to record how many zeros were removed if the trailing zeros are - // more than 2. Thus 1000 nanoseconds would be serialized as 0x0a and - // 100000 would be serialized as 0x0c. - static int64_t formatNano(int64_t nanos) { - if (nanos == 0) { - return 0; - } else if (nanos % 100 != 0) { - return (nanos) << 3; - } else { - nanos /= 100; - int64_t trailingZeros = 1; - while (nanos % 10 == 0 && trailingZeros < 7) { - nanos /= 10; - trailingZeros += 1; - } - return (nanos) << 3 | trailingZeros; - } - } - - void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - TimestampVectorBatch* tsBatch = - dynamic_cast<TimestampVectorBatch*>(&rowBatch); - if (tsBatch == nullptr) { - throw InvalidArgument("Failed to cast to TimestampVectorBatch"); - } - - TimestampColumnStatisticsImpl* tsStats = - dynamic_cast<TimestampColumnStatisticsImpl*>(colIndexStatistics.get()); - if (tsStats == nullptr) { - throw InvalidArgument("Failed to cast to TimestampColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - const char* notNull = tsBatch->hasNulls ? 
- tsBatch->notNull.data() + offset : nullptr; - int64_t *secs = tsBatch->data.data() + offset; - int64_t *nanos = tsBatch->nanoseconds.data() + offset; - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull == nullptr || notNull[i]) { - // TimestampVectorBatch already stores data in UTC - int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000; - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(millsUTC); - } - tsStats->update(millsUTC); - +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/Int128.hh" +#include "orc/Writer.hh" + +#include "ByteRLE.hh" +#include "ColumnWriter.hh" +#include "RLE.hh" +#include "Statistics.hh" +#include "Timezone.hh" + +namespace orc { + StreamsFactory::~StreamsFactory() { + //PASS + } + + class StreamsFactoryImpl : public StreamsFactory { + public: + StreamsFactoryImpl( + const WriterOptions& writerOptions, + OutputStream* outputStream) : + options(writerOptions), + outStream(outputStream) { + } + + virtual std::unique_ptr<BufferedOutputStream> + createStream(proto::Stream_Kind kind) const override; + private: + const WriterOptions& options; + OutputStream* outStream; + }; + + std::unique_ptr<BufferedOutputStream> StreamsFactoryImpl::createStream( + proto::Stream_Kind) const { + // In the future, we can decide compression strategy and modifier + // based on stream kind. 
But for now we just use the setting from + // WriterOption + return createCompressor( + options.getCompression(), + outStream, + options.getCompressionStrategy(), + // BufferedOutputStream initial capacity + 1 * 1024 * 1024, + options.getCompressionBlockSize(), + *options.getMemoryPool()); + } + + std::unique_ptr<StreamsFactory> createStreamsFactory( + const WriterOptions& options, + OutputStream* outStream) { + return std::unique_ptr<StreamsFactory>( + new StreamsFactoryImpl(options, outStream)); + } + + RowIndexPositionRecorder::~RowIndexPositionRecorder() { + // PASS + } + + proto::ColumnEncoding_Kind RleVersionMapper(RleVersion rleVersion) + { + switch (rleVersion) + { + case RleVersion_1: + return proto::ColumnEncoding_Kind_DIRECT; + case RleVersion_2: + return proto::ColumnEncoding_Kind_DIRECT_V2; + default: + throw InvalidArgument("Invalid param"); + } + } + + ColumnWriter::ColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + columnId(type.getColumnId()), + colIndexStatistics(), + colStripeStatistics(), + colFileStatistics(), + enableIndex(options.getEnableIndex()), + rowIndex(), + rowIndexEntry(), + rowIndexPosition(), + enableBloomFilter(false), + memPool(*options.getMemoryPool()), + indexStream(), + bloomFilterStream() { + + std::unique_ptr<BufferedOutputStream> presentStream = + factory.createStream(proto::Stream_Kind_PRESENT); + notNullEncoder = createBooleanRleEncoder(std::move(presentStream)); + + colIndexStatistics = createColumnStatistics(type); + colStripeStatistics = createColumnStatistics(type); + colFileStatistics = createColumnStatistics(type); + + if (enableIndex) { + rowIndex = std::unique_ptr<proto::RowIndex>(new proto::RowIndex()); + rowIndexEntry = + std::unique_ptr<proto::RowIndexEntry>(new proto::RowIndexEntry()); + rowIndexPosition = std::unique_ptr<RowIndexPositionRecorder>( + new RowIndexPositionRecorder(*rowIndexEntry)); + indexStream = + factory.createStream(proto::Stream_Kind_ROW_INDEX); + + // BloomFilters for non-UTF8 strings and non-UTC timestamps are not supported + if (options.isColumnUseBloomFilter(columnId) + && options.getBloomFilterVersion() == BloomFilterVersion::UTF8) { + enableBloomFilter = true; + bloomFilter.reset(new BloomFilterImpl( + options.getRowIndexStride(), options.getBloomFilterFPP())); + bloomFilterIndex.reset(new proto::BloomFilterIndex()); + bloomFilterStream = factory.createStream(proto::Stream_Kind_BLOOM_FILTER_UTF8); + } + } + } + + ColumnWriter::~ColumnWriter() { + // PASS + } + + void ColumnWriter::add(ColumnVectorBatch& batch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + notNullEncoder->add(batch.notNull.data() + offset, numValues, incomingMask); + } + + void ColumnWriter::flush(std::vector<proto::Stream>& streams) { + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_PRESENT); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(notNullEncoder->flush()); + streams.push_back(stream); + } + + uint64_t ColumnWriter::getEstimatedSize() const { + return notNullEncoder->getBufferSize(); + } + + void ColumnWriter::getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + getProtoBufStatistics(stats, colStripeStatistics.get()); + } + + void ColumnWriter::mergeStripeStatsIntoFileStats() { + colFileStatistics->merge(*colStripeStatistics); + colStripeStatistics->reset(); + } + + void ColumnWriter::mergeRowGroupStatsIntoStripeStats() { + colStripeStatistics->merge(*colIndexStatistics); + 
colIndexStatistics->reset(); + } + + void ColumnWriter::getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + getProtoBufStatistics(stats, colFileStatistics.get()); + } + + void ColumnWriter::createRowIndexEntry() { + proto::ColumnStatistics *indexStats = rowIndexEntry->mutable_statistics(); + colIndexStatistics->toProtoBuf(*indexStats); + + *rowIndex->add_entry() = *rowIndexEntry; + + rowIndexEntry->clear_positions(); + rowIndexEntry->clear_statistics(); + + colStripeStatistics->merge(*colIndexStatistics); + colIndexStatistics->reset(); + + addBloomFilterEntry(); + + recordPosition(); + } + + void ColumnWriter::addBloomFilterEntry() { + if (enableBloomFilter) { + BloomFilterUTF8Utils::serialize(*bloomFilter, *bloomFilterIndex->add_bloomfilter()); + bloomFilter->reset(); + } + } + + void ColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { + // write row index to output stream + rowIndex->SerializeToZeroCopyStream(indexStream.get()); + + // construct row index stream + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_ROW_INDEX); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(indexStream->flush()); + streams.push_back(stream); + + // write BLOOM_FILTER_UTF8 stream + if (enableBloomFilter) { + if (!bloomFilterIndex->SerializeToZeroCopyStream(bloomFilterStream.get())) { + throw std::logic_error("Failed to write bloom filter stream."); + } + stream.set_kind(proto::Stream_Kind_BLOOM_FILTER_UTF8); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(bloomFilterStream->flush()); + streams.push_back(stream); + } + } + + void ColumnWriter::recordPosition() const { + notNullEncoder->recordPosition(rowIndexPosition.get()); + } + + void ColumnWriter::reset() { + if (enableIndex) { + // clear row index + rowIndex->clear_entry(); + rowIndexEntry->clear_positions(); + rowIndexEntry->clear_statistics(); + + // write current positions + recordPosition(); + } + + if (enableBloomFilter) { + bloomFilter->reset(); + bloomFilterIndex->clear_bloomfilter(); + } + } + + void ColumnWriter::writeDictionary() { + // PASS + } + + class StructColumnWriter : public ColumnWriter { + public: + StructColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + ~StructColumnWriter() override; + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void mergeStripeStatsIntoFileStats() override; + + virtual void mergeRowGroupStatsIntoStripeStats() override; + + virtual void createRowIndexEntry() override; + + virtual void writeIndex( + std::vector<proto::Stream> &streams) const override; + + virtual void writeDictionary() override; + + virtual void reset() override; + + private: + std::vector<ColumnWriter *> children; + }; + + StructColumnWriter::StructColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options) { + for(unsigned int i = 0; i < type.getSubtypeCount(); ++i) { + const Type& child = 
*type.getSubtype(i); + children.push_back(buildWriter(child, factory, options).release()); + } + + if (enableIndex) { + recordPosition(); + } + } + + StructColumnWriter::~StructColumnWriter() { + for (uint32_t i = 0; i < children.size(); ++i) { + delete children[i]; + } + } + + void StructColumnWriter::add( + ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const StructVectorBatch* structBatch = + dynamic_cast<const StructVectorBatch *>(&rowBatch); + if (structBatch == nullptr) { + throw InvalidArgument("Failed to cast to StructVectorBatch"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + const char* notNull = structBatch->hasNulls ? + structBatch->notNull.data() + offset : nullptr; + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->add(*structBatch->fields[i], offset, numValues, notNull); + } + + // update stats + if (!notNull) { + colIndexStatistics->increase(numValues); + } else { + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull[i]) { + ++count; + } + } + colIndexStatistics->increase(count); + if (count < numValues) { + colIndexStatistics->setHasNull(true); + } + } + } + + void StructColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->flush(streams); + } + } + + void StructColumnWriter::writeIndex( + std::vector<proto::Stream> &streams) const { + ColumnWriter::writeIndex(streams); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->writeIndex(streams); + } + } + + uint64_t StructColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + for (uint32_t i = 0; i < children.size(); ++i) { + size += children[i]->getEstimatedSize(); + } + return size; + } + + void StructColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); + encoding.set_dictionarysize(0); + encodings.push_back(encoding); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->getColumnEncoding(encodings); + } + } + + void StructColumnWriter::getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getStripeStatistics(stats); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->getStripeStatistics(stats); + } + } + + void StructColumnWriter::mergeStripeStatsIntoFileStats() { + ColumnWriter::mergeStripeStatsIntoFileStats(); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->mergeStripeStatsIntoFileStats(); + } + } + + void StructColumnWriter::getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getFileStatistics(stats); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->getFileStatistics(stats); + } + } + + void StructColumnWriter::mergeRowGroupStatsIntoStripeStats() { + ColumnWriter::mergeRowGroupStatsIntoStripeStats(); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->mergeRowGroupStatsIntoStripeStats(); + } + } + + void StructColumnWriter::createRowIndexEntry() { + ColumnWriter::createRowIndexEntry(); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->createRowIndexEntry(); + } + } + + void StructColumnWriter::reset() { + ColumnWriter::reset(); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->reset(); + } + } + + void 
StructColumnWriter::writeDictionary() { + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->writeDictionary(); + } + } + + class IntegerColumnWriter : public ColumnWriter { + public: + IntegerColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + protected: + std::unique_ptr<RleEncoder> rleEncoder; + + private: + RleVersion rleVersion; + }; + + IntegerColumnWriter::IntegerColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()) { + std::unique_ptr<BufferedOutputStream> dataStream = + factory.createStream(proto::Stream_Kind_DATA); + rleEncoder = createRleEncoder( + std::move(dataStream), + true, + rleVersion, + memPool, + options.getAlignedBitpacking()); + + if (enableIndex) { + recordPosition(); + } + } + + void IntegerColumnWriter::add( + ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const LongVectorBatch* longBatch = + dynamic_cast<const LongVectorBatch*>(&rowBatch); + if (longBatch == nullptr) { + throw InvalidArgument("Failed to cast to LongVectorBatch"); + } + IntegerColumnStatisticsImpl* intStats = + dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get()); + if (intStats == nullptr) { + throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const int64_t* data = longBatch->data.data() + offset; + const char* notNull = longBatch->hasNulls ? 
+ longBatch->notNull.data() + offset : nullptr; + + rleEncoder->add(data, numValues, notNull); + + // update stats + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(data[i]); + } + intStats->update(data[i], 1); + } + } + intStats->increase(count); + if (count < numValues) { + intStats->setHasNull(true); + } + } + + void IntegerColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_DATA); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(rleEncoder->flush()); + streams.push_back(stream); + } + + uint64_t IntegerColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += rleEncoder->getBufferSize(); + return size; + } + + void IntegerColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void IntegerColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + rleEncoder->recordPosition(rowIndexPosition.get()); + } + + class ByteColumnWriter : public ColumnWriter { + public: + ByteColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + private: + std::unique_ptr<ByteRleEncoder> byteRleEncoder; + }; + + ByteColumnWriter::ByteColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options) { + std::unique_ptr<BufferedOutputStream> dataStream = + factory.createStream(proto::Stream_Kind_DATA); + byteRleEncoder = createByteRleEncoder(std::move(dataStream)); + + if (enableIndex) { + recordPosition(); + } + } + + void ByteColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch); + if (byteBatch == nullptr) { + throw InvalidArgument("Failed to cast to LongVectorBatch"); + } + IntegerColumnStatisticsImpl* intStats = + dynamic_cast<IntegerColumnStatisticsImpl*>(colIndexStatistics.get()); + if (intStats == nullptr) { + throw InvalidArgument("Failed to cast to IntegerColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + int64_t* data = byteBatch->data.data() + offset; + const char* notNull = byteBatch->hasNulls ? 
+ byteBatch->notNull.data() + offset : nullptr; + + char* byteData = reinterpret_cast<char*>(data); + for (uint64_t i = 0; i < numValues; ++i) { + byteData[i] = static_cast<char>(data[i]); + } + byteRleEncoder->add(byteData, numValues, notNull); + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(data[i]); + } + intStats->update(static_cast<int64_t>(byteData[i]), 1); + } + } + intStats->increase(count); + if (count < numValues) { + intStats->setHasNull(true); + } + } + + void ByteColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_DATA); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(byteRleEncoder->flush()); + streams.push_back(stream); + } + + uint64_t ByteColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += byteRleEncoder->getBufferSize(); + return size; + } + + void ByteColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void ByteColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + byteRleEncoder->recordPosition(rowIndexPosition.get()); + } + + class BooleanColumnWriter : public ColumnWriter { + public: + BooleanColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + private: + std::unique_ptr<ByteRleEncoder> rleEncoder; + }; + + BooleanColumnWriter::BooleanColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options) { + std::unique_ptr<BufferedOutputStream> dataStream = + factory.createStream(proto::Stream_Kind_DATA); + rleEncoder = createBooleanRleEncoder(std::move(dataStream)); + + if (enableIndex) { + recordPosition(); + } + } + + void BooleanColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + LongVectorBatch* byteBatch = dynamic_cast<LongVectorBatch*>(&rowBatch); + if (byteBatch == nullptr) { + throw InvalidArgument("Failed to cast to LongVectorBatch"); + } + BooleanColumnStatisticsImpl* boolStats = + dynamic_cast<BooleanColumnStatisticsImpl*>(colIndexStatistics.get()); + if (boolStats == nullptr) { + throw InvalidArgument("Failed to cast to BooleanColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + int64_t* data = byteBatch->data.data() + offset; + const char* notNull = byteBatch->hasNulls ? 
+ byteBatch->notNull.data() + offset : nullptr; + + char* byteData = reinterpret_cast<char*>(data); + for (uint64_t i = 0; i < numValues; ++i) { + byteData[i] = static_cast<char>(data[i]); + } + rleEncoder->add(byteData, numValues, notNull); + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(data[i]); + } + boolStats->update(byteData[i] != 0, 1); + } + } + boolStats->increase(count); + if (count < numValues) { + boolStats->setHasNull(true); + } + } + + void BooleanColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_DATA); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(rleEncoder->flush()); + streams.push_back(stream); + } + + uint64_t BooleanColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += rleEncoder->getBufferSize(); + return size; + } + + void BooleanColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void BooleanColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + rleEncoder->recordPosition(rowIndexPosition.get()); + } + + class DoubleColumnWriter : public ColumnWriter { + public: + DoubleColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options, + bool isFloat); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + private: + bool isFloat; + std::unique_ptr<AppendOnlyBufferedStream> dataStream; + DataBuffer<char> buffer; + }; + + DoubleColumnWriter::DoubleColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options, + bool isFloatType) : + ColumnWriter(type, factory, options), + isFloat(isFloatType), + buffer(*options.getMemoryPool()) { + dataStream.reset(new AppendOnlyBufferedStream( + factory.createStream(proto::Stream_Kind_DATA))); + buffer.resize(isFloat ? 4 : 8); + + if (enableIndex) { + recordPosition(); + } + } + + // Floating point types are stored using IEEE 754 floating point bit layout. + // Float columns use 4 bytes per value and double columns use 8 bytes. 
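+  // The bytes are written least-significant first, matching the layout produced
+  // by encodeFloatNum below. For example, the double 1.0 has the bit pattern
+  // 0x3FF0000000000000 and is serialized as the byte sequence
+  // 00 00 00 00 00 00 F0 3F.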
+ template <typename FLOAT_TYPE, typename INTEGER_TYPE> + inline void encodeFloatNum(FLOAT_TYPE input, char* output) { + INTEGER_TYPE* intBits = reinterpret_cast<INTEGER_TYPE*>(&input); + for (size_t i = 0; i < sizeof(INTEGER_TYPE); ++i) { + output[i] = static_cast<char>(((*intBits) >> (8 * i)) & 0xff); + } + } + + void DoubleColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const DoubleVectorBatch* dblBatch = + dynamic_cast<const DoubleVectorBatch*>(&rowBatch); + if (dblBatch == nullptr) { + throw InvalidArgument("Failed to cast to DoubleVectorBatch"); + } + DoubleColumnStatisticsImpl* doubleStats = + dynamic_cast<DoubleColumnStatisticsImpl*>(colIndexStatistics.get()); + if (doubleStats == nullptr) { + throw InvalidArgument("Failed to cast to DoubleColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const double* doubleData = dblBatch->data.data() + offset; + const char* notNull = dblBatch->hasNulls ? + dblBatch->notNull.data() + offset : nullptr; + + size_t bytes = isFloat ? 4 : 8; + char* data = buffer.data(); + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + if (isFloat) { + encodeFloatNum<float, int32_t>(static_cast<float>(doubleData[i]), data); + } else { + encodeFloatNum<double, int64_t>(doubleData[i], data); + } + dataStream->write(data, bytes); + ++count; + if (enableBloomFilter) { + bloomFilter->addDouble(doubleData[i]); + } + doubleStats->update(doubleData[i]); + } + } + doubleStats->increase(count); + if (count < numValues) { + doubleStats->setHasNull(true); + } + } + + void DoubleColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_DATA); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(dataStream->flush()); + streams.push_back(stream); + } + + uint64_t DoubleColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += dataStream->getSize(); + return size; + } + + void DoubleColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void DoubleColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + dataStream->recordPosition(rowIndexPosition.get()); + } + + /** + * Implementation of increasing sorted string dictionary + */ + class SortedStringDictionary { + public: + struct DictEntry { + DictEntry(const char * str, size_t len):data(str),length(len) {} + const char * data; + size_t length; + }; + + SortedStringDictionary():totalLength(0) {} + + // insert a new string into dictionary, return its insertion order + size_t insert(const char * data, size_t len); + + // write dictionary data & length to output buffer + void flush(AppendOnlyBufferedStream * dataStream, + RleEncoder * lengthEncoder) const; + + // reorder input index buffer from insertion order to dictionary order + void reorder(std::vector<int64_t>& idxBuffer) const; + + // get dict entries in insertion order + void getEntriesInInsertionOrder(std::vector<const DictEntry *>&) const; + + // return count of entries + size_t size() const; + + // return total length of strings in the dictioanry + 
uint64_t length() const; + + void clear(); + + private: + struct LessThan { + bool operator()(const DictEntry& left, const DictEntry& right) const { + int ret = memcmp(left.data, right.data, std::min(left.length, right.length)); + if (ret != 0) { + return ret < 0; + } + return left.length < right.length; + } + }; + + std::map<DictEntry, size_t, LessThan> dict; + std::vector<std::vector<char>> data; + uint64_t totalLength; + + // use friend class here to avoid being bothered by const function calls + friend class StringColumnWriter; + friend class CharColumnWriter; + friend class VarCharColumnWriter; + // store indexes of insertion order in the dictionary for not-null rows + std::vector<int64_t> idxInDictBuffer; + }; + + // insert a new string into dictionary, return its insertion order + size_t SortedStringDictionary::insert(const char * str, size_t len) { + auto ret = dict.insert({DictEntry(str, len), dict.size()}); + if (ret.second) { + // make a copy to internal storage + data.push_back(std::vector<char>(len)); + memcpy(data.back().data(), str, len); + // update dictionary entry to link pointer to internal storage + DictEntry * entry = const_cast<DictEntry *>(&(ret.first->first)); + entry->data = data.back().data(); + totalLength += len; + } + return ret.first->second; + } + + // write dictionary data & length to output buffer + void SortedStringDictionary::flush(AppendOnlyBufferedStream * dataStream, + RleEncoder * lengthEncoder) const { + for (auto it = dict.cbegin(); it != dict.cend(); ++it) { + dataStream->write(it->first.data, it->first.length); + lengthEncoder->write(static_cast<int64_t>(it->first.length)); + } + } + + /** + * Reorder input index buffer from insertion order to dictionary order + * + * We require this function because string values are buffered by indexes + * in their insertion order. Until the entire dictionary is complete can + * we get their sorted indexes in the dictionary in that ORC specification + * demands dictionary should be ordered. Therefore this function transforms + * the indexes from insertion order to dictionary value order for final + * output. 
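+   *
+   * For example, if "banana", "apple" and "cherry" were inserted in that order,
+   * their insertion-order indexes are 0, 1 and 2 while their sorted positions
+   * in the dictionary are 1, 0 and 2, so an index buffer {0, 1, 2, 0} is
+   * rewritten as {1, 0, 2, 1}.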
+ */ + void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const { + // iterate the dictionary to get mapping from insertion order to value order + std::vector<size_t> mapping(dict.size()); + size_t dictIdx = 0; + for (auto it = dict.cbegin(); it != dict.cend(); ++it) { + mapping[it->second] = dictIdx++; + } + + // do the transformation + for (size_t i = 0; i != idxBuffer.size(); ++i) { + idxBuffer[i] = static_cast<int64_t>( + mapping[static_cast<size_t>(idxBuffer[i])]); + } + } + + // get dict entries in insertion order + void SortedStringDictionary::getEntriesInInsertionOrder( + std::vector<const DictEntry *>& entries) const { + entries.resize(dict.size()); + for (auto it = dict.cbegin(); it != dict.cend(); ++it) { + entries[it->second] = &(it->first); + } + } + + // return count of entries + size_t SortedStringDictionary::size() const { + return dict.size(); + } + + // return total length of strings in the dictioanry + uint64_t SortedStringDictionary::length() const { + return totalLength; + } + + void SortedStringDictionary::clear() { + totalLength = 0; + data.clear(); + dict.clear(); + } + + class StringColumnWriter : public ColumnWriter { + public: + StringColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + virtual void createRowIndexEntry() override; + + virtual void writeDictionary() override; + + virtual void reset() override; + + private: + /** + * dictionary related functions + */ + bool checkDictionaryKeyRatio(); + void createDirectStreams(); + void createDictStreams(); + void deleteDictStreams(); + void fallbackToDirectEncoding(); + + protected: + RleVersion rleVersion; + bool useCompression; + const StreamsFactory& streamsFactory; + bool alignedBitPacking; + + // direct encoding streams + std::unique_ptr<RleEncoder> directLengthEncoder; + std::unique_ptr<AppendOnlyBufferedStream> directDataStream; + + // dictionary encoding streams + std::unique_ptr<RleEncoder> dictDataEncoder; + std::unique_ptr<RleEncoder> dictLengthEncoder; + std::unique_ptr<AppendOnlyBufferedStream> dictStream; + + /** + * dictionary related variables + */ + SortedStringDictionary dictionary; + // whether or not dictionary checking is done + bool doneDictionaryCheck; + // whether or not it should be used + bool useDictionary; + // keys in the dictionary should not exceed this ratio + double dictSizeThreshold; + + // record start row of each row group; null rows are skipped + mutable std::vector<size_t> startOfRowGroups; + }; + + StringColumnWriter::StringColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()), + useCompression(options.getCompression() != CompressionKind_NONE), + streamsFactory(factory), + alignedBitPacking(options.getAlignedBitpacking()), + doneDictionaryCheck(false), + useDictionary(options.getEnableDictionary()), + dictSizeThreshold(options.getDictionaryKeySizeThreshold()){ + if (type.getKind() == TypeKind::BINARY) { + useDictionary = false; + doneDictionaryCheck = true; + } + + if (useDictionary) { + 
createDictStreams(); + } else { + doneDictionaryCheck = true; + createDirectStreams(); + } + + if (enableIndex) { + recordPosition(); + } + } + + void StringColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const StringVectorBatch* stringBatch = + dynamic_cast<const StringVectorBatch*>(&rowBatch); + if (stringBatch == nullptr) { + throw InvalidArgument("Failed to cast to StringVectorBatch"); + } + + StringColumnStatisticsImpl* strStats = + dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get()); + if (strStats == nullptr) { + throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + char *const * data = stringBatch->data.data() + offset; + const int64_t* length = stringBatch->length.data() + offset; + const char* notNull = stringBatch->hasNulls ? + stringBatch->notNull.data() + offset : nullptr; + + if (!useDictionary){ + directLengthEncoder->add(length, numValues, notNull); + } + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + const size_t len = static_cast<size_t>(length[i]); + if (useDictionary) { + size_t index = dictionary.insert(data[i], len); + dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); + } else { + directDataStream->write(data[i], len); + } + if (enableBloomFilter) { + bloomFilter->addBytes(data[i], static_cast<int64_t>(len)); + } + strStats->update(data[i], len); + ++count; + } + } + strStats->increase(count); + if (count < numValues) { + strStats->setHasNull(true); + } + } + + void StringColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + if (useDictionary) { + proto::Stream data; + data.set_kind(proto::Stream_Kind_DATA); + data.set_column(static_cast<uint32_t>(columnId)); + data.set_length(dictDataEncoder->flush()); + streams.push_back(data); + + proto::Stream dict; + dict.set_kind(proto::Stream_Kind_DICTIONARY_DATA); + dict.set_column(static_cast<uint32_t>(columnId)); + dict.set_length(dictStream->flush()); + streams.push_back(dict); + + proto::Stream length; + length.set_kind(proto::Stream_Kind_LENGTH); + length.set_column(static_cast<uint32_t>(columnId)); + length.set_length(dictLengthEncoder->flush()); + streams.push_back(length); + } else { + proto::Stream length; + length.set_kind(proto::Stream_Kind_LENGTH); + length.set_column(static_cast<uint32_t>(columnId)); + length.set_length(directLengthEncoder->flush()); + streams.push_back(length); + + proto::Stream data; + data.set_kind(proto::Stream_Kind_DATA); + data.set_column(static_cast<uint32_t>(columnId)); + data.set_length(directDataStream->flush()); + streams.push_back(data); + } + } + + uint64_t StringColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + if (!useDictionary) { + size += directLengthEncoder->getBufferSize(); + size += directDataStream->getSize(); + } else { + size += dictionary.length(); + size += dictionary.size() * sizeof(int32_t); + size += dictionary.idxInDictBuffer.size() * sizeof(int32_t); + if (useCompression) { + size /= 3; // estimated ratio is 3:1 + } + } + return size; + } + + void StringColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + if (!useDictionary) { + encoding.set_kind(rleVersion == RleVersion_1 ? 
+ proto::ColumnEncoding_Kind_DIRECT : + proto::ColumnEncoding_Kind_DIRECT_V2); + } else { + encoding.set_kind(rleVersion == RleVersion_1 ? + proto::ColumnEncoding_Kind_DICTIONARY : + proto::ColumnEncoding_Kind_DICTIONARY_V2); + } + encoding.set_dictionarysize(static_cast<uint32_t>(dictionary.size())); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void StringColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + if (!useDictionary) { + directDataStream->recordPosition(rowIndexPosition.get()); + directLengthEncoder->recordPosition(rowIndexPosition.get()); + } else { + if (enableIndex) { + startOfRowGroups.push_back(dictionary.idxInDictBuffer.size()); + } + } + } + + bool StringColumnWriter::checkDictionaryKeyRatio() { + if (!doneDictionaryCheck) { + useDictionary = dictionary.size() <= static_cast<size_t>( + static_cast<double>(dictionary.idxInDictBuffer.size()) * dictSizeThreshold); + doneDictionaryCheck = true; + } + + return useDictionary; + } + + void StringColumnWriter::createRowIndexEntry() { + if (useDictionary && !doneDictionaryCheck) { + if (!checkDictionaryKeyRatio()) { + fallbackToDirectEncoding(); + } + } + ColumnWriter::createRowIndexEntry(); + } + + void StringColumnWriter::reset() { + ColumnWriter::reset(); + + dictionary.clear(); + dictionary.idxInDictBuffer.resize(0); + startOfRowGroups.clear(); + startOfRowGroups.push_back(0); + } + + void StringColumnWriter::createDirectStreams() { + std::unique_ptr<BufferedOutputStream> directLengthStream = + streamsFactory.createStream(proto::Stream_Kind_LENGTH); + directLengthEncoder = createRleEncoder(std::move(directLengthStream), + false, + rleVersion, + memPool, + alignedBitPacking); + directDataStream.reset(new AppendOnlyBufferedStream( + streamsFactory.createStream(proto::Stream_Kind_DATA))); + } + + void StringColumnWriter::createDictStreams() { + std::unique_ptr<BufferedOutputStream> dictDataStream = + streamsFactory.createStream(proto::Stream_Kind_DATA); + dictDataEncoder = createRleEncoder(std::move(dictDataStream), + false, + rleVersion, + memPool, + alignedBitPacking); + std::unique_ptr<BufferedOutputStream> dictLengthStream = + streamsFactory.createStream(proto::Stream_Kind_LENGTH); + dictLengthEncoder = createRleEncoder(std::move(dictLengthStream), + false, + rleVersion, + memPool, + alignedBitPacking); + dictStream.reset(new AppendOnlyBufferedStream( + streamsFactory.createStream(proto::Stream_Kind_DICTIONARY_DATA))); + } + + void StringColumnWriter::deleteDictStreams() { + dictDataEncoder.reset(nullptr); + dictLengthEncoder.reset(nullptr); + dictStream.reset(nullptr); + + dictionary.clear(); + dictionary.idxInDictBuffer.clear(); + startOfRowGroups.clear(); + } + + void StringColumnWriter::writeDictionary() { + if (useDictionary && !doneDictionaryCheck) { + // when index is disabled, dictionary check happens while writing 1st stripe + if (!checkDictionaryKeyRatio()) { + fallbackToDirectEncoding(); + return; + } + } + + if (useDictionary) { + // flush dictionary data & length streams + dictionary.flush(dictStream.get(), dictLengthEncoder.get()); + + // convert index from insertion order to dictionary order + dictionary.reorder(dictionary.idxInDictBuffer); + + // write data sequences + int64_t * data = dictionary.idxInDictBuffer.data(); + if (enableIndex) { + size_t prevOffset = 0; + for (size_t i = 0; i < startOfRowGroups.size(); ++i) { + // write sequences in batch for a row group stride + size_t offset = 
startOfRowGroups[i]; + dictDataEncoder->add(data + prevOffset, offset - prevOffset, nullptr); + + // update index positions + int rowGroupId = static_cast<int>(i); + proto::RowIndexEntry* indexEntry = + (rowGroupId < rowIndex->entry_size()) ? + rowIndex->mutable_entry(rowGroupId) : rowIndexEntry.get(); + + // add positions for direct streams + RowIndexPositionRecorder recorder(*indexEntry); + dictDataEncoder->recordPosition(&recorder); + + prevOffset = offset; + } + + dictDataEncoder->add(data + prevOffset, + dictionary.idxInDictBuffer.size() - prevOffset, + nullptr); + } else { + dictDataEncoder->add(data, dictionary.idxInDictBuffer.size(), nullptr); + } + } + } + + void StringColumnWriter::fallbackToDirectEncoding() { + createDirectStreams(); + + if (enableIndex) { + // fallback happens at the 1st row group; + // simply complete positions for direct streams + proto::RowIndexEntry * indexEntry = rowIndexEntry.get(); + RowIndexPositionRecorder recorder(*indexEntry); + directDataStream->recordPosition(&recorder); + directLengthEncoder->recordPosition(&recorder); + } + + // get dictionary entries in insertion order + std::vector<const SortedStringDictionary::DictEntry *> entries; + dictionary.getEntriesInInsertionOrder(entries); + + // store each length of the data into a vector + const SortedStringDictionary::DictEntry * dictEntry = nullptr; + for (uint64_t i = 0; i != dictionary.idxInDictBuffer.size(); ++i) { + // write one row data in direct encoding + dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer[i])]; + directDataStream->write(dictEntry->data, dictEntry->length); + directLengthEncoder->write(static_cast<int64_t>(dictEntry->length)); + } + + deleteDictStreams(); + } + + struct Utf8Utils { + /** + * Counts how many utf-8 chars of the input data + */ + static uint64_t charLength(const char * data, uint64_t length) { + uint64_t chars = 0; + for (uint64_t i = 0; i < length; i++) { + if (isUtfStartByte(data[i])) { + chars++; + } + } + return chars; + } + + /** + * Return the number of bytes required to read at most maxCharLength + * characters in full from a utf-8 encoded byte array provided + * by data. This does not validate utf-8 data, but + * operates correctly on already valid utf-8 data. + * + * @param maxCharLength number of characters required + * @param data the bytes of UTF-8 + * @param length the length of data to truncate + */ + static uint64_t truncateBytesTo(uint64_t maxCharLength, + const char * data, + uint64_t length) { + uint64_t chars = 0; + if (length <= maxCharLength) { + return length; + } + for (uint64_t i = 0; i < length; i++) { + if (isUtfStartByte(data[i])) { + chars++; + } + if (chars > maxCharLength) { + return i; + } + } + // everything fits + return length; + } + + /** + * Checks if b is the first byte of a UTF-8 character. + */ + inline static bool isUtfStartByte(char b) { + return (b & 0xC0) != 0x80; + } + + /** + * Find the start of the last character that ends in the current string. 
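+     *
+     * For example, given text whose last character is the three-byte UTF-8
+     * sequence E2 82 AC (U+20AC) and until pointing at the final AC byte,
+     * the scan walks back over the continuation bytes and returns the
+     * position of the E2 start byte.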
+ * @param text the bytes of the utf-8 + * @param from the first byte location + * @param until the last byte location + * @return the index of the last character + */ + static uint64_t findLastCharacter(const char * text, uint64_t from, uint64_t until) { + uint64_t posn = until; + /* we don't expect characters more than 5 bytes */ + while (posn >= from) { + if (isUtfStartByte(text[posn])) { + return posn; + } + posn -= 1; + } + /* beginning of a valid char not found */ + throw std::logic_error( + "Could not truncate string, beginning of a valid char not found"); + } + }; + + class CharColumnWriter : public StringColumnWriter { + public: + CharColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + StringColumnWriter(type, factory, options), + maxLength(type.getMaximumLength()), + padBuffer(*options.getMemoryPool()) { + // utf-8 is currently 4 bytes long, but it could be up to 6 + padBuffer.resize(maxLength * 6); + } + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + private: + uint64_t maxLength; + DataBuffer<char> padBuffer; + }; + + void CharColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); + if (charsBatch == nullptr) { + throw InvalidArgument("Failed to cast to StringVectorBatch"); + } + + StringColumnStatisticsImpl* strStats = + dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get()); + if (strStats == nullptr) { + throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + char** data = charsBatch->data.data() + offset; + int64_t* length = charsBatch->length.data() + offset; + const char* notNull = charsBatch->hasNulls ? 
+ charsBatch->notNull.data() + offset : nullptr; + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + const char * charData = nullptr; + uint64_t originLength = static_cast<uint64_t>(length[i]); + uint64_t charLength = Utf8Utils::charLength(data[i], originLength); + if (charLength >= maxLength) { + charData = data[i]; + length[i] = static_cast<int64_t>( + Utf8Utils::truncateBytesTo(maxLength, data[i], originLength)); + } else { + charData = padBuffer.data(); + // the padding is exactly 1 byte per char + length[i] = length[i] + static_cast<int64_t>(maxLength - charLength); + memcpy(padBuffer.data(), data[i], originLength); + memset(padBuffer.data() + originLength, + ' ', + static_cast<size_t>(length[i]) - originLength); + } + + if (useDictionary) { + size_t index = dictionary.insert(charData, static_cast<size_t>(length[i])); + dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); + } else { + directDataStream->write(charData, static_cast<size_t>(length[i])); + } + + if (enableBloomFilter) { + bloomFilter->addBytes(data[i], length[i]); + } + strStats->update(charData, static_cast<size_t>(length[i])); + ++count; + } + } + + if (!useDictionary) { + directLengthEncoder->add(length, numValues, notNull); + } + + strStats->increase(count); + if (count < numValues) { + strStats->setHasNull(true); + } + } + + class VarCharColumnWriter : public StringColumnWriter { + public: + VarCharColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + StringColumnWriter(type, factory, options), + maxLength(type.getMaximumLength()) { + // PASS + } + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + private: + uint64_t maxLength; + }; + + void VarCharColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + StringVectorBatch* charsBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); + if (charsBatch == nullptr) { + throw InvalidArgument("Failed to cast to StringVectorBatch"); + } + + StringColumnStatisticsImpl* strStats = + dynamic_cast<StringColumnStatisticsImpl*>(colIndexStatistics.get()); + if (strStats == nullptr) { + throw InvalidArgument("Failed to cast to StringColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + char* const* data = charsBatch->data.data() + offset; + int64_t* length = charsBatch->length.data() + offset; + const char* notNull = charsBatch->hasNulls ? 
+ charsBatch->notNull.data() + offset : nullptr; + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + uint64_t itemLength = Utf8Utils::truncateBytesTo( + maxLength, data[i], static_cast<uint64_t>(length[i])); + length[i] = static_cast<int64_t>(itemLength); + + if (useDictionary) { + size_t index = dictionary.insert(data[i], static_cast<size_t>(length[i])); + dictionary.idxInDictBuffer.push_back(static_cast<int64_t>(index)); + } else { + directDataStream->write(data[i], static_cast<size_t>(length[i])); + } + + if (enableBloomFilter) { + bloomFilter->addBytes(data[i], length[i]); + } + strStats->update(data[i], static_cast<size_t>(length[i])); + ++count; + } + } + + if (!useDictionary) { + directLengthEncoder->add(length, numValues, notNull); + } + + strStats->increase(count); + if (count < numValues) { + strStats->setHasNull(true); + } + } + + class BinaryColumnWriter : public StringColumnWriter { + public: + BinaryColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + StringColumnWriter(type, factory, options) { + // PASS + } + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + }; + + void BinaryColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + StringVectorBatch* binBatch = dynamic_cast<StringVectorBatch*>(&rowBatch); + if (binBatch == nullptr) { + throw InvalidArgument("Failed to cast to StringVectorBatch"); + } + + BinaryColumnStatisticsImpl* binStats = + dynamic_cast<BinaryColumnStatisticsImpl*>(colIndexStatistics.get()); + if (binStats == nullptr) { + throw InvalidArgument("Failed to cast to BinaryColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + char** data = binBatch->data.data() + offset; + int64_t* length = binBatch->length.data() + offset; + const char* notNull = binBatch->hasNulls ? 
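// Unlike CHAR and VARCHAR, the BINARY writer below never takes the dictionary
// path: every value is written directly to directDataStream, lengths always go
// through directLengthEncoder, and the statistics object only accumulates the
// total byte count (there is no bloom-filter update in this add()).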
+ binBatch->notNull.data() + offset : nullptr; + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + uint64_t unsignedLength = static_cast<uint64_t>(length[i]); + if (!notNull || notNull[i]) { + directDataStream->write(data[i], unsignedLength); + + binStats->update(unsignedLength); + ++count; + } + } + directLengthEncoder->add(length, numValues, notNull); + binStats->increase(count); + if (count < numValues) { + binStats->setHasNull(true); + } + } + + class TimestampColumnWriter : public ColumnWriter { + public: + TimestampColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + protected: + std::unique_ptr<RleEncoder> secRleEncoder, nanoRleEncoder; + + private: + RleVersion rleVersion; + const Timezone& timezone; + }; + + TimestampColumnWriter::TimestampColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()), + timezone(getTimezoneByName("GMT")){ + std::unique_ptr<BufferedOutputStream> dataStream = + factory.createStream(proto::Stream_Kind_DATA); + std::unique_ptr<BufferedOutputStream> secondaryStream = + factory.createStream(proto::Stream_Kind_SECONDARY); + secRleEncoder = createRleEncoder(std::move(dataStream), + true, + rleVersion, + memPool, + options.getAlignedBitpacking()); + nanoRleEncoder = createRleEncoder(std::move(secondaryStream), + false, + rleVersion, + memPool, + options.getAlignedBitpacking()); + + if (enableIndex) { + recordPosition(); + } + } + + // Because the number of nanoseconds often has a large number of trailing zeros, + // the number has trailing decimal zero digits removed and the last three bits + // are used to record how many zeros were removed if the trailing zeros are + // more than 2. Thus 1000 nanoseconds would be serialized as 0x0a and + // 100000 would be serialized as 0x0c. + static int64_t formatNano(int64_t nanos) { + if (nanos == 0) { + return 0; + } else if (nanos % 100 != 0) { + return (nanos) << 3; + } else { + nanos /= 100; + int64_t trailingZeros = 1; + while (nanos % 10 == 0 && trailingZeros < 7) { + nanos /= 10; + trailingZeros += 1; + } + return (nanos) << 3 | trailingZeros; + } + } + + void TimestampColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + TimestampVectorBatch* tsBatch = + dynamic_cast<TimestampVectorBatch*>(&rowBatch); + if (tsBatch == nullptr) { + throw InvalidArgument("Failed to cast to TimestampVectorBatch"); + } + + TimestampColumnStatisticsImpl* tsStats = + dynamic_cast<TimestampColumnStatisticsImpl*>(colIndexStatistics.get()); + if (tsStats == nullptr) { + throw InvalidArgument("Failed to cast to TimestampColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const char* notNull = tsBatch->hasNulls ? 
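// Worked example for formatNano above (illustrative): 1000 is divisible by 100,
// leaving 10 with trailingZeros = 1; one more division by 10 leaves 1 with
// trailingZeros = 2, so the result is (1 << 3) | 2 == 0x0a. 100000 reduces the
// same way to 1 with trailingZeros = 4, giving (1 << 3) | 4 == 0x0c. A count
// with no factor of 100, e.g. 999, is simply shifted: 999 << 3.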
+ tsBatch->notNull.data() + offset : nullptr; + int64_t *secs = tsBatch->data.data() + offset; + int64_t *nanos = tsBatch->nanoseconds.data() + offset; + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull == nullptr || notNull[i]) { + // TimestampVectorBatch already stores data in UTC + int64_t millsUTC = secs[i] * 1000 + nanos[i] / 1000000; + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(millsUTC); + } + tsStats->update(millsUTC); + if (secs[i] < 0 && nanos[i] > 999999) { - secs[i] += 1; - } - - secs[i] -= timezone.getEpoch(); - nanos[i] = formatNano(nanos[i]); - } - } - tsStats->increase(count); - if (count < numValues) { - tsStats->setHasNull(true); - } - - secRleEncoder->add(secs, numValues, notNull); - nanoRleEncoder->add(nanos, numValues, notNull); - } - - void TimestampColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream dataStream; - dataStream.set_kind(proto::Stream_Kind_DATA); - dataStream.set_column(static_cast<uint32_t>(columnId)); - dataStream.set_length(secRleEncoder->flush()); - streams.push_back(dataStream); - - proto::Stream secondaryStream; - secondaryStream.set_kind(proto::Stream_Kind_SECONDARY); - secondaryStream.set_column(static_cast<uint32_t>(columnId)); - secondaryStream.set_length(nanoRleEncoder->flush()); - streams.push_back(secondaryStream); - } - - uint64_t TimestampColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += secRleEncoder->getBufferSize(); - size += nanoRleEncoder->getBufferSize(); - return size; - } - - void TimestampColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - } - - void TimestampColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - secRleEncoder->recordPosition(rowIndexPosition.get()); - nanoRleEncoder->recordPosition(rowIndexPosition.get()); - } - - class DateColumnWriter : public IntegerColumnWriter { - public: - DateColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - }; - - DateColumnWriter::DateColumnWriter( - const Type &type, - const StreamsFactory &factory, - const WriterOptions &options) : - IntegerColumnWriter(type, factory, options) { - // PASS - } - - void DateColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const LongVectorBatch* longBatch = - dynamic_cast<const LongVectorBatch*>(&rowBatch); - if (longBatch == nullptr) { - throw InvalidArgument("Failed to cast to LongVectorBatch"); - } - - DateColumnStatisticsImpl* dateStats = - dynamic_cast<DateColumnStatisticsImpl*>(colIndexStatistics.get()); - if (dateStats == nullptr) { - throw InvalidArgument("Failed to cast to DateColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - const int64_t* data = longBatch->data.data() + offset; - const char* notNull = longBatch->hasNulls ? 
- longBatch->notNull.data() + offset : nullptr; - - rleEncoder->add(data, numValues, notNull); - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - ++count; - dateStats->update(static_cast<int32_t>(data[i])); - if (enableBloomFilter) { - bloomFilter->addLong(data[i]); - } - } - } - dateStats->increase(count); - if (count < numValues) { - dateStats->setHasNull(true); - } - } - - class Decimal64ColumnWriter : public ColumnWriter { - public: - static const uint32_t MAX_PRECISION_64 = 18; - static const uint32_t MAX_PRECISION_128 = 38; - - Decimal64ColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void recordPosition() const override; - - protected: - RleVersion rleVersion; - uint64_t precision; - uint64_t scale; - std::unique_ptr<AppendOnlyBufferedStream> valueStream; - std::unique_ptr<RleEncoder> scaleEncoder; - - private: - char buffer[10]; - }; - - Decimal64ColumnWriter::Decimal64ColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()), - precision(type.getPrecision()), - scale(type.getScale()) { - valueStream.reset(new AppendOnlyBufferedStream( - factory.createStream(proto::Stream_Kind_DATA))); - std::unique_ptr<BufferedOutputStream> scaleStream = - factory.createStream(proto::Stream_Kind_SECONDARY); - scaleEncoder = createRleEncoder(std::move(scaleStream), - true, - rleVersion, - memPool, - options.getAlignedBitpacking()); - - if (enableIndex) { - recordPosition(); - } - } - - void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const Decimal64VectorBatch* decBatch = - dynamic_cast<const Decimal64VectorBatch*>(&rowBatch); - if (decBatch == nullptr) { - throw InvalidArgument("Failed to cast to Decimal64VectorBatch"); - } - - DecimalColumnStatisticsImpl* decStats = - dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); - if (decStats == nullptr) { - throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - const char* notNull = decBatch->hasNulls ? 
- decBatch->notNull.data() + offset : nullptr; - const int64_t* values = decBatch->values.data() + offset; - - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - int64_t val = zigZag(values[i]); - char* data = buffer; - while (true) { - if ((val & ~0x7f) == 0) { - *(data++) = (static_cast<char>(val)); - break; - } else { - *(data++) = static_cast<char>(0x80 | (val & 0x7f)); - // cast val to unsigned so as to force 0-fill right shift - val = (static_cast<uint64_t>(val) >> 7); - } - } - valueStream->write(buffer, static_cast<size_t>(data - buffer)); - ++count; - if (enableBloomFilter) { - std::string decimal = Decimal( - values[i], static_cast<int32_t>(scale)).toString(); - bloomFilter->addBytes( - decimal.c_str(), static_cast<int64_t>(decimal.size())); - } - decStats->update(Decimal(values[i], static_cast<int32_t>(scale))); - } - } - decStats->increase(count); - if (count < numValues) { - decStats->setHasNull(true); - } - std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale)); - scaleEncoder->add(scales.data(), numValues, notNull); - } - - void Decimal64ColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream dataStream; - dataStream.set_kind(proto::Stream_Kind_DATA); - dataStream.set_column(static_cast<uint32_t>(columnId)); - dataStream.set_length(valueStream->flush()); - streams.push_back(dataStream); - - proto::Stream secondaryStream; - secondaryStream.set_kind(proto::Stream_Kind_SECONDARY); - secondaryStream.set_column(static_cast<uint32_t>(columnId)); - secondaryStream.set_length(scaleEncoder->flush()); - streams.push_back(secondaryStream); - } - - uint64_t Decimal64ColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += valueStream->getSize(); - size += scaleEncoder->getBufferSize(); - return size; - } - - void Decimal64ColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - } - - void Decimal64ColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - valueStream->recordPosition(rowIndexPosition.get()); - scaleEncoder->recordPosition(rowIndexPosition.get()); - } - - class Decimal128ColumnWriter : public Decimal64ColumnWriter { - public: - Decimal128ColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - private: - char buffer[20]; - }; - - Decimal128ColumnWriter::Decimal128ColumnWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - Decimal64ColumnWriter(type, factory, options) { - // PASS - } - - // Zigzag encoding moves the sign bit to the least significant bit using the - // expression (val « 1) ^ (val » 63) and derives its name from the fact that - // positive and negative numbers alternate once encoded. 
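// A minimal standalone sketch of the 64-bit zigzag transform described above,
// i.e. (val << 1) ^ (val >> 63) (illustrative only; the zigZag() called from
// Decimal64ColumnWriter::add is defined elsewhere in the library). Values of
// small magnitude map to small unsigned codes: 0 -> 0, -1 -> 1, 1 -> 2,
// -2 -> 3. The Int128 overload that follows reaches the same mapping through
// abs(), a left shift, and a decrement for negative inputs.
#include <cstdint>

static uint64_t zigZagSketch(int64_t value) {
  // Cast to unsigned for the left shift; the arithmetic right shift of the
  // original value spreads the sign bit across all 64 positions.
  return (static_cast<uint64_t>(value) << 1) ^
         static_cast<uint64_t>(value >> 63);
}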
- Int128 zigZagInt128(const Int128& value) { - bool isNegative = value < 0; - Int128 val = value.abs(); - val <<= 1; - if (isNegative) { - val -= 1; - } - return val; - } - - void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - const Decimal128VectorBatch* decBatch = - dynamic_cast<const Decimal128VectorBatch*>(&rowBatch); - if (decBatch == nullptr) { - throw InvalidArgument("Failed to cast to Decimal128VectorBatch"); - } - - DecimalColumnStatisticsImpl* decStats = - dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); - if (decStats == nullptr) { - throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - const char* notNull = decBatch->hasNulls ? - decBatch->notNull.data() + offset : nullptr; - const Int128* values = decBatch->values.data() + offset; - - // The current encoding of decimal columns stores the integer representation - // of the value as an unbounded length zigzag encoded base 128 varint. - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - Int128 val = zigZagInt128(values[i]); - char* data = buffer; - while (true) { - if ((val & ~0x7f) == 0) { - *(data++) = (static_cast<char>(val.getLowBits())); - break; - } else { - *(data++) = static_cast<char>(0x80 | (val.getLowBits() & 0x7f)); - val >>= 7; - } - } - valueStream->write(buffer, static_cast<size_t>(data - buffer)); - - ++count; - if (enableBloomFilter) { - std::string decimal = Decimal( - values[i], static_cast<int32_t>(scale)).toString(); - bloomFilter->addBytes( - decimal.c_str(), static_cast<int64_t>(decimal.size())); - } - decStats->update(Decimal(values[i], static_cast<int32_t>(scale))); - } - } - decStats->increase(count); - if (count < numValues) { - decStats->setHasNull(true); - } - std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale)); - scaleEncoder->add(scales.data(), numValues, notNull); - } - - class ListColumnWriter : public ColumnWriter { - public: - ListColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - ~ListColumnWriter() override; - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void mergeStripeStatsIntoFileStats() override; - - virtual void mergeRowGroupStatsIntoStripeStats() override; - - virtual void createRowIndexEntry() override; - - virtual void writeIndex( - std::vector<proto::Stream> &streams) const override; - - virtual void recordPosition() const override; - - virtual void writeDictionary() override; - - virtual void reset() override; - - private: - std::unique_ptr<RleEncoder> lengthEncoder; - RleVersion rleVersion; - std::unique_ptr<ColumnWriter> child; - }; - - ListColumnWriter::ListColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()){ - - 
std::unique_ptr<BufferedOutputStream> lengthStream = - factory.createStream(proto::Stream_Kind_LENGTH); - lengthEncoder = createRleEncoder(std::move(lengthStream), - false, - rleVersion, - memPool, - options.getAlignedBitpacking()); - - if (type.getSubtypeCount() == 1) { - child = buildWriter(*type.getSubtype(0), factory, options); - } - - if (enableIndex) { - recordPosition(); - } - } - - ListColumnWriter::~ListColumnWriter() { - // PASS - } - - void ListColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - ListVectorBatch* listBatch = dynamic_cast<ListVectorBatch*>(&rowBatch); - if (listBatch == nullptr) { - throw InvalidArgument("Failed to cast to ListVectorBatch"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - int64_t* offsets = listBatch->offsets.data() + offset; - const char* notNull = listBatch->hasNulls ? - listBatch->notNull.data() + offset : nullptr; - - uint64_t elemOffset = static_cast<uint64_t>(offsets[0]); - uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]); - - // translate offsets to lengths - for (uint64_t i = 0; i != numValues; ++i) { - offsets[i] = offsets[i + 1] - offsets[i]; - } - - // unnecessary to deal with null as elements are packed together - if (child.get()) { - child->add(*listBatch->elements, elemOffset, totalNumValues, nullptr); - } - lengthEncoder->add(offsets, numValues, notNull); - - if (enableIndex) { - if (!notNull) { - colIndexStatistics->increase(numValues); - } else { - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull[i]) { - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(offsets[i]); - } - } - } - colIndexStatistics->increase(count); - if (count < numValues) { - colIndexStatistics->setHasNull(true); - } - } - } - } - - void ListColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_LENGTH); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(lengthEncoder->flush()); - streams.push_back(stream); - - if (child.get()) { - child->flush(streams); - } - } - - void ListColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { - ColumnWriter::writeIndex(streams); - if (child.get()) { - child->writeIndex(streams); - } - } - - uint64_t ListColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - if (child.get()) { - size += lengthEncoder->getBufferSize(); - size += child->getEstimatedSize(); - } - return size; - } - - void ListColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - if (child.get()) { - child->getColumnEncoding(encodings); - } - } - - void ListColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getStripeStatistics(stats); - if (child.get()) { - child->getStripeStatistics(stats); - } - } - - void ListColumnWriter::mergeStripeStatsIntoFileStats() { - ColumnWriter::mergeStripeStatsIntoFileStats(); - if (child.get()) { - child->mergeStripeStatsIntoFileStats(); - } - } - - void ListColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - 
ColumnWriter::getFileStatistics(stats); - if (child.get()) { - child->getFileStatistics(stats); - } - } - - void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() { - ColumnWriter::mergeRowGroupStatsIntoStripeStats(); - if (child.get()) { - child->mergeRowGroupStatsIntoStripeStats(); - } - } - - void ListColumnWriter::createRowIndexEntry() { - ColumnWriter::createRowIndexEntry(); - if (child.get()) { - child->createRowIndexEntry(); - } - } - - void ListColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - lengthEncoder->recordPosition(rowIndexPosition.get()); - } - - void ListColumnWriter::reset() { - ColumnWriter::reset(); - if (child) { - child->reset(); - } - } - - void ListColumnWriter::writeDictionary() { - if (child) { - child->writeDictionary(); - } - } - - class MapColumnWriter : public ColumnWriter { - public: - MapColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - ~MapColumnWriter() override; - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void mergeStripeStatsIntoFileStats() override; - - virtual void mergeRowGroupStatsIntoStripeStats() override; - - virtual void createRowIndexEntry() override; - - virtual void writeIndex( - std::vector<proto::Stream> &streams) const override; - - virtual void recordPosition() const override; - - virtual void writeDictionary() override; - - virtual void reset() override; - - private: - std::unique_ptr<ColumnWriter> keyWriter; - std::unique_ptr<ColumnWriter> elemWriter; - std::unique_ptr<RleEncoder> lengthEncoder; - RleVersion rleVersion; - }; - - MapColumnWriter::MapColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options), - rleVersion(options.getRleVersion()){ - std::unique_ptr<BufferedOutputStream> lengthStream = - factory.createStream(proto::Stream_Kind_LENGTH); - lengthEncoder = createRleEncoder(std::move(lengthStream), - false, - rleVersion, - memPool, - options.getAlignedBitpacking()); - - if (type.getSubtypeCount() > 0) { - keyWriter = buildWriter(*type.getSubtype(0), factory, options); - } - - if (type.getSubtypeCount() > 1) { - elemWriter = buildWriter(*type.getSubtype(1), factory, options); - } - - if (enableIndex) { - recordPosition(); - } - } - - MapColumnWriter::~MapColumnWriter() { - // PASS - } - - void MapColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - MapVectorBatch* mapBatch = dynamic_cast<MapVectorBatch*>(&rowBatch); - if (mapBatch == nullptr) { - throw InvalidArgument("Failed to cast to MapVectorBatch"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - int64_t* offsets = mapBatch->offsets.data() + offset; - const char* notNull = mapBatch->hasNulls ? 
- mapBatch->notNull.data() + offset : nullptr; - - uint64_t elemOffset = static_cast<uint64_t>(offsets[0]); - uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]); - - // translate offsets to lengths - for (uint64_t i = 0; i != numValues; ++i) { - offsets[i] = offsets[i + 1] - offsets[i]; - } - - lengthEncoder->add(offsets, numValues, notNull); - - // unnecessary to deal with null as keys and values are packed together - if (keyWriter.get()) { - keyWriter->add(*mapBatch->keys, elemOffset, totalNumValues, nullptr); - } - if (elemWriter.get()) { - elemWriter->add(*mapBatch->elements, elemOffset, totalNumValues, nullptr); - } - - if (enableIndex) { - if (!notNull) { - colIndexStatistics->increase(numValues); - } else { - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull[i]) { - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(offsets[i]); - } - } - } - colIndexStatistics->increase(count); - if (count < numValues) { - colIndexStatistics->setHasNull(true); - } - } - } - } - - void MapColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_LENGTH); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(lengthEncoder->flush()); - streams.push_back(stream); - - if (keyWriter.get()) { - keyWriter->flush(streams); - } - if (elemWriter.get()) { - elemWriter->flush(streams); - } - } - - void MapColumnWriter::writeIndex( - std::vector<proto::Stream> &streams) const { - ColumnWriter::writeIndex(streams); - if (keyWriter.get()) { - keyWriter->writeIndex(streams); - } - if (elemWriter.get()) { - elemWriter->writeIndex(streams); - } - } - - uint64_t MapColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += lengthEncoder->getBufferSize(); - if (keyWriter.get()) { - size += keyWriter->getEstimatedSize(); - } - if (elemWriter.get()) { - size += elemWriter->getEstimatedSize(); - } - return size; - } - - void MapColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(RleVersionMapper(rleVersion)); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - if (keyWriter.get()) { - keyWriter->getColumnEncoding(encodings); - } - if (elemWriter.get()) { - elemWriter->getColumnEncoding(encodings); - } - } - - void MapColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getStripeStatistics(stats); - if (keyWriter.get()) { - keyWriter->getStripeStatistics(stats); - } - if (elemWriter.get()) { - elemWriter->getStripeStatistics(stats); - } - } - - void MapColumnWriter::mergeStripeStatsIntoFileStats() { - ColumnWriter::mergeStripeStatsIntoFileStats(); - if (keyWriter.get()) { - keyWriter->mergeStripeStatsIntoFileStats(); - } - if (elemWriter.get()) { - elemWriter->mergeStripeStatsIntoFileStats(); - } - } - - void MapColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getFileStatistics(stats); - if (keyWriter.get()) { - keyWriter->getFileStatistics(stats); - } - if (elemWriter.get()) { - elemWriter->getFileStatistics(stats); - } - } - - void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() { - ColumnWriter::mergeRowGroupStatsIntoStripeStats(); - if (keyWriter.get()) { - 
keyWriter->mergeRowGroupStatsIntoStripeStats(); - } - if (elemWriter.get()) { - elemWriter->mergeRowGroupStatsIntoStripeStats(); - } - } - - void MapColumnWriter::createRowIndexEntry() { - ColumnWriter::createRowIndexEntry(); - if (keyWriter.get()) { - keyWriter->createRowIndexEntry(); - } - if (elemWriter.get()) { - elemWriter->createRowIndexEntry(); - } - } - - void MapColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - lengthEncoder->recordPosition(rowIndexPosition.get()); - } - - void MapColumnWriter::reset() { - ColumnWriter::reset(); - if (keyWriter) { - keyWriter->reset(); - } - if (elemWriter) { - elemWriter->reset(); - } - } - - void MapColumnWriter::writeDictionary() { - if (keyWriter) { - keyWriter->writeDictionary(); - } - if (elemWriter) { - elemWriter->writeDictionary(); - } - } - - class UnionColumnWriter : public ColumnWriter { - public: - UnionColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); - ~UnionColumnWriter() override; - - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) override; - - virtual void flush(std::vector<proto::Stream>& streams) override; - - virtual uint64_t getEstimatedSize() const override; - - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const override; - - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const override; - - virtual void mergeStripeStatsIntoFileStats() override; - - virtual void mergeRowGroupStatsIntoStripeStats() override; - - virtual void createRowIndexEntry() override; - - virtual void writeIndex( - std::vector<proto::Stream> &streams) const override; - - virtual void recordPosition() const override; - - virtual void writeDictionary() override; - - virtual void reset() override; - - private: - std::unique_ptr<ByteRleEncoder> rleEncoder; - std::vector<ColumnWriter*> children; - }; - - UnionColumnWriter::UnionColumnWriter(const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) : - ColumnWriter(type, factory, options) { - - std::unique_ptr<BufferedOutputStream> dataStream = - factory.createStream(proto::Stream_Kind_DATA); - rleEncoder = createByteRleEncoder(std::move(dataStream)); - - for (uint64_t i = 0; i != type.getSubtypeCount(); ++i) { - children.push_back(buildWriter(*type.getSubtype(i), - factory, - options).release()); - } - - if (enableIndex) { - recordPosition(); - } - } - - UnionColumnWriter::~UnionColumnWriter() { - for (uint32_t i = 0; i < children.size(); ++i) { - delete children[i]; - } - } - - void UnionColumnWriter::add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char* incomingMask) { - UnionVectorBatch* unionBatch = dynamic_cast<UnionVectorBatch*>(&rowBatch); - if (unionBatch == nullptr) { - throw InvalidArgument("Failed to cast to UnionVectorBatch"); - } - - ColumnWriter::add(rowBatch, offset, numValues, incomingMask); - - const char* notNull = unionBatch->hasNulls ? 
- unionBatch->notNull.data() + offset : nullptr; - unsigned char * tags = unionBatch->tags.data() + offset; - uint64_t * offsets = unionBatch->offsets.data() + offset; - - std::vector<int64_t> childOffset(children.size(), -1); - std::vector<uint64_t> childLength(children.size(), 0); - - for (uint64_t i = 0; i != numValues; ++i) { - if (childOffset[tags[i]] == -1) { - childOffset[tags[i]] = static_cast<int64_t>(offsets[i]); - } - ++childLength[tags[i]]; - } - - rleEncoder->add(reinterpret_cast<char*>(tags), numValues, notNull); - - for (uint32_t i = 0; i < children.size(); ++i) { - if (childLength[i] > 0) { - children[i]->add(*unionBatch->children[i], - static_cast<uint64_t>(childOffset[i]), - childLength[i], nullptr); - } - } - - // update stats - if (enableIndex) { - if (!notNull) { - colIndexStatistics->increase(numValues); - } else { - uint64_t count = 0; - for (uint64_t i = 0; i < numValues; ++i) { - if (notNull[i]) { - ++count; - if (enableBloomFilter) { - bloomFilter->addLong(tags[i]); - } - } - } - colIndexStatistics->increase(count); - if (count < numValues) { - colIndexStatistics->setHasNull(true); - } - } - } - } - - void UnionColumnWriter::flush(std::vector<proto::Stream>& streams) { - ColumnWriter::flush(streams); - - proto::Stream stream; - stream.set_kind(proto::Stream_Kind_DATA); - stream.set_column(static_cast<uint32_t>(columnId)); - stream.set_length(rleEncoder->flush()); - streams.push_back(stream); - - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->flush(streams); - } - } - - void UnionColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { - ColumnWriter::writeIndex(streams); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->writeIndex(streams); - } - } - - uint64_t UnionColumnWriter::getEstimatedSize() const { - uint64_t size = ColumnWriter::getEstimatedSize(); - size += rleEncoder->getBufferSize(); - for (uint32_t i = 0; i < children.size(); ++i) { - size += children[i]->getEstimatedSize(); - } - return size; - } - - void UnionColumnWriter::getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const { - proto::ColumnEncoding encoding; - encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); - encoding.set_dictionarysize(0); - if (enableBloomFilter) { - encoding.set_bloomencoding(BloomFilterVersion::UTF8); - } - encodings.push_back(encoding); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getColumnEncoding(encodings); - } - } - - void UnionColumnWriter::getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getStripeStatistics(stats); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getStripeStatistics(stats); - } - } - - void UnionColumnWriter::mergeStripeStatsIntoFileStats() { - ColumnWriter::mergeStripeStatsIntoFileStats(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->mergeStripeStatsIntoFileStats(); - } - } - - void UnionColumnWriter::getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const { - ColumnWriter::getFileStatistics(stats); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->getFileStatistics(stats); - } - } - - void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() { - ColumnWriter::mergeRowGroupStatsIntoStripeStats(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->mergeRowGroupStatsIntoStripeStats(); - } - } - - void UnionColumnWriter::createRowIndexEntry() { - ColumnWriter::createRowIndexEntry(); - for (uint32_t i = 0; i < children.size(); ++i) { - 
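// Worked example for UnionColumnWriter::add above (illustrative): with
// tags = {0, 1, 0, 1} and offsets = {0, 0, 1, 1}, the loop records
// childOffset = {0, 0} and childLength = {2, 2}; the tag bytes are run-length
// encoded on the DATA stream, and each child writer then receives its two rows
// as a single contiguous slice of its own ColumnVectorBatch.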
children[i]->createRowIndexEntry(); - } - } - - void UnionColumnWriter::recordPosition() const { - ColumnWriter::recordPosition(); - rleEncoder->recordPosition(rowIndexPosition.get()); - } - - void UnionColumnWriter::reset() { - ColumnWriter::reset(); - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->reset(); - } - } - - void UnionColumnWriter::writeDictionary() { - for (uint32_t i = 0; i < children.size(); ++i) { - children[i]->writeDictionary(); - } - } - - std::unique_ptr<ColumnWriter> buildWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options) { - switch (static_cast<int64_t>(type.getKind())) { - case STRUCT: - return std::unique_ptr<ColumnWriter>( - new StructColumnWriter( - type, - factory, - options)); - case INT: - case LONG: - case SHORT: - return std::unique_ptr<ColumnWriter>( - new IntegerColumnWriter( - type, - factory, - options)); - case BYTE: - return std::unique_ptr<ColumnWriter>( - new ByteColumnWriter( - type, - factory, - options)); - case BOOLEAN: - return std::unique_ptr<ColumnWriter>( - new BooleanColumnWriter( - type, - factory, - options)); - case DOUBLE: - return std::unique_ptr<ColumnWriter>( - new DoubleColumnWriter( - type, - factory, - options, - false)); - case FLOAT: - return std::unique_ptr<ColumnWriter>( - new DoubleColumnWriter( - type, - factory, - options, - true)); - case BINARY: - return std::unique_ptr<ColumnWriter>( - new BinaryColumnWriter( - type, - factory, - options)); - case STRING: - return std::unique_ptr<ColumnWriter>( - new StringColumnWriter( - type, - factory, - options)); - case CHAR: - return std::unique_ptr<ColumnWriter>( - new CharColumnWriter( - type, - factory, - options)); - case VARCHAR: - return std::unique_ptr<ColumnWriter>( - new VarCharColumnWriter( - type, - factory, - options)); - case DATE: - return std::unique_ptr<ColumnWriter>( - new DateColumnWriter( - type, - factory, - options)); - case TIMESTAMP: - return std::unique_ptr<ColumnWriter>( - new TimestampColumnWriter( - type, - factory, - options)); - case DECIMAL: - if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_64) { - return std::unique_ptr<ColumnWriter>( - new Decimal64ColumnWriter( - type, - factory, - options)); - } else if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_128) { - return std::unique_ptr<ColumnWriter>( - new Decimal128ColumnWriter( - type, - factory, - options)); - } else { - throw NotImplementedYet("Decimal precision more than 38 is not " - "supported"); - } - case LIST: - return std::unique_ptr<ColumnWriter>( - new ListColumnWriter( - type, - factory, - options)); - case MAP: - return std::unique_ptr<ColumnWriter>( - new MapColumnWriter( - type, - factory, - options)); - case UNION: - return std::unique_ptr<ColumnWriter>( - new UnionColumnWriter( - type, - factory, - options)); - default: - throw NotImplementedYet("Type is not supported yet for creating " - "ColumnWriter."); - } - } -} + secs[i] += 1; + } + + secs[i] -= timezone.getEpoch(); + nanos[i] = formatNano(nanos[i]); + } + } + tsStats->increase(count); + if (count < numValues) { + tsStats->setHasNull(true); + } + + secRleEncoder->add(secs, numValues, notNull); + nanoRleEncoder->add(nanos, numValues, notNull); + } + + void TimestampColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream dataStream; + dataStream.set_kind(proto::Stream_Kind_DATA); + dataStream.set_column(static_cast<uint32_t>(columnId)); + dataStream.set_length(secRleEncoder->flush()); + 
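// Timestamps are split across two streams: the seconds, shifted by
// timezone.getEpoch() in add() above, go through secRleEncoder onto the DATA
// stream being registered here, and the formatNano-encoded nanoseconds go
// through nanoRleEncoder onto the SECONDARY stream registered just below.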
streams.push_back(dataStream); + + proto::Stream secondaryStream; + secondaryStream.set_kind(proto::Stream_Kind_SECONDARY); + secondaryStream.set_column(static_cast<uint32_t>(columnId)); + secondaryStream.set_length(nanoRleEncoder->flush()); + streams.push_back(secondaryStream); + } + + uint64_t TimestampColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += secRleEncoder->getBufferSize(); + size += nanoRleEncoder->getBufferSize(); + return size; + } + + void TimestampColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void TimestampColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + secRleEncoder->recordPosition(rowIndexPosition.get()); + nanoRleEncoder->recordPosition(rowIndexPosition.get()); + } + + class DateColumnWriter : public IntegerColumnWriter { + public: + DateColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + }; + + DateColumnWriter::DateColumnWriter( + const Type &type, + const StreamsFactory &factory, + const WriterOptions &options) : + IntegerColumnWriter(type, factory, options) { + // PASS + } + + void DateColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const LongVectorBatch* longBatch = + dynamic_cast<const LongVectorBatch*>(&rowBatch); + if (longBatch == nullptr) { + throw InvalidArgument("Failed to cast to LongVectorBatch"); + } + + DateColumnStatisticsImpl* dateStats = + dynamic_cast<DateColumnStatisticsImpl*>(colIndexStatistics.get()); + if (dateStats == nullptr) { + throw InvalidArgument("Failed to cast to DateColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const int64_t* data = longBatch->data.data() + offset; + const char* notNull = longBatch->hasNulls ? 
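// DATE values arrive as day counts in a LongVectorBatch; the writer reuses
// IntegerColumnWriter's RLE DATA stream unchanged and only narrows to int32
// when feeding DateColumnStatisticsImpl in the loop below.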
+ longBatch->notNull.data() + offset : nullptr; + + rleEncoder->add(data, numValues, notNull); + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + ++count; + dateStats->update(static_cast<int32_t>(data[i])); + if (enableBloomFilter) { + bloomFilter->addLong(data[i]); + } + } + } + dateStats->increase(count); + if (count < numValues) { + dateStats->setHasNull(true); + } + } + + class Decimal64ColumnWriter : public ColumnWriter { + public: + static const uint32_t MAX_PRECISION_64 = 18; + static const uint32_t MAX_PRECISION_128 = 38; + + Decimal64ColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void recordPosition() const override; + + protected: + RleVersion rleVersion; + uint64_t precision; + uint64_t scale; + std::unique_ptr<AppendOnlyBufferedStream> valueStream; + std::unique_ptr<RleEncoder> scaleEncoder; + + private: + char buffer[10]; + }; + + Decimal64ColumnWriter::Decimal64ColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()), + precision(type.getPrecision()), + scale(type.getScale()) { + valueStream.reset(new AppendOnlyBufferedStream( + factory.createStream(proto::Stream_Kind_DATA))); + std::unique_ptr<BufferedOutputStream> scaleStream = + factory.createStream(proto::Stream_Kind_SECONDARY); + scaleEncoder = createRleEncoder(std::move(scaleStream), + true, + rleVersion, + memPool, + options.getAlignedBitpacking()); + + if (enableIndex) { + recordPosition(); + } + } + + void Decimal64ColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const Decimal64VectorBatch* decBatch = + dynamic_cast<const Decimal64VectorBatch*>(&rowBatch); + if (decBatch == nullptr) { + throw InvalidArgument("Failed to cast to Decimal64VectorBatch"); + } + + DecimalColumnStatisticsImpl* decStats = + dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); + if (decStats == nullptr) { + throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const char* notNull = decBatch->hasNulls ? 
+ decBatch->notNull.data() + offset : nullptr; + const int64_t* values = decBatch->values.data() + offset; + + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + int64_t val = zigZag(values[i]); + char* data = buffer; + while (true) { + if ((val & ~0x7f) == 0) { + *(data++) = (static_cast<char>(val)); + break; + } else { + *(data++) = static_cast<char>(0x80 | (val & 0x7f)); + // cast val to unsigned so as to force 0-fill right shift + val = (static_cast<uint64_t>(val) >> 7); + } + } + valueStream->write(buffer, static_cast<size_t>(data - buffer)); + ++count; + if (enableBloomFilter) { + std::string decimal = Decimal( + values[i], static_cast<int32_t>(scale)).toString(); + bloomFilter->addBytes( + decimal.c_str(), static_cast<int64_t>(decimal.size())); + } + decStats->update(Decimal(values[i], static_cast<int32_t>(scale))); + } + } + decStats->increase(count); + if (count < numValues) { + decStats->setHasNull(true); + } + std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale)); + scaleEncoder->add(scales.data(), numValues, notNull); + } + + void Decimal64ColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream dataStream; + dataStream.set_kind(proto::Stream_Kind_DATA); + dataStream.set_column(static_cast<uint32_t>(columnId)); + dataStream.set_length(valueStream->flush()); + streams.push_back(dataStream); + + proto::Stream secondaryStream; + secondaryStream.set_kind(proto::Stream_Kind_SECONDARY); + secondaryStream.set_column(static_cast<uint32_t>(columnId)); + secondaryStream.set_length(scaleEncoder->flush()); + streams.push_back(secondaryStream); + } + + uint64_t Decimal64ColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += valueStream->getSize(); + size += scaleEncoder->getBufferSize(); + return size; + } + + void Decimal64ColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + } + + void Decimal64ColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + valueStream->recordPosition(rowIndexPosition.get()); + scaleEncoder->recordPosition(rowIndexPosition.get()); + } + + class Decimal128ColumnWriter : public Decimal64ColumnWriter { + public: + Decimal128ColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + private: + char buffer[20]; + }; + + Decimal128ColumnWriter::Decimal128ColumnWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + Decimal64ColumnWriter(type, factory, options) { + // PASS + } + + // Zigzag encoding moves the sign bit to the least significant bit using the + // expression (val « 1) ^ (val » 63) and derives its name from the fact that + // positive and negative numbers alternate once encoded. 
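// A minimal sketch of the unbounded base-128 varint loop shared by
// Decimal64ColumnWriter::add above and Decimal128ColumnWriter::add below
// (illustrative only; the real code writes straight into valueStream, and the
// value it encodes is the zigzag of the decimal's unscaled integer). Seven
// payload bits per byte, high bit set while more bytes follow, least
// significant group first: 300 encodes as 0xAC 0x02. Ten bytes are enough for
// any 64-bit value (ceil(64 / 7) = 10, matching buffer[10]); the 128-bit
// writer reserves buffer[20].
#include <cstddef>
#include <cstdint>

static size_t encodeVarintSketch(uint64_t value, unsigned char* out) {
  size_t len = 0;
  while (value >= 0x80) {
    // Emit the low 7 bits with the continuation bit set, then shift them out.
    out[len++] = static_cast<unsigned char>(0x80 | (value & 0x7f));
    value >>= 7;
  }
  out[len++] = static_cast<unsigned char>(value);  // final byte, high bit clear
  return len;
}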
+ Int128 zigZagInt128(const Int128& value) { + bool isNegative = value < 0; + Int128 val = value.abs(); + val <<= 1; + if (isNegative) { + val -= 1; + } + return val; + } + + void Decimal128ColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + const Decimal128VectorBatch* decBatch = + dynamic_cast<const Decimal128VectorBatch*>(&rowBatch); + if (decBatch == nullptr) { + throw InvalidArgument("Failed to cast to Decimal128VectorBatch"); + } + + DecimalColumnStatisticsImpl* decStats = + dynamic_cast<DecimalColumnStatisticsImpl*>(colIndexStatistics.get()); + if (decStats == nullptr) { + throw InvalidArgument("Failed to cast to DecimalColumnStatisticsImpl"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const char* notNull = decBatch->hasNulls ? + decBatch->notNull.data() + offset : nullptr; + const Int128* values = decBatch->values.data() + offset; + + // The current encoding of decimal columns stores the integer representation + // of the value as an unbounded length zigzag encoded base 128 varint. + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + Int128 val = zigZagInt128(values[i]); + char* data = buffer; + while (true) { + if ((val & ~0x7f) == 0) { + *(data++) = (static_cast<char>(val.getLowBits())); + break; + } else { + *(data++) = static_cast<char>(0x80 | (val.getLowBits() & 0x7f)); + val >>= 7; + } + } + valueStream->write(buffer, static_cast<size_t>(data - buffer)); + + ++count; + if (enableBloomFilter) { + std::string decimal = Decimal( + values[i], static_cast<int32_t>(scale)).toString(); + bloomFilter->addBytes( + decimal.c_str(), static_cast<int64_t>(decimal.size())); + } + decStats->update(Decimal(values[i], static_cast<int32_t>(scale))); + } + } + decStats->increase(count); + if (count < numValues) { + decStats->setHasNull(true); + } + std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale)); + scaleEncoder->add(scales.data(), numValues, notNull); + } + + class ListColumnWriter : public ColumnWriter { + public: + ListColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + ~ListColumnWriter() override; + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void mergeStripeStatsIntoFileStats() override; + + virtual void mergeRowGroupStatsIntoStripeStats() override; + + virtual void createRowIndexEntry() override; + + virtual void writeIndex( + std::vector<proto::Stream> &streams) const override; + + virtual void recordPosition() const override; + + virtual void writeDictionary() override; + + virtual void reset() override; + + private: + std::unique_ptr<RleEncoder> lengthEncoder; + RleVersion rleVersion; + std::unique_ptr<ColumnWriter> child; + }; + + ListColumnWriter::ListColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()){ + + 
std::unique_ptr<BufferedOutputStream> lengthStream = + factory.createStream(proto::Stream_Kind_LENGTH); + lengthEncoder = createRleEncoder(std::move(lengthStream), + false, + rleVersion, + memPool, + options.getAlignedBitpacking()); + + if (type.getSubtypeCount() == 1) { + child = buildWriter(*type.getSubtype(0), factory, options); + } + + if (enableIndex) { + recordPosition(); + } + } + + ListColumnWriter::~ListColumnWriter() { + // PASS + } + + void ListColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + ListVectorBatch* listBatch = dynamic_cast<ListVectorBatch*>(&rowBatch); + if (listBatch == nullptr) { + throw InvalidArgument("Failed to cast to ListVectorBatch"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + int64_t* offsets = listBatch->offsets.data() + offset; + const char* notNull = listBatch->hasNulls ? + listBatch->notNull.data() + offset : nullptr; + + uint64_t elemOffset = static_cast<uint64_t>(offsets[0]); + uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]); + + // translate offsets to lengths + for (uint64_t i = 0; i != numValues; ++i) { + offsets[i] = offsets[i + 1] - offsets[i]; + } + + // unnecessary to deal with null as elements are packed together + if (child.get()) { + child->add(*listBatch->elements, elemOffset, totalNumValues, nullptr); + } + lengthEncoder->add(offsets, numValues, notNull); + + if (enableIndex) { + if (!notNull) { + colIndexStatistics->increase(numValues); + } else { + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull[i]) { + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(offsets[i]); + } + } + } + colIndexStatistics->increase(count); + if (count < numValues) { + colIndexStatistics->setHasNull(true); + } + } + } + } + + void ListColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_LENGTH); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(lengthEncoder->flush()); + streams.push_back(stream); + + if (child.get()) { + child->flush(streams); + } + } + + void ListColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { + ColumnWriter::writeIndex(streams); + if (child.get()) { + child->writeIndex(streams); + } + } + + uint64_t ListColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + if (child.get()) { + size += lengthEncoder->getBufferSize(); + size += child->getEstimatedSize(); + } + return size; + } + + void ListColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + if (child.get()) { + child->getColumnEncoding(encodings); + } + } + + void ListColumnWriter::getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getStripeStatistics(stats); + if (child.get()) { + child->getStripeStatistics(stats); + } + } + + void ListColumnWriter::mergeStripeStatsIntoFileStats() { + ColumnWriter::mergeStripeStatsIntoFileStats(); + if (child.get()) { + child->mergeStripeStatsIntoFileStats(); + } + } + + void ListColumnWriter::getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + 
ColumnWriter::getFileStatistics(stats); + if (child.get()) { + child->getFileStatistics(stats); + } + } + + void ListColumnWriter::mergeRowGroupStatsIntoStripeStats() { + ColumnWriter::mergeRowGroupStatsIntoStripeStats(); + if (child.get()) { + child->mergeRowGroupStatsIntoStripeStats(); + } + } + + void ListColumnWriter::createRowIndexEntry() { + ColumnWriter::createRowIndexEntry(); + if (child.get()) { + child->createRowIndexEntry(); + } + } + + void ListColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + lengthEncoder->recordPosition(rowIndexPosition.get()); + } + + void ListColumnWriter::reset() { + ColumnWriter::reset(); + if (child) { + child->reset(); + } + } + + void ListColumnWriter::writeDictionary() { + if (child) { + child->writeDictionary(); + } + } + + class MapColumnWriter : public ColumnWriter { + public: + MapColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + ~MapColumnWriter() override; + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void mergeStripeStatsIntoFileStats() override; + + virtual void mergeRowGroupStatsIntoStripeStats() override; + + virtual void createRowIndexEntry() override; + + virtual void writeIndex( + std::vector<proto::Stream> &streams) const override; + + virtual void recordPosition() const override; + + virtual void writeDictionary() override; + + virtual void reset() override; + + private: + std::unique_ptr<ColumnWriter> keyWriter; + std::unique_ptr<ColumnWriter> elemWriter; + std::unique_ptr<RleEncoder> lengthEncoder; + RleVersion rleVersion; + }; + + MapColumnWriter::MapColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options), + rleVersion(options.getRleVersion()){ + std::unique_ptr<BufferedOutputStream> lengthStream = + factory.createStream(proto::Stream_Kind_LENGTH); + lengthEncoder = createRleEncoder(std::move(lengthStream), + false, + rleVersion, + memPool, + options.getAlignedBitpacking()); + + if (type.getSubtypeCount() > 0) { + keyWriter = buildWriter(*type.getSubtype(0), factory, options); + } + + if (type.getSubtypeCount() > 1) { + elemWriter = buildWriter(*type.getSubtype(1), factory, options); + } + + if (enableIndex) { + recordPosition(); + } + } + + MapColumnWriter::~MapColumnWriter() { + // PASS + } + + void MapColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + MapVectorBatch* mapBatch = dynamic_cast<MapVectorBatch*>(&rowBatch); + if (mapBatch == nullptr) { + throw InvalidArgument("Failed to cast to MapVectorBatch"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + int64_t* offsets = mapBatch->offsets.data() + offset; + const char* notNull = mapBatch->hasNulls ? 
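// Worked example for the offset handling just below (the same scheme is used
// in ListColumnWriter::add above): with numValues = 3 and offsets
// {0, 2, 2, 5}, elemOffset is 0 and totalNumValues is 5, and the in-place loop
// rewrites the array into lengths {2, 0, 3} before it reaches lengthEncoder.
// Keys and values for all rows are then pushed to the child writers as one
// packed range with a null incoming mask, which is why no per-child null
// handling is needed here.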
+ mapBatch->notNull.data() + offset : nullptr; + + uint64_t elemOffset = static_cast<uint64_t>(offsets[0]); + uint64_t totalNumValues = static_cast<uint64_t>(offsets[numValues] - offsets[0]); + + // translate offsets to lengths + for (uint64_t i = 0; i != numValues; ++i) { + offsets[i] = offsets[i + 1] - offsets[i]; + } + + lengthEncoder->add(offsets, numValues, notNull); + + // unnecessary to deal with null as keys and values are packed together + if (keyWriter.get()) { + keyWriter->add(*mapBatch->keys, elemOffset, totalNumValues, nullptr); + } + if (elemWriter.get()) { + elemWriter->add(*mapBatch->elements, elemOffset, totalNumValues, nullptr); + } + + if (enableIndex) { + if (!notNull) { + colIndexStatistics->increase(numValues); + } else { + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull[i]) { + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(offsets[i]); + } + } + } + colIndexStatistics->increase(count); + if (count < numValues) { + colIndexStatistics->setHasNull(true); + } + } + } + } + + void MapColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_LENGTH); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(lengthEncoder->flush()); + streams.push_back(stream); + + if (keyWriter.get()) { + keyWriter->flush(streams); + } + if (elemWriter.get()) { + elemWriter->flush(streams); + } + } + + void MapColumnWriter::writeIndex( + std::vector<proto::Stream> &streams) const { + ColumnWriter::writeIndex(streams); + if (keyWriter.get()) { + keyWriter->writeIndex(streams); + } + if (elemWriter.get()) { + elemWriter->writeIndex(streams); + } + } + + uint64_t MapColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += lengthEncoder->getBufferSize(); + if (keyWriter.get()) { + size += keyWriter->getEstimatedSize(); + } + if (elemWriter.get()) { + size += elemWriter->getEstimatedSize(); + } + return size; + } + + void MapColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(RleVersionMapper(rleVersion)); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + if (keyWriter.get()) { + keyWriter->getColumnEncoding(encodings); + } + if (elemWriter.get()) { + elemWriter->getColumnEncoding(encodings); + } + } + + void MapColumnWriter::getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getStripeStatistics(stats); + if (keyWriter.get()) { + keyWriter->getStripeStatistics(stats); + } + if (elemWriter.get()) { + elemWriter->getStripeStatistics(stats); + } + } + + void MapColumnWriter::mergeStripeStatsIntoFileStats() { + ColumnWriter::mergeStripeStatsIntoFileStats(); + if (keyWriter.get()) { + keyWriter->mergeStripeStatsIntoFileStats(); + } + if (elemWriter.get()) { + elemWriter->mergeStripeStatsIntoFileStats(); + } + } + + void MapColumnWriter::getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getFileStatistics(stats); + if (keyWriter.get()) { + keyWriter->getFileStatistics(stats); + } + if (elemWriter.get()) { + elemWriter->getFileStatistics(stats); + } + } + + void MapColumnWriter::mergeRowGroupStatsIntoStripeStats() { + ColumnWriter::mergeRowGroupStatsIntoStripeStats(); + if (keyWriter.get()) { + 
keyWriter->mergeRowGroupStatsIntoStripeStats(); + } + if (elemWriter.get()) { + elemWriter->mergeRowGroupStatsIntoStripeStats(); + } + } + + void MapColumnWriter::createRowIndexEntry() { + ColumnWriter::createRowIndexEntry(); + if (keyWriter.get()) { + keyWriter->createRowIndexEntry(); + } + if (elemWriter.get()) { + elemWriter->createRowIndexEntry(); + } + } + + void MapColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + lengthEncoder->recordPosition(rowIndexPosition.get()); + } + + void MapColumnWriter::reset() { + ColumnWriter::reset(); + if (keyWriter) { + keyWriter->reset(); + } + if (elemWriter) { + elemWriter->reset(); + } + } + + void MapColumnWriter::writeDictionary() { + if (keyWriter) { + keyWriter->writeDictionary(); + } + if (elemWriter) { + elemWriter->writeDictionary(); + } + } + + class UnionColumnWriter : public ColumnWriter { + public: + UnionColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); + ~UnionColumnWriter() override; + + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) override; + + virtual void flush(std::vector<proto::Stream>& streams) override; + + virtual uint64_t getEstimatedSize() const override; + + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const override; + + virtual void getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const override; + + virtual void mergeStripeStatsIntoFileStats() override; + + virtual void mergeRowGroupStatsIntoStripeStats() override; + + virtual void createRowIndexEntry() override; + + virtual void writeIndex( + std::vector<proto::Stream> &streams) const override; + + virtual void recordPosition() const override; + + virtual void writeDictionary() override; + + virtual void reset() override; + + private: + std::unique_ptr<ByteRleEncoder> rleEncoder; + std::vector<ColumnWriter*> children; + }; + + UnionColumnWriter::UnionColumnWriter(const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) : + ColumnWriter(type, factory, options) { + + std::unique_ptr<BufferedOutputStream> dataStream = + factory.createStream(proto::Stream_Kind_DATA); + rleEncoder = createByteRleEncoder(std::move(dataStream)); + + for (uint64_t i = 0; i != type.getSubtypeCount(); ++i) { + children.push_back(buildWriter(*type.getSubtype(i), + factory, + options).release()); + } + + if (enableIndex) { + recordPosition(); + } + } + + UnionColumnWriter::~UnionColumnWriter() { + for (uint32_t i = 0; i < children.size(); ++i) { + delete children[i]; + } + } + + void UnionColumnWriter::add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char* incomingMask) { + UnionVectorBatch* unionBatch = dynamic_cast<UnionVectorBatch*>(&rowBatch); + if (unionBatch == nullptr) { + throw InvalidArgument("Failed to cast to UnionVectorBatch"); + } + + ColumnWriter::add(rowBatch, offset, numValues, incomingMask); + + const char* notNull = unionBatch->hasNulls ? 
+ unionBatch->notNull.data() + offset : nullptr; + unsigned char * tags = unionBatch->tags.data() + offset; + uint64_t * offsets = unionBatch->offsets.data() + offset; + + std::vector<int64_t> childOffset(children.size(), -1); + std::vector<uint64_t> childLength(children.size(), 0); + + for (uint64_t i = 0; i != numValues; ++i) { + if (childOffset[tags[i]] == -1) { + childOffset[tags[i]] = static_cast<int64_t>(offsets[i]); + } + ++childLength[tags[i]]; + } + + rleEncoder->add(reinterpret_cast<char*>(tags), numValues, notNull); + + for (uint32_t i = 0; i < children.size(); ++i) { + if (childLength[i] > 0) { + children[i]->add(*unionBatch->children[i], + static_cast<uint64_t>(childOffset[i]), + childLength[i], nullptr); + } + } + + // update stats + if (enableIndex) { + if (!notNull) { + colIndexStatistics->increase(numValues); + } else { + uint64_t count = 0; + for (uint64_t i = 0; i < numValues; ++i) { + if (notNull[i]) { + ++count; + if (enableBloomFilter) { + bloomFilter->addLong(tags[i]); + } + } + } + colIndexStatistics->increase(count); + if (count < numValues) { + colIndexStatistics->setHasNull(true); + } + } + } + } + + void UnionColumnWriter::flush(std::vector<proto::Stream>& streams) { + ColumnWriter::flush(streams); + + proto::Stream stream; + stream.set_kind(proto::Stream_Kind_DATA); + stream.set_column(static_cast<uint32_t>(columnId)); + stream.set_length(rleEncoder->flush()); + streams.push_back(stream); + + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->flush(streams); + } + } + + void UnionColumnWriter::writeIndex(std::vector<proto::Stream> &streams) const { + ColumnWriter::writeIndex(streams); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->writeIndex(streams); + } + } + + uint64_t UnionColumnWriter::getEstimatedSize() const { + uint64_t size = ColumnWriter::getEstimatedSize(); + size += rleEncoder->getBufferSize(); + for (uint32_t i = 0; i < children.size(); ++i) { + size += children[i]->getEstimatedSize(); + } + return size; + } + + void UnionColumnWriter::getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const { + proto::ColumnEncoding encoding; + encoding.set_kind(proto::ColumnEncoding_Kind_DIRECT); + encoding.set_dictionarysize(0); + if (enableBloomFilter) { + encoding.set_bloomencoding(BloomFilterVersion::UTF8); + } + encodings.push_back(encoding); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->getColumnEncoding(encodings); + } + } + + void UnionColumnWriter::getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getStripeStatistics(stats); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->getStripeStatistics(stats); + } + } + + void UnionColumnWriter::mergeStripeStatsIntoFileStats() { + ColumnWriter::mergeStripeStatsIntoFileStats(); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->mergeStripeStatsIntoFileStats(); + } + } + + void UnionColumnWriter::getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const { + ColumnWriter::getFileStatistics(stats); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->getFileStatistics(stats); + } + } + + void UnionColumnWriter::mergeRowGroupStatsIntoStripeStats() { + ColumnWriter::mergeRowGroupStatsIntoStripeStats(); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->mergeRowGroupStatsIntoStripeStats(); + } + } + + void UnionColumnWriter::createRowIndexEntry() { + ColumnWriter::createRowIndexEntry(); + for (uint32_t i = 0; i < children.size(); ++i) { + 
children[i]->createRowIndexEntry(); + } + } + + void UnionColumnWriter::recordPosition() const { + ColumnWriter::recordPosition(); + rleEncoder->recordPosition(rowIndexPosition.get()); + } + + void UnionColumnWriter::reset() { + ColumnWriter::reset(); + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->reset(); + } + } + + void UnionColumnWriter::writeDictionary() { + for (uint32_t i = 0; i < children.size(); ++i) { + children[i]->writeDictionary(); + } + } + + std::unique_ptr<ColumnWriter> buildWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options) { + switch (static_cast<int64_t>(type.getKind())) { + case STRUCT: + return std::unique_ptr<ColumnWriter>( + new StructColumnWriter( + type, + factory, + options)); + case INT: + case LONG: + case SHORT: + return std::unique_ptr<ColumnWriter>( + new IntegerColumnWriter( + type, + factory, + options)); + case BYTE: + return std::unique_ptr<ColumnWriter>( + new ByteColumnWriter( + type, + factory, + options)); + case BOOLEAN: + return std::unique_ptr<ColumnWriter>( + new BooleanColumnWriter( + type, + factory, + options)); + case DOUBLE: + return std::unique_ptr<ColumnWriter>( + new DoubleColumnWriter( + type, + factory, + options, + false)); + case FLOAT: + return std::unique_ptr<ColumnWriter>( + new DoubleColumnWriter( + type, + factory, + options, + true)); + case BINARY: + return std::unique_ptr<ColumnWriter>( + new BinaryColumnWriter( + type, + factory, + options)); + case STRING: + return std::unique_ptr<ColumnWriter>( + new StringColumnWriter( + type, + factory, + options)); + case CHAR: + return std::unique_ptr<ColumnWriter>( + new CharColumnWriter( + type, + factory, + options)); + case VARCHAR: + return std::unique_ptr<ColumnWriter>( + new VarCharColumnWriter( + type, + factory, + options)); + case DATE: + return std::unique_ptr<ColumnWriter>( + new DateColumnWriter( + type, + factory, + options)); + case TIMESTAMP: + return std::unique_ptr<ColumnWriter>( + new TimestampColumnWriter( + type, + factory, + options)); + case DECIMAL: + if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_64) { + return std::unique_ptr<ColumnWriter>( + new Decimal64ColumnWriter( + type, + factory, + options)); + } else if (type.getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_128) { + return std::unique_ptr<ColumnWriter>( + new Decimal128ColumnWriter( + type, + factory, + options)); + } else { + throw NotImplementedYet("Decimal precision more than 38 is not " + "supported"); + } + case LIST: + return std::unique_ptr<ColumnWriter>( + new ListColumnWriter( + type, + factory, + options)); + case MAP: + return std::unique_ptr<ColumnWriter>( + new MapColumnWriter( + type, + factory, + options)); + case UNION: + return std::unique_ptr<ColumnWriter>( + new UnionColumnWriter( + type, + factory, + options)); + default: + throw NotImplementedYet("Type is not supported yet for creating " + "ColumnWriter."); + } + } +} diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh index cbbb5d00dc..4d7d71cb37 100644 --- a/contrib/libs/apache/orc/c++/src/ColumnWriter.hh +++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.hh @@ -1,221 +1,221 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_COLUMN_WRITER_HH -#define ORC_COLUMN_WRITER_HH - -#include "orc/Vector.hh" - -#include "BloomFilter.hh" -#include "ByteRLE.hh" -#include "Compression.hh" -#include "orc/Exceptions.hh" -#include "Statistics.hh" - -#include "wrap/orc-proto-wrapper.hh" - -namespace orc { - - class StreamsFactory { - public: - virtual ~StreamsFactory(); - - /** - * Get the stream for the given column/kind in this stripe. - * @param kind the kind of the stream - * @return the buffered output stream - */ - virtual std::unique_ptr<BufferedOutputStream> - createStream(proto::Stream_Kind kind) const = 0; - }; - - std::unique_ptr<StreamsFactory> createStreamsFactory( - const WriterOptions& options, - OutputStream * outStream); - - /** - * record stream positions for row index - */ - class RowIndexPositionRecorder : public PositionRecorder { - public: - virtual ~RowIndexPositionRecorder() override; - - RowIndexPositionRecorder(proto::RowIndexEntry& entry): - rowIndexEntry(entry) {} - - virtual void add(uint64_t pos) override { - rowIndexEntry.add_positions(pos); - } - - private: - proto::RowIndexEntry& rowIndexEntry; - }; - - /** - * The interface for writing ORC data types. - */ - class ColumnWriter { - protected: - std::unique_ptr<ByteRleEncoder> notNullEncoder; - uint64_t columnId; - std::unique_ptr<MutableColumnStatistics> colIndexStatistics; - std::unique_ptr<MutableColumnStatistics> colStripeStatistics; - std::unique_ptr<MutableColumnStatistics> colFileStatistics; - - bool enableIndex; - // row index for this column, contains all RowIndexEntries in 1 stripe - std::unique_ptr<proto::RowIndex> rowIndex; - std::unique_ptr<proto::RowIndexEntry> rowIndexEntry; - std::unique_ptr<RowIndexPositionRecorder> rowIndexPosition; - - // bloom filters are recorded per row group - bool enableBloomFilter; - std::unique_ptr<BloomFilterImpl> bloomFilter; - std::unique_ptr<proto::BloomFilterIndex> bloomFilterIndex; - - public: - ColumnWriter(const Type& type, const StreamsFactory& factory, - const WriterOptions& options); - - virtual ~ColumnWriter(); - - /** - * Write the next group of values from this rowBatch. - * @param rowBatch the row batch data to write - * @param offset the starting point of row batch to write - * @param numValues the number of values to write - * @param incomingMask if null, all values are not null. Otherwise, it is - * a mask (with at least numValues bytes) for which - * values to write. - */ - virtual void add(ColumnVectorBatch& rowBatch, - uint64_t offset, - uint64_t numValues, - const char * incomingMask); - /** - * Flush column writer output streams. - * @param streams vector to store streams generated by flush() - */ - virtual void flush(std::vector<proto::Stream>& streams); - - /** - * Get estimated size of buffer used. - * @return estimated size of buffer used - */ - virtual uint64_t getEstimatedSize() const; - - /** - * Get the encoding used by the writer for this column. 
- * @param encodings vector to store the returned ColumnEncoding info - */ - virtual void getColumnEncoding( - std::vector<proto::ColumnEncoding>& encodings) const = 0; - - /** - * Get the stripe statistics for this column. - * @param stats vector to store the returned stripe statistics - */ - virtual void getStripeStatistics( - std::vector<proto::ColumnStatistics>& stats) const; - - /** - * Get the file statistics for this column. - * @param stats vector to store the returned file statistics - */ - virtual void getFileStatistics( - std::vector<proto::ColumnStatistics>& stats) const; - - /** - * Merge index stats into stripe stats and reset index stats. - */ - virtual void mergeRowGroupStatsIntoStripeStats(); - - /** - * Merge stripe stats into file stats and reset stripe stats. - */ - virtual void mergeStripeStatsIntoFileStats(); - - /** - * Create a row index entry with the previous location and the current - * index statistics. Also merges the index statistics into the stripe - * statistics before they are cleared. Finally, it records the start of the - * next index and ensures all of the children columns also create an entry. - */ - virtual void createRowIndexEntry(); - - /** - * Create a new BloomFilter entry and add the previous one to BloomFilterIndex - */ - virtual void addBloomFilterEntry(); - - /** - * Write row index streams for this column. - * @param streams output list of ROW_INDEX streams - */ - virtual void writeIndex(std::vector<proto::Stream> &streams) const; - - /** - * Record positions for index. - * - * This function is called by createRowIndexEntry() and ColumnWriter's - * constructor. So base classes do not need to call inherited classes' - * recordPosition() function. - */ - virtual void recordPosition() const; - - /** - * Reset positions for index. - */ - virtual void reset(); - - /** - * Write dictionary to streams for string columns - */ - virtual void writeDictionary(); - - protected: - /** - * Utility function to translate ColumnStatistics into protobuf form and - * add it to output list. - * @param statsList output list for protobuf stats - * @param stats ColumnStatistics to be transformed and added - */ - void getProtoBufStatistics( - std::vector<proto::ColumnStatistics>& statsList, - const MutableColumnStatistics* stats) const { - proto::ColumnStatistics pbStats; - stats->toProtoBuf(pbStats); - statsList.push_back(pbStats); - } - - protected: - MemoryPool& memPool; - std::unique_ptr<BufferedOutputStream> indexStream; - std::unique_ptr<BufferedOutputStream> bloomFilterStream; - }; - - /** - * Create a writer for the given type. - */ - std::unique_ptr<ColumnWriter> buildWriter( - const Type& type, - const StreamsFactory& factory, - const WriterOptions& options); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_COLUMN_WRITER_HH +#define ORC_COLUMN_WRITER_HH + +#include "orc/Vector.hh" + +#include "BloomFilter.hh" +#include "ByteRLE.hh" +#include "Compression.hh" +#include "orc/Exceptions.hh" +#include "Statistics.hh" + +#include "wrap/orc-proto-wrapper.hh" + +namespace orc { + + class StreamsFactory { + public: + virtual ~StreamsFactory(); + + /** + * Get the stream for the given column/kind in this stripe. + * @param kind the kind of the stream + * @return the buffered output stream + */ + virtual std::unique_ptr<BufferedOutputStream> + createStream(proto::Stream_Kind kind) const = 0; + }; + + std::unique_ptr<StreamsFactory> createStreamsFactory( + const WriterOptions& options, + OutputStream * outStream); + + /** + * record stream positions for row index + */ + class RowIndexPositionRecorder : public PositionRecorder { + public: + virtual ~RowIndexPositionRecorder() override; + + RowIndexPositionRecorder(proto::RowIndexEntry& entry): + rowIndexEntry(entry) {} + + virtual void add(uint64_t pos) override { + rowIndexEntry.add_positions(pos); + } + + private: + proto::RowIndexEntry& rowIndexEntry; + }; + + /** + * The interface for writing ORC data types. + */ + class ColumnWriter { + protected: + std::unique_ptr<ByteRleEncoder> notNullEncoder; + uint64_t columnId; + std::unique_ptr<MutableColumnStatistics> colIndexStatistics; + std::unique_ptr<MutableColumnStatistics> colStripeStatistics; + std::unique_ptr<MutableColumnStatistics> colFileStatistics; + + bool enableIndex; + // row index for this column, contains all RowIndexEntries in 1 stripe + std::unique_ptr<proto::RowIndex> rowIndex; + std::unique_ptr<proto::RowIndexEntry> rowIndexEntry; + std::unique_ptr<RowIndexPositionRecorder> rowIndexPosition; + + // bloom filters are recorded per row group + bool enableBloomFilter; + std::unique_ptr<BloomFilterImpl> bloomFilter; + std::unique_ptr<proto::BloomFilterIndex> bloomFilterIndex; + + public: + ColumnWriter(const Type& type, const StreamsFactory& factory, + const WriterOptions& options); + + virtual ~ColumnWriter(); + + /** + * Write the next group of values from this rowBatch. + * @param rowBatch the row batch data to write + * @param offset the starting point of row batch to write + * @param numValues the number of values to write + * @param incomingMask if null, all values are not null. Otherwise, it is + * a mask (with at least numValues bytes) for which + * values to write. + */ + virtual void add(ColumnVectorBatch& rowBatch, + uint64_t offset, + uint64_t numValues, + const char * incomingMask); + /** + * Flush column writer output streams. + * @param streams vector to store streams generated by flush() + */ + virtual void flush(std::vector<proto::Stream>& streams); + + /** + * Get estimated size of buffer used. + * @return estimated size of buffer used + */ + virtual uint64_t getEstimatedSize() const; + + /** + * Get the encoding used by the writer for this column. + * @param encodings vector to store the returned ColumnEncoding info + */ + virtual void getColumnEncoding( + std::vector<proto::ColumnEncoding>& encodings) const = 0; + + /** + * Get the stripe statistics for this column. + * @param stats vector to store the returned stripe statistics + */ + virtual void getStripeStatistics( + std::vector<proto::ColumnStatistics>& stats) const; + + /** + * Get the file statistics for this column. 
+ * @param stats vector to store the returned file statistics + */ + virtual void getFileStatistics( + std::vector<proto::ColumnStatistics>& stats) const; + + /** + * Merge index stats into stripe stats and reset index stats. + */ + virtual void mergeRowGroupStatsIntoStripeStats(); + + /** + * Merge stripe stats into file stats and reset stripe stats. + */ + virtual void mergeStripeStatsIntoFileStats(); + + /** + * Create a row index entry with the previous location and the current + * index statistics. Also merges the index statistics into the stripe + * statistics before they are cleared. Finally, it records the start of the + * next index and ensures all of the children columns also create an entry. + */ + virtual void createRowIndexEntry(); + + /** + * Create a new BloomFilter entry and add the previous one to BloomFilterIndex + */ + virtual void addBloomFilterEntry(); + + /** + * Write row index streams for this column. + * @param streams output list of ROW_INDEX streams + */ + virtual void writeIndex(std::vector<proto::Stream> &streams) const; + + /** + * Record positions for index. + * + * This function is called by createRowIndexEntry() and ColumnWriter's + * constructor. So base classes do not need to call inherited classes' + * recordPosition() function. + */ + virtual void recordPosition() const; + + /** + * Reset positions for index. + */ + virtual void reset(); + + /** + * Write dictionary to streams for string columns + */ + virtual void writeDictionary(); + + protected: + /** + * Utility function to translate ColumnStatistics into protobuf form and + * add it to output list. + * @param statsList output list for protobuf stats + * @param stats ColumnStatistics to be transformed and added + */ + void getProtoBufStatistics( + std::vector<proto::ColumnStatistics>& statsList, + const MutableColumnStatistics* stats) const { + proto::ColumnStatistics pbStats; + stats->toProtoBuf(pbStats); + statsList.push_back(pbStats); + } + + protected: + MemoryPool& memPool; + std::unique_ptr<BufferedOutputStream> indexStream; + std::unique_ptr<BufferedOutputStream> bloomFilterStream; + }; + + /** + * Create a writer for the given type. + */ + std::unique_ptr<ColumnWriter> buildWriter( + const Type& type, + const StreamsFactory& factory, + const WriterOptions& options); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/Common.cc b/contrib/libs/apache/orc/c++/src/Common.cc index dbf073797e..e50f085d30 100644 --- a/contrib/libs/apache/orc/c++/src/Common.cc +++ b/contrib/libs/apache/orc/c++/src/Common.cc @@ -1,75 +1,75 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "orc/Common.hh" - -#include <sstream> - -namespace orc { - - std::string compressionKindToString(CompressionKind kind) { - switch (static_cast<int>(kind)) { - case CompressionKind_NONE: - return "none"; - case CompressionKind_ZLIB: - return "zlib"; - case CompressionKind_SNAPPY: - return "snappy"; - case CompressionKind_LZO: - return "lzo"; - case CompressionKind_LZ4: - return "lz4"; - case CompressionKind_ZSTD: - return "zstd"; - } - std::stringstream buffer; - buffer << "unknown - " << kind; - return buffer.str(); - } - - std::string writerVersionToString(WriterVersion version) { - switch (static_cast<int>(version)) { - case WriterVersion_ORIGINAL: - return "original"; - case WriterVersion_HIVE_8732: - return "HIVE-8732"; - case WriterVersion_HIVE_4243: - return "HIVE-4243"; - case WriterVersion_HIVE_12055: - return "HIVE-12055"; - case WriterVersion_HIVE_13083: - return "HIVE-13083"; - case WriterVersion_ORC_101: - return "ORC-101"; - case WriterVersion_ORC_135: - return "ORC-135"; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Common.hh" + +#include <sstream> + +namespace orc { + + std::string compressionKindToString(CompressionKind kind) { + switch (static_cast<int>(kind)) { + case CompressionKind_NONE: + return "none"; + case CompressionKind_ZLIB: + return "zlib"; + case CompressionKind_SNAPPY: + return "snappy"; + case CompressionKind_LZO: + return "lzo"; + case CompressionKind_LZ4: + return "lz4"; + case CompressionKind_ZSTD: + return "zstd"; + } + std::stringstream buffer; + buffer << "unknown - " << kind; + return buffer.str(); + } + + std::string writerVersionToString(WriterVersion version) { + switch (static_cast<int>(version)) { + case WriterVersion_ORIGINAL: + return "original"; + case WriterVersion_HIVE_8732: + return "HIVE-8732"; + case WriterVersion_HIVE_4243: + return "HIVE-4243"; + case WriterVersion_HIVE_12055: + return "HIVE-12055"; + case WriterVersion_HIVE_13083: + return "HIVE-13083"; + case WriterVersion_ORC_101: + return "ORC-101"; + case WriterVersion_ORC_135: + return "ORC-135"; case WriterVersion_ORC_517: return "ORC-517"; case WriterVersion_ORC_203: return "ORC-203"; case WriterVersion_ORC_14: return "ORC-14"; - } - std::stringstream buffer; - buffer << "future - " << version; - return buffer.str(); - } - + } + std::stringstream buffer; + buffer << "future - " << version; + return buffer.str(); + } + std::string writerIdToString(uint32_t id) { switch (id) { case ORC_JAVA_WRITER: @@ -90,59 +90,59 @@ namespace orc { } } - std::string streamKindToString(StreamKind kind) { - switch (static_cast<int>(kind)) { - case StreamKind_PRESENT: - return "present"; - case StreamKind_DATA: - return "data"; - case StreamKind_LENGTH: - return "length"; - case StreamKind_DICTIONARY_DATA: - return "dictionary"; - case StreamKind_DICTIONARY_COUNT: - return "dictionary count"; - case StreamKind_SECONDARY: - return "secondary"; - case StreamKind_ROW_INDEX: - return "index"; - case StreamKind_BLOOM_FILTER: - return "bloom"; - } - std::stringstream buffer; - buffer << "unknown - " << kind; - return buffer.str(); - } - - std::string columnEncodingKindToString(ColumnEncodingKind kind) { - switch (static_cast<int>(kind)) { - case ColumnEncodingKind_DIRECT: - return "direct"; - case ColumnEncodingKind_DICTIONARY: - return "dictionary"; - case ColumnEncodingKind_DIRECT_V2: - return "direct rle2"; - case ColumnEncodingKind_DICTIONARY_V2: - return "dictionary rle2"; - } - std::stringstream buffer; - buffer << "unknown - " << kind; - return buffer.str(); - } - - std::string FileVersion::toString() const { - std::stringstream ss; - ss << getMajor() << '.' 
<< getMinor(); - return ss.str(); - } - - const FileVersion& FileVersion::v_0_11(){ - static FileVersion version(0,11); - return version; - } - - const FileVersion& FileVersion::v_0_12(){ - static FileVersion version(0,12); - return version; - } -} + std::string streamKindToString(StreamKind kind) { + switch (static_cast<int>(kind)) { + case StreamKind_PRESENT: + return "present"; + case StreamKind_DATA: + return "data"; + case StreamKind_LENGTH: + return "length"; + case StreamKind_DICTIONARY_DATA: + return "dictionary"; + case StreamKind_DICTIONARY_COUNT: + return "dictionary count"; + case StreamKind_SECONDARY: + return "secondary"; + case StreamKind_ROW_INDEX: + return "index"; + case StreamKind_BLOOM_FILTER: + return "bloom"; + } + std::stringstream buffer; + buffer << "unknown - " << kind; + return buffer.str(); + } + + std::string columnEncodingKindToString(ColumnEncodingKind kind) { + switch (static_cast<int>(kind)) { + case ColumnEncodingKind_DIRECT: + return "direct"; + case ColumnEncodingKind_DICTIONARY: + return "dictionary"; + case ColumnEncodingKind_DIRECT_V2: + return "direct rle2"; + case ColumnEncodingKind_DICTIONARY_V2: + return "dictionary rle2"; + } + std::stringstream buffer; + buffer << "unknown - " << kind; + return buffer.str(); + } + + std::string FileVersion::toString() const { + std::stringstream ss; + ss << getMajor() << '.' << getMinor(); + return ss.str(); + } + + const FileVersion& FileVersion::v_0_11(){ + static FileVersion version(0,11); + return version; + } + + const FileVersion& FileVersion::v_0_12(){ + static FileVersion version(0,12); + return version; + } +} diff --git a/contrib/libs/apache/orc/c++/src/Compression.cc b/contrib/libs/apache/orc/c++/src/Compression.cc index 4278ed7aae..057641ec1f 100644 --- a/contrib/libs/apache/orc/c++/src/Compression.cc +++ b/contrib/libs/apache/orc/c++/src/Compression.cc @@ -1,1071 +1,1071 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "Adaptor.hh" -#include "Compression.hh" -#include "orc/Exceptions.hh" -#include "LzoDecompressor.hh" -#include "lz4.h" - -#include <algorithm> -#include <iomanip> -#include <iostream> -#include <sstream> - -#include "zlib.h" -#include "zstd.h" - -#include "wrap/snappy-wrapper.h" - -#ifndef ZSTD_CLEVEL_DEFAULT -#define ZSTD_CLEVEL_DEFAULT 3 -#endif - -namespace orc { - - class CompressionStreamBase: public BufferedOutputStream { - public: - CompressionStreamBase(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool); - - virtual bool Next(void** data, int*size) override = 0; - virtual void BackUp(int count) override; - - virtual std::string getName() const override = 0; - virtual uint64_t flush() override; - - virtual bool isCompressed() const override { return true; } - virtual uint64_t getSize() const override; - - protected: - void writeHeader(char * buffer, size_t compressedSize, bool original) { - buffer[0] = static_cast<char>((compressedSize << 1) + (original ? 1 : 0)); - buffer[1] = static_cast<char>(compressedSize >> 7); - buffer[2] = static_cast<char>(compressedSize >> 15); - } - - // ensure enough room for compression block header - void ensureHeader(); - - // Buffer to hold uncompressed data until user calls Next() - DataBuffer<unsigned char> rawInputBuffer; - - // Compress level - int level; - - // Compressed data output buffer - char * outputBuffer; - - // Size for compressionBuffer - int bufferSize; - - // Compress output position - int outputPosition; - - // Compress output buffer size - int outputSize; - }; - - CompressionStreamBase::CompressionStreamBase(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) : - BufferedOutputStream(pool, - outStream, - capacity, - blockSize), - rawInputBuffer(pool, blockSize), - level(compressionLevel), - outputBuffer(nullptr), - bufferSize(0), - outputPosition(0), - outputSize(0) { - // PASS - } - - void CompressionStreamBase::BackUp(int count) { - if (count > bufferSize) { - throw std::logic_error("Can't backup that much!"); - } - bufferSize -= count; - } - - uint64_t CompressionStreamBase::flush() { - void * data; - int size; - if (!Next(&data, &size)) { - throw std::runtime_error("Failed to flush compression buffer."); - } - BufferedOutputStream::BackUp(outputSize - outputPosition); - bufferSize = outputSize = outputPosition = 0; - return BufferedOutputStream::flush(); - } - - uint64_t CompressionStreamBase::getSize() const { - return BufferedOutputStream::getSize() - - static_cast<uint64_t>(outputSize - outputPosition); - } - - void CompressionStreamBase::ensureHeader() { - // adjust 3 bytes for the compression header - if (outputPosition + 3 >= outputSize) { - int newPosition = outputPosition + 3 - outputSize; - if (!BufferedOutputStream::Next( - reinterpret_cast<void **>(&outputBuffer), - &outputSize)) { - throw std::runtime_error( - "Failed to get next output buffer from output stream."); - } - outputPosition = newPosition; - } else { - outputPosition += 3; - } - } - - /** - * Streaming compression base class - */ - class CompressionStream: public CompressionStreamBase { - public: - CompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool); - - virtual bool Next(void** data, int*size) override; - virtual std::string getName() const override = 0; - - protected: - // return total compressed size - virtual uint64_t 
doStreamingCompression() = 0; - }; - - CompressionStream::CompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) : - CompressionStreamBase(outStream, - compressionLevel, - capacity, - blockSize, - pool) { - // PASS - } - - bool CompressionStream::Next(void** data, int*size) { - if (bufferSize != 0) { - ensureHeader(); - - uint64_t totalCompressedSize = doStreamingCompression(); - - char * header = outputBuffer + outputPosition - totalCompressedSize - 3; - if (totalCompressedSize >= static_cast<unsigned long>(bufferSize)) { - writeHeader(header, static_cast<size_t>(bufferSize), true); - memcpy( - header + 3, - rawInputBuffer.data(), - static_cast<size_t>(bufferSize)); - - int backup = static_cast<int>(totalCompressedSize) - bufferSize; - BufferedOutputStream::BackUp(backup); - outputPosition -= backup; - outputSize -= backup; - } else { - writeHeader(header, totalCompressedSize, false); - } - } - - *data = rawInputBuffer.data(); - *size = static_cast<int>(rawInputBuffer.size()); - bufferSize = *size; - - return true; - } - - class ZlibCompressionStream: public CompressionStream { - public: - ZlibCompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool); - - virtual ~ZlibCompressionStream() override { - end(); - } - - virtual std::string getName() const override; - - protected: - virtual uint64_t doStreamingCompression() override; - - private: - void init(); - void end(); - z_stream strm; - }; - - ZlibCompressionStream::ZlibCompressionStream( - OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) - : CompressionStream(outStream, - compressionLevel, - capacity, - blockSize, - pool) { - init(); - } - - uint64_t ZlibCompressionStream::doStreamingCompression() { - if (deflateReset(&strm) != Z_OK) { - throw std::runtime_error("Failed to reset inflate."); - } - - strm.avail_in = static_cast<unsigned int>(bufferSize); - strm.next_in = rawInputBuffer.data(); - - do { - if (outputPosition >= outputSize) { - if (!BufferedOutputStream::Next( - reinterpret_cast<void **>(&outputBuffer), - &outputSize)) { - throw std::runtime_error( - "Failed to get next output buffer from output stream."); - } - outputPosition = 0; - } - strm.next_out = reinterpret_cast<unsigned char *> - (outputBuffer + outputPosition); - strm.avail_out = static_cast<unsigned int> - (outputSize - outputPosition); - - int ret = deflate(&strm, Z_FINISH); - outputPosition = outputSize - static_cast<int>(strm.avail_out); - - if (ret == Z_STREAM_END) { - break; - } else if (ret == Z_OK) { - // needs more buffer so will continue the loop - } else { - throw std::runtime_error("Failed to deflate input data."); - } - } while (strm.avail_out == 0); - - return strm.total_out; - } - - std::string ZlibCompressionStream::getName() const { - return "ZlibCompressionStream"; - } - -DIAGNOSTIC_PUSH - -#if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wold-style-cast") -#endif - - void ZlibCompressionStream::init() { - strm.zalloc = nullptr; - strm.zfree = nullptr; - strm.opaque = nullptr; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Adaptor.hh" +#include "Compression.hh" +#include "orc/Exceptions.hh" +#include "LzoDecompressor.hh" +#include "lz4.h" + +#include <algorithm> +#include <iomanip> +#include <iostream> +#include <sstream> + +#include "zlib.h" +#include "zstd.h" + +#include "wrap/snappy-wrapper.h" + +#ifndef ZSTD_CLEVEL_DEFAULT +#define ZSTD_CLEVEL_DEFAULT 3 +#endif + +namespace orc { + + class CompressionStreamBase: public BufferedOutputStream { + public: + CompressionStreamBase(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool); + + virtual bool Next(void** data, int*size) override = 0; + virtual void BackUp(int count) override; + + virtual std::string getName() const override = 0; + virtual uint64_t flush() override; + + virtual bool isCompressed() const override { return true; } + virtual uint64_t getSize() const override; + + protected: + void writeHeader(char * buffer, size_t compressedSize, bool original) { + buffer[0] = static_cast<char>((compressedSize << 1) + (original ? 1 : 0)); + buffer[1] = static_cast<char>(compressedSize >> 7); + buffer[2] = static_cast<char>(compressedSize >> 15); + } + + // ensure enough room for compression block header + void ensureHeader(); + + // Buffer to hold uncompressed data until user calls Next() + DataBuffer<unsigned char> rawInputBuffer; + + // Compress level + int level; + + // Compressed data output buffer + char * outputBuffer; + + // Size for compressionBuffer + int bufferSize; + + // Compress output position + int outputPosition; + + // Compress output buffer size + int outputSize; + }; + + CompressionStreamBase::CompressionStreamBase(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool) : + BufferedOutputStream(pool, + outStream, + capacity, + blockSize), + rawInputBuffer(pool, blockSize), + level(compressionLevel), + outputBuffer(nullptr), + bufferSize(0), + outputPosition(0), + outputSize(0) { + // PASS + } + + void CompressionStreamBase::BackUp(int count) { + if (count > bufferSize) { + throw std::logic_error("Can't backup that much!"); + } + bufferSize -= count; + } + + uint64_t CompressionStreamBase::flush() { + void * data; + int size; + if (!Next(&data, &size)) { + throw std::runtime_error("Failed to flush compression buffer."); + } + BufferedOutputStream::BackUp(outputSize - outputPosition); + bufferSize = outputSize = outputPosition = 0; + return BufferedOutputStream::flush(); + } + + uint64_t CompressionStreamBase::getSize() const { + return BufferedOutputStream::getSize() - + static_cast<uint64_t>(outputSize - outputPosition); + } + + void CompressionStreamBase::ensureHeader() { + // adjust 3 bytes for the compression header + if (outputPosition + 3 >= outputSize) { + int newPosition = outputPosition + 3 - outputSize; + if (!BufferedOutputStream::Next( + reinterpret_cast<void **>(&outputBuffer), + &outputSize)) { + throw 
std::runtime_error( + "Failed to get next output buffer from output stream."); + } + outputPosition = newPosition; + } else { + outputPosition += 3; + } + } + + /** + * Streaming compression base class + */ + class CompressionStream: public CompressionStreamBase { + public: + CompressionStream(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool); + + virtual bool Next(void** data, int*size) override; + virtual std::string getName() const override = 0; + + protected: + // return total compressed size + virtual uint64_t doStreamingCompression() = 0; + }; + + CompressionStream::CompressionStream(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool) : + CompressionStreamBase(outStream, + compressionLevel, + capacity, + blockSize, + pool) { + // PASS + } + + bool CompressionStream::Next(void** data, int*size) { + if (bufferSize != 0) { + ensureHeader(); + + uint64_t totalCompressedSize = doStreamingCompression(); + + char * header = outputBuffer + outputPosition - totalCompressedSize - 3; + if (totalCompressedSize >= static_cast<unsigned long>(bufferSize)) { + writeHeader(header, static_cast<size_t>(bufferSize), true); + memcpy( + header + 3, + rawInputBuffer.data(), + static_cast<size_t>(bufferSize)); + + int backup = static_cast<int>(totalCompressedSize) - bufferSize; + BufferedOutputStream::BackUp(backup); + outputPosition -= backup; + outputSize -= backup; + } else { + writeHeader(header, totalCompressedSize, false); + } + } + + *data = rawInputBuffer.data(); + *size = static_cast<int>(rawInputBuffer.size()); + bufferSize = *size; + + return true; + } + + class ZlibCompressionStream: public CompressionStream { + public: + ZlibCompressionStream(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool); + + virtual ~ZlibCompressionStream() override { + end(); + } + + virtual std::string getName() const override; + + protected: + virtual uint64_t doStreamingCompression() override; + + private: + void init(); + void end(); + z_stream strm; + }; + + ZlibCompressionStream::ZlibCompressionStream( + OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool) + : CompressionStream(outStream, + compressionLevel, + capacity, + blockSize, + pool) { + init(); + } + + uint64_t ZlibCompressionStream::doStreamingCompression() { + if (deflateReset(&strm) != Z_OK) { + throw std::runtime_error("Failed to reset inflate."); + } + + strm.avail_in = static_cast<unsigned int>(bufferSize); + strm.next_in = rawInputBuffer.data(); + + do { + if (outputPosition >= outputSize) { + if (!BufferedOutputStream::Next( + reinterpret_cast<void **>(&outputBuffer), + &outputSize)) { + throw std::runtime_error( + "Failed to get next output buffer from output stream."); + } + outputPosition = 0; + } + strm.next_out = reinterpret_cast<unsigned char *> + (outputBuffer + outputPosition); + strm.avail_out = static_cast<unsigned int> + (outputSize - outputPosition); + + int ret = deflate(&strm, Z_FINISH); + outputPosition = outputSize - static_cast<int>(strm.avail_out); + + if (ret == Z_STREAM_END) { + break; + } else if (ret == Z_OK) { + // needs more buffer so will continue the loop + } else { + throw std::runtime_error("Failed to deflate input data."); + } + } while (strm.avail_out == 0); + + return strm.total_out; + } + + std::string ZlibCompressionStream::getName() const { + return "ZlibCompressionStream"; 
+ } + +DIAGNOSTIC_PUSH + +#if defined(__GNUC__) || defined(__clang__) + DIAGNOSTIC_IGNORE("-Wold-style-cast") +#endif + + void ZlibCompressionStream::init() { + strm.zalloc = nullptr; + strm.zfree = nullptr; + strm.opaque = nullptr; strm.next_in = nullptr; - - if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) - != Z_OK) { - throw std::runtime_error("Error while calling deflateInit2() for zlib."); - } - } - - void ZlibCompressionStream::end() { - (void)deflateEnd(&strm); - } - -DIAGNOSTIC_PUSH - - enum DecompressState { DECOMPRESS_HEADER, - DECOMPRESS_START, - DECOMPRESS_CONTINUE, - DECOMPRESS_ORIGINAL, - DECOMPRESS_EOF}; - - class ZlibDecompressionStream: public SeekableInputStream { - public: - ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& pool); - virtual ~ZlibDecompressionStream() override; - virtual bool Next(const void** data, int*size) override; - virtual void BackUp(int count) override; - virtual bool Skip(int count) override; - virtual int64_t ByteCount() const override; - virtual void seek(PositionProvider& position) override; - virtual std::string getName() const override; - - private: - void readBuffer(bool failOnEof) { - int length; - if (!input->Next(reinterpret_cast<const void**>(&inputBuffer), - &length)) { - if (failOnEof) { - throw ParseError("Read past EOF in " - "ZlibDecompressionStream::readBuffer"); - } - state = DECOMPRESS_EOF; - inputBuffer = nullptr; - inputBufferEnd = nullptr; - } else { - inputBufferEnd = inputBuffer + length; - } - } - - uint32_t readByte(bool failOnEof) { - if (inputBuffer == inputBufferEnd) { - readBuffer(failOnEof); - if (state == DECOMPRESS_EOF) { - return 0; - } - } - return static_cast<unsigned char>(*(inputBuffer++)); - } - - void readHeader() { - uint32_t header = readByte(false); - if (state != DECOMPRESS_EOF) { - header |= readByte(true) << 8; - header |= readByte(true) << 16; - if (header & 1) { - state = DECOMPRESS_ORIGINAL; - } else { - state = DECOMPRESS_START; - } - remainingLength = header >> 1; - } else { - remainingLength = 0; - } - } - - MemoryPool& pool; - const size_t blockSize; - std::unique_ptr<SeekableInputStream> input; - z_stream zstream; - DataBuffer<char> buffer; - - // the current state - DecompressState state; - - // the start of the current buffer - // This pointer is not owned by us. It is either owned by zstream or - // the underlying stream. 
- const char* outputBuffer; - // the size of the current buffer - size_t outputBufferLength; - // the size of the current chunk - size_t remainingLength; - - // the last buffer returned from the input - const char *inputBuffer; - const char *inputBufferEnd; - - // roughly the number of bytes returned - off_t bytesReturned; - }; - -DIAGNOSTIC_PUSH - -#if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wold-style-cast") -#endif - - ZlibDecompressionStream::ZlibDecompressionStream - (std::unique_ptr<SeekableInputStream> inStream, - size_t _blockSize, - MemoryPool& _pool - ): pool(_pool), - blockSize(_blockSize), - buffer(pool, _blockSize) { - input.reset(inStream.release()); - zstream.next_in = nullptr; - zstream.avail_in = 0; - zstream.zalloc = nullptr; - zstream.zfree = nullptr; - zstream.opaque = nullptr; - zstream.next_out = reinterpret_cast<Bytef*>(buffer.data()); - zstream.avail_out = static_cast<uInt>(blockSize); - int64_t result = inflateInit2(&zstream, -15); - switch (result) { - case Z_OK: - break; - case Z_MEM_ERROR: - throw std::logic_error("Memory error from inflateInit2"); - case Z_VERSION_ERROR: - throw std::logic_error("Version error from inflateInit2"); - case Z_STREAM_ERROR: - throw std::logic_error("Stream error from inflateInit2"); - default: - throw std::logic_error("Unknown error from inflateInit2"); - } - outputBuffer = nullptr; - outputBufferLength = 0; - remainingLength = 0; - state = DECOMPRESS_HEADER; - inputBuffer = nullptr; - inputBufferEnd = nullptr; - bytesReturned = 0; - } - -DIAGNOSTIC_POP - - ZlibDecompressionStream::~ZlibDecompressionStream() { - int64_t result = inflateEnd(&zstream); - if (result != Z_OK) { - // really can't throw in destructors - std::cout << "Error in ~ZlibDecompressionStream() " << result << "\n"; - } - } - - bool ZlibDecompressionStream::Next(const void** data, int*size) { - // if the user pushed back, return them the partial buffer - if (outputBufferLength) { - *data = outputBuffer; - *size = static_cast<int>(outputBufferLength); - outputBuffer += outputBufferLength; - outputBufferLength = 0; - return true; - } - if (state == DECOMPRESS_HEADER || remainingLength == 0) { - readHeader(); - } - if (state == DECOMPRESS_EOF) { - return false; - } - if (inputBuffer == inputBufferEnd) { - readBuffer(true); - } - size_t availSize = - std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), - remainingLength); - if (state == DECOMPRESS_ORIGINAL) { - *data = inputBuffer; - *size = static_cast<int>(availSize); - outputBuffer = inputBuffer + availSize; - outputBufferLength = 0; - } else if (state == DECOMPRESS_START) { - zstream.next_in = - reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); - zstream.avail_in = static_cast<uInt>(availSize); - outputBuffer = buffer.data(); - zstream.next_out = - reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer)); - zstream.avail_out = static_cast<uInt>(blockSize); - if (inflateReset(&zstream) != Z_OK) { - throw std::logic_error("Bad inflateReset in " - "ZlibDecompressionStream::Next"); - } - int64_t result; - do { - result = inflate(&zstream, availSize == remainingLength ? 
Z_FINISH : - Z_SYNC_FLUSH); - switch (result) { - case Z_OK: - remainingLength -= availSize; - inputBuffer += availSize; - readBuffer(true); - availSize = - std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), - remainingLength); - zstream.next_in = - reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); - zstream.avail_in = static_cast<uInt>(availSize); - break; - case Z_STREAM_END: - break; - case Z_BUF_ERROR: - throw std::logic_error("Buffer error in " - "ZlibDecompressionStream::Next"); - case Z_DATA_ERROR: - throw std::logic_error("Data error in " - "ZlibDecompressionStream::Next"); - case Z_STREAM_ERROR: - throw std::logic_error("Stream error in " - "ZlibDecompressionStream::Next"); - default: - throw std::logic_error("Unknown error in " - "ZlibDecompressionStream::Next"); - } - } while (result != Z_STREAM_END); - *size = static_cast<int>(blockSize - zstream.avail_out); - *data = outputBuffer; - outputBufferLength = 0; - outputBuffer += *size; - } else { - throw std::logic_error("Unknown compression state in " - "ZlibDecompressionStream::Next"); - } - inputBuffer += availSize; - remainingLength -= availSize; - bytesReturned += *size; - return true; - } - - void ZlibDecompressionStream::BackUp(int count) { - if (outputBuffer == nullptr || outputBufferLength != 0) { - throw std::logic_error("Backup without previous Next in " - "ZlibDecompressionStream"); - } - outputBuffer -= static_cast<size_t>(count); - outputBufferLength = static_cast<size_t>(count); - bytesReturned -= count; - } - - bool ZlibDecompressionStream::Skip(int count) { - bytesReturned += count; - // this is a stupid implementation for now. - // should skip entire blocks without decompressing - while (count > 0) { - const void *ptr; - int len; - if (!Next(&ptr, &len)) { - return false; - } - if (len > count) { - BackUp(len - count); - count = 0; - } else { - count -= len; - } - } - return true; - } - - int64_t ZlibDecompressionStream::ByteCount() const { - return bytesReturned; - } - - void ZlibDecompressionStream::seek(PositionProvider& position) { - // clear state to force seek to read from the right position - state = DECOMPRESS_HEADER; - outputBuffer = nullptr; - outputBufferLength = 0; - remainingLength = 0; - inputBuffer = nullptr; - inputBufferEnd = nullptr; - - input->seek(position); - bytesReturned = static_cast<off_t>(input->ByteCount()); - if (!Skip(static_cast<int>(position.next()))) { - throw ParseError("Bad skip in ZlibDecompressionStream::seek"); - } - } - - std::string ZlibDecompressionStream::getName() const { - std::ostringstream result; - result << "zlib(" << input->getName() << ")"; - return result.str(); - } - - class BlockDecompressionStream: public SeekableInputStream { - public: - BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& pool); - - virtual ~BlockDecompressionStream() override {} - virtual bool Next(const void** data, int*size) override; - virtual void BackUp(int count) override; - virtual bool Skip(int count) override; - virtual int64_t ByteCount() const override; - virtual void seek(PositionProvider& position) override; - virtual std::string getName() const override = 0; - - protected: - virtual uint64_t decompress(const char *input, uint64_t length, - char *output, size_t maxOutputLength) = 0; - - std::string getStreamName() const { - return input->getName(); - } - - private: - void readBuffer(bool failOnEof) { - int length; - if (!input->Next(reinterpret_cast<const void**>(&inputBufferPtr), - &length)) { - if (failOnEof) 
{ - throw ParseError(getName() + "read past EOF"); - } - state = DECOMPRESS_EOF; - inputBufferPtr = nullptr; - inputBufferPtrEnd = nullptr; - } else { - inputBufferPtrEnd = inputBufferPtr + length; - } - } - - uint32_t readByte(bool failOnEof) { - if (inputBufferPtr == inputBufferPtrEnd) { - readBuffer(failOnEof); - if (state == DECOMPRESS_EOF) { - return 0; - } - } - return static_cast<unsigned char>(*(inputBufferPtr++)); - } - - void readHeader() { - uint32_t header = readByte(false); - if (state != DECOMPRESS_EOF) { - header |= readByte(true) << 8; - header |= readByte(true) << 16; - if (header & 1) { - state = DECOMPRESS_ORIGINAL; - } else { - state = DECOMPRESS_START; - } - remainingLength = header >> 1; - } else { - remainingLength = 0; - } - } - - std::unique_ptr<SeekableInputStream> input; - MemoryPool& pool; - - // may need to stitch together multiple input buffers; - // to give snappy a contiguous block - DataBuffer<char> inputBuffer; - - // uncompressed output - DataBuffer<char> outputBuffer; - - // the current state - DecompressState state; - - // the start of the current output buffer - const char* outputBufferPtr; - // the size of the current output buffer - size_t outputBufferLength; - - // the size of the current chunk - size_t remainingLength; - - // the last buffer returned from the input - const char *inputBufferPtr; - const char *inputBufferPtrEnd; - - // bytes returned by this stream - off_t bytesReturned; - }; - - BlockDecompressionStream::BlockDecompressionStream - (std::unique_ptr<SeekableInputStream> inStream, - size_t bufferSize, - MemoryPool& _pool - ) : pool(_pool), - inputBuffer(pool, bufferSize), - outputBuffer(pool, bufferSize), - state(DECOMPRESS_HEADER), - outputBufferPtr(nullptr), - outputBufferLength(0), - remainingLength(0), - inputBufferPtr(nullptr), - inputBufferPtrEnd(nullptr), - bytesReturned(0) { - input.reset(inStream.release()); - } - - bool BlockDecompressionStream::Next(const void** data, int*size) { - // if the user pushed back, return them the partial buffer - if (outputBufferLength) { - *data = outputBufferPtr; - *size = static_cast<int>(outputBufferLength); - outputBufferPtr += outputBufferLength; - bytesReturned += static_cast<off_t>(outputBufferLength); - outputBufferLength = 0; - return true; - } - if (state == DECOMPRESS_HEADER || remainingLength == 0) { - readHeader(); - } - if (state == DECOMPRESS_EOF) { - return false; - } - if (inputBufferPtr == inputBufferPtrEnd) { - readBuffer(true); - } - - size_t availSize = - std::min(static_cast<size_t>(inputBufferPtrEnd - inputBufferPtr), - remainingLength); - if (state == DECOMPRESS_ORIGINAL) { - *data = inputBufferPtr; - *size = static_cast<int>(availSize); - outputBufferPtr = inputBufferPtr + availSize; - outputBufferLength = 0; - inputBufferPtr += availSize; - remainingLength -= availSize; - } else if (state == DECOMPRESS_START) { - // Get contiguous bytes of compressed block. - const char *compressed = inputBufferPtr; - if (remainingLength == availSize) { - inputBufferPtr += availSize; - } else { - // Did not read enough from input. 
- if (inputBuffer.capacity() < remainingLength) { - inputBuffer.resize(remainingLength); - } - ::memcpy(inputBuffer.data(), inputBufferPtr, availSize); - inputBufferPtr += availSize; - compressed = inputBuffer.data(); - - for (size_t pos = availSize; pos < remainingLength; ) { - readBuffer(true); - size_t avail = - std::min(static_cast<size_t>(inputBufferPtrEnd - - inputBufferPtr), - remainingLength - pos); - ::memcpy(inputBuffer.data() + pos, inputBufferPtr, avail); - pos += avail; - inputBufferPtr += avail; - } - } - - outputBufferLength = decompress(compressed, remainingLength, - outputBuffer.data(), - outputBuffer.capacity()); - - remainingLength = 0; - state = DECOMPRESS_HEADER; - *data = outputBuffer.data(); - *size = static_cast<int>(outputBufferLength); - outputBufferPtr = outputBuffer.data() + outputBufferLength; - outputBufferLength = 0; - } - - bytesReturned += *size; - return true; - } - - void BlockDecompressionStream::BackUp(int count) { - if (outputBufferPtr == nullptr || outputBufferLength != 0) { - throw std::logic_error("Backup without previous Next in "+getName()); - } - outputBufferPtr -= static_cast<size_t>(count); - outputBufferLength = static_cast<size_t>(count); - bytesReturned -= count; - } - - bool BlockDecompressionStream::Skip(int count) { - bytesReturned += count; - // this is a stupid implementation for now. - // should skip entire blocks without decompressing - while (count > 0) { - const void *ptr; - int len; - if (!Next(&ptr, &len)) { - return false; - } - if (len > count) { - BackUp(len - count); - count = 0; - } else { - count -= len; - } - } - return true; - } - - int64_t BlockDecompressionStream::ByteCount() const { - return bytesReturned; - } - - void BlockDecompressionStream::seek(PositionProvider& position) { - // clear state to force seek to read from the right position - state = DECOMPRESS_HEADER; - outputBufferPtr = nullptr; - outputBufferLength = 0; - remainingLength = 0; - inputBufferPtr = nullptr; - inputBufferPtrEnd = nullptr; - - input->seek(position); - if (!Skip(static_cast<int>(position.next()))) { - throw ParseError("Bad skip in " + getName()); - } - } - - class SnappyDecompressionStream: public BlockDecompressionStream { - public: - SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& pool - ): BlockDecompressionStream - (std::move(inStream), - blockSize, - pool) { - // PASS - } - - std::string getName() const override { - std::ostringstream result; - result << "snappy(" << getStreamName() << ")"; - return result.str(); - } - - protected: - virtual uint64_t decompress(const char *input, uint64_t length, - char *output, size_t maxOutputLength - ) override; - }; - - uint64_t SnappyDecompressionStream::decompress(const char *input, - uint64_t length, - char *output, - size_t maxOutputLength) { - size_t outLength; - if (!snappy::GetUncompressedLength(input, length, &outLength)) { - throw ParseError("SnappyDecompressionStream choked on corrupt input"); - } - - if (outLength > maxOutputLength) { - throw std::logic_error("Snappy length exceeds block size"); - } - - if (!snappy::RawUncompress(input, length, output)) { - throw ParseError("SnappyDecompressionStream choked on corrupt input"); - } - return outLength; - } - - class LzoDecompressionStream: public BlockDecompressionStream { - public: - LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& pool - ): BlockDecompressionStream - (std::move(inStream), - blockSize, - pool) { - // PASS - } 
- - std::string getName() const override { - std::ostringstream result; - result << "lzo(" << getStreamName() << ")"; - return result.str(); - } - - protected: - virtual uint64_t decompress(const char *input, uint64_t length, - char *output, size_t maxOutputLength - ) override; - }; - - uint64_t LzoDecompressionStream::decompress(const char *input, - uint64_t length, - char *output, - size_t maxOutputLength) { - return lzoDecompress(input, input + length, output, - output + maxOutputLength); - } - - class Lz4DecompressionStream: public BlockDecompressionStream { - public: - Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& pool - ): BlockDecompressionStream - (std::move(inStream), - blockSize, - pool) { - // PASS - } - - std::string getName() const override { - std::ostringstream result; - result << "lz4(" << getStreamName() << ")"; - return result.str(); - } - - protected: - virtual uint64_t decompress(const char *input, uint64_t length, - char *output, size_t maxOutputLength - ) override; - }; - - uint64_t Lz4DecompressionStream::decompress(const char *input, - uint64_t length, - char *output, - size_t maxOutputLength) { - int result = LZ4_decompress_safe(input, output, static_cast<int>(length), - static_cast<int>(maxOutputLength)); - if (result < 0) { - throw ParseError(getName() + " - failed to decompress"); - } - return static_cast<uint64_t>(result); - } - - /** - * Block compression base class - */ - class BlockCompressionStream: public CompressionStreamBase { - public: - BlockCompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) - : CompressionStreamBase(outStream, - compressionLevel, - capacity, - blockSize, - pool) - , compressorBuffer(pool) { - // PASS - } - - virtual bool Next(void** data, int*size) override; - virtual std::string getName() const override = 0; - - protected: - // compresses a block and returns the compressed size - virtual uint64_t doBlockCompression() = 0; - - // return maximum possible compression size for allocating space for - // compressorBuffer below - virtual uint64_t estimateMaxCompressionSize() = 0; - - // should allocate max possible compressed size - DataBuffer<unsigned char> compressorBuffer; - }; - - bool BlockCompressionStream::Next(void** data, int*size) { - if (bufferSize != 0) { - ensureHeader(); - - // perform compression - size_t totalCompressedSize = doBlockCompression(); - - const unsigned char * dataToWrite = nullptr; - int totalSizeToWrite = 0; - char * header = outputBuffer + outputPosition - 3; - - if (totalCompressedSize >= static_cast<size_t>(bufferSize)) { - writeHeader(header, static_cast<size_t>(bufferSize), true); - dataToWrite = rawInputBuffer.data(); - totalSizeToWrite = bufferSize; - } else { - writeHeader(header, totalCompressedSize, false); - dataToWrite = compressorBuffer.data(); - totalSizeToWrite = static_cast<int>(totalCompressedSize); - } - - char * dst = header + 3; - while (totalSizeToWrite > 0) { - if (outputPosition == outputSize) { - if (!BufferedOutputStream::Next(reinterpret_cast<void **>(&outputBuffer), - &outputSize)) { - throw std::logic_error( - "Failed to get next output buffer from output stream."); - } - outputPosition = 0; - dst = outputBuffer; - } else if (outputPosition > outputSize) { - // this will unlikely happen, but we have seen a few on zstd v1.1.0 - throw std::logic_error("Write to an out-of-bound place!"); - } - - int sizeToWrite = std::min(totalSizeToWrite, outputSize 
- outputPosition); - std::memcpy(dst, dataToWrite, static_cast<size_t>(sizeToWrite)); - - outputPosition += sizeToWrite; - dataToWrite += sizeToWrite; - totalSizeToWrite -= sizeToWrite; - dst += sizeToWrite; - } - } - - *data = rawInputBuffer.data(); - *size = static_cast<int>(rawInputBuffer.size()); - bufferSize = *size; - compressorBuffer.resize(estimateMaxCompressionSize()); - - return true; - } - - /** - * ZSTD block compression - */ - class ZSTDCompressionStream: public BlockCompressionStream { - public: - ZSTDCompressionStream(OutputStream * outStream, - int compressionLevel, - uint64_t capacity, - uint64_t blockSize, - MemoryPool& pool) - : BlockCompressionStream(outStream, - compressionLevel, - capacity, - blockSize, - pool) { + + if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) + != Z_OK) { + throw std::runtime_error("Error while calling deflateInit2() for zlib."); + } + } + + void ZlibCompressionStream::end() { + (void)deflateEnd(&strm); + } + +DIAGNOSTIC_PUSH + + enum DecompressState { DECOMPRESS_HEADER, + DECOMPRESS_START, + DECOMPRESS_CONTINUE, + DECOMPRESS_ORIGINAL, + DECOMPRESS_EOF}; + + class ZlibDecompressionStream: public SeekableInputStream { + public: + ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t blockSize, + MemoryPool& pool); + virtual ~ZlibDecompressionStream() override; + virtual bool Next(const void** data, int*size) override; + virtual void BackUp(int count) override; + virtual bool Skip(int count) override; + virtual int64_t ByteCount() const override; + virtual void seek(PositionProvider& position) override; + virtual std::string getName() const override; + + private: + void readBuffer(bool failOnEof) { + int length; + if (!input->Next(reinterpret_cast<const void**>(&inputBuffer), + &length)) { + if (failOnEof) { + throw ParseError("Read past EOF in " + "ZlibDecompressionStream::readBuffer"); + } + state = DECOMPRESS_EOF; + inputBuffer = nullptr; + inputBufferEnd = nullptr; + } else { + inputBufferEnd = inputBuffer + length; + } + } + + uint32_t readByte(bool failOnEof) { + if (inputBuffer == inputBufferEnd) { + readBuffer(failOnEof); + if (state == DECOMPRESS_EOF) { + return 0; + } + } + return static_cast<unsigned char>(*(inputBuffer++)); + } + + void readHeader() { + uint32_t header = readByte(false); + if (state != DECOMPRESS_EOF) { + header |= readByte(true) << 8; + header |= readByte(true) << 16; + if (header & 1) { + state = DECOMPRESS_ORIGINAL; + } else { + state = DECOMPRESS_START; + } + remainingLength = header >> 1; + } else { + remainingLength = 0; + } + } + + MemoryPool& pool; + const size_t blockSize; + std::unique_ptr<SeekableInputStream> input; + z_stream zstream; + DataBuffer<char> buffer; + + // the current state + DecompressState state; + + // the start of the current buffer + // This pointer is not owned by us. It is either owned by zstream or + // the underlying stream. 
+ const char* outputBuffer; + // the size of the current buffer + size_t outputBufferLength; + // the size of the current chunk + size_t remainingLength; + + // the last buffer returned from the input + const char *inputBuffer; + const char *inputBufferEnd; + + // roughly the number of bytes returned + off_t bytesReturned; + }; + +DIAGNOSTIC_PUSH + +#if defined(__GNUC__) || defined(__clang__) + DIAGNOSTIC_IGNORE("-Wold-style-cast") +#endif + + ZlibDecompressionStream::ZlibDecompressionStream + (std::unique_ptr<SeekableInputStream> inStream, + size_t _blockSize, + MemoryPool& _pool + ): pool(_pool), + blockSize(_blockSize), + buffer(pool, _blockSize) { + input.reset(inStream.release()); + zstream.next_in = nullptr; + zstream.avail_in = 0; + zstream.zalloc = nullptr; + zstream.zfree = nullptr; + zstream.opaque = nullptr; + zstream.next_out = reinterpret_cast<Bytef*>(buffer.data()); + zstream.avail_out = static_cast<uInt>(blockSize); + int64_t result = inflateInit2(&zstream, -15); + switch (result) { + case Z_OK: + break; + case Z_MEM_ERROR: + throw std::logic_error("Memory error from inflateInit2"); + case Z_VERSION_ERROR: + throw std::logic_error("Version error from inflateInit2"); + case Z_STREAM_ERROR: + throw std::logic_error("Stream error from inflateInit2"); + default: + throw std::logic_error("Unknown error from inflateInit2"); + } + outputBuffer = nullptr; + outputBufferLength = 0; + remainingLength = 0; + state = DECOMPRESS_HEADER; + inputBuffer = nullptr; + inputBufferEnd = nullptr; + bytesReturned = 0; + } + +DIAGNOSTIC_POP + + ZlibDecompressionStream::~ZlibDecompressionStream() { + int64_t result = inflateEnd(&zstream); + if (result != Z_OK) { + // really can't throw in destructors + std::cout << "Error in ~ZlibDecompressionStream() " << result << "\n"; + } + } + + bool ZlibDecompressionStream::Next(const void** data, int*size) { + // if the user pushed back, return them the partial buffer + if (outputBufferLength) { + *data = outputBuffer; + *size = static_cast<int>(outputBufferLength); + outputBuffer += outputBufferLength; + outputBufferLength = 0; + return true; + } + if (state == DECOMPRESS_HEADER || remainingLength == 0) { + readHeader(); + } + if (state == DECOMPRESS_EOF) { + return false; + } + if (inputBuffer == inputBufferEnd) { + readBuffer(true); + } + size_t availSize = + std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), + remainingLength); + if (state == DECOMPRESS_ORIGINAL) { + *data = inputBuffer; + *size = static_cast<int>(availSize); + outputBuffer = inputBuffer + availSize; + outputBufferLength = 0; + } else if (state == DECOMPRESS_START) { + zstream.next_in = + reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); + zstream.avail_in = static_cast<uInt>(availSize); + outputBuffer = buffer.data(); + zstream.next_out = + reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer)); + zstream.avail_out = static_cast<uInt>(blockSize); + if (inflateReset(&zstream) != Z_OK) { + throw std::logic_error("Bad inflateReset in " + "ZlibDecompressionStream::Next"); + } + int64_t result; + do { + result = inflate(&zstream, availSize == remainingLength ? 
Z_FINISH : + Z_SYNC_FLUSH); + switch (result) { + case Z_OK: + remainingLength -= availSize; + inputBuffer += availSize; + readBuffer(true); + availSize = + std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), + remainingLength); + zstream.next_in = + reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); + zstream.avail_in = static_cast<uInt>(availSize); + break; + case Z_STREAM_END: + break; + case Z_BUF_ERROR: + throw std::logic_error("Buffer error in " + "ZlibDecompressionStream::Next"); + case Z_DATA_ERROR: + throw std::logic_error("Data error in " + "ZlibDecompressionStream::Next"); + case Z_STREAM_ERROR: + throw std::logic_error("Stream error in " + "ZlibDecompressionStream::Next"); + default: + throw std::logic_error("Unknown error in " + "ZlibDecompressionStream::Next"); + } + } while (result != Z_STREAM_END); + *size = static_cast<int>(blockSize - zstream.avail_out); + *data = outputBuffer; + outputBufferLength = 0; + outputBuffer += *size; + } else { + throw std::logic_error("Unknown compression state in " + "ZlibDecompressionStream::Next"); + } + inputBuffer += availSize; + remainingLength -= availSize; + bytesReturned += *size; + return true; + } + + void ZlibDecompressionStream::BackUp(int count) { + if (outputBuffer == nullptr || outputBufferLength != 0) { + throw std::logic_error("Backup without previous Next in " + "ZlibDecompressionStream"); + } + outputBuffer -= static_cast<size_t>(count); + outputBufferLength = static_cast<size_t>(count); + bytesReturned -= count; + } + + bool ZlibDecompressionStream::Skip(int count) { + bytesReturned += count; + // this is a stupid implementation for now. + // should skip entire blocks without decompressing + while (count > 0) { + const void *ptr; + int len; + if (!Next(&ptr, &len)) { + return false; + } + if (len > count) { + BackUp(len - count); + count = 0; + } else { + count -= len; + } + } + return true; + } + + int64_t ZlibDecompressionStream::ByteCount() const { + return bytesReturned; + } + + void ZlibDecompressionStream::seek(PositionProvider& position) { + // clear state to force seek to read from the right position + state = DECOMPRESS_HEADER; + outputBuffer = nullptr; + outputBufferLength = 0; + remainingLength = 0; + inputBuffer = nullptr; + inputBufferEnd = nullptr; + + input->seek(position); + bytesReturned = static_cast<off_t>(input->ByteCount()); + if (!Skip(static_cast<int>(position.next()))) { + throw ParseError("Bad skip in ZlibDecompressionStream::seek"); + } + } + + std::string ZlibDecompressionStream::getName() const { + std::ostringstream result; + result << "zlib(" << input->getName() << ")"; + return result.str(); + } + + class BlockDecompressionStream: public SeekableInputStream { + public: + BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t blockSize, + MemoryPool& pool); + + virtual ~BlockDecompressionStream() override {} + virtual bool Next(const void** data, int*size) override; + virtual void BackUp(int count) override; + virtual bool Skip(int count) override; + virtual int64_t ByteCount() const override; + virtual void seek(PositionProvider& position) override; + virtual std::string getName() const override = 0; + + protected: + virtual uint64_t decompress(const char *input, uint64_t length, + char *output, size_t maxOutputLength) = 0; + + std::string getStreamName() const { + return input->getName(); + } + + private: + void readBuffer(bool failOnEof) { + int length; + if (!input->Next(reinterpret_cast<const void**>(&inputBufferPtr), + &length)) { + if (failOnEof) 
{ + throw ParseError(getName() + "read past EOF"); + } + state = DECOMPRESS_EOF; + inputBufferPtr = nullptr; + inputBufferPtrEnd = nullptr; + } else { + inputBufferPtrEnd = inputBufferPtr + length; + } + } + + uint32_t readByte(bool failOnEof) { + if (inputBufferPtr == inputBufferPtrEnd) { + readBuffer(failOnEof); + if (state == DECOMPRESS_EOF) { + return 0; + } + } + return static_cast<unsigned char>(*(inputBufferPtr++)); + } + + void readHeader() { + uint32_t header = readByte(false); + if (state != DECOMPRESS_EOF) { + header |= readByte(true) << 8; + header |= readByte(true) << 16; + if (header & 1) { + state = DECOMPRESS_ORIGINAL; + } else { + state = DECOMPRESS_START; + } + remainingLength = header >> 1; + } else { + remainingLength = 0; + } + } + + std::unique_ptr<SeekableInputStream> input; + MemoryPool& pool; + + // may need to stitch together multiple input buffers; + // to give snappy a contiguous block + DataBuffer<char> inputBuffer; + + // uncompressed output + DataBuffer<char> outputBuffer; + + // the current state + DecompressState state; + + // the start of the current output buffer + const char* outputBufferPtr; + // the size of the current output buffer + size_t outputBufferLength; + + // the size of the current chunk + size_t remainingLength; + + // the last buffer returned from the input + const char *inputBufferPtr; + const char *inputBufferPtrEnd; + + // bytes returned by this stream + off_t bytesReturned; + }; + + BlockDecompressionStream::BlockDecompressionStream + (std::unique_ptr<SeekableInputStream> inStream, + size_t bufferSize, + MemoryPool& _pool + ) : pool(_pool), + inputBuffer(pool, bufferSize), + outputBuffer(pool, bufferSize), + state(DECOMPRESS_HEADER), + outputBufferPtr(nullptr), + outputBufferLength(0), + remainingLength(0), + inputBufferPtr(nullptr), + inputBufferPtrEnd(nullptr), + bytesReturned(0) { + input.reset(inStream.release()); + } + + bool BlockDecompressionStream::Next(const void** data, int*size) { + // if the user pushed back, return them the partial buffer + if (outputBufferLength) { + *data = outputBufferPtr; + *size = static_cast<int>(outputBufferLength); + outputBufferPtr += outputBufferLength; + bytesReturned += static_cast<off_t>(outputBufferLength); + outputBufferLength = 0; + return true; + } + if (state == DECOMPRESS_HEADER || remainingLength == 0) { + readHeader(); + } + if (state == DECOMPRESS_EOF) { + return false; + } + if (inputBufferPtr == inputBufferPtrEnd) { + readBuffer(true); + } + + size_t availSize = + std::min(static_cast<size_t>(inputBufferPtrEnd - inputBufferPtr), + remainingLength); + if (state == DECOMPRESS_ORIGINAL) { + *data = inputBufferPtr; + *size = static_cast<int>(availSize); + outputBufferPtr = inputBufferPtr + availSize; + outputBufferLength = 0; + inputBufferPtr += availSize; + remainingLength -= availSize; + } else if (state == DECOMPRESS_START) { + // Get contiguous bytes of compressed block. + const char *compressed = inputBufferPtr; + if (remainingLength == availSize) { + inputBufferPtr += availSize; + } else { + // Did not read enough from input. 
+ if (inputBuffer.capacity() < remainingLength) { + inputBuffer.resize(remainingLength); + } + ::memcpy(inputBuffer.data(), inputBufferPtr, availSize); + inputBufferPtr += availSize; + compressed = inputBuffer.data(); + + for (size_t pos = availSize; pos < remainingLength; ) { + readBuffer(true); + size_t avail = + std::min(static_cast<size_t>(inputBufferPtrEnd - + inputBufferPtr), + remainingLength - pos); + ::memcpy(inputBuffer.data() + pos, inputBufferPtr, avail); + pos += avail; + inputBufferPtr += avail; + } + } + + outputBufferLength = decompress(compressed, remainingLength, + outputBuffer.data(), + outputBuffer.capacity()); + + remainingLength = 0; + state = DECOMPRESS_HEADER; + *data = outputBuffer.data(); + *size = static_cast<int>(outputBufferLength); + outputBufferPtr = outputBuffer.data() + outputBufferLength; + outputBufferLength = 0; + } + + bytesReturned += *size; + return true; + } + + void BlockDecompressionStream::BackUp(int count) { + if (outputBufferPtr == nullptr || outputBufferLength != 0) { + throw std::logic_error("Backup without previous Next in "+getName()); + } + outputBufferPtr -= static_cast<size_t>(count); + outputBufferLength = static_cast<size_t>(count); + bytesReturned -= count; + } + + bool BlockDecompressionStream::Skip(int count) { + bytesReturned += count; + // this is a stupid implementation for now. + // should skip entire blocks without decompressing + while (count > 0) { + const void *ptr; + int len; + if (!Next(&ptr, &len)) { + return false; + } + if (len > count) { + BackUp(len - count); + count = 0; + } else { + count -= len; + } + } + return true; + } + + int64_t BlockDecompressionStream::ByteCount() const { + return bytesReturned; + } + + void BlockDecompressionStream::seek(PositionProvider& position) { + // clear state to force seek to read from the right position + state = DECOMPRESS_HEADER; + outputBufferPtr = nullptr; + outputBufferLength = 0; + remainingLength = 0; + inputBufferPtr = nullptr; + inputBufferPtrEnd = nullptr; + + input->seek(position); + if (!Skip(static_cast<int>(position.next()))) { + throw ParseError("Bad skip in " + getName()); + } + } + + class SnappyDecompressionStream: public BlockDecompressionStream { + public: + SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t blockSize, + MemoryPool& pool + ): BlockDecompressionStream + (std::move(inStream), + blockSize, + pool) { + // PASS + } + + std::string getName() const override { + std::ostringstream result; + result << "snappy(" << getStreamName() << ")"; + return result.str(); + } + + protected: + virtual uint64_t decompress(const char *input, uint64_t length, + char *output, size_t maxOutputLength + ) override; + }; + + uint64_t SnappyDecompressionStream::decompress(const char *input, + uint64_t length, + char *output, + size_t maxOutputLength) { + size_t outLength; + if (!snappy::GetUncompressedLength(input, length, &outLength)) { + throw ParseError("SnappyDecompressionStream choked on corrupt input"); + } + + if (outLength > maxOutputLength) { + throw std::logic_error("Snappy length exceeds block size"); + } + + if (!snappy::RawUncompress(input, length, output)) { + throw ParseError("SnappyDecompressionStream choked on corrupt input"); + } + return outLength; + } + + class LzoDecompressionStream: public BlockDecompressionStream { + public: + LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t blockSize, + MemoryPool& pool + ): BlockDecompressionStream + (std::move(inStream), + blockSize, + pool) { + // PASS + } 
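For orientation, the readHeader() logic above can be read in isolation: each compressed chunk is preceded by a 3-byte little-endian header whose low bit marks an uncompressed ("original") chunk and whose remaining 23 bits carry the chunk length. The following self-contained sketch mirrors that decode; the ChunkHeader struct and decodeChunkHeader helper are illustrative names introduced here, not part of the ORC sources.

    #include <cstdint>
    #include <cstdio>

    // Illustrative sketch only: decodes the 3-byte chunk header that
    // readHeader() above assembles one byte at a time.
    struct ChunkHeader {
      bool isOriginal;       // low bit set => chunk stored without compression
      uint32_t chunkLength;  // number of payload bytes that follow the header
    };

    static ChunkHeader decodeChunkHeader(const unsigned char bytes[3]) {
      const uint32_t header = static_cast<uint32_t>(bytes[0]) |
                              (static_cast<uint32_t>(bytes[1]) << 8) |
                              (static_cast<uint32_t>(bytes[2]) << 16);
      return ChunkHeader{(header & 1) != 0, header >> 1};
    }

    int main() {
      // Bytes 0x40 0x0d 0x00 give header = 0x0d40; the low bit is clear, so
      // this is a compressed chunk of 0x0d40 >> 1 = 1696 bytes.
      const unsigned char bytes[3] = {0x40, 0x0d, 0x00};
      const ChunkHeader h = decodeChunkHeader(bytes);
      std::printf("original=%d length=%u\n", h.isOriginal ? 1 : 0, h.chunkLength);
      return 0;
    }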
+ + std::string getName() const override { + std::ostringstream result; + result << "lzo(" << getStreamName() << ")"; + return result.str(); + } + + protected: + virtual uint64_t decompress(const char *input, uint64_t length, + char *output, size_t maxOutputLength + ) override; + }; + + uint64_t LzoDecompressionStream::decompress(const char *input, + uint64_t length, + char *output, + size_t maxOutputLength) { + return lzoDecompress(input, input + length, output, + output + maxOutputLength); + } + + class Lz4DecompressionStream: public BlockDecompressionStream { + public: + Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t blockSize, + MemoryPool& pool + ): BlockDecompressionStream + (std::move(inStream), + blockSize, + pool) { + // PASS + } + + std::string getName() const override { + std::ostringstream result; + result << "lz4(" << getStreamName() << ")"; + return result.str(); + } + + protected: + virtual uint64_t decompress(const char *input, uint64_t length, + char *output, size_t maxOutputLength + ) override; + }; + + uint64_t Lz4DecompressionStream::decompress(const char *input, + uint64_t length, + char *output, + size_t maxOutputLength) { + int result = LZ4_decompress_safe(input, output, static_cast<int>(length), + static_cast<int>(maxOutputLength)); + if (result < 0) { + throw ParseError(getName() + " - failed to decompress"); + } + return static_cast<uint64_t>(result); + } + + /** + * Block compression base class + */ + class BlockCompressionStream: public CompressionStreamBase { + public: + BlockCompressionStream(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool) + : CompressionStreamBase(outStream, + compressionLevel, + capacity, + blockSize, + pool) + , compressorBuffer(pool) { + // PASS + } + + virtual bool Next(void** data, int*size) override; + virtual std::string getName() const override = 0; + + protected: + // compresses a block and returns the compressed size + virtual uint64_t doBlockCompression() = 0; + + // return maximum possible compression size for allocating space for + // compressorBuffer below + virtual uint64_t estimateMaxCompressionSize() = 0; + + // should allocate max possible compressed size + DataBuffer<unsigned char> compressorBuffer; + }; + + bool BlockCompressionStream::Next(void** data, int*size) { + if (bufferSize != 0) { + ensureHeader(); + + // perform compression + size_t totalCompressedSize = doBlockCompression(); + + const unsigned char * dataToWrite = nullptr; + int totalSizeToWrite = 0; + char * header = outputBuffer + outputPosition - 3; + + if (totalCompressedSize >= static_cast<size_t>(bufferSize)) { + writeHeader(header, static_cast<size_t>(bufferSize), true); + dataToWrite = rawInputBuffer.data(); + totalSizeToWrite = bufferSize; + } else { + writeHeader(header, totalCompressedSize, false); + dataToWrite = compressorBuffer.data(); + totalSizeToWrite = static_cast<int>(totalCompressedSize); + } + + char * dst = header + 3; + while (totalSizeToWrite > 0) { + if (outputPosition == outputSize) { + if (!BufferedOutputStream::Next(reinterpret_cast<void **>(&outputBuffer), + &outputSize)) { + throw std::logic_error( + "Failed to get next output buffer from output stream."); + } + outputPosition = 0; + dst = outputBuffer; + } else if (outputPosition > outputSize) { + // this will unlikely happen, but we have seen a few on zstd v1.1.0 + throw std::logic_error("Write to an out-of-bound place!"); + } + + int sizeToWrite = std::min(totalSizeToWrite, outputSize 
- outputPosition); + std::memcpy(dst, dataToWrite, static_cast<size_t>(sizeToWrite)); + + outputPosition += sizeToWrite; + dataToWrite += sizeToWrite; + totalSizeToWrite -= sizeToWrite; + dst += sizeToWrite; + } + } + + *data = rawInputBuffer.data(); + *size = static_cast<int>(rawInputBuffer.size()); + bufferSize = *size; + compressorBuffer.resize(estimateMaxCompressionSize()); + + return true; + } + + /** + * ZSTD block compression + */ + class ZSTDCompressionStream: public BlockCompressionStream { + public: + ZSTDCompressionStream(OutputStream * outStream, + int compressionLevel, + uint64_t capacity, + uint64_t blockSize, + MemoryPool& pool) + : BlockCompressionStream(outStream, + compressionLevel, + capacity, + blockSize, + pool) { this->init(); - } - - virtual std::string getName() const override { - return "ZstdCompressionStream"; - } + } + + virtual std::string getName() const override { + return "ZstdCompressionStream"; + } virtual ~ZSTDCompressionStream() override { this->end(); } - - protected: - virtual uint64_t doBlockCompression() override; - - virtual uint64_t estimateMaxCompressionSize() override { - return ZSTD_compressBound(static_cast<size_t>(bufferSize)); - } + + protected: + virtual uint64_t doBlockCompression() override; + + virtual uint64_t estimateMaxCompressionSize() override { + return ZSTD_compressBound(static_cast<size_t>(bufferSize)); + } private: void init(); void end(); ZSTD_CCtx *cctx; - }; - - uint64_t ZSTDCompressionStream::doBlockCompression() { + }; + + uint64_t ZSTDCompressionStream::doBlockCompression() { return ZSTD_compressCCtx(cctx, compressorBuffer.data(), compressorBuffer.size(), rawInputBuffer.data(), static_cast<size_t>(bufferSize), level); - } + } DIAGNOSTIC_PUSH - + #if defined(__GNUC__) || defined(__clang__) DIAGNOSTIC_IGNORE("-Wold-style-cast") #endif @@ -1086,53 +1086,53 @@ DIAGNOSTIC_PUSH DIAGNOSTIC_PUSH - /** - * ZSTD block decompression - */ - class ZSTDDecompressionStream: public BlockDecompressionStream { - public: - ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, - size_t blockSize, - MemoryPool& pool) - : BlockDecompressionStream(std::move(inStream), - blockSize, - pool) { + /** + * ZSTD block decompression + */ + class ZSTDDecompressionStream: public BlockDecompressionStream { + public: + ZSTDDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, + size_t blockSize, + MemoryPool& pool) + : BlockDecompressionStream(std::move(inStream), + blockSize, + pool) { this->init(); - } - + } + virtual ~ZSTDDecompressionStream() override { this->end(); } - std::string getName() const override { - std::ostringstream result; - result << "zstd(" << getStreamName() << ")"; - return result.str(); - } - - protected: - virtual uint64_t decompress(const char *input, - uint64_t length, - char *output, - size_t maxOutputLength) override; + std::string getName() const override { + std::ostringstream result; + result << "zstd(" << getStreamName() << ")"; + return result.str(); + } + + protected: + virtual uint64_t decompress(const char *input, + uint64_t length, + char *output, + size_t maxOutputLength) override; private: void init(); void end(); ZSTD_DCtx *dctx; - }; - - uint64_t ZSTDDecompressionStream::decompress(const char *input, - uint64_t length, - char *output, - size_t maxOutputLength) { + }; + + uint64_t ZSTDDecompressionStream::decompress(const char *input, + uint64_t length, + char *output, + size_t maxOutputLength) { return static_cast<uint64_t>(ZSTD_decompressDCtx(dctx, output, maxOutputLength, input, 
length)); - } - + } + DIAGNOSTIC_PUSH #if defined(__GNUC__) || defined(__clang__) @@ -1155,71 +1155,71 @@ DIAGNOSTIC_PUSH DIAGNOSTIC_PUSH - std::unique_ptr<BufferedOutputStream> - createCompressor( - CompressionKind kind, - OutputStream * outStream, - CompressionStrategy strategy, - uint64_t bufferCapacity, - uint64_t compressionBlockSize, - MemoryPool& pool) { - switch (static_cast<int64_t>(kind)) { - case CompressionKind_NONE: { - return std::unique_ptr<BufferedOutputStream> - (new BufferedOutputStream( - pool, outStream, bufferCapacity, compressionBlockSize)); - } - case CompressionKind_ZLIB: { - int level = (strategy == CompressionStrategy_SPEED) ? - Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION; - return std::unique_ptr<BufferedOutputStream> - (new ZlibCompressionStream( - outStream, level, bufferCapacity, compressionBlockSize, pool)); - } - case CompressionKind_ZSTD: { - int level = (strategy == CompressionStrategy_SPEED) ? - 1 : ZSTD_CLEVEL_DEFAULT; - return std::unique_ptr<BufferedOutputStream> - (new ZSTDCompressionStream( - outStream, level, bufferCapacity, compressionBlockSize, pool)); - } - case CompressionKind_SNAPPY: - case CompressionKind_LZO: - case CompressionKind_LZ4: - default: - throw NotImplementedYet("compression codec"); - } - } - - std::unique_ptr<SeekableInputStream> - createDecompressor(CompressionKind kind, - std::unique_ptr<SeekableInputStream> input, - uint64_t blockSize, - MemoryPool& pool) { - switch (static_cast<int64_t>(kind)) { - case CompressionKind_NONE: - return REDUNDANT_MOVE(input); - case CompressionKind_ZLIB: - return std::unique_ptr<SeekableInputStream> - (new ZlibDecompressionStream(std::move(input), blockSize, pool)); - case CompressionKind_SNAPPY: - return std::unique_ptr<SeekableInputStream> - (new SnappyDecompressionStream(std::move(input), blockSize, pool)); - case CompressionKind_LZO: - return std::unique_ptr<SeekableInputStream> - (new LzoDecompressionStream(std::move(input), blockSize, pool)); - case CompressionKind_LZ4: - return std::unique_ptr<SeekableInputStream> - (new Lz4DecompressionStream(std::move(input), blockSize, pool)); - case CompressionKind_ZSTD: - return std::unique_ptr<SeekableInputStream> - (new ZSTDDecompressionStream(std::move(input), blockSize, pool)); - default: { - std::ostringstream buffer; - buffer << "Unknown compression codec " << kind; - throw NotImplementedYet(buffer.str()); - } - } - } - -} + std::unique_ptr<BufferedOutputStream> + createCompressor( + CompressionKind kind, + OutputStream * outStream, + CompressionStrategy strategy, + uint64_t bufferCapacity, + uint64_t compressionBlockSize, + MemoryPool& pool) { + switch (static_cast<int64_t>(kind)) { + case CompressionKind_NONE: { + return std::unique_ptr<BufferedOutputStream> + (new BufferedOutputStream( + pool, outStream, bufferCapacity, compressionBlockSize)); + } + case CompressionKind_ZLIB: { + int level = (strategy == CompressionStrategy_SPEED) ? + Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION; + return std::unique_ptr<BufferedOutputStream> + (new ZlibCompressionStream( + outStream, level, bufferCapacity, compressionBlockSize, pool)); + } + case CompressionKind_ZSTD: { + int level = (strategy == CompressionStrategy_SPEED) ? 
+ 1 : ZSTD_CLEVEL_DEFAULT; + return std::unique_ptr<BufferedOutputStream> + (new ZSTDCompressionStream( + outStream, level, bufferCapacity, compressionBlockSize, pool)); + } + case CompressionKind_SNAPPY: + case CompressionKind_LZO: + case CompressionKind_LZ4: + default: + throw NotImplementedYet("compression codec"); + } + } + + std::unique_ptr<SeekableInputStream> + createDecompressor(CompressionKind kind, + std::unique_ptr<SeekableInputStream> input, + uint64_t blockSize, + MemoryPool& pool) { + switch (static_cast<int64_t>(kind)) { + case CompressionKind_NONE: + return REDUNDANT_MOVE(input); + case CompressionKind_ZLIB: + return std::unique_ptr<SeekableInputStream> + (new ZlibDecompressionStream(std::move(input), blockSize, pool)); + case CompressionKind_SNAPPY: + return std::unique_ptr<SeekableInputStream> + (new SnappyDecompressionStream(std::move(input), blockSize, pool)); + case CompressionKind_LZO: + return std::unique_ptr<SeekableInputStream> + (new LzoDecompressionStream(std::move(input), blockSize, pool)); + case CompressionKind_LZ4: + return std::unique_ptr<SeekableInputStream> + (new Lz4DecompressionStream(std::move(input), blockSize, pool)); + case CompressionKind_ZSTD: + return std::unique_ptr<SeekableInputStream> + (new ZSTDDecompressionStream(std::move(input), blockSize, pool)); + default: { + std::ostringstream buffer; + buffer << "Unknown compression codec " << kind; + throw NotImplementedYet(buffer.str()); + } + } + } + +} diff --git a/contrib/libs/apache/orc/c++/src/Compression.hh b/contrib/libs/apache/orc/c++/src/Compression.hh index ff79377d83..84e85bddaf 100644 --- a/contrib/libs/apache/orc/c++/src/Compression.hh +++ b/contrib/libs/apache/orc/c++/src/Compression.hh @@ -1,58 +1,58 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_COMPRESSION_HH -#define ORC_COMPRESSION_HH - -#include "io/InputStream.hh" -#include "io/OutputStream.hh" - -namespace orc { - - /** - * Create a decompressor for the given compression kind. - * @param kind the compression type to implement - * @param input the input stream that is the underlying source - * @param bufferSize the maximum size of the buffer - * @param pool the memory pool - */ - std::unique_ptr<SeekableInputStream> - createDecompressor(CompressionKind kind, - std::unique_ptr<SeekableInputStream> input, - uint64_t bufferSize, - MemoryPool& pool); - - /** - * Create a compressor for the given compression kind. 
- * @param kind the compression type to implement - * @param outStream the output stream that is the underlying target - * @param strategy compression strategy - * @param bufferCapacity compression stream buffer total capacity - * @param compressionBlockSize compression buffer block size - * @param pool the memory pool - */ - std::unique_ptr<BufferedOutputStream> - createCompressor(CompressionKind kind, - OutputStream * outStream, - CompressionStrategy strategy, - uint64_t bufferCapacity, - uint64_t compressionBlockSize, - MemoryPool& pool); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_COMPRESSION_HH +#define ORC_COMPRESSION_HH + +#include "io/InputStream.hh" +#include "io/OutputStream.hh" + +namespace orc { + + /** + * Create a decompressor for the given compression kind. + * @param kind the compression type to implement + * @param input the input stream that is the underlying source + * @param bufferSize the maximum size of the buffer + * @param pool the memory pool + */ + std::unique_ptr<SeekableInputStream> + createDecompressor(CompressionKind kind, + std::unique_ptr<SeekableInputStream> input, + uint64_t bufferSize, + MemoryPool& pool); + + /** + * Create a compressor for the given compression kind. + * @param kind the compression type to implement + * @param outStream the output stream that is the underlying target + * @param strategy compression strategy + * @param bufferCapacity compression stream buffer total capacity + * @param compressionBlockSize compression buffer block size + * @param pool the memory pool + */ + std::unique_ptr<BufferedOutputStream> + createCompressor(CompressionKind kind, + OutputStream * outStream, + CompressionStrategy strategy, + uint64_t bufferCapacity, + uint64_t compressionBlockSize, + MemoryPool& pool); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/Exceptions.cc b/contrib/libs/apache/orc/c++/src/Exceptions.cc index 2077b27df4..f721c05a88 100644 --- a/contrib/libs/apache/orc/c++/src/Exceptions.cc +++ b/contrib/libs/apache/orc/c++/src/Exceptions.cc @@ -1,78 +1,78 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/Exceptions.hh" - -namespace orc { - - NotImplementedYet::NotImplementedYet(const std::string& what_arg - ) : logic_error(what_arg) { - // PASS - } - - NotImplementedYet::NotImplementedYet(const char* what_arg - ) :logic_error(what_arg) { - // PASS - } - - NotImplementedYet::NotImplementedYet(const NotImplementedYet& error - ): logic_error(error) { - // PASS - } - - NotImplementedYet::~NotImplementedYet() ORC_NOEXCEPT { - // PASS - } - - ParseError::ParseError(const std::string& what_arg - ): runtime_error(what_arg) { - // PASS - } - - ParseError::ParseError(const char* what_arg - ): runtime_error(what_arg) { - // PASS - } - - ParseError::ParseError(const ParseError& error): runtime_error(error) { - // PASS - } - - ParseError::~ParseError() ORC_NOEXCEPT { - // PASS - } - - InvalidArgument::InvalidArgument(const std::string& what_arg - ): runtime_error(what_arg) { - // PASS - } - - InvalidArgument::InvalidArgument(const char* what_arg - ): runtime_error(what_arg) { - // PASS - } - - InvalidArgument::InvalidArgument(const InvalidArgument& error - ): runtime_error(error) { - // PASS - } - - InvalidArgument::~InvalidArgument() ORC_NOEXCEPT { - // PASS - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Exceptions.hh" + +namespace orc { + + NotImplementedYet::NotImplementedYet(const std::string& what_arg + ) : logic_error(what_arg) { + // PASS + } + + NotImplementedYet::NotImplementedYet(const char* what_arg + ) :logic_error(what_arg) { + // PASS + } + + NotImplementedYet::NotImplementedYet(const NotImplementedYet& error + ): logic_error(error) { + // PASS + } + + NotImplementedYet::~NotImplementedYet() ORC_NOEXCEPT { + // PASS + } + + ParseError::ParseError(const std::string& what_arg + ): runtime_error(what_arg) { + // PASS + } + + ParseError::ParseError(const char* what_arg + ): runtime_error(what_arg) { + // PASS + } + + ParseError::ParseError(const ParseError& error): runtime_error(error) { + // PASS + } + + ParseError::~ParseError() ORC_NOEXCEPT { + // PASS + } + + InvalidArgument::InvalidArgument(const std::string& what_arg + ): runtime_error(what_arg) { + // PASS + } + + InvalidArgument::InvalidArgument(const char* what_arg + ): runtime_error(what_arg) { + // PASS + } + + InvalidArgument::InvalidArgument(const InvalidArgument& error + ): runtime_error(error) { + // PASS + } + + InvalidArgument::~InvalidArgument() ORC_NOEXCEPT { + // PASS + } +} diff --git a/contrib/libs/apache/orc/c++/src/Int128.cc b/contrib/libs/apache/orc/c++/src/Int128.cc index 433e6fa193..96266e855c 100644 --- a/contrib/libs/apache/orc/c++/src/Int128.cc +++ b/contrib/libs/apache/orc/c++/src/Int128.cc @@ -1,494 +1,494 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/Int128.hh" -#include "Adaptor.hh" - -#include <algorithm> -#include <iomanip> -#include <iostream> -#include <sstream> - -namespace orc { - - Int128 Int128::maximumValue() { - return Int128(0x7fffffffffffffff, 0xfffffffffffffff); - } - - Int128 Int128::minimumValue() { - return Int128(static_cast<int64_t>(0x8000000000000000), 0x0); - } - - Int128::Int128(const std::string& str) { - lowbits = 0; - highbits = 0; - size_t length = str.length(); - if (length > 0) { - bool isNegative = str[0] == '-'; - size_t posn = isNegative ? 1 : 0; - while (posn < length) { - size_t group = std::min(static_cast<size_t>(18), length - posn); - int64_t chunk = std::stoll(str.substr(posn, group)); - int64_t multiple = 1; - for(size_t i=0; i < group; ++i) { - multiple *= 10; - } - *this *= multiple; - *this += chunk; - posn += group; - } - if (isNegative) { - negate(); - } - } - } - - Int128& Int128::operator*=(const Int128 &right) { - const uint64_t INT_MASK = 0xffffffff; - const uint64_t CARRY_BIT = INT_MASK + 1; - - // Break the left and right numbers into 32 bit chunks - // so that we can multiply them without overflow. 
- uint64_t L0 = static_cast<uint64_t>(highbits) >> 32; - uint64_t L1 = static_cast<uint64_t>(highbits) & INT_MASK; - uint64_t L2 = lowbits >> 32; - uint64_t L3 = lowbits & INT_MASK; - uint64_t R0 = static_cast<uint64_t>(right.highbits) >> 32; - uint64_t R1 = static_cast<uint64_t>(right.highbits) & INT_MASK; - uint64_t R2 = right.lowbits >> 32; - uint64_t R3 = right.lowbits & INT_MASK; - - uint64_t product = L3 * R3; - lowbits = product & INT_MASK; - uint64_t sum = product >> 32; - product = L2 * R3; - sum += product; - highbits = sum < product ? CARRY_BIT : 0; - product = L3 * R2; - sum += product; - if (sum < product) { - highbits += CARRY_BIT; - } - lowbits += sum << 32; - highbits += static_cast<int64_t>(sum >> 32); - highbits += L1 * R3 + L2 * R2 + L3 * R1; - highbits += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32; - return *this; - } - - /** - * Expands the given value into an array of ints so that we can work on - * it. The array will be converted to an absolute value and the wasNegative - * flag will be set appropriately. The array will remove leading zeros from - * the value. - * @param array an array of length 4 to set with the value - * @param wasNegative a flag for whether the value was original negative - * @result the output length of the array - */ - int64_t Int128::fillInArray(uint32_t* array, bool &wasNegative) const { - uint64_t high; - uint64_t low; - if (highbits < 0) { - low = ~lowbits + 1; - high = static_cast<uint64_t>(~highbits); - if (low == 0) { - high += 1; - } - wasNegative = true; - } else { - low = lowbits; - high = static_cast<uint64_t>(highbits); - wasNegative = false; - } - if (high != 0) { - if (high > UINT32_MAX) { - array[0] = static_cast<uint32_t>(high >> 32); - array[1] = static_cast<uint32_t>(high); - array[2] = static_cast<uint32_t>(low >> 32); - array[3] = static_cast<uint32_t>(low); - return 4; - } else { - array[0] = static_cast<uint32_t>(high); - array[1] = static_cast<uint32_t>(low >> 32); - array[2] = static_cast<uint32_t>(low); - return 3; - } - } else if (low >= UINT32_MAX) { - array[0] = static_cast<uint32_t>(low >> 32); - array[1] = static_cast<uint32_t>(low); - return 2; - } else if (low == 0) { - return 0; - } else { - array[0] = static_cast<uint32_t>(low); - return 1; - } - } - - - /** - * Find last set bit in a 32 bit integer. Bit 1 is the LSB and bit 32 is - * the MSB. We can replace this with bsrq asm instruction on x64. - */ - int64_t fls(uint32_t x) { - int64_t bitpos = 0; - while (x) { - x >>= 1; - bitpos += 1; - } - return bitpos; - } - - /** - * Shift the number in the array left by bits positions. - * @param array the number to shift, must have length elements - * @param length the number of entries in the array - * @param bits the number of bits to shift (0 <= bits < 32) - */ - void shiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) { - if (length > 0 && bits != 0) { - for(int64_t i=0; i < length-1; ++i) { - array[i] = (array[i] << bits) | (array[i+1] >> (32 - bits)); - } - array[length-1] <<= bits; - } - } - - /** - * Shift the number in the array right by bits positions. 
- * @param array the number to shift, must have length elements - * @param length the number of entries in the array - * @param bits the number of bits to shift (0 <= bits < 32) - */ - void shiftArrayRight(uint32_t* array, int64_t length, int64_t bits) { - if (length > 0 && bits != 0) { - for(int64_t i=length-1; i > 0; --i) { - array[i] = (array[i] >> bits) | (array[i-1] << (32 - bits)); - } - array[0] >>= bits; - } - } - - /** - * Fix the signs of the result and remainder at the end of the division - * based on the signs of the dividend and divisor. - */ - void fixDivisionSigns(Int128 &result, Int128 &remainder, - bool dividendWasNegative, bool divisorWasNegative) { - if (dividendWasNegative != divisorWasNegative) { - result.negate(); - } - if (dividendWasNegative) { - remainder.negate(); - } - } - - /** - * Build a Int128 from a list of ints. - */ - void buildFromArray(Int128& value, uint32_t* array, int64_t length) { - switch (length) { - case 0: - value = 0; - break; - case 1: - value = array[0]; - break; - case 2: - value = Int128(0, (static_cast<uint64_t>(array[0]) << 32) + array[1]); - break; - case 3: - value = Int128(array[0], - (static_cast<uint64_t>(array[1]) << 32) + array[2]); - break; - case 4: - value = Int128((static_cast<int64_t>(array[0]) << 32) + array[1], - (static_cast<uint64_t>(array[2]) << 32) + array[3]); - break; - case 5: - if (array[0] != 0) { - throw std::logic_error("Can't build Int128 with 5 ints."); - } - value = Int128((static_cast<int64_t>(array[1]) << 32) + array[2], - (static_cast<uint64_t>(array[3]) << 32) + array[4]); - break; - default: - throw std::logic_error("Unsupported length for building Int128"); - } - } - - /** - * Do a division where the divisor fits into a single 32 bit value. - */ - Int128 singleDivide(uint32_t* dividend, int64_t dividendLength, - uint32_t divisor, Int128& remainder, - bool dividendWasNegative, bool divisorWasNegative) { - uint64_t r = 0; - uint32_t resultArray[5]; - for(int64_t j=0; j < dividendLength; j++) { - r <<= 32; - r += dividend[j]; - resultArray[j] = static_cast<uint32_t>(r / divisor); - r %= divisor; - } - Int128 result; - buildFromArray(result, resultArray, dividendLength); - remainder = static_cast<int64_t>(r); - fixDivisionSigns(result, remainder, dividendWasNegative, - divisorWasNegative); - return result; - } - - Int128 Int128::divide(const Int128 &divisor, Int128 &remainder) const { - // Split the dividend and divisor into integer pieces so that we can - // work on them. - uint32_t dividendArray[5]; - uint32_t divisorArray[4]; - bool dividendWasNegative; - bool divisorWasNegative; - // leave an extra zero before the dividend - dividendArray[0] = 0; - int64_t dividendLength = fillInArray(dividendArray + 1, dividendWasNegative)+1; - int64_t divisorLength = divisor.fillInArray(divisorArray, divisorWasNegative); - - // Handle some of the easy cases. - if (dividendLength <= divisorLength) { - remainder = *this; - return 0; - } else if (divisorLength == 0) { - throw std::range_error("Division by 0 in Int128"); - } else if (divisorLength == 1) { - return singleDivide(dividendArray, dividendLength, divisorArray[0], - remainder, dividendWasNegative, divisorWasNegative); - } - - int64_t resultLength = dividendLength - divisorLength; - uint32_t resultArray[4]; - - // Normalize by shifting both by a multiple of 2 so that - // the digit guessing is better. The requirement is that - // divisorArray[0] is greater than 2**31. 
- int64_t normalizeBits = 32 - fls(divisorArray[0]); - shiftArrayLeft(divisorArray, divisorLength, normalizeBits); - shiftArrayLeft(dividendArray, dividendLength, normalizeBits); - - // compute each digit in the result - for(int64_t j=0; j < resultLength; ++j) { - // Guess the next digit. At worst it is two too large - uint32_t guess = UINT32_MAX; - uint64_t highDividend = static_cast<uint64_t>(dividendArray[j]) << 32 | - dividendArray[j+1]; - if (dividendArray[j] != divisorArray[0]) { - guess = static_cast<uint32_t>(highDividend / divisorArray[0]); - } - - // catch all of the cases where guess is two too large and most of the - // cases where it is one too large - uint32_t rhat = - static_cast<uint32_t>(highDividend - guess * - static_cast<uint64_t>(divisorArray[0])); - while (static_cast<uint64_t>(divisorArray[1]) * guess > - (static_cast<uint64_t>(rhat) << 32) + dividendArray[j+2]) { - guess -= 1; - rhat += divisorArray[0]; - if (static_cast<uint64_t>(rhat) < divisorArray[0]) { - break; - } - } - - // subtract off the guess * divisor from the dividend - uint64_t mult = 0; - for(int64_t i=divisorLength-1; i >= 0; --i) { - mult += static_cast<uint64_t>(guess) * divisorArray[i]; - uint32_t prev = dividendArray[j+i+1]; - dividendArray[j+i+1] -= static_cast<uint32_t>(mult); - mult >>= 32; - if (dividendArray[j+i+1] > prev) { - mult += 1; - } - } - uint32_t prev = dividendArray[j]; - dividendArray[j] -= static_cast<uint32_t>(mult); - - // if guess was too big, we add back divisor - if (dividendArray[j] > prev) { - guess -= 1; - uint32_t carry = 0; - for(int64_t i=divisorLength-1; i >= 0; --i) { - uint64_t sum = static_cast<uint64_t>(divisorArray[i]) + - dividendArray[j+i+1] + carry; - dividendArray[j+i+1] = static_cast<uint32_t>(sum); - carry = static_cast<uint32_t>(sum >> 32); - } - dividendArray[j] += carry; - } - - resultArray[j] = guess; - } - - // denormalize the remainder - shiftArrayRight(dividendArray, dividendLength, normalizeBits); - - // return result and remainder - Int128 result; - buildFromArray(result, resultArray, resultLength); - buildFromArray(remainder, dividendArray, dividendLength); - fixDivisionSigns(result, remainder, - dividendWasNegative, divisorWasNegative); - return result; - } - - std::string Int128::toString() const { - // 10**18 - the largest power of 10 less than 63 bits - const Int128 tenTo18(0xde0b6b3a7640000); - // 10**36 - const Int128 tenTo36(0xc097ce7bc90715, 0xb34b9f1000000000); - Int128 remainder; - std::stringstream buf; - bool needFill = false; - - // get anything above 10**36 and print it - Int128 top = divide(tenTo36, remainder); - if (top != 0) { - buf << top.toLong(); - remainder.abs(); - needFill = true; - } - - // now get anything above 10**18 and print it - Int128 tail; - top = remainder.divide(tenTo18, tail); - if (needFill || top != 0) { - if (needFill) { - buf << std::setw(18) << std::setfill('0'); - } else { - needFill = true; - tail.abs(); - } - buf << top.toLong(); - } - - // finally print the tail, which is less than 10**18 - if (needFill) { - buf << std::setw(18) << std::setfill('0'); - } - buf << tail.toLong(); - return buf.str(); - } - - std::string Int128::toDecimalString(int32_t scale) const { - std::string str = toString(); - if (scale == 0) { - return str; - } else if (*this < 0) { - int32_t len = static_cast<int32_t>(str.length()); - if (len - 1 > scale) { - return str.substr(0, static_cast<size_t>(len - scale)) + "." 
+ - str.substr(static_cast<size_t>(len - scale), - static_cast<size_t>(scale)); - } else if (len - 1 == scale) { - return "-0." + str.substr(1, std::string::npos); - } else { - std::string result = "-0."; - for(int32_t i=0; i < scale - len + 1; ++i) { - result += "0"; - } - return result + str.substr(1, std::string::npos); - } - } else { - int32_t len = static_cast<int32_t>(str.length()); - if (len > scale) { - return str.substr(0, static_cast<size_t>(len - scale)) + "." + - str.substr(static_cast<size_t>(len - scale), - static_cast<size_t>(scale)); - } else if (len == scale) { - return "0." + str; - } else { - std::string result = "0."; - for(int32_t i=0; i < scale - len; ++i) { - result += "0"; - } - return result + str; - } - } - } - - std::string Int128::toHexString() const { - std::stringstream buf; - buf << std::hex << "0x" - << std::setw(16) << std::setfill('0') << highbits - << std::setw(16) << std::setfill('0') << lowbits; - return buf.str(); - } - - const static int32_t MAX_PRECISION_64 = 18; - const static int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] = - {1, - 10, - 100, - 1000, - 10000, - 100000, - 1000000, - 10000000, - 100000000, - 1000000000, - 10000000000, - 100000000000, - 1000000000000, - 10000000000000, - 100000000000000, - 1000000000000000, - 10000000000000000, - 100000000000000000, - 1000000000000000000}; - - Int128 scaleUpInt128ByPowerOfTen(Int128 value, - int32_t power, - bool &overflow) { - overflow = false; - Int128 remainder; - - while (power > 0) { - int32_t step = std::min(power, MAX_PRECISION_64); - if (value > 0 && Int128::maximumValue().divide(POWERS_OF_TEN[step], remainder) < value) { - overflow = true; - return Int128::maximumValue(); - } else if (value < 0 && Int128::minimumValue().divide(POWERS_OF_TEN[step], remainder) > value) { - overflow = true; - return Int128::minimumValue(); - } - - value *= POWERS_OF_TEN[step]; - power -= step; - } - - return value; - } - - Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power) { - Int128 remainder; - while (power > 0) { - int32_t step = std::min(std::abs(power), MAX_PRECISION_64); - value = value.divide(POWERS_OF_TEN[step], remainder); - power -= step; - } - return value; - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
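A short usage sketch tying together the pieces of Int128.cc shown above: the string constructor, divide, toDecimalString, and the power-of-ten scaling helpers. It assumes the int64_t constructor and the scaleUp/scaleDown declarations that orc/Int128.hh is expected to provide; all values are illustrative.

#include "orc/Int128.hh"
#include <cassert>

void int128Sketch() {
  // String constructor consumes up to 18 decimal digits per chunk via *= and +=.
  orc::Int128 dividend("123456789012345678901234567890");

  // divide() returns the quotient and fills in the remainder.
  orc::Int128 remainder;
  orc::Int128 quotient = dividend.divide(orc::Int128(1000000), remainder);
  assert(quotient.toString() == "123456789012345678901234");
  assert(remainder.toString() == "567890");

  // toDecimalString(scale) only re-inserts the decimal point.
  assert(orc::Int128("12345").toDecimalString(2) == "123.45");

  // scaleUpInt128ByPowerOfTen saturates and reports overflow instead of wrapping.
  bool overflow = false;
  orc::Int128 scaled = orc::scaleUpInt128ByPowerOfTen(orc::Int128(25), 3, overflow);
  assert(!overflow && scaled.toString() == "25000");
  orc::scaleUpInt128ByPowerOfTen(orc::Int128::maximumValue(), 1, overflow);
  assert(overflow);
}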
+ */ + +#include "orc/Int128.hh" +#include "Adaptor.hh" + +#include <algorithm> +#include <iomanip> +#include <iostream> +#include <sstream> + +namespace orc { + + Int128 Int128::maximumValue() { + return Int128(0x7fffffffffffffff, 0xfffffffffffffff); + } + + Int128 Int128::minimumValue() { + return Int128(static_cast<int64_t>(0x8000000000000000), 0x0); + } + + Int128::Int128(const std::string& str) { + lowbits = 0; + highbits = 0; + size_t length = str.length(); + if (length > 0) { + bool isNegative = str[0] == '-'; + size_t posn = isNegative ? 1 : 0; + while (posn < length) { + size_t group = std::min(static_cast<size_t>(18), length - posn); + int64_t chunk = std::stoll(str.substr(posn, group)); + int64_t multiple = 1; + for(size_t i=0; i < group; ++i) { + multiple *= 10; + } + *this *= multiple; + *this += chunk; + posn += group; + } + if (isNegative) { + negate(); + } + } + } + + Int128& Int128::operator*=(const Int128 &right) { + const uint64_t INT_MASK = 0xffffffff; + const uint64_t CARRY_BIT = INT_MASK + 1; + + // Break the left and right numbers into 32 bit chunks + // so that we can multiply them without overflow. + uint64_t L0 = static_cast<uint64_t>(highbits) >> 32; + uint64_t L1 = static_cast<uint64_t>(highbits) & INT_MASK; + uint64_t L2 = lowbits >> 32; + uint64_t L3 = lowbits & INT_MASK; + uint64_t R0 = static_cast<uint64_t>(right.highbits) >> 32; + uint64_t R1 = static_cast<uint64_t>(right.highbits) & INT_MASK; + uint64_t R2 = right.lowbits >> 32; + uint64_t R3 = right.lowbits & INT_MASK; + + uint64_t product = L3 * R3; + lowbits = product & INT_MASK; + uint64_t sum = product >> 32; + product = L2 * R3; + sum += product; + highbits = sum < product ? CARRY_BIT : 0; + product = L3 * R2; + sum += product; + if (sum < product) { + highbits += CARRY_BIT; + } + lowbits += sum << 32; + highbits += static_cast<int64_t>(sum >> 32); + highbits += L1 * R3 + L2 * R2 + L3 * R1; + highbits += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32; + return *this; + } + + /** + * Expands the given value into an array of ints so that we can work on + * it. The array will be converted to an absolute value and the wasNegative + * flag will be set appropriately. The array will remove leading zeros from + * the value. + * @param array an array of length 4 to set with the value + * @param wasNegative a flag for whether the value was original negative + * @result the output length of the array + */ + int64_t Int128::fillInArray(uint32_t* array, bool &wasNegative) const { + uint64_t high; + uint64_t low; + if (highbits < 0) { + low = ~lowbits + 1; + high = static_cast<uint64_t>(~highbits); + if (low == 0) { + high += 1; + } + wasNegative = true; + } else { + low = lowbits; + high = static_cast<uint64_t>(highbits); + wasNegative = false; + } + if (high != 0) { + if (high > UINT32_MAX) { + array[0] = static_cast<uint32_t>(high >> 32); + array[1] = static_cast<uint32_t>(high); + array[2] = static_cast<uint32_t>(low >> 32); + array[3] = static_cast<uint32_t>(low); + return 4; + } else { + array[0] = static_cast<uint32_t>(high); + array[1] = static_cast<uint32_t>(low >> 32); + array[2] = static_cast<uint32_t>(low); + return 3; + } + } else if (low >= UINT32_MAX) { + array[0] = static_cast<uint32_t>(low >> 32); + array[1] = static_cast<uint32_t>(low); + return 2; + } else if (low == 0) { + return 0; + } else { + array[0] = static_cast<uint32_t>(low); + return 1; + } + } + + + /** + * Find last set bit in a 32 bit integer. Bit 1 is the LSB and bit 32 is + * the MSB. 
We can replace this with bsrq asm instruction on x64. + */ + int64_t fls(uint32_t x) { + int64_t bitpos = 0; + while (x) { + x >>= 1; + bitpos += 1; + } + return bitpos; + } + + /** + * Shift the number in the array left by bits positions. + * @param array the number to shift, must have length elements + * @param length the number of entries in the array + * @param bits the number of bits to shift (0 <= bits < 32) + */ + void shiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) { + if (length > 0 && bits != 0) { + for(int64_t i=0; i < length-1; ++i) { + array[i] = (array[i] << bits) | (array[i+1] >> (32 - bits)); + } + array[length-1] <<= bits; + } + } + + /** + * Shift the number in the array right by bits positions. + * @param array the number to shift, must have length elements + * @param length the number of entries in the array + * @param bits the number of bits to shift (0 <= bits < 32) + */ + void shiftArrayRight(uint32_t* array, int64_t length, int64_t bits) { + if (length > 0 && bits != 0) { + for(int64_t i=length-1; i > 0; --i) { + array[i] = (array[i] >> bits) | (array[i-1] << (32 - bits)); + } + array[0] >>= bits; + } + } + + /** + * Fix the signs of the result and remainder at the end of the division + * based on the signs of the dividend and divisor. + */ + void fixDivisionSigns(Int128 &result, Int128 &remainder, + bool dividendWasNegative, bool divisorWasNegative) { + if (dividendWasNegative != divisorWasNegative) { + result.negate(); + } + if (dividendWasNegative) { + remainder.negate(); + } + } + + /** + * Build a Int128 from a list of ints. + */ + void buildFromArray(Int128& value, uint32_t* array, int64_t length) { + switch (length) { + case 0: + value = 0; + break; + case 1: + value = array[0]; + break; + case 2: + value = Int128(0, (static_cast<uint64_t>(array[0]) << 32) + array[1]); + break; + case 3: + value = Int128(array[0], + (static_cast<uint64_t>(array[1]) << 32) + array[2]); + break; + case 4: + value = Int128((static_cast<int64_t>(array[0]) << 32) + array[1], + (static_cast<uint64_t>(array[2]) << 32) + array[3]); + break; + case 5: + if (array[0] != 0) { + throw std::logic_error("Can't build Int128 with 5 ints."); + } + value = Int128((static_cast<int64_t>(array[1]) << 32) + array[2], + (static_cast<uint64_t>(array[3]) << 32) + array[4]); + break; + default: + throw std::logic_error("Unsupported length for building Int128"); + } + } + + /** + * Do a division where the divisor fits into a single 32 bit value. + */ + Int128 singleDivide(uint32_t* dividend, int64_t dividendLength, + uint32_t divisor, Int128& remainder, + bool dividendWasNegative, bool divisorWasNegative) { + uint64_t r = 0; + uint32_t resultArray[5]; + for(int64_t j=0; j < dividendLength; j++) { + r <<= 32; + r += dividend[j]; + resultArray[j] = static_cast<uint32_t>(r / divisor); + r %= divisor; + } + Int128 result; + buildFromArray(result, resultArray, dividendLength); + remainder = static_cast<int64_t>(r); + fixDivisionSigns(result, remainder, dividendWasNegative, + divisorWasNegative); + return result; + } + + Int128 Int128::divide(const Int128 &divisor, Int128 &remainder) const { + // Split the dividend and divisor into integer pieces so that we can + // work on them. 
+ uint32_t dividendArray[5]; + uint32_t divisorArray[4]; + bool dividendWasNegative; + bool divisorWasNegative; + // leave an extra zero before the dividend + dividendArray[0] = 0; + int64_t dividendLength = fillInArray(dividendArray + 1, dividendWasNegative)+1; + int64_t divisorLength = divisor.fillInArray(divisorArray, divisorWasNegative); + + // Handle some of the easy cases. + if (dividendLength <= divisorLength) { + remainder = *this; + return 0; + } else if (divisorLength == 0) { + throw std::range_error("Division by 0 in Int128"); + } else if (divisorLength == 1) { + return singleDivide(dividendArray, dividendLength, divisorArray[0], + remainder, dividendWasNegative, divisorWasNegative); + } + + int64_t resultLength = dividendLength - divisorLength; + uint32_t resultArray[4]; + + // Normalize by shifting both by a multiple of 2 so that + // the digit guessing is better. The requirement is that + // divisorArray[0] is greater than 2**31. + int64_t normalizeBits = 32 - fls(divisorArray[0]); + shiftArrayLeft(divisorArray, divisorLength, normalizeBits); + shiftArrayLeft(dividendArray, dividendLength, normalizeBits); + + // compute each digit in the result + for(int64_t j=0; j < resultLength; ++j) { + // Guess the next digit. At worst it is two too large + uint32_t guess = UINT32_MAX; + uint64_t highDividend = static_cast<uint64_t>(dividendArray[j]) << 32 | + dividendArray[j+1]; + if (dividendArray[j] != divisorArray[0]) { + guess = static_cast<uint32_t>(highDividend / divisorArray[0]); + } + + // catch all of the cases where guess is two too large and most of the + // cases where it is one too large + uint32_t rhat = + static_cast<uint32_t>(highDividend - guess * + static_cast<uint64_t>(divisorArray[0])); + while (static_cast<uint64_t>(divisorArray[1]) * guess > + (static_cast<uint64_t>(rhat) << 32) + dividendArray[j+2]) { + guess -= 1; + rhat += divisorArray[0]; + if (static_cast<uint64_t>(rhat) < divisorArray[0]) { + break; + } + } + + // subtract off the guess * divisor from the dividend + uint64_t mult = 0; + for(int64_t i=divisorLength-1; i >= 0; --i) { + mult += static_cast<uint64_t>(guess) * divisorArray[i]; + uint32_t prev = dividendArray[j+i+1]; + dividendArray[j+i+1] -= static_cast<uint32_t>(mult); + mult >>= 32; + if (dividendArray[j+i+1] > prev) { + mult += 1; + } + } + uint32_t prev = dividendArray[j]; + dividendArray[j] -= static_cast<uint32_t>(mult); + + // if guess was too big, we add back divisor + if (dividendArray[j] > prev) { + guess -= 1; + uint32_t carry = 0; + for(int64_t i=divisorLength-1; i >= 0; --i) { + uint64_t sum = static_cast<uint64_t>(divisorArray[i]) + + dividendArray[j+i+1] + carry; + dividendArray[j+i+1] = static_cast<uint32_t>(sum); + carry = static_cast<uint32_t>(sum >> 32); + } + dividendArray[j] += carry; + } + + resultArray[j] = guess; + } + + // denormalize the remainder + shiftArrayRight(dividendArray, dividendLength, normalizeBits); + + // return result and remainder + Int128 result; + buildFromArray(result, resultArray, resultLength); + buildFromArray(remainder, dividendArray, dividendLength); + fixDivisionSigns(result, remainder, + dividendWasNegative, divisorWasNegative); + return result; + } + + std::string Int128::toString() const { + // 10**18 - the largest power of 10 less than 63 bits + const Int128 tenTo18(0xde0b6b3a7640000); + // 10**36 + const Int128 tenTo36(0xc097ce7bc90715, 0xb34b9f1000000000); + Int128 remainder; + std::stringstream buf; + bool needFill = false; + + // get anything above 10**36 and print it + Int128 top = 
divide(tenTo36, remainder); + if (top != 0) { + buf << top.toLong(); + remainder.abs(); + needFill = true; + } + + // now get anything above 10**18 and print it + Int128 tail; + top = remainder.divide(tenTo18, tail); + if (needFill || top != 0) { + if (needFill) { + buf << std::setw(18) << std::setfill('0'); + } else { + needFill = true; + tail.abs(); + } + buf << top.toLong(); + } + + // finally print the tail, which is less than 10**18 + if (needFill) { + buf << std::setw(18) << std::setfill('0'); + } + buf << tail.toLong(); + return buf.str(); + } + + std::string Int128::toDecimalString(int32_t scale) const { + std::string str = toString(); + if (scale == 0) { + return str; + } else if (*this < 0) { + int32_t len = static_cast<int32_t>(str.length()); + if (len - 1 > scale) { + return str.substr(0, static_cast<size_t>(len - scale)) + "." + + str.substr(static_cast<size_t>(len - scale), + static_cast<size_t>(scale)); + } else if (len - 1 == scale) { + return "-0." + str.substr(1, std::string::npos); + } else { + std::string result = "-0."; + for(int32_t i=0; i < scale - len + 1; ++i) { + result += "0"; + } + return result + str.substr(1, std::string::npos); + } + } else { + int32_t len = static_cast<int32_t>(str.length()); + if (len > scale) { + return str.substr(0, static_cast<size_t>(len - scale)) + "." + + str.substr(static_cast<size_t>(len - scale), + static_cast<size_t>(scale)); + } else if (len == scale) { + return "0." + str; + } else { + std::string result = "0."; + for(int32_t i=0; i < scale - len; ++i) { + result += "0"; + } + return result + str; + } + } + } + + std::string Int128::toHexString() const { + std::stringstream buf; + buf << std::hex << "0x" + << std::setw(16) << std::setfill('0') << highbits + << std::setw(16) << std::setfill('0') << lowbits; + return buf.str(); + } + + const static int32_t MAX_PRECISION_64 = 18; + const static int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] = + {1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000, + 100000000000, + 1000000000000, + 10000000000000, + 100000000000000, + 1000000000000000, + 10000000000000000, + 100000000000000000, + 1000000000000000000}; + + Int128 scaleUpInt128ByPowerOfTen(Int128 value, + int32_t power, + bool &overflow) { + overflow = false; + Int128 remainder; + + while (power > 0) { + int32_t step = std::min(power, MAX_PRECISION_64); + if (value > 0 && Int128::maximumValue().divide(POWERS_OF_TEN[step], remainder) < value) { + overflow = true; + return Int128::maximumValue(); + } else if (value < 0 && Int128::minimumValue().divide(POWERS_OF_TEN[step], remainder) > value) { + overflow = true; + return Int128::minimumValue(); + } + + value *= POWERS_OF_TEN[step]; + power -= step; + } + + return value; + } + + Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power) { + Int128 remainder; + while (power > 0) { + int32_t step = std::min(std::abs(power), MAX_PRECISION_64); + value = value.divide(POWERS_OF_TEN[step], remainder); + power -= step; + } + return value; + } + +} diff --git a/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc b/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc index d1ba183aeb..7bf91dee13 100644 --- a/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc +++ b/contrib/libs/apache/orc/c++/src/LzoDecompressor.cc @@ -1,391 +1,391 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Adaptor.hh" -#include "Compression.hh" -#include "orc/Exceptions.hh" - -#include <string> - -namespace orc { - - static const int32_t DEC_32_TABLE[] = {4, 1, 2, 1, 4, 4, 4, 4}; - static const int32_t DEC_64_TABLE[] = {0, 0, 0, -1, 0, 1, 2, 3}; - - static const int32_t SIZE_OF_SHORT = 2; - static const int32_t SIZE_OF_INT = 4; - static const int32_t SIZE_OF_LONG = 8; - - static std::string toHex(uint64_t val) { - std::ostringstream out; - out << "0x" << std::hex << val; - return out.str(); - } - - static std::string toString(int64_t val) { - std::ostringstream out; - out << val; - return out.str(); - } - - class MalformedInputException: public ParseError { - public: - MalformedInputException(int64_t off - ) :ParseError("MalformedInputException at " + - toString(off)) { - } - - MalformedInputException(int64_t off, const std::string& msg - ): ParseError("MalformedInputException " + msg + - " at " + toString(off)) { - } - - MalformedInputException(const MalformedInputException& other - ): ParseError(other.what()) { - } - - virtual ~MalformedInputException() noexcept; - }; - - MalformedInputException::~MalformedInputException() noexcept { - // PASS - } - - uint64_t lzoDecompress(const char *inputAddress, - const char *inputLimit, - char *outputAddress, - char *outputLimit) { - // nothing compresses to nothing - if (inputAddress == inputLimit) { - return 0; - } - - // maximum offset in buffers to which it's safe to write long-at-a-time - char * const fastOutputLimit = outputLimit - SIZE_OF_LONG; - - // LZO can concat two blocks together so, decode until the input data is - // consumed - const char *input = inputAddress; - char *output = outputAddress; - while (input < inputLimit) { - // - // Note: For safety some of the code below may stop decoding early or - // skip decoding, because input is not available. This makes the code - // safe, and since LZO requires an explicit "stop" command, the decoder - // will still throw a exception. - // - - bool firstCommand = true; - uint32_t lastLiteralLength = 0; - while (true) { - if (input >= inputLimit) { - throw MalformedInputException(input - inputAddress); - } - uint32_t command = *(input++) & 0xFF; - if (command == 0x11) { - break; - } - - // Commands are described using a bit pattern notation: - // 0: bit is not set - // 1: bit is set - // L: part of literal length - // P: part of match offset position - // M: part of match length - // ?: see documentation in command decoder - - int32_t matchLength; - int32_t matchOffset; - uint32_t literalLength; - if ((command & 0xf0) == 0) { - if (lastLiteralLength == 0) { - // 0b0000_LLLL (0bLLLL_LLLL)* - - // copy length :: fixed - // 0 - matchOffset = 0; - - // copy offset :: fixed - // 0 - matchLength = 0; - - // literal length - 3 :: variable bits :: valid range [4..] 
- // 3 + variableLength(command bits [0..3], 4) - literalLength = command & 0xf; - if (literalLength == 0) { - literalLength = 0xf; - - uint32_t nextByte = 0; - while (input < inputLimit && - (nextByte = *(input++) & 0xFF) == 0) { - literalLength += 0xff; - } - literalLength += nextByte; - } - literalLength += 3; - } else if (lastLiteralLength <= 3) { - // 0b0000_PPLL 0bPPPP_PPPP - - // copy length: fixed - // 3 - matchLength = 3; - - // copy offset :: 12 bits :: valid range [2048..3071] - // [0..1] from command [2..3] - // [2..9] from trailer [0..7] - // [10] unset - // [11] set - if (input >= inputLimit) { - throw MalformedInputException(input - inputAddress); - } - matchOffset = (command & 0xc) >> 2; - matchOffset |= (*(input++) & 0xFF) << 2; - matchOffset |= 0x800; - - // literal length :: 2 bits :: valid range [0..3] - // [0..1] from command [0..1] - literalLength = (command & 0x3); - } else { - // 0b0000_PPLL 0bPPPP_PPPP - - // copy length :: fixed - // 2 - matchLength = 2; - - // copy offset :: 10 bits :: valid range [0..1023] - // [0..1] from command [2..3] - // [2..9] from trailer [0..7] - if (input >= inputLimit) { - throw MalformedInputException(input - inputAddress); - } - matchOffset = (command & 0xc) >> 2; - matchOffset |= (*(input++) & 0xFF) << 2; - - // literal length :: 2 bits :: valid range [0..3] - // [0..1] from command [0..1] - literalLength = (command & 0x3); - } - } else if (firstCommand) { - // first command has special handling when high nibble is set - matchLength = 0; - matchOffset = 0; - literalLength = command - 17; - } else if ((command & 0xf0) == 0x10) { - // 0b0001_?MMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL - - // copy length - 2 :: variable bits :: valid range [3..] - // 2 + variableLength(command bits [0..2], 3) - matchLength = command & 0x7; - if (matchLength == 0) { - matchLength = 0x7; - - int32_t nextByte = 0; - while (input < inputLimit && - (nextByte = *(input++) & 0xFF) == 0) { - matchLength += 0xff; - } - matchLength += nextByte; - } - matchLength += 2; - - // read trailer - if (input + SIZE_OF_SHORT > inputLimit) { - throw MalformedInputException(input - inputAddress); - } - uint32_t trailer = *reinterpret_cast<const uint16_t*>(input) & 0xFFFF; - input += SIZE_OF_SHORT; - - // copy offset :: 16 bits :: valid range [32767..49151] - // [0..13] from trailer [2..15] - // [14] if command bit [3] unset - // [15] if command bit [3] set - matchOffset = trailer >> 2; - if ((command & 0x8) == 0) { - matchOffset |= 0x4000; - } else { - matchOffset |= 0x8000; - } - matchOffset--; - - // literal length :: 2 bits :: valid range [0..3] - // [0..1] from trailer [0..1] - literalLength = trailer & 0x3; - } else if ((command & 0xe0) == 0x20) { - // 0b001M_MMMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL - - // copy length - 2 :: variable bits :: valid range [3..] 
- // 2 + variableLength(command bits [0..4], 5) - matchLength = command & 0x1f; - if (matchLength == 0) { - matchLength = 0x1f; - - int nextByte = 0; - while (input < inputLimit && - (nextByte = *(input++) & 0xFF) == 0) { - matchLength += 0xff; - } - matchLength += nextByte; - } - matchLength += 2; - - // read trailer - if (input + SIZE_OF_SHORT > inputLimit) { - throw MalformedInputException(input - inputAddress); - } - int32_t trailer = *reinterpret_cast<const int16_t*>(input) & 0xFFFF; - input += SIZE_OF_SHORT; - - // copy offset :: 14 bits :: valid range [0..16383] - // [0..13] from trailer [2..15] - matchOffset = trailer >> 2; - - // literal length :: 2 bits :: valid range [0..3] - // [0..1] from trailer [0..1] - literalLength = trailer & 0x3; - } else if ((command & 0xc0) != 0) { - // 0bMMMP_PPLL 0bPPPP_PPPP - - // copy length - 1 :: 3 bits :: valid range [1..8] - // [0..2] from command [5..7] - // add 1 - matchLength = (command & 0xe0) >> 5; - matchLength += 1; - - // copy offset :: 11 bits :: valid range [0..4095] - // [0..2] from command [2..4] - // [3..10] from trailer [0..7] - if (input >= inputLimit) { - throw MalformedInputException(input - inputAddress); - } - matchOffset = (command & 0x1c) >> 2; - matchOffset |= (*(input++) & 0xFF) << 3; - - // literal length :: 2 bits :: valid range [0..3] - // [0..1] from command [0..1] - literalLength = (command & 0x3); - } else { - throw MalformedInputException(input - inputAddress - 1, - "Invalid LZO command " + - toHex(command)); - } - firstCommand = false; - - // copy match - if (matchLength != 0) { - // lzo encodes match offset minus one - matchOffset++; - - char *matchAddress = output - matchOffset; - if (matchAddress < outputAddress || - output + matchLength > outputLimit) { - throw MalformedInputException(input - inputAddress); - } - char *matchOutputLimit = output + matchLength; - - if (output > fastOutputLimit) { - // slow match copy - while (output < matchOutputLimit) { - *(output++) = *(matchAddress++); - } - } else { - // copy repeated sequence - if (matchOffset < SIZE_OF_LONG) { - // 8 bytes apart so that we can copy long-at-a-time below - int32_t increment32 = DEC_32_TABLE[matchOffset]; - int32_t decrement64 = DEC_64_TABLE[matchOffset]; - - output[0] = *matchAddress; - output[1] = *(matchAddress + 1); - output[2] = *(matchAddress + 2); - output[3] = *(matchAddress + 3); - output += SIZE_OF_INT; - matchAddress += increment32; - - *reinterpret_cast<int32_t*>(output) = - *reinterpret_cast<int32_t*>(matchAddress); - output += SIZE_OF_INT; - matchAddress -= decrement64; - } else { - *reinterpret_cast<int64_t*>(output) = - *reinterpret_cast<int64_t*>(matchAddress); - matchAddress += SIZE_OF_LONG; - output += SIZE_OF_LONG; - } - - if (matchOutputLimit >= fastOutputLimit) { - if (matchOutputLimit > outputLimit) { - throw MalformedInputException(input - inputAddress); - } - - while (output < fastOutputLimit) { - *reinterpret_cast<int64_t*>(output) = - *reinterpret_cast<int64_t*>(matchAddress); - matchAddress += SIZE_OF_LONG; - output += SIZE_OF_LONG; - } - - while (output < matchOutputLimit) { - *(output++) = *(matchAddress++); - } - } else { - while (output < matchOutputLimit) { - *reinterpret_cast<int64_t*>(output) = - *reinterpret_cast<int64_t*>(matchAddress); - matchAddress += SIZE_OF_LONG; - output += SIZE_OF_LONG; - } - } - } - output = matchOutputLimit; // correction in case we over-copied - } - - // copy literal - char *literalOutputLimit = output + literalLength; - if (literalOutputLimit > fastOutputLimit || - input + 
literalLength > inputLimit - SIZE_OF_LONG) { - if (literalOutputLimit > outputLimit) { - throw MalformedInputException(input - inputAddress); - } - - // slow, precise copy - memcpy(output, input, literalLength); - input += literalLength; - output += literalLength; - } else { - // fast copy. We may over-copy but there's enough room in input - // and output to not overrun them - do { - *reinterpret_cast<int64_t*>(output) = - *reinterpret_cast<const int64_t*>(input); - input += SIZE_OF_LONG; - output += SIZE_OF_LONG; - } while (output < literalOutputLimit); - // adjust index if we over-copied - input -= (output - literalOutputLimit); - output = literalOutputLimit; - } - lastLiteralLength = literalLength; - } - - if (input + SIZE_OF_SHORT > inputLimit && - *reinterpret_cast<const int16_t*>(input) != 0) { - throw MalformedInputException(input - inputAddress); - } - input += SIZE_OF_SHORT; - } - - return static_cast<uint64_t>(output - outputAddress); - } - -} +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Adaptor.hh" +#include "Compression.hh" +#include "orc/Exceptions.hh" + +#include <string> + +namespace orc { + + static const int32_t DEC_32_TABLE[] = {4, 1, 2, 1, 4, 4, 4, 4}; + static const int32_t DEC_64_TABLE[] = {0, 0, 0, -1, 0, 1, 2, 3}; + + static const int32_t SIZE_OF_SHORT = 2; + static const int32_t SIZE_OF_INT = 4; + static const int32_t SIZE_OF_LONG = 8; + + static std::string toHex(uint64_t val) { + std::ostringstream out; + out << "0x" << std::hex << val; + return out.str(); + } + + static std::string toString(int64_t val) { + std::ostringstream out; + out << val; + return out.str(); + } + + class MalformedInputException: public ParseError { + public: + MalformedInputException(int64_t off + ) :ParseError("MalformedInputException at " + + toString(off)) { + } + + MalformedInputException(int64_t off, const std::string& msg + ): ParseError("MalformedInputException " + msg + + " at " + toString(off)) { + } + + MalformedInputException(const MalformedInputException& other + ): ParseError(other.what()) { + } + + virtual ~MalformedInputException() noexcept; + }; + + MalformedInputException::~MalformedInputException() noexcept { + // PASS + } + + uint64_t lzoDecompress(const char *inputAddress, + const char *inputLimit, + char *outputAddress, + char *outputLimit) { + // nothing compresses to nothing + if (inputAddress == inputLimit) { + return 0; + } + + // maximum offset in buffers to which it's safe to write long-at-a-time + char * const fastOutputLimit = outputLimit - SIZE_OF_LONG; + + // LZO can concat two blocks together so, decode until the input data is + // consumed + const char *input = inputAddress; + char *output = outputAddress; + while (input < inputLimit) { + // + // Note: For safety some of the code below may stop decoding early or + // skip decoding, because input is not available. This makes the code + // safe, and since LZO requires an explicit "stop" command, the decoder + // will still throw a exception. 
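A minimal call sketch for the decompressor defined above, using the [inputAddress, inputLimit) / [outputAddress, outputLimit) convention of its signature; the buffer names, and the idea that the decompressed capacity is known from the ORC stream metadata, are assumptions rather than something this file provides.

#include "LzoDecompressor.hh"   // declares lzoDecompress (also part of this change)
#include <cstddef>
#include <cstdint>
#include <vector>

uint64_t lzoSketch(const char* compressed, size_t compressedSize,
                   std::vector<char>& out, size_t decompressedCapacity) {
  out.resize(decompressedCapacity);
  // Returns the number of bytes actually written into out.
  return orc::lzoDecompress(compressed, compressed + compressedSize,
                            out.data(), out.data() + out.size());
}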
+ // + + bool firstCommand = true; + uint32_t lastLiteralLength = 0; + while (true) { + if (input >= inputLimit) { + throw MalformedInputException(input - inputAddress); + } + uint32_t command = *(input++) & 0xFF; + if (command == 0x11) { + break; + } + + // Commands are described using a bit pattern notation: + // 0: bit is not set + // 1: bit is set + // L: part of literal length + // P: part of match offset position + // M: part of match length + // ?: see documentation in command decoder + + int32_t matchLength; + int32_t matchOffset; + uint32_t literalLength; + if ((command & 0xf0) == 0) { + if (lastLiteralLength == 0) { + // 0b0000_LLLL (0bLLLL_LLLL)* + + // copy length :: fixed + // 0 + matchOffset = 0; + + // copy offset :: fixed + // 0 + matchLength = 0; + + // literal length - 3 :: variable bits :: valid range [4..] + // 3 + variableLength(command bits [0..3], 4) + literalLength = command & 0xf; + if (literalLength == 0) { + literalLength = 0xf; + + uint32_t nextByte = 0; + while (input < inputLimit && + (nextByte = *(input++) & 0xFF) == 0) { + literalLength += 0xff; + } + literalLength += nextByte; + } + literalLength += 3; + } else if (lastLiteralLength <= 3) { + // 0b0000_PPLL 0bPPPP_PPPP + + // copy length: fixed + // 3 + matchLength = 3; + + // copy offset :: 12 bits :: valid range [2048..3071] + // [0..1] from command [2..3] + // [2..9] from trailer [0..7] + // [10] unset + // [11] set + if (input >= inputLimit) { + throw MalformedInputException(input - inputAddress); + } + matchOffset = (command & 0xc) >> 2; + matchOffset |= (*(input++) & 0xFF) << 2; + matchOffset |= 0x800; + + // literal length :: 2 bits :: valid range [0..3] + // [0..1] from command [0..1] + literalLength = (command & 0x3); + } else { + // 0b0000_PPLL 0bPPPP_PPPP + + // copy length :: fixed + // 2 + matchLength = 2; + + // copy offset :: 10 bits :: valid range [0..1023] + // [0..1] from command [2..3] + // [2..9] from trailer [0..7] + if (input >= inputLimit) { + throw MalformedInputException(input - inputAddress); + } + matchOffset = (command & 0xc) >> 2; + matchOffset |= (*(input++) & 0xFF) << 2; + + // literal length :: 2 bits :: valid range [0..3] + // [0..1] from command [0..1] + literalLength = (command & 0x3); + } + } else if (firstCommand) { + // first command has special handling when high nibble is set + matchLength = 0; + matchOffset = 0; + literalLength = command - 17; + } else if ((command & 0xf0) == 0x10) { + // 0b0001_?MMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL + + // copy length - 2 :: variable bits :: valid range [3..] 
+ // 2 + variableLength(command bits [0..2], 3) + matchLength = command & 0x7; + if (matchLength == 0) { + matchLength = 0x7; + + int32_t nextByte = 0; + while (input < inputLimit && + (nextByte = *(input++) & 0xFF) == 0) { + matchLength += 0xff; + } + matchLength += nextByte; + } + matchLength += 2; + + // read trailer + if (input + SIZE_OF_SHORT > inputLimit) { + throw MalformedInputException(input - inputAddress); + } + uint32_t trailer = *reinterpret_cast<const uint16_t*>(input) & 0xFFFF; + input += SIZE_OF_SHORT; + + // copy offset :: 16 bits :: valid range [32767..49151] + // [0..13] from trailer [2..15] + // [14] if command bit [3] unset + // [15] if command bit [3] set + matchOffset = trailer >> 2; + if ((command & 0x8) == 0) { + matchOffset |= 0x4000; + } else { + matchOffset |= 0x8000; + } + matchOffset--; + + // literal length :: 2 bits :: valid range [0..3] + // [0..1] from trailer [0..1] + literalLength = trailer & 0x3; + } else if ((command & 0xe0) == 0x20) { + // 0b001M_MMMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL + + // copy length - 2 :: variable bits :: valid range [3..] + // 2 + variableLength(command bits [0..4], 5) + matchLength = command & 0x1f; + if (matchLength == 0) { + matchLength = 0x1f; + + int nextByte = 0; + while (input < inputLimit && + (nextByte = *(input++) & 0xFF) == 0) { + matchLength += 0xff; + } + matchLength += nextByte; + } + matchLength += 2; + + // read trailer + if (input + SIZE_OF_SHORT > inputLimit) { + throw MalformedInputException(input - inputAddress); + } + int32_t trailer = *reinterpret_cast<const int16_t*>(input) & 0xFFFF; + input += SIZE_OF_SHORT; + + // copy offset :: 14 bits :: valid range [0..16383] + // [0..13] from trailer [2..15] + matchOffset = trailer >> 2; + + // literal length :: 2 bits :: valid range [0..3] + // [0..1] from trailer [0..1] + literalLength = trailer & 0x3; + } else if ((command & 0xc0) != 0) { + // 0bMMMP_PPLL 0bPPPP_PPPP + + // copy length - 1 :: 3 bits :: valid range [1..8] + // [0..2] from command [5..7] + // add 1 + matchLength = (command & 0xe0) >> 5; + matchLength += 1; + + // copy offset :: 11 bits :: valid range [0..4095] + // [0..2] from command [2..4] + // [3..10] from trailer [0..7] + if (input >= inputLimit) { + throw MalformedInputException(input - inputAddress); + } + matchOffset = (command & 0x1c) >> 2; + matchOffset |= (*(input++) & 0xFF) << 3; + + // literal length :: 2 bits :: valid range [0..3] + // [0..1] from command [0..1] + literalLength = (command & 0x3); + } else { + throw MalformedInputException(input - inputAddress - 1, + "Invalid LZO command " + + toHex(command)); + } + firstCommand = false; + + // copy match + if (matchLength != 0) { + // lzo encodes match offset minus one + matchOffset++; + + char *matchAddress = output - matchOffset; + if (matchAddress < outputAddress || + output + matchLength > outputLimit) { + throw MalformedInputException(input - inputAddress); + } + char *matchOutputLimit = output + matchLength; + + if (output > fastOutputLimit) { + // slow match copy + while (output < matchOutputLimit) { + *(output++) = *(matchAddress++); + } + } else { + // copy repeated sequence + if (matchOffset < SIZE_OF_LONG) { + // 8 bytes apart so that we can copy long-at-a-time below + int32_t increment32 = DEC_32_TABLE[matchOffset]; + int32_t decrement64 = DEC_64_TABLE[matchOffset]; + + output[0] = *matchAddress; + output[1] = *(matchAddress + 1); + output[2] = *(matchAddress + 2); + output[3] = *(matchAddress + 3); + output += SIZE_OF_INT; + matchAddress += increment32; + + 
*reinterpret_cast<int32_t*>(output) = + *reinterpret_cast<int32_t*>(matchAddress); + output += SIZE_OF_INT; + matchAddress -= decrement64; + } else { + *reinterpret_cast<int64_t*>(output) = + *reinterpret_cast<int64_t*>(matchAddress); + matchAddress += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + + if (matchOutputLimit >= fastOutputLimit) { + if (matchOutputLimit > outputLimit) { + throw MalformedInputException(input - inputAddress); + } + + while (output < fastOutputLimit) { + *reinterpret_cast<int64_t*>(output) = + *reinterpret_cast<int64_t*>(matchAddress); + matchAddress += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + + while (output < matchOutputLimit) { + *(output++) = *(matchAddress++); + } + } else { + while (output < matchOutputLimit) { + *reinterpret_cast<int64_t*>(output) = + *reinterpret_cast<int64_t*>(matchAddress); + matchAddress += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } + } + } + output = matchOutputLimit; // correction in case we over-copied + } + + // copy literal + char *literalOutputLimit = output + literalLength; + if (literalOutputLimit > fastOutputLimit || + input + literalLength > inputLimit - SIZE_OF_LONG) { + if (literalOutputLimit > outputLimit) { + throw MalformedInputException(input - inputAddress); + } + + // slow, precise copy + memcpy(output, input, literalLength); + input += literalLength; + output += literalLength; + } else { + // fast copy. We may over-copy but there's enough room in input + // and output to not overrun them + do { + *reinterpret_cast<int64_t*>(output) = + *reinterpret_cast<const int64_t*>(input); + input += SIZE_OF_LONG; + output += SIZE_OF_LONG; + } while (output < literalOutputLimit); + // adjust index if we over-copied + input -= (output - literalOutputLimit); + output = literalOutputLimit; + } + lastLiteralLength = literalLength; + } + + if (input + SIZE_OF_SHORT > inputLimit && + *reinterpret_cast<const int16_t*>(input) != 0) { + throw MalformedInputException(input - inputAddress); + } + input += SIZE_OF_SHORT; + } + + return static_cast<uint64_t>(output - outputAddress); + } + +} diff --git a/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh b/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh index 9de8537dd8..32d8085174 100644 --- a/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh +++ b/contrib/libs/apache/orc/c++/src/LzoDecompressor.hh @@ -1,42 +1,42 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_LZO_HH -#define ORC_LZO_HH - -#include "orc/OrcFile.hh" - -#include "Adaptor.hh" - -namespace orc { - - /** - * Decompress the bytes in to the output buffer. 
- * @param inputAddress the start of the input - * @param inputLimit one past the last byte of the input - * @param outputAddress the start of the output buffer - * @param outputLimit one past the last byte of the output buffer - * @result the number of bytes decompressed - */ - uint64_t lzoDecompress(const char *inputAddress, - const char *inputLimit, - char *outputAddress, - char *outputLimit); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_LZO_HH +#define ORC_LZO_HH + +#include "orc/OrcFile.hh" + +#include "Adaptor.hh" + +namespace orc { + + /** + * Decompress the bytes in to the output buffer. + * @param inputAddress the start of the input + * @param inputLimit one past the last byte of the input + * @param outputAddress the start of the output buffer + * @param outputLimit one past the last byte of the output buffer + * @result the number of bytes decompressed + */ + uint64_t lzoDecompress(const char *inputAddress, + const char *inputLimit, + char *outputAddress, + char *outputLimit); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/MemoryPool.cc b/contrib/libs/apache/orc/c++/src/MemoryPool.cc index ecfb295bae..178e9cc316 100644 --- a/contrib/libs/apache/orc/c++/src/MemoryPool.cc +++ b/contrib/libs/apache/orc/c++/src/MemoryPool.cc @@ -1,244 +1,244 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "orc/Int128.hh" -#include "orc/MemoryPool.hh" - -#include "Adaptor.hh" - -#include <cstdlib> -#include <iostream> -#include <string.h> - -namespace orc { - - MemoryPool::~MemoryPool() { - // PASS - } - - class MemoryPoolImpl: public MemoryPool { - public: - virtual ~MemoryPoolImpl() override; - - char* malloc(uint64_t size) override; - void free(char* p) override; - }; - - char* MemoryPoolImpl::malloc(uint64_t size) { - return static_cast<char*>(std::malloc(size)); - } - - void MemoryPoolImpl::free(char* p) { - std::free(p); - } - - MemoryPoolImpl::~MemoryPoolImpl() { - // PASS - } - - template <class T> - DataBuffer<T>::DataBuffer(MemoryPool& pool, - uint64_t newSize - ): memoryPool(pool), - buf(nullptr), - currentSize(0), - currentCapacity(0) { - resize(newSize); - } - - template <class T> - DataBuffer<T>::DataBuffer(DataBuffer<T>&& buffer - ) noexcept: - memoryPool(buffer.memoryPool), - buf(buffer.buf), - currentSize(buffer.currentSize), - currentCapacity(buffer.currentCapacity) { - buffer.buf = nullptr; - buffer.currentSize = 0; - buffer.currentCapacity = 0; - } - - template <class T> - DataBuffer<T>::~DataBuffer(){ - for(uint64_t i=currentSize; i > 0; --i) { - (buf + i - 1)->~T(); - } - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <class T> - void DataBuffer<T>::resize(uint64_t newSize) { - reserve(newSize); - if (currentSize > newSize) { - for(uint64_t i=currentSize; i > newSize; --i) { - (buf + i - 1)->~T(); - } - } else if (newSize > currentSize) { - for(uint64_t i=currentSize; i < newSize; ++i) { - new (buf + i) T(); - } - } - currentSize = newSize; - } - - template <class T> - void DataBuffer<T>::reserve(uint64_t newCapacity){ - if (newCapacity > currentCapacity || !buf) { - if (buf) { - T* buf_old = buf; - buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity)); - memcpy(buf, buf_old, sizeof(T) * currentSize); - memoryPool.free(reinterpret_cast<char*>(buf_old)); - } else { - buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity)); - } - currentCapacity = newCapacity; - } - } - - // Specializations for char - - template <> - DataBuffer<char>::~DataBuffer(){ - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <> - void DataBuffer<char>::resize(uint64_t newSize) { - reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, newSize - currentSize); - } - currentSize = newSize; - } - - // Specializations for char* - - template <> - DataBuffer<char*>::~DataBuffer(){ - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <> - void DataBuffer<char*>::resize(uint64_t newSize) { - reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(char*)); - } - currentSize = newSize; - } - - // Specializations for double - - template <> - DataBuffer<double>::~DataBuffer(){ - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <> - void DataBuffer<double>::resize(uint64_t newSize) { - reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(double)); - } - currentSize = newSize; - } - - // Specializations for int64_t - - template <> - DataBuffer<int64_t>::~DataBuffer(){ - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <> - void DataBuffer<int64_t>::resize(uint64_t newSize) { - reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - 
currentSize) * sizeof(int64_t)); - } - currentSize = newSize; - } - - // Specializations for uint64_t - - template <> - DataBuffer<uint64_t>::~DataBuffer(){ - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <> - void DataBuffer<uint64_t>::resize(uint64_t newSize) { - reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(uint64_t)); - } - currentSize = newSize; - } - - // Specializations for unsigned char - - template <> - DataBuffer<unsigned char>::~DataBuffer(){ - if (buf) { - memoryPool.free(reinterpret_cast<char*>(buf)); - } - } - - template <> - void DataBuffer<unsigned char>::resize(uint64_t newSize) { - reserve(newSize); - if (newSize > currentSize) { - memset(buf + currentSize, 0, newSize - currentSize); - } - currentSize = newSize; - } - - #ifdef __clang__ - #pragma clang diagnostic ignored "-Wweak-template-vtables" - #endif - - template class DataBuffer<char>; - template class DataBuffer<char*>; - template class DataBuffer<double>; - template class DataBuffer<Int128>; - template class DataBuffer<int64_t>; - template class DataBuffer<uint64_t>; - template class DataBuffer<unsigned char>; - - #ifdef __clang__ - #pragma clang diagnostic ignored "-Wexit-time-destructors" - #endif - - MemoryPool* getDefaultPool() { - static MemoryPoolImpl internal; - return &internal; - } -} // namespace orc +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Int128.hh" +#include "orc/MemoryPool.hh" + +#include "Adaptor.hh" + +#include <cstdlib> +#include <iostream> +#include <string.h> + +namespace orc { + + MemoryPool::~MemoryPool() { + // PASS + } + + class MemoryPoolImpl: public MemoryPool { + public: + virtual ~MemoryPoolImpl() override; + + char* malloc(uint64_t size) override; + void free(char* p) override; + }; + + char* MemoryPoolImpl::malloc(uint64_t size) { + return static_cast<char*>(std::malloc(size)); + } + + void MemoryPoolImpl::free(char* p) { + std::free(p); + } + + MemoryPoolImpl::~MemoryPoolImpl() { + // PASS + } + + template <class T> + DataBuffer<T>::DataBuffer(MemoryPool& pool, + uint64_t newSize + ): memoryPool(pool), + buf(nullptr), + currentSize(0), + currentCapacity(0) { + resize(newSize); + } + + template <class T> + DataBuffer<T>::DataBuffer(DataBuffer<T>&& buffer + ) noexcept: + memoryPool(buffer.memoryPool), + buf(buffer.buf), + currentSize(buffer.currentSize), + currentCapacity(buffer.currentCapacity) { + buffer.buf = nullptr; + buffer.currentSize = 0; + buffer.currentCapacity = 0; + } + + template <class T> + DataBuffer<T>::~DataBuffer(){ + for(uint64_t i=currentSize; i > 0; --i) { + (buf + i - 1)->~T(); + } + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <class T> + void DataBuffer<T>::resize(uint64_t newSize) { + reserve(newSize); + if (currentSize > newSize) { + for(uint64_t i=currentSize; i > newSize; --i) { + (buf + i - 1)->~T(); + } + } else if (newSize > currentSize) { + for(uint64_t i=currentSize; i < newSize; ++i) { + new (buf + i) T(); + } + } + currentSize = newSize; + } + + template <class T> + void DataBuffer<T>::reserve(uint64_t newCapacity){ + if (newCapacity > currentCapacity || !buf) { + if (buf) { + T* buf_old = buf; + buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity)); + memcpy(buf, buf_old, sizeof(T) * currentSize); + memoryPool.free(reinterpret_cast<char*>(buf_old)); + } else { + buf = reinterpret_cast<T*>(memoryPool.malloc(sizeof(T) * newCapacity)); + } + currentCapacity = newCapacity; + } + } + + // Specializations for char + + template <> + DataBuffer<char>::~DataBuffer(){ + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<char>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, newSize - currentSize); + } + currentSize = newSize; + } + + // Specializations for char* + + template <> + DataBuffer<char*>::~DataBuffer(){ + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<char*>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(char*)); + } + currentSize = newSize; + } + + // Specializations for double + + template <> + DataBuffer<double>::~DataBuffer(){ + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<double>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(double)); + } + currentSize = newSize; + } + + // Specializations for int64_t + + template <> + DataBuffer<int64_t>::~DataBuffer(){ + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<int64_t>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, (newSize - 
currentSize) * sizeof(int64_t)); + } + currentSize = newSize; + } + + // Specializations for uint64_t + + template <> + DataBuffer<uint64_t>::~DataBuffer(){ + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<uint64_t>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, (newSize - currentSize) * sizeof(uint64_t)); + } + currentSize = newSize; + } + + // Specializations for unsigned char + + template <> + DataBuffer<unsigned char>::~DataBuffer(){ + if (buf) { + memoryPool.free(reinterpret_cast<char*>(buf)); + } + } + + template <> + void DataBuffer<unsigned char>::resize(uint64_t newSize) { + reserve(newSize); + if (newSize > currentSize) { + memset(buf + currentSize, 0, newSize - currentSize); + } + currentSize = newSize; + } + + #ifdef __clang__ + #pragma clang diagnostic ignored "-Wweak-template-vtables" + #endif + + template class DataBuffer<char>; + template class DataBuffer<char*>; + template class DataBuffer<double>; + template class DataBuffer<Int128>; + template class DataBuffer<int64_t>; + template class DataBuffer<uint64_t>; + template class DataBuffer<unsigned char>; + + #ifdef __clang__ + #pragma clang diagnostic ignored "-Wexit-time-destructors" + #endif + + MemoryPool* getDefaultPool() { + static MemoryPoolImpl internal; + return &internal; + } +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/Murmur3.cc b/contrib/libs/apache/orc/c++/src/Murmur3.cc index b45bd6d492..63cf797a04 100644 --- a/contrib/libs/apache/orc/c++/src/Murmur3.cc +++ b/contrib/libs/apache/orc/c++/src/Murmur3.cc @@ -1,98 +1,98 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
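A usage sketch for the MemoryPool/DataBuffer code shown above: every allocation goes through the pool, and for the specialized element types resize() zero-fills the newly exposed tail. The data() accessor is assumed to come from orc/MemoryPool.hh; it is not defined in this .cc file.

#include "orc/MemoryPool.hh"
#include <cassert>

void dataBufferSketch() {
  orc::MemoryPool* pool = orc::getDefaultPool();
  orc::DataBuffer<int64_t> values(*pool, 8);   // 8 zero-initialized elements
  values.resize(16);                           // growth memsets the new tail to 0
  assert(values.data()[15] == 0);              // data() assumed from the header
  values.data()[0] = 42;
}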
- */ - -#include "Adaptor.hh" -#include "Murmur3.hh" - -#define ROTL64(x, r) ((x << r) | (x >> (64 - r))) - -namespace orc { - - inline uint64_t rotl64 ( uint64_t x, int8_t r ) { - return (x << r) | (x >> (64 - r)); - } - - inline uint64_t Murmur3::fmix64(uint64_t value) { - value ^= (value >> 33); - value *= 0xff51afd7ed558ccdL; - value ^= (value >> 33); - value *= 0xc4ceb9fe1a85ec53L; - value ^= (value >> 33); - return value; - } - - uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len) { - return hash64(data, len, DEFAULT_SEED); - } - - DIAGNOSTIC_PUSH - -#if defined(__clang__) - DIAGNOSTIC_IGNORE("-Wimplicit-fallthrough") -#endif - - uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len, uint32_t seed) { - uint64_t h = seed; - uint32_t blocks = len >> 3; - - const uint64_t* src = reinterpret_cast<const uint64_t*>(data); - uint64_t c1 = 0x87c37b91114253d5L; - uint64_t c2 = 0x4cf5ad432745937fL; - for (uint32_t i = 0; i < blocks; i++) { - uint64_t k = src[i]; - k *= c1; - k = ROTL64(k, 31); - k *= c2; - - h ^= k; - h = ROTL64(h, 27); - h = h * 5 + 0x52dce729; - } - - uint64_t k = 0; - uint32_t idx = blocks << 3; - switch (len - idx) { - case 7: - k ^= static_cast<uint64_t>(data[idx + 6]) << 48; - case 6: - k ^= static_cast<uint64_t>(data[idx + 5]) << 40; - case 5: - k ^= static_cast<uint64_t>(data[idx + 4]) << 32; - case 4: - k ^= static_cast<uint64_t>(data[idx + 3]) << 24; - case 3: - k ^= static_cast<uint64_t>(data[idx + 2]) << 16; - case 2: - k ^= static_cast<uint64_t>(data[idx + 1]) << 8; - case 1: - k ^= static_cast<uint64_t>(data[idx + 0]); - - k *= c1; - k = ROTL64(k, 31); - k *= c2; - h ^= k; - } - - h ^= len; - h = fmix64(h); - return h; - } - - DIAGNOSTIC_POP - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Adaptor.hh" +#include "Murmur3.hh" + +#define ROTL64(x, r) ((x << r) | (x >> (64 - r))) + +namespace orc { + + inline uint64_t rotl64 ( uint64_t x, int8_t r ) { + return (x << r) | (x >> (64 - r)); + } + + inline uint64_t Murmur3::fmix64(uint64_t value) { + value ^= (value >> 33); + value *= 0xff51afd7ed558ccdL; + value ^= (value >> 33); + value *= 0xc4ceb9fe1a85ec53L; + value ^= (value >> 33); + return value; + } + + uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len) { + return hash64(data, len, DEFAULT_SEED); + } + + DIAGNOSTIC_PUSH + +#if defined(__clang__) + DIAGNOSTIC_IGNORE("-Wimplicit-fallthrough") +#endif + + uint64_t Murmur3::hash64(const uint8_t *data, uint32_t len, uint32_t seed) { + uint64_t h = seed; + uint32_t blocks = len >> 3; + + const uint64_t* src = reinterpret_cast<const uint64_t*>(data); + uint64_t c1 = 0x87c37b91114253d5L; + uint64_t c2 = 0x4cf5ad432745937fL; + for (uint32_t i = 0; i < blocks; i++) { + uint64_t k = src[i]; + k *= c1; + k = ROTL64(k, 31); + k *= c2; + + h ^= k; + h = ROTL64(h, 27); + h = h * 5 + 0x52dce729; + } + + uint64_t k = 0; + uint32_t idx = blocks << 3; + switch (len - idx) { + case 7: + k ^= static_cast<uint64_t>(data[idx + 6]) << 48; + case 6: + k ^= static_cast<uint64_t>(data[idx + 5]) << 40; + case 5: + k ^= static_cast<uint64_t>(data[idx + 4]) << 32; + case 4: + k ^= static_cast<uint64_t>(data[idx + 3]) << 24; + case 3: + k ^= static_cast<uint64_t>(data[idx + 2]) << 16; + case 2: + k ^= static_cast<uint64_t>(data[idx + 1]) << 8; + case 1: + k ^= static_cast<uint64_t>(data[idx + 0]); + + k *= c1; + k = ROTL64(k, 31); + k *= c2; + h ^= k; + } + + h ^= len; + h = fmix64(h); + return h; + } + + DIAGNOSTIC_POP + +} diff --git a/contrib/libs/apache/orc/c++/src/Murmur3.hh b/contrib/libs/apache/orc/c++/src/Murmur3.hh index 02391811b0..9cf1de138f 100644 --- a/contrib/libs/apache/orc/c++/src/Murmur3.hh +++ b/contrib/libs/apache/orc/c++/src/Murmur3.hh @@ -1,40 +1,40 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_MURMUR3_HH -#define ORC_MURMUR3_HH - -#include "orc/orc-config.hh" - -namespace orc { - - class Murmur3 { - public: - static const uint32_t DEFAULT_SEED = 104729; - static const uint64_t NULL_HASHCODE = 2862933555777941757LL; - - static uint64_t hash64(const uint8_t *data, uint32_t len); - - private: - static uint64_t fmix64(uint64_t value); - static uint64_t hash64(const uint8_t* data, uint32_t len, uint32_t seed); - }; - -} - -#endif //ORC_MURMUR3_HH +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_MURMUR3_HH +#define ORC_MURMUR3_HH + +#include "orc/orc-config.hh" + +namespace orc { + + class Murmur3 { + public: + static const uint32_t DEFAULT_SEED = 104729; + static const uint64_t NULL_HASHCODE = 2862933555777941757LL; + + static uint64_t hash64(const uint8_t *data, uint32_t len); + + private: + static uint64_t fmix64(uint64_t value); + static uint64_t hash64(const uint8_t* data, uint32_t len, uint32_t seed); + }; + +} + +#endif //ORC_MURMUR3_HH diff --git a/contrib/libs/apache/orc/c++/src/Options.hh b/contrib/libs/apache/orc/c++/src/Options.hh index 795e166138..ee9982cdc2 100644 --- a/contrib/libs/apache/orc/c++/src/Options.hh +++ b/contrib/libs/apache/orc/c++/src/Options.hh @@ -1,258 +1,258 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_OPTIONS_HH -#define ORC_OPTIONS_HH - -#include "orc/Int128.hh" -#include "orc/OrcFile.hh" -#include "orc/Reader.hh" - -#include <limits> - -namespace orc { - - enum ColumnSelection { - ColumnSelection_NONE = 0, - ColumnSelection_NAMES = 1, - ColumnSelection_FIELD_IDS = 2, - ColumnSelection_TYPE_IDS = 3, - }; - -/** - * ReaderOptions Implementation - */ - struct ReaderOptionsPrivate { - uint64_t tailLocation; - std::ostream* errorStream; - MemoryPool* memoryPool; - std::string serializedTail; - - ReaderOptionsPrivate() { - tailLocation = std::numeric_limits<uint64_t>::max(); - errorStream = &std::cerr; - memoryPool = getDefaultPool(); - } - }; - - ReaderOptions::ReaderOptions(): - privateBits(std::unique_ptr<ReaderOptionsPrivate> - (new ReaderOptionsPrivate())) { - // PASS - } - - ReaderOptions::ReaderOptions(const ReaderOptions& rhs): - privateBits(std::unique_ptr<ReaderOptionsPrivate> - (new ReaderOptionsPrivate(*(rhs.privateBits.get())))) { - // PASS - } - - ReaderOptions::ReaderOptions(ReaderOptions& rhs) { - // swap privateBits with rhs - ReaderOptionsPrivate* l = privateBits.release(); - privateBits.reset(rhs.privateBits.release()); - rhs.privateBits.reset(l); - } - - ReaderOptions& ReaderOptions::operator=(const ReaderOptions& rhs) { - if (this != &rhs) { - privateBits.reset(new ReaderOptionsPrivate(*(rhs.privateBits.get()))); - } - return *this; - } - - ReaderOptions::~ReaderOptions() { - // PASS - } - - ReaderOptions& ReaderOptions::setMemoryPool(MemoryPool& pool) { - privateBits->memoryPool = &pool; - return *this; - } - - MemoryPool* ReaderOptions::getMemoryPool() const{ - return privateBits->memoryPool; - } - - ReaderOptions& ReaderOptions::setTailLocation(uint64_t offset) { - privateBits->tailLocation = offset; - return *this; - } - - uint64_t ReaderOptions::getTailLocation() const { - return privateBits->tailLocation; - } - - ReaderOptions& ReaderOptions::setSerializedFileTail(const std::string& value - ) { - privateBits->serializedTail = value; - return *this; - } - - std::string ReaderOptions::getSerializedFileTail() const { - return privateBits->serializedTail; - } - - ReaderOptions& ReaderOptions::setErrorStream(std::ostream& stream) { - privateBits->errorStream = &stream; - return *this; - } - - std::ostream* ReaderOptions::getErrorStream() const { - return privateBits->errorStream; - } - -/** - * RowReaderOptions Implementation - */ - - struct RowReaderOptionsPrivate { - ColumnSelection selection; - std::list<uint64_t> includedColumnIndexes; - std::list<std::string> includedColumnNames; - uint64_t dataStart; - uint64_t dataLength; - bool throwOnHive11DecimalOverflow; - int32_t forcedScaleOnHive11Decimal; - bool enableLazyDecoding; - - RowReaderOptionsPrivate() { - selection = ColumnSelection_NONE; - dataStart = 0; - dataLength = std::numeric_limits<uint64_t>::max(); - throwOnHive11DecimalOverflow = true; - forcedScaleOnHive11Decimal = 6; - enableLazyDecoding = false; - } - }; - - RowReaderOptions::RowReaderOptions(): - privateBits(std::unique_ptr<RowReaderOptionsPrivate> - (new RowReaderOptionsPrivate())) { - // PASS - } - - RowReaderOptions::RowReaderOptions(const RowReaderOptions& rhs): - privateBits(std::unique_ptr<RowReaderOptionsPrivate> - (new RowReaderOptionsPrivate(*(rhs.privateBits.get())))) { - // PASS - } - - RowReaderOptions::RowReaderOptions(RowReaderOptions& rhs) { - // swap privateBits with rhs - RowReaderOptionsPrivate* l = privateBits.release(); - privateBits.reset(rhs.privateBits.release()); - rhs.privateBits.reset(l); - } - - 
RowReaderOptions& RowReaderOptions::operator=(const RowReaderOptions& rhs) { - if (this != &rhs) { - privateBits.reset(new RowReaderOptionsPrivate(*(rhs.privateBits.get()))); - } - return *this; - } - - RowReaderOptions::~RowReaderOptions() { - // PASS - } - - RowReaderOptions& RowReaderOptions::include(const std::list<uint64_t>& include) { - privateBits->selection = ColumnSelection_FIELD_IDS; - privateBits->includedColumnIndexes.assign(include.begin(), include.end()); - privateBits->includedColumnNames.clear(); - return *this; - } - - RowReaderOptions& RowReaderOptions::include(const std::list<std::string>& include) { - privateBits->selection = ColumnSelection_NAMES; - privateBits->includedColumnNames.assign(include.begin(), include.end()); - privateBits->includedColumnIndexes.clear(); - return *this; - } - - RowReaderOptions& RowReaderOptions::includeTypes(const std::list<uint64_t>& types) { - privateBits->selection = ColumnSelection_TYPE_IDS; - privateBits->includedColumnIndexes.assign(types.begin(), types.end()); - privateBits->includedColumnNames.clear(); - return *this; - } - - RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) { - privateBits->dataStart = offset; - privateBits->dataLength = length; - return *this; - } - - bool RowReaderOptions::getIndexesSet() const { - return privateBits->selection == ColumnSelection_FIELD_IDS; - } - - bool RowReaderOptions::getTypeIdsSet() const { - return privateBits->selection == ColumnSelection_TYPE_IDS; - } - - const std::list<uint64_t>& RowReaderOptions::getInclude() const { - return privateBits->includedColumnIndexes; - } - - bool RowReaderOptions::getNamesSet() const { - return privateBits->selection == ColumnSelection_NAMES; - } - - const std::list<std::string>& RowReaderOptions::getIncludeNames() const { - return privateBits->includedColumnNames; - } - - uint64_t RowReaderOptions::getOffset() const { - return privateBits->dataStart; - } - - uint64_t RowReaderOptions::getLength() const { - return privateBits->dataLength; - } - - RowReaderOptions& RowReaderOptions::throwOnHive11DecimalOverflow(bool shouldThrow){ - privateBits->throwOnHive11DecimalOverflow = shouldThrow; - return *this; - } - - bool RowReaderOptions::getThrowOnHive11DecimalOverflow() const { - return privateBits->throwOnHive11DecimalOverflow; - } - - RowReaderOptions& RowReaderOptions::forcedScaleOnHive11Decimal(int32_t forcedScale - ) { - privateBits->forcedScaleOnHive11Decimal = forcedScale; - return *this; - } - - int32_t RowReaderOptions::getForcedScaleOnHive11Decimal() const { - return privateBits->forcedScaleOnHive11Decimal; - } - - bool RowReaderOptions::getEnableLazyDecoding() const { - return privateBits->enableLazyDecoding; - } - - RowReaderOptions& RowReaderOptions::setEnableLazyDecoding(bool enable) { - privateBits->enableLazyDecoding = enable; - return *this; - } -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_OPTIONS_HH +#define ORC_OPTIONS_HH + +#include "orc/Int128.hh" +#include "orc/OrcFile.hh" +#include "orc/Reader.hh" + +#include <limits> + +namespace orc { + + enum ColumnSelection { + ColumnSelection_NONE = 0, + ColumnSelection_NAMES = 1, + ColumnSelection_FIELD_IDS = 2, + ColumnSelection_TYPE_IDS = 3, + }; + +/** + * ReaderOptions Implementation + */ + struct ReaderOptionsPrivate { + uint64_t tailLocation; + std::ostream* errorStream; + MemoryPool* memoryPool; + std::string serializedTail; + + ReaderOptionsPrivate() { + tailLocation = std::numeric_limits<uint64_t>::max(); + errorStream = &std::cerr; + memoryPool = getDefaultPool(); + } + }; + + ReaderOptions::ReaderOptions(): + privateBits(std::unique_ptr<ReaderOptionsPrivate> + (new ReaderOptionsPrivate())) { + // PASS + } + + ReaderOptions::ReaderOptions(const ReaderOptions& rhs): + privateBits(std::unique_ptr<ReaderOptionsPrivate> + (new ReaderOptionsPrivate(*(rhs.privateBits.get())))) { + // PASS + } + + ReaderOptions::ReaderOptions(ReaderOptions& rhs) { + // swap privateBits with rhs + ReaderOptionsPrivate* l = privateBits.release(); + privateBits.reset(rhs.privateBits.release()); + rhs.privateBits.reset(l); + } + + ReaderOptions& ReaderOptions::operator=(const ReaderOptions& rhs) { + if (this != &rhs) { + privateBits.reset(new ReaderOptionsPrivate(*(rhs.privateBits.get()))); + } + return *this; + } + + ReaderOptions::~ReaderOptions() { + // PASS + } + + ReaderOptions& ReaderOptions::setMemoryPool(MemoryPool& pool) { + privateBits->memoryPool = &pool; + return *this; + } + + MemoryPool* ReaderOptions::getMemoryPool() const{ + return privateBits->memoryPool; + } + + ReaderOptions& ReaderOptions::setTailLocation(uint64_t offset) { + privateBits->tailLocation = offset; + return *this; + } + + uint64_t ReaderOptions::getTailLocation() const { + return privateBits->tailLocation; + } + + ReaderOptions& ReaderOptions::setSerializedFileTail(const std::string& value + ) { + privateBits->serializedTail = value; + return *this; + } + + std::string ReaderOptions::getSerializedFileTail() const { + return privateBits->serializedTail; + } + + ReaderOptions& ReaderOptions::setErrorStream(std::ostream& stream) { + privateBits->errorStream = &stream; + return *this; + } + + std::ostream* ReaderOptions::getErrorStream() const { + return privateBits->errorStream; + } + +/** + * RowReaderOptions Implementation + */ + + struct RowReaderOptionsPrivate { + ColumnSelection selection; + std::list<uint64_t> includedColumnIndexes; + std::list<std::string> includedColumnNames; + uint64_t dataStart; + uint64_t dataLength; + bool throwOnHive11DecimalOverflow; + int32_t forcedScaleOnHive11Decimal; + bool enableLazyDecoding; + + RowReaderOptionsPrivate() { + selection = ColumnSelection_NONE; + dataStart = 0; + dataLength = std::numeric_limits<uint64_t>::max(); + throwOnHive11DecimalOverflow = true; + forcedScaleOnHive11Decimal = 6; + enableLazyDecoding = false; + } + }; + + RowReaderOptions::RowReaderOptions(): + privateBits(std::unique_ptr<RowReaderOptionsPrivate> + (new RowReaderOptionsPrivate())) { + // PASS + } + + 
RowReaderOptions::RowReaderOptions(const RowReaderOptions& rhs): + privateBits(std::unique_ptr<RowReaderOptionsPrivate> + (new RowReaderOptionsPrivate(*(rhs.privateBits.get())))) { + // PASS + } + + RowReaderOptions::RowReaderOptions(RowReaderOptions& rhs) { + // swap privateBits with rhs + RowReaderOptionsPrivate* l = privateBits.release(); + privateBits.reset(rhs.privateBits.release()); + rhs.privateBits.reset(l); + } + + RowReaderOptions& RowReaderOptions::operator=(const RowReaderOptions& rhs) { + if (this != &rhs) { + privateBits.reset(new RowReaderOptionsPrivate(*(rhs.privateBits.get()))); + } + return *this; + } + + RowReaderOptions::~RowReaderOptions() { + // PASS + } + + RowReaderOptions& RowReaderOptions::include(const std::list<uint64_t>& include) { + privateBits->selection = ColumnSelection_FIELD_IDS; + privateBits->includedColumnIndexes.assign(include.begin(), include.end()); + privateBits->includedColumnNames.clear(); + return *this; + } + + RowReaderOptions& RowReaderOptions::include(const std::list<std::string>& include) { + privateBits->selection = ColumnSelection_NAMES; + privateBits->includedColumnNames.assign(include.begin(), include.end()); + privateBits->includedColumnIndexes.clear(); + return *this; + } + + RowReaderOptions& RowReaderOptions::includeTypes(const std::list<uint64_t>& types) { + privateBits->selection = ColumnSelection_TYPE_IDS; + privateBits->includedColumnIndexes.assign(types.begin(), types.end()); + privateBits->includedColumnNames.clear(); + return *this; + } + + RowReaderOptions& RowReaderOptions::range(uint64_t offset, uint64_t length) { + privateBits->dataStart = offset; + privateBits->dataLength = length; + return *this; + } + + bool RowReaderOptions::getIndexesSet() const { + return privateBits->selection == ColumnSelection_FIELD_IDS; + } + + bool RowReaderOptions::getTypeIdsSet() const { + return privateBits->selection == ColumnSelection_TYPE_IDS; + } + + const std::list<uint64_t>& RowReaderOptions::getInclude() const { + return privateBits->includedColumnIndexes; + } + + bool RowReaderOptions::getNamesSet() const { + return privateBits->selection == ColumnSelection_NAMES; + } + + const std::list<std::string>& RowReaderOptions::getIncludeNames() const { + return privateBits->includedColumnNames; + } + + uint64_t RowReaderOptions::getOffset() const { + return privateBits->dataStart; + } + + uint64_t RowReaderOptions::getLength() const { + return privateBits->dataLength; + } + + RowReaderOptions& RowReaderOptions::throwOnHive11DecimalOverflow(bool shouldThrow){ + privateBits->throwOnHive11DecimalOverflow = shouldThrow; + return *this; + } + + bool RowReaderOptions::getThrowOnHive11DecimalOverflow() const { + return privateBits->throwOnHive11DecimalOverflow; + } + + RowReaderOptions& RowReaderOptions::forcedScaleOnHive11Decimal(int32_t forcedScale + ) { + privateBits->forcedScaleOnHive11Decimal = forcedScale; + return *this; + } + + int32_t RowReaderOptions::getForcedScaleOnHive11Decimal() const { + return privateBits->forcedScaleOnHive11Decimal; + } + + bool RowReaderOptions::getEnableLazyDecoding() const { + return privateBits->enableLazyDecoding; + } + + RowReaderOptions& RowReaderOptions::setEnableLazyDecoding(bool enable) { + privateBits->enableLazyDecoding = enable; + return *this; + } +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/OrcFile.cc b/contrib/libs/apache/orc/c++/src/OrcFile.cc index a0158bbadf..5856db692e 100644 --- a/contrib/libs/apache/orc/c++/src/OrcFile.cc +++ b/contrib/libs/apache/orc/c++/src/OrcFile.cc @@ -1,184 
+1,184 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Adaptor.hh" -#include "orc/OrcFile.hh" -#include "orc/Exceptions.hh" - -#include <errno.h> -#include <fcntl.h> -#include <stdio.h> -#include <sys/stat.h> -#include <string.h> - -#ifdef _MSC_VER -#include <io.h> -#define S_IRUSR _S_IREAD -#define S_IWUSR _S_IWRITE +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Adaptor.hh" +#include "orc/OrcFile.hh" +#include "orc/Exceptions.hh" + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <sys/stat.h> +#include <string.h> + +#ifdef _MSC_VER +#include <io.h> +#define S_IRUSR _S_IREAD +#define S_IWUSR _S_IWRITE #define stat _stat64 #define fstat _fstat64 -#else -#include <unistd.h> -#define O_BINARY 0 -#endif - -namespace orc { - - class FileInputStream : public InputStream { - private: - std::string filename; - int file; - uint64_t totalLength; - - public: - FileInputStream(std::string _filename) { - filename = _filename; - file = open(filename.c_str(), O_BINARY | O_RDONLY); - if (file == -1) { - throw ParseError("Can't open " + filename); - } - struct stat fileStat; - if (fstat(file, &fileStat) == -1) { - throw ParseError("Can't stat " + filename); - } - totalLength = static_cast<uint64_t>(fileStat.st_size); - } - - ~FileInputStream() override; - - uint64_t getLength() const override { - return totalLength; - } - - uint64_t getNaturalReadSize() const override { - return 128 * 1024; - } - - void read(void* buf, - uint64_t length, - uint64_t offset) override { - if (!buf) { - throw ParseError("Buffer is null"); - } - ssize_t bytesRead = pread(file, buf, length, static_cast<off_t>(offset)); - - if (bytesRead == -1) { - throw ParseError("Bad read of " + filename); - } - if (static_cast<uint64_t>(bytesRead) != length) { - throw ParseError("Short read of " + filename); - } - } - - const std::string& getName() const override { - return filename; - } - }; - - FileInputStream::~FileInputStream() { - close(file); - } - - std::unique_ptr<InputStream> readFile(const std::string& path) { -#ifdef BUILD_LIBHDFSPP - if(strncmp (path.c_str(), "hdfs://", 7) == 0){ - return orc::readHdfsFile(std::string(path)); - } else { -#endif - return orc::readLocalFile(std::string(path)); -#ifdef BUILD_LIBHDFSPP - } -#endif - } - - std::unique_ptr<InputStream> readLocalFile(const std::string& path) { - return std::unique_ptr<InputStream>(new FileInputStream(path)); - } - - OutputStream::~OutputStream() { - // PASS - }; - - class FileOutputStream : public OutputStream { - private: - std::string filename; - int file; - uint64_t bytesWritten; - bool closed; - - public: - FileOutputStream(std::string _filename) { - bytesWritten = 0; - filename = _filename; - closed = false; - file = open( - filename.c_str(), - O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, - S_IRUSR | S_IWUSR); - if (file == -1) { - throw ParseError("Can't open " + filename); - } - } - - ~FileOutputStream() override; - - uint64_t getLength() const override { - return bytesWritten; - } - - uint64_t getNaturalWriteSize() const override { - return 128 * 1024; - } - - void write(const void* buf, size_t length) override { - if (closed) { - throw std::logic_error("Cannot write to closed stream."); - } - ssize_t bytesWrite = ::write(file, buf, length); - if (bytesWrite == -1) { - throw ParseError("Bad write of " + filename); - } - if (static_cast<uint64_t>(bytesWrite) != length) { - throw ParseError("Short write of " + filename); - } - bytesWritten += static_cast<uint64_t>(bytesWrite); - } - - const std::string& getName() const override { - return filename; - } - - void close() override { - if (!closed) { - ::close(file); - closed = true; - } - } - }; - - FileOutputStream::~FileOutputStream() { - if (!closed) { - ::close(file); - closed = true; - } - } - - std::unique_ptr<OutputStream> writeLocalFile(const std::string& path) { - return std::unique_ptr<OutputStream>(new FileOutputStream(path)); - } 
-} +#else +#include <unistd.h> +#define O_BINARY 0 +#endif + +namespace orc { + + class FileInputStream : public InputStream { + private: + std::string filename; + int file; + uint64_t totalLength; + + public: + FileInputStream(std::string _filename) { + filename = _filename; + file = open(filename.c_str(), O_BINARY | O_RDONLY); + if (file == -1) { + throw ParseError("Can't open " + filename); + } + struct stat fileStat; + if (fstat(file, &fileStat) == -1) { + throw ParseError("Can't stat " + filename); + } + totalLength = static_cast<uint64_t>(fileStat.st_size); + } + + ~FileInputStream() override; + + uint64_t getLength() const override { + return totalLength; + } + + uint64_t getNaturalReadSize() const override { + return 128 * 1024; + } + + void read(void* buf, + uint64_t length, + uint64_t offset) override { + if (!buf) { + throw ParseError("Buffer is null"); + } + ssize_t bytesRead = pread(file, buf, length, static_cast<off_t>(offset)); + + if (bytesRead == -1) { + throw ParseError("Bad read of " + filename); + } + if (static_cast<uint64_t>(bytesRead) != length) { + throw ParseError("Short read of " + filename); + } + } + + const std::string& getName() const override { + return filename; + } + }; + + FileInputStream::~FileInputStream() { + close(file); + } + + std::unique_ptr<InputStream> readFile(const std::string& path) { +#ifdef BUILD_LIBHDFSPP + if(strncmp (path.c_str(), "hdfs://", 7) == 0){ + return orc::readHdfsFile(std::string(path)); + } else { +#endif + return orc::readLocalFile(std::string(path)); +#ifdef BUILD_LIBHDFSPP + } +#endif + } + + std::unique_ptr<InputStream> readLocalFile(const std::string& path) { + return std::unique_ptr<InputStream>(new FileInputStream(path)); + } + + OutputStream::~OutputStream() { + // PASS + }; + + class FileOutputStream : public OutputStream { + private: + std::string filename; + int file; + uint64_t bytesWritten; + bool closed; + + public: + FileOutputStream(std::string _filename) { + bytesWritten = 0; + filename = _filename; + closed = false; + file = open( + filename.c_str(), + O_BINARY | O_CREAT | O_WRONLY | O_TRUNC, + S_IRUSR | S_IWUSR); + if (file == -1) { + throw ParseError("Can't open " + filename); + } + } + + ~FileOutputStream() override; + + uint64_t getLength() const override { + return bytesWritten; + } + + uint64_t getNaturalWriteSize() const override { + return 128 * 1024; + } + + void write(const void* buf, size_t length) override { + if (closed) { + throw std::logic_error("Cannot write to closed stream."); + } + ssize_t bytesWrite = ::write(file, buf, length); + if (bytesWrite == -1) { + throw ParseError("Bad write of " + filename); + } + if (static_cast<uint64_t>(bytesWrite) != length) { + throw ParseError("Short write of " + filename); + } + bytesWritten += static_cast<uint64_t>(bytesWrite); + } + + const std::string& getName() const override { + return filename; + } + + void close() override { + if (!closed) { + ::close(file); + closed = true; + } + } + }; + + FileOutputStream::~FileOutputStream() { + if (!closed) { + ::close(file); + closed = true; + } + } + + std::unique_ptr<OutputStream> writeLocalFile(const std::string& path) { + return std::unique_ptr<OutputStream>(new FileOutputStream(path)); + } +} diff --git a/contrib/libs/apache/orc/c++/src/RLE.cc b/contrib/libs/apache/orc/c++/src/RLE.cc index 21f9082216..ea0181deaf 100644 --- a/contrib/libs/apache/orc/c++/src/RLE.cc +++ b/contrib/libs/apache/orc/c++/src/RLE.cc @@ -1,121 +1,121 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one -* or more 
contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#include "RLEv1.hh" -#include "RLEv2.hh" -#include "orc/Exceptions.hh" - -namespace orc { - - RleEncoder::~RleEncoder() { - // PASS - } - - RleDecoder::~RleDecoder() { - // PASS - } - - std::unique_ptr<RleEncoder> createRleEncoder - (std::unique_ptr<BufferedOutputStream> output, - bool isSigned, - RleVersion version, - MemoryPool&, - bool alignedBitpacking) { - switch (static_cast<int64_t>(version)) { - case RleVersion_1: - // We don't have std::make_unique() yet. - return std::unique_ptr<RleEncoder>(new RleEncoderV1(std::move(output), - isSigned)); - case RleVersion_2: - return std::unique_ptr<RleEncoder>(new RleEncoderV2(std::move(output), - isSigned, alignedBitpacking)); - default: - throw NotImplementedYet("Not implemented yet"); - } - } - - std::unique_ptr<RleDecoder> createRleDecoder - (std::unique_ptr<SeekableInputStream> input, - bool isSigned, - RleVersion version, - MemoryPool& pool) { - switch (static_cast<int64_t>(version)) { - case RleVersion_1: - // We don't have std::make_unique() yet. - return std::unique_ptr<RleDecoder>(new RleDecoderV1(std::move(input), - isSigned)); - case RleVersion_2: - return std::unique_ptr<RleDecoder>(new RleDecoderV2(std::move(input), - isSigned, pool)); - default: - throw NotImplementedYet("Not implemented yet"); - } - } - - void RleEncoder::add(const int64_t* data, uint64_t numValues, - const char* notNull) { - for (uint64_t i = 0; i < numValues; ++i) { - if (!notNull || notNull[i]) { - write(data[i]); - } - } - } - - void RleEncoder::writeVslong(int64_t val) { - writeVulong((val << 1) ^ (val >> 63)); - } - - void RleEncoder::writeVulong(int64_t val) { - while (true) { - if ((val & ~0x7f) == 0) { - writeByte(static_cast<char>(val)); - return; - } else { - writeByte(static_cast<char>(0x80 | (val & 0x7f))); - // cast val to unsigned so as to force 0-fill right shift - val = (static_cast<uint64_t>(val) >> 7); - } - } - } - - void RleEncoder::writeByte(char c) { - if (bufferPosition == bufferLength) { - int addedSize = 0; - if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) { - throw std::bad_alloc(); - } - bufferPosition = 0; - bufferLength = static_cast<size_t>(addedSize); - } - buffer[bufferPosition++] = c; - } - - void RleEncoder::recordPosition(PositionRecorder* recorder) const { - uint64_t flushedSize = outputStream->getSize(); - uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition); - if (outputStream->isCompressed()) { - recorder->add(flushedSize); - recorder->add(unflushedSize); - } else { - flushedSize -= static_cast<uint64_t>(bufferLength); - recorder->add(flushedSize + unflushedSize); - } - recorder->add(static_cast<uint64_t>(numLiterals)); - } - -} // namespace orc +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. 
See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include "RLEv1.hh" +#include "RLEv2.hh" +#include "orc/Exceptions.hh" + +namespace orc { + + RleEncoder::~RleEncoder() { + // PASS + } + + RleDecoder::~RleDecoder() { + // PASS + } + + std::unique_ptr<RleEncoder> createRleEncoder + (std::unique_ptr<BufferedOutputStream> output, + bool isSigned, + RleVersion version, + MemoryPool&, + bool alignedBitpacking) { + switch (static_cast<int64_t>(version)) { + case RleVersion_1: + // We don't have std::make_unique() yet. + return std::unique_ptr<RleEncoder>(new RleEncoderV1(std::move(output), + isSigned)); + case RleVersion_2: + return std::unique_ptr<RleEncoder>(new RleEncoderV2(std::move(output), + isSigned, alignedBitpacking)); + default: + throw NotImplementedYet("Not implemented yet"); + } + } + + std::unique_ptr<RleDecoder> createRleDecoder + (std::unique_ptr<SeekableInputStream> input, + bool isSigned, + RleVersion version, + MemoryPool& pool) { + switch (static_cast<int64_t>(version)) { + case RleVersion_1: + // We don't have std::make_unique() yet. + return std::unique_ptr<RleDecoder>(new RleDecoderV1(std::move(input), + isSigned)); + case RleVersion_2: + return std::unique_ptr<RleDecoder>(new RleDecoderV2(std::move(input), + isSigned, pool)); + default: + throw NotImplementedYet("Not implemented yet"); + } + } + + void RleEncoder::add(const int64_t* data, uint64_t numValues, + const char* notNull) { + for (uint64_t i = 0; i < numValues; ++i) { + if (!notNull || notNull[i]) { + write(data[i]); + } + } + } + + void RleEncoder::writeVslong(int64_t val) { + writeVulong((val << 1) ^ (val >> 63)); + } + + void RleEncoder::writeVulong(int64_t val) { + while (true) { + if ((val & ~0x7f) == 0) { + writeByte(static_cast<char>(val)); + return; + } else { + writeByte(static_cast<char>(0x80 | (val & 0x7f))); + // cast val to unsigned so as to force 0-fill right shift + val = (static_cast<uint64_t>(val) >> 7); + } + } + } + + void RleEncoder::writeByte(char c) { + if (bufferPosition == bufferLength) { + int addedSize = 0; + if (!outputStream->Next(reinterpret_cast<void **>(&buffer), &addedSize)) { + throw std::bad_alloc(); + } + bufferPosition = 0; + bufferLength = static_cast<size_t>(addedSize); + } + buffer[bufferPosition++] = c; + } + + void RleEncoder::recordPosition(PositionRecorder* recorder) const { + uint64_t flushedSize = outputStream->getSize(); + uint64_t unflushedSize = static_cast<uint64_t>(bufferPosition); + if (outputStream->isCompressed()) { + recorder->add(flushedSize); + recorder->add(unflushedSize); + } else { + flushedSize -= static_cast<uint64_t>(bufferLength); + recorder->add(flushedSize + unflushedSize); + } + recorder->add(static_cast<uint64_t>(numLiterals)); + } + +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RLE.hh b/contrib/libs/apache/orc/c++/src/RLE.hh index 6822bd812e..ec0330559e 100644 --- 
a/contrib/libs/apache/orc/c++/src/RLE.hh +++ b/contrib/libs/apache/orc/c++/src/RLE.hh @@ -1,155 +1,155 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_RLE_HH -#define ORC_RLE_HH - -#include "io/InputStream.hh" -#include "io/OutputStream.hh" - -#include <memory> - -namespace orc { - - inline int64_t zigZag(int64_t value) { - return (value << 1) ^ (value >> 63); - } - - inline int64_t unZigZag(uint64_t value) { - return value >> 1 ^ -(value & 1); - } - - class RleEncoder { - public: - // must be non-inline! - virtual ~RleEncoder(); - - RleEncoder( - std::unique_ptr<BufferedOutputStream> outStream, - bool hasSigned): - outputStream(std::move(outStream)), - bufferPosition(0), - bufferLength(0), - numLiterals(0), - isSigned(hasSigned), - buffer(nullptr){ - //pass - } - - /** - * Encode the next batch of values. - * @param data the array to read from - * @param numValues the number of values to write - * @param notNull If the pointer is null, all values are read. If the - * pointer is not null, positions that are false are skipped. - */ - virtual void add(const int64_t* data, uint64_t numValues, - const char* notNull); - - /** - * Get size of buffer used so far. - */ - uint64_t getBufferSize() const { - return outputStream->getSize(); - } - - /** - * Flushing underlying BufferedOutputStream - */ - virtual uint64_t flush() = 0; - - /** - * record current position - * @param recorder use the recorder to record current positions - */ - virtual void recordPosition(PositionRecorder* recorder) const; - - virtual void write(int64_t val) = 0; - - protected: - std::unique_ptr<BufferedOutputStream> outputStream; - size_t bufferPosition; - size_t bufferLength; - size_t numLiterals; - int64_t* literals; - bool isSigned; - char* buffer; - - virtual void writeByte(char c); - - virtual void writeVulong(int64_t val); - - virtual void writeVslong(int64_t val); - }; - - class RleDecoder { - public: - // must be non-inline! - virtual ~RleDecoder(); - - /** - * Seek to a particular spot. - */ - virtual void seek(PositionProvider&) = 0; - - /** - * Seek over a given number of values. - */ - virtual void skip(uint64_t numValues) = 0; - - /** - * Read a number of values into the batch. - * @param data the array to read into - * @param numValues the number of values to read - * @param notNull If the pointer is null, all values are read. If the - * pointer is not null, positions that are false are skipped. - */ - virtual void next(int64_t* data, uint64_t numValues, - const char* notNull) = 0; - }; - - /** - * Create an RLE encoder. 
- * @param output the output stream to write to - * @param isSigned true if the number sequence is signed - * @param version version of RLE decoding to do - * @param pool memory pool to use for allocation - */ - std::unique_ptr<RleEncoder> createRleEncoder - (std::unique_ptr<BufferedOutputStream> output, - bool isSigned, - RleVersion version, - MemoryPool& pool, - bool alignedBitpacking); - - /** - * Create an RLE decoder. - * @param input the input stream to read from - * @param isSigned true if the number sequence is signed - * @param version version of RLE decoding to do - * @param pool memory pool to use for allocation - */ - std::unique_ptr<RleDecoder> createRleDecoder - (std::unique_ptr<SeekableInputStream> input, - bool isSigned, - RleVersion version, - MemoryPool& pool); - -} // namespace orc - -#endif // ORC_RLE_HH +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_RLE_HH +#define ORC_RLE_HH + +#include "io/InputStream.hh" +#include "io/OutputStream.hh" + +#include <memory> + +namespace orc { + + inline int64_t zigZag(int64_t value) { + return (value << 1) ^ (value >> 63); + } + + inline int64_t unZigZag(uint64_t value) { + return value >> 1 ^ -(value & 1); + } + + class RleEncoder { + public: + // must be non-inline! + virtual ~RleEncoder(); + + RleEncoder( + std::unique_ptr<BufferedOutputStream> outStream, + bool hasSigned): + outputStream(std::move(outStream)), + bufferPosition(0), + bufferLength(0), + numLiterals(0), + isSigned(hasSigned), + buffer(nullptr){ + //pass + } + + /** + * Encode the next batch of values. + * @param data the array to read from + * @param numValues the number of values to write + * @param notNull If the pointer is null, all values are read. If the + * pointer is not null, positions that are false are skipped. + */ + virtual void add(const int64_t* data, uint64_t numValues, + const char* notNull); + + /** + * Get size of buffer used so far. + */ + uint64_t getBufferSize() const { + return outputStream->getSize(); + } + + /** + * Flushing underlying BufferedOutputStream + */ + virtual uint64_t flush() = 0; + + /** + * record current position + * @param recorder use the recorder to record current positions + */ + virtual void recordPosition(PositionRecorder* recorder) const; + + virtual void write(int64_t val) = 0; + + protected: + std::unique_ptr<BufferedOutputStream> outputStream; + size_t bufferPosition; + size_t bufferLength; + size_t numLiterals; + int64_t* literals; + bool isSigned; + char* buffer; + + virtual void writeByte(char c); + + virtual void writeVulong(int64_t val); + + virtual void writeVslong(int64_t val); + }; + + class RleDecoder { + public: + // must be non-inline! + virtual ~RleDecoder(); + + /** + * Seek to a particular spot. 
+ */ + virtual void seek(PositionProvider&) = 0; + + /** + * Seek over a given number of values. + */ + virtual void skip(uint64_t numValues) = 0; + + /** + * Read a number of values into the batch. + * @param data the array to read into + * @param numValues the number of values to read + * @param notNull If the pointer is null, all values are read. If the + * pointer is not null, positions that are false are skipped. + */ + virtual void next(int64_t* data, uint64_t numValues, + const char* notNull) = 0; + }; + + /** + * Create an RLE encoder. + * @param output the output stream to write to + * @param isSigned true if the number sequence is signed + * @param version version of RLE decoding to do + * @param pool memory pool to use for allocation + */ + std::unique_ptr<RleEncoder> createRleEncoder + (std::unique_ptr<BufferedOutputStream> output, + bool isSigned, + RleVersion version, + MemoryPool& pool, + bool alignedBitpacking); + + /** + * Create an RLE decoder. + * @param input the input stream to read from + * @param isSigned true if the number sequence is signed + * @param version version of RLE decoding to do + * @param pool memory pool to use for allocation + */ + std::unique_ptr<RleDecoder> createRleDecoder + (std::unique_ptr<SeekableInputStream> input, + bool isSigned, + RleVersion version, + MemoryPool& pool); + +} // namespace orc + +#endif // ORC_RLE_HH diff --git a/contrib/libs/apache/orc/c++/src/RLEV2Util.cc b/contrib/libs/apache/orc/c++/src/RLEV2Util.cc index 12e2d057cd..20fc0931ef 100644 --- a/contrib/libs/apache/orc/c++/src/RLEV2Util.cc +++ b/contrib/libs/apache/orc/c++/src/RLEV2Util.cc @@ -1,70 +1,70 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with option work for additional information - * regarding copyright ownership. The ASF licenses option file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use option file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "RLEV2Util.hh" - -namespace orc { - - // Map FBS enum to bit width value. - const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 26, 28, 30, 32, 40, 48, 56, 64 - }; - - // Map bit length i to closest fixed bit width that can contain i bits. - const uint8_t ClosestFixedBitsMap[65] = { - 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, - 26, 26, 28, 28, 30, 30, 32, 32, - 40, 40, 40, 40, 40, 40, 40, 40, - 48, 48, 48, 48, 48, 48, 48, 48, - 56, 56, 56, 56, 56, 56, 56, 56, - 64, 64, 64, 64, 64, 64, 64, 64 - }; - - // Map bit length i to closest aligned fixed bit width that can contain i bits. 
- const uint8_t ClosestAlignedFixedBitsMap[65] = { - 1, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24, - 32, 32, 32, 32, 32, 32, 32, 32, - 40, 40, 40, 40, 40, 40, 40, 40, - 48, 48, 48, 48, 48, 48, 48, 48, - 56, 56, 56, 56, 56, 56, 56, 56, - 64, 64, 64, 64, 64, 64, 64, 64 - }; - - // Map bit width to FBS enum. - const uint8_t BitWidthToFBSMap[65] = { - FixedBitSizes::ONE, FixedBitSizes::ONE, FixedBitSizes::TWO, FixedBitSizes::THREE, FixedBitSizes::FOUR, - FixedBitSizes::FIVE, FixedBitSizes::SIX, FixedBitSizes::SEVEN, FixedBitSizes::EIGHT, - FixedBitSizes::NINE, FixedBitSizes::TEN, FixedBitSizes::ELEVEN, FixedBitSizes::TWELVE, - FixedBitSizes::THIRTEEN, FixedBitSizes::FOURTEEN, FixedBitSizes::FIFTEEN, FixedBitSizes::SIXTEEN, - FixedBitSizes::SEVENTEEN, FixedBitSizes::EIGHTEEN, FixedBitSizes::NINETEEN, FixedBitSizes::TWENTY, - FixedBitSizes::TWENTYONE, FixedBitSizes::TWENTYTWO, FixedBitSizes::TWENTYTHREE, FixedBitSizes::TWENTYFOUR, - FixedBitSizes::TWENTYSIX, FixedBitSizes::TWENTYSIX, - FixedBitSizes::TWENTYEIGHT, FixedBitSizes::TWENTYEIGHT, - FixedBitSizes::THIRTY, FixedBitSizes::THIRTY, - FixedBitSizes::THIRTYTWO, FixedBitSizes::THIRTYTWO, - FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, - FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, - FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, - FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, - FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, - FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, - FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, - FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR - }; -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with option work for additional information + * regarding copyright ownership. The ASF licenses option file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use option file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "RLEV2Util.hh" + +namespace orc { + + // Map FBS enum to bit width value. + const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 26, 28, 30, 32, 40, 48, 56, 64 + }; + + // Map bit length i to closest fixed bit width that can contain i bits. 
+ const uint8_t ClosestFixedBitsMap[65] = { + 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 26, 26, 28, 28, 30, 30, 32, 32, + 40, 40, 40, 40, 40, 40, 40, 40, + 48, 48, 48, 48, 48, 48, 48, 48, + 56, 56, 56, 56, 56, 56, 56, 56, + 64, 64, 64, 64, 64, 64, 64, 64 + }; + + // Map bit length i to closest aligned fixed bit width that can contain i bits. + const uint8_t ClosestAlignedFixedBitsMap[65] = { + 1, 1, 2, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24, + 32, 32, 32, 32, 32, 32, 32, 32, + 40, 40, 40, 40, 40, 40, 40, 40, + 48, 48, 48, 48, 48, 48, 48, 48, + 56, 56, 56, 56, 56, 56, 56, 56, + 64, 64, 64, 64, 64, 64, 64, 64 + }; + + // Map bit width to FBS enum. + const uint8_t BitWidthToFBSMap[65] = { + FixedBitSizes::ONE, FixedBitSizes::ONE, FixedBitSizes::TWO, FixedBitSizes::THREE, FixedBitSizes::FOUR, + FixedBitSizes::FIVE, FixedBitSizes::SIX, FixedBitSizes::SEVEN, FixedBitSizes::EIGHT, + FixedBitSizes::NINE, FixedBitSizes::TEN, FixedBitSizes::ELEVEN, FixedBitSizes::TWELVE, + FixedBitSizes::THIRTEEN, FixedBitSizes::FOURTEEN, FixedBitSizes::FIFTEEN, FixedBitSizes::SIXTEEN, + FixedBitSizes::SEVENTEEN, FixedBitSizes::EIGHTEEN, FixedBitSizes::NINETEEN, FixedBitSizes::TWENTY, + FixedBitSizes::TWENTYONE, FixedBitSizes::TWENTYTWO, FixedBitSizes::TWENTYTHREE, FixedBitSizes::TWENTYFOUR, + FixedBitSizes::TWENTYSIX, FixedBitSizes::TWENTYSIX, + FixedBitSizes::TWENTYEIGHT, FixedBitSizes::TWENTYEIGHT, + FixedBitSizes::THIRTY, FixedBitSizes::THIRTY, + FixedBitSizes::THIRTYTWO, FixedBitSizes::THIRTYTWO, + FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, + FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, FixedBitSizes::FORTY, + FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, + FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, FixedBitSizes::FORTYEIGHT, + FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, + FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, FixedBitSizes::FIFTYSIX, + FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, + FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR, FixedBitSizes::SIXTYFOUR + }; +} diff --git a/contrib/libs/apache/orc/c++/src/RLEV2Util.hh b/contrib/libs/apache/orc/c++/src/RLEV2Util.hh index 95a6826eaa..67a94c7c48 100644 --- a/contrib/libs/apache/orc/c++/src/RLEV2Util.hh +++ b/contrib/libs/apache/orc/c++/src/RLEV2Util.hh @@ -1,81 +1,81 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ - -#ifndef ORC_RLEV2UTIL_HH -#define ORC_RLEV2UTIL_HH - -#include "RLEv2.hh" - -namespace orc { - extern const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE]; - extern const uint8_t ClosestFixedBitsMap[65]; - extern const uint8_t ClosestAlignedFixedBitsMap[65]; - extern const uint8_t BitWidthToFBSMap[65]; - - // The input n must be less than FixedBitSizes::SIZE. - inline uint32_t decodeBitWidth(uint32_t n) { - return FBSToBitWidthMap[n]; - } - - inline uint32_t getClosestFixedBits(uint32_t n) { - if (n <= 64) { - return ClosestFixedBitsMap[n]; - } else { - return 64; - } - } - - inline uint32_t getClosestAlignedFixedBits(uint32_t n) { - if (n <= 64) { - return ClosestAlignedFixedBitsMap[n]; - } else { - return 64; - } - } - - inline uint32_t encodeBitWidth(uint32_t n) { - if (n <= 64) { - return BitWidthToFBSMap[n]; - } else { - return FixedBitSizes::SIXTYFOUR; - } - } - - inline uint32_t findClosestNumBits(int64_t value) { - if (value < 0) { - return getClosestFixedBits(64); - } - - uint32_t count = 0; - while (value != 0) { - count++; - value = value >> 1; - } - return getClosestFixedBits(count); - } - - inline bool isSafeSubtract(int64_t left, int64_t right) { - return ((left ^ right) >= 0) || ((left ^ (left - right)) >= 0); - } - - inline uint32_t RleEncoderV2::getOpCode(EncodingType encoding) { - return static_cast<uint32_t >(encoding << 6); - } -} - -#endif //ORC_RLEV2UTIL_HH +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#ifndef ORC_RLEV2UTIL_HH +#define ORC_RLEV2UTIL_HH + +#include "RLEv2.hh" + +namespace orc { + extern const uint8_t FBSToBitWidthMap[FixedBitSizes::SIZE]; + extern const uint8_t ClosestFixedBitsMap[65]; + extern const uint8_t ClosestAlignedFixedBitsMap[65]; + extern const uint8_t BitWidthToFBSMap[65]; + + // The input n must be less than FixedBitSizes::SIZE. 
+ inline uint32_t decodeBitWidth(uint32_t n) { + return FBSToBitWidthMap[n]; + } + + inline uint32_t getClosestFixedBits(uint32_t n) { + if (n <= 64) { + return ClosestFixedBitsMap[n]; + } else { + return 64; + } + } + + inline uint32_t getClosestAlignedFixedBits(uint32_t n) { + if (n <= 64) { + return ClosestAlignedFixedBitsMap[n]; + } else { + return 64; + } + } + + inline uint32_t encodeBitWidth(uint32_t n) { + if (n <= 64) { + return BitWidthToFBSMap[n]; + } else { + return FixedBitSizes::SIXTYFOUR; + } + } + + inline uint32_t findClosestNumBits(int64_t value) { + if (value < 0) { + return getClosestFixedBits(64); + } + + uint32_t count = 0; + while (value != 0) { + count++; + value = value >> 1; + } + return getClosestFixedBits(count); + } + + inline bool isSafeSubtract(int64_t left, int64_t right) { + return ((left ^ right) >= 0) || ((left ^ (left - right)) >= 0); + } + + inline uint32_t RleEncoderV2::getOpCode(EncodingType encoding) { + return static_cast<uint32_t >(encoding << 6); + } +} + +#endif //ORC_RLEV2UTIL_HH diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.cc b/contrib/libs/apache/orc/c++/src/RLEv1.cc index fe333978db..aae9726bf6 100644 --- a/contrib/libs/apache/orc/c++/src/RLEv1.cc +++ b/contrib/libs/apache/orc/c++/src/RLEv1.cc @@ -1,302 +1,302 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "Adaptor.hh" -#include "Compression.hh" -#include "orc/Exceptions.hh" -#include "RLEv1.hh" - -#include <algorithm> - -namespace orc { - -const uint64_t MINIMUM_REPEAT = 3; -const uint64_t MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT; - -const int64_t BASE_128_MASK = 0x7f; - -const int64_t MAX_DELTA = 127; -const int64_t MIN_DELTA = -128; -const uint64_t MAX_LITERAL_SIZE = 128; - -RleEncoderV1::RleEncoderV1( - std::unique_ptr<BufferedOutputStream> outStream, - bool hasSigned): - RleEncoder(std::move(outStream), hasSigned) { - literals = new int64_t[MAX_LITERAL_SIZE]; - delta = 0; - repeat = false; - tailRunLength = 0; -} - -RleEncoderV1::~RleEncoderV1() { - delete [] literals; -} - -void RleEncoderV1::writeValues() { - if (numLiterals != 0) { - if (repeat) { - writeByte(static_cast<char> - (static_cast<uint64_t>(numLiterals) - MINIMUM_REPEAT)); - writeByte(static_cast<char>(delta)); - if (isSigned) { - writeVslong(literals[0]); - } else { - writeVulong(literals[0]); - } - } else { - writeByte(static_cast<char>(-numLiterals)); - for(size_t i=0; i < numLiterals; ++i) { - if (isSigned) { - writeVslong(literals[i]); - } else { - writeVulong(literals[i]); - } - } - } - repeat = false; - numLiterals = 0; - tailRunLength = 0; - } -} - -uint64_t RleEncoderV1::flush() { - writeValues(); - outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); - uint64_t dataSize = outputStream->flush(); - bufferLength = bufferPosition = 0; - return dataSize; -} - -void RleEncoderV1::write(int64_t value) { - if (numLiterals == 0) { - literals[numLiterals++] = value; - tailRunLength = 1; - } else if (repeat) { - if (value == literals[0] + delta * static_cast<int64_t>(numLiterals)) { - numLiterals += 1; - if (numLiterals == MAXIMUM_REPEAT) { - writeValues(); - } - } else { - writeValues(); - literals[numLiterals++] = value; - tailRunLength = 1; - } - } else { - if (tailRunLength == 1) { - delta = value - literals[numLiterals - 1]; - if (delta < MIN_DELTA || delta > MAX_DELTA) { - tailRunLength = 1; - } else { - tailRunLength = 2; - } - } else if (value == literals[numLiterals - 1] + delta) { - tailRunLength += 1; - } else { - delta = value - literals[numLiterals - 1]; - if (delta < MIN_DELTA || delta > MAX_DELTA) { - tailRunLength = 1; - } else { - tailRunLength = 2; - } - } - if (tailRunLength == MINIMUM_REPEAT) { - if (numLiterals + 1 == MINIMUM_REPEAT) { - repeat = true; - numLiterals += 1; - } else { - numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1); - int64_t base = literals[numLiterals]; - writeValues(); - literals[0] = base; - repeat = true; - numLiterals = MINIMUM_REPEAT; - } - } else { - literals[numLiterals++] = value; - if (numLiterals == MAX_LITERAL_SIZE) { - writeValues(); - } - } - } -} - -signed char RleDecoderV1::readByte() { - if (bufferStart == bufferEnd) { - int bufferLength; - const void* bufferPointer; - if (!inputStream->Next(&bufferPointer, &bufferLength)) { - throw ParseError("bad read in readByte"); - } - bufferStart = static_cast<const char*>(bufferPointer); - bufferEnd = bufferStart + bufferLength; - } - return *(bufferStart++); -} - -uint64_t RleDecoderV1::readLong() { - uint64_t result = 0; - int64_t offset = 0; - signed char ch = readByte(); - if (ch >= 0) { - result = static_cast<uint64_t>(ch); - } else { - result = static_cast<uint64_t>(ch) & BASE_128_MASK; - while ((ch = readByte()) < 0) { - offset += 7; - result |= (static_cast<uint64_t>(ch) & BASE_128_MASK) << offset; - } - result |= static_cast<uint64_t>(ch) << (offset + 7); - } - return result; -} - 
-void RleDecoderV1::skipLongs(uint64_t numValues) { - while (numValues > 0) { - if (readByte() >= 0) { - --numValues; - } - } -} - -void RleDecoderV1::readHeader() { - signed char ch = readByte(); - if (ch < 0) { - remainingValues = static_cast<uint64_t>(-ch); - repeating = false; - } else { - remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT; - repeating = true; - delta = readByte(); - value = isSigned - ? unZigZag(readLong()) - : static_cast<int64_t>(readLong()); - } -} - -RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input, - bool hasSigned) - : inputStream(std::move(input)), - isSigned(hasSigned), - remainingValues(0), - value(0), - bufferStart(nullptr), - bufferEnd(bufferStart), - delta(0), - repeating(false) { -} - -void RleDecoderV1::seek(PositionProvider& location) { - // move the input stream - inputStream->seek(location); - // force a re-read from the stream - bufferEnd = bufferStart; - // read a new header - readHeader(); - // skip ahead the given number of records - skip(location.next()); -} - -void RleDecoderV1::skip(uint64_t numValues) { - while (numValues > 0) { - if (remainingValues == 0) { - readHeader(); - } - uint64_t count = std::min(numValues, remainingValues); - remainingValues -= count; - numValues -= count; - if (repeating) { - value += delta * static_cast<int64_t>(count); - } else { - skipLongs(count); - } - } -} - -void RleDecoderV1::next(int64_t* const data, - const uint64_t numValues, - const char* const notNull) { - uint64_t position = 0; - // skipNulls() - if (notNull) { - // Skip over null values. - while (position < numValues && !notNull[position]) { - ++position; - } - } - while (position < numValues) { - // If we are out of values, read more. - if (remainingValues == 0) { - readHeader(); - } - // How many do we read out of this block? - uint64_t count = std::min(numValues - position, remainingValues); - uint64_t consumed = 0; - if (repeating) { - if (notNull) { - for (uint64_t i = 0; i < count; ++i) { - if (notNull[position + i]) { - data[position + i] = value + static_cast<int64_t>(consumed) * delta; - consumed += 1; - } - } - } else { - for (uint64_t i = 0; i < count; ++i) { - data[position + i] = value + static_cast<int64_t>(i) * delta; - } - consumed = count; - } - value += static_cast<int64_t>(consumed) * delta; - } else { - if (notNull) { - for (uint64_t i = 0 ; i < count; ++i) { - if (notNull[position + i]) { - data[position + i] = isSigned - ? unZigZag(readLong()) - : static_cast<int64_t>(readLong()); - ++consumed; - } - } - } else { - if (isSigned) { - for (uint64_t i = 0; i < count; ++i) { - data[position + i] = unZigZag(readLong()); - } - } else { - for (uint64_t i = 0; i < count; ++i) { - data[position + i] = static_cast<int64_t>(readLong()); - } - } - consumed = count; - } - } - remainingValues -= consumed; - position += count; - - // skipNulls() - if (notNull) { - // Skip over null values. - while (position < numValues && !notNull[position]) { - ++position; - } - } - } -} - -} // namespace orc +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Adaptor.hh" +#include "Compression.hh" +#include "orc/Exceptions.hh" +#include "RLEv1.hh" + +#include <algorithm> + +namespace orc { + +const uint64_t MINIMUM_REPEAT = 3; +const uint64_t MAXIMUM_REPEAT = 127 + MINIMUM_REPEAT; + +const int64_t BASE_128_MASK = 0x7f; + +const int64_t MAX_DELTA = 127; +const int64_t MIN_DELTA = -128; +const uint64_t MAX_LITERAL_SIZE = 128; + +RleEncoderV1::RleEncoderV1( + std::unique_ptr<BufferedOutputStream> outStream, + bool hasSigned): + RleEncoder(std::move(outStream), hasSigned) { + literals = new int64_t[MAX_LITERAL_SIZE]; + delta = 0; + repeat = false; + tailRunLength = 0; +} + +RleEncoderV1::~RleEncoderV1() { + delete [] literals; +} + +void RleEncoderV1::writeValues() { + if (numLiterals != 0) { + if (repeat) { + writeByte(static_cast<char> + (static_cast<uint64_t>(numLiterals) - MINIMUM_REPEAT)); + writeByte(static_cast<char>(delta)); + if (isSigned) { + writeVslong(literals[0]); + } else { + writeVulong(literals[0]); + } + } else { + writeByte(static_cast<char>(-numLiterals)); + for(size_t i=0; i < numLiterals; ++i) { + if (isSigned) { + writeVslong(literals[i]); + } else { + writeVulong(literals[i]); + } + } + } + repeat = false; + numLiterals = 0; + tailRunLength = 0; + } +} + +uint64_t RleEncoderV1::flush() { + writeValues(); + outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); + uint64_t dataSize = outputStream->flush(); + bufferLength = bufferPosition = 0; + return dataSize; +} + +void RleEncoderV1::write(int64_t value) { + if (numLiterals == 0) { + literals[numLiterals++] = value; + tailRunLength = 1; + } else if (repeat) { + if (value == literals[0] + delta * static_cast<int64_t>(numLiterals)) { + numLiterals += 1; + if (numLiterals == MAXIMUM_REPEAT) { + writeValues(); + } + } else { + writeValues(); + literals[numLiterals++] = value; + tailRunLength = 1; + } + } else { + if (tailRunLength == 1) { + delta = value - literals[numLiterals - 1]; + if (delta < MIN_DELTA || delta > MAX_DELTA) { + tailRunLength = 1; + } else { + tailRunLength = 2; + } + } else if (value == literals[numLiterals - 1] + delta) { + tailRunLength += 1; + } else { + delta = value - literals[numLiterals - 1]; + if (delta < MIN_DELTA || delta > MAX_DELTA) { + tailRunLength = 1; + } else { + tailRunLength = 2; + } + } + if (tailRunLength == MINIMUM_REPEAT) { + if (numLiterals + 1 == MINIMUM_REPEAT) { + repeat = true; + numLiterals += 1; + } else { + numLiterals -= static_cast<int>(MINIMUM_REPEAT - 1); + int64_t base = literals[numLiterals]; + writeValues(); + literals[0] = base; + repeat = true; + numLiterals = MINIMUM_REPEAT; + } + } else { + literals[numLiterals++] = value; + if (numLiterals == MAX_LITERAL_SIZE) { + writeValues(); + } + } + } +} + +signed char RleDecoderV1::readByte() { + if (bufferStart == bufferEnd) { + int bufferLength; + const void* bufferPointer; + if (!inputStream->Next(&bufferPointer, &bufferLength)) { + throw ParseError("bad read in readByte"); + } + bufferStart = static_cast<const char*>(bufferPointer); + bufferEnd = bufferStart + bufferLength; + } + return *(bufferStart++); +} + +uint64_t 
RleDecoderV1::readLong() { + uint64_t result = 0; + int64_t offset = 0; + signed char ch = readByte(); + if (ch >= 0) { + result = static_cast<uint64_t>(ch); + } else { + result = static_cast<uint64_t>(ch) & BASE_128_MASK; + while ((ch = readByte()) < 0) { + offset += 7; + result |= (static_cast<uint64_t>(ch) & BASE_128_MASK) << offset; + } + result |= static_cast<uint64_t>(ch) << (offset + 7); + } + return result; +} + +void RleDecoderV1::skipLongs(uint64_t numValues) { + while (numValues > 0) { + if (readByte() >= 0) { + --numValues; + } + } +} + +void RleDecoderV1::readHeader() { + signed char ch = readByte(); + if (ch < 0) { + remainingValues = static_cast<uint64_t>(-ch); + repeating = false; + } else { + remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT; + repeating = true; + delta = readByte(); + value = isSigned + ? unZigZag(readLong()) + : static_cast<int64_t>(readLong()); + } +} + +RleDecoderV1::RleDecoderV1(std::unique_ptr<SeekableInputStream> input, + bool hasSigned) + : inputStream(std::move(input)), + isSigned(hasSigned), + remainingValues(0), + value(0), + bufferStart(nullptr), + bufferEnd(bufferStart), + delta(0), + repeating(false) { +} + +void RleDecoderV1::seek(PositionProvider& location) { + // move the input stream + inputStream->seek(location); + // force a re-read from the stream + bufferEnd = bufferStart; + // read a new header + readHeader(); + // skip ahead the given number of records + skip(location.next()); +} + +void RleDecoderV1::skip(uint64_t numValues) { + while (numValues > 0) { + if (remainingValues == 0) { + readHeader(); + } + uint64_t count = std::min(numValues, remainingValues); + remainingValues -= count; + numValues -= count; + if (repeating) { + value += delta * static_cast<int64_t>(count); + } else { + skipLongs(count); + } + } +} + +void RleDecoderV1::next(int64_t* const data, + const uint64_t numValues, + const char* const notNull) { + uint64_t position = 0; + // skipNulls() + if (notNull) { + // Skip over null values. + while (position < numValues && !notNull[position]) { + ++position; + } + } + while (position < numValues) { + // If we are out of values, read more. + if (remainingValues == 0) { + readHeader(); + } + // How many do we read out of this block? + uint64_t count = std::min(numValues - position, remainingValues); + uint64_t consumed = 0; + if (repeating) { + if (notNull) { + for (uint64_t i = 0; i < count; ++i) { + if (notNull[position + i]) { + data[position + i] = value + static_cast<int64_t>(consumed) * delta; + consumed += 1; + } + } + } else { + for (uint64_t i = 0; i < count; ++i) { + data[position + i] = value + static_cast<int64_t>(i) * delta; + } + consumed = count; + } + value += static_cast<int64_t>(consumed) * delta; + } else { + if (notNull) { + for (uint64_t i = 0 ; i < count; ++i) { + if (notNull[position + i]) { + data[position + i] = isSigned + ? unZigZag(readLong()) + : static_cast<int64_t>(readLong()); + ++consumed; + } + } + } else { + if (isSigned) { + for (uint64_t i = 0; i < count; ++i) { + data[position + i] = unZigZag(readLong()); + } + } else { + for (uint64_t i = 0; i < count; ++i) { + data[position + i] = static_cast<int64_t>(readLong()); + } + } + consumed = count; + } + } + remainingValues -= consumed; + position += count; + + // skipNulls() + if (notNull) { + // Skip over null values. 
+ while (position < numValues && !notNull[position]) { + ++position; + } + } + } +} + +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RLEv1.hh b/contrib/libs/apache/orc/c++/src/RLEv1.hh index 8e31d70873..eb0cf1d8c2 100644 --- a/contrib/libs/apache/orc/c++/src/RLEv1.hh +++ b/contrib/libs/apache/orc/c++/src/RLEv1.hh @@ -1,91 +1,91 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ - -#ifndef ORC_RLEV1_HH -#define ORC_RLEV1_HH - -#include "Adaptor.hh" -#include "RLE.hh" - -#include <memory> - -namespace orc { - -class RleEncoderV1 : public RleEncoder { -public: - RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream, - bool hasSigned); - ~RleEncoderV1() override ; - - /** - * Flushing underlying BufferedOutputStream - */ - uint64_t flush() override; - - void write(int64_t val) override; - -private: - int64_t delta; - bool repeat; - uint64_t tailRunLength; - - void writeValues(); -}; - -class RleDecoderV1 : public RleDecoder { -public: - RleDecoderV1(std::unique_ptr<SeekableInputStream> input, - bool isSigned); - - /** - * Seek to a particular spot. - */ - void seek(PositionProvider&) override; - - /** - * Seek over a given number of values. - */ - void skip(uint64_t numValues) override; - - /** - * Read a number of values into the batch. - */ - void next(int64_t* data, uint64_t numValues, - const char* notNull) override; - -private: - inline signed char readByte(); - - inline void readHeader(); - - inline uint64_t readLong(); - - inline void skipLongs(uint64_t numValues); - - const std::unique_ptr<SeekableInputStream> inputStream; - const bool isSigned; - uint64_t remainingValues; - int64_t value; - const char *bufferStart; - const char *bufferEnd; - int64_t delta; - bool repeating; -}; -} // namespace orc - -#endif // ORC_RLEV1_HH +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#ifndef ORC_RLEV1_HH +#define ORC_RLEV1_HH + +#include "Adaptor.hh" +#include "RLE.hh" + +#include <memory> + +namespace orc { + +class RleEncoderV1 : public RleEncoder { +public: + RleEncoderV1(std::unique_ptr<BufferedOutputStream> outStream, + bool hasSigned); + ~RleEncoderV1() override ; + + /** + * Flushing underlying BufferedOutputStream + */ + uint64_t flush() override; + + void write(int64_t val) override; + +private: + int64_t delta; + bool repeat; + uint64_t tailRunLength; + + void writeValues(); +}; + +class RleDecoderV1 : public RleDecoder { +public: + RleDecoderV1(std::unique_ptr<SeekableInputStream> input, + bool isSigned); + + /** + * Seek to a particular spot. + */ + void seek(PositionProvider&) override; + + /** + * Seek over a given number of values. + */ + void skip(uint64_t numValues) override; + + /** + * Read a number of values into the batch. + */ + void next(int64_t* data, uint64_t numValues, + const char* notNull) override; + +private: + inline signed char readByte(); + + inline void readHeader(); + + inline uint64_t readLong(); + + inline void skipLongs(uint64_t numValues); + + const std::unique_ptr<SeekableInputStream> inputStream; + const bool isSigned; + uint64_t remainingValues; + int64_t value; + const char *bufferStart; + const char *bufferEnd; + int64_t delta; + bool repeating; +}; +} // namespace orc + +#endif // ORC_RLEV1_HH diff --git a/contrib/libs/apache/orc/c++/src/RLEv2.hh b/contrib/libs/apache/orc/c++/src/RLEv2.hh index f85dabd9e6..5c740dfd27 100644 --- a/contrib/libs/apache/orc/c++/src/RLEv2.hh +++ b/contrib/libs/apache/orc/c++/src/RLEv2.hh @@ -1,251 +1,251 @@ -/** -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*/ - -#ifndef ORC_RLEV2_HH -#define ORC_RLEV2_HH - -#include "Adaptor.hh" -#include "orc/Exceptions.hh" -#include "RLE.hh" - -#include <vector> - -#define MIN_REPEAT 3 -#define HIST_LEN 32 -namespace orc { - -struct FixedBitSizes { - enum FBS { - ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE, - THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN, - TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX, - TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE - }; -}; - -enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 }; - -struct EncodingOption { - EncodingType encoding; - int64_t fixedDelta; - int64_t gapVsPatchListCount; - int64_t zigzagLiteralsCount; - int64_t baseRedLiteralsCount; - int64_t adjDeltasCount; - uint32_t zzBits90p; - uint32_t zzBits100p; - uint32_t brBits95p; - uint32_t brBits100p; - uint32_t bitsDeltaMax; - uint32_t patchWidth; - uint32_t patchGapWidth; - uint32_t patchLength; - int64_t min; - bool isFixedDelta; -}; - -class RleEncoderV2 : public RleEncoder { -public: - RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, bool alignBitPacking = true); - - ~RleEncoderV2() override { - delete [] literals; - delete [] gapVsPatchList; - delete [] zigzagLiterals; - delete [] baseRedLiterals; - delete [] adjDeltas; - } - /** - * Flushing underlying BufferedOutputStream - */ - uint64_t flush() override; - - void write(int64_t val) override; - -private: - - const bool alignedBitPacking; - uint32_t fixedRunLength; - uint32_t variableRunLength; - int64_t prevDelta; - int32_t histgram[HIST_LEN]; - - // The four list below should actually belong to EncodingOption since it only holds temporal values in write(int64_t val), - // it is move here for performance consideration. - int64_t* gapVsPatchList; - int64_t* zigzagLiterals; - int64_t* baseRedLiterals; - int64_t* adjDeltas; - - uint32_t getOpCode(EncodingType encoding); - void determineEncoding(EncodingOption& option); - void computeZigZagLiterals(EncodingOption& option); - void preparePatchedBlob(EncodingOption& option); - - void writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize); - void initializeLiterals(int64_t val); - void writeValues(EncodingOption& option); - void writeShortRepeatValues(EncodingOption& option); - void writeDirectValues(EncodingOption& option); - void writePatchedBasedValues(EncodingOption& option); - void writeDeltaValues(EncodingOption& option); - uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist = false); -}; - -class RleDecoderV2 : public RleDecoder { -public: - RleDecoderV2(std::unique_ptr<SeekableInputStream> input, - bool isSigned, MemoryPool& pool); - - /** - * Seek to a particular spot. - */ - void seek(PositionProvider&) override; - - /** - * Seek over a given number of values. - */ - void skip(uint64_t numValues) override; - - /** - * Read a number of values into the batch. - */ - void next(int64_t* data, uint64_t numValues, - const char* notNull) override; - -private: - - // Used by PATCHED_BASE - void adjustGapAndPatch() { - curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >> - patchBitSize; - curPatch = unpackedPatch[patchIdx] & patchMask; - actualGap = 0; - - // special case: gap is >255 then patch value will be 0. 
- // if gap is <=255 then patch value cannot be 0 - while (curGap == 255 && curPatch == 0) { - actualGap += 255; - ++patchIdx; - curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >> - patchBitSize; - curPatch = unpackedPatch[patchIdx] & patchMask; - } - // add the left over gap - actualGap += curGap; - } - - void resetReadLongs() { - bitsLeft = 0; - curByte = 0; - } - - void resetRun() { - resetReadLongs(); - bitSize = 0; - } - - unsigned char readByte() { - if (bufferStart == bufferEnd) { - int bufferLength; - const void* bufferPointer; - if (!inputStream->Next(&bufferPointer, &bufferLength)) { - throw ParseError("bad read in RleDecoderV2::readByte"); - } - bufferStart = static_cast<const char*>(bufferPointer); - bufferEnd = bufferStart + bufferLength; - } - - unsigned char result = static_cast<unsigned char>(*bufferStart++); - return result; -} - - int64_t readLongBE(uint64_t bsz); - int64_t readVslong(); - uint64_t readVulong(); - uint64_t readLongs(int64_t *data, uint64_t offset, uint64_t len, - uint64_t fb, const char* notNull = nullptr) { - uint64_t ret = 0; - - // TODO: unroll to improve performance - for(uint64_t i = offset; i < (offset + len); i++) { - // skip null positions - if (notNull && !notNull[i]) { - continue; - } - uint64_t result = 0; - uint64_t bitsLeftToRead = fb; - while (bitsLeftToRead > bitsLeft) { - result <<= bitsLeft; - result |= curByte & ((1 << bitsLeft) - 1); - bitsLeftToRead -= bitsLeft; - curByte = readByte(); - bitsLeft = 8; - } - - // handle the left over bits - if (bitsLeftToRead > 0) { - result <<= bitsLeftToRead; - bitsLeft -= static_cast<uint32_t>(bitsLeftToRead); - result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); - } - data[i] = static_cast<int64_t>(result); - ++ret; - } - - return ret; -} - - uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - uint64_t nextDelta(int64_t* data, uint64_t offset, uint64_t numValues, - const char* notNull); - - const std::unique_ptr<SeekableInputStream> inputStream; - const bool isSigned; - - unsigned char firstByte; - uint64_t runLength; - uint64_t runRead; - const char *bufferStart; - const char *bufferEnd; - int64_t deltaBase; // Used by DELTA - uint64_t byteSize; // Used by SHORT_REPEAT and PATCHED_BASE - int64_t firstValue; // Used by SHORT_REPEAT and DELTA - int64_t prevValue; // Used by DELTA - uint32_t bitSize; // Used by DIRECT, PATCHED_BASE and DELTA - uint32_t bitsLeft; // Used by anything that uses readLongs - uint32_t curByte; // Used by anything that uses readLongs - uint32_t patchBitSize; // Used by PATCHED_BASE - uint64_t unpackedIdx; // Used by PATCHED_BASE - uint64_t patchIdx; // Used by PATCHED_BASE - int64_t base; // Used by PATCHED_BASE - uint64_t curGap; // Used by PATCHED_BASE - int64_t curPatch; // Used by PATCHED_BASE - int64_t patchMask; // Used by PATCHED_BASE - int64_t actualGap; // Used by PATCHED_BASE - DataBuffer<int64_t> unpacked; // Used by PATCHED_BASE - DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE -}; -} // namespace orc - -#endif // ORC_RLEV2_HH +/** +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. 
The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#ifndef ORC_RLEV2_HH +#define ORC_RLEV2_HH + +#include "Adaptor.hh" +#include "orc/Exceptions.hh" +#include "RLE.hh" + +#include <vector> + +#define MIN_REPEAT 3 +#define HIST_LEN 32 +namespace orc { + +struct FixedBitSizes { + enum FBS { + ONE = 0, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE, + THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN, + TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX, + TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR, SIZE + }; +}; + +enum EncodingType { SHORT_REPEAT=0, DIRECT=1, PATCHED_BASE=2, DELTA=3 }; + +struct EncodingOption { + EncodingType encoding; + int64_t fixedDelta; + int64_t gapVsPatchListCount; + int64_t zigzagLiteralsCount; + int64_t baseRedLiteralsCount; + int64_t adjDeltasCount; + uint32_t zzBits90p; + uint32_t zzBits100p; + uint32_t brBits95p; + uint32_t brBits100p; + uint32_t bitsDeltaMax; + uint32_t patchWidth; + uint32_t patchGapWidth; + uint32_t patchLength; + int64_t min; + bool isFixedDelta; +}; + +class RleEncoderV2 : public RleEncoder { +public: + RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, bool hasSigned, bool alignBitPacking = true); + + ~RleEncoderV2() override { + delete [] literals; + delete [] gapVsPatchList; + delete [] zigzagLiterals; + delete [] baseRedLiterals; + delete [] adjDeltas; + } + /** + * Flushing underlying BufferedOutputStream + */ + uint64_t flush() override; + + void write(int64_t val) override; + +private: + + const bool alignedBitPacking; + uint32_t fixedRunLength; + uint32_t variableRunLength; + int64_t prevDelta; + int32_t histgram[HIST_LEN]; + + // The four list below should actually belong to EncodingOption since it only holds temporal values in write(int64_t val), + // it is move here for performance consideration. + int64_t* gapVsPatchList; + int64_t* zigzagLiterals; + int64_t* baseRedLiterals; + int64_t* adjDeltas; + + uint32_t getOpCode(EncodingType encoding); + void determineEncoding(EncodingOption& option); + void computeZigZagLiterals(EncodingOption& option); + void preparePatchedBlob(EncodingOption& option); + + void writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize); + void initializeLiterals(int64_t val); + void writeValues(EncodingOption& option); + void writeShortRepeatValues(EncodingOption& option); + void writeDirectValues(EncodingOption& option); + void writePatchedBasedValues(EncodingOption& option); + void writeDeltaValues(EncodingOption& option); + uint32_t percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist = false); +}; + +class RleDecoderV2 : public RleDecoder { +public: + RleDecoderV2(std::unique_ptr<SeekableInputStream> input, + bool isSigned, MemoryPool& pool); + + /** + * Seek to a particular spot. + */ + void seek(PositionProvider&) override; + + /** + * Seek over a given number of values. 
+ */ + void skip(uint64_t numValues) override; + + /** + * Read a number of values into the batch. + */ + void next(int64_t* data, uint64_t numValues, + const char* notNull) override; + +private: + + // Used by PATCHED_BASE + void adjustGapAndPatch() { + curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >> + patchBitSize; + curPatch = unpackedPatch[patchIdx] & patchMask; + actualGap = 0; + + // special case: gap is >255 then patch value will be 0. + // if gap is <=255 then patch value cannot be 0 + while (curGap == 255 && curPatch == 0) { + actualGap += 255; + ++patchIdx; + curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >> + patchBitSize; + curPatch = unpackedPatch[patchIdx] & patchMask; + } + // add the left over gap + actualGap += curGap; + } + + void resetReadLongs() { + bitsLeft = 0; + curByte = 0; + } + + void resetRun() { + resetReadLongs(); + bitSize = 0; + } + + unsigned char readByte() { + if (bufferStart == bufferEnd) { + int bufferLength; + const void* bufferPointer; + if (!inputStream->Next(&bufferPointer, &bufferLength)) { + throw ParseError("bad read in RleDecoderV2::readByte"); + } + bufferStart = static_cast<const char*>(bufferPointer); + bufferEnd = bufferStart + bufferLength; + } + + unsigned char result = static_cast<unsigned char>(*bufferStart++); + return result; +} + + int64_t readLongBE(uint64_t bsz); + int64_t readVslong(); + uint64_t readVulong(); + uint64_t readLongs(int64_t *data, uint64_t offset, uint64_t len, + uint64_t fb, const char* notNull = nullptr) { + uint64_t ret = 0; + + // TODO: unroll to improve performance + for(uint64_t i = offset; i < (offset + len); i++) { + // skip null positions + if (notNull && !notNull[i]) { + continue; + } + uint64_t result = 0; + uint64_t bitsLeftToRead = fb; + while (bitsLeftToRead > bitsLeft) { + result <<= bitsLeft; + result |= curByte & ((1 << bitsLeft) - 1); + bitsLeftToRead -= bitsLeft; + curByte = readByte(); + bitsLeft = 8; + } + + // handle the left over bits + if (bitsLeftToRead > 0) { + result <<= bitsLeftToRead; + bitsLeft -= static_cast<uint32_t>(bitsLeftToRead); + result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1); + } + data[i] = static_cast<int64_t>(result); + ++ret; + } + + return ret; +} + + uint64_t nextShortRepeats(int64_t* data, uint64_t offset, uint64_t numValues, + const char* notNull); + uint64_t nextDirect(int64_t* data, uint64_t offset, uint64_t numValues, + const char* notNull); + uint64_t nextPatched(int64_t* data, uint64_t offset, uint64_t numValues, + const char* notNull); + uint64_t nextDelta(int64_t* data, uint64_t offset, uint64_t numValues, + const char* notNull); + + const std::unique_ptr<SeekableInputStream> inputStream; + const bool isSigned; + + unsigned char firstByte; + uint64_t runLength; + uint64_t runRead; + const char *bufferStart; + const char *bufferEnd; + int64_t deltaBase; // Used by DELTA + uint64_t byteSize; // Used by SHORT_REPEAT and PATCHED_BASE + int64_t firstValue; // Used by SHORT_REPEAT and DELTA + int64_t prevValue; // Used by DELTA + uint32_t bitSize; // Used by DIRECT, PATCHED_BASE and DELTA + uint32_t bitsLeft; // Used by anything that uses readLongs + uint32_t curByte; // Used by anything that uses readLongs + uint32_t patchBitSize; // Used by PATCHED_BASE + uint64_t unpackedIdx; // Used by PATCHED_BASE + uint64_t patchIdx; // Used by PATCHED_BASE + int64_t base; // Used by PATCHED_BASE + uint64_t curGap; // Used by PATCHED_BASE + int64_t curPatch; // Used by PATCHED_BASE + int64_t patchMask; // Used by PATCHED_BASE + int64_t actualGap; 
// Used by PATCHED_BASE + DataBuffer<int64_t> unpacked; // Used by PATCHED_BASE + DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE +}; +} // namespace orc + +#endif // ORC_RLEV2_HH diff --git a/contrib/libs/apache/orc/c++/src/Reader.cc b/contrib/libs/apache/orc/c++/src/Reader.cc index f35106ee44..a633567a9c 100644 --- a/contrib/libs/apache/orc/c++/src/Reader.cc +++ b/contrib/libs/apache/orc/c++/src/Reader.cc @@ -1,513 +1,513 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Adaptor.hh" -#include "BloomFilter.hh" -#include "Options.hh" -#include "Reader.hh" -#include "Statistics.hh" -#include "StripeStream.hh" - -#include "wrap/coded-stream-wrapper.h" - -#include <algorithm> -#include <iostream> -#include <memory> -#include <sstream> -#include <string> -#include <vector> -#include <iterator> -#include <set> - -namespace orc { - - const WriterVersionImpl &WriterVersionImpl::VERSION_HIVE_8732() { - static const WriterVersionImpl version(WriterVersion_HIVE_8732); - return version; - } - - uint64_t getCompressionBlockSize(const proto::PostScript& ps) { - if (ps.has_compressionblocksize()) { - return ps.compressionblocksize(); - } else { - return 256 * 1024; - } - } - - CompressionKind convertCompressionKind(const proto::PostScript& ps) { - if (ps.has_compression()) { - return static_cast<CompressionKind>(ps.compression()); - } else { - throw ParseError("Unknown compression type"); - } - } - - std::string ColumnSelector::toDotColumnPath() { - if (columns.empty()) { - return std::string(); - } - std::ostringstream columnStream; - std::copy(columns.begin(), columns.end(), - std::ostream_iterator<std::string>(columnStream, ".")); - std::string columnPath = columnStream.str(); - return columnPath.substr(0, columnPath.length() - 1); - } - - - void ColumnSelector::selectChildren(std::vector<bool>& selectedColumns, const Type& type) { - size_t id = static_cast<size_t>(type.getColumnId()); - if (!selectedColumns[id]) { - selectedColumns[id] = true; - for(size_t c = id; c <= type.getMaximumColumnId(); ++c){ - selectedColumns[c] = true; - } - } - } - - /** - * Recurses over a type tree and selects the parents of every selected type. - * @return true if any child was selected. 
- */ - bool ColumnSelector::selectParents(std::vector<bool>& selectedColumns, const Type& type) { - size_t id = static_cast<size_t>(type.getColumnId()); - bool result = selectedColumns[id]; - for(uint64_t c=0; c < type.getSubtypeCount(); ++c) { - result |= selectParents(selectedColumns, *type.getSubtype(c)); - } - selectedColumns[id] = result; - return result; - } - - /** - * Recurses over a type tree and build two maps - * map<TypeName, TypeId>, map<TypeId, Type> - */ - void ColumnSelector::buildTypeNameIdMap(const Type* type) { - // map<type_id, Type*> - idTypeMap[type->getColumnId()] = type; - - if (STRUCT == type->getKind()) { - for (size_t i = 0; i < type->getSubtypeCount(); ++i) { - const std::string& fieldName = type->getFieldName(i); - columns.push_back(fieldName); - nameIdMap[toDotColumnPath()] = type->getSubtype(i)->getColumnId(); - buildTypeNameIdMap(type->getSubtype(i)); - columns.pop_back(); - } - } else { - // other non-primitive type - for (size_t j = 0; j < type->getSubtypeCount(); ++j) { - buildTypeNameIdMap(type->getSubtype(j)); - } - } - } - - void ColumnSelector::updateSelected(std::vector<bool>& selectedColumns, - const RowReaderOptions& options) { - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); - if (contents->schema->getKind() == STRUCT && options.getIndexesSet()) { - for(std::list<uint64_t>::const_iterator field = options.getInclude().begin(); - field != options.getInclude().end(); ++field) { - updateSelectedByFieldId(selectedColumns, *field); - } - } else if (contents->schema->getKind() == STRUCT && options.getNamesSet()) { - for(std::list<std::string>::const_iterator field = options.getIncludeNames().begin(); - field != options.getIncludeNames().end(); ++field) { - updateSelectedByName(selectedColumns, *field); - } - } else if (options.getTypeIdsSet()) { - for(std::list<uint64_t>::const_iterator typeId = options.getInclude().begin(); - typeId != options.getInclude().end(); ++typeId) { - updateSelectedByTypeId(selectedColumns, *typeId); - } - } else { - // default is to select all columns - std::fill(selectedColumns.begin(), selectedColumns.end(), true); - } - selectParents(selectedColumns, *contents->schema.get()); - selectedColumns[0] = true; // column 0 is selected by default - } - - void ColumnSelector::updateSelectedByFieldId(std::vector<bool>& selectedColumns, - uint64_t fieldId) { - if (fieldId < contents->schema->getSubtypeCount()) { - selectChildren(selectedColumns, *contents->schema->getSubtype(fieldId)); - } else { - std::stringstream buffer; - buffer << "Invalid column selected " << fieldId << " out of " - << contents->schema->getSubtypeCount(); - throw ParseError(buffer.str()); - } - } - - void ColumnSelector::updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId) { - if (typeId < selectedColumns.size()) { - const Type& type = *idTypeMap[typeId]; - selectChildren(selectedColumns, type); - } else { - std::stringstream buffer; - buffer << "Invalid type id selected " << typeId << " out of " - << selectedColumns.size(); - throw ParseError(buffer.str()); - } - } - - void ColumnSelector::updateSelectedByName(std::vector<bool>& selectedColumns, - const std::string& fieldName) { - std::map<std::string, uint64_t>::const_iterator ite = nameIdMap.find(fieldName); - if (ite != nameIdMap.end()) { - updateSelectedByTypeId(selectedColumns, ite->second); - } else { - throw ParseError("Invalid column selected " + fieldName); - } - } - - ColumnSelector::ColumnSelector(const FileContents* _contents): 
contents(_contents) { - buildTypeNameIdMap(contents->schema.get()); - } - - RowReaderImpl::RowReaderImpl(std::shared_ptr<FileContents> _contents, - const RowReaderOptions& opts - ): localTimezone(getLocalTimezone()), - contents(_contents), - throwOnHive11DecimalOverflow(opts.getThrowOnHive11DecimalOverflow()), - forcedScaleOnHive11Decimal(opts.getForcedScaleOnHive11Decimal()), - footer(contents->footer.get()), - firstRowOfStripe(*contents->pool, 0), - enableEncodedBlock(opts.getEnableLazyDecoding()) { - uint64_t numberOfStripes; - numberOfStripes = static_cast<uint64_t>(footer->stripes_size()); - currentStripe = numberOfStripes; - lastStripe = 0; - currentRowInStripe = 0; - rowsInCurrentStripe = 0; - uint64_t rowTotal = 0; - - firstRowOfStripe.resize(numberOfStripes); - for(size_t i=0; i < numberOfStripes; ++i) { - firstRowOfStripe[i] = rowTotal; - proto::StripeInformation stripeInfo = - footer->stripes(static_cast<int>(i)); - rowTotal += stripeInfo.numberofrows(); - bool isStripeInRange = stripeInfo.offset() >= opts.getOffset() && - stripeInfo.offset() < opts.getOffset() + opts.getLength(); - if (isStripeInRange) { - if (i < currentStripe) { - currentStripe = i; - } - if (i >= lastStripe) { - lastStripe = i + 1; - } - } - } - firstStripe = currentStripe; - - if (currentStripe == 0) { - previousRow = (std::numeric_limits<uint64_t>::max)(); - } else if (currentStripe == numberOfStripes) { - previousRow = footer->numberofrows(); - } else { - previousRow = firstRowOfStripe[firstStripe]-1; - } - - ColumnSelector column_selector(contents.get()); - column_selector.updateSelected(selectedColumns, opts); - } - - CompressionKind RowReaderImpl::getCompression() const { - return contents->compression; - } - - uint64_t RowReaderImpl::getCompressionSize() const { - return contents->blockSize; - } - - const std::vector<bool> RowReaderImpl::getSelectedColumns() const { - return selectedColumns; - } - - const Type& RowReaderImpl::getSelectedType() const { - if (selectedSchema.get() == nullptr) { - selectedSchema = buildSelectedType(contents->schema.get(), - selectedColumns); - } - return *(selectedSchema.get()); - } - - uint64_t RowReaderImpl::getRowNumber() const { - return previousRow; - } - - void RowReaderImpl::seekToRow(uint64_t rowNumber) { - // Empty file - if (lastStripe == 0) { - return; - } - - // If we are reading only a portion of the file - // (bounded by firstStripe and lastStripe), - // seeking before or after the portion of interest should return no data. - // Implement this by setting previousRow to the number of rows in the file. 
- - // seeking past lastStripe - uint64_t num_stripes = static_cast<uint64_t>(footer->stripes_size()); - if ( (lastStripe == num_stripes - && rowNumber >= footer->numberofrows()) || - (lastStripe < num_stripes - && rowNumber >= firstRowOfStripe[lastStripe]) ) { - currentStripe = num_stripes; - previousRow = footer->numberofrows(); - return; - } - - uint64_t seekToStripe = 0; - while (seekToStripe+1 < lastStripe && - firstRowOfStripe[seekToStripe+1] <= rowNumber) { - seekToStripe++; - } - - // seeking before the first stripe - if (seekToStripe < firstStripe) { - currentStripe = num_stripes; - previousRow = footer->numberofrows(); - return; - } - - currentStripe = seekToStripe; - currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe]; - previousRow = rowNumber; - startNextStripe(); - - uint64_t rowsToSkip = currentRowInStripe; - - if (footer->rowindexstride() > 0 && - currentStripeInfo.indexlength() > 0) { - uint32_t rowGroupId = - static_cast<uint32_t>(currentRowInStripe / footer->rowindexstride()); - rowsToSkip -= rowGroupId * footer->rowindexstride(); - - if (rowGroupId != 0) { - seekToRowGroup(rowGroupId); - } - } - - reader->skip(rowsToSkip); - } - - void RowReaderImpl::seekToRowGroup(uint32_t rowGroupEntryId) { - // reset all previous row indexes - rowIndexes.clear(); - - // obtain row indexes for selected columns - uint64_t offset = currentStripeInfo.offset(); - for (int i = 0; i < currentStripeFooter.streams_size(); ++i) { - const proto::Stream& pbStream = currentStripeFooter.streams(i); - uint64_t colId = pbStream.column(); - if (selectedColumns[colId] && pbStream.has_kind() - && pbStream.kind() == proto::Stream_Kind_ROW_INDEX) { - std::unique_ptr<SeekableInputStream> inStream = - createDecompressor(getCompression(), - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream - (contents->stream.get(), - offset, - pbStream.length(), - *contents->pool)), - getCompressionSize(), - *contents->pool); - - proto::RowIndex rowIndex; - if (!rowIndex.ParseFromZeroCopyStream(inStream.get())) { - throw ParseError("Failed to parse the row index"); - } - - rowIndexes[colId] = rowIndex; - } - offset += pbStream.length(); - } - - // store positions for selected columns - std::vector<std::list<uint64_t>> positions; - // store position providers for selected colimns - std::unordered_map<uint64_t, PositionProvider> positionProviders; - - for (auto rowIndex = rowIndexes.cbegin(); - rowIndex != rowIndexes.cend(); ++rowIndex) { - uint64_t colId = rowIndex->first; - const proto::RowIndexEntry& entry = - rowIndex->second.entry(static_cast<int32_t>(rowGroupEntryId)); - - // copy index positions for a specific column - positions.push_back({}); - auto& position = positions.back(); - for (int pos = 0; pos != entry.positions_size(); ++pos) { - position.push_back(entry.positions(pos)); - } - positionProviders.insert(std::make_pair(colId, PositionProvider(position))); - } - - reader->seekToRowGroup(positionProviders); - } - - const FileContents& RowReaderImpl::getFileContents() const { - return *contents; - } - - bool RowReaderImpl::getThrowOnHive11DecimalOverflow() const { - return throwOnHive11DecimalOverflow; - } - - int32_t RowReaderImpl::getForcedScaleOnHive11Decimal() const { - return forcedScaleOnHive11Decimal; - } - - proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, - const FileContents& contents) { - uint64_t stripeFooterStart = info.offset() + info.indexlength() + - info.datalength(); - uint64_t stripeFooterLength = info.footerlength(); - 
std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents.compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(contents.stream.get(), - stripeFooterStart, - stripeFooterLength, - *contents.pool)), - contents.blockSize, - *contents.pool); - proto::StripeFooter result; - if (!result.ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError(std::string("bad StripeFooter from ") + - pbStream->getName()); - } - return result; - } - - ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> _contents, - const ReaderOptions& opts, - uint64_t _fileLength, - uint64_t _postscriptLength - ): contents(std::move(_contents)), - options(opts), - fileLength(_fileLength), - postscriptLength(_postscriptLength), - footer(contents->footer.get()) { - isMetadataLoaded = false; - checkOrcVersion(); - numberOfStripes = static_cast<uint64_t>(footer->stripes_size()); - contents->schema = REDUNDANT_MOVE(convertType(footer->types(0), *footer)); - contents->blockSize = getCompressionBlockSize(*contents->postscript); - contents->compression= convertCompressionKind(*contents->postscript); - } - - std::string ReaderImpl::getSerializedFileTail() const { - proto::FileTail tail; - proto::PostScript *mutable_ps = tail.mutable_postscript(); - mutable_ps->CopyFrom(*contents->postscript); - proto::Footer *mutableFooter = tail.mutable_footer(); - mutableFooter->CopyFrom(*footer); - tail.set_filelength(fileLength); - tail.set_postscriptlength(postscriptLength); - TString result; - if (!tail.SerializeToString(&result)) { - throw ParseError("Failed to serialize file tail"); - } - return result; - } - - const ReaderOptions& ReaderImpl::getReaderOptions() const { - return options; - } - - CompressionKind ReaderImpl::getCompression() const { - return contents->compression; - } - - uint64_t ReaderImpl::getCompressionSize() const { - return contents->blockSize; - } - - uint64_t ReaderImpl::getNumberOfStripes() const { - return numberOfStripes; - } - - uint64_t ReaderImpl::getNumberOfStripeStatistics() const { - if (!isMetadataLoaded) { - readMetadata(); - } - return metadata.get() == nullptr ? 0 : - static_cast<uint64_t>(metadata->stripestats_size()); - } - - std::unique_ptr<StripeInformation> - ReaderImpl::getStripe(uint64_t stripeIndex) const { - if (stripeIndex > getNumberOfStripes()) { - throw std::logic_error("stripe index out of range"); - } - proto::StripeInformation stripeInfo = - footer->stripes(static_cast<int>(stripeIndex)); - - return std::unique_ptr<StripeInformation> - (new StripeInformationImpl - (stripeInfo.offset(), - stripeInfo.indexlength(), - stripeInfo.datalength(), - stripeInfo.footerlength(), - stripeInfo.numberofrows(), - contents->stream.get(), - *contents->pool, - contents->compression, - contents->blockSize)); - } - - FileVersion ReaderImpl::getFormatVersion() const { - if (contents->postscript->version_size() != 2) { - return FileVersion::v_0_11(); - } - return FileVersion( - contents->postscript->version(0), - contents->postscript->version(1)); - } - - uint64_t ReaderImpl::getNumberOfRows() const { - return footer->numberofrows(); - } - - WriterId ReaderImpl::getWriterId() const { - if (footer->has_writer()) { - uint32_t id = footer->writer(); +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Adaptor.hh" +#include "BloomFilter.hh" +#include "Options.hh" +#include "Reader.hh" +#include "Statistics.hh" +#include "StripeStream.hh" + +#include "wrap/coded-stream-wrapper.h" + +#include <algorithm> +#include <iostream> +#include <memory> +#include <sstream> +#include <string> +#include <vector> +#include <iterator> +#include <set> + +namespace orc { + + const WriterVersionImpl &WriterVersionImpl::VERSION_HIVE_8732() { + static const WriterVersionImpl version(WriterVersion_HIVE_8732); + return version; + } + + uint64_t getCompressionBlockSize(const proto::PostScript& ps) { + if (ps.has_compressionblocksize()) { + return ps.compressionblocksize(); + } else { + return 256 * 1024; + } + } + + CompressionKind convertCompressionKind(const proto::PostScript& ps) { + if (ps.has_compression()) { + return static_cast<CompressionKind>(ps.compression()); + } else { + throw ParseError("Unknown compression type"); + } + } + + std::string ColumnSelector::toDotColumnPath() { + if (columns.empty()) { + return std::string(); + } + std::ostringstream columnStream; + std::copy(columns.begin(), columns.end(), + std::ostream_iterator<std::string>(columnStream, ".")); + std::string columnPath = columnStream.str(); + return columnPath.substr(0, columnPath.length() - 1); + } + + + void ColumnSelector::selectChildren(std::vector<bool>& selectedColumns, const Type& type) { + size_t id = static_cast<size_t>(type.getColumnId()); + if (!selectedColumns[id]) { + selectedColumns[id] = true; + for(size_t c = id; c <= type.getMaximumColumnId(); ++c){ + selectedColumns[c] = true; + } + } + } + + /** + * Recurses over a type tree and selects the parents of every selected type. + * @return true if any child was selected. 
+ */ + bool ColumnSelector::selectParents(std::vector<bool>& selectedColumns, const Type& type) { + size_t id = static_cast<size_t>(type.getColumnId()); + bool result = selectedColumns[id]; + for(uint64_t c=0; c < type.getSubtypeCount(); ++c) { + result |= selectParents(selectedColumns, *type.getSubtype(c)); + } + selectedColumns[id] = result; + return result; + } + + /** + * Recurses over a type tree and build two maps + * map<TypeName, TypeId>, map<TypeId, Type> + */ + void ColumnSelector::buildTypeNameIdMap(const Type* type) { + // map<type_id, Type*> + idTypeMap[type->getColumnId()] = type; + + if (STRUCT == type->getKind()) { + for (size_t i = 0; i < type->getSubtypeCount(); ++i) { + const std::string& fieldName = type->getFieldName(i); + columns.push_back(fieldName); + nameIdMap[toDotColumnPath()] = type->getSubtype(i)->getColumnId(); + buildTypeNameIdMap(type->getSubtype(i)); + columns.pop_back(); + } + } else { + // other non-primitive type + for (size_t j = 0; j < type->getSubtypeCount(); ++j) { + buildTypeNameIdMap(type->getSubtype(j)); + } + } + } + + void ColumnSelector::updateSelected(std::vector<bool>& selectedColumns, + const RowReaderOptions& options) { + selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); + if (contents->schema->getKind() == STRUCT && options.getIndexesSet()) { + for(std::list<uint64_t>::const_iterator field = options.getInclude().begin(); + field != options.getInclude().end(); ++field) { + updateSelectedByFieldId(selectedColumns, *field); + } + } else if (contents->schema->getKind() == STRUCT && options.getNamesSet()) { + for(std::list<std::string>::const_iterator field = options.getIncludeNames().begin(); + field != options.getIncludeNames().end(); ++field) { + updateSelectedByName(selectedColumns, *field); + } + } else if (options.getTypeIdsSet()) { + for(std::list<uint64_t>::const_iterator typeId = options.getInclude().begin(); + typeId != options.getInclude().end(); ++typeId) { + updateSelectedByTypeId(selectedColumns, *typeId); + } + } else { + // default is to select all columns + std::fill(selectedColumns.begin(), selectedColumns.end(), true); + } + selectParents(selectedColumns, *contents->schema.get()); + selectedColumns[0] = true; // column 0 is selected by default + } + + void ColumnSelector::updateSelectedByFieldId(std::vector<bool>& selectedColumns, + uint64_t fieldId) { + if (fieldId < contents->schema->getSubtypeCount()) { + selectChildren(selectedColumns, *contents->schema->getSubtype(fieldId)); + } else { + std::stringstream buffer; + buffer << "Invalid column selected " << fieldId << " out of " + << contents->schema->getSubtypeCount(); + throw ParseError(buffer.str()); + } + } + + void ColumnSelector::updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId) { + if (typeId < selectedColumns.size()) { + const Type& type = *idTypeMap[typeId]; + selectChildren(selectedColumns, type); + } else { + std::stringstream buffer; + buffer << "Invalid type id selected " << typeId << " out of " + << selectedColumns.size(); + throw ParseError(buffer.str()); + } + } + + void ColumnSelector::updateSelectedByName(std::vector<bool>& selectedColumns, + const std::string& fieldName) { + std::map<std::string, uint64_t>::const_iterator ite = nameIdMap.find(fieldName); + if (ite != nameIdMap.end()) { + updateSelectedByTypeId(selectedColumns, ite->second); + } else { + throw ParseError("Invalid column selected " + fieldName); + } + } + + ColumnSelector::ColumnSelector(const FileContents* _contents): 
contents(_contents) { + buildTypeNameIdMap(contents->schema.get()); + } + + RowReaderImpl::RowReaderImpl(std::shared_ptr<FileContents> _contents, + const RowReaderOptions& opts + ): localTimezone(getLocalTimezone()), + contents(_contents), + throwOnHive11DecimalOverflow(opts.getThrowOnHive11DecimalOverflow()), + forcedScaleOnHive11Decimal(opts.getForcedScaleOnHive11Decimal()), + footer(contents->footer.get()), + firstRowOfStripe(*contents->pool, 0), + enableEncodedBlock(opts.getEnableLazyDecoding()) { + uint64_t numberOfStripes; + numberOfStripes = static_cast<uint64_t>(footer->stripes_size()); + currentStripe = numberOfStripes; + lastStripe = 0; + currentRowInStripe = 0; + rowsInCurrentStripe = 0; + uint64_t rowTotal = 0; + + firstRowOfStripe.resize(numberOfStripes); + for(size_t i=0; i < numberOfStripes; ++i) { + firstRowOfStripe[i] = rowTotal; + proto::StripeInformation stripeInfo = + footer->stripes(static_cast<int>(i)); + rowTotal += stripeInfo.numberofrows(); + bool isStripeInRange = stripeInfo.offset() >= opts.getOffset() && + stripeInfo.offset() < opts.getOffset() + opts.getLength(); + if (isStripeInRange) { + if (i < currentStripe) { + currentStripe = i; + } + if (i >= lastStripe) { + lastStripe = i + 1; + } + } + } + firstStripe = currentStripe; + + if (currentStripe == 0) { + previousRow = (std::numeric_limits<uint64_t>::max)(); + } else if (currentStripe == numberOfStripes) { + previousRow = footer->numberofrows(); + } else { + previousRow = firstRowOfStripe[firstStripe]-1; + } + + ColumnSelector column_selector(contents.get()); + column_selector.updateSelected(selectedColumns, opts); + } + + CompressionKind RowReaderImpl::getCompression() const { + return contents->compression; + } + + uint64_t RowReaderImpl::getCompressionSize() const { + return contents->blockSize; + } + + const std::vector<bool> RowReaderImpl::getSelectedColumns() const { + return selectedColumns; + } + + const Type& RowReaderImpl::getSelectedType() const { + if (selectedSchema.get() == nullptr) { + selectedSchema = buildSelectedType(contents->schema.get(), + selectedColumns); + } + return *(selectedSchema.get()); + } + + uint64_t RowReaderImpl::getRowNumber() const { + return previousRow; + } + + void RowReaderImpl::seekToRow(uint64_t rowNumber) { + // Empty file + if (lastStripe == 0) { + return; + } + + // If we are reading only a portion of the file + // (bounded by firstStripe and lastStripe), + // seeking before or after the portion of interest should return no data. + // Implement this by setting previousRow to the number of rows in the file. 
+ + // seeking past lastStripe + uint64_t num_stripes = static_cast<uint64_t>(footer->stripes_size()); + if ( (lastStripe == num_stripes + && rowNumber >= footer->numberofrows()) || + (lastStripe < num_stripes + && rowNumber >= firstRowOfStripe[lastStripe]) ) { + currentStripe = num_stripes; + previousRow = footer->numberofrows(); + return; + } + + uint64_t seekToStripe = 0; + while (seekToStripe+1 < lastStripe && + firstRowOfStripe[seekToStripe+1] <= rowNumber) { + seekToStripe++; + } + + // seeking before the first stripe + if (seekToStripe < firstStripe) { + currentStripe = num_stripes; + previousRow = footer->numberofrows(); + return; + } + + currentStripe = seekToStripe; + currentRowInStripe = rowNumber - firstRowOfStripe[currentStripe]; + previousRow = rowNumber; + startNextStripe(); + + uint64_t rowsToSkip = currentRowInStripe; + + if (footer->rowindexstride() > 0 && + currentStripeInfo.indexlength() > 0) { + uint32_t rowGroupId = + static_cast<uint32_t>(currentRowInStripe / footer->rowindexstride()); + rowsToSkip -= rowGroupId * footer->rowindexstride(); + + if (rowGroupId != 0) { + seekToRowGroup(rowGroupId); + } + } + + reader->skip(rowsToSkip); + } + + void RowReaderImpl::seekToRowGroup(uint32_t rowGroupEntryId) { + // reset all previous row indexes + rowIndexes.clear(); + + // obtain row indexes for selected columns + uint64_t offset = currentStripeInfo.offset(); + for (int i = 0; i < currentStripeFooter.streams_size(); ++i) { + const proto::Stream& pbStream = currentStripeFooter.streams(i); + uint64_t colId = pbStream.column(); + if (selectedColumns[colId] && pbStream.has_kind() + && pbStream.kind() == proto::Stream_Kind_ROW_INDEX) { + std::unique_ptr<SeekableInputStream> inStream = + createDecompressor(getCompression(), + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream + (contents->stream.get(), + offset, + pbStream.length(), + *contents->pool)), + getCompressionSize(), + *contents->pool); + + proto::RowIndex rowIndex; + if (!rowIndex.ParseFromZeroCopyStream(inStream.get())) { + throw ParseError("Failed to parse the row index"); + } + + rowIndexes[colId] = rowIndex; + } + offset += pbStream.length(); + } + + // store positions for selected columns + std::vector<std::list<uint64_t>> positions; + // store position providers for selected colimns + std::unordered_map<uint64_t, PositionProvider> positionProviders; + + for (auto rowIndex = rowIndexes.cbegin(); + rowIndex != rowIndexes.cend(); ++rowIndex) { + uint64_t colId = rowIndex->first; + const proto::RowIndexEntry& entry = + rowIndex->second.entry(static_cast<int32_t>(rowGroupEntryId)); + + // copy index positions for a specific column + positions.push_back({}); + auto& position = positions.back(); + for (int pos = 0; pos != entry.positions_size(); ++pos) { + position.push_back(entry.positions(pos)); + } + positionProviders.insert(std::make_pair(colId, PositionProvider(position))); + } + + reader->seekToRowGroup(positionProviders); + } + + const FileContents& RowReaderImpl::getFileContents() const { + return *contents; + } + + bool RowReaderImpl::getThrowOnHive11DecimalOverflow() const { + return throwOnHive11DecimalOverflow; + } + + int32_t RowReaderImpl::getForcedScaleOnHive11Decimal() const { + return forcedScaleOnHive11Decimal; + } + + proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, + const FileContents& contents) { + uint64_t stripeFooterStart = info.offset() + info.indexlength() + + info.datalength(); + uint64_t stripeFooterLength = info.footerlength(); + 
std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(contents.compression, + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream(contents.stream.get(), + stripeFooterStart, + stripeFooterLength, + *contents.pool)), + contents.blockSize, + *contents.pool); + proto::StripeFooter result; + if (!result.ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError(std::string("bad StripeFooter from ") + + pbStream->getName()); + } + return result; + } + + ReaderImpl::ReaderImpl(std::shared_ptr<FileContents> _contents, + const ReaderOptions& opts, + uint64_t _fileLength, + uint64_t _postscriptLength + ): contents(std::move(_contents)), + options(opts), + fileLength(_fileLength), + postscriptLength(_postscriptLength), + footer(contents->footer.get()) { + isMetadataLoaded = false; + checkOrcVersion(); + numberOfStripes = static_cast<uint64_t>(footer->stripes_size()); + contents->schema = REDUNDANT_MOVE(convertType(footer->types(0), *footer)); + contents->blockSize = getCompressionBlockSize(*contents->postscript); + contents->compression= convertCompressionKind(*contents->postscript); + } + + std::string ReaderImpl::getSerializedFileTail() const { + proto::FileTail tail; + proto::PostScript *mutable_ps = tail.mutable_postscript(); + mutable_ps->CopyFrom(*contents->postscript); + proto::Footer *mutableFooter = tail.mutable_footer(); + mutableFooter->CopyFrom(*footer); + tail.set_filelength(fileLength); + tail.set_postscriptlength(postscriptLength); + TString result; + if (!tail.SerializeToString(&result)) { + throw ParseError("Failed to serialize file tail"); + } + return result; + } + + const ReaderOptions& ReaderImpl::getReaderOptions() const { + return options; + } + + CompressionKind ReaderImpl::getCompression() const { + return contents->compression; + } + + uint64_t ReaderImpl::getCompressionSize() const { + return contents->blockSize; + } + + uint64_t ReaderImpl::getNumberOfStripes() const { + return numberOfStripes; + } + + uint64_t ReaderImpl::getNumberOfStripeStatistics() const { + if (!isMetadataLoaded) { + readMetadata(); + } + return metadata.get() == nullptr ? 
0 : + static_cast<uint64_t>(metadata->stripestats_size()); + } + + std::unique_ptr<StripeInformation> + ReaderImpl::getStripe(uint64_t stripeIndex) const { + if (stripeIndex > getNumberOfStripes()) { + throw std::logic_error("stripe index out of range"); + } + proto::StripeInformation stripeInfo = + footer->stripes(static_cast<int>(stripeIndex)); + + return std::unique_ptr<StripeInformation> + (new StripeInformationImpl + (stripeInfo.offset(), + stripeInfo.indexlength(), + stripeInfo.datalength(), + stripeInfo.footerlength(), + stripeInfo.numberofrows(), + contents->stream.get(), + *contents->pool, + contents->compression, + contents->blockSize)); + } + + FileVersion ReaderImpl::getFormatVersion() const { + if (contents->postscript->version_size() != 2) { + return FileVersion::v_0_11(); + } + return FileVersion( + contents->postscript->version(0), + contents->postscript->version(1)); + } + + uint64_t ReaderImpl::getNumberOfRows() const { + return footer->numberofrows(); + } + + WriterId ReaderImpl::getWriterId() const { + if (footer->has_writer()) { + uint32_t id = footer->writer(); if (id > WriterId::TRINO_WRITER) { - return WriterId::UNKNOWN_WRITER; - } else { - return static_cast<WriterId>(id); - } - } - return WriterId::ORC_JAVA_WRITER; - } - - uint32_t ReaderImpl::getWriterIdValue() const { - if (footer->has_writer()) { - return footer->writer(); - } else { - return WriterId::ORC_JAVA_WRITER; - } - } - + return WriterId::UNKNOWN_WRITER; + } else { + return static_cast<WriterId>(id); + } + } + return WriterId::ORC_JAVA_WRITER; + } + + uint32_t ReaderImpl::getWriterIdValue() const { + if (footer->has_writer()) { + return footer->writer(); + } else { + return WriterId::ORC_JAVA_WRITER; + } + } + std::string ReaderImpl::getSoftwareVersion() const { std::ostringstream buffer; buffer << writerIdToString(getWriterIdValue()); @@ -517,704 +517,704 @@ namespace orc { return buffer.str(); } - WriterVersion ReaderImpl::getWriterVersion() const { - if (!contents->postscript->has_writerversion()) { - return WriterVersion_ORIGINAL; - } - return static_cast<WriterVersion>(contents->postscript->writerversion()); - } - - uint64_t ReaderImpl::getContentLength() const { - return footer->contentlength(); - } - - uint64_t ReaderImpl::getStripeStatisticsLength() const { - return contents->postscript->metadatalength(); - } - - uint64_t ReaderImpl::getFileFooterLength() const { - return contents->postscript->footerlength(); - } - - uint64_t ReaderImpl::getFilePostscriptLength() const { - return postscriptLength; - } - - uint64_t ReaderImpl::getFileLength() const { - return fileLength; - } - - uint64_t ReaderImpl::getRowIndexStride() const { - return footer->rowindexstride(); - } - - const std::string& ReaderImpl::getStreamName() const { - return contents->stream->getName(); - } - - std::list<std::string> ReaderImpl::getMetadataKeys() const { - std::list<std::string> result; - for(int i=0; i < footer->metadata_size(); ++i) { - result.push_back(footer->metadata(i).name()); - } - return result; - } - - std::string ReaderImpl::getMetadataValue(const std::string& key) const { - for(int i=0; i < footer->metadata_size(); ++i) { - if (footer->metadata(i).name() == TString(key)) { - return footer->metadata(i).value(); - } - } - throw std::range_error("key not found"); - } - - void ReaderImpl::getRowIndexStatistics(const proto::StripeInformation& stripeInfo, - uint64_t stripeIndex, const proto::StripeFooter& currentStripeFooter, - std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const { - int num_streams 
= currentStripeFooter.streams_size(); - uint64_t offset = stripeInfo.offset(); - uint64_t indexEnd = stripeInfo.offset() + stripeInfo.indexlength(); - for (int i = 0; i < num_streams; i++) { - const proto::Stream& stream = currentStripeFooter.streams(i); - StreamKind streamKind = static_cast<StreamKind>(stream.kind()); - uint64_t length = static_cast<uint64_t>(stream.length()); - if (streamKind == StreamKind::StreamKind_ROW_INDEX) { - if (offset + length > indexEnd) { - std::stringstream msg; - msg << "Malformed RowIndex stream meta in stripe " << stripeIndex - << ": streamOffset=" << offset << ", streamLength=" << length - << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength=" - << stripeInfo.indexlength(); - throw ParseError(msg.str()); - } - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents->compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(contents->stream.get(), - offset, - length, - *contents->pool)), - contents->blockSize, - *(contents->pool)); - - proto::RowIndex rowIndex; - if (!rowIndex.ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError("Failed to parse RowIndex from stripe footer"); - } - int num_entries = rowIndex.entry_size(); - size_t column = static_cast<size_t>(stream.column()); - for (int j = 0; j < num_entries; j++) { - const proto::RowIndexEntry& entry = rowIndex.entry(j); - (*indexStats)[column].push_back(entry.statistics()); - } - } - offset += length; - } - } - - bool ReaderImpl::hasMetadataValue(const std::string& key) const { - for(int i=0; i < footer->metadata_size(); ++i) { - if (footer->metadata(i).name() == TString(key)) { - return true; - } - } - return false; - } - - const Type& ReaderImpl::getType() const { - return *(contents->schema.get()); - } - - std::unique_ptr<StripeStatistics> - ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const { - if (!isMetadataLoaded) { - readMetadata(); - } - if (metadata.get() == nullptr) { - throw std::logic_error("No stripe statistics in file"); - } - size_t num_cols = static_cast<size_t>( - metadata->stripestats( - static_cast<int>(stripeIndex)).colstats_size()); - std::vector<std::vector<proto::ColumnStatistics> > indexStats(num_cols); - - proto::StripeInformation currentStripeInfo = - footer->stripes(static_cast<int>(stripeIndex)); - proto::StripeFooter currentStripeFooter = - getStripeFooter(currentStripeInfo, *contents.get()); - - getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); - - const Timezone& writerTZ = - currentStripeFooter.has_writertimezone() ? 
- getTimezoneByName(currentStripeFooter.writertimezone()) : - getLocalTimezone(); - StatContext statContext(hasCorrectStatistics(), &writerTZ); - return std::unique_ptr<StripeStatistics> - (new StripeStatisticsImpl(metadata->stripestats(static_cast<int>(stripeIndex)), - indexStats, statContext)); - } - - std::unique_ptr<Statistics> ReaderImpl::getStatistics() const { - StatContext statContext(hasCorrectStatistics()); - return std::unique_ptr<Statistics> - (new StatisticsImpl(*footer, statContext)); - } - - std::unique_ptr<ColumnStatistics> - ReaderImpl::getColumnStatistics(uint32_t index) const { - if (index >= static_cast<uint64_t>(footer->statistics_size())) { - throw std::logic_error("column index out of range"); - } - proto::ColumnStatistics col = - footer->statistics(static_cast<int32_t>(index)); - - StatContext statContext(hasCorrectStatistics()); - return std::unique_ptr<ColumnStatistics> (convertColumnStatistics(col, statContext)); - } - - void ReaderImpl::readMetadata() const { - uint64_t metadataSize = contents->postscript->metadatalength(); - uint64_t footerLength = contents->postscript->footerlength(); - if (fileLength < metadataSize + footerLength + postscriptLength + 1) { - std::stringstream msg; - msg << "Invalid Metadata length: fileLength=" << fileLength - << ", metadataLength=" << metadataSize << ", footerLength=" << footerLength - << ", postscriptLength=" << postscriptLength; - throw ParseError(msg.str()); - } - uint64_t metadataStart = fileLength - metadataSize - footerLength - postscriptLength - 1; - if (metadataSize != 0) { - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents->compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(contents->stream.get(), - metadataStart, - metadataSize, - *contents->pool)), - contents->blockSize, - *contents->pool); - metadata.reset(new proto::Metadata()); - if (!metadata->ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError("Failed to parse the metadata"); - } - } - isMetadataLoaded = true; - } - - bool ReaderImpl::hasCorrectStatistics() const { - return !WriterVersionImpl::VERSION_HIVE_8732().compareGT(getWriterVersion()); - } - - void ReaderImpl::checkOrcVersion() { - FileVersion version = getFormatVersion(); - if (version != FileVersion(0, 11) && version != FileVersion(0, 12)) { - *(options.getErrorStream()) - << "Warning: ORC file " << contents->stream->getName() - << " was written in an unknown format version " - << version.toString() << "\n"; - } - } - - std::unique_ptr<RowReader> ReaderImpl::createRowReader() const { - RowReaderOptions defaultOpts; - return createRowReader(defaultOpts); - } - - std::unique_ptr<RowReader> ReaderImpl::createRowReader( - const RowReaderOptions& opts) const { - return std::unique_ptr<RowReader>(new RowReaderImpl(contents, opts)); - } - - uint64_t maxStreamsForType(const proto::Type& type) { - switch (static_cast<int64_t>(type.kind())) { - case proto::Type_Kind_STRUCT: - return 1; - case proto::Type_Kind_INT: - case proto::Type_Kind_LONG: - case proto::Type_Kind_SHORT: - case proto::Type_Kind_FLOAT: - case proto::Type_Kind_DOUBLE: - case proto::Type_Kind_BOOLEAN: - case proto::Type_Kind_BYTE: - case proto::Type_Kind_DATE: - case proto::Type_Kind_LIST: - case proto::Type_Kind_MAP: - case proto::Type_Kind_UNION: - return 2; - case proto::Type_Kind_BINARY: - case proto::Type_Kind_DECIMAL: - case proto::Type_Kind_TIMESTAMP: - return 3; - case proto::Type_Kind_CHAR: - case proto::Type_Kind_STRING: - case proto::Type_Kind_VARCHAR: - return 
4; - default: - return 0; - } - } - - uint64_t ReaderImpl::getMemoryUse(int stripeIx) { - std::vector<bool> selectedColumns; - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), true); - return getMemoryUse(stripeIx, selectedColumns); - } - - uint64_t ReaderImpl::getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx) { - std::vector<bool> selectedColumns; - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); - ColumnSelector column_selector(contents.get()); - if (contents->schema->getKind() == STRUCT && include.begin() != include.end()) { - for(std::list<uint64_t>::const_iterator field = include.begin(); - field != include.end(); ++field) { - column_selector.updateSelectedByFieldId(selectedColumns, *field); - } - } else { - // default is to select all columns - std::fill(selectedColumns.begin(), selectedColumns.end(), true); - } - column_selector.selectParents(selectedColumns, *contents->schema.get()); - selectedColumns[0] = true; // column 0 is selected by default - return getMemoryUse(stripeIx, selectedColumns); - } - - uint64_t ReaderImpl::getMemoryUseByName(const std::list<std::string>& names, int stripeIx) { - std::vector<bool> selectedColumns; - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); - ColumnSelector column_selector(contents.get()); - if (contents->schema->getKind() == STRUCT && names.begin() != names.end()) { - for(std::list<std::string>::const_iterator field = names.begin(); - field != names.end(); ++field) { - column_selector.updateSelectedByName(selectedColumns, *field); - } - } else { - // default is to select all columns - std::fill(selectedColumns.begin(), selectedColumns.end(), true); - } - column_selector.selectParents(selectedColumns, *contents->schema.get()); - selectedColumns[0] = true; // column 0 is selected by default - return getMemoryUse(stripeIx, selectedColumns); - } - - uint64_t ReaderImpl::getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx) { - std::vector<bool> selectedColumns; - selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); - ColumnSelector column_selector(contents.get()); - if (include.begin() != include.end()) { - for(std::list<uint64_t>::const_iterator field = include.begin(); - field != include.end(); ++field) { - column_selector.updateSelectedByTypeId(selectedColumns, *field); - } - } else { - // default is to select all columns - std::fill(selectedColumns.begin(), selectedColumns.end(), true); - } - column_selector.selectParents(selectedColumns, *contents->schema.get()); - selectedColumns[0] = true; // column 0 is selected by default - return getMemoryUse(stripeIx, selectedColumns); - } - - uint64_t ReaderImpl::getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns) { - uint64_t maxDataLength = 0; - - if (stripeIx >= 0 && stripeIx < footer->stripes_size()) { - uint64_t stripe = footer->stripes(stripeIx).datalength(); - if (maxDataLength < stripe) { - maxDataLength = stripe; - } - } else { - for (int i=0; i < footer->stripes_size(); i++) { - uint64_t stripe = footer->stripes(i).datalength(); - if (maxDataLength < stripe) { - maxDataLength = stripe; - } - } - } - - bool hasStringColumn = false; - uint64_t nSelectedStreams = 0; - for (int i=0; !hasStringColumn && i < footer->types_size(); i++) { - if (selectedColumns[static_cast<size_t>(i)]) { - const proto::Type& type = footer->types(i); - nSelectedStreams += maxStreamsForType(type) ; - switch 
(static_cast<int64_t>(type.kind())) { - case proto::Type_Kind_CHAR: - case proto::Type_Kind_STRING: - case proto::Type_Kind_VARCHAR: - case proto::Type_Kind_BINARY: { - hasStringColumn = true; - break; - } - default: { - break; - } - } - } - } - - /* If a string column is read, use stripe datalength as a memory estimate - * because we don't know the dictionary size. Multiply by 2 because - * a string column requires two buffers: - * in the input stream and in the seekable input stream. - * If no string column is read, estimate from the number of streams. - */ - uint64_t memory = hasStringColumn ? 2 * maxDataLength : - std::min(uint64_t(maxDataLength), - nSelectedStreams * contents->stream->getNaturalReadSize()); - - // Do we need even more memory to read the footer or the metadata? - if (memory < contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS) { - memory = contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS; - } - if (memory < contents->postscript->metadatalength()) { - memory = contents->postscript->metadatalength(); - } - - // Account for firstRowOfStripe. - memory += static_cast<uint64_t>(footer->stripes_size()) * sizeof(uint64_t); - - // Decompressors need buffers for each stream - uint64_t decompressorMemory = 0; - if (contents->compression != CompressionKind_NONE) { - for (int i=0; i < footer->types_size(); i++) { - if (selectedColumns[static_cast<size_t>(i)]) { - const proto::Type& type = footer->types(i); - decompressorMemory += maxStreamsForType(type) * contents->blockSize; - } - } - if (contents->compression == CompressionKind_SNAPPY) { - decompressorMemory *= 2; // Snappy decompressor uses a second buffer - } - } - - return memory + decompressorMemory ; - } - - void RowReaderImpl::startNextStripe() { - reader.reset(); // ColumnReaders use lots of memory; free old memory first - currentStripeInfo = footer->stripes(static_cast<int>(currentStripe)); - uint64_t fileLength = contents->stream->getLength(); - if (currentStripeInfo.offset() + currentStripeInfo.indexlength() + - currentStripeInfo.datalength() + currentStripeInfo.footerlength() >= fileLength) { - std::stringstream msg; - msg << "Malformed StripeInformation at stripe index " << currentStripe << ": fileLength=" - << fileLength << ", StripeInfo=(offset=" << currentStripeInfo.offset() << ", indexLength=" - << currentStripeInfo.indexlength() << ", dataLength=" << currentStripeInfo.datalength() - << ", footerLength=" << currentStripeInfo.footerlength() << ")"; - throw ParseError(msg.str()); - } - currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get()); - rowsInCurrentStripe = currentStripeInfo.numberofrows(); - const Timezone& writerTimezone = - currentStripeFooter.has_writertimezone() ? 
- getTimezoneByName(currentStripeFooter.writertimezone()) : - localTimezone; - StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo, - currentStripeFooter, - currentStripeInfo.offset(), - *(contents->stream.get()), - writerTimezone); - reader = buildReader(*contents->schema.get(), stripeStreams); - } - - bool RowReaderImpl::next(ColumnVectorBatch& data) { - if (currentStripe >= lastStripe) { - data.numElements = 0; - if (lastStripe > 0) { - previousRow = firstRowOfStripe[lastStripe - 1] + - footer->stripes(static_cast<int>(lastStripe - 1)).numberofrows(); - } else { - previousRow = 0; - } - return false; - } - if (currentRowInStripe == 0) { - startNextStripe(); - } - uint64_t rowsToRead = - std::min(static_cast<uint64_t>(data.capacity), - rowsInCurrentStripe - currentRowInStripe); - data.numElements = rowsToRead; - if (enableEncodedBlock) { - reader->nextEncoded(data, rowsToRead, nullptr); - } - else { - reader->next(data, rowsToRead, nullptr); - } - // update row number - previousRow = firstRowOfStripe[currentStripe] + currentRowInStripe; - currentRowInStripe += rowsToRead; - if (currentRowInStripe >= rowsInCurrentStripe) { - currentStripe += 1; - currentRowInStripe = 0; - } - return rowsToRead != 0; - } - - std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch - (uint64_t capacity) const { - return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock); - } - - void ensureOrcFooter(InputStream* stream, - DataBuffer<char> *buffer, - uint64_t postscriptLength) { - - const std::string MAGIC("ORC"); - const uint64_t magicLength = MAGIC.length(); - const char * const bufferStart = buffer->data(); - const uint64_t bufferLength = buffer->size(); - - if (postscriptLength < magicLength || bufferLength < magicLength) { - throw ParseError("Invalid ORC postscript length"); - } - const char* magicStart = bufferStart + bufferLength - 1 - magicLength; - - // Look for the magic string at the end of the postscript. - if (memcmp(magicStart, MAGIC.c_str(), magicLength) != 0) { - // If there is no magic string at the end, check the beginning. - // Only files written by Hive 0.11.0 don't have the tail ORC string. - std::unique_ptr<char[]> frontBuffer( new char[magicLength] ); - stream->read(frontBuffer.get(), magicLength, 0); - bool foundMatch = memcmp(frontBuffer.get(), MAGIC.c_str(), magicLength) == 0; - - if (!foundMatch) { - throw ParseError("Not an ORC file"); - } - } - } - - /** - * Read the file's postscript from the given buffer. - * @param stream the file stream - * @param buffer the buffer with the tail of the file. 
- * @param postscriptSize the length of postscript in bytes - */ - std::unique_ptr<proto::PostScript> readPostscript(InputStream *stream, - DataBuffer<char> *buffer, - uint64_t postscriptSize) { - char *ptr = buffer->data(); - uint64_t readSize = buffer->size(); - - ensureOrcFooter(stream, buffer, postscriptSize); - - std::unique_ptr<proto::PostScript> postscript = - std::unique_ptr<proto::PostScript>(new proto::PostScript()); - if (readSize < 1 + postscriptSize) { - std::stringstream msg; - msg << "Invalid ORC postscript length: " << postscriptSize << ", file length = " - << stream->getLength(); - throw ParseError(msg.str()); - } - if (!postscript->ParseFromArray(ptr + readSize - 1 - postscriptSize, - static_cast<int>(postscriptSize))) { - throw ParseError("Failed to parse the postscript from " + - stream->getName()); - } - return REDUNDANT_MOVE(postscript); - } - - /** - * Check that indices in the type tree are valid, so we won't crash - * when we convert the proto::Types to TypeImpls. - */ - void checkProtoTypeIds(const proto::Footer &footer) { - std::stringstream msg; - int maxId = footer.types_size(); - if (maxId <= 0) { - throw ParseError("Footer is corrupt: no types found"); - } - for (int i = 0; i < maxId; ++i) { - const proto::Type& type = footer.types(i); - for (int j = 0; j < type.subtypes_size(); ++j) { - int subTypeId = static_cast<int>(type.subtypes(j)); - if (subTypeId <= i) { - msg << "Footer is corrupt: malformed link from type " << i << " to " - << subTypeId; - throw ParseError(msg.str()); - } - if (subTypeId >= maxId) { - msg << "Footer is corrupt: types(" << subTypeId << ") not exists"; - throw ParseError(msg.str()); - } - if (j > 0 && static_cast<int>(type.subtypes(j - 1)) >= subTypeId) { - msg << "Footer is corrupt: subType(" << (j-1) << ") >= subType(" << j - << ") in types(" << i << "). (" << type.subtypes(j - 1) << " >= " - << subTypeId << ")"; - throw ParseError(msg.str()); - } - } - } - } - - /** - * Parse the footer from the given buffer. 
- * @param stream the file's stream - * @param buffer the buffer to parse the footer from - * @param footerOffset the offset within the buffer that contains the footer - * @param ps the file's postscript - * @param memoryPool the memory pool to use - */ - std::unique_ptr<proto::Footer> readFooter(InputStream* stream, - const DataBuffer<char> *buffer, - uint64_t footerOffset, - const proto::PostScript& ps, - MemoryPool& memoryPool) { - const char *footerPtr = buffer->data() + footerOffset; - - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(convertCompressionKind(ps), - std::unique_ptr<SeekableInputStream> - (new SeekableArrayInputStream(footerPtr, - ps.footerlength())), - getCompressionBlockSize(ps), - memoryPool); - - std::unique_ptr<proto::Footer> footer = - std::unique_ptr<proto::Footer>(new proto::Footer()); - if (!footer->ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError("Failed to parse the footer from " + - stream->getName()); - } - - checkProtoTypeIds(*footer); - return REDUNDANT_MOVE(footer); - } - - std::unique_ptr<Reader> createReader(std::unique_ptr<InputStream> stream, - const ReaderOptions& options) { - std::shared_ptr<FileContents> contents = std::shared_ptr<FileContents>(new FileContents()); - contents->pool = options.getMemoryPool(); - contents->errorStream = options.getErrorStream(); - std::string serializedFooter = options.getSerializedFileTail(); - uint64_t fileLength; - uint64_t postscriptLength; - if (serializedFooter.length() != 0) { - // Parse the file tail from the serialized one. - proto::FileTail tail; - if (!tail.ParseFromString(TString(serializedFooter))) { - throw ParseError("Failed to parse the file tail from string"); - } - contents->postscript.reset(new proto::PostScript(tail.postscript())); - contents->footer.reset(new proto::Footer(tail.footer())); - fileLength = tail.filelength(); - postscriptLength = tail.postscriptlength(); - } else { - // figure out the size of the file using the option or filesystem - fileLength = std::min(options.getTailLocation(), - static_cast<uint64_t>(stream->getLength())); - - //read last bytes into buffer to get PostScript - uint64_t readSize = std::min(fileLength, DIRECTORY_SIZE_GUESS); - if (readSize < 4) { - throw ParseError("File size too small"); - } - std::unique_ptr<DataBuffer<char>> buffer( new DataBuffer<char>(*contents->pool, readSize) ); - stream->read(buffer->data(), readSize, fileLength - readSize); - - postscriptLength = buffer->data()[readSize - 1] & 0xff; - contents->postscript = REDUNDANT_MOVE(readPostscript(stream.get(), - buffer.get(), postscriptLength)); - uint64_t footerSize = contents->postscript->footerlength(); - uint64_t tailSize = 1 + postscriptLength + footerSize; - if (tailSize >= fileLength) { - std::stringstream msg; - msg << "Invalid ORC tailSize=" << tailSize << ", fileLength=" << fileLength; - throw ParseError(msg.str()); - } - uint64_t footerOffset; - - if (tailSize > readSize) { - buffer->resize(footerSize); - stream->read(buffer->data(), footerSize, fileLength - tailSize); - footerOffset = 0; - } else { - footerOffset = readSize - tailSize; - } - - contents->footer = REDUNDANT_MOVE(readFooter(stream.get(), buffer.get(), - footerOffset, *contents->postscript, *contents->pool)); - } - contents->stream = std::move(stream); - return std::unique_ptr<Reader>(new ReaderImpl(std::move(contents), - options, - fileLength, - postscriptLength)); - } - - std::map<uint32_t, BloomFilterIndex> - ReaderImpl::getBloomFilters(uint32_t stripeIndex, - const std::set<uint32_t>& 
included) const { - std::map<uint32_t, BloomFilterIndex> ret; - - // find stripe info - if (stripeIndex >= static_cast<uint32_t>(footer->stripes_size())) { - throw std::logic_error("Illegal stripe index: " + to_string(static_cast<int64_t>(stripeIndex))); - } - const proto::StripeInformation currentStripeInfo = - footer->stripes(static_cast<int>(stripeIndex)); - const proto::StripeFooter currentStripeFooter = - getStripeFooter(currentStripeInfo, *contents); - - // iterate stripe footer to get stream of bloomfilter - uint64_t offset = static_cast<uint64_t>(currentStripeInfo.offset()); - for (int i = 0; i < currentStripeFooter.streams_size(); i++) { - const proto::Stream& stream = currentStripeFooter.streams(i); - uint32_t column = static_cast<uint32_t>(stream.column()); - uint64_t length = static_cast<uint64_t>(stream.length()); - - // a bloom filter stream from a selected column is found - if (stream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8 && - (included.empty() || included.find(column) != included.end())) { - - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(contents->compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(contents->stream.get(), - offset, - length, - *contents->pool)), - contents->blockSize, - *(contents->pool)); - - proto::BloomFilterIndex pbBFIndex; - if (!pbBFIndex.ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError("Failed to parse BloomFilterIndex"); - } - - BloomFilterIndex bfIndex; - for (int j = 0; j < pbBFIndex.bloomfilter_size(); j++) { - std::unique_ptr<BloomFilter> entry = BloomFilterUTF8Utils::deserialize( - stream.kind(), - currentStripeFooter.columns(static_cast<int>(stream.column())), - pbBFIndex.bloomfilter(j)); - bfIndex.entries.push_back(std::shared_ptr<BloomFilter>(std::move(entry))); - } - - // add bloom filters to result for one column - ret[column] = bfIndex; - } - - offset += length; - } - - return ret; - } - - RowReader::~RowReader() { - // PASS - } - - Reader::~Reader() { - // PASS - } - - InputStream::~InputStream() { - // PASS - }; - - - -}// namespace + WriterVersion ReaderImpl::getWriterVersion() const { + if (!contents->postscript->has_writerversion()) { + return WriterVersion_ORIGINAL; + } + return static_cast<WriterVersion>(contents->postscript->writerversion()); + } + + uint64_t ReaderImpl::getContentLength() const { + return footer->contentlength(); + } + + uint64_t ReaderImpl::getStripeStatisticsLength() const { + return contents->postscript->metadatalength(); + } + + uint64_t ReaderImpl::getFileFooterLength() const { + return contents->postscript->footerlength(); + } + + uint64_t ReaderImpl::getFilePostscriptLength() const { + return postscriptLength; + } + + uint64_t ReaderImpl::getFileLength() const { + return fileLength; + } + + uint64_t ReaderImpl::getRowIndexStride() const { + return footer->rowindexstride(); + } + + const std::string& ReaderImpl::getStreamName() const { + return contents->stream->getName(); + } + + std::list<std::string> ReaderImpl::getMetadataKeys() const { + std::list<std::string> result; + for(int i=0; i < footer->metadata_size(); ++i) { + result.push_back(footer->metadata(i).name()); + } + return result; + } + + std::string ReaderImpl::getMetadataValue(const std::string& key) const { + for(int i=0; i < footer->metadata_size(); ++i) { + if (footer->metadata(i).name() == TString(key)) { + return footer->metadata(i).value(); + } + } + throw std::range_error("key not found"); + } + + void ReaderImpl::getRowIndexStatistics(const 
proto::StripeInformation& stripeInfo, + uint64_t stripeIndex, const proto::StripeFooter& currentStripeFooter, + std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const { + int num_streams = currentStripeFooter.streams_size(); + uint64_t offset = stripeInfo.offset(); + uint64_t indexEnd = stripeInfo.offset() + stripeInfo.indexlength(); + for (int i = 0; i < num_streams; i++) { + const proto::Stream& stream = currentStripeFooter.streams(i); + StreamKind streamKind = static_cast<StreamKind>(stream.kind()); + uint64_t length = static_cast<uint64_t>(stream.length()); + if (streamKind == StreamKind::StreamKind_ROW_INDEX) { + if (offset + length > indexEnd) { + std::stringstream msg; + msg << "Malformed RowIndex stream meta in stripe " << stripeIndex + << ": streamOffset=" << offset << ", streamLength=" << length + << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength=" + << stripeInfo.indexlength(); + throw ParseError(msg.str()); + } + std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(contents->compression, + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream(contents->stream.get(), + offset, + length, + *contents->pool)), + contents->blockSize, + *(contents->pool)); + + proto::RowIndex rowIndex; + if (!rowIndex.ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError("Failed to parse RowIndex from stripe footer"); + } + int num_entries = rowIndex.entry_size(); + size_t column = static_cast<size_t>(stream.column()); + for (int j = 0; j < num_entries; j++) { + const proto::RowIndexEntry& entry = rowIndex.entry(j); + (*indexStats)[column].push_back(entry.statistics()); + } + } + offset += length; + } + } + + bool ReaderImpl::hasMetadataValue(const std::string& key) const { + for(int i=0; i < footer->metadata_size(); ++i) { + if (footer->metadata(i).name() == TString(key)) { + return true; + } + } + return false; + } + + const Type& ReaderImpl::getType() const { + return *(contents->schema.get()); + } + + std::unique_ptr<StripeStatistics> + ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const { + if (!isMetadataLoaded) { + readMetadata(); + } + if (metadata.get() == nullptr) { + throw std::logic_error("No stripe statistics in file"); + } + size_t num_cols = static_cast<size_t>( + metadata->stripestats( + static_cast<int>(stripeIndex)).colstats_size()); + std::vector<std::vector<proto::ColumnStatistics> > indexStats(num_cols); + + proto::StripeInformation currentStripeInfo = + footer->stripes(static_cast<int>(stripeIndex)); + proto::StripeFooter currentStripeFooter = + getStripeFooter(currentStripeInfo, *contents.get()); + + getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats); + + const Timezone& writerTZ = + currentStripeFooter.has_writertimezone() ? 
+ getTimezoneByName(currentStripeFooter.writertimezone()) : + getLocalTimezone(); + StatContext statContext(hasCorrectStatistics(), &writerTZ); + return std::unique_ptr<StripeStatistics> + (new StripeStatisticsImpl(metadata->stripestats(static_cast<int>(stripeIndex)), + indexStats, statContext)); + } + + std::unique_ptr<Statistics> ReaderImpl::getStatistics() const { + StatContext statContext(hasCorrectStatistics()); + return std::unique_ptr<Statistics> + (new StatisticsImpl(*footer, statContext)); + } + + std::unique_ptr<ColumnStatistics> + ReaderImpl::getColumnStatistics(uint32_t index) const { + if (index >= static_cast<uint64_t>(footer->statistics_size())) { + throw std::logic_error("column index out of range"); + } + proto::ColumnStatistics col = + footer->statistics(static_cast<int32_t>(index)); + + StatContext statContext(hasCorrectStatistics()); + return std::unique_ptr<ColumnStatistics> (convertColumnStatistics(col, statContext)); + } + + void ReaderImpl::readMetadata() const { + uint64_t metadataSize = contents->postscript->metadatalength(); + uint64_t footerLength = contents->postscript->footerlength(); + if (fileLength < metadataSize + footerLength + postscriptLength + 1) { + std::stringstream msg; + msg << "Invalid Metadata length: fileLength=" << fileLength + << ", metadataLength=" << metadataSize << ", footerLength=" << footerLength + << ", postscriptLength=" << postscriptLength; + throw ParseError(msg.str()); + } + uint64_t metadataStart = fileLength - metadataSize - footerLength - postscriptLength - 1; + if (metadataSize != 0) { + std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(contents->compression, + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream(contents->stream.get(), + metadataStart, + metadataSize, + *contents->pool)), + contents->blockSize, + *contents->pool); + metadata.reset(new proto::Metadata()); + if (!metadata->ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError("Failed to parse the metadata"); + } + } + isMetadataLoaded = true; + } + + bool ReaderImpl::hasCorrectStatistics() const { + return !WriterVersionImpl::VERSION_HIVE_8732().compareGT(getWriterVersion()); + } + + void ReaderImpl::checkOrcVersion() { + FileVersion version = getFormatVersion(); + if (version != FileVersion(0, 11) && version != FileVersion(0, 12)) { + *(options.getErrorStream()) + << "Warning: ORC file " << contents->stream->getName() + << " was written in an unknown format version " + << version.toString() << "\n"; + } + } + + std::unique_ptr<RowReader> ReaderImpl::createRowReader() const { + RowReaderOptions defaultOpts; + return createRowReader(defaultOpts); + } + + std::unique_ptr<RowReader> ReaderImpl::createRowReader( + const RowReaderOptions& opts) const { + return std::unique_ptr<RowReader>(new RowReaderImpl(contents, opts)); + } + + uint64_t maxStreamsForType(const proto::Type& type) { + switch (static_cast<int64_t>(type.kind())) { + case proto::Type_Kind_STRUCT: + return 1; + case proto::Type_Kind_INT: + case proto::Type_Kind_LONG: + case proto::Type_Kind_SHORT: + case proto::Type_Kind_FLOAT: + case proto::Type_Kind_DOUBLE: + case proto::Type_Kind_BOOLEAN: + case proto::Type_Kind_BYTE: + case proto::Type_Kind_DATE: + case proto::Type_Kind_LIST: + case proto::Type_Kind_MAP: + case proto::Type_Kind_UNION: + return 2; + case proto::Type_Kind_BINARY: + case proto::Type_Kind_DECIMAL: + case proto::Type_Kind_TIMESTAMP: + return 3; + case proto::Type_Kind_CHAR: + case proto::Type_Kind_STRING: + case proto::Type_Kind_VARCHAR: + return 
4; + default: + return 0; + } + } + + uint64_t ReaderImpl::getMemoryUse(int stripeIx) { + std::vector<bool> selectedColumns; + selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), true); + return getMemoryUse(stripeIx, selectedColumns); + } + + uint64_t ReaderImpl::getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx) { + std::vector<bool> selectedColumns; + selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); + ColumnSelector column_selector(contents.get()); + if (contents->schema->getKind() == STRUCT && include.begin() != include.end()) { + for(std::list<uint64_t>::const_iterator field = include.begin(); + field != include.end(); ++field) { + column_selector.updateSelectedByFieldId(selectedColumns, *field); + } + } else { + // default is to select all columns + std::fill(selectedColumns.begin(), selectedColumns.end(), true); + } + column_selector.selectParents(selectedColumns, *contents->schema.get()); + selectedColumns[0] = true; // column 0 is selected by default + return getMemoryUse(stripeIx, selectedColumns); + } + + uint64_t ReaderImpl::getMemoryUseByName(const std::list<std::string>& names, int stripeIx) { + std::vector<bool> selectedColumns; + selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); + ColumnSelector column_selector(contents.get()); + if (contents->schema->getKind() == STRUCT && names.begin() != names.end()) { + for(std::list<std::string>::const_iterator field = names.begin(); + field != names.end(); ++field) { + column_selector.updateSelectedByName(selectedColumns, *field); + } + } else { + // default is to select all columns + std::fill(selectedColumns.begin(), selectedColumns.end(), true); + } + column_selector.selectParents(selectedColumns, *contents->schema.get()); + selectedColumns[0] = true; // column 0 is selected by default + return getMemoryUse(stripeIx, selectedColumns); + } + + uint64_t ReaderImpl::getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx) { + std::vector<bool> selectedColumns; + selectedColumns.assign(static_cast<size_t>(contents->footer->types_size()), false); + ColumnSelector column_selector(contents.get()); + if (include.begin() != include.end()) { + for(std::list<uint64_t>::const_iterator field = include.begin(); + field != include.end(); ++field) { + column_selector.updateSelectedByTypeId(selectedColumns, *field); + } + } else { + // default is to select all columns + std::fill(selectedColumns.begin(), selectedColumns.end(), true); + } + column_selector.selectParents(selectedColumns, *contents->schema.get()); + selectedColumns[0] = true; // column 0 is selected by default + return getMemoryUse(stripeIx, selectedColumns); + } + + uint64_t ReaderImpl::getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns) { + uint64_t maxDataLength = 0; + + if (stripeIx >= 0 && stripeIx < footer->stripes_size()) { + uint64_t stripe = footer->stripes(stripeIx).datalength(); + if (maxDataLength < stripe) { + maxDataLength = stripe; + } + } else { + for (int i=0; i < footer->stripes_size(); i++) { + uint64_t stripe = footer->stripes(i).datalength(); + if (maxDataLength < stripe) { + maxDataLength = stripe; + } + } + } + + bool hasStringColumn = false; + uint64_t nSelectedStreams = 0; + for (int i=0; !hasStringColumn && i < footer->types_size(); i++) { + if (selectedColumns[static_cast<size_t>(i)]) { + const proto::Type& type = footer->types(i); + nSelectedStreams += maxStreamsForType(type) ; + switch 
(static_cast<int64_t>(type.kind())) { + case proto::Type_Kind_CHAR: + case proto::Type_Kind_STRING: + case proto::Type_Kind_VARCHAR: + case proto::Type_Kind_BINARY: { + hasStringColumn = true; + break; + } + default: { + break; + } + } + } + } + + /* If a string column is read, use stripe datalength as a memory estimate + * because we don't know the dictionary size. Multiply by 2 because + * a string column requires two buffers: + * in the input stream and in the seekable input stream. + * If no string column is read, estimate from the number of streams. + */ + uint64_t memory = hasStringColumn ? 2 * maxDataLength : + std::min(uint64_t(maxDataLength), + nSelectedStreams * contents->stream->getNaturalReadSize()); + + // Do we need even more memory to read the footer or the metadata? + if (memory < contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS) { + memory = contents->postscript->footerlength() + DIRECTORY_SIZE_GUESS; + } + if (memory < contents->postscript->metadatalength()) { + memory = contents->postscript->metadatalength(); + } + + // Account for firstRowOfStripe. + memory += static_cast<uint64_t>(footer->stripes_size()) * sizeof(uint64_t); + + // Decompressors need buffers for each stream + uint64_t decompressorMemory = 0; + if (contents->compression != CompressionKind_NONE) { + for (int i=0; i < footer->types_size(); i++) { + if (selectedColumns[static_cast<size_t>(i)]) { + const proto::Type& type = footer->types(i); + decompressorMemory += maxStreamsForType(type) * contents->blockSize; + } + } + if (contents->compression == CompressionKind_SNAPPY) { + decompressorMemory *= 2; // Snappy decompressor uses a second buffer + } + } + + return memory + decompressorMemory ; + } + + void RowReaderImpl::startNextStripe() { + reader.reset(); // ColumnReaders use lots of memory; free old memory first + currentStripeInfo = footer->stripes(static_cast<int>(currentStripe)); + uint64_t fileLength = contents->stream->getLength(); + if (currentStripeInfo.offset() + currentStripeInfo.indexlength() + + currentStripeInfo.datalength() + currentStripeInfo.footerlength() >= fileLength) { + std::stringstream msg; + msg << "Malformed StripeInformation at stripe index " << currentStripe << ": fileLength=" + << fileLength << ", StripeInfo=(offset=" << currentStripeInfo.offset() << ", indexLength=" + << currentStripeInfo.indexlength() << ", dataLength=" << currentStripeInfo.datalength() + << ", footerLength=" << currentStripeInfo.footerlength() << ")"; + throw ParseError(msg.str()); + } + currentStripeFooter = getStripeFooter(currentStripeInfo, *contents.get()); + rowsInCurrentStripe = currentStripeInfo.numberofrows(); + const Timezone& writerTimezone = + currentStripeFooter.has_writertimezone() ? 
+ getTimezoneByName(currentStripeFooter.writertimezone()) : + localTimezone; + StripeStreamsImpl stripeStreams(*this, currentStripe, currentStripeInfo, + currentStripeFooter, + currentStripeInfo.offset(), + *(contents->stream.get()), + writerTimezone); + reader = buildReader(*contents->schema.get(), stripeStreams); + } + + bool RowReaderImpl::next(ColumnVectorBatch& data) { + if (currentStripe >= lastStripe) { + data.numElements = 0; + if (lastStripe > 0) { + previousRow = firstRowOfStripe[lastStripe - 1] + + footer->stripes(static_cast<int>(lastStripe - 1)).numberofrows(); + } else { + previousRow = 0; + } + return false; + } + if (currentRowInStripe == 0) { + startNextStripe(); + } + uint64_t rowsToRead = + std::min(static_cast<uint64_t>(data.capacity), + rowsInCurrentStripe - currentRowInStripe); + data.numElements = rowsToRead; + if (enableEncodedBlock) { + reader->nextEncoded(data, rowsToRead, nullptr); + } + else { + reader->next(data, rowsToRead, nullptr); + } + // update row number + previousRow = firstRowOfStripe[currentStripe] + currentRowInStripe; + currentRowInStripe += rowsToRead; + if (currentRowInStripe >= rowsInCurrentStripe) { + currentStripe += 1; + currentRowInStripe = 0; + } + return rowsToRead != 0; + } + + std::unique_ptr<ColumnVectorBatch> RowReaderImpl::createRowBatch + (uint64_t capacity) const { + return getSelectedType().createRowBatch(capacity, *contents->pool, enableEncodedBlock); + } + + void ensureOrcFooter(InputStream* stream, + DataBuffer<char> *buffer, + uint64_t postscriptLength) { + + const std::string MAGIC("ORC"); + const uint64_t magicLength = MAGIC.length(); + const char * const bufferStart = buffer->data(); + const uint64_t bufferLength = buffer->size(); + + if (postscriptLength < magicLength || bufferLength < magicLength) { + throw ParseError("Invalid ORC postscript length"); + } + const char* magicStart = bufferStart + bufferLength - 1 - magicLength; + + // Look for the magic string at the end of the postscript. + if (memcmp(magicStart, MAGIC.c_str(), magicLength) != 0) { + // If there is no magic string at the end, check the beginning. + // Only files written by Hive 0.11.0 don't have the tail ORC string. + std::unique_ptr<char[]> frontBuffer( new char[magicLength] ); + stream->read(frontBuffer.get(), magicLength, 0); + bool foundMatch = memcmp(frontBuffer.get(), MAGIC.c_str(), magicLength) == 0; + + if (!foundMatch) { + throw ParseError("Not an ORC file"); + } + } + } + + /** + * Read the file's postscript from the given buffer. + * @param stream the file stream + * @param buffer the buffer with the tail of the file. 
+ * @param postscriptSize the length of postscript in bytes + */ + std::unique_ptr<proto::PostScript> readPostscript(InputStream *stream, + DataBuffer<char> *buffer, + uint64_t postscriptSize) { + char *ptr = buffer->data(); + uint64_t readSize = buffer->size(); + + ensureOrcFooter(stream, buffer, postscriptSize); + + std::unique_ptr<proto::PostScript> postscript = + std::unique_ptr<proto::PostScript>(new proto::PostScript()); + if (readSize < 1 + postscriptSize) { + std::stringstream msg; + msg << "Invalid ORC postscript length: " << postscriptSize << ", file length = " + << stream->getLength(); + throw ParseError(msg.str()); + } + if (!postscript->ParseFromArray(ptr + readSize - 1 - postscriptSize, + static_cast<int>(postscriptSize))) { + throw ParseError("Failed to parse the postscript from " + + stream->getName()); + } + return REDUNDANT_MOVE(postscript); + } + + /** + * Check that indices in the type tree are valid, so we won't crash + * when we convert the proto::Types to TypeImpls. + */ + void checkProtoTypeIds(const proto::Footer &footer) { + std::stringstream msg; + int maxId = footer.types_size(); + if (maxId <= 0) { + throw ParseError("Footer is corrupt: no types found"); + } + for (int i = 0; i < maxId; ++i) { + const proto::Type& type = footer.types(i); + for (int j = 0; j < type.subtypes_size(); ++j) { + int subTypeId = static_cast<int>(type.subtypes(j)); + if (subTypeId <= i) { + msg << "Footer is corrupt: malformed link from type " << i << " to " + << subTypeId; + throw ParseError(msg.str()); + } + if (subTypeId >= maxId) { + msg << "Footer is corrupt: types(" << subTypeId << ") not exists"; + throw ParseError(msg.str()); + } + if (j > 0 && static_cast<int>(type.subtypes(j - 1)) >= subTypeId) { + msg << "Footer is corrupt: subType(" << (j-1) << ") >= subType(" << j + << ") in types(" << i << "). (" << type.subtypes(j - 1) << " >= " + << subTypeId << ")"; + throw ParseError(msg.str()); + } + } + } + } + + /** + * Parse the footer from the given buffer. 
+ * @param stream the file's stream + * @param buffer the buffer to parse the footer from + * @param footerOffset the offset within the buffer that contains the footer + * @param ps the file's postscript + * @param memoryPool the memory pool to use + */ + std::unique_ptr<proto::Footer> readFooter(InputStream* stream, + const DataBuffer<char> *buffer, + uint64_t footerOffset, + const proto::PostScript& ps, + MemoryPool& memoryPool) { + const char *footerPtr = buffer->data() + footerOffset; + + std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(convertCompressionKind(ps), + std::unique_ptr<SeekableInputStream> + (new SeekableArrayInputStream(footerPtr, + ps.footerlength())), + getCompressionBlockSize(ps), + memoryPool); + + std::unique_ptr<proto::Footer> footer = + std::unique_ptr<proto::Footer>(new proto::Footer()); + if (!footer->ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError("Failed to parse the footer from " + + stream->getName()); + } + + checkProtoTypeIds(*footer); + return REDUNDANT_MOVE(footer); + } + + std::unique_ptr<Reader> createReader(std::unique_ptr<InputStream> stream, + const ReaderOptions& options) { + std::shared_ptr<FileContents> contents = std::shared_ptr<FileContents>(new FileContents()); + contents->pool = options.getMemoryPool(); + contents->errorStream = options.getErrorStream(); + std::string serializedFooter = options.getSerializedFileTail(); + uint64_t fileLength; + uint64_t postscriptLength; + if (serializedFooter.length() != 0) { + // Parse the file tail from the serialized one. + proto::FileTail tail; + if (!tail.ParseFromString(TString(serializedFooter))) { + throw ParseError("Failed to parse the file tail from string"); + } + contents->postscript.reset(new proto::PostScript(tail.postscript())); + contents->footer.reset(new proto::Footer(tail.footer())); + fileLength = tail.filelength(); + postscriptLength = tail.postscriptlength(); + } else { + // figure out the size of the file using the option or filesystem + fileLength = std::min(options.getTailLocation(), + static_cast<uint64_t>(stream->getLength())); + + //read last bytes into buffer to get PostScript + uint64_t readSize = std::min(fileLength, DIRECTORY_SIZE_GUESS); + if (readSize < 4) { + throw ParseError("File size too small"); + } + std::unique_ptr<DataBuffer<char>> buffer( new DataBuffer<char>(*contents->pool, readSize) ); + stream->read(buffer->data(), readSize, fileLength - readSize); + + postscriptLength = buffer->data()[readSize - 1] & 0xff; + contents->postscript = REDUNDANT_MOVE(readPostscript(stream.get(), + buffer.get(), postscriptLength)); + uint64_t footerSize = contents->postscript->footerlength(); + uint64_t tailSize = 1 + postscriptLength + footerSize; + if (tailSize >= fileLength) { + std::stringstream msg; + msg << "Invalid ORC tailSize=" << tailSize << ", fileLength=" << fileLength; + throw ParseError(msg.str()); + } + uint64_t footerOffset; + + if (tailSize > readSize) { + buffer->resize(footerSize); + stream->read(buffer->data(), footerSize, fileLength - tailSize); + footerOffset = 0; + } else { + footerOffset = readSize - tailSize; + } + + contents->footer = REDUNDANT_MOVE(readFooter(stream.get(), buffer.get(), + footerOffset, *contents->postscript, *contents->pool)); + } + contents->stream = std::move(stream); + return std::unique_ptr<Reader>(new ReaderImpl(std::move(contents), + options, + fileLength, + postscriptLength)); + } + + std::map<uint32_t, BloomFilterIndex> + ReaderImpl::getBloomFilters(uint32_t stripeIndex, + const std::set<uint32_t>& 
included) const { + std::map<uint32_t, BloomFilterIndex> ret; + + // find stripe info + if (stripeIndex >= static_cast<uint32_t>(footer->stripes_size())) { + throw std::logic_error("Illegal stripe index: " + to_string(static_cast<int64_t>(stripeIndex))); + } + const proto::StripeInformation currentStripeInfo = + footer->stripes(static_cast<int>(stripeIndex)); + const proto::StripeFooter currentStripeFooter = + getStripeFooter(currentStripeInfo, *contents); + + // iterate stripe footer to get stream of bloomfilter + uint64_t offset = static_cast<uint64_t>(currentStripeInfo.offset()); + for (int i = 0; i < currentStripeFooter.streams_size(); i++) { + const proto::Stream& stream = currentStripeFooter.streams(i); + uint32_t column = static_cast<uint32_t>(stream.column()); + uint64_t length = static_cast<uint64_t>(stream.length()); + + // a bloom filter stream from a selected column is found + if (stream.kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8 && + (included.empty() || included.find(column) != included.end())) { + + std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(contents->compression, + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream(contents->stream.get(), + offset, + length, + *contents->pool)), + contents->blockSize, + *(contents->pool)); + + proto::BloomFilterIndex pbBFIndex; + if (!pbBFIndex.ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError("Failed to parse BloomFilterIndex"); + } + + BloomFilterIndex bfIndex; + for (int j = 0; j < pbBFIndex.bloomfilter_size(); j++) { + std::unique_ptr<BloomFilter> entry = BloomFilterUTF8Utils::deserialize( + stream.kind(), + currentStripeFooter.columns(static_cast<int>(stream.column())), + pbBFIndex.bloomfilter(j)); + bfIndex.entries.push_back(std::shared_ptr<BloomFilter>(std::move(entry))); + } + + // add bloom filters to result for one column + ret[column] = bfIndex; + } + + offset += length; + } + + return ret; + } + + RowReader::~RowReader() { + // PASS + } + + Reader::~Reader() { + // PASS + } + + InputStream::~InputStream() { + // PASS + }; + + + +}// namespace diff --git a/contrib/libs/apache/orc/c++/src/Reader.hh b/contrib/libs/apache/orc/c++/src/Reader.hh index 49e9d033d9..b4ce7f6529 100644 --- a/contrib/libs/apache/orc/c++/src/Reader.hh +++ b/contrib/libs/apache/orc/c++/src/Reader.hh @@ -1,155 +1,155 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_READER_IMPL_HH -#define ORC_READER_IMPL_HH - -#include "orc/Int128.hh" -#include "orc/OrcFile.hh" -#include "orc/Reader.hh" - -#include "ColumnReader.hh" -#include "orc/Exceptions.hh" -#include "RLE.hh" -#include "TypeImpl.hh" - -namespace orc { - - static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024; - - /** - * WriterVersion Implementation - */ - class WriterVersionImpl { - private: - WriterVersion version; - public: - // Known Versions with issues resolved - // The static method below is to fix global constructors Clang warning - static const WriterVersionImpl& VERSION_HIVE_8732(); - - WriterVersionImpl(WriterVersion ver) : version(ver) {} - - bool compareGT(const WriterVersion other) const { - return version > other; - } - }; - - /** - * State shared between Reader and Row Reader - */ - struct FileContents { - std::unique_ptr<InputStream> stream; - std::unique_ptr<proto::PostScript> postscript; - std::unique_ptr<proto::Footer> footer; - std::unique_ptr<Type> schema; - uint64_t blockSize; - CompressionKind compression; - MemoryPool *pool; - std::ostream *errorStream; - }; - - proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, - const FileContents& contents); - - class ReaderImpl; - - class ColumnSelector { - private: - std::map<std::string, uint64_t> nameIdMap; - std::map<uint64_t, const Type*> idTypeMap; - const FileContents* contents; - std::vector<std::string> columns; - - // build map from type name and id, id to Type - void buildTypeNameIdMap(const Type* type); - std::string toDotColumnPath(); - - public: - // Select a field by name - void updateSelectedByName(std::vector<bool>& selectedColumns, const std::string& name); - // Select a field by id - void updateSelectedByFieldId(std::vector<bool>& selectedColumns, uint64_t fieldId); - // Select a type by id - void updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId); - - // Select all of the recursive children of the given type. - void selectChildren(std::vector<bool>& selectedColumns, const Type& type); - - // For each child of type, select it if one of its children - // is selected. - bool selectParents(std::vector<bool>& selectedColumns, const Type& type); - /** - * Constructor that selects columns. - * @param contents of the file - */ - ColumnSelector(const FileContents* contents); - - // Select the columns from the RowReaderoptions object - void updateSelected(std::vector<bool>& selectedColumns, const RowReaderOptions& options); - - // Select the columns from the Readeroptions object - void updateSelected(std::vector<bool>& selectedColumns, const ReaderOptions& options); - }; - - - class RowReaderImpl : public RowReader { - private: - const Timezone& localTimezone; - - // contents - std::shared_ptr<FileContents> contents; - const bool throwOnHive11DecimalOverflow; - const int32_t forcedScaleOnHive11Decimal; - - // inputs - std::vector<bool> selectedColumns; - - // footer - proto::Footer* footer; - DataBuffer<uint64_t> firstRowOfStripe; - mutable std::unique_ptr<Type> selectedSchema; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_READER_IMPL_HH +#define ORC_READER_IMPL_HH + +#include "orc/Int128.hh" +#include "orc/OrcFile.hh" +#include "orc/Reader.hh" + +#include "ColumnReader.hh" +#include "orc/Exceptions.hh" +#include "RLE.hh" +#include "TypeImpl.hh" + +namespace orc { + + static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024; + + /** + * WriterVersion Implementation + */ + class WriterVersionImpl { + private: + WriterVersion version; + public: + // Known Versions with issues resolved + // The static method below is to fix global constructors Clang warning + static const WriterVersionImpl& VERSION_HIVE_8732(); + + WriterVersionImpl(WriterVersion ver) : version(ver) {} + + bool compareGT(const WriterVersion other) const { + return version > other; + } + }; + + /** + * State shared between Reader and Row Reader + */ + struct FileContents { + std::unique_ptr<InputStream> stream; + std::unique_ptr<proto::PostScript> postscript; + std::unique_ptr<proto::Footer> footer; + std::unique_ptr<Type> schema; + uint64_t blockSize; + CompressionKind compression; + MemoryPool *pool; + std::ostream *errorStream; + }; + + proto::StripeFooter getStripeFooter(const proto::StripeInformation& info, + const FileContents& contents); + + class ReaderImpl; + + class ColumnSelector { + private: + std::map<std::string, uint64_t> nameIdMap; + std::map<uint64_t, const Type*> idTypeMap; + const FileContents* contents; + std::vector<std::string> columns; + + // build map from type name and id, id to Type + void buildTypeNameIdMap(const Type* type); + std::string toDotColumnPath(); + + public: + // Select a field by name + void updateSelectedByName(std::vector<bool>& selectedColumns, const std::string& name); + // Select a field by id + void updateSelectedByFieldId(std::vector<bool>& selectedColumns, uint64_t fieldId); + // Select a type by id + void updateSelectedByTypeId(std::vector<bool>& selectedColumns, uint64_t typeId); + + // Select all of the recursive children of the given type. + void selectChildren(std::vector<bool>& selectedColumns, const Type& type); + + // For each child of type, select it if one of its children + // is selected. + bool selectParents(std::vector<bool>& selectedColumns, const Type& type); + /** + * Constructor that selects columns. 
+ * @param contents of the file + */ + ColumnSelector(const FileContents* contents); + + // Select the columns from the RowReaderoptions object + void updateSelected(std::vector<bool>& selectedColumns, const RowReaderOptions& options); + + // Select the columns from the Readeroptions object + void updateSelected(std::vector<bool>& selectedColumns, const ReaderOptions& options); + }; + + + class RowReaderImpl : public RowReader { + private: + const Timezone& localTimezone; + + // contents + std::shared_ptr<FileContents> contents; + const bool throwOnHive11DecimalOverflow; + const int32_t forcedScaleOnHive11Decimal; + + // inputs + std::vector<bool> selectedColumns; + + // footer + proto::Footer* footer; + DataBuffer<uint64_t> firstRowOfStripe; + mutable std::unique_ptr<Type> selectedSchema; bool skipBloomFilters; - - // reading state - uint64_t previousRow; - uint64_t firstStripe; - uint64_t currentStripe; - uint64_t lastStripe; // the stripe AFTER the last one - uint64_t currentRowInStripe; - uint64_t rowsInCurrentStripe; - proto::StripeInformation currentStripeInfo; - proto::StripeFooter currentStripeFooter; - std::unique_ptr<ColumnReader> reader; - - bool enableEncodedBlock; - // internal methods - void startNextStripe(); - - // row index of current stripe with column id as the key - std::unordered_map<uint64_t, proto::RowIndex> rowIndexes; - - /** - * Seek to the start of a row group in the current stripe - * @param rowGroupEntryId the row group id to seek to - */ - void seekToRowGroup(uint32_t rowGroupEntryId); - + + // reading state + uint64_t previousRow; + uint64_t firstStripe; + uint64_t currentStripe; + uint64_t lastStripe; // the stripe AFTER the last one + uint64_t currentRowInStripe; + uint64_t rowsInCurrentStripe; + proto::StripeInformation currentStripeInfo; + proto::StripeFooter currentStripeFooter; + std::unique_ptr<ColumnReader> reader; + + bool enableEncodedBlock; + // internal methods + void startNextStripe(); + + // row index of current stripe with column id as the key + std::unordered_map<uint64_t, proto::RowIndex> rowIndexes; + + /** + * Seek to the start of a row group in the current stripe + * @param rowGroupEntryId the row group id to seek to + */ + void seekToRowGroup(uint32_t rowGroupEntryId); + /** * Check if the file has bad bloom filters. We will skip using them in the * following reads. @@ -157,159 +157,159 @@ namespace orc { */ bool hasBadBloomFilters(); - public: - /** - * Constructor that lets the user specify additional options. 
- * @param contents of the file - * @param options options for reading - */ - RowReaderImpl(std::shared_ptr<FileContents> contents, - const RowReaderOptions& options); - - // Select the columns from the options object - void updateSelected(); - const std::vector<bool> getSelectedColumns() const override; - - const Type& getSelectedType() const override; - - std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size - ) const override; - - bool next(ColumnVectorBatch& data) override; - - CompressionKind getCompression() const; - - uint64_t getCompressionSize() const; - - uint64_t getRowNumber() const override; - - void seekToRow(uint64_t rowNumber) override; - - const FileContents& getFileContents() const; - bool getThrowOnHive11DecimalOverflow() const; - int32_t getForcedScaleOnHive11Decimal() const; - }; - - class ReaderImpl : public Reader { - private: - // FileContents - std::shared_ptr<FileContents> contents; - - // inputs - const ReaderOptions options; - const uint64_t fileLength; - const uint64_t postscriptLength; - - // footer - proto::Footer* footer; - uint64_t numberOfStripes; - uint64_t getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns); - - // internal methods - void readMetadata() const; - void checkOrcVersion(); - void getRowIndexStatistics(const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, - const proto::StripeFooter& currentStripeFooter, - std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const; - - // metadata - mutable std::unique_ptr<proto::Metadata> metadata; - mutable bool isMetadataLoaded; - public: - /** - * Constructor that lets the user specify additional options. - * @param contents of the file - * @param options options for reading - * @param fileLength the length of the file in bytes - * @param postscriptLength the length of the postscript in bytes - */ - ReaderImpl(std::shared_ptr<FileContents> contents, - const ReaderOptions& options, - uint64_t fileLength, - uint64_t postscriptLength); - - const ReaderOptions& getReaderOptions() const; - - CompressionKind getCompression() const override; - - FileVersion getFormatVersion() const override; - - WriterId getWriterId() const override; - - uint32_t getWriterIdValue() const override; - + public: + /** + * Constructor that lets the user specify additional options. 
+ * @param contents of the file + * @param options options for reading + */ + RowReaderImpl(std::shared_ptr<FileContents> contents, + const RowReaderOptions& options); + + // Select the columns from the options object + void updateSelected(); + const std::vector<bool> getSelectedColumns() const override; + + const Type& getSelectedType() const override; + + std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size + ) const override; + + bool next(ColumnVectorBatch& data) override; + + CompressionKind getCompression() const; + + uint64_t getCompressionSize() const; + + uint64_t getRowNumber() const override; + + void seekToRow(uint64_t rowNumber) override; + + const FileContents& getFileContents() const; + bool getThrowOnHive11DecimalOverflow() const; + int32_t getForcedScaleOnHive11Decimal() const; + }; + + class ReaderImpl : public Reader { + private: + // FileContents + std::shared_ptr<FileContents> contents; + + // inputs + const ReaderOptions options; + const uint64_t fileLength; + const uint64_t postscriptLength; + + // footer + proto::Footer* footer; + uint64_t numberOfStripes; + uint64_t getMemoryUse(int stripeIx, std::vector<bool>& selectedColumns); + + // internal methods + void readMetadata() const; + void checkOrcVersion(); + void getRowIndexStatistics(const proto::StripeInformation& stripeInfo, uint64_t stripeIndex, + const proto::StripeFooter& currentStripeFooter, + std::vector<std::vector<proto::ColumnStatistics> >* indexStats) const; + + // metadata + mutable std::unique_ptr<proto::Metadata> metadata; + mutable bool isMetadataLoaded; + public: + /** + * Constructor that lets the user specify additional options. + * @param contents of the file + * @param options options for reading + * @param fileLength the length of the file in bytes + * @param postscriptLength the length of the postscript in bytes + */ + ReaderImpl(std::shared_ptr<FileContents> contents, + const ReaderOptions& options, + uint64_t fileLength, + uint64_t postscriptLength); + + const ReaderOptions& getReaderOptions() const; + + CompressionKind getCompression() const override; + + FileVersion getFormatVersion() const override; + + WriterId getWriterId() const override; + + uint32_t getWriterIdValue() const override; + std::string getSoftwareVersion() const override; - WriterVersion getWriterVersion() const override; - - uint64_t getNumberOfRows() const override; - - uint64_t getRowIndexStride() const override; - - std::list<std::string> getMetadataKeys() const override; - - std::string getMetadataValue(const std::string& key) const override; - - bool hasMetadataValue(const std::string& key) const override; - - uint64_t getCompressionSize() const override; - - uint64_t getNumberOfStripes() const override; - - std::unique_ptr<StripeInformation> getStripe(uint64_t - ) const override; - - uint64_t getNumberOfStripeStatistics() const override; - - const std::string& getStreamName() const override; - - std::unique_ptr<StripeStatistics> - getStripeStatistics(uint64_t stripeIndex) const override; - - std::unique_ptr<RowReader> createRowReader() const override; - - std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options - ) const override; - - uint64_t getContentLength() const override; - uint64_t getStripeStatisticsLength() const override; - uint64_t getFileFooterLength() const override; - uint64_t getFilePostscriptLength() const override; - uint64_t getFileLength() const override; - - std::unique_ptr<Statistics> getStatistics() const override; - - std::unique_ptr<ColumnStatistics> 
getColumnStatistics(uint32_t columnId - ) const override; - - std::string getSerializedFileTail() const override; - - const Type& getType() const override; - - bool hasCorrectStatistics() const override; - - const proto::PostScript* getPostscript() const {return contents->postscript.get();} - - uint64_t getBlockSize() const {return contents->blockSize;} - - const proto::Footer* getFooter() const {return contents->footer.get();} - - const Type* getSchema() const {return contents->schema.get();} - - InputStream* getStream() const {return contents->stream.get();} - - uint64_t getMemoryUse(int stripeIx = -1) override; - - uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) override; - - uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) override; - - uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) override; - - std::map<uint32_t, BloomFilterIndex> - getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const override; - }; - -}// namespace - -#endif + WriterVersion getWriterVersion() const override; + + uint64_t getNumberOfRows() const override; + + uint64_t getRowIndexStride() const override; + + std::list<std::string> getMetadataKeys() const override; + + std::string getMetadataValue(const std::string& key) const override; + + bool hasMetadataValue(const std::string& key) const override; + + uint64_t getCompressionSize() const override; + + uint64_t getNumberOfStripes() const override; + + std::unique_ptr<StripeInformation> getStripe(uint64_t + ) const override; + + uint64_t getNumberOfStripeStatistics() const override; + + const std::string& getStreamName() const override; + + std::unique_ptr<StripeStatistics> + getStripeStatistics(uint64_t stripeIndex) const override; + + std::unique_ptr<RowReader> createRowReader() const override; + + std::unique_ptr<RowReader> createRowReader(const RowReaderOptions& options + ) const override; + + uint64_t getContentLength() const override; + uint64_t getStripeStatisticsLength() const override; + uint64_t getFileFooterLength() const override; + uint64_t getFilePostscriptLength() const override; + uint64_t getFileLength() const override; + + std::unique_ptr<Statistics> getStatistics() const override; + + std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId + ) const override; + + std::string getSerializedFileTail() const override; + + const Type& getType() const override; + + bool hasCorrectStatistics() const override; + + const proto::PostScript* getPostscript() const {return contents->postscript.get();} + + uint64_t getBlockSize() const {return contents->blockSize;} + + const proto::Footer* getFooter() const {return contents->footer.get();} + + const Type* getSchema() const {return contents->schema.get();} + + InputStream* getStream() const {return contents->stream.get();} + + uint64_t getMemoryUse(int stripeIx = -1) override; + + uint64_t getMemoryUseByFieldId(const std::list<uint64_t>& include, int stripeIx=-1) override; + + uint64_t getMemoryUseByName(const std::list<std::string>& names, int stripeIx=-1) override; + + uint64_t getMemoryUseByTypeId(const std::list<uint64_t>& include, int stripeIx=-1) override; + + std::map<uint32_t, BloomFilterIndex> + getBloomFilters(uint32_t stripeIndex, const std::set<uint32_t>& included) const override; + }; + +}// namespace + +#endif diff --git a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc index c5c6f6a801..2b7acb0bd5 100644 
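That closes the Reader.hh hunk; the ColumnSelector and RowReaderImpl declared in it are driven from RowReaderOptions::include(), which updateSelected() translates into the selectedColumns bitmap. A minimal sketch of that path through the public API, assuming a local ORC file whose column names ("id", "name") are purely illustrative:

    #include <iostream>
    #include <list>
    #include <memory>
    #include "orc/OrcFile.hh"

    // Sketch: read only two columns by name; ColumnSelector resolves the names
    // to type ids and selects their subtrees plus all enclosing structs.
    int main() {
      std::unique_ptr<orc::Reader> reader = orc::createReader(
          orc::readLocalFile("/tmp/example.orc"), orc::ReaderOptions());

      orc::RowReaderOptions rowOpts;
      rowOpts.include(std::list<std::string>{"id", "name"});   // hypothetical column names

      std::unique_ptr<orc::RowReader> rowReader = reader->createRowReader(rowOpts);
      std::unique_ptr<orc::ColumnVectorBatch> batch = rowReader->createRowBatch(1024);

      uint64_t rows = 0;
      while (rowReader->next(*batch)) {
        rows += batch->numElements;          // only the included columns are populated
      }
      std::cout << "read " << rows << " rows, selected type: "
                << rowReader->getSelectedType().toString() << "\n";
      return 0;
    }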
--- a/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc +++ b/contrib/libs/apache/orc/c++/src/RleDecoderV2.cc @@ -1,426 +1,426 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Adaptor.hh" -#include "Compression.hh" -#include "RLEv2.hh" -#include "RLEV2Util.hh" - -namespace orc { - -int64_t RleDecoderV2::readLongBE(uint64_t bsz) { - int64_t ret = 0, val; - uint64_t n = bsz; - while (n > 0) { - n--; - val = readByte(); - ret |= (val << (n * 8)); - } - return ret; -} - -inline int64_t RleDecoderV2::readVslong() { - return unZigZag(readVulong()); -} - -uint64_t RleDecoderV2::readVulong() { - uint64_t ret = 0, b; - uint64_t offset = 0; - do { - b = readByte(); - ret |= (0x7f & b) << offset; - offset += 7; - } while (b >= 0x80); - return ret; -} - -RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input, - bool _isSigned, MemoryPool& pool - ): inputStream(std::move(input)), - isSigned(_isSigned), - firstByte(0), - runLength(0), - runRead(0), - bufferStart(nullptr), - bufferEnd(bufferStart), - deltaBase(0), - byteSize(0), - firstValue(0), - prevValue(0), - bitSize(0), - bitsLeft(0), - curByte(0), - patchBitSize(0), - unpackedIdx(0), - patchIdx(0), - base(0), - curGap(0), - curPatch(0), - patchMask(0), - actualGap(0), - unpacked(pool, 0), - unpackedPatch(pool, 0) { - // PASS -} - -void RleDecoderV2::seek(PositionProvider& location) { - // move the input stream - inputStream->seek(location); - // clear state - bufferEnd = bufferStart = nullptr; - runRead = runLength = 0; - // skip ahead the given number of records - skip(location.next()); -} - -void RleDecoderV2::skip(uint64_t numValues) { - // simple for now, until perf tests indicate something encoding specific is - // needed - const uint64_t N = 64; - int64_t dummy[N]; - - while (numValues) { - uint64_t nRead = std::min(N, numValues); - next(dummy, nRead, nullptr); - numValues -= nRead; - } -} - -void RleDecoderV2::next(int64_t* const data, - const uint64_t numValues, - const char* const notNull) { - uint64_t nRead = 0; - - while (nRead < numValues) { - // Skip any nulls before attempting to read first byte. 
- while (notNull && !notNull[nRead]) { - if (++nRead == numValues) { - return; // ended with null values - } - } - - if (runRead == runLength) { - resetRun(); - firstByte = readByte(); - } - - uint64_t offset = nRead, length = numValues - nRead; - - EncodingType enc = static_cast<EncodingType> - ((firstByte >> 6) & 0x03); - switch(static_cast<int64_t>(enc)) { - case SHORT_REPEAT: - nRead += nextShortRepeats(data, offset, length, notNull); - break; - case DIRECT: - nRead += nextDirect(data, offset, length, notNull); - break; - case PATCHED_BASE: - nRead += nextPatched(data, offset, length, notNull); - break; - case DELTA: - nRead += nextDelta(data, offset, length, notNull); - break; - default: - throw ParseError("unknown encoding"); - } - } -} - -uint64_t RleDecoderV2::nextShortRepeats(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bytes - byteSize = (firstByte >> 3) & 0x07; - byteSize += 1; - - runLength = firstByte & 0x07; - // run lengths values are stored only after MIN_REPEAT value is met - runLength += MIN_REPEAT; - runRead = 0; - - // read the repeated value which is store using fixed bytes - firstValue = readLongBE(byteSize); - - if (isSigned) { - firstValue = unZigZag(static_cast<uint64_t>(firstValue)); - } - } - - uint64_t nRead = std::min(runLength - runRead, numValues); - - if (notNull) { - for(uint64_t pos = offset; pos < offset + nRead; ++pos) { - if (notNull[pos]) { - data[pos] = firstValue; - ++runRead; - } - } - } else { - for(uint64_t pos = offset; pos < offset + nRead; ++pos) { - data[pos] = firstValue; - ++runRead; - } - } - - return nRead; -} - -uint64_t RleDecoderV2::nextDirect(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - bitSize = decodeBitWidth(fbo); - - // extract the run length - runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; - runLength |= readByte(); - // runs are one off - runLength += 1; - runRead = 0; - } - - uint64_t nRead = std::min(runLength - runRead, numValues); - - runRead += readLongs(data, offset, nRead, bitSize, notNull); - - if (isSigned) { - if (notNull) { - for (uint64_t pos = offset; pos < offset + nRead; ++pos) { - if (notNull[pos]) { - data[pos] = unZigZag(static_cast<uint64_t>(data[pos])); - } - } - } else { - for (uint64_t pos = offset; pos < offset + nRead; ++pos) { - data[pos] = unZigZag(static_cast<uint64_t>(data[pos])); - } - } - } - - return nRead; -} - -uint64_t RleDecoderV2::nextPatched(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - bitSize = decodeBitWidth(fbo); - - // extract the run length - runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; - runLength |= readByte(); - // runs are one off - runLength += 1; - runRead = 0; - - // extract the number of bytes occupied by base - uint64_t thirdByte = readByte(); - byteSize = (thirdByte >> 5) & 0x07; - // base width is one off - byteSize += 1; - - // extract patch width - uint32_t pwo = thirdByte & 0x1f; - patchBitSize = decodeBitWidth(pwo); - - // read fourth byte and extract patch gap width - uint64_t fourthByte = readByte(); - uint32_t pgw = (fourthByte >> 5) & 0x07; - // patch gap width is one off - pgw += 1; - - // extract the length of 
the patch list - size_t pl = fourthByte & 0x1f; - if (pl == 0) { - throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!"); - } - - // read the next base width number of bytes to extract base value - base = readLongBE(byteSize); - int64_t mask = (static_cast<int64_t>(1) << ((byteSize * 8) - 1)); - // if mask of base value is 1 then base is negative value else positive - if ((base & mask) != 0) { - base = base & ~mask; - base = -base; - } - - // TODO: something more efficient than resize - unpacked.resize(runLength); - unpackedIdx = 0; - readLongs(unpacked.data(), 0, runLength, bitSize); - // any remaining bits are thrown out - resetReadLongs(); - - // TODO: something more efficient than resize - unpackedPatch.resize(pl); - patchIdx = 0; - // TODO: Skip corrupt? - // if ((patchBitSize + pgw) > 64 && !skipCorrupt) { - if ((patchBitSize + pgw) > 64) { - throw ParseError("Corrupt PATCHED_BASE encoded data " - "(patchBitSize + pgw > 64)!"); - } - uint32_t cfb = getClosestFixedBits(patchBitSize + pgw); - readLongs(unpackedPatch.data(), 0, pl, cfb); - // any remaining bits are thrown out - resetReadLongs(); - - // apply the patch directly when decoding the packed data - patchMask = ((static_cast<int64_t>(1) << patchBitSize) - 1); - - adjustGapAndPatch(); - } - - uint64_t nRead = std::min(runLength - runRead, numValues); - - for(uint64_t pos = offset; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; - } - if (static_cast<int64_t>(unpackedIdx) != actualGap) { - // no patching required. add base to unpacked value to get final value - data[pos] = base + unpacked[unpackedIdx]; - } else { - // extract the patch value - int64_t patchedVal = unpacked[unpackedIdx] | (curPatch << bitSize); - - // add base to patched value - data[pos] = base + patchedVal; - - // increment the patch to point to next entry in patch list - ++patchIdx; - - if (patchIdx < unpackedPatch.size()) { - adjustGapAndPatch(); - - // next gap is relative to the current gap - actualGap += unpackedIdx; - } - } - - ++runRead; - ++unpackedIdx; - } - - return nRead; -} - -uint64_t RleDecoderV2::nextDelta(int64_t* const data, - uint64_t offset, - uint64_t numValues, - const char* const notNull) { - if (runRead == runLength) { - // extract the number of fixed bits - unsigned char fbo = (firstByte >> 1) & 0x1f; - if (fbo != 0) { - bitSize = decodeBitWidth(fbo); - } else { - bitSize = 0; - } - - // extract the run length - runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; - runLength |= readByte(); - ++runLength; // account for first value - runRead = deltaBase = 0; - - // read the first value stored as vint - if (isSigned) { - firstValue = static_cast<int64_t>(readVslong()); - } else { - firstValue = static_cast<int64_t>(readVulong()); - } - - prevValue = firstValue; - - // read the fixed delta value stored as vint (deltas can be negative even - // if all number are positive) - deltaBase = static_cast<int64_t>(readVslong()); - } - - uint64_t nRead = std::min(runLength - runRead, numValues); - - uint64_t pos = offset; - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (!notNull || notNull[pos]) break; - } - if (runRead == 0 && pos < offset + nRead) { - data[pos++] = firstValue; - ++runRead; - } - - if (bitSize == 0) { - // add fixed deltas to adjacent values - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; - } - prevValue = data[pos] = prevValue + deltaBase; - ++runRead; - } - } else { - for ( ; 
pos < offset + nRead; ++pos) { - // skip null positions - if (!notNull || notNull[pos]) break; - } - if (runRead < 2 && pos < offset + nRead) { - // add delta base and first value - prevValue = data[pos++] = firstValue + deltaBase; - ++runRead; - } - - // write the unpacked values, add it to previous value and store final - // value to result buffer. if the delta base value is negative then it - // is a decreasing sequence else an increasing sequence - uint64_t remaining = (offset + nRead) - pos; - runRead += readLongs(data, pos, remaining, bitSize, notNull); - - if (deltaBase < 0) { - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; - } - prevValue = data[pos] = prevValue - data[pos]; - } - } else { - for ( ; pos < offset + nRead; ++pos) { - // skip null positions - if (notNull && !notNull[pos]) { - continue; - } - prevValue = data[pos] = prevValue + data[pos]; - } - } - } - return nRead; -} - -} // namespace orc +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
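readVulong() and readVslong() above implement the base-128 varint and zigzag primitives that the DELTA and SHORT_REPEAT paths rely on. A self-contained sketch of the same two primitives over a plain byte buffer (the SeekableInputStream plumbing is replaced by an index into a vector; this is an illustration, not the decoder's actual I/O path):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Base-128 varint: 7 payload bits per byte, MSB set means "more bytes follow".
    static uint64_t readVulong(const std::vector<uint8_t>& buf, size_t& pos) {
      uint64_t result = 0, b;
      uint64_t shift = 0;
      do {
        b = buf[pos++];
        result |= (b & 0x7f) << shift;
        shift += 7;
      } while (b >= 0x80);
      return result;
    }

    // Zigzag maps 0,-1,1,-2,2,... to 0,1,2,3,4,... so small magnitudes stay small.
    static int64_t unZigZag(uint64_t value) {
      return static_cast<int64_t>(value >> 1) ^ -static_cast<int64_t>(value & 1);
    }

    int main() {
      // 300 = 0b1'0010'1100 encodes as the two varint bytes 0xAC 0x02.
      std::vector<uint8_t> encoded = {0xAC, 0x02};
      size_t pos = 0;
      assert(readVulong(encoded, pos) == 300);

      // Signed -3 zigzags to 5, which fits in the single varint byte 0x05.
      std::vector<uint8_t> signedEncoded = {0x05};
      pos = 0;
      assert(unZigZag(readVulong(signedEncoded, pos)) == -3);
      return 0;
    }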
+ */ + +#include "Adaptor.hh" +#include "Compression.hh" +#include "RLEv2.hh" +#include "RLEV2Util.hh" + +namespace orc { + +int64_t RleDecoderV2::readLongBE(uint64_t bsz) { + int64_t ret = 0, val; + uint64_t n = bsz; + while (n > 0) { + n--; + val = readByte(); + ret |= (val << (n * 8)); + } + return ret; +} + +inline int64_t RleDecoderV2::readVslong() { + return unZigZag(readVulong()); +} + +uint64_t RleDecoderV2::readVulong() { + uint64_t ret = 0, b; + uint64_t offset = 0; + do { + b = readByte(); + ret |= (0x7f & b) << offset; + offset += 7; + } while (b >= 0x80); + return ret; +} + +RleDecoderV2::RleDecoderV2(std::unique_ptr<SeekableInputStream> input, + bool _isSigned, MemoryPool& pool + ): inputStream(std::move(input)), + isSigned(_isSigned), + firstByte(0), + runLength(0), + runRead(0), + bufferStart(nullptr), + bufferEnd(bufferStart), + deltaBase(0), + byteSize(0), + firstValue(0), + prevValue(0), + bitSize(0), + bitsLeft(0), + curByte(0), + patchBitSize(0), + unpackedIdx(0), + patchIdx(0), + base(0), + curGap(0), + curPatch(0), + patchMask(0), + actualGap(0), + unpacked(pool, 0), + unpackedPatch(pool, 0) { + // PASS +} + +void RleDecoderV2::seek(PositionProvider& location) { + // move the input stream + inputStream->seek(location); + // clear state + bufferEnd = bufferStart = nullptr; + runRead = runLength = 0; + // skip ahead the given number of records + skip(location.next()); +} + +void RleDecoderV2::skip(uint64_t numValues) { + // simple for now, until perf tests indicate something encoding specific is + // needed + const uint64_t N = 64; + int64_t dummy[N]; + + while (numValues) { + uint64_t nRead = std::min(N, numValues); + next(dummy, nRead, nullptr); + numValues -= nRead; + } +} + +void RleDecoderV2::next(int64_t* const data, + const uint64_t numValues, + const char* const notNull) { + uint64_t nRead = 0; + + while (nRead < numValues) { + // Skip any nulls before attempting to read first byte. 
+ while (notNull && !notNull[nRead]) { + if (++nRead == numValues) { + return; // ended with null values + } + } + + if (runRead == runLength) { + resetRun(); + firstByte = readByte(); + } + + uint64_t offset = nRead, length = numValues - nRead; + + EncodingType enc = static_cast<EncodingType> + ((firstByte >> 6) & 0x03); + switch(static_cast<int64_t>(enc)) { + case SHORT_REPEAT: + nRead += nextShortRepeats(data, offset, length, notNull); + break; + case DIRECT: + nRead += nextDirect(data, offset, length, notNull); + break; + case PATCHED_BASE: + nRead += nextPatched(data, offset, length, notNull); + break; + case DELTA: + nRead += nextDelta(data, offset, length, notNull); + break; + default: + throw ParseError("unknown encoding"); + } + } +} + +uint64_t RleDecoderV2::nextShortRepeats(int64_t* const data, + uint64_t offset, + uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bytes + byteSize = (firstByte >> 3) & 0x07; + byteSize += 1; + + runLength = firstByte & 0x07; + // run lengths values are stored only after MIN_REPEAT value is met + runLength += MIN_REPEAT; + runRead = 0; + + // read the repeated value which is store using fixed bytes + firstValue = readLongBE(byteSize); + + if (isSigned) { + firstValue = unZigZag(static_cast<uint64_t>(firstValue)); + } + } + + uint64_t nRead = std::min(runLength - runRead, numValues); + + if (notNull) { + for(uint64_t pos = offset; pos < offset + nRead; ++pos) { + if (notNull[pos]) { + data[pos] = firstValue; + ++runRead; + } + } + } else { + for(uint64_t pos = offset; pos < offset + nRead; ++pos) { + data[pos] = firstValue; + ++runRead; + } + } + + return nRead; +} + +uint64_t RleDecoderV2::nextDirect(int64_t* const data, + uint64_t offset, + uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + bitSize = decodeBitWidth(fbo); + + // extract the run length + runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; + runLength |= readByte(); + // runs are one off + runLength += 1; + runRead = 0; + } + + uint64_t nRead = std::min(runLength - runRead, numValues); + + runRead += readLongs(data, offset, nRead, bitSize, notNull); + + if (isSigned) { + if (notNull) { + for (uint64_t pos = offset; pos < offset + nRead; ++pos) { + if (notNull[pos]) { + data[pos] = unZigZag(static_cast<uint64_t>(data[pos])); + } + } + } else { + for (uint64_t pos = offset; pos < offset + nRead; ++pos) { + data[pos] = unZigZag(static_cast<uint64_t>(data[pos])); + } + } + } + + return nRead; +} + +uint64_t RleDecoderV2::nextPatched(int64_t* const data, + uint64_t offset, + uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + bitSize = decodeBitWidth(fbo); + + // extract the run length + runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; + runLength |= readByte(); + // runs are one off + runLength += 1; + runRead = 0; + + // extract the number of bytes occupied by base + uint64_t thirdByte = readByte(); + byteSize = (thirdByte >> 5) & 0x07; + // base width is one off + byteSize += 1; + + // extract patch width + uint32_t pwo = thirdByte & 0x1f; + patchBitSize = decodeBitWidth(pwo); + + // read fourth byte and extract patch gap width + uint64_t fourthByte = readByte(); + uint32_t pgw = (fourthByte >> 5) & 0x07; + // patch gap width is one off + pgw += 1; + + // extract the length of 
the patch list + size_t pl = fourthByte & 0x1f; + if (pl == 0) { + throw ParseError("Corrupt PATCHED_BASE encoded data (pl==0)!"); + } + + // read the next base width number of bytes to extract base value + base = readLongBE(byteSize); + int64_t mask = (static_cast<int64_t>(1) << ((byteSize * 8) - 1)); + // if mask of base value is 1 then base is negative value else positive + if ((base & mask) != 0) { + base = base & ~mask; + base = -base; + } + + // TODO: something more efficient than resize + unpacked.resize(runLength); + unpackedIdx = 0; + readLongs(unpacked.data(), 0, runLength, bitSize); + // any remaining bits are thrown out + resetReadLongs(); + + // TODO: something more efficient than resize + unpackedPatch.resize(pl); + patchIdx = 0; + // TODO: Skip corrupt? + // if ((patchBitSize + pgw) > 64 && !skipCorrupt) { + if ((patchBitSize + pgw) > 64) { + throw ParseError("Corrupt PATCHED_BASE encoded data " + "(patchBitSize + pgw > 64)!"); + } + uint32_t cfb = getClosestFixedBits(patchBitSize + pgw); + readLongs(unpackedPatch.data(), 0, pl, cfb); + // any remaining bits are thrown out + resetReadLongs(); + + // apply the patch directly when decoding the packed data + patchMask = ((static_cast<int64_t>(1) << patchBitSize) - 1); + + adjustGapAndPatch(); + } + + uint64_t nRead = std::min(runLength - runRead, numValues); + + for(uint64_t pos = offset; pos < offset + nRead; ++pos) { + // skip null positions + if (notNull && !notNull[pos]) { + continue; + } + if (static_cast<int64_t>(unpackedIdx) != actualGap) { + // no patching required. add base to unpacked value to get final value + data[pos] = base + unpacked[unpackedIdx]; + } else { + // extract the patch value + int64_t patchedVal = unpacked[unpackedIdx] | (curPatch << bitSize); + + // add base to patched value + data[pos] = base + patchedVal; + + // increment the patch to point to next entry in patch list + ++patchIdx; + + if (patchIdx < unpackedPatch.size()) { + adjustGapAndPatch(); + + // next gap is relative to the current gap + actualGap += unpackedIdx; + } + } + + ++runRead; + ++unpackedIdx; + } + + return nRead; +} + +uint64_t RleDecoderV2::nextDelta(int64_t* const data, + uint64_t offset, + uint64_t numValues, + const char* const notNull) { + if (runRead == runLength) { + // extract the number of fixed bits + unsigned char fbo = (firstByte >> 1) & 0x1f; + if (fbo != 0) { + bitSize = decodeBitWidth(fbo); + } else { + bitSize = 0; + } + + // extract the run length + runLength = static_cast<uint64_t>(firstByte & 0x01) << 8; + runLength |= readByte(); + ++runLength; // account for first value + runRead = deltaBase = 0; + + // read the first value stored as vint + if (isSigned) { + firstValue = static_cast<int64_t>(readVslong()); + } else { + firstValue = static_cast<int64_t>(readVulong()); + } + + prevValue = firstValue; + + // read the fixed delta value stored as vint (deltas can be negative even + // if all number are positive) + deltaBase = static_cast<int64_t>(readVslong()); + } + + uint64_t nRead = std::min(runLength - runRead, numValues); + + uint64_t pos = offset; + for ( ; pos < offset + nRead; ++pos) { + // skip null positions + if (!notNull || notNull[pos]) break; + } + if (runRead == 0 && pos < offset + nRead) { + data[pos++] = firstValue; + ++runRead; + } + + if (bitSize == 0) { + // add fixed deltas to adjacent values + for ( ; pos < offset + nRead; ++pos) { + // skip null positions + if (notNull && !notNull[pos]) { + continue; + } + prevValue = data[pos] = prevValue + deltaBase; + ++runRead; + } + } else { + for ( ; 
pos < offset + nRead; ++pos) { + // skip null positions + if (!notNull || notNull[pos]) break; + } + if (runRead < 2 && pos < offset + nRead) { + // add delta base and first value + prevValue = data[pos++] = firstValue + deltaBase; + ++runRead; + } + + // write the unpacked values, add it to previous value and store final + // value to result buffer. if the delta base value is negative then it + // is a decreasing sequence else an increasing sequence + uint64_t remaining = (offset + nRead) - pos; + runRead += readLongs(data, pos, remaining, bitSize, notNull); + + if (deltaBase < 0) { + for ( ; pos < offset + nRead; ++pos) { + // skip null positions + if (notNull && !notNull[pos]) { + continue; + } + prevValue = data[pos] = prevValue - data[pos]; + } + } else { + for ( ; pos < offset + nRead; ++pos) { + // skip null positions + if (notNull && !notNull[pos]) { + continue; + } + prevValue = data[pos] = prevValue + data[pos]; + } + } + } + return nRead; +} + +} // namespace orc diff --git a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc index 44e2761b74..f77838a4dd 100644 --- a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc +++ b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc @@ -1,773 +1,773 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with option work for additional information - * regarding copyright ownership. The ASF licenses option file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use option file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Adaptor.hh" -#include "Compression.hh" -#include "RLEv2.hh" -#include "RLEV2Util.hh" - -#define MAX_LITERAL_SIZE 512 -#define MAX_SHORT_REPEAT_LENGTH 10 - -namespace orc { - -/** - * Compute the bits required to represent pth percentile value - * @param data - array - * @param p - percentile value (>=0.0 to <=1.0) - * @return pth percentile bits - */ -uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) { - if ((p > 1.0) || (p <= 0.0)) { - throw InvalidArgument("Invalid p value: " + to_string(p)); - } - - if (!reuseHist) { - // histogram that store the encoded bit requirement for each values. 
- // maximum number of bits that can encoded is 32 (refer FixedBitSizes) - memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t)); - // compute the histogram - for(size_t i = offset; i < (offset + length); i++) { - uint32_t idx = encodeBitWidth(findClosestNumBits(data[i])); - histgram[idx] += 1; - } - } - - int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p)); - - // return the bits required by pth percentile length - for(int32_t i = HIST_LEN - 1; i >= 0; i--) { - perLen -= histgram[i]; - if (perLen < 0) { - return decodeBitWidth(static_cast<uint32_t>(i)); - } - } - return 0; -} - -RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, - bool hasSigned, bool alignBitPacking) : - RleEncoder(std::move(outStream), hasSigned), - alignedBitPacking(alignBitPacking), - prevDelta(0){ - literals = new int64_t[MAX_LITERAL_SIZE]; - gapVsPatchList = new int64_t[MAX_LITERAL_SIZE]; - zigzagLiterals = new int64_t[MAX_LITERAL_SIZE]; - baseRedLiterals = new int64_t[MAX_LITERAL_SIZE]; - adjDeltas = new int64_t[MAX_LITERAL_SIZE]; -} - -void RleEncoderV2::write(int64_t val) { - if(numLiterals == 0) { - initializeLiterals(val); - return; - } - - if(numLiterals == 1) { - prevDelta = val - literals[0]; - literals[numLiterals++] = val; - - if(val == literals[0]) { - fixedRunLength = 2; - variableRunLength = 0; - } else { - fixedRunLength = 0; - variableRunLength = 2; - } - return; - } - - int64_t currentDelta = val - literals[numLiterals - 1]; - EncodingOption option = {}; - if (prevDelta == 0 && currentDelta == 0) { - // case 1: fixed delta run - literals[numLiterals++] = val; - - if (variableRunLength > 0) { - // if variable run is non-zero then we are seeing repeating - // values at the end of variable run in which case fixed Run - // length is 2 - fixedRunLength = 2; - } - fixedRunLength++; - - // if fixed run met the minimum condition and if variable - // run is non-zero then flush the variable run and shift the - // tail fixed runs to start of the buffer - if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) { - numLiterals -= MIN_REPEAT; - variableRunLength -= (MIN_REPEAT - 1); - - determineEncoding(option); - writeValues(option); - - // shift tail fixed runs to beginning of the buffer - for (size_t i = 0; i < MIN_REPEAT; ++i) { - literals[i] = val; - } - numLiterals = MIN_REPEAT; - } - +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with option work for additional information + * regarding copyright ownership. The ASF licenses option file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use option file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
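percentileBits() above histograms the encoded bit width of each value and walks the histogram from the widest bucket downwards until the allowed (1 - p) fraction of values has been skipped; the result feeds the DIRECT versus PATCHED_BASE decision. A simplified standalone sketch of that idea, using an exact bit-width count in place of ORC's FixedBitSizes rounding:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Number of bits needed to represent v (at least 1, as in findClosestNumBits).
    static uint32_t numBits(uint64_t v) {
      uint32_t bits = 0;
      while (v != 0) { ++bits; v >>= 1; }
      return bits == 0 ? 1 : bits;
    }

    // Smallest width w such that at least a fraction p of the values fit in w bits.
    static uint32_t percentileBits(const std::vector<uint64_t>& values, double p) {
      uint32_t histogram[65] = {0};            // histogram[w] = values needing w bits
      for (uint64_t v : values) {
        histogram[numBits(v)]++;
      }
      int64_t allowedAbove = static_cast<int64_t>(values.size() * (1.0 - p));
      for (int w = 64; w >= 1; --w) {          // walk from widest to narrowest bucket
        allowedAbove -= histogram[w];
        if (allowedAbove < 0) {
          return static_cast<uint32_t>(w);
        }
      }
      return 1;
    }

    int main() {
      // 19 small values plus one outlier: the 95th percentile width ignores the outlier.
      std::vector<uint64_t> values(19, 100);   // 100 needs 7 bits
      values.push_back(1u << 20);              // the outlier needs 21 bits
      std::cout << "p=1.00 -> " << percentileBits(values, 1.0) << " bits\n";   // 21
      std::cout << "p=0.95 -> " << percentileBits(values, 0.95) << " bits\n";  // 7
      return 0;
    }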
+ */ + +#include "Adaptor.hh" +#include "Compression.hh" +#include "RLEv2.hh" +#include "RLEV2Util.hh" + +#define MAX_LITERAL_SIZE 512 +#define MAX_SHORT_REPEAT_LENGTH 10 + +namespace orc { + +/** + * Compute the bits required to represent pth percentile value + * @param data - array + * @param p - percentile value (>=0.0 to <=1.0) + * @return pth percentile bits + */ +uint32_t RleEncoderV2::percentileBits(int64_t* data, size_t offset, size_t length, double p, bool reuseHist) { + if ((p > 1.0) || (p <= 0.0)) { + throw InvalidArgument("Invalid p value: " + to_string(p)); + } + + if (!reuseHist) { + // histogram that store the encoded bit requirement for each values. + // maximum number of bits that can encoded is 32 (refer FixedBitSizes) + memset(histgram, 0, FixedBitSizes::SIZE * sizeof(int32_t)); + // compute the histogram + for(size_t i = offset; i < (offset + length); i++) { + uint32_t idx = encodeBitWidth(findClosestNumBits(data[i])); + histgram[idx] += 1; + } + } + + int32_t perLen = static_cast<int32_t>(static_cast<double>(length) * (1.0 - p)); + + // return the bits required by pth percentile length + for(int32_t i = HIST_LEN - 1; i >= 0; i--) { + perLen -= histgram[i]; + if (perLen < 0) { + return decodeBitWidth(static_cast<uint32_t>(i)); + } + } + return 0; +} + +RleEncoderV2::RleEncoderV2(std::unique_ptr<BufferedOutputStream> outStream, + bool hasSigned, bool alignBitPacking) : + RleEncoder(std::move(outStream), hasSigned), + alignedBitPacking(alignBitPacking), + prevDelta(0){ + literals = new int64_t[MAX_LITERAL_SIZE]; + gapVsPatchList = new int64_t[MAX_LITERAL_SIZE]; + zigzagLiterals = new int64_t[MAX_LITERAL_SIZE]; + baseRedLiterals = new int64_t[MAX_LITERAL_SIZE]; + adjDeltas = new int64_t[MAX_LITERAL_SIZE]; +} + +void RleEncoderV2::write(int64_t val) { + if(numLiterals == 0) { + initializeLiterals(val); + return; + } + + if(numLiterals == 1) { + prevDelta = val - literals[0]; + literals[numLiterals++] = val; + + if(val == literals[0]) { + fixedRunLength = 2; + variableRunLength = 0; + } else { + fixedRunLength = 0; + variableRunLength = 2; + } + return; + } + + int64_t currentDelta = val - literals[numLiterals - 1]; + EncodingOption option = {}; + if (prevDelta == 0 && currentDelta == 0) { + // case 1: fixed delta run + literals[numLiterals++] = val; + + if (variableRunLength > 0) { + // if variable run is non-zero then we are seeing repeating + // values at the end of variable run in which case fixed Run + // length is 2 + fixedRunLength = 2; + } + fixedRunLength++; + + // if fixed run met the minimum condition and if variable + // run is non-zero then flush the variable run and shift the + // tail fixed runs to start of the buffer + if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) { + numLiterals -= MIN_REPEAT; + variableRunLength -= (MIN_REPEAT - 1); + + determineEncoding(option); + writeValues(option); + + // shift tail fixed runs to beginning of the buffer + for (size_t i = 0; i < MIN_REPEAT; ++i) { + literals[i] = val; + } + numLiterals = MIN_REPEAT; + } + if (fixedRunLength == MAX_LITERAL_SIZE) { - determineEncoding(option); - writeValues(option); - } - return; - } - - // case 2: variable delta run - - // if fixed run length is non-zero and if it satisfies the - // short repeat conditions then write the values as short repeats - // else use delta encoding - if (fixedRunLength >= MIN_REPEAT) { - if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { - option.encoding = SHORT_REPEAT; - } else { - option.encoding = DELTA; - option.isFixedDelta = true; - } - 
writeValues(option); - } - - // if fixed run length is <MIN_REPEAT and current value is - // different from previous then treat it as variable run - if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) { - variableRunLength = fixedRunLength; - fixedRunLength = 0; - } - - // after writing values re-initialize the variables - if (numLiterals == 0) { - initializeLiterals(val); - } else { - prevDelta = val - literals[numLiterals - 1]; - literals[numLiterals++] = val; - variableRunLength++; - - if (variableRunLength == MAX_LITERAL_SIZE) { - determineEncoding(option); - writeValues(option); - } - } -} - -void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) { - int64_t zzEncVal = 0; - for (size_t i = 0; i < numLiterals; i++) { - if (isSigned) { - zzEncVal = zigZag(literals[i]); - } else { - zzEncVal = literals[i]; - } - zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal; - } -} - -void RleEncoderV2::preparePatchedBlob(EncodingOption& option) { - // mask will be max value beyond which patch will be generated - int64_t mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1; - - // since we are considering only 95 percentile, the size of gap and - // patch array can contain only be 5% values - option.patchLength = static_cast<uint32_t>(std::ceil((numLiterals / 20))); - - // #bit for patch - option.patchWidth = option.brBits100p - option.brBits95p; - option.patchWidth = getClosestFixedBits(option.patchWidth); - - // if patch bit requirement is 64 then it will not possible to pack - // gap and patch together in a long. To make sure gap and patch can be - // packed together adjust the patch width - if (option.patchWidth == 64) { - option.patchWidth = 56; - option.brBits95p = 8; - mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1; - } - - uint32_t gapIdx = 0; - uint32_t patchIdx = 0; - size_t prev = 0; - size_t maxGap = 0; - - std::vector<int64_t> gapList; - std::vector<int64_t> patchList; - - for(size_t i = 0; i < numLiterals; i++) { - // if value is above mask then create the patch and record the gap - if (baseRedLiterals[i] > mask) { - size_t gap = i - prev; - if (gap > maxGap) { - maxGap = gap; - } - - // gaps are relative, so store the previous patched value index - prev = i; - gapList.push_back(static_cast<int64_t>(gap)); - gapIdx++; - - // extract the most significant bits that are over mask bits - int64_t patch = baseRedLiterals[i] >> option.brBits95p; - patchList.push_back(patch); - patchIdx++; - - // strip off the MSB to enable safe bit packing - baseRedLiterals[i] &= mask; - } - } - - // adjust the patch length to number of entries in gap list - option.patchLength = gapIdx; - - // if the element to be patched is the first and only element then - // max gap will be 0, but to store the gap as 0 we need atleast 1 bit - if (maxGap == 0 && option.patchLength != 0) { - option.patchGapWidth = 1; - } else { - option.patchGapWidth = findClosestNumBits(static_cast<int64_t>(maxGap)); - } - - // special case: if the patch gap width is greater than 256, then - // we need 9 bits to encode the gap width. But we only have 3 bits in - // header to record the gap width. To deal with this case, we will save - // two entries in patch list in the following way - // 256 gap width => 0 for patch value - // actual gap - 256 => actual patch value - // We will do the same for gap width = 511. If the element to be patched is - // the last element in the scope then gap width will be 511. 
In this case we - // will have 3 entries in the patch list in the following way - // 255 gap width => 0 for patch value - // 255 gap width => 0 for patch value - // 1 gap width => actual patch value - if (option.patchGapWidth > 8) { - option.patchGapWidth = 8; - // for gap = 511, we need two additional entries in patch list - if (maxGap == 511) { - option.patchLength += 2; - } else { - option.patchLength += 1; - } - } - - // create gap vs patch list - gapIdx = 0; - patchIdx = 0; - for(size_t i = 0; i < option.patchLength; i++) { - int64_t g = gapList[gapIdx++]; - int64_t p = patchList[patchIdx++]; - while (g > 255) { - gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth); - i++; - g -= 255; - } - - // store patch value in LSBs and gap in MSBs - gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p); - } -} - -void RleEncoderV2::determineEncoding(EncodingOption& option) { - // We need to compute zigzag values for DIRECT and PATCHED_BASE encodings, - // but not for SHORT_REPEAT or DELTA. So we only perform the zigzag - // computation when it's determined to be necessary. - - // not a big win for shorter runs to determine encoding - if (numLiterals <= MIN_REPEAT) { - // we need to compute zigzag values for DIRECT encoding if we decide to - // break early for delta overflows or for shorter runs - computeZigZagLiterals(option); - option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); - option.encoding = DIRECT; - return; - } - - // DELTA encoding check - - // for identifying monotonic sequences - bool isIncreasing = true; - bool isDecreasing = true; - option.isFixedDelta = true; - - option.min = literals[0]; - int64_t max = literals[0]; - int64_t initialDelta = literals[1] - literals[0]; - int64_t currDelta = 0; - int64_t deltaMax = 0; - adjDeltas[option.adjDeltasCount++] = initialDelta; - - for (size_t i = 1; i < numLiterals; i++) { - const int64_t l1 = literals[i]; - const int64_t l0 = literals[i - 1]; - currDelta = l1 - l0; - option.min = std::min(option.min, l1); - max = std::max(max, l1); - - isIncreasing &= (l0 <= l1); - isDecreasing &= (l0 >= l1); - - option.isFixedDelta &= (currDelta == initialDelta); - if (i > 1) { - adjDeltas[option.adjDeltasCount++] = std::abs(currDelta); - deltaMax = std::max(deltaMax, adjDeltas[i - 1]); - } - } - - // it's faster to exit under delta overflow condition without checking for - // PATCHED_BASE condition as encoding using DIRECT is faster and has less - // overhead than PATCHED_BASE - if (!isSafeSubtract(max, option.min)) { - computeZigZagLiterals(option); - option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); - option.encoding = DIRECT; - return; - } - - // invariant - subtracting any number from any other in the literals after - // option point won't overflow - - // if min is equal to max then the delta is 0, option condition happens for - // fixed values run >10 which cannot be encoded with SHORT_REPEAT - if (option.min == max) { - if (!option.isFixedDelta) { - throw InvalidArgument(to_string(option.min) + "==" + - to_string(max) + ", isFixedDelta cannot be false"); - } - - if(currDelta != 0) { - throw InvalidArgument(to_string(option.min) + "==" + - to_string(max) + ", currDelta should be zero"); - } - option.fixedDelta = 0; - option.encoding = DELTA; - return; - } - - if (option.isFixedDelta) { - if (currDelta != initialDelta) { - throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding"); - } - - option.encoding = DELTA; - 
option.fixedDelta = currDelta; - return; - } - - // if initialDelta is 0 then we cannot delta encode as we cannot identify - // the sign of deltas (increasing or decreasing) - if (initialDelta != 0) { - // stores the number of bits required for packing delta blob in - // delta encoding - option.bitsDeltaMax = findClosestNumBits(deltaMax); - - // monotonic condition - if (isIncreasing || isDecreasing) { - option.encoding = DELTA; - return; - } - } - - // PATCHED_BASE encoding check - - // percentile values are computed for the zigzag encoded values. if the - // number of bit requirement between 90th and 100th percentile varies - // beyond a threshold then we need to patch the values. if the variation - // is not significant then we can use direct encoding - - computeZigZagLiterals(option); - option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); - option.zzBits90p = percentileBits(zigzagLiterals, 0, numLiterals, 0.9, true); - uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p; - - // if the difference between 90th percentile and 100th percentile fixed - // bits is > 1 then we need patch the values - if (diffBitsLH > 1) { - - // patching is done only on base reduced values. - // remove base from literals - for (size_t i = 0; i < numLiterals; i++) { - baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min); - } - - // 95th percentile width is used to determine max allowed value - // after which patching will be done - option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95); - - // 100th percentile is used to compute the max patch width - option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true); - - // after base reducing the values, if the difference in bits between - // 95th percentile and 100th percentile value is zero then there - // is no point in patching the values, in which case we will - // fallback to DIRECT encoding. - // The decision to use patched base was based on zigzag values, but the - // actual patching is done on base reduced literals. - if ((option.brBits100p - option.brBits95p) != 0) { - option.encoding = PATCHED_BASE; - preparePatchedBlob(option); - return; - } else { - option.encoding = DIRECT; - return; - } - } else { - // if difference in bits between 95th percentile and 100th percentile is - // 0, then patch length will become 0. 
Hence we will fallback to direct - option.encoding = DIRECT; - return; - } -} - -uint64_t RleEncoderV2::flush() { - if (numLiterals != 0) { - EncodingOption option = {}; - if (variableRunLength != 0) { - determineEncoding(option); - writeValues(option); - } else if (fixedRunLength != 0) { - if (fixedRunLength < MIN_REPEAT) { - variableRunLength = fixedRunLength; - fixedRunLength = 0; - determineEncoding(option); - writeValues(option); - } else if (fixedRunLength >= MIN_REPEAT - && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { - option.encoding = SHORT_REPEAT; - writeValues(option); - } else { - option.encoding = DELTA; - option.isFixedDelta = true; - writeValues(option); - } - } - } - - outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); - uint64_t dataSize = outputStream->flush(); - bufferLength = bufferPosition = 0; - return dataSize; -} - -void RleEncoderV2::writeValues(EncodingOption& option) { - if (numLiterals != 0) { - switch (option.encoding) { - case SHORT_REPEAT: - writeShortRepeatValues(option); - break; - case DIRECT: - writeDirectValues(option); - break; - case PATCHED_BASE: - writePatchedBasedValues(option); - break; - case DELTA: - writeDeltaValues(option); - break; - default: - throw NotImplementedYet("Not implemented yet"); - } - - numLiterals = 0; - prevDelta = 0; - } -} - -void RleEncoderV2::writeShortRepeatValues(EncodingOption&) { - int64_t repeatVal; - if (isSigned) { - repeatVal = zigZag(literals[0]); - } else { - repeatVal = literals[0]; - } - - const uint32_t numBitsRepeatVal = findClosestNumBits(repeatVal); - const uint32_t numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? (numBitsRepeatVal >> 3) : ((numBitsRepeatVal >> 3) + 1); - - uint32_t header = getOpCode(SHORT_REPEAT); - - fixedRunLength -= MIN_REPEAT; - header |= fixedRunLength; - header |= ((numBytesRepeatVal - 1) << 3); - - writeByte(static_cast<char>(header)); - - for(int32_t i = static_cast<int32_t>(numBytesRepeatVal - 1); i >= 0; i--) { - int64_t b = ((repeatVal >> (i * 8)) & 0xff); - writeByte(static_cast<char>(b)); - } - - fixedRunLength = 0; -} - -void RleEncoderV2::writeDirectValues(EncodingOption& option) { - // write the number of fixed bits required in next 5 bits - uint32_t fb = option.zzBits100p; - if (alignedBitPacking) { - fb = getClosestAlignedFixedBits(fb); - } - - const uint32_t efb = encodeBitWidth(fb) << 1; - - // adjust variable run length - variableRunLength -= 1; - - // extract the 9th bit of run length - const uint32_t tailBits = (variableRunLength & 0x100) >> 8; - - // create first byte of the header - const char headerFirstByte = static_cast<char>(getOpCode(DIRECT) | efb | tailBits); - - // second byte of the header stores the remaining 8 bits of runlength - const char headerSecondByte = static_cast<char>(variableRunLength & 0xff); - - // write header - writeByte(headerFirstByte); - writeByte(headerSecondByte); - - // bit packing the zigzag encoded literals - writeInts(zigzagLiterals, 0, numLiterals, fb); - - // reset run length - variableRunLength = 0; -} - -void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) { - // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding - // because patch is applied to MSB bits. For example: If fixed bit width of - // base value is 7 bits and if patch is 3 bits, the actual value is - // constructed by shifting the patch to left by 7 positions. - // actual_value = patch << 7 | base_value - // So, if we align base_value then actual_value can not be reconstructed. 
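The NOTE above is the reason writePatchedBasedValues() never applies aligned bit packing: the decoder re-attaches the patch immediately above the 95th-percentile width, so widening the packed low bits would shift the patch into the wrong position. A tiny worked sketch of that split-and-reconstruct step, with illustrative widths and values:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Illustrative widths: 95% of base-reduced values fit in 7 bits (brBits95p),
      // the outliers need up to 10 bits, so the patch carries the top 3 bits.
      const uint32_t brBits95p = 7;
      const int64_t  base      = 1000;        // minimum of the run, stored separately
      const int64_t  original  = 1000 + 900;  // an outlier: 900 needs 10 bits

      // Encoder side: split the base-reduced value into low bits and a patch.
      int64_t reduced = original - base;                 // 900 = 0b1110000100
      int64_t mask    = (1LL << brBits95p) - 1;          // 0b1111111
      int64_t lowBits = reduced & mask;                  // bit packed with the other values
      int64_t patch   = reduced >> brBits95p;            // stored in the patch list

      // Decoder side (as in nextPatched): re-attach the patch above the low bits,
      // then add the base. Aligning lowBits to a wider width would break this shift.
      int64_t decoded = base + (lowBits | (patch << brBits95p));
      assert(decoded == original);
      return 0;
    }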
- - // write the number of fixed bits required in next 5 bits - const uint32_t efb = encodeBitWidth(option.brBits95p) << 1; - - // adjust variable run length, they are one off - variableRunLength -= 1; - - // extract the 9th bit of run length - const uint32_t tailBits = (variableRunLength & 0x100) >> 8; - - // create first byte of the header - const char headerFirstByte = static_cast<char>(getOpCode(PATCHED_BASE) | efb | tailBits); - - // second byte of the header stores the remaining 8 bits of runlength - const char headerSecondByte = static_cast<char>(variableRunLength & 0xff); - - // if the min value is negative toggle the sign - const bool isNegative = (option.min < 0); - if (isNegative) { - option.min = -option.min; - } - - // find the number of bytes required for base and shift it by 5 bits - // to accommodate patch width. The additional bit is used to store the sign - // of the base value. - const uint32_t baseWidth = findClosestNumBits(option.min) + 1; - const uint32_t baseBytes = baseWidth % 8 == 0 ? baseWidth / 8 : (baseWidth / 8) + 1; - const uint32_t bb = (baseBytes - 1) << 5; - - // if the base value is negative then set MSB to 1 - if (isNegative) { - option.min |= (1LL << ((baseBytes * 8) - 1)); - } - - // third byte contains 3 bits for number of bytes occupied by base - // and 5 bits for patchWidth - const char headerThirdByte = static_cast<char>(bb | encodeBitWidth(option.patchWidth)); - - // fourth byte contains 3 bits for page gap width and 5 bits for - // patch length - const char headerFourthByte = static_cast<char>((option.patchGapWidth - 1) << 5 | option.patchLength); - - // write header - writeByte(headerFirstByte); - writeByte(headerSecondByte); - writeByte(headerThirdByte); - writeByte(headerFourthByte); - - // write the base value using fixed bytes in big endian order - for(int32_t i = static_cast<int32_t>(baseBytes - 1); i >= 0; i--) { - char b = static_cast<char>(((option.min >> (i * 8)) & 0xff)); - writeByte(b); - } - - // base reduced literals are bit packed - uint32_t closestFixedBits = getClosestFixedBits(option.brBits95p); - - writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits); - - // write patch list - closestFixedBits = getClosestFixedBits(option.patchGapWidth + option.patchWidth); - - writeInts(gapVsPatchList, 0, option.patchLength, closestFixedBits); - - // reset run length - variableRunLength = 0; -} - -void RleEncoderV2::writeDeltaValues(EncodingOption& option) { - uint32_t len = 0; - uint32_t fb = option.bitsDeltaMax; - uint32_t efb = 0; - - if (alignedBitPacking) { - fb = getClosestAlignedFixedBits(fb); - } - - if (option.isFixedDelta) { - // if fixed run length is greater than threshold then it will be fixed - // delta sequence with delta value 0 else fixed delta sequence with - // non-zero delta value - if (fixedRunLength > MIN_REPEAT) { - // ex. sequence: 2 2 2 2 2 2 2 2 - len = fixedRunLength - 1; - fixedRunLength = 0; - } else { - // ex. sequence: 4 6 8 10 12 14 16 - len = variableRunLength - 1; - variableRunLength = 0; - } - } else { - // fixed width 0 is used for long repeating values. 
- // sequences that require only 1 bit to encode will have an additional bit - if (fb == 1) { - fb = 2; - } - efb = encodeBitWidth(fb) << 1; - len = variableRunLength - 1; - variableRunLength = 0; - } - - // extract the 9th bit of run length - const uint32_t tailBits = (len & 0x100) >> 8; - - // create first byte of the header - const char headerFirstByte = static_cast<char>(getOpCode(DELTA) | efb | tailBits); - - // second byte of the header stores the remaining 8 bits of runlength - const char headerSecondByte = static_cast<char>(len & 0xff); - - // write header - writeByte(headerFirstByte); - writeByte(headerSecondByte); - - // store the first value from zigzag literal array - if (isSigned) { - writeVslong(literals[0]); - } else { - writeVulong(literals[0]); - } - - if (option.isFixedDelta) { - // if delta is fixed then we don't need to store delta blob - writeVslong(option.fixedDelta); - } else { - // store the first value as delta value using zigzag encoding - writeVslong(adjDeltas[0]); - - // adjacent delta values are bit packed. The length of adjDeltas array is - // always one less than the number of literals (delta difference for n - // elements is n-1). We have already written one element, write the - // remaining numLiterals - 2 elements here - writeInts(adjDeltas, 1, numLiterals - 2, fb); - } -} - -void RleEncoderV2::writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize) { - if(input == nullptr || len < 1 || bitSize < 1) { - return; - } - - if (getClosestAlignedFixedBits(bitSize) == bitSize) { - uint32_t numBytes; - uint32_t endOffSet = static_cast<uint32_t>(offset + len); + determineEncoding(option); + writeValues(option); + } + return; + } + + // case 2: variable delta run + + // if fixed run length is non-zero and if it satisfies the + // short repeat conditions then write the values as short repeats + // else use delta encoding + if (fixedRunLength >= MIN_REPEAT) { + if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { + option.encoding = SHORT_REPEAT; + } else { + option.encoding = DELTA; + option.isFixedDelta = true; + } + writeValues(option); + } + + // if fixed run length is <MIN_REPEAT and current value is + // different from previous then treat it as variable run + if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT && val != literals[numLiterals - 1]) { + variableRunLength = fixedRunLength; + fixedRunLength = 0; + } + + // after writing values re-initialize the variables + if (numLiterals == 0) { + initializeLiterals(val); + } else { + prevDelta = val - literals[numLiterals - 1]; + literals[numLiterals++] = val; + variableRunLength++; + + if (variableRunLength == MAX_LITERAL_SIZE) { + determineEncoding(option); + writeValues(option); + } + } +} + +void RleEncoderV2::computeZigZagLiterals(EncodingOption &option) { + int64_t zzEncVal = 0; + for (size_t i = 0; i < numLiterals; i++) { + if (isSigned) { + zzEncVal = zigZag(literals[i]); + } else { + zzEncVal = literals[i]; + } + zigzagLiterals[option.zigzagLiteralsCount++] = zzEncVal; + } +} + +void RleEncoderV2::preparePatchedBlob(EncodingOption& option) { + // mask will be max value beyond which patch will be generated + int64_t mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1; + + // since we are considering only 95 percentile, the size of gap and + // patch array can contain only be 5% values + option.patchLength = static_cast<uint32_t>(std::ceil((numLiterals / 20))); + + // #bit for patch + option.patchWidth = option.brBits100p - option.brBits95p; + option.patchWidth = 
getClosestFixedBits(option.patchWidth); + + // if patch bit requirement is 64 then it will not possible to pack + // gap and patch together in a long. To make sure gap and patch can be + // packed together adjust the patch width + if (option.patchWidth == 64) { + option.patchWidth = 56; + option.brBits95p = 8; + mask = static_cast<int64_t>(static_cast<uint64_t>(1) << option.brBits95p) - 1; + } + + uint32_t gapIdx = 0; + uint32_t patchIdx = 0; + size_t prev = 0; + size_t maxGap = 0; + + std::vector<int64_t> gapList; + std::vector<int64_t> patchList; + + for(size_t i = 0; i < numLiterals; i++) { + // if value is above mask then create the patch and record the gap + if (baseRedLiterals[i] > mask) { + size_t gap = i - prev; + if (gap > maxGap) { + maxGap = gap; + } + + // gaps are relative, so store the previous patched value index + prev = i; + gapList.push_back(static_cast<int64_t>(gap)); + gapIdx++; + + // extract the most significant bits that are over mask bits + int64_t patch = baseRedLiterals[i] >> option.brBits95p; + patchList.push_back(patch); + patchIdx++; + + // strip off the MSB to enable safe bit packing + baseRedLiterals[i] &= mask; + } + } + + // adjust the patch length to number of entries in gap list + option.patchLength = gapIdx; + + // if the element to be patched is the first and only element then + // max gap will be 0, but to store the gap as 0 we need atleast 1 bit + if (maxGap == 0 && option.patchLength != 0) { + option.patchGapWidth = 1; + } else { + option.patchGapWidth = findClosestNumBits(static_cast<int64_t>(maxGap)); + } + + // special case: if the patch gap width is greater than 256, then + // we need 9 bits to encode the gap width. But we only have 3 bits in + // header to record the gap width. To deal with this case, we will save + // two entries in patch list in the following way + // 256 gap width => 0 for patch value + // actual gap - 256 => actual patch value + // We will do the same for gap width = 511. If the element to be patched is + // the last element in the scope then gap width will be 511. In this case we + // will have 3 entries in the patch list in the following way + // 255 gap width => 0 for patch value + // 255 gap width => 0 for patch value + // 1 gap width => actual patch value + if (option.patchGapWidth > 8) { + option.patchGapWidth = 8; + // for gap = 511, we need two additional entries in patch list + if (maxGap == 511) { + option.patchLength += 2; + } else { + option.patchLength += 1; + } + } + + // create gap vs patch list + gapIdx = 0; + patchIdx = 0; + for(size_t i = 0; i < option.patchLength; i++) { + int64_t g = gapList[gapIdx++]; + int64_t p = patchList[patchIdx++]; + while (g > 255) { + gapVsPatchList[option.gapVsPatchListCount++] = (255L << option.patchWidth); + i++; + g -= 255; + } + + // store patch value in LSBs and gap in MSBs + gapVsPatchList[option.gapVsPatchListCount++] = ((g << option.patchWidth) | p); + } +} + +void RleEncoderV2::determineEncoding(EncodingOption& option) { + // We need to compute zigzag values for DIRECT and PATCHED_BASE encodings, + // but not for SHORT_REPEAT or DELTA. So we only perform the zigzag + // computation when it's determined to be necessary. 
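[Editor's sketch — not part of the ORC sources or of this diff.] computeZigZagLiterals above relies on the usual zigzag mapping for signed values: negative and positive numbers are interleaved so that small magnitudes map to small unsigned values and therefore to few bits, which is what the DIRECT and PATCHED_BASE percentile checks measure. A self-contained sketch of that mapping (the function name is hypothetical; the encoder uses its own zigZag helper):

#include <cassert>
#include <cstdint>

// Standard 64-bit zigzag: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...
// Assumes an arithmetic right shift for signed values, which mainstream
// compilers provide.
uint64_t zigZagSketch(int64_t v) {
  return (static_cast<uint64_t>(v) << 1) ^ static_cast<uint64_t>(v >> 63);
}

int main() {
  assert(zigZagSketch(0)  == 0);
  assert(zigZagSketch(-1) == 1);
  assert(zigZagSketch(1)  == 2);
  assert(zigZagSketch(-2) == 3);
  return 0;
}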
+ + // not a big win for shorter runs to determine encoding + if (numLiterals <= MIN_REPEAT) { + // we need to compute zigzag values for DIRECT encoding if we decide to + // break early for delta overflows or for shorter runs + computeZigZagLiterals(option); + option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); + option.encoding = DIRECT; + return; + } + + // DELTA encoding check + + // for identifying monotonic sequences + bool isIncreasing = true; + bool isDecreasing = true; + option.isFixedDelta = true; + + option.min = literals[0]; + int64_t max = literals[0]; + int64_t initialDelta = literals[1] - literals[0]; + int64_t currDelta = 0; + int64_t deltaMax = 0; + adjDeltas[option.adjDeltasCount++] = initialDelta; + + for (size_t i = 1; i < numLiterals; i++) { + const int64_t l1 = literals[i]; + const int64_t l0 = literals[i - 1]; + currDelta = l1 - l0; + option.min = std::min(option.min, l1); + max = std::max(max, l1); + + isIncreasing &= (l0 <= l1); + isDecreasing &= (l0 >= l1); + + option.isFixedDelta &= (currDelta == initialDelta); + if (i > 1) { + adjDeltas[option.adjDeltasCount++] = std::abs(currDelta); + deltaMax = std::max(deltaMax, adjDeltas[i - 1]); + } + } + + // it's faster to exit under delta overflow condition without checking for + // PATCHED_BASE condition as encoding using DIRECT is faster and has less + // overhead than PATCHED_BASE + if (!isSafeSubtract(max, option.min)) { + computeZigZagLiterals(option); + option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); + option.encoding = DIRECT; + return; + } + + // invariant - subtracting any number from any other in the literals after + // option point won't overflow + + // if min is equal to max then the delta is 0, option condition happens for + // fixed values run >10 which cannot be encoded with SHORT_REPEAT + if (option.min == max) { + if (!option.isFixedDelta) { + throw InvalidArgument(to_string(option.min) + "==" + + to_string(max) + ", isFixedDelta cannot be false"); + } + + if(currDelta != 0) { + throw InvalidArgument(to_string(option.min) + "==" + + to_string(max) + ", currDelta should be zero"); + } + option.fixedDelta = 0; + option.encoding = DELTA; + return; + } + + if (option.isFixedDelta) { + if (currDelta != initialDelta) { + throw InvalidArgument("currDelta should be equal to initialDelta for fixed delta encoding"); + } + + option.encoding = DELTA; + option.fixedDelta = currDelta; + return; + } + + // if initialDelta is 0 then we cannot delta encode as we cannot identify + // the sign of deltas (increasing or decreasing) + if (initialDelta != 0) { + // stores the number of bits required for packing delta blob in + // delta encoding + option.bitsDeltaMax = findClosestNumBits(deltaMax); + + // monotonic condition + if (isIncreasing || isDecreasing) { + option.encoding = DELTA; + return; + } + } + + // PATCHED_BASE encoding check + + // percentile values are computed for the zigzag encoded values. if the + // number of bit requirement between 90th and 100th percentile varies + // beyond a threshold then we need to patch the values. 
if the variation + // is not significant then we can use direct encoding + + computeZigZagLiterals(option); + option.zzBits100p = percentileBits(zigzagLiterals, 0, numLiterals, 1.0); + option.zzBits90p = percentileBits(zigzagLiterals, 0, numLiterals, 0.9, true); + uint32_t diffBitsLH = option.zzBits100p - option.zzBits90p; + + // if the difference between 90th percentile and 100th percentile fixed + // bits is > 1 then we need patch the values + if (diffBitsLH > 1) { + + // patching is done only on base reduced values. + // remove base from literals + for (size_t i = 0; i < numLiterals; i++) { + baseRedLiterals[option.baseRedLiteralsCount++] = (literals[i] - option.min); + } + + // 95th percentile width is used to determine max allowed value + // after which patching will be done + option.brBits95p = percentileBits(baseRedLiterals, 0, numLiterals, 0.95); + + // 100th percentile is used to compute the max patch width + option.brBits100p = percentileBits(baseRedLiterals, 0, numLiterals, 1.0, true); + + // after base reducing the values, if the difference in bits between + // 95th percentile and 100th percentile value is zero then there + // is no point in patching the values, in which case we will + // fallback to DIRECT encoding. + // The decision to use patched base was based on zigzag values, but the + // actual patching is done on base reduced literals. + if ((option.brBits100p - option.brBits95p) != 0) { + option.encoding = PATCHED_BASE; + preparePatchedBlob(option); + return; + } else { + option.encoding = DIRECT; + return; + } + } else { + // if difference in bits between 95th percentile and 100th percentile is + // 0, then patch length will become 0. Hence we will fallback to direct + option.encoding = DIRECT; + return; + } +} + +uint64_t RleEncoderV2::flush() { + if (numLiterals != 0) { + EncodingOption option = {}; + if (variableRunLength != 0) { + determineEncoding(option); + writeValues(option); + } else if (fixedRunLength != 0) { + if (fixedRunLength < MIN_REPEAT) { + variableRunLength = fixedRunLength; + fixedRunLength = 0; + determineEncoding(option); + writeValues(option); + } else if (fixedRunLength >= MIN_REPEAT + && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { + option.encoding = SHORT_REPEAT; + writeValues(option); + } else { + option.encoding = DELTA; + option.isFixedDelta = true; + writeValues(option); + } + } + } + + outputStream->BackUp(static_cast<int>(bufferLength - bufferPosition)); + uint64_t dataSize = outputStream->flush(); + bufferLength = bufferPosition = 0; + return dataSize; +} + +void RleEncoderV2::writeValues(EncodingOption& option) { + if (numLiterals != 0) { + switch (option.encoding) { + case SHORT_REPEAT: + writeShortRepeatValues(option); + break; + case DIRECT: + writeDirectValues(option); + break; + case PATCHED_BASE: + writePatchedBasedValues(option); + break; + case DELTA: + writeDeltaValues(option); + break; + default: + throw NotImplementedYet("Not implemented yet"); + } + + numLiterals = 0; + prevDelta = 0; + } +} + +void RleEncoderV2::writeShortRepeatValues(EncodingOption&) { + int64_t repeatVal; + if (isSigned) { + repeatVal = zigZag(literals[0]); + } else { + repeatVal = literals[0]; + } + + const uint32_t numBitsRepeatVal = findClosestNumBits(repeatVal); + const uint32_t numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? 
(numBitsRepeatVal >> 3) : ((numBitsRepeatVal >> 3) + 1); + + uint32_t header = getOpCode(SHORT_REPEAT); + + fixedRunLength -= MIN_REPEAT; + header |= fixedRunLength; + header |= ((numBytesRepeatVal - 1) << 3); + + writeByte(static_cast<char>(header)); + + for(int32_t i = static_cast<int32_t>(numBytesRepeatVal - 1); i >= 0; i--) { + int64_t b = ((repeatVal >> (i * 8)) & 0xff); + writeByte(static_cast<char>(b)); + } + + fixedRunLength = 0; +} + +void RleEncoderV2::writeDirectValues(EncodingOption& option) { + // write the number of fixed bits required in next 5 bits + uint32_t fb = option.zzBits100p; + if (alignedBitPacking) { + fb = getClosestAlignedFixedBits(fb); + } + + const uint32_t efb = encodeBitWidth(fb) << 1; + + // adjust variable run length + variableRunLength -= 1; + + // extract the 9th bit of run length + const uint32_t tailBits = (variableRunLength & 0x100) >> 8; + + // create first byte of the header + const char headerFirstByte = static_cast<char>(getOpCode(DIRECT) | efb | tailBits); + + // second byte of the header stores the remaining 8 bits of runlength + const char headerSecondByte = static_cast<char>(variableRunLength & 0xff); + + // write header + writeByte(headerFirstByte); + writeByte(headerSecondByte); + + // bit packing the zigzag encoded literals + writeInts(zigzagLiterals, 0, numLiterals, fb); + + // reset run length + variableRunLength = 0; +} + +void RleEncoderV2::writePatchedBasedValues(EncodingOption& option) { + // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding + // because patch is applied to MSB bits. For example: If fixed bit width of + // base value is 7 bits and if patch is 3 bits, the actual value is + // constructed by shifting the patch to left by 7 positions. + // actual_value = patch << 7 | base_value + // So, if we align base_value then actual_value can not be reconstructed. + + // write the number of fixed bits required in next 5 bits + const uint32_t efb = encodeBitWidth(option.brBits95p) << 1; + + // adjust variable run length, they are one off + variableRunLength -= 1; + + // extract the 9th bit of run length + const uint32_t tailBits = (variableRunLength & 0x100) >> 8; + + // create first byte of the header + const char headerFirstByte = static_cast<char>(getOpCode(PATCHED_BASE) | efb | tailBits); + + // second byte of the header stores the remaining 8 bits of runlength + const char headerSecondByte = static_cast<char>(variableRunLength & 0xff); + + // if the min value is negative toggle the sign + const bool isNegative = (option.min < 0); + if (isNegative) { + option.min = -option.min; + } + + // find the number of bytes required for base and shift it by 5 bits + // to accommodate patch width. The additional bit is used to store the sign + // of the base value. + const uint32_t baseWidth = findClosestNumBits(option.min) + 1; + const uint32_t baseBytes = baseWidth % 8 == 0 ? 
baseWidth / 8 : (baseWidth / 8) + 1; + const uint32_t bb = (baseBytes - 1) << 5; + + // if the base value is negative then set MSB to 1 + if (isNegative) { + option.min |= (1LL << ((baseBytes * 8) - 1)); + } + + // third byte contains 3 bits for number of bytes occupied by base + // and 5 bits for patchWidth + const char headerThirdByte = static_cast<char>(bb | encodeBitWidth(option.patchWidth)); + + // fourth byte contains 3 bits for page gap width and 5 bits for + // patch length + const char headerFourthByte = static_cast<char>((option.patchGapWidth - 1) << 5 | option.patchLength); + + // write header + writeByte(headerFirstByte); + writeByte(headerSecondByte); + writeByte(headerThirdByte); + writeByte(headerFourthByte); + + // write the base value using fixed bytes in big endian order + for(int32_t i = static_cast<int32_t>(baseBytes - 1); i >= 0; i--) { + char b = static_cast<char>(((option.min >> (i * 8)) & 0xff)); + writeByte(b); + } + + // base reduced literals are bit packed + uint32_t closestFixedBits = getClosestFixedBits(option.brBits95p); + + writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits); + + // write patch list + closestFixedBits = getClosestFixedBits(option.patchGapWidth + option.patchWidth); + + writeInts(gapVsPatchList, 0, option.patchLength, closestFixedBits); + + // reset run length + variableRunLength = 0; +} + +void RleEncoderV2::writeDeltaValues(EncodingOption& option) { + uint32_t len = 0; + uint32_t fb = option.bitsDeltaMax; + uint32_t efb = 0; + + if (alignedBitPacking) { + fb = getClosestAlignedFixedBits(fb); + } + + if (option.isFixedDelta) { + // if fixed run length is greater than threshold then it will be fixed + // delta sequence with delta value 0 else fixed delta sequence with + // non-zero delta value + if (fixedRunLength > MIN_REPEAT) { + // ex. sequence: 2 2 2 2 2 2 2 2 + len = fixedRunLength - 1; + fixedRunLength = 0; + } else { + // ex. sequence: 4 6 8 10 12 14 16 + len = variableRunLength - 1; + variableRunLength = 0; + } + } else { + // fixed width 0 is used for long repeating values. + // sequences that require only 1 bit to encode will have an additional bit + if (fb == 1) { + fb = 2; + } + efb = encodeBitWidth(fb) << 1; + len = variableRunLength - 1; + variableRunLength = 0; + } + + // extract the 9th bit of run length + const uint32_t tailBits = (len & 0x100) >> 8; + + // create first byte of the header + const char headerFirstByte = static_cast<char>(getOpCode(DELTA) | efb | tailBits); + + // second byte of the header stores the remaining 8 bits of runlength + const char headerSecondByte = static_cast<char>(len & 0xff); + + // write header + writeByte(headerFirstByte); + writeByte(headerSecondByte); + + // store the first value from zigzag literal array + if (isSigned) { + writeVslong(literals[0]); + } else { + writeVulong(literals[0]); + } + + if (option.isFixedDelta) { + // if delta is fixed then we don't need to store delta blob + writeVslong(option.fixedDelta); + } else { + // store the first value as delta value using zigzag encoding + writeVslong(adjDeltas[0]); + + // adjacent delta values are bit packed. The length of adjDeltas array is + // always one less than the number of literals (delta difference for n + // elements is n-1). 
We have already written one element, write the + // remaining numLiterals - 2 elements here + writeInts(adjDeltas, 1, numLiterals - 2, fb); + } +} + +void RleEncoderV2::writeInts(int64_t* input, uint32_t offset, size_t len, uint32_t bitSize) { + if(input == nullptr || len < 1 || bitSize < 1) { + return; + } + + if (getClosestAlignedFixedBits(bitSize) == bitSize) { + uint32_t numBytes; + uint32_t endOffSet = static_cast<uint32_t>(offset + len); if (bitSize < 8 ) { - char bitMask = static_cast<char>((1 << bitSize) - 1); - uint32_t numHops = 8 / bitSize; - uint32_t remainder = static_cast<uint32_t>(len % numHops); - uint32_t endUnroll = endOffSet - remainder; - for (uint32_t i = offset; i < endUnroll; i+=numHops) { - char toWrite = 0; - for (uint32_t j = 0; j < numHops; ++j) { - toWrite |= static_cast<char>((input[i+j] & bitMask) << (8 - (j + 1) * bitSize)); - } - writeByte(toWrite); - } - - if (remainder > 0) { - uint32_t startShift = 8 - bitSize; - char toWrite = 0; - for (uint32_t i = endUnroll; i < endOffSet; ++i) { - toWrite |= static_cast<char>((input[i] & bitMask) << startShift); - startShift -= bitSize; - } - writeByte(toWrite); - } - - } else { - numBytes = bitSize / 8; - - for (uint32_t i = offset; i < endOffSet; ++i) { - for (uint32_t j = 0; j < numBytes; ++j) { - char toWrite = static_cast<char>((input[i] >> (8 * (numBytes - j - 1))) & 255); - writeByte(toWrite); - } - } - } - - return; - } - - // write for unaligned bit size - uint32_t bitsLeft = 8; - char current = 0; - for(uint32_t i = offset; i < (offset + len); i++) { - int64_t value = input[i]; - uint32_t bitsToWrite = bitSize; - while (bitsToWrite > bitsLeft) { - // add the bits to the bottom of the current word - current |= static_cast<char>(value >> (bitsToWrite - bitsLeft)); - // subtract out the bits we just added - bitsToWrite -= bitsLeft; - // zero out the bits above bitsToWrite - value &= (static_cast<uint64_t>(1) << bitsToWrite) - 1; - writeByte(current); - current = 0; - bitsLeft = 8; - } - bitsLeft -= bitsToWrite; - current |= static_cast<char>(value << bitsLeft); - if (bitsLeft == 0) { - writeByte(current); - current = 0; - bitsLeft = 8; - } - } - - // flush - if (bitsLeft != 8) { - writeByte(current); - } -} - -void RleEncoderV2::initializeLiterals(int64_t val) { - literals[numLiterals++] = val; - fixedRunLength = 1; - variableRunLength = 1; -} -} + char bitMask = static_cast<char>((1 << bitSize) - 1); + uint32_t numHops = 8 / bitSize; + uint32_t remainder = static_cast<uint32_t>(len % numHops); + uint32_t endUnroll = endOffSet - remainder; + for (uint32_t i = offset; i < endUnroll; i+=numHops) { + char toWrite = 0; + for (uint32_t j = 0; j < numHops; ++j) { + toWrite |= static_cast<char>((input[i+j] & bitMask) << (8 - (j + 1) * bitSize)); + } + writeByte(toWrite); + } + + if (remainder > 0) { + uint32_t startShift = 8 - bitSize; + char toWrite = 0; + for (uint32_t i = endUnroll; i < endOffSet; ++i) { + toWrite |= static_cast<char>((input[i] & bitMask) << startShift); + startShift -= bitSize; + } + writeByte(toWrite); + } + + } else { + numBytes = bitSize / 8; + + for (uint32_t i = offset; i < endOffSet; ++i) { + for (uint32_t j = 0; j < numBytes; ++j) { + char toWrite = static_cast<char>((input[i] >> (8 * (numBytes - j - 1))) & 255); + writeByte(toWrite); + } + } + } + + return; + } + + // write for unaligned bit size + uint32_t bitsLeft = 8; + char current = 0; + for(uint32_t i = offset; i < (offset + len); i++) { + int64_t value = input[i]; + uint32_t bitsToWrite = bitSize; + while (bitsToWrite > bitsLeft) { 
+ // add the bits to the bottom of the current word + current |= static_cast<char>(value >> (bitsToWrite - bitsLeft)); + // subtract out the bits we just added + bitsToWrite -= bitsLeft; + // zero out the bits above bitsToWrite + value &= (static_cast<uint64_t>(1) << bitsToWrite) - 1; + writeByte(current); + current = 0; + bitsLeft = 8; + } + bitsLeft -= bitsToWrite; + current |= static_cast<char>(value << bitsLeft); + if (bitsLeft == 0) { + writeByte(current); + current = 0; + bitsLeft = 8; + } + } + + // flush + if (bitsLeft != 8) { + writeByte(current); + } +} + +void RleEncoderV2::initializeLiterals(int64_t val) { + literals[numLiterals++] = val; + fixedRunLength = 1; + variableRunLength = 1; +} +} diff --git a/contrib/libs/apache/orc/c++/src/Statistics.cc b/contrib/libs/apache/orc/c++/src/Statistics.cc index 2401f5e0cb..f13381b5b0 100644 --- a/contrib/libs/apache/orc/c++/src/Statistics.cc +++ b/contrib/libs/apache/orc/c++/src/Statistics.cc @@ -1,408 +1,408 @@ - /** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "orc/Exceptions.hh" -#include "RLE.hh" -#include "Statistics.hh" - -#include "wrap/coded-stream-wrapper.h" - -namespace orc { - - ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, - const StatContext& statContext) { - if (s.has_intstatistics()) { - return new IntegerColumnStatisticsImpl(s); - } else if (s.has_doublestatistics()) { - return new DoubleColumnStatisticsImpl(s); - } else if (s.has_stringstatistics()) { - return new StringColumnStatisticsImpl(s, statContext); - } else if (s.has_bucketstatistics()) { - return new BooleanColumnStatisticsImpl(s, statContext); - } else if (s.has_decimalstatistics()) { - return new DecimalColumnStatisticsImpl(s, statContext); - } else if (s.has_timestampstatistics()) { - return new TimestampColumnStatisticsImpl(s, statContext); - } else if (s.has_datestatistics()) { - return new DateColumnStatisticsImpl(s, statContext); - } else if (s.has_binarystatistics()) { - return new BinaryColumnStatisticsImpl(s, statContext); - } else { - return new ColumnStatisticsImpl(s); - } - } - - StatisticsImpl::StatisticsImpl(const proto::StripeStatistics& stripeStats, - const StatContext& statContext) { - for(int i = 0; i < stripeStats.colstats_size(); i++) { - colStats.push_back( - convertColumnStatistics(stripeStats.colstats(i), statContext)); - } - } - - StatisticsImpl::StatisticsImpl(const proto::Footer& footer, - const StatContext& statContext) { - for(int i = 0; i < footer.statistics_size(); i++) { - colStats.push_back( - convertColumnStatistics(footer.statistics(i), statContext)); - } - } - - StatisticsImpl::~StatisticsImpl() { - for(std::vector<ColumnStatistics*>::iterator ptr = colStats.begin(); - ptr != colStats.end(); - ++ptr) { - delete *ptr; - } - } - - Statistics::~Statistics() { - // PASS - } - - StripeStatistics::~StripeStatistics() { - // PASS - } - - StripeStatisticsImpl::~StripeStatisticsImpl() { - // PASS - } - - StripeStatisticsImpl::StripeStatisticsImpl( - const proto::StripeStatistics& stripeStats, - std::vector<std::vector<proto::ColumnStatistics> >& indexStats, - const StatContext& statContext) { - columnStats.reset(new StatisticsImpl(stripeStats, statContext)); - rowIndexStats.resize(indexStats.size()); - for(size_t i = 0; i < rowIndexStats.size(); i++) { - for(size_t j = 0; j < indexStats[i].size(); j++) { - rowIndexStats[i].push_back( - std::shared_ptr<const ColumnStatistics>( - convertColumnStatistics(indexStats[i][j], statContext))); - } - } - } - - - ColumnStatistics::~ColumnStatistics() { - // PASS - } - - BinaryColumnStatistics::~BinaryColumnStatistics() { - // PASS - } - - BooleanColumnStatistics::~BooleanColumnStatistics() { - // PASS - } - - DateColumnStatistics::~DateColumnStatistics() { - // PASS - } - - DecimalColumnStatistics::~DecimalColumnStatistics() { - // PASS - } - - DoubleColumnStatistics::~DoubleColumnStatistics() { - // PASS - } - - IntegerColumnStatistics::~IntegerColumnStatistics() { - // PASS - } - - StringColumnStatistics::~StringColumnStatistics() { - // PASS - } - - TimestampColumnStatistics::~TimestampColumnStatistics() { - // PASS - } - - MutableColumnStatistics::~MutableColumnStatistics() { - // PASS - } - - ColumnStatisticsImpl::~ColumnStatisticsImpl() { - // PASS - } - - BinaryColumnStatisticsImpl::~BinaryColumnStatisticsImpl() { - // PASS - } - - BooleanColumnStatisticsImpl::~BooleanColumnStatisticsImpl() { - // PASS - } - - DateColumnStatisticsImpl::~DateColumnStatisticsImpl() { - // PASS - } - - DecimalColumnStatisticsImpl::~DecimalColumnStatisticsImpl() { - // 
PASS - } - - DoubleColumnStatisticsImpl::~DoubleColumnStatisticsImpl() { - // PASS - } - - IntegerColumnStatisticsImpl::~IntegerColumnStatisticsImpl() { - // PASS - } - - StringColumnStatisticsImpl::~StringColumnStatisticsImpl() { - // PASS - } - - TimestampColumnStatisticsImpl::~TimestampColumnStatisticsImpl() { - // PASS - } - - ColumnStatisticsImpl::ColumnStatisticsImpl - (const proto::ColumnStatistics& pb) { - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - } - - BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (pb.has_binarystatistics() && statContext.correctStats) { - _stats.setHasTotalLength(pb.binarystatistics().has_sum()); - _stats.setTotalLength( - static_cast<uint64_t>(pb.binarystatistics().sum())); - } - } - - BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (pb.has_bucketstatistics() && statContext.correctStats) { - _hasCount = true; - _trueCount = pb.bucketstatistics().count(0); - } else { - _hasCount = false; - _trueCount = 0; - } - } - - DateColumnStatisticsImpl::DateColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_datestatistics() || !statContext.correctStats) { - // hasMinimum_ is false by default; - // hasMaximum_ is false by default; - _stats.setMinimum(0); - _stats.setMaximum(0); - } else { - _stats.setHasMinimum(pb.datestatistics().has_minimum()); - _stats.setHasMaximum(pb.datestatistics().has_maximum()); - _stats.setMinimum(pb.datestatistics().minimum()); - _stats.setMaximum(pb.datestatistics().maximum()); - } - } - - DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (pb.has_decimalstatistics() && statContext.correctStats) { - const proto::DecimalStatistics& stats = pb.decimalstatistics(); - _stats.setHasMinimum(stats.has_minimum()); - _stats.setHasMaximum(stats.has_maximum()); - _stats.setHasSum(stats.has_sum()); - - _stats.setMinimum(Decimal(stats.minimum())); - _stats.setMaximum(Decimal(stats.maximum())); - _stats.setSum(Decimal(stats.sum())); - } - } - - DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl - (const proto::ColumnStatistics& pb){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_doublestatistics()) { - _stats.setMinimum(0); - _stats.setMaximum(0); - _stats.setSum(0); - }else{ - const proto::DoubleStatistics& stats = pb.doublestatistics(); - _stats.setHasMinimum(stats.has_minimum()); - _stats.setHasMaximum(stats.has_maximum()); - _stats.setHasSum(stats.has_sum()); - - _stats.setMinimum(stats.minimum()); - _stats.setMaximum(stats.maximum()); - _stats.setSum(stats.sum()); - } - } - - IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl - (const proto::ColumnStatistics& pb){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_intstatistics()) { - _stats.setMinimum(0); - _stats.setMaximum(0); - _stats.setSum(0); - }else{ - const proto::IntegerStatistics& stats = pb.intstatistics(); - 
_stats.setHasMinimum(stats.has_minimum()); - _stats.setHasMaximum(stats.has_maximum()); - _stats.setHasSum(stats.has_sum()); - - _stats.setMinimum(stats.minimum()); - _stats.setMaximum(stats.maximum()); - _stats.setSum(stats.sum()); - } - } - - StringColumnStatisticsImpl::StringColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext){ - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_stringstatistics() || !statContext.correctStats) { - _stats.setTotalLength(0); - }else{ - const proto::StringStatistics& stats = pb.stringstatistics(); - _stats.setHasMinimum(stats.has_minimum()); - _stats.setHasMaximum(stats.has_maximum()); - _stats.setHasTotalLength(stats.has_sum()); - - _stats.setMinimum(stats.minimum()); - _stats.setMaximum(stats.maximum()); - _stats.setTotalLength(static_cast<uint64_t>(stats.sum())); - } - } - - TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl - (const proto::ColumnStatistics& pb, const StatContext& statContext) { - _stats.setNumberOfValues(pb.numberofvalues()); - _stats.setHasNull(pb.hasnull()); - if (!pb.has_timestampstatistics() || !statContext.correctStats) { - _stats.setMinimum(0); - _stats.setMaximum(0); - _lowerBound = 0; - _upperBound = 0; - }else{ - const proto::TimestampStatistics& stats = pb.timestampstatistics(); - _stats.setHasMinimum( - stats.has_minimumutc() || - (stats.has_minimum() && (statContext.writerTimezone != nullptr))); - _stats.setHasMaximum( - stats.has_maximumutc() || - (stats.has_maximum() && (statContext.writerTimezone != nullptr))); - _hasLowerBound = stats.has_minimumutc() || stats.has_minimum(); - _hasUpperBound = stats.has_maximumutc() || stats.has_maximum(); - - // Timestamp stats are stored in milliseconds - if (stats.has_minimumutc()) { - int64_t minimum = stats.minimumutc(); - _stats.setMinimum(minimum); - _lowerBound = minimum; - } else if (statContext.writerTimezone) { - int64_t writerTimeSec = stats.minimum() / 1000; - // multiply the offset by 1000 to convert to millisecond - int64_t minimum = - stats.minimum() + - (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) - * 1000; - _stats.setMinimum(minimum); - _lowerBound = minimum; - } else { - _stats.setMinimum(0); - // subtract 1 day 1 hour (25 hours) in milliseconds to handle unknown - // TZ and daylight savings - _lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000); - } - - // Timestamp stats are stored in milliseconds - if (stats.has_maximumutc()) { - int64_t maximum = stats.maximumutc(); - _stats.setMaximum(maximum); - _upperBound = maximum; - } else if (statContext.writerTimezone) { - int64_t writerTimeSec = stats.maximum() / 1000; - // multiply the offset by 1000 to convert to millisecond - int64_t maximum = stats.maximum() + - (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) - * 1000; - _stats.setMaximum(maximum); - _upperBound = maximum; - } else { - _stats.setMaximum(0); - // add 1 day 1 hour (25 hours) in milliseconds to handle unknown - // TZ and daylight savings - _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000); - } - // Add 1 millisecond to account for microsecond precision of values - _upperBound += 1; - } - } - - std::unique_ptr<MutableColumnStatistics> createColumnStatistics( - const Type& type) { - switch (static_cast<int64_t>(type.getKind())) { - case BOOLEAN: - return std::unique_ptr<MutableColumnStatistics>( - new BooleanColumnStatisticsImpl()); - case BYTE: - case INT: - case LONG: - case SHORT: - return 
std::unique_ptr<MutableColumnStatistics>( - new IntegerColumnStatisticsImpl()); - case STRUCT: - case MAP: - case LIST: - case UNION: - return std::unique_ptr<MutableColumnStatistics>( - new ColumnStatisticsImpl()); - case FLOAT: - case DOUBLE: - return std::unique_ptr<MutableColumnStatistics>( - new DoubleColumnStatisticsImpl()); - case BINARY: - return std::unique_ptr<MutableColumnStatistics>( - new BinaryColumnStatisticsImpl()); - case STRING: - case CHAR: - case VARCHAR: - return std::unique_ptr<MutableColumnStatistics>( - new StringColumnStatisticsImpl()); - case DATE: - return std::unique_ptr<MutableColumnStatistics>( - new DateColumnStatisticsImpl()); - case TIMESTAMP: - return std::unique_ptr<MutableColumnStatistics>( - new TimestampColumnStatisticsImpl()); - case DECIMAL: - return std::unique_ptr<MutableColumnStatistics>( - new DecimalColumnStatisticsImpl()); - default: - throw NotImplementedYet("Not supported type: " + type.toString()); - } - } - -}// namespace + /** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Exceptions.hh" +#include "RLE.hh" +#include "Statistics.hh" + +#include "wrap/coded-stream-wrapper.h" + +namespace orc { + + ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, + const StatContext& statContext) { + if (s.has_intstatistics()) { + return new IntegerColumnStatisticsImpl(s); + } else if (s.has_doublestatistics()) { + return new DoubleColumnStatisticsImpl(s); + } else if (s.has_stringstatistics()) { + return new StringColumnStatisticsImpl(s, statContext); + } else if (s.has_bucketstatistics()) { + return new BooleanColumnStatisticsImpl(s, statContext); + } else if (s.has_decimalstatistics()) { + return new DecimalColumnStatisticsImpl(s, statContext); + } else if (s.has_timestampstatistics()) { + return new TimestampColumnStatisticsImpl(s, statContext); + } else if (s.has_datestatistics()) { + return new DateColumnStatisticsImpl(s, statContext); + } else if (s.has_binarystatistics()) { + return new BinaryColumnStatisticsImpl(s, statContext); + } else { + return new ColumnStatisticsImpl(s); + } + } + + StatisticsImpl::StatisticsImpl(const proto::StripeStatistics& stripeStats, + const StatContext& statContext) { + for(int i = 0; i < stripeStats.colstats_size(); i++) { + colStats.push_back( + convertColumnStatistics(stripeStats.colstats(i), statContext)); + } + } + + StatisticsImpl::StatisticsImpl(const proto::Footer& footer, + const StatContext& statContext) { + for(int i = 0; i < footer.statistics_size(); i++) { + colStats.push_back( + convertColumnStatistics(footer.statistics(i), statContext)); + } + } + + StatisticsImpl::~StatisticsImpl() { + for(std::vector<ColumnStatistics*>::iterator ptr = colStats.begin(); + ptr != colStats.end(); + ++ptr) { + delete *ptr; + } + } + + Statistics::~Statistics() { + // PASS + } + + StripeStatistics::~StripeStatistics() { + // PASS + } + + StripeStatisticsImpl::~StripeStatisticsImpl() { + // PASS + } + + StripeStatisticsImpl::StripeStatisticsImpl( + const proto::StripeStatistics& stripeStats, + std::vector<std::vector<proto::ColumnStatistics> >& indexStats, + const StatContext& statContext) { + columnStats.reset(new StatisticsImpl(stripeStats, statContext)); + rowIndexStats.resize(indexStats.size()); + for(size_t i = 0; i < rowIndexStats.size(); i++) { + for(size_t j = 0; j < indexStats[i].size(); j++) { + rowIndexStats[i].push_back( + std::shared_ptr<const ColumnStatistics>( + convertColumnStatistics(indexStats[i][j], statContext))); + } + } + } + + + ColumnStatistics::~ColumnStatistics() { + // PASS + } + + BinaryColumnStatistics::~BinaryColumnStatistics() { + // PASS + } + + BooleanColumnStatistics::~BooleanColumnStatistics() { + // PASS + } + + DateColumnStatistics::~DateColumnStatistics() { + // PASS + } + + DecimalColumnStatistics::~DecimalColumnStatistics() { + // PASS + } + + DoubleColumnStatistics::~DoubleColumnStatistics() { + // PASS + } + + IntegerColumnStatistics::~IntegerColumnStatistics() { + // PASS + } + + StringColumnStatistics::~StringColumnStatistics() { + // PASS + } + + TimestampColumnStatistics::~TimestampColumnStatistics() { + // PASS + } + + MutableColumnStatistics::~MutableColumnStatistics() { + // PASS + } + + ColumnStatisticsImpl::~ColumnStatisticsImpl() { + // PASS + } + + BinaryColumnStatisticsImpl::~BinaryColumnStatisticsImpl() { + // PASS + } + + BooleanColumnStatisticsImpl::~BooleanColumnStatisticsImpl() { + // PASS + } + + DateColumnStatisticsImpl::~DateColumnStatisticsImpl() { + // PASS + } + + DecimalColumnStatisticsImpl::~DecimalColumnStatisticsImpl() { + // 
PASS + } + + DoubleColumnStatisticsImpl::~DoubleColumnStatisticsImpl() { + // PASS + } + + IntegerColumnStatisticsImpl::~IntegerColumnStatisticsImpl() { + // PASS + } + + StringColumnStatisticsImpl::~StringColumnStatisticsImpl() { + // PASS + } + + TimestampColumnStatisticsImpl::~TimestampColumnStatisticsImpl() { + // PASS + } + + ColumnStatisticsImpl::ColumnStatisticsImpl + (const proto::ColumnStatistics& pb) { + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + } + + BinaryColumnStatisticsImpl::BinaryColumnStatisticsImpl + (const proto::ColumnStatistics& pb, const StatContext& statContext){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (pb.has_binarystatistics() && statContext.correctStats) { + _stats.setHasTotalLength(pb.binarystatistics().has_sum()); + _stats.setTotalLength( + static_cast<uint64_t>(pb.binarystatistics().sum())); + } + } + + BooleanColumnStatisticsImpl::BooleanColumnStatisticsImpl + (const proto::ColumnStatistics& pb, const StatContext& statContext){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (pb.has_bucketstatistics() && statContext.correctStats) { + _hasCount = true; + _trueCount = pb.bucketstatistics().count(0); + } else { + _hasCount = false; + _trueCount = 0; + } + } + + DateColumnStatisticsImpl::DateColumnStatisticsImpl + (const proto::ColumnStatistics& pb, const StatContext& statContext){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (!pb.has_datestatistics() || !statContext.correctStats) { + // hasMinimum_ is false by default; + // hasMaximum_ is false by default; + _stats.setMinimum(0); + _stats.setMaximum(0); + } else { + _stats.setHasMinimum(pb.datestatistics().has_minimum()); + _stats.setHasMaximum(pb.datestatistics().has_maximum()); + _stats.setMinimum(pb.datestatistics().minimum()); + _stats.setMaximum(pb.datestatistics().maximum()); + } + } + + DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl + (const proto::ColumnStatistics& pb, const StatContext& statContext){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (pb.has_decimalstatistics() && statContext.correctStats) { + const proto::DecimalStatistics& stats = pb.decimalstatistics(); + _stats.setHasMinimum(stats.has_minimum()); + _stats.setHasMaximum(stats.has_maximum()); + _stats.setHasSum(stats.has_sum()); + + _stats.setMinimum(Decimal(stats.minimum())); + _stats.setMaximum(Decimal(stats.maximum())); + _stats.setSum(Decimal(stats.sum())); + } + } + + DoubleColumnStatisticsImpl::DoubleColumnStatisticsImpl + (const proto::ColumnStatistics& pb){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (!pb.has_doublestatistics()) { + _stats.setMinimum(0); + _stats.setMaximum(0); + _stats.setSum(0); + }else{ + const proto::DoubleStatistics& stats = pb.doublestatistics(); + _stats.setHasMinimum(stats.has_minimum()); + _stats.setHasMaximum(stats.has_maximum()); + _stats.setHasSum(stats.has_sum()); + + _stats.setMinimum(stats.minimum()); + _stats.setMaximum(stats.maximum()); + _stats.setSum(stats.sum()); + } + } + + IntegerColumnStatisticsImpl::IntegerColumnStatisticsImpl + (const proto::ColumnStatistics& pb){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (!pb.has_intstatistics()) { + _stats.setMinimum(0); + _stats.setMaximum(0); + _stats.setSum(0); + }else{ + const proto::IntegerStatistics& stats = pb.intstatistics(); + 
_stats.setHasMinimum(stats.has_minimum()); + _stats.setHasMaximum(stats.has_maximum()); + _stats.setHasSum(stats.has_sum()); + + _stats.setMinimum(stats.minimum()); + _stats.setMaximum(stats.maximum()); + _stats.setSum(stats.sum()); + } + } + + StringColumnStatisticsImpl::StringColumnStatisticsImpl + (const proto::ColumnStatistics& pb, const StatContext& statContext){ + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (!pb.has_stringstatistics() || !statContext.correctStats) { + _stats.setTotalLength(0); + }else{ + const proto::StringStatistics& stats = pb.stringstatistics(); + _stats.setHasMinimum(stats.has_minimum()); + _stats.setHasMaximum(stats.has_maximum()); + _stats.setHasTotalLength(stats.has_sum()); + + _stats.setMinimum(stats.minimum()); + _stats.setMaximum(stats.maximum()); + _stats.setTotalLength(static_cast<uint64_t>(stats.sum())); + } + } + + TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl + (const proto::ColumnStatistics& pb, const StatContext& statContext) { + _stats.setNumberOfValues(pb.numberofvalues()); + _stats.setHasNull(pb.hasnull()); + if (!pb.has_timestampstatistics() || !statContext.correctStats) { + _stats.setMinimum(0); + _stats.setMaximum(0); + _lowerBound = 0; + _upperBound = 0; + }else{ + const proto::TimestampStatistics& stats = pb.timestampstatistics(); + _stats.setHasMinimum( + stats.has_minimumutc() || + (stats.has_minimum() && (statContext.writerTimezone != nullptr))); + _stats.setHasMaximum( + stats.has_maximumutc() || + (stats.has_maximum() && (statContext.writerTimezone != nullptr))); + _hasLowerBound = stats.has_minimumutc() || stats.has_minimum(); + _hasUpperBound = stats.has_maximumutc() || stats.has_maximum(); + + // Timestamp stats are stored in milliseconds + if (stats.has_minimumutc()) { + int64_t minimum = stats.minimumutc(); + _stats.setMinimum(minimum); + _lowerBound = minimum; + } else if (statContext.writerTimezone) { + int64_t writerTimeSec = stats.minimum() / 1000; + // multiply the offset by 1000 to convert to millisecond + int64_t minimum = + stats.minimum() + + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) + * 1000; + _stats.setMinimum(minimum); + _lowerBound = minimum; + } else { + _stats.setMinimum(0); + // subtract 1 day 1 hour (25 hours) in milliseconds to handle unknown + // TZ and daylight savings + _lowerBound = stats.minimum() - (25 * SECONDS_PER_HOUR * 1000); + } + + // Timestamp stats are stored in milliseconds + if (stats.has_maximumutc()) { + int64_t maximum = stats.maximumutc(); + _stats.setMaximum(maximum); + _upperBound = maximum; + } else if (statContext.writerTimezone) { + int64_t writerTimeSec = stats.maximum() / 1000; + // multiply the offset by 1000 to convert to millisecond + int64_t maximum = stats.maximum() + + (statContext.writerTimezone->getVariant(writerTimeSec).gmtOffset) + * 1000; + _stats.setMaximum(maximum); + _upperBound = maximum; + } else { + _stats.setMaximum(0); + // add 1 day 1 hour (25 hours) in milliseconds to handle unknown + // TZ and daylight savings + _upperBound = stats.maximum() + (25 * SECONDS_PER_HOUR * 1000); + } + // Add 1 millisecond to account for microsecond precision of values + _upperBound += 1; + } + } + + std::unique_ptr<MutableColumnStatistics> createColumnStatistics( + const Type& type) { + switch (static_cast<int64_t>(type.getKind())) { + case BOOLEAN: + return std::unique_ptr<MutableColumnStatistics>( + new BooleanColumnStatisticsImpl()); + case BYTE: + case INT: + case LONG: + case SHORT: + return 
std::unique_ptr<MutableColumnStatistics>( + new IntegerColumnStatisticsImpl()); + case STRUCT: + case MAP: + case LIST: + case UNION: + return std::unique_ptr<MutableColumnStatistics>( + new ColumnStatisticsImpl()); + case FLOAT: + case DOUBLE: + return std::unique_ptr<MutableColumnStatistics>( + new DoubleColumnStatisticsImpl()); + case BINARY: + return std::unique_ptr<MutableColumnStatistics>( + new BinaryColumnStatisticsImpl()); + case STRING: + case CHAR: + case VARCHAR: + return std::unique_ptr<MutableColumnStatistics>( + new StringColumnStatisticsImpl()); + case DATE: + return std::unique_ptr<MutableColumnStatistics>( + new DateColumnStatisticsImpl()); + case TIMESTAMP: + return std::unique_ptr<MutableColumnStatistics>( + new TimestampColumnStatisticsImpl()); + case DECIMAL: + return std::unique_ptr<MutableColumnStatistics>( + new DecimalColumnStatisticsImpl()); + default: + throw NotImplementedYet("Not supported type: " + type.toString()); + } + } + +}// namespace diff --git a/contrib/libs/apache/orc/c++/src/Statistics.hh b/contrib/libs/apache/orc/c++/src/Statistics.hh index ee9db23f86..849019d8d7 100644 --- a/contrib/libs/apache/orc/c++/src/Statistics.hh +++ b/contrib/libs/apache/orc/c++/src/Statistics.hh @@ -1,971 +1,971 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_STATISTICS_IMPL_HH -#define ORC_STATISTICS_IMPL_HH - -#include "orc/Common.hh" -#include "orc/Int128.hh" -#include "orc/OrcFile.hh" -#include "orc/Reader.hh" - -#include "Timezone.hh" -#include "TypeImpl.hh" - -namespace orc { - -/** - * StatContext contains fields required to compute statistics - */ - - struct StatContext { - const bool correctStats; - const Timezone* const writerTimezone; - StatContext() : correctStats(false), writerTimezone(nullptr) {} - StatContext(bool cStat, const Timezone* const timezone = nullptr) : - correctStats(cStat), writerTimezone(timezone) {} - }; - -/** - * Internal Statistics Implementation - */ - - template <typename T> - class InternalStatisticsImpl { - private: - bool _hasNull; - bool _hasMinimum; - bool _hasMaximum; - bool _hasSum; - bool _hasTotalLength; - uint64_t _totalLength; - uint64_t _valueCount; - T _minimum; - T _maximum; - T _sum; - public: - InternalStatisticsImpl() { - _hasNull = false; - _hasMinimum = false; - _hasMaximum = false; - _hasSum = false; - _hasTotalLength = false; - _totalLength = 0; - _valueCount = 0; - } - - ~InternalStatisticsImpl() {} - - // GET / SET _totalLength - bool hasTotalLength() const { return _hasTotalLength; } - - void setHasTotalLength(bool hasTotalLength) { - _hasTotalLength = hasTotalLength; - } - - uint64_t getTotalLength() const { return _totalLength; } - - void setTotalLength(uint64_t totalLength) { _totalLength = totalLength; } - - // GET / SET _sum - bool hasSum() const { return _hasSum; } - - void setHasSum(bool hasSum) { _hasSum = hasSum; } - - T getSum() const { return _sum; } - - void setSum(T sum) { _sum = sum; } - - // GET / SET _maximum - bool hasMaximum() const { return _hasMaximum; } - +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_STATISTICS_IMPL_HH +#define ORC_STATISTICS_IMPL_HH + +#include "orc/Common.hh" +#include "orc/Int128.hh" +#include "orc/OrcFile.hh" +#include "orc/Reader.hh" + +#include "Timezone.hh" +#include "TypeImpl.hh" + +namespace orc { + +/** + * StatContext contains fields required to compute statistics + */ + + struct StatContext { + const bool correctStats; + const Timezone* const writerTimezone; + StatContext() : correctStats(false), writerTimezone(nullptr) {} + StatContext(bool cStat, const Timezone* const timezone = nullptr) : + correctStats(cStat), writerTimezone(timezone) {} + }; + +/** + * Internal Statistics Implementation + */ + + template <typename T> + class InternalStatisticsImpl { + private: + bool _hasNull; + bool _hasMinimum; + bool _hasMaximum; + bool _hasSum; + bool _hasTotalLength; + uint64_t _totalLength; + uint64_t _valueCount; + T _minimum; + T _maximum; + T _sum; + public: + InternalStatisticsImpl() { + _hasNull = false; + _hasMinimum = false; + _hasMaximum = false; + _hasSum = false; + _hasTotalLength = false; + _totalLength = 0; + _valueCount = 0; + } + + ~InternalStatisticsImpl() {} + + // GET / SET _totalLength + bool hasTotalLength() const { return _hasTotalLength; } + + void setHasTotalLength(bool hasTotalLength) { + _hasTotalLength = hasTotalLength; + } + + uint64_t getTotalLength() const { return _totalLength; } + + void setTotalLength(uint64_t totalLength) { _totalLength = totalLength; } + + // GET / SET _sum + bool hasSum() const { return _hasSum; } + + void setHasSum(bool hasSum) { _hasSum = hasSum; } + + T getSum() const { return _sum; } + + void setSum(T sum) { _sum = sum; } + + // GET / SET _maximum + bool hasMaximum() const { return _hasMaximum; } + const T & getMaximum() const { return _maximum; } - - void setHasMaximum(bool hasMax) { _hasMaximum = hasMax; } - - void setMaximum(T max) { _maximum = max; } - - // GET / SET _minimum - bool hasMinimum() const { return _hasMinimum; } - - void setHasMinimum(bool hasMin) { _hasMinimum = hasMin; } - + + void setHasMaximum(bool hasMax) { _hasMaximum = hasMax; } + + void setMaximum(T max) { _maximum = max; } + + // GET / SET _minimum + bool hasMinimum() const { return _hasMinimum; } + + void setHasMinimum(bool hasMin) { _hasMinimum = hasMin; } + const T & getMinimum() const { return _minimum; } - - void setMinimum(T min) { _minimum = min; } - - // GET / SET _valueCount - uint64_t getNumberOfValues() const { return _valueCount; } - - void setNumberOfValues(uint64_t numValues) { _valueCount = numValues; } - - // GET / SET _hasNullValue - bool hasNull() const { return _hasNull; } - - void setHasNull(bool hasNull) { _hasNull = hasNull; } - - void reset() { - _hasNull = false; - _hasMinimum = false; - _hasMaximum = false; - _hasSum = false; - _hasTotalLength = false; - _totalLength = 0; - _valueCount = 0; - } - - void updateMinMax(T value) { - if (!_hasMinimum) { - _hasMinimum = _hasMaximum = true; - _minimum = _maximum = value; - } else if (compare(value, _minimum)) { - _minimum = value; - } else if (compare(_maximum, value)) { - _maximum = value; - } - } - - // sum is not merged here as we need to check overflow - void merge(const InternalStatisticsImpl& other) { - _hasNull = _hasNull || other._hasNull; - _valueCount += other._valueCount; - - if (other._hasMinimum) { - if (!_hasMinimum) { - _hasMinimum = _hasMaximum = true; - _minimum = other._minimum; - _maximum = other._maximum; - } else { - // all template types should support operator< - if (compare(_maximum, other._maximum)) { - _maximum = 
other._maximum; - } - if (compare(other._minimum, _minimum)) { - _minimum = other._minimum; - } - } - } - - _hasTotalLength = _hasTotalLength && other._hasTotalLength; - _totalLength += other._totalLength; - } - }; - - typedef InternalStatisticsImpl<char> InternalCharStatistics; - typedef InternalStatisticsImpl<char> InternalBooleanStatistics; - typedef InternalStatisticsImpl<int64_t> InternalIntegerStatistics; - typedef InternalStatisticsImpl<int32_t> InternalDateStatistics; - typedef InternalStatisticsImpl<double> InternalDoubleStatistics; - typedef InternalStatisticsImpl<Decimal> InternalDecimalStatistics; - typedef InternalStatisticsImpl<std::string> InternalStringStatistics; - - /** - * Mutable column statistics for use by the writer. - */ - class MutableColumnStatistics { - public: - virtual ~MutableColumnStatistics(); - - virtual void increase(uint64_t count) = 0; - - virtual void setNumberOfValues(uint64_t value) = 0; - - virtual void setHasNull(bool hasNull) = 0; - - virtual void merge(const MutableColumnStatistics& other) = 0; - - virtual void reset() = 0; - - virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const = 0; - }; - -/** - * ColumnStatistics Implementation - */ - - class ColumnStatisticsImpl: public ColumnStatistics, - public MutableColumnStatistics { - private: - InternalCharStatistics _stats; - public: - ColumnStatisticsImpl() { reset(); } - ColumnStatisticsImpl(const proto::ColumnStatistics& stats); - virtual ~ColumnStatisticsImpl() override; - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - void merge(const MutableColumnStatistics& other) override { - _stats.merge(dynamic_cast<const ColumnStatisticsImpl&>(other)._stats); - } - - void reset() override { - _stats.reset(); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Column has " << getNumberOfValues() << " values" - << " and has null value: " << (hasNull() ? 
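A small aside on the merge() contract above: each implementation downcasts its argument with a reference dynamic_cast, so merging statistics of different column types fails loudly with std::bad_cast instead of silently mixing fields. A hedged standalone sketch of that behaviour (the type names below are invented for illustration):

#include <iostream>
#include <typeinfo>

struct ColumnStatsBase { virtual ~ColumnStatsBase() = default; };
struct IntStats  : ColumnStatsBase { /* integer fields */ };
struct DateStats : ColumnStatsBase { /* date fields */ };

// Mirrors the pattern above: a reference dynamic_cast either yields the
// expected concrete type or throws std::bad_cast.
void mergeInto(IntStats& /*target*/, const ColumnStatsBase& other) {
  const IntStats& typed = dynamic_cast<const IntStats&>(other);
  (void)typed;  // real implementations fold min/max/sum here
}

int main() {
  IntStats a;
  DateStats b;
  try {
    mergeInto(a, b);
  } catch (const std::bad_cast&) {
    std::cout << "cannot merge statistics of different column types\n";
  }
}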
"yes" : "no") - << std::endl; - return buffer.str(); - } - }; - - class BinaryColumnStatisticsImpl: public BinaryColumnStatistics, - public MutableColumnStatistics { - private: - InternalCharStatistics _stats; - public: - BinaryColumnStatisticsImpl() { reset(); } - BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats, - const StatContext& statContext); - virtual ~BinaryColumnStatisticsImpl() override; - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - bool hasTotalLength() const override { - return _stats.hasTotalLength(); - } - - uint64_t getTotalLength() const override { - if(hasTotalLength()){ - return _stats.getTotalLength(); - }else{ - throw ParseError("Total length is not defined."); - } - } - - void setTotalLength(uint64_t length) { - _stats.setHasTotalLength(true); - _stats.setTotalLength(length); - } - - void update(size_t length) { - _stats.setTotalLength(_stats.getTotalLength() + length); - } - - void merge(const MutableColumnStatistics& other) override { - const BinaryColumnStatisticsImpl& binStats = - dynamic_cast<const BinaryColumnStatisticsImpl&>(other); - _stats.merge(binStats._stats); - } - - void reset() override { - _stats.reset(); - setTotalLength(0); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::BinaryStatistics* binStats = pbStats.mutable_binarystatistics(); - binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength())); - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: Binary" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; - if(hasTotalLength()){ - buffer << "Total length: " << getTotalLength() << std::endl; - }else{ - buffer << "Total length: not defined" << std::endl; - } - return buffer.str(); - } - }; - - class BooleanColumnStatisticsImpl: public BooleanColumnStatistics, - public MutableColumnStatistics { - private: - InternalBooleanStatistics _stats; - bool _hasCount; - uint64_t _trueCount; - - public: - BooleanColumnStatisticsImpl() { reset(); } - BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats, - const StatContext& statContext); - virtual ~BooleanColumnStatisticsImpl() override; - - bool hasCount() const override { - return _hasCount; - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - _hasCount = true; - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - uint64_t getFalseCount() const override { - if(hasCount()){ - return getNumberOfValues() - _trueCount; - }else{ - throw ParseError("False count is not defined."); - } - } - - uint64_t getTrueCount() const override { - if(hasCount()){ - return _trueCount; - }else{ - throw ParseError("True count is not defined."); - } - } - - void setTrueCount(uint64_t trueCount) { - _hasCount = true; - _trueCount = trueCount; - } - - void update(bool value, size_t repetitions) { - if (value) { - _trueCount += repetitions; - } - } - - void merge(const MutableColumnStatistics& other) override { - const BooleanColumnStatisticsImpl& boolStats = - dynamic_cast<const BooleanColumnStatisticsImpl&>(other); - _stats.merge(boolStats._stats); - _hasCount = _hasCount && boolStats._hasCount; - _trueCount += boolStats._trueCount; - } - - void reset() override { - _stats.reset(); - setTrueCount(0); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::BucketStatistics* bucketStats = pbStats.mutable_bucketstatistics(); - if (_hasCount) { - bucketStats->add_count(_trueCount); - } else { - bucketStats->clear_count(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: Boolean" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; - if(hasCount()){ - buffer << "(true: " << getTrueCount() << "; false: " - << getFalseCount() << ")" << std::endl; - } else { - buffer << "(true: not defined; false: not defined)" << std::endl; - buffer << "True and false counts are not defined" << std::endl; - } - return buffer.str(); - } - }; - - class DateColumnStatisticsImpl: public DateColumnStatistics, - public MutableColumnStatistics{ - private: - InternalDateStatistics _stats; - public: - DateColumnStatisticsImpl() { reset(); } - DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, - const StatContext& statContext); - virtual ~DateColumnStatisticsImpl() override; - - bool hasMinimum() const override { - return _stats.hasMinimum(); - } - - bool hasMaximum() const override { - return _stats.hasMaximum(); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - int32_t getMinimum() const override { - if(hasMinimum()){ - return _stats.getMinimum(); - }else{ - throw ParseError("Minimum is not defined."); - } - } - - int32_t getMaximum() const override { - if(hasMaximum()){ - return _stats.getMaximum(); - }else{ - throw ParseError("Maximum is not defined."); - } - } - - void setMinimum(int32_t minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); - } - - void setMaximum(int32_t maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); - } - - void update(int32_t value) { - _stats.updateMinMax(value); - } - - void merge(const MutableColumnStatistics& other) override { - const DateColumnStatisticsImpl& dateStats = - dynamic_cast<const DateColumnStatisticsImpl&>(other); - _stats.merge(dateStats._stats); - } - - void reset() override { - _stats.reset(); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::DateStatistics* dateStatistics = - pbStats.mutable_datestatistics(); - if (_stats.hasMinimum()) { - dateStatistics->set_maximum(_stats.getMaximum()); - dateStatistics->set_minimum(_stats.getMinimum()); - } else { - dateStatistics->clear_minimum(); - dateStatistics->clear_maximum(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: Date" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; - if(hasMinimum()){ - buffer << "Minimum: " << getMinimum() << std::endl; - }else{ - buffer << "Minimum: not defined" << std::endl; - } - - if(hasMaximum()){ - buffer << "Maximum: " << getMaximum() << std::endl; - }else{ - buffer << "Maximum: not defined" << std::endl; - } - return buffer.str(); - } - }; - - class DecimalColumnStatisticsImpl: public DecimalColumnStatistics, - public MutableColumnStatistics { - private: - InternalDecimalStatistics _stats; - - public: - DecimalColumnStatisticsImpl() { reset(); } - DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats, - const StatContext& statContext); - virtual ~DecimalColumnStatisticsImpl() override; - - bool hasMinimum() const override { - return _stats.hasMinimum(); - } - - bool hasMaximum() const override { - return _stats.hasMaximum(); - } - - bool hasSum() const override { - return _stats.hasSum(); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - Decimal getMinimum() const override { - if(hasMinimum()){ - return _stats.getMinimum(); - }else{ - throw ParseError("Minimum is not defined."); - } - } - - Decimal getMaximum() const override { - if(hasMaximum()){ - return _stats.getMaximum(); - }else{ - throw ParseError("Maximum is not defined."); - } - } - - void setMinimum(Decimal minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); - } - - void setMaximum(Decimal maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); - } - - Decimal getSum() const override { - if(hasSum()){ - return _stats.getSum(); - }else{ - throw ParseError("Sum is not defined."); - } - } - - void setSum(Decimal sum) { - _stats.setHasSum(true); - _stats.setSum(sum); - } - - void update(const Decimal& value) { - _stats.updateMinMax(value); - - if (_stats.hasSum()) { - updateSum(value); - } - } - - void merge(const MutableColumnStatistics& other) override { - const DecimalColumnStatisticsImpl& decStats = - dynamic_cast<const DecimalColumnStatisticsImpl&>(other); - - _stats.merge(decStats._stats); - - _stats.setHasSum(_stats.hasSum() && decStats.hasSum()); - if (_stats.hasSum()) { - updateSum(decStats.getSum()); - } - } - - void reset() override { - _stats.reset(); - setSum(Decimal()); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::DecimalStatistics* decStats = pbStats.mutable_decimalstatistics(); - if (_stats.hasMinimum()) { - decStats->set_minimum(TString(_stats.getMinimum().toString())); - decStats->set_maximum(TString(_stats.getMaximum().toString())); - } else { - decStats->clear_minimum(); - decStats->clear_maximum(); - } - if (_stats.hasSum()) { - decStats->set_sum(TString(_stats.getSum().toString())); - } else { - decStats->clear_sum(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: Decimal" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; - if(hasMinimum()){ - buffer << "Minimum: " << getMinimum().toString() << std::endl; - }else{ - buffer << "Minimum: not defined" << std::endl; - } - - if(hasMaximum()){ - buffer << "Maximum: " << getMaximum().toString() << std::endl; - }else{ - buffer << "Maximum: not defined" << std::endl; - } - - if(hasSum()){ - buffer << "Sum: " << getSum().toString() << std::endl; - }else{ - buffer << "Sum: not defined" << std::endl; - } - - return buffer.str(); - } - - private: - void updateSum(Decimal value) { - if (_stats.hasSum()) { - bool overflow = false; - Decimal sum = _stats.getSum(); - if (sum.scale > value.scale) { - value.value = scaleUpInt128ByPowerOfTen(value.value, - sum.scale - value.scale, - overflow); - } else if (sum.scale < value.scale) { - sum.value = scaleUpInt128ByPowerOfTen(sum.value, - value.scale - sum.scale, - overflow); - sum.scale = value.scale; - } - - if (!overflow) { - bool wasPositive = sum.value >= 0; - sum.value += value.value; - if ((value.value >= 0) == wasPositive) { - _stats.setHasSum((sum.value >= 0) == wasPositive); - } - } else { - _stats.setHasSum(false); - } - - if (_stats.hasSum()) { - _stats.setSum(sum); - } - } - } - }; - - class DoubleColumnStatisticsImpl: public DoubleColumnStatistics, - public MutableColumnStatistics { - private: - InternalDoubleStatistics _stats; - public: - DoubleColumnStatisticsImpl() { reset(); } - DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats); - virtual ~DoubleColumnStatisticsImpl() override; - - bool hasMinimum() const override { - return _stats.hasMinimum(); - } - - bool hasMaximum() const override { - return _stats.hasMaximum(); - } - - bool hasSum() const override { - return _stats.hasSum(); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - double getMinimum() const override { - if(hasMinimum()){ - return _stats.getMinimum(); - }else{ - throw ParseError("Minimum is not defined."); - } - } - - double getMaximum() const override { - if(hasMaximum()){ - return _stats.getMaximum(); - }else{ - throw ParseError("Maximum is not defined."); - } - } - - void setMinimum(double minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); - } - - void setMaximum(double maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); - } - - double getSum() const override { - if(hasSum()){ - return _stats.getSum(); - }else{ - throw ParseError("Sum is not defined."); - } - } - - void setSum(double sum) { - _stats.setHasSum(true); - _stats.setSum(sum); - } - - void update(double value) { - _stats.updateMinMax(value); - _stats.setSum(_stats.getSum() + value); - } - - void merge(const MutableColumnStatistics& other) override { - const DoubleColumnStatisticsImpl& doubleStats = - dynamic_cast<const DoubleColumnStatisticsImpl&>(other); - _stats.merge(doubleStats._stats); - - _stats.setHasSum(_stats.hasSum() && doubleStats.hasSum()); - if (_stats.hasSum()) { - _stats.setSum(_stats.getSum() + doubleStats.getSum()); - } - } - - void reset() override { - _stats.reset(); - setSum(0.0); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - 
pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::DoubleStatistics* doubleStats = pbStats.mutable_doublestatistics(); - if (_stats.hasMinimum()) { - doubleStats->set_minimum(_stats.getMinimum()); - doubleStats->set_maximum(_stats.getMaximum()); - } else { - doubleStats->clear_minimum(); - doubleStats->clear_maximum(); - } - if (_stats.hasSum()) { - doubleStats->set_sum(_stats.getSum()); - } else { - doubleStats->clear_sum(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: Double" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; - if(hasMinimum()){ - buffer << "Minimum: " << getMinimum() << std::endl; - }else{ - buffer << "Minimum: not defined" << std::endl; - } - - if(hasMaximum()){ - buffer << "Maximum: " << getMaximum() << std::endl; - }else{ - buffer << "Maximum: not defined" << std::endl; - } - - if(hasSum()){ - buffer << "Sum: " << getSum() << std::endl; - }else{ - buffer << "Sum: not defined" << std::endl; - } - return buffer.str(); - } - }; - - class IntegerColumnStatisticsImpl: public IntegerColumnStatistics, - public MutableColumnStatistics { - private: - InternalIntegerStatistics _stats; - public: - IntegerColumnStatisticsImpl() { reset(); } - IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats); - virtual ~IntegerColumnStatisticsImpl() override; - - bool hasMinimum() const override { - return _stats.hasMinimum(); - } - - bool hasMaximum() const override { - return _stats.hasMaximum(); - } - - bool hasSum() const override { - return _stats.hasSum(); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - int64_t getMinimum() const override { - if(hasMinimum()){ - return _stats.getMinimum(); - }else{ - throw ParseError("Minimum is not defined."); - } - } - - int64_t getMaximum() const override { - if(hasMaximum()){ - return _stats.getMaximum(); - }else{ - throw ParseError("Maximum is not defined."); - } - } - - void setMinimum(int64_t minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); - } - - void setMaximum(int64_t maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); - } - - int64_t getSum() const override { - if(hasSum()){ - return _stats.getSum(); - }else{ - throw ParseError("Sum is not defined."); - } - } - - void setSum(int64_t sum) { - _stats.setHasSum(true); - _stats.setSum(sum); - } - + + void setMinimum(T min) { _minimum = min; } + + // GET / SET _valueCount + uint64_t getNumberOfValues() const { return _valueCount; } + + void setNumberOfValues(uint64_t numValues) { _valueCount = numValues; } + + // GET / SET _hasNullValue + bool hasNull() const { return _hasNull; } + + void setHasNull(bool hasNull) { _hasNull = hasNull; } + + void reset() { + _hasNull = false; + _hasMinimum = false; + _hasMaximum = false; + _hasSum = false; + _hasTotalLength = false; + _totalLength = 0; + _valueCount = 0; + } + + void updateMinMax(T value) { + if (!_hasMinimum) { + _hasMinimum = _hasMaximum = true; + _minimum = _maximum = value; + } else if (compare(value, 
_minimum)) { + _minimum = value; + } else if (compare(_maximum, value)) { + _maximum = value; + } + } + + // sum is not merged here as we need to check overflow + void merge(const InternalStatisticsImpl& other) { + _hasNull = _hasNull || other._hasNull; + _valueCount += other._valueCount; + + if (other._hasMinimum) { + if (!_hasMinimum) { + _hasMinimum = _hasMaximum = true; + _minimum = other._minimum; + _maximum = other._maximum; + } else { + // all template types should support operator< + if (compare(_maximum, other._maximum)) { + _maximum = other._maximum; + } + if (compare(other._minimum, _minimum)) { + _minimum = other._minimum; + } + } + } + + _hasTotalLength = _hasTotalLength && other._hasTotalLength; + _totalLength += other._totalLength; + } + }; + + typedef InternalStatisticsImpl<char> InternalCharStatistics; + typedef InternalStatisticsImpl<char> InternalBooleanStatistics; + typedef InternalStatisticsImpl<int64_t> InternalIntegerStatistics; + typedef InternalStatisticsImpl<int32_t> InternalDateStatistics; + typedef InternalStatisticsImpl<double> InternalDoubleStatistics; + typedef InternalStatisticsImpl<Decimal> InternalDecimalStatistics; + typedef InternalStatisticsImpl<std::string> InternalStringStatistics; + + /** + * Mutable column statistics for use by the writer. + */ + class MutableColumnStatistics { + public: + virtual ~MutableColumnStatistics(); + + virtual void increase(uint64_t count) = 0; + + virtual void setNumberOfValues(uint64_t value) = 0; + + virtual void setHasNull(bool hasNull) = 0; + + virtual void merge(const MutableColumnStatistics& other) = 0; + + virtual void reset() = 0; + + virtual void toProtoBuf(proto::ColumnStatistics& pbStats) const = 0; + }; + +/** + * ColumnStatistics Implementation + */ + + class ColumnStatisticsImpl: public ColumnStatistics, + public MutableColumnStatistics { + private: + InternalCharStatistics _stats; + public: + ColumnStatisticsImpl() { reset(); } + ColumnStatisticsImpl(const proto::ColumnStatistics& stats); + virtual ~ColumnStatisticsImpl() override; + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + void merge(const MutableColumnStatistics& other) override { + _stats.merge(dynamic_cast<const ColumnStatisticsImpl&>(other)._stats); + } + + void reset() override { + _stats.reset(); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Column has " << getNumberOfValues() << " values" + << " and has null value: " << (hasNull() ? 
"yes" : "no") + << std::endl; + return buffer.str(); + } + }; + + class BinaryColumnStatisticsImpl: public BinaryColumnStatistics, + public MutableColumnStatistics { + private: + InternalCharStatistics _stats; + public: + BinaryColumnStatisticsImpl() { reset(); } + BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats, + const StatContext& statContext); + virtual ~BinaryColumnStatisticsImpl() override; + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + bool hasTotalLength() const override { + return _stats.hasTotalLength(); + } + + uint64_t getTotalLength() const override { + if(hasTotalLength()){ + return _stats.getTotalLength(); + }else{ + throw ParseError("Total length is not defined."); + } + } + + void setTotalLength(uint64_t length) { + _stats.setHasTotalLength(true); + _stats.setTotalLength(length); + } + + void update(size_t length) { + _stats.setTotalLength(_stats.getTotalLength() + length); + } + + void merge(const MutableColumnStatistics& other) override { + const BinaryColumnStatisticsImpl& binStats = + dynamic_cast<const BinaryColumnStatisticsImpl&>(other); + _stats.merge(binStats._stats); + } + + void reset() override { + _stats.reset(); + setTotalLength(0); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::BinaryStatistics* binStats = pbStats.mutable_binarystatistics(); + binStats->set_sum(static_cast<int64_t>(_stats.getTotalLength())); + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: Binary" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; + if(hasTotalLength()){ + buffer << "Total length: " << getTotalLength() << std::endl; + }else{ + buffer << "Total length: not defined" << std::endl; + } + return buffer.str(); + } + }; + + class BooleanColumnStatisticsImpl: public BooleanColumnStatistics, + public MutableColumnStatistics { + private: + InternalBooleanStatistics _stats; + bool _hasCount; + uint64_t _trueCount; + + public: + BooleanColumnStatisticsImpl() { reset(); } + BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats, + const StatContext& statContext); + virtual ~BooleanColumnStatisticsImpl() override; + + bool hasCount() const override { + return _hasCount; + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + _hasCount = true; + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + uint64_t getFalseCount() const override { + if(hasCount()){ + return getNumberOfValues() - _trueCount; + }else{ + throw ParseError("False count is not defined."); + } + } + + uint64_t getTrueCount() const override { + if(hasCount()){ + return _trueCount; + }else{ + throw ParseError("True count is not defined."); + } + } + + void setTrueCount(uint64_t trueCount) { + _hasCount = true; + _trueCount = trueCount; + } + + void update(bool value, size_t repetitions) { + if (value) { + _trueCount += repetitions; + } + } + + void merge(const MutableColumnStatistics& other) override { + const BooleanColumnStatisticsImpl& boolStats = + dynamic_cast<const BooleanColumnStatisticsImpl&>(other); + _stats.merge(boolStats._stats); + _hasCount = _hasCount && boolStats._hasCount; + _trueCount += boolStats._trueCount; + } + + void reset() override { + _stats.reset(); + setTrueCount(0); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::BucketStatistics* bucketStats = pbStats.mutable_bucketstatistics(); + if (_hasCount) { + bucketStats->add_count(_trueCount); + } else { + bucketStats->clear_count(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: Boolean" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; + if(hasCount()){ + buffer << "(true: " << getTrueCount() << "; false: " + << getFalseCount() << ")" << std::endl; + } else { + buffer << "(true: not defined; false: not defined)" << std::endl; + buffer << "True and false counts are not defined" << std::endl; + } + return buffer.str(); + } + }; + + class DateColumnStatisticsImpl: public DateColumnStatistics, + public MutableColumnStatistics{ + private: + InternalDateStatistics _stats; + public: + DateColumnStatisticsImpl() { reset(); } + DateColumnStatisticsImpl(const proto::ColumnStatistics& stats, + const StatContext& statContext); + virtual ~DateColumnStatisticsImpl() override; + + bool hasMinimum() const override { + return _stats.hasMinimum(); + } + + bool hasMaximum() const override { + return _stats.hasMaximum(); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + int32_t getMinimum() const override { + if(hasMinimum()){ + return _stats.getMinimum(); + }else{ + throw ParseError("Minimum is not defined."); + } + } + + int32_t getMaximum() const override { + if(hasMaximum()){ + return _stats.getMaximum(); + }else{ + throw ParseError("Maximum is not defined."); + } + } + + void setMinimum(int32_t minimum) { + _stats.setHasMinimum(true); + _stats.setMinimum(minimum); + } + + void setMaximum(int32_t maximum) { + _stats.setHasMaximum(true); + _stats.setMaximum(maximum); + } + + void update(int32_t value) { + _stats.updateMinMax(value); + } + + void merge(const MutableColumnStatistics& other) override { + const DateColumnStatisticsImpl& dateStats = + dynamic_cast<const DateColumnStatisticsImpl&>(other); + _stats.merge(dateStats._stats); + } + + void reset() override { + _stats.reset(); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::DateStatistics* dateStatistics = + pbStats.mutable_datestatistics(); + if (_stats.hasMinimum()) { + dateStatistics->set_maximum(_stats.getMaximum()); + dateStatistics->set_minimum(_stats.getMinimum()); + } else { + dateStatistics->clear_minimum(); + dateStatistics->clear_maximum(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: Date" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; + if(hasMinimum()){ + buffer << "Minimum: " << getMinimum() << std::endl; + }else{ + buffer << "Minimum: not defined" << std::endl; + } + + if(hasMaximum()){ + buffer << "Maximum: " << getMaximum() << std::endl; + }else{ + buffer << "Maximum: not defined" << std::endl; + } + return buffer.str(); + } + }; + + class DecimalColumnStatisticsImpl: public DecimalColumnStatistics, + public MutableColumnStatistics { + private: + InternalDecimalStatistics _stats; + + public: + DecimalColumnStatisticsImpl() { reset(); } + DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats, + const StatContext& statContext); + virtual ~DecimalColumnStatisticsImpl() override; + + bool hasMinimum() const override { + return _stats.hasMinimum(); + } + + bool hasMaximum() const override { + return _stats.hasMaximum(); + } + + bool hasSum() const override { + return _stats.hasSum(); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + Decimal getMinimum() const override { + if(hasMinimum()){ + return _stats.getMinimum(); + }else{ + throw ParseError("Minimum is not defined."); + } + } + + Decimal getMaximum() const override { + if(hasMaximum()){ + return _stats.getMaximum(); + }else{ + throw ParseError("Maximum is not defined."); + } + } + + void setMinimum(Decimal minimum) { + _stats.setHasMinimum(true); + _stats.setMinimum(minimum); + } + + void setMaximum(Decimal maximum) { + _stats.setHasMaximum(true); + _stats.setMaximum(maximum); + } + + Decimal getSum() const override { + if(hasSum()){ + return _stats.getSum(); + }else{ + throw ParseError("Sum is not defined."); + } + } + + void setSum(Decimal sum) { + _stats.setHasSum(true); + _stats.setSum(sum); + } + + void update(const Decimal& value) { + _stats.updateMinMax(value); + + if (_stats.hasSum()) { + updateSum(value); + } + } + + void merge(const MutableColumnStatistics& other) override { + const DecimalColumnStatisticsImpl& decStats = + dynamic_cast<const DecimalColumnStatisticsImpl&>(other); + + _stats.merge(decStats._stats); + + _stats.setHasSum(_stats.hasSum() && decStats.hasSum()); + if (_stats.hasSum()) { + updateSum(decStats.getSum()); + } + } + + void reset() override { + _stats.reset(); + setSum(Decimal()); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::DecimalStatistics* decStats = pbStats.mutable_decimalstatistics(); + if (_stats.hasMinimum()) { + decStats->set_minimum(TString(_stats.getMinimum().toString())); + decStats->set_maximum(TString(_stats.getMaximum().toString())); + } else { + decStats->clear_minimum(); + decStats->clear_maximum(); + } + if (_stats.hasSum()) { + decStats->set_sum(TString(_stats.getSum().toString())); + } else { + decStats->clear_sum(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: Decimal" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? 
"yes" : "no") << std::endl; + if(hasMinimum()){ + buffer << "Minimum: " << getMinimum().toString() << std::endl; + }else{ + buffer << "Minimum: not defined" << std::endl; + } + + if(hasMaximum()){ + buffer << "Maximum: " << getMaximum().toString() << std::endl; + }else{ + buffer << "Maximum: not defined" << std::endl; + } + + if(hasSum()){ + buffer << "Sum: " << getSum().toString() << std::endl; + }else{ + buffer << "Sum: not defined" << std::endl; + } + + return buffer.str(); + } + + private: + void updateSum(Decimal value) { + if (_stats.hasSum()) { + bool overflow = false; + Decimal sum = _stats.getSum(); + if (sum.scale > value.scale) { + value.value = scaleUpInt128ByPowerOfTen(value.value, + sum.scale - value.scale, + overflow); + } else if (sum.scale < value.scale) { + sum.value = scaleUpInt128ByPowerOfTen(sum.value, + value.scale - sum.scale, + overflow); + sum.scale = value.scale; + } + + if (!overflow) { + bool wasPositive = sum.value >= 0; + sum.value += value.value; + if ((value.value >= 0) == wasPositive) { + _stats.setHasSum((sum.value >= 0) == wasPositive); + } + } else { + _stats.setHasSum(false); + } + + if (_stats.hasSum()) { + _stats.setSum(sum); + } + } + } + }; + + class DoubleColumnStatisticsImpl: public DoubleColumnStatistics, + public MutableColumnStatistics { + private: + InternalDoubleStatistics _stats; + public: + DoubleColumnStatisticsImpl() { reset(); } + DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats); + virtual ~DoubleColumnStatisticsImpl() override; + + bool hasMinimum() const override { + return _stats.hasMinimum(); + } + + bool hasMaximum() const override { + return _stats.hasMaximum(); + } + + bool hasSum() const override { + return _stats.hasSum(); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + double getMinimum() const override { + if(hasMinimum()){ + return _stats.getMinimum(); + }else{ + throw ParseError("Minimum is not defined."); + } + } + + double getMaximum() const override { + if(hasMaximum()){ + return _stats.getMaximum(); + }else{ + throw ParseError("Maximum is not defined."); + } + } + + void setMinimum(double minimum) { + _stats.setHasMinimum(true); + _stats.setMinimum(minimum); + } + + void setMaximum(double maximum) { + _stats.setHasMaximum(true); + _stats.setMaximum(maximum); + } + + double getSum() const override { + if(hasSum()){ + return _stats.getSum(); + }else{ + throw ParseError("Sum is not defined."); + } + } + + void setSum(double sum) { + _stats.setHasSum(true); + _stats.setSum(sum); + } + + void update(double value) { + _stats.updateMinMax(value); + _stats.setSum(_stats.getSum() + value); + } + + void merge(const MutableColumnStatistics& other) override { + const DoubleColumnStatisticsImpl& doubleStats = + dynamic_cast<const DoubleColumnStatisticsImpl&>(other); + _stats.merge(doubleStats._stats); + + _stats.setHasSum(_stats.hasSum() && doubleStats.hasSum()); + if (_stats.hasSum()) { + _stats.setSum(_stats.getSum() + doubleStats.getSum()); + } + } + + void reset() override { + _stats.reset(); + setSum(0.0); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + 
pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::DoubleStatistics* doubleStats = pbStats.mutable_doublestatistics(); + if (_stats.hasMinimum()) { + doubleStats->set_minimum(_stats.getMinimum()); + doubleStats->set_maximum(_stats.getMaximum()); + } else { + doubleStats->clear_minimum(); + doubleStats->clear_maximum(); + } + if (_stats.hasSum()) { + doubleStats->set_sum(_stats.getSum()); + } else { + doubleStats->clear_sum(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: Double" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; + if(hasMinimum()){ + buffer << "Minimum: " << getMinimum() << std::endl; + }else{ + buffer << "Minimum: not defined" << std::endl; + } + + if(hasMaximum()){ + buffer << "Maximum: " << getMaximum() << std::endl; + }else{ + buffer << "Maximum: not defined" << std::endl; + } + + if(hasSum()){ + buffer << "Sum: " << getSum() << std::endl; + }else{ + buffer << "Sum: not defined" << std::endl; + } + return buffer.str(); + } + }; + + class IntegerColumnStatisticsImpl: public IntegerColumnStatistics, + public MutableColumnStatistics { + private: + InternalIntegerStatistics _stats; + public: + IntegerColumnStatisticsImpl() { reset(); } + IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats); + virtual ~IntegerColumnStatisticsImpl() override; + + bool hasMinimum() const override { + return _stats.hasMinimum(); + } + + bool hasMaximum() const override { + return _stats.hasMaximum(); + } + + bool hasSum() const override { + return _stats.hasSum(); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + int64_t getMinimum() const override { + if(hasMinimum()){ + return _stats.getMinimum(); + }else{ + throw ParseError("Minimum is not defined."); + } + } + + int64_t getMaximum() const override { + if(hasMaximum()){ + return _stats.getMaximum(); + }else{ + throw ParseError("Maximum is not defined."); + } + } + + void setMinimum(int64_t minimum) { + _stats.setHasMinimum(true); + _stats.setMinimum(minimum); + } + + void setMaximum(int64_t maximum) { + _stats.setHasMaximum(true); + _stats.setMaximum(maximum); + } + + int64_t getSum() const override { + if(hasSum()){ + return _stats.getSum(); + }else{ + throw ParseError("Sum is not defined."); + } + } + + void setSum(int64_t sum) { + _stats.setHasSum(true); + _stats.setSum(sum); + } + void update(int64_t value, int repetitions) { _stats.updateMinMax(value); - + if (_stats.hasSum()) { if (repetitions > 1) { _stats.setHasSum(multiplyExact(value, repetitions, &value)); @@ -981,498 +981,498 @@ namespace orc { } } - void merge(const MutableColumnStatistics& other) override { - const IntegerColumnStatisticsImpl& intStats = - dynamic_cast<const IntegerColumnStatisticsImpl&>(other); - - _stats.merge(intStats._stats); - - // update sum and check overflow - _stats.setHasSum(_stats.hasSum() && intStats.hasSum()); - if (_stats.hasSum()) { + void merge(const MutableColumnStatistics& other) override { + const IntegerColumnStatisticsImpl& intStats = + 
dynamic_cast<const IntegerColumnStatisticsImpl&>(other); + + _stats.merge(intStats._stats); + + // update sum and check overflow + _stats.setHasSum(_stats.hasSum() && intStats.hasSum()); + if (_stats.hasSum()) { int64_t value; _stats.setHasSum(addExact(_stats.getSum(), intStats.getSum(), &value)); if (_stats.hasSum()) { _stats.setSum(value); - } - } - } - - void reset() override { - _stats.reset(); - setSum(0); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::IntegerStatistics* intStats = pbStats.mutable_intstatistics(); - if (_stats.hasMinimum()) { - intStats->set_minimum(_stats.getMinimum()); - intStats->set_maximum(_stats.getMaximum()); - } else { - intStats->clear_minimum(); - intStats->clear_maximum(); - } - if (_stats.hasSum()) { - intStats->set_sum(_stats.getSum()); - } else { - intStats->clear_sum(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: Integer" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; - if(hasMinimum()){ - buffer << "Minimum: " << getMinimum() << std::endl; - }else{ - buffer << "Minimum: not defined" << std::endl; - } - - if(hasMaximum()){ - buffer << "Maximum: " << getMaximum() << std::endl; - }else{ - buffer << "Maximum: not defined" << std::endl; - } - - if(hasSum()){ - buffer << "Sum: " << getSum() << std::endl; - }else{ - buffer << "Sum: not defined" << std::endl; - } - return buffer.str(); - } - }; - - class StringColumnStatisticsImpl: public StringColumnStatistics, - public MutableColumnStatistics{ - private: - InternalStringStatistics _stats; - - public: - StringColumnStatisticsImpl() { - reset(); - } - StringColumnStatisticsImpl(const proto::ColumnStatistics& stats, - const StatContext& statContext); - virtual ~StringColumnStatisticsImpl() override; - - bool hasMinimum() const override { - return _stats.hasMinimum(); - } - - bool hasMaximum() const override { - return _stats.hasMaximum(); - } - - bool hasTotalLength() const override { - return _stats.hasTotalLength(); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - + } + } + } + + void reset() override { + _stats.reset(); + setSum(0); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::IntegerStatistics* intStats = pbStats.mutable_intstatistics(); + if (_stats.hasMinimum()) { + intStats->set_minimum(_stats.getMinimum()); + intStats->set_maximum(_stats.getMaximum()); + } else { + intStats->clear_minimum(); + intStats->clear_maximum(); + } + if (_stats.hasSum()) { + intStats->set_sum(_stats.getSum()); + } else { + intStats->clear_sum(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: Integer" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? 
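The integer path above keeps the sum only while every multiplyExact/addExact step succeeds; after the first overflow, hasSum stays false for good. A standalone sketch of the same policy using the GCC/Clang checked-arithmetic builtins (an assumption on my part; the header relies on its own helpers):

#include <cstdint>
#include <iostream>
#include <limits>

int main() {
  int64_t sum = 0;
  bool hasSum = true;
  const int64_t values[] = {42, std::numeric_limits<int64_t>::max(), 7};
  for (int64_t v : values) {
    if (!hasSum) break;                              // once lost, the sum stays undefined
    int64_t next;
    hasSum = !__builtin_add_overflow(sum, v, &next); // true return means overflow
    if (hasSum) sum = next;
  }
  std::cout << (hasSum ? "sum defined" : "sum not defined") << "\n";  // "sum not defined"
}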
"yes" : "no") << std::endl; + if(hasMinimum()){ + buffer << "Minimum: " << getMinimum() << std::endl; + }else{ + buffer << "Minimum: not defined" << std::endl; + } + + if(hasMaximum()){ + buffer << "Maximum: " << getMaximum() << std::endl; + }else{ + buffer << "Maximum: not defined" << std::endl; + } + + if(hasSum()){ + buffer << "Sum: " << getSum() << std::endl; + }else{ + buffer << "Sum: not defined" << std::endl; + } + return buffer.str(); + } + }; + + class StringColumnStatisticsImpl: public StringColumnStatistics, + public MutableColumnStatistics{ + private: + InternalStringStatistics _stats; + + public: + StringColumnStatisticsImpl() { + reset(); + } + StringColumnStatisticsImpl(const proto::ColumnStatistics& stats, + const StatContext& statContext); + virtual ~StringColumnStatisticsImpl() override; + + bool hasMinimum() const override { + return _stats.hasMinimum(); + } + + bool hasMaximum() const override { + return _stats.hasMaximum(); + } + + bool hasTotalLength() const override { + return _stats.hasTotalLength(); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + const std::string & getMinimum() const override { - if(hasMinimum()){ - return _stats.getMinimum(); - }else{ - throw ParseError("Minimum is not defined."); - } - } - + if(hasMinimum()){ + return _stats.getMinimum(); + }else{ + throw ParseError("Minimum is not defined."); + } + } + const std::string & getMaximum() const override { - if(hasMaximum()){ - return _stats.getMaximum(); - }else{ - throw ParseError("Maximum is not defined."); - } - } - - void setMinimum(std::string minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); - } - - void setMaximum(std::string maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); - } - - uint64_t getTotalLength() const override { - if(hasTotalLength()){ - return _stats.getTotalLength(); - }else{ - throw ParseError("Total length is not defined."); - } - } - - void setTotalLength(uint64_t length) { - _stats.setHasTotalLength(true); - _stats.setTotalLength(length); - } - - void update(const char* value, size_t length) { - if (value != nullptr) { - if (!_stats.hasMinimum()) { - std::string tempStr(value, value + length); - setMinimum(tempStr); - setMaximum(tempStr); - } else { - // update min - int minCmp = strncmp(_stats.getMinimum().c_str(), - value, - std::min(_stats.getMinimum().length(), length)); - if (minCmp > 0 || - (minCmp == 0 && length < _stats.getMinimum().length())) { - setMinimum(std::string(value, value + length)); - } - - // update max - int maxCmp = strncmp(_stats.getMaximum().c_str(), - value, - std::min(_stats.getMaximum().length(), length)); - if (maxCmp < 0 || - (maxCmp == 0 && length > _stats.getMaximum().length())) { - setMaximum(std::string(value, value + length)); - } - } - } - - _stats.setTotalLength(_stats.getTotalLength() + length); - } - - void update(std::string value) { - update(value.c_str(), value.length()); - } - - void merge(const MutableColumnStatistics& other) override { - const StringColumnStatisticsImpl& strStats = - dynamic_cast<const StringColumnStatisticsImpl&>(other); - _stats.merge(strStats._stats); - } - - void reset() 
override { - _stats.reset(); - setTotalLength(0); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::StringStatistics* strStats = pbStats.mutable_stringstatistics(); - if (_stats.hasMinimum()) { - strStats->set_minimum(TString(_stats.getMinimum())); - strStats->set_maximum(TString(_stats.getMaximum())); - } else { - strStats->clear_minimum(); - strStats->clear_maximum(); - } - if (_stats.hasTotalLength()) { - strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength())); - } else { - strStats->clear_sum(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - buffer << "Data type: String" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; - if(hasMinimum()){ - buffer << "Minimum: " << getMinimum() << std::endl; - }else{ - buffer << "Minimum is not defined" << std::endl; - } - - if(hasMaximum()){ - buffer << "Maximum: " << getMaximum() << std::endl; - }else{ - buffer << "Maximum is not defined" << std::endl; - } - - if(hasTotalLength()){ - buffer << "Total length: " << getTotalLength() << std::endl; - }else{ - buffer << "Total length is not defined" << std::endl; - } - return buffer.str(); - } - }; - - class TimestampColumnStatisticsImpl: public TimestampColumnStatistics, - public MutableColumnStatistics { - private: - InternalIntegerStatistics _stats; - bool _hasLowerBound; - bool _hasUpperBound; - int64_t _lowerBound; - int64_t _upperBound; - - public: - TimestampColumnStatisticsImpl() { reset(); } - TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats, - const StatContext& statContext); - virtual ~TimestampColumnStatisticsImpl() override; - - bool hasMinimum() const override { - return _stats.hasMinimum(); - } - - bool hasMaximum() const override { - return _stats.hasMaximum(); - } - - uint64_t getNumberOfValues() const override { - return _stats.getNumberOfValues(); - } - - void setNumberOfValues(uint64_t value) override { - _stats.setNumberOfValues(value); - } - - void increase(uint64_t count) override { - _stats.setNumberOfValues(_stats.getNumberOfValues() + count); - } - - bool hasNull() const override { - return _stats.hasNull(); - } - - void setHasNull(bool hasNull) override { - _stats.setHasNull(hasNull); - } - - int64_t getMinimum() const override { - if(hasMinimum()){ - return _stats.getMinimum(); - }else{ - throw ParseError("Minimum is not defined."); - } - } - - int64_t getMaximum() const override { - if(hasMaximum()){ - return _stats.getMaximum(); - }else{ - throw ParseError("Maximum is not defined."); - } - } - - void setMinimum(int64_t minimum) { - _stats.setHasMinimum(true); - _stats.setMinimum(minimum); - } - - void setMaximum(int64_t maximum) { - _stats.setHasMaximum(true); - _stats.setMaximum(maximum); - } - - void update(int64_t value) { - _stats.updateMinMax(value); - } - - void merge(const MutableColumnStatistics& other) override { - const TimestampColumnStatisticsImpl& tsStats = - dynamic_cast<const TimestampColumnStatisticsImpl&>(other); - _stats.merge(tsStats._stats); - } - - void reset() override { - _stats.reset(); - } - - void toProtoBuf(proto::ColumnStatistics& pbStats) const override { - pbStats.set_hasnull(_stats.hasNull()); - pbStats.set_numberofvalues(_stats.getNumberOfValues()); - - proto::TimestampStatistics* tsStats = - pbStats.mutable_timestampstatistics(); - if (_stats.hasMinimum()) { - 
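The StringColumnStatisticsImpl::update() above compares the candidate against the current bound with strncmp over the common prefix and breaks ties by length, which amounts to ordinary byte-wise lexicographic ordering without building a temporary std::string per value. A small sketch of the minimum-side comparison (the lessThan name is mine):

#include <algorithm>
#include <cstring>
#include <iostream>
#include <string>

// True when (value, length) should replace `current` as the minimum.
bool lessThan(const std::string& current, const char* value, size_t length) {
  int cmp = std::strncmp(current.c_str(), value,
                         std::min(current.length(), length));
  return cmp > 0 || (cmp == 0 && length < current.length());
}

int main() {
  std::string minimum = "banana";
  const char* v = "ban";
  if (lessThan(minimum, v, 3)) minimum.assign(v, 3);
  std::cout << minimum << "\n";                                         // "ban"
  std::cout << (std::string("ban") < std::string("banana")) << "\n";    // 1: same ordering
}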
tsStats->set_minimumutc(_stats.getMinimum()); - tsStats->set_maximumutc(_stats.getMaximum()); - } else { - tsStats->clear_minimumutc(); - tsStats->clear_maximumutc(); - } - } - - std::string toString() const override { - std::ostringstream buffer; - struct tm tmValue; - char timeBuffer[20]; - time_t secs = 0; - - buffer << "Data type: Timestamp" << std::endl - << "Values: " << getNumberOfValues() << std::endl - << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; - if(hasMinimum()){ - secs = static_cast<time_t>(getMinimum() / 1000); - gmtime_r(&secs, &tmValue); - strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - buffer << "Minimum: " << timeBuffer << "." - << (getMinimum() % 1000) << std::endl; - }else{ - buffer << "Minimum is not defined" << std::endl; - } - - if(hasLowerBound()){ - secs = static_cast<time_t>(getLowerBound() / 1000); - gmtime_r(&secs, &tmValue); - strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - buffer << "LowerBound: " << timeBuffer << "." - << (getLowerBound() % 1000) << std::endl; - }else{ - buffer << "LowerBound is not defined" << std::endl; - } - - if(hasMaximum()){ - secs = static_cast<time_t>(getMaximum()/1000); - gmtime_r(&secs, &tmValue); - strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - buffer << "Maximum: " << timeBuffer << "." - << (getMaximum() % 1000) << std::endl; - }else{ - buffer << "Maximum is not defined" << std::endl; - } - - if(hasUpperBound()){ - secs = static_cast<time_t>(getUpperBound() / 1000); - gmtime_r(&secs, &tmValue); - strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); - buffer << "UpperBound: " << timeBuffer << "." - << (getUpperBound() % 1000) << std::endl; - }else{ - buffer << "UpperBound is not defined" << std::endl; - } - - return buffer.str(); - } - - bool hasLowerBound() const override { - return _hasLowerBound; - } - - bool hasUpperBound() const override { - return _hasUpperBound; - } - - int64_t getLowerBound() const override { - if(hasLowerBound()){ - return _lowerBound; - }else{ - throw ParseError("LowerBound is not defined."); - } - } - - int64_t getUpperBound() const override { - if(hasUpperBound()){ - return _upperBound; - }else{ - throw ParseError("UpperBound is not defined."); - } - } - }; - - ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, - const StatContext& statContext); - - class StatisticsImpl: public Statistics { - private: - std::vector<ColumnStatistics*> colStats; - - // DELIBERATELY NOT IMPLEMENTED - StatisticsImpl(const StatisticsImpl&); - StatisticsImpl& operator=(const StatisticsImpl&); - - public: - StatisticsImpl(const proto::StripeStatistics& stripeStats, - const StatContext& statContext); - - StatisticsImpl(const proto::Footer& footer, const StatContext& statContext); - - virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId - ) const override { - return colStats[columnId]; - } - - virtual ~StatisticsImpl() override; - - uint32_t getNumberOfColumns() const override { - return static_cast<uint32_t>(colStats.size()); - } - }; - - class StripeStatisticsImpl: public StripeStatistics { - private: - std::unique_ptr<StatisticsImpl> columnStats; - std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > - rowIndexStats; - - // DELIBERATELY NOT IMPLEMENTED - StripeStatisticsImpl(const StripeStatisticsImpl&); - StripeStatisticsImpl& operator=(const StripeStatisticsImpl&); - - public: - StripeStatisticsImpl( - const proto::StripeStatistics& stripeStats, - 
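The timestamp toString() above renders epoch milliseconds by formatting the whole seconds with gmtime_r/strftime and appending the millisecond remainder after a dot (as written, remainders under 100 come out without zero padding). A standalone sketch of the same formatting with an illustrative value:

#include <cstdint>
#include <cstdio>
#include <ctime>

int main() {
  int64_t minimumMillis = 1577836800123;             // 2020-01-01 00:00:00.123 UTC
  time_t secs = static_cast<time_t>(minimumMillis / 1000);
  struct tm tmValue;
  gmtime_r(&secs, &tmValue);
  char timeBuffer[20];
  strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
  std::printf("Minimum: %s.%lld\n", timeBuffer,
              static_cast<long long>(minimumMillis % 1000));
}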
std::vector<std::vector<proto::ColumnStatistics> >& indexStats, - const StatContext& statContext); - - virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId - ) const override { - return columnStats->getColumnStatistics(columnId); - } - - uint32_t getNumberOfColumns() const override { - return columnStats->getNumberOfColumns(); - } - - virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId, - uint32_t rowIndex - ) const override { - // check id indices are valid - return rowIndexStats[columnId][rowIndex].get(); - } - - virtual ~StripeStatisticsImpl() override; - - uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override { - return static_cast<uint32_t>(rowIndexStats[columnId].size()); - } - }; - - /** - * Create ColumnStatistics for writers - * @param type of column - * @return MutableColumnStatistics instances - */ - std::unique_ptr<MutableColumnStatistics> createColumnStatistics( - const Type& type); - -}// namespace - -#endif + if(hasMaximum()){ + return _stats.getMaximum(); + }else{ + throw ParseError("Maximum is not defined."); + } + } + + void setMinimum(std::string minimum) { + _stats.setHasMinimum(true); + _stats.setMinimum(minimum); + } + + void setMaximum(std::string maximum) { + _stats.setHasMaximum(true); + _stats.setMaximum(maximum); + } + + uint64_t getTotalLength() const override { + if(hasTotalLength()){ + return _stats.getTotalLength(); + }else{ + throw ParseError("Total length is not defined."); + } + } + + void setTotalLength(uint64_t length) { + _stats.setHasTotalLength(true); + _stats.setTotalLength(length); + } + + void update(const char* value, size_t length) { + if (value != nullptr) { + if (!_stats.hasMinimum()) { + std::string tempStr(value, value + length); + setMinimum(tempStr); + setMaximum(tempStr); + } else { + // update min + int minCmp = strncmp(_stats.getMinimum().c_str(), + value, + std::min(_stats.getMinimum().length(), length)); + if (minCmp > 0 || + (minCmp == 0 && length < _stats.getMinimum().length())) { + setMinimum(std::string(value, value + length)); + } + + // update max + int maxCmp = strncmp(_stats.getMaximum().c_str(), + value, + std::min(_stats.getMaximum().length(), length)); + if (maxCmp < 0 || + (maxCmp == 0 && length > _stats.getMaximum().length())) { + setMaximum(std::string(value, value + length)); + } + } + } + + _stats.setTotalLength(_stats.getTotalLength() + length); + } + + void update(std::string value) { + update(value.c_str(), value.length()); + } + + void merge(const MutableColumnStatistics& other) override { + const StringColumnStatisticsImpl& strStats = + dynamic_cast<const StringColumnStatisticsImpl&>(other); + _stats.merge(strStats._stats); + } + + void reset() override { + _stats.reset(); + setTotalLength(0); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::StringStatistics* strStats = pbStats.mutable_stringstatistics(); + if (_stats.hasMinimum()) { + strStats->set_minimum(TString(_stats.getMinimum())); + strStats->set_maximum(TString(_stats.getMaximum())); + } else { + strStats->clear_minimum(); + strStats->clear_maximum(); + } + if (_stats.hasTotalLength()) { + strStats->set_sum(static_cast<int64_t>(_stats.getTotalLength())); + } else { + strStats->clear_sum(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + buffer << "Data type: String" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has 
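Finally, a hedged sketch of consuming the read-side interfaces implemented above. Statistics::getNumberOfColumns() and getColumnStatistics() are visible in the overrides here; how a populated Statistics object is obtained from a reader is outside this header, and the include below simply mirrors one this header already pulls in:

#include <cstdint>
#include <iostream>
#include "orc/Reader.hh"   // public declarations, as included by this header

void printStatistics(const orc::Statistics& stats) {
  for (uint32_t col = 0; col < stats.getNumberOfColumns(); ++col) {
    const orc::ColumnStatistics* cs = stats.getColumnStatistics(col);
    std::cout << "column " << col << ": " << cs->getNumberOfValues()
              << " values, has null: " << (cs->hasNull() ? "yes" : "no")
              << std::endl;
  }
}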
null: " << (hasNull() ? "yes" : "no") << std::endl; + if(hasMinimum()){ + buffer << "Minimum: " << getMinimum() << std::endl; + }else{ + buffer << "Minimum is not defined" << std::endl; + } + + if(hasMaximum()){ + buffer << "Maximum: " << getMaximum() << std::endl; + }else{ + buffer << "Maximum is not defined" << std::endl; + } + + if(hasTotalLength()){ + buffer << "Total length: " << getTotalLength() << std::endl; + }else{ + buffer << "Total length is not defined" << std::endl; + } + return buffer.str(); + } + }; + + class TimestampColumnStatisticsImpl: public TimestampColumnStatistics, + public MutableColumnStatistics { + private: + InternalIntegerStatistics _stats; + bool _hasLowerBound; + bool _hasUpperBound; + int64_t _lowerBound; + int64_t _upperBound; + + public: + TimestampColumnStatisticsImpl() { reset(); } + TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats, + const StatContext& statContext); + virtual ~TimestampColumnStatisticsImpl() override; + + bool hasMinimum() const override { + return _stats.hasMinimum(); + } + + bool hasMaximum() const override { + return _stats.hasMaximum(); + } + + uint64_t getNumberOfValues() const override { + return _stats.getNumberOfValues(); + } + + void setNumberOfValues(uint64_t value) override { + _stats.setNumberOfValues(value); + } + + void increase(uint64_t count) override { + _stats.setNumberOfValues(_stats.getNumberOfValues() + count); + } + + bool hasNull() const override { + return _stats.hasNull(); + } + + void setHasNull(bool hasNull) override { + _stats.setHasNull(hasNull); + } + + int64_t getMinimum() const override { + if(hasMinimum()){ + return _stats.getMinimum(); + }else{ + throw ParseError("Minimum is not defined."); + } + } + + int64_t getMaximum() const override { + if(hasMaximum()){ + return _stats.getMaximum(); + }else{ + throw ParseError("Maximum is not defined."); + } + } + + void setMinimum(int64_t minimum) { + _stats.setHasMinimum(true); + _stats.setMinimum(minimum); + } + + void setMaximum(int64_t maximum) { + _stats.setHasMaximum(true); + _stats.setMaximum(maximum); + } + + void update(int64_t value) { + _stats.updateMinMax(value); + } + + void merge(const MutableColumnStatistics& other) override { + const TimestampColumnStatisticsImpl& tsStats = + dynamic_cast<const TimestampColumnStatisticsImpl&>(other); + _stats.merge(tsStats._stats); + } + + void reset() override { + _stats.reset(); + } + + void toProtoBuf(proto::ColumnStatistics& pbStats) const override { + pbStats.set_hasnull(_stats.hasNull()); + pbStats.set_numberofvalues(_stats.getNumberOfValues()); + + proto::TimestampStatistics* tsStats = + pbStats.mutable_timestampstatistics(); + if (_stats.hasMinimum()) { + tsStats->set_minimumutc(_stats.getMinimum()); + tsStats->set_maximumutc(_stats.getMaximum()); + } else { + tsStats->clear_minimumutc(); + tsStats->clear_maximumutc(); + } + } + + std::string toString() const override { + std::ostringstream buffer; + struct tm tmValue; + char timeBuffer[20]; + time_t secs = 0; + + buffer << "Data type: Timestamp" << std::endl + << "Values: " << getNumberOfValues() << std::endl + << "Has null: " << (hasNull() ? "yes" : "no") << std::endl; + if(hasMinimum()){ + secs = static_cast<time_t>(getMinimum() / 1000); + gmtime_r(&secs, &tmValue); + strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); + buffer << "Minimum: " << timeBuffer << "." 
+ << (getMinimum() % 1000) << std::endl; + }else{ + buffer << "Minimum is not defined" << std::endl; + } + + if(hasLowerBound()){ + secs = static_cast<time_t>(getLowerBound() / 1000); + gmtime_r(&secs, &tmValue); + strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); + buffer << "LowerBound: " << timeBuffer << "." + << (getLowerBound() % 1000) << std::endl; + }else{ + buffer << "LowerBound is not defined" << std::endl; + } + + if(hasMaximum()){ + secs = static_cast<time_t>(getMaximum()/1000); + gmtime_r(&secs, &tmValue); + strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); + buffer << "Maximum: " << timeBuffer << "." + << (getMaximum() % 1000) << std::endl; + }else{ + buffer << "Maximum is not defined" << std::endl; + } + + if(hasUpperBound()){ + secs = static_cast<time_t>(getUpperBound() / 1000); + gmtime_r(&secs, &tmValue); + strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue); + buffer << "UpperBound: " << timeBuffer << "." + << (getUpperBound() % 1000) << std::endl; + }else{ + buffer << "UpperBound is not defined" << std::endl; + } + + return buffer.str(); + } + + bool hasLowerBound() const override { + return _hasLowerBound; + } + + bool hasUpperBound() const override { + return _hasUpperBound; + } + + int64_t getLowerBound() const override { + if(hasLowerBound()){ + return _lowerBound; + }else{ + throw ParseError("LowerBound is not defined."); + } + } + + int64_t getUpperBound() const override { + if(hasUpperBound()){ + return _upperBound; + }else{ + throw ParseError("UpperBound is not defined."); + } + } + }; + + ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s, + const StatContext& statContext); + + class StatisticsImpl: public Statistics { + private: + std::vector<ColumnStatistics*> colStats; + + // DELIBERATELY NOT IMPLEMENTED + StatisticsImpl(const StatisticsImpl&); + StatisticsImpl& operator=(const StatisticsImpl&); + + public: + StatisticsImpl(const proto::StripeStatistics& stripeStats, + const StatContext& statContext); + + StatisticsImpl(const proto::Footer& footer, const StatContext& statContext); + + virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId + ) const override { + return colStats[columnId]; + } + + virtual ~StatisticsImpl() override; + + uint32_t getNumberOfColumns() const override { + return static_cast<uint32_t>(colStats.size()); + } + }; + + class StripeStatisticsImpl: public StripeStatistics { + private: + std::unique_ptr<StatisticsImpl> columnStats; + std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > + rowIndexStats; + + // DELIBERATELY NOT IMPLEMENTED + StripeStatisticsImpl(const StripeStatisticsImpl&); + StripeStatisticsImpl& operator=(const StripeStatisticsImpl&); + + public: + StripeStatisticsImpl( + const proto::StripeStatistics& stripeStats, + std::vector<std::vector<proto::ColumnStatistics> >& indexStats, + const StatContext& statContext); + + virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId + ) const override { + return columnStats->getColumnStatistics(columnId); + } + + uint32_t getNumberOfColumns() const override { + return columnStats->getNumberOfColumns(); + } + + virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId, + uint32_t rowIndex + ) const override { + // check id indices are valid + return rowIndexStats[columnId][rowIndex].get(); + } + + virtual ~StripeStatisticsImpl() override; + + uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override { + return 
static_cast<uint32_t>(rowIndexStats[columnId].size()); + } + }; + + /** + * Create ColumnStatistics for writers + * @param type of column + * @return MutableColumnStatistics instances + */ + std::unique_ptr<MutableColumnStatistics> createColumnStatistics( + const Type& type); + +}// namespace + +#endif diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.cc b/contrib/libs/apache/orc/c++/src/StripeStream.cc index b63f19d28e..f9d82f30e0 100644 --- a/contrib/libs/apache/orc/c++/src/StripeStream.cc +++ b/contrib/libs/apache/orc/c++/src/StripeStream.cc @@ -1,161 +1,161 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/Exceptions.hh" -#include "RLE.hh" -#include "Reader.hh" -#include "StripeStream.hh" - -#include "wrap/coded-stream-wrapper.h" - -namespace orc { - - StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& _reader, uint64_t _index, - const proto::StripeInformation& _stripeInfo, - const proto::StripeFooter& _footer, - uint64_t _stripeStart, - InputStream& _input, - const Timezone& _writerTimezone - ): reader(_reader), - stripeInfo(_stripeInfo), - footer(_footer), - stripeIndex(_index), - stripeStart(_stripeStart), - input(_input), - writerTimezone(_writerTimezone) { - // PASS - } - - StripeStreamsImpl::~StripeStreamsImpl() { - // PASS - } - - StreamInformation::~StreamInformation() { - // PASS - } - - StripeInformation::~StripeInformation() { - // PASS - } - - - StreamInformationImpl::~StreamInformationImpl() { - // PASS - } - - const std::vector<bool> StripeStreamsImpl::getSelectedColumns() const { - return reader.getSelectedColumns(); - } - - proto::ColumnEncoding StripeStreamsImpl::getEncoding(uint64_t columnId - ) const { - return footer.columns(static_cast<int>(columnId)); - } - - const Timezone& StripeStreamsImpl::getWriterTimezone() const { - return writerTimezone; - } - - std::ostream* StripeStreamsImpl::getErrorStream() const { - return reader.getFileContents().errorStream; - } - - std::unique_ptr<SeekableInputStream> - StripeStreamsImpl::getStream(uint64_t columnId, - proto::Stream_Kind kind, - bool shouldStream) const { - uint64_t offset = stripeStart; - uint64_t dataEnd = stripeInfo.offset() + stripeInfo.indexlength() + stripeInfo.datalength(); - MemoryPool *pool = reader.getFileContents().pool; - for(int i = 0; i < footer.streams_size(); ++i) { - const proto::Stream& stream = footer.streams(i); - if (stream.has_kind() && - stream.kind() == kind && - stream.column() == static_cast<uint64_t>(columnId)) { - uint64_t streamLength = stream.length(); - uint64_t myBlock = shouldStream ? 
input.getNaturalReadSize(): streamLength; - if (offset + streamLength > dataEnd) { - std::stringstream msg; - msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex - << ": streamOffset=" << offset << ", streamLength=" << streamLength - << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength=" - << stripeInfo.indexlength() << ", stripeDataLength=" << stripeInfo.datalength(); - throw ParseError(msg.str()); - } - return createDecompressor(reader.getCompression(), - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream - (&input, - offset, - stream.length(), - *pool, - myBlock)), - reader.getCompressionSize(), - *pool); - } - offset += stream.length(); - } - return std::unique_ptr<SeekableInputStream>(); - } - - MemoryPool& StripeStreamsImpl::getMemoryPool() const { - return *reader.getFileContents().pool; - } - - bool StripeStreamsImpl::getThrowOnHive11DecimalOverflow() const { - return reader.getThrowOnHive11DecimalOverflow(); - } - - int32_t StripeStreamsImpl::getForcedScaleOnHive11Decimal() const { - return reader.getForcedScaleOnHive11Decimal(); - } - - void StripeInformationImpl::ensureStripeFooterLoaded() const { - if (stripeFooter.get() == nullptr) { - std::unique_ptr<SeekableInputStream> pbStream = - createDecompressor(compression, - std::unique_ptr<SeekableInputStream> - (new SeekableFileInputStream(stream, - offset + - indexLength + - dataLength, - footerLength, - memory)), - blockSize, - memory); - stripeFooter.reset(new proto::StripeFooter()); - if (!stripeFooter->ParseFromZeroCopyStream(pbStream.get())) { - throw ParseError("Failed to parse the stripe footer"); - } - } - } - - std::unique_ptr<StreamInformation> - StripeInformationImpl::getStreamInformation(uint64_t streamId) const { - ensureStripeFooterLoaded(); - uint64_t streamOffset = offset; - for(uint64_t s=0; s < streamId; ++s) { - streamOffset += stripeFooter->streams(static_cast<int>(s)).length(); - } - return ORC_UNIQUE_PTR<StreamInformation> - (new StreamInformationImpl(streamOffset, - stripeFooter-> - streams(static_cast<int>(streamId)))); - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Exceptions.hh" +#include "RLE.hh" +#include "Reader.hh" +#include "StripeStream.hh" + +#include "wrap/coded-stream-wrapper.h" + +namespace orc { + + StripeStreamsImpl::StripeStreamsImpl(const RowReaderImpl& _reader, uint64_t _index, + const proto::StripeInformation& _stripeInfo, + const proto::StripeFooter& _footer, + uint64_t _stripeStart, + InputStream& _input, + const Timezone& _writerTimezone + ): reader(_reader), + stripeInfo(_stripeInfo), + footer(_footer), + stripeIndex(_index), + stripeStart(_stripeStart), + input(_input), + writerTimezone(_writerTimezone) { + // PASS + } + + StripeStreamsImpl::~StripeStreamsImpl() { + // PASS + } + + StreamInformation::~StreamInformation() { + // PASS + } + + StripeInformation::~StripeInformation() { + // PASS + } + + + StreamInformationImpl::~StreamInformationImpl() { + // PASS + } + + const std::vector<bool> StripeStreamsImpl::getSelectedColumns() const { + return reader.getSelectedColumns(); + } + + proto::ColumnEncoding StripeStreamsImpl::getEncoding(uint64_t columnId + ) const { + return footer.columns(static_cast<int>(columnId)); + } + + const Timezone& StripeStreamsImpl::getWriterTimezone() const { + return writerTimezone; + } + + std::ostream* StripeStreamsImpl::getErrorStream() const { + return reader.getFileContents().errorStream; + } + + std::unique_ptr<SeekableInputStream> + StripeStreamsImpl::getStream(uint64_t columnId, + proto::Stream_Kind kind, + bool shouldStream) const { + uint64_t offset = stripeStart; + uint64_t dataEnd = stripeInfo.offset() + stripeInfo.indexlength() + stripeInfo.datalength(); + MemoryPool *pool = reader.getFileContents().pool; + for(int i = 0; i < footer.streams_size(); ++i) { + const proto::Stream& stream = footer.streams(i); + if (stream.has_kind() && + stream.kind() == kind && + stream.column() == static_cast<uint64_t>(columnId)) { + uint64_t streamLength = stream.length(); + uint64_t myBlock = shouldStream ? 
input.getNaturalReadSize(): streamLength; + if (offset + streamLength > dataEnd) { + std::stringstream msg; + msg << "Malformed stream meta at stream index " << i << " in stripe " << stripeIndex + << ": streamOffset=" << offset << ", streamLength=" << streamLength + << ", stripeOffset=" << stripeInfo.offset() << ", stripeIndexLength=" + << stripeInfo.indexlength() << ", stripeDataLength=" << stripeInfo.datalength(); + throw ParseError(msg.str()); + } + return createDecompressor(reader.getCompression(), + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream + (&input, + offset, + stream.length(), + *pool, + myBlock)), + reader.getCompressionSize(), + *pool); + } + offset += stream.length(); + } + return std::unique_ptr<SeekableInputStream>(); + } + + MemoryPool& StripeStreamsImpl::getMemoryPool() const { + return *reader.getFileContents().pool; + } + + bool StripeStreamsImpl::getThrowOnHive11DecimalOverflow() const { + return reader.getThrowOnHive11DecimalOverflow(); + } + + int32_t StripeStreamsImpl::getForcedScaleOnHive11Decimal() const { + return reader.getForcedScaleOnHive11Decimal(); + } + + void StripeInformationImpl::ensureStripeFooterLoaded() const { + if (stripeFooter.get() == nullptr) { + std::unique_ptr<SeekableInputStream> pbStream = + createDecompressor(compression, + std::unique_ptr<SeekableInputStream> + (new SeekableFileInputStream(stream, + offset + + indexLength + + dataLength, + footerLength, + memory)), + blockSize, + memory); + stripeFooter.reset(new proto::StripeFooter()); + if (!stripeFooter->ParseFromZeroCopyStream(pbStream.get())) { + throw ParseError("Failed to parse the stripe footer"); + } + } + } + + std::unique_ptr<StreamInformation> + StripeInformationImpl::getStreamInformation(uint64_t streamId) const { + ensureStripeFooterLoaded(); + uint64_t streamOffset = offset; + for(uint64_t s=0; s < streamId; ++s) { + streamOffset += stripeFooter->streams(static_cast<int>(s)).length(); + } + return ORC_UNIQUE_PTR<StreamInformation> + (new StreamInformationImpl(streamOffset, + stripeFooter-> + streams(static_cast<int>(streamId)))); + } + +} diff --git a/contrib/libs/apache/orc/c++/src/StripeStream.hh b/contrib/libs/apache/orc/c++/src/StripeStream.hh index 5cbaf60a69..da5cb16f37 100644 --- a/contrib/libs/apache/orc/c++/src/StripeStream.hh +++ b/contrib/libs/apache/orc/c++/src/StripeStream.hh @@ -1,213 +1,213 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_STRIPE_STREAM_HH -#define ORC_STRIPE_STREAM_HH - -#include "orc/Int128.hh" -#include "orc/OrcFile.hh" -#include "orc/Reader.hh" - -#include "Timezone.hh" -#include "TypeImpl.hh" - -namespace orc { - - class RowReaderImpl; - - /** - * StripeStream Implementation - */ - - class StripeStreamsImpl: public StripeStreams { - private: - const RowReaderImpl& reader; - const proto::StripeInformation& stripeInfo; - const proto::StripeFooter& footer; - const uint64_t stripeIndex; - const uint64_t stripeStart; - InputStream& input; - const Timezone& writerTimezone; - - public: - StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index, - const proto::StripeInformation& stripeInfo, - const proto::StripeFooter& footer, - uint64_t stripeStart, - InputStream& input, - const Timezone& writerTimezone); - - virtual ~StripeStreamsImpl() override; - - virtual const std::vector<bool> getSelectedColumns() const override; - - virtual proto::ColumnEncoding getEncoding(uint64_t columnId - ) const override; - - virtual std::unique_ptr<SeekableInputStream> - getStream(uint64_t columnId, - proto::Stream_Kind kind, - bool shouldStream) const override; - - MemoryPool& getMemoryPool() const override; - - const Timezone& getWriterTimezone() const override; - - std::ostream* getErrorStream() const override; - - bool getThrowOnHive11DecimalOverflow() const override; - - int32_t getForcedScaleOnHive11Decimal() const override; - }; - - /** - * StreamInformation Implementation - */ - - class StreamInformationImpl: public StreamInformation { - private: - StreamKind kind; - uint64_t column; - uint64_t offset; - uint64_t length; - public: - StreamInformationImpl(uint64_t _offset, - const proto::Stream& stream - ): kind(static_cast<StreamKind>(stream.kind())), - column(stream.column()), - offset(_offset), - length(stream.length()) { - // PASS - } - - ~StreamInformationImpl() override; - - StreamKind getKind() const override { - return kind; - } - - uint64_t getColumnId() const override { - return column; - } - - uint64_t getOffset() const override { - return offset; - } - - uint64_t getLength() const override { - return length; - } - }; - - /** - * StripeInformation Implementation - */ - - class StripeInformationImpl : public StripeInformation { - uint64_t offset; - uint64_t indexLength; - uint64_t dataLength; - uint64_t footerLength; - uint64_t numRows; - InputStream* stream; - MemoryPool& memory; - CompressionKind compression; - uint64_t blockSize; - mutable std::unique_ptr<proto::StripeFooter> stripeFooter; - void ensureStripeFooterLoaded() const; - public: - - StripeInformationImpl(uint64_t _offset, - uint64_t _indexLength, - uint64_t _dataLength, - uint64_t _footerLength, - uint64_t _numRows, - InputStream* _stream, - MemoryPool& _memory, - CompressionKind _compression, - uint64_t _blockSize - ) : offset(_offset), - indexLength(_indexLength), - dataLength(_dataLength), - footerLength(_footerLength), - numRows(_numRows), - stream(_stream), - memory(_memory), - compression(_compression), - blockSize(_blockSize) { - // PASS - } - - virtual ~StripeInformationImpl() override { - // PASS - } - - uint64_t getOffset() const override { - return offset; - } - - uint64_t getLength() const override { - return indexLength + dataLength + footerLength; - } - uint64_t getIndexLength() const override { - return indexLength; - } - - uint64_t getDataLength()const override { - return dataLength; - } - - uint64_t getFooterLength() const override { - return footerLength; - } - - uint64_t getNumberOfRows() const override { 
- return numRows; - } - - uint64_t getNumberOfStreams() const override { - ensureStripeFooterLoaded(); - return static_cast<uint64_t>(stripeFooter->streams_size()); - } - - std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId - ) const override; - - ColumnEncodingKind getColumnEncoding(uint64_t colId) const override { - ensureStripeFooterLoaded(); - return static_cast<ColumnEncodingKind>(stripeFooter-> - columns(static_cast<int>(colId)) - .kind()); - } - - uint64_t getDictionarySize(uint64_t colId) const override { - ensureStripeFooterLoaded(); - return static_cast<ColumnEncodingKind>(stripeFooter-> - columns(static_cast<int>(colId)) - .dictionarysize()); - } - - const std::string& getWriterTimezone() const override { - ensureStripeFooterLoaded(); - return stripeFooter->writertimezone(); - } - }; - -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_STRIPE_STREAM_HH +#define ORC_STRIPE_STREAM_HH + +#include "orc/Int128.hh" +#include "orc/OrcFile.hh" +#include "orc/Reader.hh" + +#include "Timezone.hh" +#include "TypeImpl.hh" + +namespace orc { + + class RowReaderImpl; + + /** + * StripeStream Implementation + */ + + class StripeStreamsImpl: public StripeStreams { + private: + const RowReaderImpl& reader; + const proto::StripeInformation& stripeInfo; + const proto::StripeFooter& footer; + const uint64_t stripeIndex; + const uint64_t stripeStart; + InputStream& input; + const Timezone& writerTimezone; + + public: + StripeStreamsImpl(const RowReaderImpl& reader, uint64_t index, + const proto::StripeInformation& stripeInfo, + const proto::StripeFooter& footer, + uint64_t stripeStart, + InputStream& input, + const Timezone& writerTimezone); + + virtual ~StripeStreamsImpl() override; + + virtual const std::vector<bool> getSelectedColumns() const override; + + virtual proto::ColumnEncoding getEncoding(uint64_t columnId + ) const override; + + virtual std::unique_ptr<SeekableInputStream> + getStream(uint64_t columnId, + proto::Stream_Kind kind, + bool shouldStream) const override; + + MemoryPool& getMemoryPool() const override; + + const Timezone& getWriterTimezone() const override; + + std::ostream* getErrorStream() const override; + + bool getThrowOnHive11DecimalOverflow() const override; + + int32_t getForcedScaleOnHive11Decimal() const override; + }; + + /** + * StreamInformation Implementation + */ + + class StreamInformationImpl: public StreamInformation { + private: + StreamKind kind; + uint64_t column; + uint64_t offset; + uint64_t length; + public: + StreamInformationImpl(uint64_t _offset, + const proto::Stream& stream + ): kind(static_cast<StreamKind>(stream.kind())), + column(stream.column()), + offset(_offset), + length(stream.length()) { + // PASS + } + + ~StreamInformationImpl() 
override; + + StreamKind getKind() const override { + return kind; + } + + uint64_t getColumnId() const override { + return column; + } + + uint64_t getOffset() const override { + return offset; + } + + uint64_t getLength() const override { + return length; + } + }; + + /** + * StripeInformation Implementation + */ + + class StripeInformationImpl : public StripeInformation { + uint64_t offset; + uint64_t indexLength; + uint64_t dataLength; + uint64_t footerLength; + uint64_t numRows; + InputStream* stream; + MemoryPool& memory; + CompressionKind compression; + uint64_t blockSize; + mutable std::unique_ptr<proto::StripeFooter> stripeFooter; + void ensureStripeFooterLoaded() const; + public: + + StripeInformationImpl(uint64_t _offset, + uint64_t _indexLength, + uint64_t _dataLength, + uint64_t _footerLength, + uint64_t _numRows, + InputStream* _stream, + MemoryPool& _memory, + CompressionKind _compression, + uint64_t _blockSize + ) : offset(_offset), + indexLength(_indexLength), + dataLength(_dataLength), + footerLength(_footerLength), + numRows(_numRows), + stream(_stream), + memory(_memory), + compression(_compression), + blockSize(_blockSize) { + // PASS + } + + virtual ~StripeInformationImpl() override { + // PASS + } + + uint64_t getOffset() const override { + return offset; + } + + uint64_t getLength() const override { + return indexLength + dataLength + footerLength; + } + uint64_t getIndexLength() const override { + return indexLength; + } + + uint64_t getDataLength()const override { + return dataLength; + } + + uint64_t getFooterLength() const override { + return footerLength; + } + + uint64_t getNumberOfRows() const override { + return numRows; + } + + uint64_t getNumberOfStreams() const override { + ensureStripeFooterLoaded(); + return static_cast<uint64_t>(stripeFooter->streams_size()); + } + + std::unique_ptr<StreamInformation> getStreamInformation(uint64_t streamId + ) const override; + + ColumnEncodingKind getColumnEncoding(uint64_t colId) const override { + ensureStripeFooterLoaded(); + return static_cast<ColumnEncodingKind>(stripeFooter-> + columns(static_cast<int>(colId)) + .kind()); + } + + uint64_t getDictionarySize(uint64_t colId) const override { + ensureStripeFooterLoaded(); + return static_cast<ColumnEncodingKind>(stripeFooter-> + columns(static_cast<int>(colId)) + .dictionarysize()); + } + + const std::string& getWriterTimezone() const override { + ensureStripeFooterLoaded(); + return stripeFooter->writertimezone(); + } + }; + +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/Timezone.cc b/contrib/libs/apache/orc/c++/src/Timezone.cc index 318e5bcc12..0aa66ef71c 100644 --- a/contrib/libs/apache/orc/c++/src/Timezone.cc +++ b/contrib/libs/apache/orc/c++/src/Timezone.cc @@ -1,936 +1,936 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/OrcFile.hh" -#include "Timezone.hh" - -#include <errno.h> -#include <map> -#include <sstream> -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include <time.h> - -namespace orc { - - // default location of the timezone files - static const char DEFAULT_TZDIR[] = "/usr/share/zoneinfo"; - - // location of a symlink to the local timezone - static const char LOCAL_TIMEZONE[] = "/etc/localtime"; - - enum TransitionKind { - TRANSITION_JULIAN, - TRANSITION_DAY, - TRANSITION_MONTH - }; - - static const int64_t MONTHS_PER_YEAR = 12; - /** - * The number of days in each month in non-leap and leap years. - */ - static const int64_t DAYS_PER_MONTH[2][MONTHS_PER_YEAR] = - {{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, - {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; - static const int64_t DAYS_PER_WEEK = 7; - - // Leap years and day of the week repeat every 400 years, which makes it - // a good cycle length. - static const int64_t SECONDS_PER_400_YEARS = - SECONDS_PER_DAY * (365 * (300 + 3) + 366 * (100 - 3)); - - /** - * Is the given year a leap year? - */ - bool isLeap(int64_t year) { - return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0)); - } - - /** - * Find the position that is the closest and less than or equal to the - * target. - * @return -1 if the target < array[0] or array is empty or - * i if array[i] <= target and (i == n or array[i] < array[i+1]) - */ - int64_t binarySearch(const std::vector<int64_t> &array, int64_t target) { - uint64_t size = array.size(); - if (size == 0) { - return -1; - } - uint64_t min = 0; - uint64_t max = size - 1; - uint64_t mid = (min + max) / 2; - while ((array[mid] != target) && (min < max)) { - if (array[mid] < target) { - min = mid + 1; - } else if (mid == 0) { - max = 0; - } else { - max = mid - 1; - } - mid = (min + max) / 2; - } - if (target < array[mid]) { - return static_cast<int64_t>(mid) - 1; - } else { - return static_cast<int64_t>(mid); - } - } - - struct Transition { - TransitionKind kind; - int64_t day; - int64_t week; - int64_t month; - int64_t time; - - std::string toString() const { - std::stringstream buffer; - switch (kind) { - case TRANSITION_JULIAN: - buffer << "julian " << day; - break; - case TRANSITION_DAY: - buffer << "day " << day; - break; - case TRANSITION_MONTH: - buffer << "month " << month << " week " << week << " day " << day; - break; - } - buffer << " at " << (time / (60 * 60)) << ":" << ((time / 60) % 60) - << ":" << (time % 60); - return buffer.str(); - } - - /** - * Get the transition time for the given year. - * @param year the year - * @return the number of seconds past local Jan 1 00:00:00 that the - * transition happens. - */ - int64_t getTime(int64_t year) const { - int64_t result = time; - switch (kind) { - case TRANSITION_JULIAN: - result += SECONDS_PER_DAY * day; - if (day > 60 && isLeap(year)) { - result += SECONDS_PER_DAY; - } - break; - case TRANSITION_DAY: - result += SECONDS_PER_DAY * day; - break; - case TRANSITION_MONTH: { - bool inLeap = isLeap(year); - int64_t adjustedMonth = (month + 9) % 12 + 1; - int64_t adjustedYear = (month <= 2) ? 
(year - 1) : year; - int64_t adjustedCentury = adjustedYear / 100; - int64_t adjustedRemainder = adjustedYear % 100; - - // day of the week of the first day of month - int64_t dayOfWeek = ((26 * adjustedMonth - 2) / 10 + - 1 + adjustedRemainder + adjustedRemainder / 4 + - adjustedCentury / 4 - 2 * adjustedCentury) % 7; - if (dayOfWeek < 0) { - dayOfWeek += DAYS_PER_WEEK; - } - - int64_t d = day - dayOfWeek; - if (d < 0) { - d += DAYS_PER_WEEK; - } - for (int w = 1; w < week; ++w) { - if (d + DAYS_PER_WEEK >= DAYS_PER_MONTH[inLeap][month - 1]) { - break; - } - d += DAYS_PER_WEEK; - } - result += d * SECONDS_PER_DAY; - - // Add in the time for the month - for(int m=0; m < month - 1; ++m) { - result += DAYS_PER_MONTH[inLeap][m] * SECONDS_PER_DAY; - } - break; - } - } - return result; - } - }; - - /** - * The current rule for finding timezone variants arbitrarily far in - * the future. They are based on a string representation that - * specifies the standard name and offset. For timezones with - * daylight savings, the string specifies the daylight variant name - * and offset and the rules for switching between them. - * - * rule = <standard name><standard offset><daylight>? - * name = string with no numbers or '+', '-', or ',' - * offset = [-+]?hh(:mm(:ss)?)? - * daylight = <name><offset>,<start day>(/<offset>)?,<end day>(/<offset>)? - * day = J<day without 2/29>|<day with 2/29>|M<month>.<week>.<day of week> - */ - class FutureRuleImpl: public FutureRule { - std::string ruleString; - TimezoneVariant standard; - bool hasDst; - TimezoneVariant dst; - Transition start; - Transition end; - - // expanded time_t offsets of transitions - std::vector<int64_t> offsets; - - // Is the epoch (1 Jan 1970 00:00) in standard time? - // This code assumes that the transition dates fall in the same order - // each year. Hopefully no timezone regions decide to move across the - // equator, which is about what it would take. - bool startInStd; - - void computeOffsets() { - if (!hasDst) { - startInStd = true; - offsets.resize(1); - } else { - // Insert a transition for the epoch and two per a year for the next - // 400 years. We assume that the all even positions are in standard - // time if and only if startInStd and the odd ones are the reverse. - offsets.resize(400 * 2 + 1); - startInStd = start.getTime(1970) < end.getTime(1970); - int64_t base = 0; - for(int64_t year = 1970; year < 1970 + 400; ++year) { - if (startInStd) { - offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] = - base + start.getTime(year) - standard.gmtOffset; - offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] = - base + end.getTime(year) - dst.gmtOffset; - } else { - offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] = - base + end.getTime(year) - dst.gmtOffset; - offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] = - base + start.getTime(year) - standard.gmtOffset; - } - base += (isLeap(year) ? 
366 : 365) * SECONDS_PER_DAY; - } - } - offsets[0] = 0; - } - - public: - virtual ~FutureRuleImpl() override; - bool isDefined() const override; - const TimezoneVariant& getVariant(int64_t clk) const override; - void print(std::ostream& out) const override; - - friend class FutureRuleParser; - }; - - FutureRule::~FutureRule() { - // PASS - } - - FutureRuleImpl::~FutureRuleImpl() { - // PASS - } - - bool FutureRuleImpl::isDefined() const { - return ruleString.size() > 0; - } - - const TimezoneVariant& FutureRuleImpl::getVariant(int64_t clk) const { - if (!hasDst) { - return standard; - } else { - int64_t adjusted = clk % SECONDS_PER_400_YEARS; - if (adjusted < 0) { - adjusted += SECONDS_PER_400_YEARS; - } - int64_t idx = binarySearch(offsets, adjusted); - if (startInStd == (idx % 2 == 0)) { - return standard; - } else { - return dst; - } - } - } - - void FutureRuleImpl::print(std::ostream& out) const { - if (isDefined()) { - out << " Future rule: " << ruleString << "\n"; - out << " standard " << standard.toString() << "\n"; - if (hasDst) { - out << " dst " << dst.toString() << "\n"; - out << " start " << start.toString() << "\n"; - out << " end " << end.toString() << "\n"; - } - } - } - - /** - * A parser for the future rule strings. - */ - class FutureRuleParser { - public: - FutureRuleParser(const std::string& str, - FutureRuleImpl* rule - ): ruleString(str), - length(str.size()), - position(0), - output(*rule) { - output.ruleString = str; - if (position != length) { - parseName(output.standard.name); - output.standard.gmtOffset = -parseOffset(); - output.standard.isDst = false; - output.hasDst = position < length; - if (output.hasDst) { - parseName(output.dst.name); - output.dst.isDst = true; - if (ruleString[position] != ',') { - output.dst.gmtOffset = -parseOffset(); - } else { - output.dst.gmtOffset = output.standard.gmtOffset + 60 * 60; - } - parseTransition(output.start); - parseTransition(output.end); - } - if (position != length) { - throwError("Extra text"); - } - output.computeOffsets(); - } - } - - private: - - const std::string& ruleString; - size_t length; - size_t position; - FutureRuleImpl &output; - - void throwError(const char *msg) { - std::stringstream buffer; - buffer << msg << " at " << position << " in '" << ruleString << "'"; - throw TimezoneError(buffer.str()); - } - - /** - * Parse the names of the form: - * ([^-+0-9,]+|<[^>]+>) - * and set the output string. - */ - void parseName(std::string& result) { - if (position == length) { - throwError("name required"); - } - size_t start = position; - if (ruleString[position] == '<') { - while (position < length && ruleString[position] != '>') { - position += 1; - } - if (position == length) { - throwError("missing close '>'"); - } - position +=1; - } else { - while (position < length) { - char ch = ruleString[position]; - if (isdigit(ch) || ch == '-' || ch == '+' || ch == ',') { - break; - } - position += 1; - } - } - if (position == start) { - throwError("empty string not allowed"); - } - result = ruleString.substr(start, position - start); - } - - /** - * Parse an integer of the form [0-9]+ and return it. - */ - int64_t parseNumber() { - if (position >= length) { - throwError("missing number"); - } - int64_t result = 0; - while (position < length) { - char ch = ruleString[position]; - if (isdigit(ch)) { - result = result * 10 + (ch - '0'); - position += 1; - } else { - break; - } - } - return result; - } - - /** - * Parse the offsets of the form: - * [-+]?[0-9]+(:[0-9]+(:[0-9]+)?)? 
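-      *    (worked example, following the parsing code below: "-5:30" is read
-      *    as -(5*3600 + 30*60) = -19800 seconds)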
- * and convert it into a number of seconds. - */ - int64_t parseOffset() { - int64_t scale = 3600; - bool isNegative = false; - if (position < length) { - char ch = ruleString[position]; - isNegative = ch == '-'; - if (ch == '-' || ch == '+') { - position += 1; - } - } - int64_t result = parseNumber() * scale; - while (position < length && scale > 1 && ruleString[position] == ':') { - scale /= 60; - position += 1; - result += parseNumber() * scale; - } - if (isNegative) { - result = -result; - } - return result; - } - - /** - * Parse a transition of the following form: - * ,(J<number>|<number>|M<number>.<number>.<number>)(/<offset>)? - */ - void parseTransition(Transition& transition) { - if (length - position < 2 || ruleString[position] != ',') { - throwError("missing transition"); - } - position += 1; - char ch = ruleString[position]; - if (ch == 'J') { - transition.kind = TRANSITION_JULIAN; - position += 1; - transition.day = parseNumber(); - } else if (ch == 'M') { - transition.kind = TRANSITION_MONTH; - position += 1; - transition.month = parseNumber(); - if (position == length || ruleString[position] != '.') { - throwError("missing first ."); - } - position += 1; - transition.week = parseNumber(); - if (position == length || ruleString[position] != '.') { - throwError("missing second ."); - } - position += 1; - transition.day = parseNumber(); - } else { - transition.kind = TRANSITION_DAY; - transition.day = parseNumber(); - } - if (position < length && ruleString[position] == '/') { - position += 1; - transition.time = parseOffset(); - } else { - transition.time = 2 * 60 * 60; - } - } - }; - - /** - * Parse the POSIX TZ string. - */ - std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString) { - std::shared_ptr<FutureRule> result(new FutureRuleImpl()); - FutureRuleParser parser(ruleString, - dynamic_cast<FutureRuleImpl*>(result.get())); - return result; - } - - std::string TimezoneVariant::toString() const { - std::stringstream buffer; - buffer << name << " " << gmtOffset; - if (isDst) { - buffer << " (dst)"; - } - return buffer.str(); - } - - /** - * An abstraction of the differences between versions. - */ - class VersionParser { - public: - virtual ~VersionParser(); - - /** - * Get the version number. - */ - virtual uint64_t getVersion() const = 0; - - /** - * Get the number of bytes - */ - virtual uint64_t getTimeSize() const = 0; - - /** - * Parse the time at the given location. - */ - virtual int64_t parseTime(const unsigned char* ptr) const = 0; - - /** - * Parse the future string - */ - virtual std::string parseFutureString(const unsigned char *ptr, - uint64_t offset, - uint64_t length) const = 0; - }; - - VersionParser::~VersionParser() { - // PASS - } - - static uint32_t decode32(const unsigned char* ptr) { - return static_cast<uint32_t>(ptr[0] << 24) | - static_cast<uint32_t>(ptr[1] << 16) | - static_cast<uint32_t>(ptr[2] << 8) | - static_cast<uint32_t>(ptr[3]); - } - - class Version1Parser: public VersionParser { - public: - virtual ~Version1Parser() override; - - virtual uint64_t getVersion() const override { - return 1; - } - - /** - * Get the number of bytes - */ - virtual uint64_t getTimeSize() const override { - return 4; - } - - /** - * Parse the time at the given location. 
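-      * Version 1 files store each transition time as a signed 32-bit
-      * big-endian value, hence the 4-byte decode and sign extension below.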
- */ - virtual int64_t parseTime(const unsigned char* ptr) const override { - // sign extend from 32 bits - return static_cast<int32_t>(decode32(ptr)); - } - - virtual std::string parseFutureString(const unsigned char *, - uint64_t, - uint64_t) const override { - return ""; - } - }; - - Version1Parser::~Version1Parser() { - // PASS - } - - class Version2Parser: public VersionParser { - public: - virtual ~Version2Parser() override; - - virtual uint64_t getVersion() const override { - return 2; - } - - /** - * Get the number of bytes - */ - virtual uint64_t getTimeSize() const override { - return 8; - } - - /** - * Parse the time at the given location. - */ - virtual int64_t parseTime(const unsigned char* ptr) const override { - return static_cast<int64_t>(decode32(ptr)) << 32 | decode32(ptr + 4); - } - - virtual std::string parseFutureString(const unsigned char *ptr, - uint64_t offset, - uint64_t length) const override { - return std::string(reinterpret_cast<const char*>(ptr) + offset + 1, - length - 2); - } - }; - - Version2Parser::~Version2Parser() { - // PASS - } - - class TimezoneImpl: public Timezone { - public: - TimezoneImpl(const std::string& name, - const std::vector<unsigned char> bytes); - virtual ~TimezoneImpl() override; - - /** - * Get the variant for the given time (time_t). - */ - const TimezoneVariant& getVariant(int64_t clk) const override; - - void print(std::ostream&) const override; - - uint64_t getVersion() const override { - return version; - } - - int64_t getEpoch() const override { - return epoch; - } - - int64_t convertToUTC(int64_t clk) const override { - return clk + getVariant(clk).gmtOffset; - } - - private: - void parseTimeVariants(const unsigned char* ptr, - uint64_t variantOffset, - uint64_t variantCount, - uint64_t nameOffset, - uint64_t nameCount); - void parseZoneFile(const unsigned char* ptr, - uint64_t sectionOffset, - uint64_t fileLength, - const VersionParser& version); - // filename - std::string filename; - - // the version of the file - uint64_t version; - - // the list of variants for this timezone - std::vector<TimezoneVariant> variants; - - // the list of the times where the local rules change - std::vector<int64_t> transitions; - - // the variant that starts at this transition. - std::vector<uint64_t> currentVariant; - - // the variant before the first transition - uint64_t ancientVariant; - - // the rule for future times - std::shared_ptr<FutureRule> futureRule; - - // the last explicit transition after which we use the future rule - int64_t lastTransition; - - // The ORC epoch time in this timezone. 
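-    // (i.e. the time_t at which local clocks in this zone read
-    // 2015-01-01 00:00:00; computed in the constructor).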
- int64_t epoch; - }; - - DIAGNOSTIC_PUSH - #ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wglobal-constructors") - DIAGNOSTIC_IGNORE("-Wexit-time-destructors") - #endif - static std::mutex timezone_mutex; - static std::map<std::string, std::shared_ptr<Timezone> > timezoneCache; - DIAGNOSTIC_POP - - Timezone::~Timezone() { - // PASS - } - - TimezoneImpl::TimezoneImpl(const std::string& _filename, - const std::vector<unsigned char> buffer - ): filename(_filename) { - parseZoneFile(&buffer[0], 0, buffer.size(), Version1Parser()); - // Build the literal for the ORC epoch - // 2015 Jan 1 00:00:00 - tm epochStruct; - epochStruct.tm_sec = 0; - epochStruct.tm_min = 0; - epochStruct.tm_hour = 0; - epochStruct.tm_mday = 1; - epochStruct.tm_mon = 0; - epochStruct.tm_year = 2015 - 1900; - epochStruct.tm_isdst = 0; - time_t utcEpoch = timegm(&epochStruct); - epoch = utcEpoch - getVariant(utcEpoch).gmtOffset; - } - - const char* getTimezoneDirectory() { - const char *dir = getenv("TZDIR"); - if (!dir) { - dir = DEFAULT_TZDIR; - } - return dir; - } - - /** - * Get a timezone by absolute filename. - * Results are cached. - */ - const Timezone& getTimezoneByFilename(const std::string& filename) { - // ORC-110 - std::lock_guard<std::mutex> timezone_lock(timezone_mutex); - std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = - timezoneCache.find(filename); - if (itr != timezoneCache.end()) { - return *(itr->second).get(); - } - try { - ORC_UNIQUE_PTR<InputStream> file = readFile(filename); - size_t size = static_cast<size_t>(file->getLength()); - std::vector<unsigned char> buffer(size); - file->read(&buffer[0], size, 0); - timezoneCache[filename] = std::shared_ptr<Timezone>(new TimezoneImpl(filename, buffer)); - } catch(ParseError& err) { - throw TimezoneError(err.what()); - } - return *timezoneCache[filename].get(); - } - - /** - * Get the local timezone. - */ - const Timezone& getLocalTimezone() { -#ifdef _MSC_VER - return getTimezoneByName("UTC"); -#else - return getTimezoneByFilename(LOCAL_TIMEZONE); -#endif - } - - /** - * Get a timezone by name (eg. America/Los_Angeles). - * Results are cached. - */ - const Timezone& getTimezoneByName(const std::string& zone) { - std::string filename(getTimezoneDirectory()); - filename += "/"; - filename += zone; - return getTimezoneByFilename(filename); - } - - /** - * Parse a set of bytes as a timezone file as if they came from filename. - */ - std::unique_ptr<Timezone> getTimezone(const std::string& filename, - const std::vector<unsigned char>& b){ - return std::unique_ptr<Timezone>(new TimezoneImpl(filename, b)); - } - - TimezoneImpl::~TimezoneImpl() { - // PASS - } - - void TimezoneImpl::parseTimeVariants(const unsigned char* ptr, - uint64_t variantOffset, - uint64_t variantCount, - uint64_t nameOffset, - uint64_t nameCount) { - for(uint64_t variant=0; variant < variantCount; ++variant) { - variants[variant].gmtOffset = - static_cast<int32_t>(decode32(ptr + variantOffset + 6 * variant)); - variants[variant].isDst = ptr[variantOffset + 6 * variant + 4] != 0; - uint64_t nameStart = ptr[variantOffset + 6 * variant + 5]; - if (nameStart >= nameCount) { - std::stringstream buffer; - buffer << "name out of range in variant " << variant - << " - " << nameStart << " >= " << nameCount; - throw TimezoneError(buffer.str()); - } - variants[variant].name = std::string(reinterpret_cast<const char*>(ptr) - + nameOffset + nameStart); - } - } - - /** - * Parse the zone file to get the bits we need. 
- * There are two versions of the timezone file: - * - * Version 1(version = 0x00): - * Magic(version) - * Header - * TransitionTimes(4 byte) - * TransitionRules - * Rules - * LeapSeconds(4 byte) - * IsStd - * IsGmt - * - * Version2: - * Version1(0x32) = a version 1 copy of the data for old clients - * Magic(0x32) - * Header - * TransitionTimes(8 byte) - * TransitionRules - * Rules - * LeapSeconds(8 byte) - * IsStd - * IsGmt - * FutureString - */ - void TimezoneImpl::parseZoneFile(const unsigned char *ptr, - uint64_t sectionOffset, - uint64_t fileLength, - const VersionParser& versionParser) { - const uint64_t magicOffset = sectionOffset + 0; - const uint64_t headerOffset = magicOffset + 20; - - // check for validity before we start parsing - if (fileLength < headerOffset + 6 * 4 || - strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4) - != 0) { - std::stringstream buffer; - buffer << "non-tzfile " << filename; - throw TimezoneError(buffer.str()); - } - - const uint64_t isGmtCount = decode32(ptr + headerOffset + 0); - const uint64_t isStdCount = decode32(ptr + headerOffset + 4); - const uint64_t leapCount = decode32(ptr + headerOffset + 8); - const uint64_t timeCount = decode32(ptr + headerOffset + 12); - const uint64_t variantCount = decode32(ptr + headerOffset + 16); - const uint64_t nameCount = decode32(ptr + headerOffset + 20); - - const uint64_t timeOffset = headerOffset + 24; - const uint64_t timeVariantOffset = - timeOffset + versionParser.getTimeSize() * timeCount; - const uint64_t variantOffset = timeVariantOffset + timeCount; - const uint64_t nameOffset = variantOffset + variantCount * 6; - const uint64_t sectionLength = nameOffset + nameCount - + (versionParser.getTimeSize() + 4) * leapCount - + isGmtCount + isStdCount; - - if (sectionLength > fileLength) { - std::stringstream buffer; - buffer << "tzfile too short " << filename - << " needs " << sectionLength << " and has " << fileLength; - throw TimezoneError(buffer.str()); - } - - // if it is version 2, skip over the old layout and read the new one. 
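-      // (per the layout above, a version 2 file begins with a complete
-      // version 1 copy of the data, so we re-parse once from sectionLength
-      // with the Version2Parser to read the 8-byte section).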
- if (sectionOffset == 0 && ptr[magicOffset + 4] != 0) { - parseZoneFile(ptr, sectionLength, fileLength, Version2Parser()); - return; - } - version = versionParser.getVersion(); - variants.resize(variantCount); - transitions.resize(timeCount); - currentVariant.resize(timeCount); - parseTimeVariants(ptr, variantOffset, variantCount, nameOffset, - nameCount); - bool foundAncient = false; - for(uint64_t t=0; t < timeCount; ++t) { - transitions[t] = - versionParser.parseTime(ptr + timeOffset + - t * versionParser.getTimeSize()); - currentVariant[t] = ptr[timeVariantOffset + t]; - if (currentVariant[t] >= variantCount) { - std::stringstream buffer; - buffer << "tzfile rule out of range " << filename - << " references rule " << currentVariant[t] - << " of " << variantCount; - throw TimezoneError(buffer.str()); - } - // find the oldest standard time and use that as the ancient value - if (!foundAncient && - !variants[currentVariant[t]].isDst) { - foundAncient = true; - ancientVariant = currentVariant[t]; - } - } - if (!foundAncient) { - ancientVariant = 0; - } - futureRule = parseFutureRule(versionParser.parseFutureString - (ptr, sectionLength, - fileLength - sectionLength)); - - // find the lower bound for applying the future rule - if (futureRule->isDefined()) { - if (timeCount > 0) { - lastTransition = transitions[timeCount - 1]; - } else { - lastTransition = INT64_MIN; - } - } else { - lastTransition = INT64_MAX; - } - } - - const TimezoneVariant& TimezoneImpl::getVariant(int64_t clk) const { - // if it is after the last explicit entry in the table, - // use the future rule to get an answer - if (clk > lastTransition) { - return futureRule->getVariant(clk); - } else { - int64_t transition = binarySearch(transitions, clk); - uint64_t idx; - if (transition < 0) { - idx = ancientVariant; - } else { - idx = currentVariant[static_cast<size_t>(transition)]; - } - return variants[idx]; - } - } - - void TimezoneImpl::print(std::ostream& out) const { - out << "Timezone file: " << filename << "\n"; - out << " Version: " << version << "\n"; - futureRule->print(out); - for(uint64_t r=0; r < variants.size(); ++r) { - out << " Variant " << r << ": " - << variants[r].toString() << "\n"; - } - for(uint64_t t=0; t < transitions.size(); ++t) { - tm timeStruct; - tm* result = nullptr; - char buffer[25]; - if (sizeof(time_t) >= 8) { - time_t val = transitions[t]; - result = gmtime_r(&val, &timeStruct); - if (result) { - strftime(buffer, sizeof(buffer), "%F %H:%M:%S", &timeStruct); - } - } - std::cout << " Transition: " << (result == nullptr ? "null" : buffer) - << " (" << transitions[t] << ") -> " - << variants[currentVariant[t]].name - << "\n"; - } - } - - TimezoneError::TimezoneError(const std::string& what - ): std::runtime_error(what) { - // PASS - } - - TimezoneError::TimezoneError(const TimezoneError& other - ): std::runtime_error(other) { - // PASS - } - - TimezoneError::~TimezoneError() ORC_NOEXCEPT { - // PASS - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/OrcFile.hh" +#include "Timezone.hh" + +#include <errno.h> +#include <map> +#include <sstream> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +namespace orc { + + // default location of the timezone files + static const char DEFAULT_TZDIR[] = "/usr/share/zoneinfo"; + + // location of a symlink to the local timezone + static const char LOCAL_TIMEZONE[] = "/etc/localtime"; + + enum TransitionKind { + TRANSITION_JULIAN, + TRANSITION_DAY, + TRANSITION_MONTH + }; + + static const int64_t MONTHS_PER_YEAR = 12; + /** + * The number of days in each month in non-leap and leap years. + */ + static const int64_t DAYS_PER_MONTH[2][MONTHS_PER_YEAR] = + {{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, + {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; + static const int64_t DAYS_PER_WEEK = 7; + + // Leap years and day of the week repeat every 400 years, which makes it + // a good cycle length. + static const int64_t SECONDS_PER_400_YEARS = + SECONDS_PER_DAY * (365 * (300 + 3) + 366 * (100 - 3)); + + /** + * Is the given year a leap year? + */ + bool isLeap(int64_t year) { + return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0)); + } + + /** + * Find the position that is the closest and less than or equal to the + * target. + * @return -1 if the target < array[0] or array is empty or + * i if array[i] <= target and (i == n or array[i] < array[i+1]) + */ + int64_t binarySearch(const std::vector<int64_t> &array, int64_t target) { + uint64_t size = array.size(); + if (size == 0) { + return -1; + } + uint64_t min = 0; + uint64_t max = size - 1; + uint64_t mid = (min + max) / 2; + while ((array[mid] != target) && (min < max)) { + if (array[mid] < target) { + min = mid + 1; + } else if (mid == 0) { + max = 0; + } else { + max = mid - 1; + } + mid = (min + max) / 2; + } + if (target < array[mid]) { + return static_cast<int64_t>(mid) - 1; + } else { + return static_cast<int64_t>(mid); + } + } + + struct Transition { + TransitionKind kind; + int64_t day; + int64_t week; + int64_t month; + int64_t time; + + std::string toString() const { + std::stringstream buffer; + switch (kind) { + case TRANSITION_JULIAN: + buffer << "julian " << day; + break; + case TRANSITION_DAY: + buffer << "day " << day; + break; + case TRANSITION_MONTH: + buffer << "month " << month << " week " << week << " day " << day; + break; + } + buffer << " at " << (time / (60 * 60)) << ":" << ((time / 60) % 60) + << ":" << (time % 60); + return buffer.str(); + } + + /** + * Get the transition time for the given year. + * @param year the year + * @return the number of seconds past local Jan 1 00:00:00 that the + * transition happens. + */ + int64_t getTime(int64_t year) const { + int64_t result = time; + switch (kind) { + case TRANSITION_JULIAN: + result += SECONDS_PER_DAY * day; + if (day > 60 && isLeap(year)) { + result += SECONDS_PER_DAY; + } + break; + case TRANSITION_DAY: + result += SECONDS_PER_DAY * day; + break; + case TRANSITION_MONTH: { + bool inLeap = isLeap(year); + int64_t adjustedMonth = (month + 9) % 12 + 1; + int64_t adjustedYear = (month <= 2) ? 
(year - 1) : year; + int64_t adjustedCentury = adjustedYear / 100; + int64_t adjustedRemainder = adjustedYear % 100; + + // day of the week of the first day of month + int64_t dayOfWeek = ((26 * adjustedMonth - 2) / 10 + + 1 + adjustedRemainder + adjustedRemainder / 4 + + adjustedCentury / 4 - 2 * adjustedCentury) % 7; + if (dayOfWeek < 0) { + dayOfWeek += DAYS_PER_WEEK; + } + + int64_t d = day - dayOfWeek; + if (d < 0) { + d += DAYS_PER_WEEK; + } + for (int w = 1; w < week; ++w) { + if (d + DAYS_PER_WEEK >= DAYS_PER_MONTH[inLeap][month - 1]) { + break; + } + d += DAYS_PER_WEEK; + } + result += d * SECONDS_PER_DAY; + + // Add in the time for the month + for(int m=0; m < month - 1; ++m) { + result += DAYS_PER_MONTH[inLeap][m] * SECONDS_PER_DAY; + } + break; + } + } + return result; + } + }; + + /** + * The current rule for finding timezone variants arbitrarily far in + * the future. They are based on a string representation that + * specifies the standard name and offset. For timezones with + * daylight savings, the string specifies the daylight variant name + * and offset and the rules for switching between them. + * + * rule = <standard name><standard offset><daylight>? + * name = string with no numbers or '+', '-', or ',' + * offset = [-+]?hh(:mm(:ss)?)? + * daylight = <name><offset>,<start day>(/<offset>)?,<end day>(/<offset>)? + * day = J<day without 2/29>|<day with 2/29>|M<month>.<week>.<day of week> + */ + class FutureRuleImpl: public FutureRule { + std::string ruleString; + TimezoneVariant standard; + bool hasDst; + TimezoneVariant dst; + Transition start; + Transition end; + + // expanded time_t offsets of transitions + std::vector<int64_t> offsets; + + // Is the epoch (1 Jan 1970 00:00) in standard time? + // This code assumes that the transition dates fall in the same order + // each year. Hopefully no timezone regions decide to move across the + // equator, which is about what it would take. + bool startInStd; + + void computeOffsets() { + if (!hasDst) { + startInStd = true; + offsets.resize(1); + } else { + // Insert a transition for the epoch and two per a year for the next + // 400 years. We assume that the all even positions are in standard + // time if and only if startInStd and the odd ones are the reverse. + offsets.resize(400 * 2 + 1); + startInStd = start.getTime(1970) < end.getTime(1970); + int64_t base = 0; + for(int64_t year = 1970; year < 1970 + 400; ++year) { + if (startInStd) { + offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] = + base + start.getTime(year) - standard.gmtOffset; + offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] = + base + end.getTime(year) - dst.gmtOffset; + } else { + offsets[static_cast<uint64_t>(year - 1970) * 2 + 1] = + base + end.getTime(year) - dst.gmtOffset; + offsets[static_cast<uint64_t>(year - 1970) * 2 + 2] = + base + start.getTime(year) - standard.gmtOffset; + } + base += (isLeap(year) ? 
366 : 365) * SECONDS_PER_DAY; + } + } + offsets[0] = 0; + } + + public: + virtual ~FutureRuleImpl() override; + bool isDefined() const override; + const TimezoneVariant& getVariant(int64_t clk) const override; + void print(std::ostream& out) const override; + + friend class FutureRuleParser; + }; + + FutureRule::~FutureRule() { + // PASS + } + + FutureRuleImpl::~FutureRuleImpl() { + // PASS + } + + bool FutureRuleImpl::isDefined() const { + return ruleString.size() > 0; + } + + const TimezoneVariant& FutureRuleImpl::getVariant(int64_t clk) const { + if (!hasDst) { + return standard; + } else { + int64_t adjusted = clk % SECONDS_PER_400_YEARS; + if (adjusted < 0) { + adjusted += SECONDS_PER_400_YEARS; + } + int64_t idx = binarySearch(offsets, adjusted); + if (startInStd == (idx % 2 == 0)) { + return standard; + } else { + return dst; + } + } + } + + void FutureRuleImpl::print(std::ostream& out) const { + if (isDefined()) { + out << " Future rule: " << ruleString << "\n"; + out << " standard " << standard.toString() << "\n"; + if (hasDst) { + out << " dst " << dst.toString() << "\n"; + out << " start " << start.toString() << "\n"; + out << " end " << end.toString() << "\n"; + } + } + } + + /** + * A parser for the future rule strings. + */ + class FutureRuleParser { + public: + FutureRuleParser(const std::string& str, + FutureRuleImpl* rule + ): ruleString(str), + length(str.size()), + position(0), + output(*rule) { + output.ruleString = str; + if (position != length) { + parseName(output.standard.name); + output.standard.gmtOffset = -parseOffset(); + output.standard.isDst = false; + output.hasDst = position < length; + if (output.hasDst) { + parseName(output.dst.name); + output.dst.isDst = true; + if (ruleString[position] != ',') { + output.dst.gmtOffset = -parseOffset(); + } else { + output.dst.gmtOffset = output.standard.gmtOffset + 60 * 60; + } + parseTransition(output.start); + parseTransition(output.end); + } + if (position != length) { + throwError("Extra text"); + } + output.computeOffsets(); + } + } + + private: + + const std::string& ruleString; + size_t length; + size_t position; + FutureRuleImpl &output; + + void throwError(const char *msg) { + std::stringstream buffer; + buffer << msg << " at " << position << " in '" << ruleString << "'"; + throw TimezoneError(buffer.str()); + } + + /** + * Parse the names of the form: + * ([^-+0-9,]+|<[^>]+>) + * and set the output string. + */ + void parseName(std::string& result) { + if (position == length) { + throwError("name required"); + } + size_t start = position; + if (ruleString[position] == '<') { + while (position < length && ruleString[position] != '>') { + position += 1; + } + if (position == length) { + throwError("missing close '>'"); + } + position +=1; + } else { + while (position < length) { + char ch = ruleString[position]; + if (isdigit(ch) || ch == '-' || ch == '+' || ch == ',') { + break; + } + position += 1; + } + } + if (position == start) { + throwError("empty string not allowed"); + } + result = ruleString.substr(start, position - start); + } + + /** + * Parse an integer of the form [0-9]+ and return it. + */ + int64_t parseNumber() { + if (position >= length) { + throwError("missing number"); + } + int64_t result = 0; + while (position < length) { + char ch = ruleString[position]; + if (isdigit(ch)) { + result = result * 10 + (ch - '0'); + position += 1; + } else { + break; + } + } + return result; + } + + /** + * Parse the offsets of the form: + * [-+]?[0-9]+(:[0-9]+(:[0-9]+)?)? 
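+   *   for example "8", "-7:30", or "+5:45:30" (hours, then optional
+   *   minutes and seconds)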
+ * and convert it into a number of seconds. + */ + int64_t parseOffset() { + int64_t scale = 3600; + bool isNegative = false; + if (position < length) { + char ch = ruleString[position]; + isNegative = ch == '-'; + if (ch == '-' || ch == '+') { + position += 1; + } + } + int64_t result = parseNumber() * scale; + while (position < length && scale > 1 && ruleString[position] == ':') { + scale /= 60; + position += 1; + result += parseNumber() * scale; + } + if (isNegative) { + result = -result; + } + return result; + } + + /** + * Parse a transition of the following form: + * ,(J<number>|<number>|M<number>.<number>.<number>)(/<offset>)? + */ + void parseTransition(Transition& transition) { + if (length - position < 2 || ruleString[position] != ',') { + throwError("missing transition"); + } + position += 1; + char ch = ruleString[position]; + if (ch == 'J') { + transition.kind = TRANSITION_JULIAN; + position += 1; + transition.day = parseNumber(); + } else if (ch == 'M') { + transition.kind = TRANSITION_MONTH; + position += 1; + transition.month = parseNumber(); + if (position == length || ruleString[position] != '.') { + throwError("missing first ."); + } + position += 1; + transition.week = parseNumber(); + if (position == length || ruleString[position] != '.') { + throwError("missing second ."); + } + position += 1; + transition.day = parseNumber(); + } else { + transition.kind = TRANSITION_DAY; + transition.day = parseNumber(); + } + if (position < length && ruleString[position] == '/') { + position += 1; + transition.time = parseOffset(); + } else { + transition.time = 2 * 60 * 60; + } + } + }; + + /** + * Parse the POSIX TZ string. + */ + std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString) { + std::shared_ptr<FutureRule> result(new FutureRuleImpl()); + FutureRuleParser parser(ruleString, + dynamic_cast<FutureRuleImpl*>(result.get())); + return result; + } + + std::string TimezoneVariant::toString() const { + std::stringstream buffer; + buffer << name << " " << gmtOffset; + if (isDst) { + buffer << " (dst)"; + } + return buffer.str(); + } + + /** + * An abstraction of the differences between versions. + */ + class VersionParser { + public: + virtual ~VersionParser(); + + /** + * Get the version number. + */ + virtual uint64_t getVersion() const = 0; + + /** + * Get the number of bytes + */ + virtual uint64_t getTimeSize() const = 0; + + /** + * Parse the time at the given location. + */ + virtual int64_t parseTime(const unsigned char* ptr) const = 0; + + /** + * Parse the future string + */ + virtual std::string parseFutureString(const unsigned char *ptr, + uint64_t offset, + uint64_t length) const = 0; + }; + + VersionParser::~VersionParser() { + // PASS + } + + static uint32_t decode32(const unsigned char* ptr) { + return static_cast<uint32_t>(ptr[0] << 24) | + static_cast<uint32_t>(ptr[1] << 16) | + static_cast<uint32_t>(ptr[2] << 8) | + static_cast<uint32_t>(ptr[3]); + } + + class Version1Parser: public VersionParser { + public: + virtual ~Version1Parser() override; + + virtual uint64_t getVersion() const override { + return 1; + } + + /** + * Get the number of bytes + */ + virtual uint64_t getTimeSize() const override { + return 4; + } + + /** + * Parse the time at the given location. 
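+     * A version 1 tzfile stores each transition time as a big-endian
+     * signed 32-bit count of seconds since the Unix epoch, which is why
+     * the value is sign extended from 32 bits below.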
+ */ + virtual int64_t parseTime(const unsigned char* ptr) const override { + // sign extend from 32 bits + return static_cast<int32_t>(decode32(ptr)); + } + + virtual std::string parseFutureString(const unsigned char *, + uint64_t, + uint64_t) const override { + return ""; + } + }; + + Version1Parser::~Version1Parser() { + // PASS + } + + class Version2Parser: public VersionParser { + public: + virtual ~Version2Parser() override; + + virtual uint64_t getVersion() const override { + return 2; + } + + /** + * Get the number of bytes + */ + virtual uint64_t getTimeSize() const override { + return 8; + } + + /** + * Parse the time at the given location. + */ + virtual int64_t parseTime(const unsigned char* ptr) const override { + return static_cast<int64_t>(decode32(ptr)) << 32 | decode32(ptr + 4); + } + + virtual std::string parseFutureString(const unsigned char *ptr, + uint64_t offset, + uint64_t length) const override { + return std::string(reinterpret_cast<const char*>(ptr) + offset + 1, + length - 2); + } + }; + + Version2Parser::~Version2Parser() { + // PASS + } + + class TimezoneImpl: public Timezone { + public: + TimezoneImpl(const std::string& name, + const std::vector<unsigned char> bytes); + virtual ~TimezoneImpl() override; + + /** + * Get the variant for the given time (time_t). + */ + const TimezoneVariant& getVariant(int64_t clk) const override; + + void print(std::ostream&) const override; + + uint64_t getVersion() const override { + return version; + } + + int64_t getEpoch() const override { + return epoch; + } + + int64_t convertToUTC(int64_t clk) const override { + return clk + getVariant(clk).gmtOffset; + } + + private: + void parseTimeVariants(const unsigned char* ptr, + uint64_t variantOffset, + uint64_t variantCount, + uint64_t nameOffset, + uint64_t nameCount); + void parseZoneFile(const unsigned char* ptr, + uint64_t sectionOffset, + uint64_t fileLength, + const VersionParser& version); + // filename + std::string filename; + + // the version of the file + uint64_t version; + + // the list of variants for this timezone + std::vector<TimezoneVariant> variants; + + // the list of the times where the local rules change + std::vector<int64_t> transitions; + + // the variant that starts at this transition. + std::vector<uint64_t> currentVariant; + + // the variant before the first transition + uint64_t ancientVariant; + + // the rule for future times + std::shared_ptr<FutureRule> futureRule; + + // the last explicit transition after which we use the future rule + int64_t lastTransition; + + // The ORC epoch time in this timezone. 
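+    // (the number of seconds from the Unix epoch to 2015-01-01 00:00:00
+    //  local wall-clock time in this zone, computed in the constructor)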
+ int64_t epoch; + }; + + DIAGNOSTIC_PUSH + #ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wglobal-constructors") + DIAGNOSTIC_IGNORE("-Wexit-time-destructors") + #endif + static std::mutex timezone_mutex; + static std::map<std::string, std::shared_ptr<Timezone> > timezoneCache; + DIAGNOSTIC_POP + + Timezone::~Timezone() { + // PASS + } + + TimezoneImpl::TimezoneImpl(const std::string& _filename, + const std::vector<unsigned char> buffer + ): filename(_filename) { + parseZoneFile(&buffer[0], 0, buffer.size(), Version1Parser()); + // Build the literal for the ORC epoch + // 2015 Jan 1 00:00:00 + tm epochStruct; + epochStruct.tm_sec = 0; + epochStruct.tm_min = 0; + epochStruct.tm_hour = 0; + epochStruct.tm_mday = 1; + epochStruct.tm_mon = 0; + epochStruct.tm_year = 2015 - 1900; + epochStruct.tm_isdst = 0; + time_t utcEpoch = timegm(&epochStruct); + epoch = utcEpoch - getVariant(utcEpoch).gmtOffset; + } + + const char* getTimezoneDirectory() { + const char *dir = getenv("TZDIR"); + if (!dir) { + dir = DEFAULT_TZDIR; + } + return dir; + } + + /** + * Get a timezone by absolute filename. + * Results are cached. + */ + const Timezone& getTimezoneByFilename(const std::string& filename) { + // ORC-110 + std::lock_guard<std::mutex> timezone_lock(timezone_mutex); + std::map<std::string, std::shared_ptr<Timezone> >::iterator itr = + timezoneCache.find(filename); + if (itr != timezoneCache.end()) { + return *(itr->second).get(); + } + try { + ORC_UNIQUE_PTR<InputStream> file = readFile(filename); + size_t size = static_cast<size_t>(file->getLength()); + std::vector<unsigned char> buffer(size); + file->read(&buffer[0], size, 0); + timezoneCache[filename] = std::shared_ptr<Timezone>(new TimezoneImpl(filename, buffer)); + } catch(ParseError& err) { + throw TimezoneError(err.what()); + } + return *timezoneCache[filename].get(); + } + + /** + * Get the local timezone. + */ + const Timezone& getLocalTimezone() { +#ifdef _MSC_VER + return getTimezoneByName("UTC"); +#else + return getTimezoneByFilename(LOCAL_TIMEZONE); +#endif + } + + /** + * Get a timezone by name (eg. America/Los_Angeles). + * Results are cached. + */ + const Timezone& getTimezoneByName(const std::string& zone) { + std::string filename(getTimezoneDirectory()); + filename += "/"; + filename += zone; + return getTimezoneByFilename(filename); + } + + /** + * Parse a set of bytes as a timezone file as if they came from filename. + */ + std::unique_ptr<Timezone> getTimezone(const std::string& filename, + const std::vector<unsigned char>& b){ + return std::unique_ptr<Timezone>(new TimezoneImpl(filename, b)); + } + + TimezoneImpl::~TimezoneImpl() { + // PASS + } + + void TimezoneImpl::parseTimeVariants(const unsigned char* ptr, + uint64_t variantOffset, + uint64_t variantCount, + uint64_t nameOffset, + uint64_t nameCount) { + for(uint64_t variant=0; variant < variantCount; ++variant) { + variants[variant].gmtOffset = + static_cast<int32_t>(decode32(ptr + variantOffset + 6 * variant)); + variants[variant].isDst = ptr[variantOffset + 6 * variant + 4] != 0; + uint64_t nameStart = ptr[variantOffset + 6 * variant + 5]; + if (nameStart >= nameCount) { + std::stringstream buffer; + buffer << "name out of range in variant " << variant + << " - " << nameStart << " >= " << nameCount; + throw TimezoneError(buffer.str()); + } + variants[variant].name = std::string(reinterpret_cast<const char*>(ptr) + + nameOffset + nameStart); + } + } + + /** + * Parse the zone file to get the bits we need. 
+ * There are two versions of the timezone file: + * + * Version 1(version = 0x00): + * Magic(version) + * Header + * TransitionTimes(4 byte) + * TransitionRules + * Rules + * LeapSeconds(4 byte) + * IsStd + * IsGmt + * + * Version2: + * Version1(0x32) = a version 1 copy of the data for old clients + * Magic(0x32) + * Header + * TransitionTimes(8 byte) + * TransitionRules + * Rules + * LeapSeconds(8 byte) + * IsStd + * IsGmt + * FutureString + */ + void TimezoneImpl::parseZoneFile(const unsigned char *ptr, + uint64_t sectionOffset, + uint64_t fileLength, + const VersionParser& versionParser) { + const uint64_t magicOffset = sectionOffset + 0; + const uint64_t headerOffset = magicOffset + 20; + + // check for validity before we start parsing + if (fileLength < headerOffset + 6 * 4 || + strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4) + != 0) { + std::stringstream buffer; + buffer << "non-tzfile " << filename; + throw TimezoneError(buffer.str()); + } + + const uint64_t isGmtCount = decode32(ptr + headerOffset + 0); + const uint64_t isStdCount = decode32(ptr + headerOffset + 4); + const uint64_t leapCount = decode32(ptr + headerOffset + 8); + const uint64_t timeCount = decode32(ptr + headerOffset + 12); + const uint64_t variantCount = decode32(ptr + headerOffset + 16); + const uint64_t nameCount = decode32(ptr + headerOffset + 20); + + const uint64_t timeOffset = headerOffset + 24; + const uint64_t timeVariantOffset = + timeOffset + versionParser.getTimeSize() * timeCount; + const uint64_t variantOffset = timeVariantOffset + timeCount; + const uint64_t nameOffset = variantOffset + variantCount * 6; + const uint64_t sectionLength = nameOffset + nameCount + + (versionParser.getTimeSize() + 4) * leapCount + + isGmtCount + isStdCount; + + if (sectionLength > fileLength) { + std::stringstream buffer; + buffer << "tzfile too short " << filename + << " needs " << sectionLength << " and has " << fileLength; + throw TimezoneError(buffer.str()); + } + + // if it is version 2, skip over the old layout and read the new one. 
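+    // A version 2 file begins with a complete version 1 section so that old
+    // readers keep working; the 64-bit section starts right after it (at
+    // sectionLength) and is followed by the newline-enclosed POSIX TZ string
+    // that parseFutureString extracts.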
+ if (sectionOffset == 0 && ptr[magicOffset + 4] != 0) { + parseZoneFile(ptr, sectionLength, fileLength, Version2Parser()); + return; + } + version = versionParser.getVersion(); + variants.resize(variantCount); + transitions.resize(timeCount); + currentVariant.resize(timeCount); + parseTimeVariants(ptr, variantOffset, variantCount, nameOffset, + nameCount); + bool foundAncient = false; + for(uint64_t t=0; t < timeCount; ++t) { + transitions[t] = + versionParser.parseTime(ptr + timeOffset + + t * versionParser.getTimeSize()); + currentVariant[t] = ptr[timeVariantOffset + t]; + if (currentVariant[t] >= variantCount) { + std::stringstream buffer; + buffer << "tzfile rule out of range " << filename + << " references rule " << currentVariant[t] + << " of " << variantCount; + throw TimezoneError(buffer.str()); + } + // find the oldest standard time and use that as the ancient value + if (!foundAncient && + !variants[currentVariant[t]].isDst) { + foundAncient = true; + ancientVariant = currentVariant[t]; + } + } + if (!foundAncient) { + ancientVariant = 0; + } + futureRule = parseFutureRule(versionParser.parseFutureString + (ptr, sectionLength, + fileLength - sectionLength)); + + // find the lower bound for applying the future rule + if (futureRule->isDefined()) { + if (timeCount > 0) { + lastTransition = transitions[timeCount - 1]; + } else { + lastTransition = INT64_MIN; + } + } else { + lastTransition = INT64_MAX; + } + } + + const TimezoneVariant& TimezoneImpl::getVariant(int64_t clk) const { + // if it is after the last explicit entry in the table, + // use the future rule to get an answer + if (clk > lastTransition) { + return futureRule->getVariant(clk); + } else { + int64_t transition = binarySearch(transitions, clk); + uint64_t idx; + if (transition < 0) { + idx = ancientVariant; + } else { + idx = currentVariant[static_cast<size_t>(transition)]; + } + return variants[idx]; + } + } + + void TimezoneImpl::print(std::ostream& out) const { + out << "Timezone file: " << filename << "\n"; + out << " Version: " << version << "\n"; + futureRule->print(out); + for(uint64_t r=0; r < variants.size(); ++r) { + out << " Variant " << r << ": " + << variants[r].toString() << "\n"; + } + for(uint64_t t=0; t < transitions.size(); ++t) { + tm timeStruct; + tm* result = nullptr; + char buffer[25]; + if (sizeof(time_t) >= 8) { + time_t val = transitions[t]; + result = gmtime_r(&val, &timeStruct); + if (result) { + strftime(buffer, sizeof(buffer), "%F %H:%M:%S", &timeStruct); + } + } + std::cout << " Transition: " << (result == nullptr ? "null" : buffer) + << " (" << transitions[t] << ") -> " + << variants[currentVariant[t]].name + << "\n"; + } + } + + TimezoneError::TimezoneError(const std::string& what + ): std::runtime_error(what) { + // PASS + } + + TimezoneError::TimezoneError(const TimezoneError& other + ): std::runtime_error(other) { + // PASS + } + + TimezoneError::~TimezoneError() ORC_NOEXCEPT { + // PASS + } + +} diff --git a/contrib/libs/apache/orc/c++/src/Timezone.hh b/contrib/libs/apache/orc/c++/src/Timezone.hh index 136b7a18b7..6bcb6586d0 100644 --- a/contrib/libs/apache/orc/c++/src/Timezone.hh +++ b/contrib/libs/apache/orc/c++/src/Timezone.hh @@ -1,130 +1,130 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef TIMEZONE_HH -#define TIMEZONE_HH - -// This file is for timezone routines. - -#include "Adaptor.hh" - -#include <memory> -#include <stdexcept> -#include <stdint.h> -#include <string> -#include <vector> - -namespace orc { - - static const int64_t SECONDS_PER_HOUR = 60 * 60; - static const int64_t SECONDS_PER_DAY = SECONDS_PER_HOUR * 24; - - /** - * A variant (eg. PST or PDT) of a timezone (eg. America/Los_Angeles). - */ - struct TimezoneVariant { - int64_t gmtOffset; - bool isDst; - std::string name; - - std::string toString() const; - }; - - /** - * A region that shares the same legal rules for wall clock time and - * day light savings transitions. They are typically named for the largest - * city in the region (eg. America/Los_Angeles or America/Mexico_City). - */ - class Timezone { - public: - virtual ~Timezone(); - - /** - * Get the variant for the given time (time_t). - */ - virtual const TimezoneVariant& getVariant(int64_t clk) const = 0; - - /** - * Get the number of seconds between the ORC epoch in this timezone - * and Unix epoch. - * ORC epoch is 1 Jan 2015 00:00:00 local. - * Unix epoch is 1 Jan 1970 00:00:00 UTC. - */ - virtual int64_t getEpoch() const = 0; - - /** - * Print the timezone to the stream. - */ - virtual void print(std::ostream&) const = 0; - - /** - * Get the version of the zone file. - */ - virtual uint64_t getVersion() const =0; - - /** - * Convert wall clock time of current timezone to UTC timezone - */ - virtual int64_t convertToUTC(int64_t clk) const = 0; - }; - - /** - * Get the local timezone. - * Results are cached. - */ - const Timezone& getLocalTimezone(); - - /** - * Get a timezone by name (eg. America/Los_Angeles). - * Results are cached. - */ - const Timezone& getTimezoneByName(const std::string& zone); - - /** - * Parse a set of bytes as a timezone file as if they came from filename. - */ - std::unique_ptr<Timezone> getTimezone(const std::string& filename, - const std::vector<unsigned char>& b); - - class TimezoneError: public std::runtime_error { - public: - TimezoneError(const std::string& what); - TimezoneError(const TimezoneError&); - virtual ~TimezoneError() ORC_NOEXCEPT; - }; - - /** - * Represents the parsed POSIX timezone rule strings that are used to - * describe the future transitions, because they can go arbitrarily far into - * the future. - */ - class FutureRule { - public: - virtual ~FutureRule(); - virtual bool isDefined() const = 0; - virtual const TimezoneVariant& getVariant(int64_t clk) const = 0; - virtual void print(std::ostream& out) const = 0; - }; - - /** - * Parse the POSIX TZ string. - */ - std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TIMEZONE_HH +#define TIMEZONE_HH + +// This file is for timezone routines. + +#include "Adaptor.hh" + +#include <memory> +#include <stdexcept> +#include <stdint.h> +#include <string> +#include <vector> + +namespace orc { + + static const int64_t SECONDS_PER_HOUR = 60 * 60; + static const int64_t SECONDS_PER_DAY = SECONDS_PER_HOUR * 24; + + /** + * A variant (eg. PST or PDT) of a timezone (eg. America/Los_Angeles). + */ + struct TimezoneVariant { + int64_t gmtOffset; + bool isDst; + std::string name; + + std::string toString() const; + }; + + /** + * A region that shares the same legal rules for wall clock time and + * day light savings transitions. They are typically named for the largest + * city in the region (eg. America/Los_Angeles or America/Mexico_City). + */ + class Timezone { + public: + virtual ~Timezone(); + + /** + * Get the variant for the given time (time_t). + */ + virtual const TimezoneVariant& getVariant(int64_t clk) const = 0; + + /** + * Get the number of seconds between the ORC epoch in this timezone + * and Unix epoch. + * ORC epoch is 1 Jan 2015 00:00:00 local. + * Unix epoch is 1 Jan 1970 00:00:00 UTC. + */ + virtual int64_t getEpoch() const = 0; + + /** + * Print the timezone to the stream. + */ + virtual void print(std::ostream&) const = 0; + + /** + * Get the version of the zone file. + */ + virtual uint64_t getVersion() const =0; + + /** + * Convert wall clock time of current timezone to UTC timezone + */ + virtual int64_t convertToUTC(int64_t clk) const = 0; + }; + + /** + * Get the local timezone. + * Results are cached. + */ + const Timezone& getLocalTimezone(); + + /** + * Get a timezone by name (eg. America/Los_Angeles). + * Results are cached. + */ + const Timezone& getTimezoneByName(const std::string& zone); + + /** + * Parse a set of bytes as a timezone file as if they came from filename. + */ + std::unique_ptr<Timezone> getTimezone(const std::string& filename, + const std::vector<unsigned char>& b); + + class TimezoneError: public std::runtime_error { + public: + TimezoneError(const std::string& what); + TimezoneError(const TimezoneError&); + virtual ~TimezoneError() ORC_NOEXCEPT; + }; + + /** + * Represents the parsed POSIX timezone rule strings that are used to + * describe the future transitions, because they can go arbitrarily far into + * the future. + */ + class FutureRule { + public: + virtual ~FutureRule(); + virtual bool isDefined() const = 0; + virtual const TimezoneVariant& getVariant(int64_t clk) const = 0; + virtual void print(std::ostream& out) const = 0; + }; + + /** + * Parse the POSIX TZ string. 
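+   * For example, "PST8PDT,M3.2.0,M11.1.0" describes US Pacific time:
+   * standard name PST eight hours behind GMT, daylight name PDT one hour
+   * ahead of it, with DST starting the second Sunday of March and ending
+   * the first Sunday of November (at 02:00 local by default).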
+ */ + std::shared_ptr<FutureRule> parseFutureRule(const std::string& ruleString); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.cc b/contrib/libs/apache/orc/c++/src/TypeImpl.cc index c154f2af04..78a0e00686 100644 --- a/contrib/libs/apache/orc/c++/src/TypeImpl.cc +++ b/contrib/libs/apache/orc/c++/src/TypeImpl.cc @@ -1,707 +1,707 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Adaptor.hh" -#include "orc/Exceptions.hh" -#include "TypeImpl.hh" - -#include <iostream> -#include <sstream> - -namespace orc { - - Type::~Type() { - // PASS - } - - TypeImpl::TypeImpl(TypeKind _kind) { - parent = nullptr; - columnId = -1; - maximumColumnId = -1; - kind = _kind; - maxLength = 0; - precision = 0; - scale = 0; - subtypeCount = 0; - } - - TypeImpl::TypeImpl(TypeKind _kind, uint64_t _maxLength) { - parent = nullptr; - columnId = -1; - maximumColumnId = -1; - kind = _kind; - maxLength = _maxLength; - precision = 0; - scale = 0; - subtypeCount = 0; - } - - TypeImpl::TypeImpl(TypeKind _kind, uint64_t _precision, - uint64_t _scale) { - parent = nullptr; - columnId = -1; - maximumColumnId = -1; - kind = _kind; - maxLength = 0; - precision = _precision; - scale = _scale; - subtypeCount = 0; - } - - uint64_t TypeImpl::assignIds(uint64_t root) const { - columnId = static_cast<int64_t>(root); - uint64_t current = root + 1; - for(uint64_t i=0; i < subtypeCount; ++i) { - current = dynamic_cast<TypeImpl*>(subTypes[i])->assignIds(current); - } - maximumColumnId = static_cast<int64_t>(current) - 1; - return current; - } - - TypeImpl::~TypeImpl() { - for (std::vector<Type*>::iterator it = subTypes.begin(); - it != subTypes.end(); it++) { - delete (*it) ; - } - } - - void TypeImpl::ensureIdAssigned() const { - if (columnId == -1) { - const TypeImpl* root = this; - while (root->parent != nullptr) { - root = root->parent; - } - root->assignIds(0); - } - } - - uint64_t TypeImpl::getColumnId() const { - ensureIdAssigned(); - return static_cast<uint64_t>(columnId); - } - - uint64_t TypeImpl::getMaximumColumnId() const { - ensureIdAssigned(); - return static_cast<uint64_t>(maximumColumnId); - } - - TypeKind TypeImpl::getKind() const { - return kind; - } - - uint64_t TypeImpl::getSubtypeCount() const { - return subtypeCount; - } - - const Type* TypeImpl::getSubtype(uint64_t i) const { - return subTypes[i]; - } - - const std::string& TypeImpl::getFieldName(uint64_t i) const { - return fieldNames[i]; - } - - uint64_t TypeImpl::getMaximumLength() const { - return maxLength; - } - - uint64_t TypeImpl::getPrecision() const { - return precision; - } - - uint64_t TypeImpl::getScale() const { - return scale; - } - - void TypeImpl::setIds(uint64_t _columnId, uint64_t _maxColumnId) { - columnId = static_cast<int64_t>(_columnId); - maximumColumnId 
= static_cast<int64_t>(_maxColumnId); - } - - void TypeImpl::addChildType(std::unique_ptr<Type> childType) { - TypeImpl* child = dynamic_cast<TypeImpl*>(childType.release()); - subTypes.push_back(child); - if (child != nullptr) { - child->parent = this; - } - subtypeCount += 1; - } - - Type* TypeImpl::addStructField(const std::string& fieldName, - std::unique_ptr<Type> fieldType) { - addChildType(std::move(fieldType)); - fieldNames.push_back(fieldName); - return this; - } - - Type* TypeImpl::addUnionChild(std::unique_ptr<Type> fieldType) { - addChildType(std::move(fieldType)); - return this; - } - - std::string TypeImpl::toString() const { - switch (static_cast<int64_t>(kind)) { - case BOOLEAN: - return "boolean"; - case BYTE: - return "tinyint"; - case SHORT: - return "smallint"; - case INT: - return "int"; - case LONG: - return "bigint"; - case FLOAT: - return "float"; - case DOUBLE: - return "double"; - case STRING: - return "string"; - case BINARY: - return "binary"; - case TIMESTAMP: - return "timestamp"; - case LIST: - return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">"; - case MAP: - return "map<" + (subTypes[0] ? subTypes[0]->toString() : "void") + "," + - (subTypes[1] ? subTypes[1]->toString() : "void") + ">"; - case STRUCT: { - std::string result = "struct<"; - for(size_t i=0; i < subTypes.size(); ++i) { - if (i != 0) { - result += ","; - } - result += fieldNames[i]; - result += ":"; - result += subTypes[i]->toString(); - } - result += ">"; - return result; - } - case UNION: { - std::string result = "uniontype<"; - for(size_t i=0; i < subTypes.size(); ++i) { - if (i != 0) { - result += ","; - } - result += subTypes[i]->toString(); - } - result += ">"; - return result; - } - case DECIMAL: { - std::stringstream result; - result << "decimal(" << precision << "," << scale << ")"; - return result.str(); - } - case DATE: - return "date"; - case VARCHAR: { - std::stringstream result; - result << "varchar(" << maxLength << ")"; - return result.str(); - } - case CHAR: { - std::stringstream result; - result << "char(" << maxLength << ")"; - return result.str(); - } - default: - throw NotImplementedYet("Unknown type"); - } - } - - std::unique_ptr<ColumnVectorBatch> - TypeImpl::createRowBatch(uint64_t capacity, - MemoryPool& memoryPool, - bool encoded) const { - switch (static_cast<int64_t>(kind)) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case LONG: - case DATE: - return std::unique_ptr<ColumnVectorBatch> - (new LongVectorBatch(capacity, memoryPool)); - - case FLOAT: - case DOUBLE: - return std::unique_ptr<ColumnVectorBatch> - (new DoubleVectorBatch(capacity, memoryPool)); - - case STRING: - case BINARY: - case CHAR: - case VARCHAR: - return encoded ? 
- std::unique_ptr<ColumnVectorBatch> - (new EncodedStringVectorBatch(capacity, memoryPool)) - : std::unique_ptr<ColumnVectorBatch> - (new StringVectorBatch(capacity, memoryPool)); - - case TIMESTAMP: - return std::unique_ptr<ColumnVectorBatch> - (new TimestampVectorBatch(capacity, memoryPool)); - - case STRUCT: { - StructVectorBatch *result = new StructVectorBatch(capacity, memoryPool); - std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); - for(uint64_t i=0; i < getSubtypeCount(); ++i) { - result->fields.push_back(getSubtype(i)-> - createRowBatch(capacity, - memoryPool, encoded).release()); - } - return return_value; - } - - case LIST: { - ListVectorBatch* result = new ListVectorBatch(capacity, memoryPool); - std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); - if (getSubtype(0) != nullptr) { - result->elements = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded); - } - return return_value; - } - - case MAP: { - MapVectorBatch* result = new MapVectorBatch(capacity, memoryPool); - std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); - if (getSubtype(0) != nullptr) { - result->keys = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded); - } - if (getSubtype(1) != nullptr) { - result->elements = getSubtype(1)->createRowBatch(capacity, memoryPool, encoded); - } - return return_value; - } - - case DECIMAL: { - if (getPrecision() == 0 || getPrecision() > 18) { - return std::unique_ptr<ColumnVectorBatch> - (new Decimal128VectorBatch(capacity, memoryPool)); - } else { - return std::unique_ptr<ColumnVectorBatch> - (new Decimal64VectorBatch(capacity, memoryPool)); - } - } - - case UNION: { - UnionVectorBatch *result = new UnionVectorBatch(capacity, memoryPool); - std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); - for(uint64_t i=0; i < getSubtypeCount(); ++i) { - result->children.push_back(getSubtype(i)->createRowBatch(capacity, - memoryPool, encoded) - .release()); - } - return return_value; - } - - default: - throw NotImplementedYet("not supported yet"); - } - } - - std::unique_ptr<Type> createPrimitiveType(TypeKind kind) { - return std::unique_ptr<Type>(new TypeImpl(kind)); - } - - std::unique_ptr<Type> createCharType(TypeKind kind, - uint64_t maxLength) { - return std::unique_ptr<Type>(new TypeImpl(kind, maxLength)); - } - - std::unique_ptr<Type> createDecimalType(uint64_t precision, - uint64_t scale) { - return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale)); - } - - std::unique_ptr<Type> createStructType() { - return std::unique_ptr<Type>(new TypeImpl(STRUCT)); - } - - std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements) { - TypeImpl* result = new TypeImpl(LIST); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); - result->addChildType(std::move(elements)); - return return_value; - } - - std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key, - std::unique_ptr<Type> value) { - TypeImpl* result = new TypeImpl(MAP); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); - result->addChildType(std::move(key)); - result->addChildType(std::move(value)); - return return_value; - } - - std::unique_ptr<Type> createUnionType() { - return std::unique_ptr<Type>(new TypeImpl(UNION)); - } - - std::string printProtobufMessage(const google::protobuf::Message& message); - std::unique_ptr<Type> convertType(const proto::Type& type, - const proto::Footer& 
footer) { - switch (static_cast<int64_t>(type.kind())) { - - case proto::Type_Kind_BOOLEAN: - case proto::Type_Kind_BYTE: - case proto::Type_Kind_SHORT: - case proto::Type_Kind_INT: - case proto::Type_Kind_LONG: - case proto::Type_Kind_FLOAT: - case proto::Type_Kind_DOUBLE: - case proto::Type_Kind_STRING: - case proto::Type_Kind_BINARY: - case proto::Type_Kind_TIMESTAMP: - case proto::Type_Kind_DATE: - return std::unique_ptr<Type> - (new TypeImpl(static_cast<TypeKind>(type.kind()))); - - case proto::Type_Kind_CHAR: - case proto::Type_Kind_VARCHAR: - return std::unique_ptr<Type> - (new TypeImpl(static_cast<TypeKind>(type.kind()), - type.maximumlength())); - - case proto::Type_Kind_DECIMAL: - return std::unique_ptr<Type> - (new TypeImpl(DECIMAL, type.precision(), type.scale())); - - case proto::Type_Kind_LIST: - case proto::Type_Kind_MAP: - case proto::Type_Kind_UNION: { - TypeImpl* result = new TypeImpl(static_cast<TypeKind>(type.kind())); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); - if (type.kind() == proto::Type_Kind_LIST && type.subtypes_size() != 1) - throw ParseError("Illegal LIST type that doesn't contain one subtype"); - if (type.kind() == proto::Type_Kind_MAP && type.subtypes_size() != 2) - throw ParseError("Illegal MAP type that doesn't contain two subtypes"); - if (type.kind() == proto::Type_Kind_UNION && type.subtypes_size() == 0) - throw ParseError("Illegal UNION type that doesn't contain any subtypes"); - for(int i=0; i < type.subtypes_size(); ++i) { - result->addUnionChild(convertType(footer.types(static_cast<int> - (type.subtypes(i))), - footer)); - } - return return_value; - } - - case proto::Type_Kind_STRUCT: { - TypeImpl* result = new TypeImpl(STRUCT); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); - for(int i=0; i < type.subtypes_size(); ++i) { - result->addStructField(type.fieldnames(i), - convertType(footer.types(static_cast<int> - (type.subtypes(i))), - footer)); - } - return return_value; - } - default: - throw NotImplementedYet("Unknown type kind"); - } - } - - /** - * Build a clone of the file type, projecting columns from the selected - * vector. This routine assumes that the parent of any selected column - * is also selected. The column ids are copied from the fileType. 
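-   * For example, selecting columns {0, 2} of struct<a:int,b:string>
-   * (column ids 0, 1, 2) produces struct<b:string>, and the kept child
-   * keeps its original column id 2.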
- * @param fileType the type in the file - * @param selected is each column by id selected - * @return a clone of the fileType filtered by the selection array - */ - std::unique_ptr<Type> buildSelectedType(const Type *fileType, - const std::vector<bool>& selected) { - if (fileType == nullptr || !selected[fileType->getColumnId()]) { - return std::unique_ptr<Type>(); - } - - TypeImpl* result; - switch (static_cast<int>(fileType->getKind())) { - case BOOLEAN: - case BYTE: - case SHORT: - case INT: - case LONG: - case FLOAT: - case DOUBLE: - case STRING: - case BINARY: - case TIMESTAMP: - case DATE: - result = new TypeImpl(fileType->getKind()); - break; - - case DECIMAL: - result= new TypeImpl(fileType->getKind(), - fileType->getPrecision(), fileType->getScale()); - break; - - case VARCHAR: - case CHAR: - result = new TypeImpl(fileType->getKind(), fileType->getMaximumLength()); - break; - - case LIST: - result = new TypeImpl(fileType->getKind()); - result->addChildType(buildSelectedType(fileType->getSubtype(0), - selected)); - break; - - case MAP: - result = new TypeImpl(fileType->getKind()); - result->addChildType(buildSelectedType(fileType->getSubtype(0), - selected)); - result->addChildType(buildSelectedType(fileType->getSubtype(1), - selected)); - break; - - case STRUCT: { - result = new TypeImpl(fileType->getKind()); - for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) { - std::unique_ptr<Type> childType = - buildSelectedType(fileType->getSubtype(child), selected); - if (childType.get() != nullptr) { - result->addStructField(fileType->getFieldName(child), - std::move(childType)); - } - } - break; - } - - case UNION: { - result = new TypeImpl(fileType->getKind()); - for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) { - std::unique_ptr<Type> childType = - buildSelectedType(fileType->getSubtype(child), selected); - if (childType.get() != nullptr) { - result->addUnionChild(std::move(childType)); - } - } - break; - } - - default: - throw NotImplementedYet("Unknown type kind"); - } - result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId()); - return std::unique_ptr<Type>(result); - } - - ORC_UNIQUE_PTR<Type> Type::buildTypeFromString(const std::string& input) { - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > res = - TypeImpl::parseType(input, 0, input.size()); - if (res.size() != 1) { - throw std::logic_error("Invalid type string."); - } - return std::move(res[0].second); - } - - std::unique_ptr<Type> TypeImpl::parseArrayType(const std::string &input, - size_t start, - size_t end) { - TypeImpl* arrayType = new TypeImpl(LIST); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(arrayType); - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v = - TypeImpl::parseType(input, start, end); - if (v.size() != 1) { - throw std::logic_error("Array type must contain exactly one sub type."); - } - arrayType->addChildType(std::move(v[0].second)); - return return_value; - } - - std::unique_ptr<Type> TypeImpl::parseMapType(const std::string &input, - size_t start, - size_t end) { - TypeImpl * mapType = new TypeImpl(MAP); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(mapType); - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v = - TypeImpl::parseType(input, start, end); - if (v.size() != 2) { - throw std::logic_error( - "Map type must contain exactly two sub types."); - } - mapType->addChildType(std::move(v[0].second)); - mapType->addChildType(std::move(v[1].second)); - return return_value; - } - - 
std::unique_ptr<Type> TypeImpl::parseStructType(const std::string &input, - size_t start, - size_t end) { - TypeImpl* structType = new TypeImpl(STRUCT); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(structType); - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type>> > v = - TypeImpl::parseType(input, start, end); - if (v.size() == 0) { - throw std::logic_error( - "Struct type must contain at least one sub type."); - } - for (size_t i = 0; i < v.size(); ++i) { - structType->addStructField(v[i].first, std::move(v[i].second)); - } - return return_value; - } - - std::unique_ptr<Type> TypeImpl::parseUnionType(const std::string &input, - size_t start, - size_t end) { - TypeImpl* unionType = new TypeImpl(UNION); - std::unique_ptr<Type> return_value = std::unique_ptr<Type>(unionType); - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v = - TypeImpl::parseType(input, start, end); - if (v.size() == 0) { - throw std::logic_error("Union type must contain at least one sub type."); - } - for (size_t i = 0; i < v.size(); ++i) { - unionType->addChildType(std::move(v[i].second)); - } - return return_value; - } - - std::unique_ptr<Type> TypeImpl::parseDecimalType(const std::string &input, - size_t start, - size_t end) { - size_t sep = input.find(',', start); - if (sep + 1 >= end || sep == std::string::npos) { - throw std::logic_error("Decimal type must specify precision and scale."); - } - uint64_t precision = - static_cast<uint64_t>(atoi(input.substr(start, sep - start).c_str())); - uint64_t scale = - static_cast<uint64_t>(atoi(input.substr(sep + 1, end - sep - 1).c_str())); - return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale)); - } - - std::unique_ptr<Type> TypeImpl::parseCategory(std::string category, - const std::string &input, - size_t start, - size_t end) { - if (category == "boolean") { - return std::unique_ptr<Type>(new TypeImpl(BOOLEAN)); - } else if (category == "tinyint") { - return std::unique_ptr<Type>(new TypeImpl(BYTE)); - } else if (category == "smallint") { - return std::unique_ptr<Type>(new TypeImpl(SHORT)); - } else if (category == "int") { - return std::unique_ptr<Type>(new TypeImpl(INT)); - } else if (category == "bigint") { - return std::unique_ptr<Type>(new TypeImpl(LONG)); - } else if (category == "float") { - return std::unique_ptr<Type>(new TypeImpl(FLOAT)); - } else if (category == "double") { - return std::unique_ptr<Type>(new TypeImpl(DOUBLE)); - } else if (category == "string") { - return std::unique_ptr<Type>(new TypeImpl(STRING)); - } else if (category == "binary") { - return std::unique_ptr<Type>(new TypeImpl(BINARY)); - } else if (category == "timestamp") { - return std::unique_ptr<Type>(new TypeImpl(TIMESTAMP)); - } else if (category == "array") { - return parseArrayType(input, start, end); - } else if (category == "map") { - return parseMapType(input, start, end); - } else if (category == "struct") { - return parseStructType(input, start, end); - } else if (category == "uniontype") { - return parseUnionType(input, start, end); - } else if (category == "decimal") { - return parseDecimalType(input, start, end); - } else if (category == "date") { - return std::unique_ptr<Type>(new TypeImpl(DATE)); - } else if (category == "varchar") { - uint64_t maxLength = static_cast<uint64_t>( - atoi(input.substr(start, end - start).c_str())); - return std::unique_ptr<Type>(new TypeImpl(VARCHAR, maxLength)); - } else if (category == "char") { - uint64_t maxLength = static_cast<uint64_t>( - atoi(input.substr(start, end - start).c_str())); - 
return std::unique_ptr<Type>(new TypeImpl(CHAR, maxLength)); - } else { - throw std::logic_error("Unknown type " + category); - } - } - - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > TypeImpl::parseType( - const std::string &input, - size_t start, - size_t end) { - std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > res; - size_t pos = start; - - while (pos < end) { - size_t endPos = pos; - while (endPos < end && (isalnum(input[endPos]) || input[endPos] == '_')) { - ++endPos; - } - - std::string fieldName; - if (input[endPos] == ':') { - fieldName = input.substr(pos, endPos - pos); - pos = ++endPos; - while (endPos < end && isalpha(input[endPos])) { - ++endPos; - } - } - - size_t nextPos = endPos + 1; - if (input[endPos] == '<') { - int count = 1; - while (nextPos < end) { - if (input[nextPos] == '<') { - ++count; - } else if (input[nextPos] == '>') { - --count; - } - if (count == 0) { - break; - } - ++nextPos; - } - if (nextPos == end) { - throw std::logic_error("Invalid type string. Cannot find closing >"); - } - } else if (input[endPos] == '(') { - while (nextPos < end && input[nextPos] != ')') { - ++nextPos; - } - if (nextPos == end) { - throw std::logic_error("Invalid type string. Cannot find closing )"); - } - } else if (input[endPos] != ',' && endPos != end) { - throw std::logic_error("Unrecognized character."); - } - - std::string category = input.substr(pos, endPos - pos); - res.push_back(std::make_pair(fieldName, parseCategory(category, input, endPos + 1, nextPos))); - - if (nextPos < end && (input[nextPos] == ')' || input[nextPos] == '>')) { - pos = nextPos + 2; - } else { - pos = nextPos; - } - } - - return res; - } - -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Adaptor.hh" +#include "orc/Exceptions.hh" +#include "TypeImpl.hh" + +#include <iostream> +#include <sstream> + +namespace orc { + + Type::~Type() { + // PASS + } + + TypeImpl::TypeImpl(TypeKind _kind) { + parent = nullptr; + columnId = -1; + maximumColumnId = -1; + kind = _kind; + maxLength = 0; + precision = 0; + scale = 0; + subtypeCount = 0; + } + + TypeImpl::TypeImpl(TypeKind _kind, uint64_t _maxLength) { + parent = nullptr; + columnId = -1; + maximumColumnId = -1; + kind = _kind; + maxLength = _maxLength; + precision = 0; + scale = 0; + subtypeCount = 0; + } + + TypeImpl::TypeImpl(TypeKind _kind, uint64_t _precision, + uint64_t _scale) { + parent = nullptr; + columnId = -1; + maximumColumnId = -1; + kind = _kind; + maxLength = 0; + precision = _precision; + scale = _scale; + subtypeCount = 0; + } + + uint64_t TypeImpl::assignIds(uint64_t root) const { + columnId = static_cast<int64_t>(root); + uint64_t current = root + 1; + for(uint64_t i=0; i < subtypeCount; ++i) { + current = dynamic_cast<TypeImpl*>(subTypes[i])->assignIds(current); + } + maximumColumnId = static_cast<int64_t>(current) - 1; + return current; + } + + TypeImpl::~TypeImpl() { + for (std::vector<Type*>::iterator it = subTypes.begin(); + it != subTypes.end(); it++) { + delete (*it) ; + } + } + + void TypeImpl::ensureIdAssigned() const { + if (columnId == -1) { + const TypeImpl* root = this; + while (root->parent != nullptr) { + root = root->parent; + } + root->assignIds(0); + } + } + + uint64_t TypeImpl::getColumnId() const { + ensureIdAssigned(); + return static_cast<uint64_t>(columnId); + } + + uint64_t TypeImpl::getMaximumColumnId() const { + ensureIdAssigned(); + return static_cast<uint64_t>(maximumColumnId); + } + + TypeKind TypeImpl::getKind() const { + return kind; + } + + uint64_t TypeImpl::getSubtypeCount() const { + return subtypeCount; + } + + const Type* TypeImpl::getSubtype(uint64_t i) const { + return subTypes[i]; + } + + const std::string& TypeImpl::getFieldName(uint64_t i) const { + return fieldNames[i]; + } + + uint64_t TypeImpl::getMaximumLength() const { + return maxLength; + } + + uint64_t TypeImpl::getPrecision() const { + return precision; + } + + uint64_t TypeImpl::getScale() const { + return scale; + } + + void TypeImpl::setIds(uint64_t _columnId, uint64_t _maxColumnId) { + columnId = static_cast<int64_t>(_columnId); + maximumColumnId = static_cast<int64_t>(_maxColumnId); + } + + void TypeImpl::addChildType(std::unique_ptr<Type> childType) { + TypeImpl* child = dynamic_cast<TypeImpl*>(childType.release()); + subTypes.push_back(child); + if (child != nullptr) { + child->parent = this; + } + subtypeCount += 1; + } + + Type* TypeImpl::addStructField(const std::string& fieldName, + std::unique_ptr<Type> fieldType) { + addChildType(std::move(fieldType)); + fieldNames.push_back(fieldName); + return this; + } + + Type* TypeImpl::addUnionChild(std::unique_ptr<Type> fieldType) { + addChildType(std::move(fieldType)); + return this; + } + + std::string TypeImpl::toString() const { + switch (static_cast<int64_t>(kind)) { + case BOOLEAN: + return "boolean"; + case BYTE: + return "tinyint"; + case SHORT: + return "smallint"; + case INT: + return "int"; + case LONG: + return "bigint"; + case FLOAT: + return "float"; + case DOUBLE: + return "double"; + case STRING: + return "string"; + case BINARY: + return "binary"; + case TIMESTAMP: + return "timestamp"; + case LIST: + return "array<" + (subTypes[0] ? subTypes[0]->toString() : "void") + ">"; + case MAP: + return "map<" + (subTypes[0] ? 
subTypes[0]->toString() : "void") + "," + + (subTypes[1] ? subTypes[1]->toString() : "void") + ">"; + case STRUCT: { + std::string result = "struct<"; + for(size_t i=0; i < subTypes.size(); ++i) { + if (i != 0) { + result += ","; + } + result += fieldNames[i]; + result += ":"; + result += subTypes[i]->toString(); + } + result += ">"; + return result; + } + case UNION: { + std::string result = "uniontype<"; + for(size_t i=0; i < subTypes.size(); ++i) { + if (i != 0) { + result += ","; + } + result += subTypes[i]->toString(); + } + result += ">"; + return result; + } + case DECIMAL: { + std::stringstream result; + result << "decimal(" << precision << "," << scale << ")"; + return result.str(); + } + case DATE: + return "date"; + case VARCHAR: { + std::stringstream result; + result << "varchar(" << maxLength << ")"; + return result.str(); + } + case CHAR: { + std::stringstream result; + result << "char(" << maxLength << ")"; + return result.str(); + } + default: + throw NotImplementedYet("Unknown type"); + } + } + + std::unique_ptr<ColumnVectorBatch> + TypeImpl::createRowBatch(uint64_t capacity, + MemoryPool& memoryPool, + bool encoded) const { + switch (static_cast<int64_t>(kind)) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + case DATE: + return std::unique_ptr<ColumnVectorBatch> + (new LongVectorBatch(capacity, memoryPool)); + + case FLOAT: + case DOUBLE: + return std::unique_ptr<ColumnVectorBatch> + (new DoubleVectorBatch(capacity, memoryPool)); + + case STRING: + case BINARY: + case CHAR: + case VARCHAR: + return encoded ? + std::unique_ptr<ColumnVectorBatch> + (new EncodedStringVectorBatch(capacity, memoryPool)) + : std::unique_ptr<ColumnVectorBatch> + (new StringVectorBatch(capacity, memoryPool)); + + case TIMESTAMP: + return std::unique_ptr<ColumnVectorBatch> + (new TimestampVectorBatch(capacity, memoryPool)); + + case STRUCT: { + StructVectorBatch *result = new StructVectorBatch(capacity, memoryPool); + std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); + for(uint64_t i=0; i < getSubtypeCount(); ++i) { + result->fields.push_back(getSubtype(i)-> + createRowBatch(capacity, + memoryPool, encoded).release()); + } + return return_value; + } + + case LIST: { + ListVectorBatch* result = new ListVectorBatch(capacity, memoryPool); + std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); + if (getSubtype(0) != nullptr) { + result->elements = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded); + } + return return_value; + } + + case MAP: { + MapVectorBatch* result = new MapVectorBatch(capacity, memoryPool); + std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); + if (getSubtype(0) != nullptr) { + result->keys = getSubtype(0)->createRowBatch(capacity, memoryPool, encoded); + } + if (getSubtype(1) != nullptr) { + result->elements = getSubtype(1)->createRowBatch(capacity, memoryPool, encoded); + } + return return_value; + } + + case DECIMAL: { + if (getPrecision() == 0 || getPrecision() > 18) { + return std::unique_ptr<ColumnVectorBatch> + (new Decimal128VectorBatch(capacity, memoryPool)); + } else { + return std::unique_ptr<ColumnVectorBatch> + (new Decimal64VectorBatch(capacity, memoryPool)); + } + } + + case UNION: { + UnionVectorBatch *result = new UnionVectorBatch(capacity, memoryPool); + std::unique_ptr<ColumnVectorBatch> return_value = std::unique_ptr<ColumnVectorBatch>(result); + for(uint64_t i=0; i < getSubtypeCount(); ++i) { + 
result->children.push_back(getSubtype(i)->createRowBatch(capacity, + memoryPool, encoded) + .release()); + } + return return_value; + } + + default: + throw NotImplementedYet("not supported yet"); + } + } + + std::unique_ptr<Type> createPrimitiveType(TypeKind kind) { + return std::unique_ptr<Type>(new TypeImpl(kind)); + } + + std::unique_ptr<Type> createCharType(TypeKind kind, + uint64_t maxLength) { + return std::unique_ptr<Type>(new TypeImpl(kind, maxLength)); + } + + std::unique_ptr<Type> createDecimalType(uint64_t precision, + uint64_t scale) { + return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale)); + } + + std::unique_ptr<Type> createStructType() { + return std::unique_ptr<Type>(new TypeImpl(STRUCT)); + } + + std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements) { + TypeImpl* result = new TypeImpl(LIST); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); + result->addChildType(std::move(elements)); + return return_value; + } + + std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key, + std::unique_ptr<Type> value) { + TypeImpl* result = new TypeImpl(MAP); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); + result->addChildType(std::move(key)); + result->addChildType(std::move(value)); + return return_value; + } + + std::unique_ptr<Type> createUnionType() { + return std::unique_ptr<Type>(new TypeImpl(UNION)); + } + + std::string printProtobufMessage(const google::protobuf::Message& message); + std::unique_ptr<Type> convertType(const proto::Type& type, + const proto::Footer& footer) { + switch (static_cast<int64_t>(type.kind())) { + + case proto::Type_Kind_BOOLEAN: + case proto::Type_Kind_BYTE: + case proto::Type_Kind_SHORT: + case proto::Type_Kind_INT: + case proto::Type_Kind_LONG: + case proto::Type_Kind_FLOAT: + case proto::Type_Kind_DOUBLE: + case proto::Type_Kind_STRING: + case proto::Type_Kind_BINARY: + case proto::Type_Kind_TIMESTAMP: + case proto::Type_Kind_DATE: + return std::unique_ptr<Type> + (new TypeImpl(static_cast<TypeKind>(type.kind()))); + + case proto::Type_Kind_CHAR: + case proto::Type_Kind_VARCHAR: + return std::unique_ptr<Type> + (new TypeImpl(static_cast<TypeKind>(type.kind()), + type.maximumlength())); + + case proto::Type_Kind_DECIMAL: + return std::unique_ptr<Type> + (new TypeImpl(DECIMAL, type.precision(), type.scale())); + + case proto::Type_Kind_LIST: + case proto::Type_Kind_MAP: + case proto::Type_Kind_UNION: { + TypeImpl* result = new TypeImpl(static_cast<TypeKind>(type.kind())); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); + if (type.kind() == proto::Type_Kind_LIST && type.subtypes_size() != 1) + throw ParseError("Illegal LIST type that doesn't contain one subtype"); + if (type.kind() == proto::Type_Kind_MAP && type.subtypes_size() != 2) + throw ParseError("Illegal MAP type that doesn't contain two subtypes"); + if (type.kind() == proto::Type_Kind_UNION && type.subtypes_size() == 0) + throw ParseError("Illegal UNION type that doesn't contain any subtypes"); + for(int i=0; i < type.subtypes_size(); ++i) { + result->addUnionChild(convertType(footer.types(static_cast<int> + (type.subtypes(i))), + footer)); + } + return return_value; + } + + case proto::Type_Kind_STRUCT: { + TypeImpl* result = new TypeImpl(STRUCT); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(result); + for(int i=0; i < type.subtypes_size(); ++i) { + result->addStructField(type.fieldnames(i), + convertType(footer.types(static_cast<int> + (type.subtypes(i))), + footer)); + } + 
return return_value; + } + default: + throw NotImplementedYet("Unknown type kind"); + } + } + + /** + * Build a clone of the file type, projecting columns from the selected + * vector. This routine assumes that the parent of any selected column + * is also selected. The column ids are copied from the fileType. + * @param fileType the type in the file + * @param selected is each column by id selected + * @return a clone of the fileType filtered by the selection array + */ + std::unique_ptr<Type> buildSelectedType(const Type *fileType, + const std::vector<bool>& selected) { + if (fileType == nullptr || !selected[fileType->getColumnId()]) { + return std::unique_ptr<Type>(); + } + + TypeImpl* result; + switch (static_cast<int>(fileType->getKind())) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + case FLOAT: + case DOUBLE: + case STRING: + case BINARY: + case TIMESTAMP: + case DATE: + result = new TypeImpl(fileType->getKind()); + break; + + case DECIMAL: + result= new TypeImpl(fileType->getKind(), + fileType->getPrecision(), fileType->getScale()); + break; + + case VARCHAR: + case CHAR: + result = new TypeImpl(fileType->getKind(), fileType->getMaximumLength()); + break; + + case LIST: + result = new TypeImpl(fileType->getKind()); + result->addChildType(buildSelectedType(fileType->getSubtype(0), + selected)); + break; + + case MAP: + result = new TypeImpl(fileType->getKind()); + result->addChildType(buildSelectedType(fileType->getSubtype(0), + selected)); + result->addChildType(buildSelectedType(fileType->getSubtype(1), + selected)); + break; + + case STRUCT: { + result = new TypeImpl(fileType->getKind()); + for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) { + std::unique_ptr<Type> childType = + buildSelectedType(fileType->getSubtype(child), selected); + if (childType.get() != nullptr) { + result->addStructField(fileType->getFieldName(child), + std::move(childType)); + } + } + break; + } + + case UNION: { + result = new TypeImpl(fileType->getKind()); + for(uint64_t child=0; child < fileType->getSubtypeCount(); ++child) { + std::unique_ptr<Type> childType = + buildSelectedType(fileType->getSubtype(child), selected); + if (childType.get() != nullptr) { + result->addUnionChild(std::move(childType)); + } + } + break; + } + + default: + throw NotImplementedYet("Unknown type kind"); + } + result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId()); + return std::unique_ptr<Type>(result); + } + + ORC_UNIQUE_PTR<Type> Type::buildTypeFromString(const std::string& input) { + std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > res = + TypeImpl::parseType(input, 0, input.size()); + if (res.size() != 1) { + throw std::logic_error("Invalid type string."); + } + return std::move(res[0].second); + } + + std::unique_ptr<Type> TypeImpl::parseArrayType(const std::string &input, + size_t start, + size_t end) { + TypeImpl* arrayType = new TypeImpl(LIST); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(arrayType); + std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v = + TypeImpl::parseType(input, start, end); + if (v.size() != 1) { + throw std::logic_error("Array type must contain exactly one sub type."); + } + arrayType->addChildType(std::move(v[0].second)); + return return_value; + } + + std::unique_ptr<Type> TypeImpl::parseMapType(const std::string &input, + size_t start, + size_t end) { + TypeImpl * mapType = new TypeImpl(MAP); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(mapType); + 
std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v = + TypeImpl::parseType(input, start, end); + if (v.size() != 2) { + throw std::logic_error( + "Map type must contain exactly two sub types."); + } + mapType->addChildType(std::move(v[0].second)); + mapType->addChildType(std::move(v[1].second)); + return return_value; + } + + std::unique_ptr<Type> TypeImpl::parseStructType(const std::string &input, + size_t start, + size_t end) { + TypeImpl* structType = new TypeImpl(STRUCT); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(structType); + std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type>> > v = + TypeImpl::parseType(input, start, end); + if (v.size() == 0) { + throw std::logic_error( + "Struct type must contain at least one sub type."); + } + for (size_t i = 0; i < v.size(); ++i) { + structType->addStructField(v[i].first, std::move(v[i].second)); + } + return return_value; + } + + std::unique_ptr<Type> TypeImpl::parseUnionType(const std::string &input, + size_t start, + size_t end) { + TypeImpl* unionType = new TypeImpl(UNION); + std::unique_ptr<Type> return_value = std::unique_ptr<Type>(unionType); + std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > v = + TypeImpl::parseType(input, start, end); + if (v.size() == 0) { + throw std::logic_error("Union type must contain at least one sub type."); + } + for (size_t i = 0; i < v.size(); ++i) { + unionType->addChildType(std::move(v[i].second)); + } + return return_value; + } + + std::unique_ptr<Type> TypeImpl::parseDecimalType(const std::string &input, + size_t start, + size_t end) { + size_t sep = input.find(',', start); + if (sep + 1 >= end || sep == std::string::npos) { + throw std::logic_error("Decimal type must specify precision and scale."); + } + uint64_t precision = + static_cast<uint64_t>(atoi(input.substr(start, sep - start).c_str())); + uint64_t scale = + static_cast<uint64_t>(atoi(input.substr(sep + 1, end - sep - 1).c_str())); + return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale)); + } + + std::unique_ptr<Type> TypeImpl::parseCategory(std::string category, + const std::string &input, + size_t start, + size_t end) { + if (category == "boolean") { + return std::unique_ptr<Type>(new TypeImpl(BOOLEAN)); + } else if (category == "tinyint") { + return std::unique_ptr<Type>(new TypeImpl(BYTE)); + } else if (category == "smallint") { + return std::unique_ptr<Type>(new TypeImpl(SHORT)); + } else if (category == "int") { + return std::unique_ptr<Type>(new TypeImpl(INT)); + } else if (category == "bigint") { + return std::unique_ptr<Type>(new TypeImpl(LONG)); + } else if (category == "float") { + return std::unique_ptr<Type>(new TypeImpl(FLOAT)); + } else if (category == "double") { + return std::unique_ptr<Type>(new TypeImpl(DOUBLE)); + } else if (category == "string") { + return std::unique_ptr<Type>(new TypeImpl(STRING)); + } else if (category == "binary") { + return std::unique_ptr<Type>(new TypeImpl(BINARY)); + } else if (category == "timestamp") { + return std::unique_ptr<Type>(new TypeImpl(TIMESTAMP)); + } else if (category == "array") { + return parseArrayType(input, start, end); + } else if (category == "map") { + return parseMapType(input, start, end); + } else if (category == "struct") { + return parseStructType(input, start, end); + } else if (category == "uniontype") { + return parseUnionType(input, start, end); + } else if (category == "decimal") { + return parseDecimalType(input, start, end); + } else if (category == "date") { + return std::unique_ptr<Type>(new 
TypeImpl(DATE)); + } else if (category == "varchar") { + uint64_t maxLength = static_cast<uint64_t>( + atoi(input.substr(start, end - start).c_str())); + return std::unique_ptr<Type>(new TypeImpl(VARCHAR, maxLength)); + } else if (category == "char") { + uint64_t maxLength = static_cast<uint64_t>( + atoi(input.substr(start, end - start).c_str())); + return std::unique_ptr<Type>(new TypeImpl(CHAR, maxLength)); + } else { + throw std::logic_error("Unknown type " + category); + } + } + + std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > TypeImpl::parseType( + const std::string &input, + size_t start, + size_t end) { + std::vector<std::pair<std::string, ORC_UNIQUE_PTR<Type> > > res; + size_t pos = start; + + while (pos < end) { + size_t endPos = pos; + while (endPos < end && (isalnum(input[endPos]) || input[endPos] == '_')) { + ++endPos; + } + + std::string fieldName; + if (input[endPos] == ':') { + fieldName = input.substr(pos, endPos - pos); + pos = ++endPos; + while (endPos < end && isalpha(input[endPos])) { + ++endPos; + } + } + + size_t nextPos = endPos + 1; + if (input[endPos] == '<') { + int count = 1; + while (nextPos < end) { + if (input[nextPos] == '<') { + ++count; + } else if (input[nextPos] == '>') { + --count; + } + if (count == 0) { + break; + } + ++nextPos; + } + if (nextPos == end) { + throw std::logic_error("Invalid type string. Cannot find closing >"); + } + } else if (input[endPos] == '(') { + while (nextPos < end && input[nextPos] != ')') { + ++nextPos; + } + if (nextPos == end) { + throw std::logic_error("Invalid type string. Cannot find closing )"); + } + } else if (input[endPos] != ',' && endPos != end) { + throw std::logic_error("Unrecognized character."); + } + + std::string category = input.substr(pos, endPos - pos); + res.push_back(std::make_pair(fieldName, parseCategory(category, input, endPos + 1, nextPos))); + + if (nextPos < end && (input[nextPos] == ')' || input[nextPos] == '>')) { + pos = nextPos + 2; + } else { + pos = nextPos; + } + } + + return res; + } + +} diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.hh b/contrib/libs/apache/orc/c++/src/TypeImpl.hh index 054ceab5dc..cee52006b7 100644 --- a/contrib/libs/apache/orc/c++/src/TypeImpl.hh +++ b/contrib/libs/apache/orc/c++/src/TypeImpl.hh @@ -1,198 +1,198 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef TYPE_IMPL_HH -#define TYPE_IMPL_HH - -#include "orc/Type.hh" - -#include "Adaptor.hh" -#include "wrap/orc-proto-wrapper.hh" - -#include <vector> - -namespace orc { - - class TypeImpl: public Type { - private: - TypeImpl* parent; - mutable int64_t columnId; - mutable int64_t maximumColumnId; - TypeKind kind; - std::vector<Type*> subTypes; - std::vector<std::string> fieldNames; - uint64_t subtypeCount; - uint64_t maxLength; - uint64_t precision; - uint64_t scale; - - public: - /** - * Create most of the primitive types. - */ - TypeImpl(TypeKind kind); - - /** - * Create char and varchar type. - */ - TypeImpl(TypeKind kind, uint64_t maxLength); - - /** - * Create decimal type. - */ - TypeImpl(TypeKind kind, uint64_t precision, - uint64_t scale); - - virtual ~TypeImpl() override; - - uint64_t getColumnId() const override; - - uint64_t getMaximumColumnId() const override; - - TypeKind getKind() const override; - - uint64_t getSubtypeCount() const override; - - const Type* getSubtype(uint64_t i) const override; - - const std::string& getFieldName(uint64_t i) const override; - - uint64_t getMaximumLength() const override; - - uint64_t getPrecision() const override; - - uint64_t getScale() const override; - - std::string toString() const override; - - Type* addStructField(const std::string& fieldName, - std::unique_ptr<Type> fieldType) override; - Type* addUnionChild(std::unique_ptr<Type> fieldType) override; - - std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size, - MemoryPool& memoryPool, - bool encoded = false - ) const override; - - /** - * Explicitly set the column ids. Only for internal usage. - */ - void setIds(uint64_t columnId, uint64_t maxColumnId); - - /** - * Add a child type. - */ - void addChildType(std::unique_ptr<Type> childType); - - static std::vector<std::pair<std::string, std::unique_ptr<Type> > > parseType( - const std::string &input, - size_t start, - size_t end); - - private: - /** - * Assign ids to this node and its children giving this - * node rootId. - * @param rootId the column id that should be assigned to this node. - */ - uint64_t assignIds(uint64_t rootId) const; - - /** - * Ensure that ids are assigned to all of the nodes. 
- */ - void ensureIdAssigned() const; - - /** - * Parse array type from string - * @param input the input string of an array type - * @param start start position of the input string - * @param end end position of the input string - */ - static std::unique_ptr<Type> parseArrayType(const std::string &input, - size_t start, - size_t end); - - /** - * Parse map type from string - * @param input the input string of a map type - * @param start start position of the input string - * @param end end position of the input string - */ - static std::unique_ptr<Type> parseMapType(const std::string &input, - size_t start, - size_t end); - - /** - * Parse struct type from string - * @param input the input string of a struct type - * @param start start position of the input string - * @param end end position of the input string - */ - static std::unique_ptr<Type> parseStructType(const std::string &input, - size_t start, - size_t end); - - /** - * Parse union type from string - * @param input the input string of an union type - * @param start start position of the input string - * @param end end position of the input string - */ - static std::unique_ptr<Type> parseUnionType(const std::string &input, - size_t start, - size_t end); - - /** - * Parse decimal type from string - * @param input the input string of a decimal type - * @param start start position of the input string - * @param end end position of the input string - */ - static std::unique_ptr<Type> parseDecimalType(const std::string &input, - size_t start, - size_t end); - - /** - * Parse type for a category - * @param category type name - * @param input the input string of the category - * @param start start position of the input string - * @param end end position of the input string - */ - static std::unique_ptr<Type> parseCategory(std::string category, - const std::string &input, - size_t start, - size_t end); - }; - - std::unique_ptr<Type> convertType(const proto::Type& type, - const proto::Footer& footer); - - /** - * Build a clone of the file type, projecting columns from the selected - * vector. This routine assumes that the parent of any selected column - * is also selected. - * @param fileType the type in the file - * @param selected is each column by id selected - * @return a clone of the fileType filtered by the selection array - */ - std::unique_ptr<Type> buildSelectedType(const Type *fileType, - const std::vector<bool>& selected); -} - -#endif +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef TYPE_IMPL_HH +#define TYPE_IMPL_HH + +#include "orc/Type.hh" + +#include "Adaptor.hh" +#include "wrap/orc-proto-wrapper.hh" + +#include <vector> + +namespace orc { + + class TypeImpl: public Type { + private: + TypeImpl* parent; + mutable int64_t columnId; + mutable int64_t maximumColumnId; + TypeKind kind; + std::vector<Type*> subTypes; + std::vector<std::string> fieldNames; + uint64_t subtypeCount; + uint64_t maxLength; + uint64_t precision; + uint64_t scale; + + public: + /** + * Create most of the primitive types. + */ + TypeImpl(TypeKind kind); + + /** + * Create char and varchar type. + */ + TypeImpl(TypeKind kind, uint64_t maxLength); + + /** + * Create decimal type. + */ + TypeImpl(TypeKind kind, uint64_t precision, + uint64_t scale); + + virtual ~TypeImpl() override; + + uint64_t getColumnId() const override; + + uint64_t getMaximumColumnId() const override; + + TypeKind getKind() const override; + + uint64_t getSubtypeCount() const override; + + const Type* getSubtype(uint64_t i) const override; + + const std::string& getFieldName(uint64_t i) const override; + + uint64_t getMaximumLength() const override; + + uint64_t getPrecision() const override; + + uint64_t getScale() const override; + + std::string toString() const override; + + Type* addStructField(const std::string& fieldName, + std::unique_ptr<Type> fieldType) override; + Type* addUnionChild(std::unique_ptr<Type> fieldType) override; + + std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size, + MemoryPool& memoryPool, + bool encoded = false + ) const override; + + /** + * Explicitly set the column ids. Only for internal usage. + */ + void setIds(uint64_t columnId, uint64_t maxColumnId); + + /** + * Add a child type. + */ + void addChildType(std::unique_ptr<Type> childType); + + static std::vector<std::pair<std::string, std::unique_ptr<Type> > > parseType( + const std::string &input, + size_t start, + size_t end); + + private: + /** + * Assign ids to this node and its children giving this + * node rootId. + * @param rootId the column id that should be assigned to this node. + */ + uint64_t assignIds(uint64_t rootId) const; + + /** + * Ensure that ids are assigned to all of the nodes. 
+ */ + void ensureIdAssigned() const; + + /** + * Parse array type from string + * @param input the input string of an array type + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr<Type> parseArrayType(const std::string &input, + size_t start, + size_t end); + + /** + * Parse map type from string + * @param input the input string of a map type + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr<Type> parseMapType(const std::string &input, + size_t start, + size_t end); + + /** + * Parse struct type from string + * @param input the input string of a struct type + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr<Type> parseStructType(const std::string &input, + size_t start, + size_t end); + + /** + * Parse union type from string + * @param input the input string of an union type + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr<Type> parseUnionType(const std::string &input, + size_t start, + size_t end); + + /** + * Parse decimal type from string + * @param input the input string of a decimal type + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr<Type> parseDecimalType(const std::string &input, + size_t start, + size_t end); + + /** + * Parse type for a category + * @param category type name + * @param input the input string of the category + * @param start start position of the input string + * @param end end position of the input string + */ + static std::unique_ptr<Type> parseCategory(std::string category, + const std::string &input, + size_t start, + size_t end); + }; + + std::unique_ptr<Type> convertType(const proto::Type& type, + const proto::Footer& footer); + + /** + * Build a clone of the file type, projecting columns from the selected + * vector. This routine assumes that the parent of any selected column + * is also selected. + * @param fileType the type in the file + * @param selected is each column by id selected + * @return a clone of the fileType filtered by the selection array + */ + std::unique_ptr<Type> buildSelectedType(const Type *fileType, + const std::vector<bool>& selected); +} + +#endif diff --git a/contrib/libs/apache/orc/c++/src/Vector.cc b/contrib/libs/apache/orc/c++/src/Vector.cc index 14c0ded030..6ba2f8ae7d 100644 --- a/contrib/libs/apache/orc/c++/src/Vector.cc +++ b/contrib/libs/apache/orc/c++/src/Vector.cc @@ -1,518 +1,518 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "orc/Vector.hh" - -#include "Adaptor.hh" -#include "orc/Exceptions.hh" - -#include <iostream> -#include <sstream> -#include <cstdlib> - -namespace orc { - - ColumnVectorBatch::ColumnVectorBatch(uint64_t cap, - MemoryPool& pool - ): capacity(cap), - numElements(0), - notNull(pool, cap), - hasNulls(false), - isEncoded(false), - memoryPool(pool) { - std::memset(notNull.data(), 1, capacity); - } - - ColumnVectorBatch::~ColumnVectorBatch() { - // PASS - } - - void ColumnVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - capacity = cap; - notNull.resize(cap); - } - } - - void ColumnVectorBatch::clear() { - numElements = 0; - } - - uint64_t ColumnVectorBatch::getMemoryUsage() { - return static_cast<uint64_t>(notNull.capacity() * sizeof(char)); - } - - bool ColumnVectorBatch::hasVariableLength() { - return false; - } - - LongVectorBatch::LongVectorBatch(uint64_t _capacity, MemoryPool& pool - ): ColumnVectorBatch(_capacity, pool), - data(pool, _capacity) { - // PASS - } - - LongVectorBatch::~LongVectorBatch() { - // PASS - } - - std::string LongVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Long vector <" << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void LongVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - data.resize(cap); - } - } - - void LongVectorBatch::clear() { - numElements = 0; - } - - uint64_t LongVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() + - static_cast<uint64_t>(data.capacity() * sizeof(int64_t)); - } - - DoubleVectorBatch::DoubleVectorBatch(uint64_t _capacity, MemoryPool& pool - ): ColumnVectorBatch(_capacity, pool), - data(pool, _capacity) { - // PASS - } - - DoubleVectorBatch::~DoubleVectorBatch() { - // PASS - } - - std::string DoubleVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Double vector <" << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void DoubleVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - data.resize(cap); - } - } - - void DoubleVectorBatch::clear() { - numElements = 0; - } - - uint64_t DoubleVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(data.capacity() * sizeof(double)); - } - - StringDictionary::StringDictionary(MemoryPool& pool) - : dictionaryBlob(pool), - dictionaryOffset(pool) { - // PASS - } - - EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t _capacity, - MemoryPool& pool) - : StringVectorBatch(_capacity, pool), - dictionary(), - index(pool, _capacity) { - // PASS - } - - EncodedStringVectorBatch::~EncodedStringVectorBatch() { - // PASS - } - - std::string EncodedStringVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Encoded string vector <" << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - StringVectorBatch::StringVectorBatch(uint64_t _capacity, MemoryPool& pool - ): ColumnVectorBatch(_capacity, pool), - data(pool, _capacity), - length(pool, _capacity), - blob(pool) { - // PASS - } - - StringVectorBatch::~StringVectorBatch() { - // PASS - } - - std::string StringVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Byte vector <" << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void StringVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - data.resize(cap); - length.resize(cap); - } - } - - 
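As a rough consumer-side sketch of the string batch laid out above: each value i spans length[i] bytes starting at data[i], and hasNulls plus the notNull buffer mark missing entries. The helper name collectStrings is hypothetical.

#include <cstdint>
#include <string>
#include <vector>
#include "orc/Vector.hh"

// Hypothetical helper: copy the contents of a populated StringVectorBatch.
std::vector<std::string> collectStrings(orc::StringVectorBatch& batch) {
  std::vector<std::string> out;
  out.reserve(batch.numElements);
  for (uint64_t i = 0; i < batch.numElements; ++i) {
    if (batch.hasNulls && !batch.notNull[i]) {
      out.push_back(std::string());            // placeholder for a null entry
    } else {
      // data[i] points into the batch's blob; length[i] is the byte count
      out.emplace_back(batch.data[i], static_cast<std::size_t>(batch.length[i]));
    }
  }
  return out;
}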
void StringVectorBatch::clear() { - numElements = 0; - } - - uint64_t StringVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(data.capacity() * sizeof(char*) - + length.capacity() * sizeof(int64_t)); - } - - StructVectorBatch::StructVectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool) { - // PASS - } - - StructVectorBatch::~StructVectorBatch() { - for (uint64_t i=0; i<this->fields.size(); i++) { - delete this->fields[i]; - } - } - - std::string StructVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Struct vector <" << numElements << " of " << capacity - << "; "; - for(std::vector<ColumnVectorBatch*>::const_iterator ptr=fields.begin(); - ptr != fields.end(); ++ptr) { - buffer << (*ptr)->toString() << "; "; - } - buffer << ">"; - return buffer.str(); - } - - void StructVectorBatch::resize(uint64_t cap) { - ColumnVectorBatch::resize(cap); - } - - void StructVectorBatch::clear() { - for(size_t i=0; i < fields.size(); i++) { - fields[i]->clear(); - } - numElements = 0; - } - - uint64_t StructVectorBatch::getMemoryUsage() { - uint64_t memory = ColumnVectorBatch::getMemoryUsage(); - for (unsigned int i=0; i < fields.size(); i++) { - memory += fields[i]->getMemoryUsage(); - } - return memory; - } - - bool StructVectorBatch::hasVariableLength() { - for (unsigned int i=0; i < fields.size(); i++) { - if (fields[i]->hasVariableLength()) { - return true; - } - } - return false; - } - - ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - offsets(pool, cap+1) { - // PASS - } - - ListVectorBatch::~ListVectorBatch() { - // PASS - } - - std::string ListVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "List vector <" << elements->toString() << " with " - << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void ListVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - offsets.resize(cap + 1); - } - } - - void ListVectorBatch::clear() { - numElements = 0; - elements->clear(); - } - - uint64_t ListVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) - + elements->getMemoryUsage(); - } - - bool ListVectorBatch::hasVariableLength() { - return true; - } - - MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - offsets(pool, cap+1) { - // PASS - } - - MapVectorBatch::~MapVectorBatch() { - // PASS - } - - std::string MapVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Map vector <" << keys->toString() << ", " - << elements->toString() << " with " - << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void MapVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - offsets.resize(cap + 1); - } - } - - void MapVectorBatch::clear() { - keys->clear(); - elements->clear(); - numElements = 0; - } - - uint64_t MapVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) - + keys->getMemoryUsage() - + elements->getMemoryUsage(); - } - - bool MapVectorBatch::hasVariableLength() { - return true; - } - - UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - tags(pool, cap), - offsets(pool, cap) { - // PASS - } - - 
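A sketch of how the union layout above is navigated, assuming a populated batch: tags[i] picks the child batch for row i and offsets[i] is the row's slot inside that child. locateUnionValue is a hypothetical helper.

#include <cstdint>
#include <utility>
#include "orc/Vector.hh"

// Hypothetical helper: which child batch holds row `row`, and at what position.
std::pair<orc::ColumnVectorBatch*, uint64_t>
locateUnionValue(orc::UnionVectorBatch& batch, uint64_t row) {
  unsigned char tag = batch.tags[row];      // selects the variant / child batch
  uint64_t position = batch.offsets[row];   // row's index inside children[tag]
  return std::make_pair(batch.children[tag], position);
}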
UnionVectorBatch::~UnionVectorBatch() { - for (uint64_t i=0; i < children.size(); i++) { - delete children[i]; - } - } - - std::string UnionVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Union vector <"; - for(size_t i=0; i < children.size(); ++i) { - if (i != 0) { - buffer << ", "; - } - buffer << children[i]->toString(); - } - buffer << "; with " << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void UnionVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - tags.resize(cap); - offsets.resize(cap); - } - } - - void UnionVectorBatch::clear() { - for(size_t i=0; i < children.size(); i++) { - children[i]->clear(); - } - numElements = 0; - } - - uint64_t UnionVectorBatch::getMemoryUsage() { - uint64_t memory = ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(tags.capacity() * sizeof(unsigned char) - + offsets.capacity() * sizeof(uint64_t)); - for(size_t i=0; i < children.size(); ++i) { - memory += children[i]->getMemoryUsage(); - } - return memory; - } - - bool UnionVectorBatch::hasVariableLength() { - for(size_t i=0; i < children.size(); ++i) { - if (children[i]->hasVariableLength()) { - return true; - } - } - return false; - } - - Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - precision(0), - scale(0), - values(pool, cap), - readScales(pool, cap) { - // PASS - } - - Decimal64VectorBatch::~Decimal64VectorBatch() { - // PASS - } - - std::string Decimal64VectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Decimal64 vector with " - << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void Decimal64VectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - values.resize(cap); - readScales.resize(cap); - } - } - - void Decimal64VectorBatch::clear() { - numElements = 0; - } - - uint64_t Decimal64VectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>( - (values.capacity() + readScales.capacity()) * sizeof(int64_t)); - } - - Decimal128VectorBatch::Decimal128VectorBatch(uint64_t cap, MemoryPool& pool - ): ColumnVectorBatch(cap, pool), - precision(0), - scale(0), - values(pool, cap), - readScales(pool, cap) { - // PASS - } - - Decimal128VectorBatch::~Decimal128VectorBatch() { - // PASS - } - - std::string Decimal128VectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Decimal128 vector with " - << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void Decimal128VectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - values.resize(cap); - readScales.resize(cap); - } - } - - void Decimal128VectorBatch::clear() { - numElements = 0; - } - - uint64_t Decimal128VectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>(values.capacity() * sizeof(Int128) - + readScales.capacity() * sizeof(int64_t)); - } - - Decimal::Decimal(const Int128& _value, - int32_t _scale): value(_value), scale(_scale) { - // PASS - } - - Decimal::Decimal(const std::string& str) { - std::size_t foundPoint = str.find("."); - // no decimal point, it is int - if(foundPoint == std::string::npos){ - value = Int128(str); - scale = 0; - }else{ - std::string copy(str); - scale = static_cast<int32_t>(str.length() - foundPoint - 1); - value = Int128(copy.replace(foundPoint, 1, "")); - } - } - - Decimal::Decimal() : 
value(0), scale(0) { - // PASS - } - - std::string Decimal::toString() const { - return value.toDecimalString(scale); - } - - TimestampVectorBatch::TimestampVectorBatch(uint64_t _capacity, - MemoryPool& pool - ): ColumnVectorBatch(_capacity, - pool), - data(pool, _capacity), - nanoseconds(pool, _capacity) { - // PASS - } - - TimestampVectorBatch::~TimestampVectorBatch() { - // PASS - } - - std::string TimestampVectorBatch::toString() const { - std::ostringstream buffer; - buffer << "Timestamp vector <" << numElements << " of " << capacity << ">"; - return buffer.str(); - } - - void TimestampVectorBatch::resize(uint64_t cap) { - if (capacity < cap) { - ColumnVectorBatch::resize(cap); - data.resize(cap); - nanoseconds.resize(cap); - } - } - - void TimestampVectorBatch::clear() { - numElements = 0; - } - - uint64_t TimestampVectorBatch::getMemoryUsage() { - return ColumnVectorBatch::getMemoryUsage() - + static_cast<uint64_t>( - (data.capacity() + nanoseconds.capacity()) * sizeof(int64_t)); - } -} +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Vector.hh" + +#include "Adaptor.hh" +#include "orc/Exceptions.hh" + +#include <iostream> +#include <sstream> +#include <cstdlib> + +namespace orc { + + ColumnVectorBatch::ColumnVectorBatch(uint64_t cap, + MemoryPool& pool + ): capacity(cap), + numElements(0), + notNull(pool, cap), + hasNulls(false), + isEncoded(false), + memoryPool(pool) { + std::memset(notNull.data(), 1, capacity); + } + + ColumnVectorBatch::~ColumnVectorBatch() { + // PASS + } + + void ColumnVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + capacity = cap; + notNull.resize(cap); + } + } + + void ColumnVectorBatch::clear() { + numElements = 0; + } + + uint64_t ColumnVectorBatch::getMemoryUsage() { + return static_cast<uint64_t>(notNull.capacity() * sizeof(char)); + } + + bool ColumnVectorBatch::hasVariableLength() { + return false; + } + + LongVectorBatch::LongVectorBatch(uint64_t _capacity, MemoryPool& pool + ): ColumnVectorBatch(_capacity, pool), + data(pool, _capacity) { + // PASS + } + + LongVectorBatch::~LongVectorBatch() { + // PASS + } + + std::string LongVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Long vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void LongVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + data.resize(cap); + } + } + + void LongVectorBatch::clear() { + numElements = 0; + } + + uint64_t LongVectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(data.capacity() * sizeof(int64_t)); + } + + DoubleVectorBatch::DoubleVectorBatch(uint64_t _capacity, MemoryPool& pool + ): ColumnVectorBatch(_capacity, pool), + data(pool, _capacity) { + // PASS + } + + DoubleVectorBatch::~DoubleVectorBatch() { + // PASS + } + + std::string DoubleVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Double vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void DoubleVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + data.resize(cap); + } + } + + void DoubleVectorBatch::clear() { + numElements = 0; + } + + uint64_t DoubleVectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(data.capacity() * sizeof(double)); + } + + StringDictionary::StringDictionary(MemoryPool& pool) + : dictionaryBlob(pool), + dictionaryOffset(pool) { + // PASS + } + + EncodedStringVectorBatch::EncodedStringVectorBatch(uint64_t _capacity, + MemoryPool& pool) + : StringVectorBatch(_capacity, pool), + dictionary(), + index(pool, _capacity) { + // PASS + } + + EncodedStringVectorBatch::~EncodedStringVectorBatch() { + // PASS + } + + std::string EncodedStringVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Encoded string vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + StringVectorBatch::StringVectorBatch(uint64_t _capacity, MemoryPool& pool + ): ColumnVectorBatch(_capacity, pool), + data(pool, _capacity), + length(pool, _capacity), + blob(pool) { + // PASS + } + + StringVectorBatch::~StringVectorBatch() { + // PASS + } + + std::string StringVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Byte vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void StringVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + data.resize(cap); + length.resize(cap); + } + } + + 
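Given that the ColumnVectorBatch constructor above pre-fills notNull with ones and leaves hasNulls false, a producer only has to flip both when a value is missing. A minimal sketch with a hypothetical writeRow helper:

#include <cstdint>
#include "orc/Vector.hh"

// Hypothetical helper: store one value (or a null) into a LongVectorBatch row.
void writeRow(orc::LongVectorBatch& batch, uint64_t row, const int64_t* value) {
  if (value == nullptr) {
    batch.notNull[row] = 0;    // 0 marks this entry as null
    batch.hasNulls = true;     // tells readers the notNull buffer matters
  } else {
    batch.data[row] = *value;  // notNull[row] is already 1 from the constructor
  }
}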
void StringVectorBatch::clear() { + numElements = 0; + } + + uint64_t StringVectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(data.capacity() * sizeof(char*) + + length.capacity() * sizeof(int64_t)); + } + + StructVectorBatch::StructVectorBatch(uint64_t cap, MemoryPool& pool + ): ColumnVectorBatch(cap, pool) { + // PASS + } + + StructVectorBatch::~StructVectorBatch() { + for (uint64_t i=0; i<this->fields.size(); i++) { + delete this->fields[i]; + } + } + + std::string StructVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Struct vector <" << numElements << " of " << capacity + << "; "; + for(std::vector<ColumnVectorBatch*>::const_iterator ptr=fields.begin(); + ptr != fields.end(); ++ptr) { + buffer << (*ptr)->toString() << "; "; + } + buffer << ">"; + return buffer.str(); + } + + void StructVectorBatch::resize(uint64_t cap) { + ColumnVectorBatch::resize(cap); + } + + void StructVectorBatch::clear() { + for(size_t i=0; i < fields.size(); i++) { + fields[i]->clear(); + } + numElements = 0; + } + + uint64_t StructVectorBatch::getMemoryUsage() { + uint64_t memory = ColumnVectorBatch::getMemoryUsage(); + for (unsigned int i=0; i < fields.size(); i++) { + memory += fields[i]->getMemoryUsage(); + } + return memory; + } + + bool StructVectorBatch::hasVariableLength() { + for (unsigned int i=0; i < fields.size(); i++) { + if (fields[i]->hasVariableLength()) { + return true; + } + } + return false; + } + + ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool + ): ColumnVectorBatch(cap, pool), + offsets(pool, cap+1) { + // PASS + } + + ListVectorBatch::~ListVectorBatch() { + // PASS + } + + std::string ListVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "List vector <" << elements->toString() << " with " + << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void ListVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + offsets.resize(cap + 1); + } + } + + void ListVectorBatch::clear() { + numElements = 0; + elements->clear(); + } + + uint64_t ListVectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) + + elements->getMemoryUsage(); + } + + bool ListVectorBatch::hasVariableLength() { + return true; + } + + MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool + ): ColumnVectorBatch(cap, pool), + offsets(pool, cap+1) { + // PASS + } + + MapVectorBatch::~MapVectorBatch() { + // PASS + } + + std::string MapVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Map vector <" << keys->toString() << ", " + << elements->toString() << " with " + << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void MapVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + offsets.resize(cap + 1); + } + } + + void MapVectorBatch::clear() { + keys->clear(); + elements->clear(); + numElements = 0; + } + + uint64_t MapVectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) + + keys->getMemoryUsage() + + elements->getMemoryUsage(); + } + + bool MapVectorBatch::hasVariableLength() { + return true; + } + + UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool + ): ColumnVectorBatch(cap, pool), + tags(pool, cap), + offsets(pool, cap) { + // PASS + } + + 
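A sketch of the map layout above, assuming a populated batch: keys and elements run in parallel, and the shared offsets buffer (allocated with cap + 1 slots) bounds row i's entries as [offsets[i], offsets[i+1]) in both children. mapEntryCount is a hypothetical helper; the same arithmetic applies to ListVectorBatch.

#include <cstdint>
#include "orc/Vector.hh"

// Hypothetical helper: number of key/value pairs stored for one map row.
uint64_t mapEntryCount(orc::MapVectorBatch& batch, uint64_t row) {
  int64_t begin = batch.offsets[row];       // first entry of this row
  int64_t end = batch.offsets[row + 1];     // one past the last entry
  return static_cast<uint64_t>(end - begin);
}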
UnionVectorBatch::~UnionVectorBatch() { + for (uint64_t i=0; i < children.size(); i++) { + delete children[i]; + } + } + + std::string UnionVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Union vector <"; + for(size_t i=0; i < children.size(); ++i) { + if (i != 0) { + buffer << ", "; + } + buffer << children[i]->toString(); + } + buffer << "; with " << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void UnionVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + tags.resize(cap); + offsets.resize(cap); + } + } + + void UnionVectorBatch::clear() { + for(size_t i=0; i < children.size(); i++) { + children[i]->clear(); + } + numElements = 0; + } + + uint64_t UnionVectorBatch::getMemoryUsage() { + uint64_t memory = ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(tags.capacity() * sizeof(unsigned char) + + offsets.capacity() * sizeof(uint64_t)); + for(size_t i=0; i < children.size(); ++i) { + memory += children[i]->getMemoryUsage(); + } + return memory; + } + + bool UnionVectorBatch::hasVariableLength() { + for(size_t i=0; i < children.size(); ++i) { + if (children[i]->hasVariableLength()) { + return true; + } + } + return false; + } + + Decimal64VectorBatch::Decimal64VectorBatch(uint64_t cap, MemoryPool& pool + ): ColumnVectorBatch(cap, pool), + precision(0), + scale(0), + values(pool, cap), + readScales(pool, cap) { + // PASS + } + + Decimal64VectorBatch::~Decimal64VectorBatch() { + // PASS + } + + std::string Decimal64VectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Decimal64 vector with " + << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void Decimal64VectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + values.resize(cap); + readScales.resize(cap); + } + } + + void Decimal64VectorBatch::clear() { + numElements = 0; + } + + uint64_t Decimal64VectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>( + (values.capacity() + readScales.capacity()) * sizeof(int64_t)); + } + + Decimal128VectorBatch::Decimal128VectorBatch(uint64_t cap, MemoryPool& pool + ): ColumnVectorBatch(cap, pool), + precision(0), + scale(0), + values(pool, cap), + readScales(pool, cap) { + // PASS + } + + Decimal128VectorBatch::~Decimal128VectorBatch() { + // PASS + } + + std::string Decimal128VectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Decimal128 vector with " + << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void Decimal128VectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + values.resize(cap); + readScales.resize(cap); + } + } + + void Decimal128VectorBatch::clear() { + numElements = 0; + } + + uint64_t Decimal128VectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>(values.capacity() * sizeof(Int128) + + readScales.capacity() * sizeof(int64_t)); + } + + Decimal::Decimal(const Int128& _value, + int32_t _scale): value(_value), scale(_scale) { + // PASS + } + + Decimal::Decimal(const std::string& str) { + std::size_t foundPoint = str.find("."); + // no decimal point, it is int + if(foundPoint == std::string::npos){ + value = Int128(str); + scale = 0; + }else{ + std::string copy(str); + scale = static_cast<int32_t>(str.length() - foundPoint - 1); + value = Int128(copy.replace(foundPoint, 1, "")); + } + } + + Decimal::Decimal() : 
value(0), scale(0) { + // PASS + } + + std::string Decimal::toString() const { + return value.toDecimalString(scale); + } + + TimestampVectorBatch::TimestampVectorBatch(uint64_t _capacity, + MemoryPool& pool + ): ColumnVectorBatch(_capacity, + pool), + data(pool, _capacity), + nanoseconds(pool, _capacity) { + // PASS + } + + TimestampVectorBatch::~TimestampVectorBatch() { + // PASS + } + + std::string TimestampVectorBatch::toString() const { + std::ostringstream buffer; + buffer << "Timestamp vector <" << numElements << " of " << capacity << ">"; + return buffer.str(); + } + + void TimestampVectorBatch::resize(uint64_t cap) { + if (capacity < cap) { + ColumnVectorBatch::resize(cap); + data.resize(cap); + nanoseconds.resize(cap); + } + } + + void TimestampVectorBatch::clear() { + numElements = 0; + } + + uint64_t TimestampVectorBatch::getMemoryUsage() { + return ColumnVectorBatch::getMemoryUsage() + + static_cast<uint64_t>( + (data.capacity() + nanoseconds.capacity()) * sizeof(int64_t)); + } +} diff --git a/contrib/libs/apache/orc/c++/src/Writer.cc b/contrib/libs/apache/orc/c++/src/Writer.cc index b5bd19b304..8b13750865 100644 --- a/contrib/libs/apache/orc/c++/src/Writer.cc +++ b/contrib/libs/apache/orc/c++/src/Writer.cc @@ -1,641 +1,641 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "orc/Common.hh" -#include "orc/OrcFile.hh" - -#include "ColumnWriter.hh" -#include "Timezone.hh" - -#include <memory> - -namespace orc { - - struct WriterOptionsPrivate { - uint64_t stripeSize; - uint64_t compressionBlockSize; - uint64_t rowIndexStride; - CompressionKind compression; - CompressionStrategy compressionStrategy; - MemoryPool* memoryPool; - double paddingTolerance; - std::ostream* errorStream; - FileVersion fileVersion; - double dictionaryKeySizeThreshold; - bool enableIndex; - std::set<uint64_t> columnsUseBloomFilter; - double bloomFilterFalsePositiveProb; - BloomFilterVersion bloomFilterVersion; - - WriterOptionsPrivate() : - fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12 - stripeSize = 64 * 1024 * 1024; // 64M - compressionBlockSize = 64 * 1024; // 64K - rowIndexStride = 10000; - compression = CompressionKind_ZLIB; - compressionStrategy = CompressionStrategy_SPEED; - memoryPool = getDefaultPool(); - paddingTolerance = 0.0; - errorStream = &std::cerr; - dictionaryKeySizeThreshold = 0.0; - enableIndex = true; - bloomFilterFalsePositiveProb = 0.05; - bloomFilterVersion = UTF8; - } - }; - - WriterOptions::WriterOptions(): - privateBits(std::unique_ptr<WriterOptionsPrivate> - (new WriterOptionsPrivate())) { - // PASS - } - - WriterOptions::WriterOptions(const WriterOptions& rhs): - privateBits(std::unique_ptr<WriterOptionsPrivate> - (new WriterOptionsPrivate(*(rhs.privateBits.get())))) { - // PASS - } - - WriterOptions::WriterOptions(WriterOptions& rhs) { - // swap privateBits with rhs - WriterOptionsPrivate* l = privateBits.release(); - privateBits.reset(rhs.privateBits.release()); - rhs.privateBits.reset(l); - } - - WriterOptions& WriterOptions::operator=(const WriterOptions& rhs) { - if (this != &rhs) { - privateBits.reset(new WriterOptionsPrivate(*(rhs.privateBits.get()))); - } - return *this; - } - - WriterOptions::~WriterOptions() { - // PASS - } - RleVersion WriterOptions::getRleVersion() const { - if(privateBits->fileVersion == FileVersion::v_0_11()) - { - return RleVersion_1; - } - - return RleVersion_2; - } - - WriterOptions& WriterOptions::setStripeSize(uint64_t size) { - privateBits->stripeSize = size; - return *this; - } - - uint64_t WriterOptions::getStripeSize() const { - return privateBits->stripeSize; - } - - WriterOptions& WriterOptions::setCompressionBlockSize(uint64_t size) { - privateBits->compressionBlockSize = size; - return *this; - } - - uint64_t WriterOptions::getCompressionBlockSize() const { - return privateBits->compressionBlockSize; - } - - WriterOptions& WriterOptions::setRowIndexStride(uint64_t stride) { - privateBits->rowIndexStride = stride; - privateBits->enableIndex = (stride != 0); - return *this; - } - - uint64_t WriterOptions::getRowIndexStride() const { - return privateBits->rowIndexStride; - } - - WriterOptions& WriterOptions::setDictionaryKeySizeThreshold(double val) { - privateBits->dictionaryKeySizeThreshold = val; - return *this; - } - - double WriterOptions::getDictionaryKeySizeThreshold() const { - return privateBits->dictionaryKeySizeThreshold; - } - - WriterOptions& WriterOptions::setFileVersion(const FileVersion& version) { - // Only Hive_0_11 and Hive_0_12 version are supported currently - if (version.getMajor() == 0 && (version.getMinor() == 11 || version.getMinor() == 12)) { - privateBits->fileVersion = version; - return *this; - } - throw std::logic_error("Unsupported file version specified."); - } - - FileVersion WriterOptions::getFileVersion() const { - return privateBits->fileVersion; - } - - 
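A configuration sketch against the defaults listed in WriterOptionsPrivate above (64M stripes, 64K compression blocks, ZLIB, row index stride 10000). makeWriter is a hypothetical helper, and createWriter is assumed to be the writer factory exposed through orc/OrcFile.hh, which this file already includes.

#include <memory>
#include "orc/OrcFile.hh"

// Hypothetical helper: override a few writer defaults before creating a writer.
std::unique_ptr<orc::Writer> makeWriter(const orc::Type& schema,
                                        orc::OutputStream* out) {
  orc::WriterOptions opts;
  opts.setStripeSize(128 * 1024 * 1024)      // larger 128M stripes
      .setCompressionBlockSize(256 * 1024)   // 256K compression chunks
      .setRowIndexStride(0);                 // stride 0 also disables the index
  return orc::createWriter(schema, out, opts);  // assumed factory from OrcFile.hh
}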
WriterOptions& WriterOptions::setCompression(CompressionKind comp) { - privateBits->compression = comp; - return *this; - } - - CompressionKind WriterOptions::getCompression() const { - return privateBits->compression; - } - - WriterOptions& WriterOptions::setCompressionStrategy( - CompressionStrategy strategy) { - privateBits->compressionStrategy = strategy; - return *this; - } - - CompressionStrategy WriterOptions::getCompressionStrategy() const { - return privateBits->compressionStrategy; - } - - bool WriterOptions::getAlignedBitpacking() const { - return privateBits->compressionStrategy == CompressionStrategy ::CompressionStrategy_SPEED; - } - - WriterOptions& WriterOptions::setPaddingTolerance(double tolerance) { - privateBits->paddingTolerance = tolerance; - return *this; - } - - double WriterOptions::getPaddingTolerance() const { - return privateBits->paddingTolerance; - } - - WriterOptions& WriterOptions::setMemoryPool(MemoryPool* memoryPool) { - privateBits->memoryPool = memoryPool; - return *this; - } - - MemoryPool* WriterOptions::getMemoryPool() const { - return privateBits->memoryPool; - } - - WriterOptions& WriterOptions::setErrorStream(std::ostream& errStream) { - privateBits->errorStream = &errStream; - return *this; - } - - std::ostream* WriterOptions::getErrorStream() const { - return privateBits->errorStream; - } - - bool WriterOptions::getEnableIndex() const { - return privateBits->enableIndex; - } - - bool WriterOptions::getEnableDictionary() const { - return privateBits->dictionaryKeySizeThreshold > 0.0; - } - - WriterOptions& WriterOptions::setColumnsUseBloomFilter( - const std::set<uint64_t>& columns) { - privateBits->columnsUseBloomFilter = columns; - return *this; - } - - bool WriterOptions::isColumnUseBloomFilter(uint64_t column) const { - return privateBits->columnsUseBloomFilter.find(column) != - privateBits->columnsUseBloomFilter.end(); - } - - WriterOptions& WriterOptions::setBloomFilterFPP(double fpp) { - privateBits->bloomFilterFalsePositiveProb = fpp; - return *this; - } - - double WriterOptions::getBloomFilterFPP() const { - return privateBits->bloomFilterFalsePositiveProb; - } - - // delibrately not provide setter to write bloom filter version because - // we only support UTF8 for now. 
- BloomFilterVersion WriterOptions::getBloomFilterVersion() const { - return privateBits->bloomFilterVersion; - } - - Writer::~Writer() { - // PASS - } - - class WriterImpl : public Writer { - private: - std::unique_ptr<ColumnWriter> columnWriter; - std::unique_ptr<BufferedOutputStream> compressionStream; - std::unique_ptr<BufferedOutputStream> bufferedStream; - std::unique_ptr<StreamsFactory> streamsFactory; - OutputStream* outStream; - WriterOptions options; - const Type& type; - uint64_t stripeRows, totalRows, indexRows; - uint64_t currentOffset; - proto::Footer fileFooter; - proto::PostScript postScript; - proto::StripeInformation stripeInfo; - proto::Metadata metadata; - - static const char* magicId; - static const WriterId writerId; - - public: - WriterImpl( - const Type& type, - OutputStream* stream, - const WriterOptions& options); - - std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) - const override; - - void add(ColumnVectorBatch& rowsToAdd) override; - - void close() override; - - void addUserMetadata(const std::string name, const std::string value) override; - - private: - void init(); - void initStripe(); - void writeStripe(); - void writeMetadata(); - void writeFileFooter(); - void writePostscript(); - void buildFooterType(const Type& t, proto::Footer& footer, uint32_t& index); - static proto::CompressionKind convertCompressionKind( - const CompressionKind& kind); - }; - - const char * WriterImpl::magicId = "ORC"; - - const WriterId WriterImpl::writerId = WriterId::ORC_CPP_WRITER; - - WriterImpl::WriterImpl( - const Type& t, - OutputStream* stream, - const WriterOptions& opts) : - outStream(stream), - options(opts), - type(t) { - streamsFactory = createStreamsFactory(options, outStream); - columnWriter = buildWriter(type, *streamsFactory, options); - stripeRows = totalRows = indexRows = 0; - currentOffset = 0; - - // compression stream for stripe footer, file footer and metadata - compressionStream = createCompressor( - options.getCompression(), - outStream, - options.getCompressionStrategy(), - 1 * 1024 * 1024, // buffer capacity: 1M - options.getCompressionBlockSize(), - *options.getMemoryPool()); - - // uncompressed stream for post script - bufferedStream.reset(new BufferedOutputStream( - *options.getMemoryPool(), - outStream, - 1024, // buffer capacity: 1024 bytes - options.getCompressionBlockSize())); - - init(); - } - - std::unique_ptr<ColumnVectorBatch> WriterImpl::createRowBatch(uint64_t size) - const { - return type.createRowBatch(size, *options.getMemoryPool()); - } - - void WriterImpl::add(ColumnVectorBatch& rowsToAdd) { - if (options.getEnableIndex()) { - uint64_t pos = 0; - uint64_t chunkSize = 0; - uint64_t rowIndexStride = options.getRowIndexStride(); - while (pos < rowsToAdd.numElements) { - chunkSize = std::min(rowsToAdd.numElements - pos, - rowIndexStride - indexRows); - columnWriter->add(rowsToAdd, pos, chunkSize, nullptr); - - pos += chunkSize; - indexRows += chunkSize; - stripeRows += chunkSize; - - if (indexRows >= rowIndexStride) { - columnWriter->createRowIndexEntry(); - indexRows = 0; - } - } - } else { - stripeRows += rowsToAdd.numElements; - columnWriter->add(rowsToAdd, 0, rowsToAdd.numElements, nullptr); - } - - if (columnWriter->getEstimatedSize() >= options.getStripeSize()) { - writeStripe(); - } - } - - void WriterImpl::close() { - if (stripeRows > 0) { - writeStripe(); - } - writeMetadata(); - writeFileFooter(); - writePostscript(); - outStream->close(); - } - - void WriterImpl::addUserMetadata(const std::string name, const 
std::string value){ - proto::UserMetadataItem* userMetadataItem = fileFooter.add_metadata(); - userMetadataItem->set_name(TString(name)); - userMetadataItem->set_value(TString(value)); - } - - void WriterImpl::init() { - // Write file header - const static size_t magicIdLength = strlen(WriterImpl::magicId); - outStream->write(WriterImpl::magicId, magicIdLength); - currentOffset += magicIdLength; - - // Initialize file footer - fileFooter.set_headerlength(currentOffset); - fileFooter.set_contentlength(0); - fileFooter.set_numberofrows(0); - fileFooter.set_rowindexstride( - static_cast<uint32_t>(options.getRowIndexStride())); - fileFooter.set_writer(writerId); +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc/Common.hh" +#include "orc/OrcFile.hh" + +#include "ColumnWriter.hh" +#include "Timezone.hh" + +#include <memory> + +namespace orc { + + struct WriterOptionsPrivate { + uint64_t stripeSize; + uint64_t compressionBlockSize; + uint64_t rowIndexStride; + CompressionKind compression; + CompressionStrategy compressionStrategy; + MemoryPool* memoryPool; + double paddingTolerance; + std::ostream* errorStream; + FileVersion fileVersion; + double dictionaryKeySizeThreshold; + bool enableIndex; + std::set<uint64_t> columnsUseBloomFilter; + double bloomFilterFalsePositiveProb; + BloomFilterVersion bloomFilterVersion; + + WriterOptionsPrivate() : + fileVersion(FileVersion::v_0_12()) { // default to Hive_0_12 + stripeSize = 64 * 1024 * 1024; // 64M + compressionBlockSize = 64 * 1024; // 64K + rowIndexStride = 10000; + compression = CompressionKind_ZLIB; + compressionStrategy = CompressionStrategy_SPEED; + memoryPool = getDefaultPool(); + paddingTolerance = 0.0; + errorStream = &std::cerr; + dictionaryKeySizeThreshold = 0.0; + enableIndex = true; + bloomFilterFalsePositiveProb = 0.05; + bloomFilterVersion = UTF8; + } + }; + + WriterOptions::WriterOptions(): + privateBits(std::unique_ptr<WriterOptionsPrivate> + (new WriterOptionsPrivate())) { + // PASS + } + + WriterOptions::WriterOptions(const WriterOptions& rhs): + privateBits(std::unique_ptr<WriterOptionsPrivate> + (new WriterOptionsPrivate(*(rhs.privateBits.get())))) { + // PASS + } + + WriterOptions::WriterOptions(WriterOptions& rhs) { + // swap privateBits with rhs + WriterOptionsPrivate* l = privateBits.release(); + privateBits.reset(rhs.privateBits.release()); + rhs.privateBits.reset(l); + } + + WriterOptions& WriterOptions::operator=(const WriterOptions& rhs) { + if (this != &rhs) { + privateBits.reset(new WriterOptionsPrivate(*(rhs.privateBits.get()))); + } + return *this; + } + + WriterOptions::~WriterOptions() { + // PASS + } + RleVersion WriterOptions::getRleVersion() const { + if(privateBits->fileVersion == FileVersion::v_0_11()) + { + return RleVersion_1; + } + + 
return RleVersion_2; + } + + WriterOptions& WriterOptions::setStripeSize(uint64_t size) { + privateBits->stripeSize = size; + return *this; + } + + uint64_t WriterOptions::getStripeSize() const { + return privateBits->stripeSize; + } + + WriterOptions& WriterOptions::setCompressionBlockSize(uint64_t size) { + privateBits->compressionBlockSize = size; + return *this; + } + + uint64_t WriterOptions::getCompressionBlockSize() const { + return privateBits->compressionBlockSize; + } + + WriterOptions& WriterOptions::setRowIndexStride(uint64_t stride) { + privateBits->rowIndexStride = stride; + privateBits->enableIndex = (stride != 0); + return *this; + } + + uint64_t WriterOptions::getRowIndexStride() const { + return privateBits->rowIndexStride; + } + + WriterOptions& WriterOptions::setDictionaryKeySizeThreshold(double val) { + privateBits->dictionaryKeySizeThreshold = val; + return *this; + } + + double WriterOptions::getDictionaryKeySizeThreshold() const { + return privateBits->dictionaryKeySizeThreshold; + } + + WriterOptions& WriterOptions::setFileVersion(const FileVersion& version) { + // Only Hive_0_11 and Hive_0_12 version are supported currently + if (version.getMajor() == 0 && (version.getMinor() == 11 || version.getMinor() == 12)) { + privateBits->fileVersion = version; + return *this; + } + throw std::logic_error("Unsupported file version specified."); + } + + FileVersion WriterOptions::getFileVersion() const { + return privateBits->fileVersion; + } + + WriterOptions& WriterOptions::setCompression(CompressionKind comp) { + privateBits->compression = comp; + return *this; + } + + CompressionKind WriterOptions::getCompression() const { + return privateBits->compression; + } + + WriterOptions& WriterOptions::setCompressionStrategy( + CompressionStrategy strategy) { + privateBits->compressionStrategy = strategy; + return *this; + } + + CompressionStrategy WriterOptions::getCompressionStrategy() const { + return privateBits->compressionStrategy; + } + + bool WriterOptions::getAlignedBitpacking() const { + return privateBits->compressionStrategy == CompressionStrategy ::CompressionStrategy_SPEED; + } + + WriterOptions& WriterOptions::setPaddingTolerance(double tolerance) { + privateBits->paddingTolerance = tolerance; + return *this; + } + + double WriterOptions::getPaddingTolerance() const { + return privateBits->paddingTolerance; + } + + WriterOptions& WriterOptions::setMemoryPool(MemoryPool* memoryPool) { + privateBits->memoryPool = memoryPool; + return *this; + } + + MemoryPool* WriterOptions::getMemoryPool() const { + return privateBits->memoryPool; + } + + WriterOptions& WriterOptions::setErrorStream(std::ostream& errStream) { + privateBits->errorStream = &errStream; + return *this; + } + + std::ostream* WriterOptions::getErrorStream() const { + return privateBits->errorStream; + } + + bool WriterOptions::getEnableIndex() const { + return privateBits->enableIndex; + } + + bool WriterOptions::getEnableDictionary() const { + return privateBits->dictionaryKeySizeThreshold > 0.0; + } + + WriterOptions& WriterOptions::setColumnsUseBloomFilter( + const std::set<uint64_t>& columns) { + privateBits->columnsUseBloomFilter = columns; + return *this; + } + + bool WriterOptions::isColumnUseBloomFilter(uint64_t column) const { + return privateBits->columnsUseBloomFilter.find(column) != + privateBits->columnsUseBloomFilter.end(); + } + + WriterOptions& WriterOptions::setBloomFilterFPP(double fpp) { + privateBits->bloomFilterFalsePositiveProb = fpp; + return *this; + } + + double 
WriterOptions::getBloomFilterFPP() const { + return privateBits->bloomFilterFalsePositiveProb; + } + + // delibrately not provide setter to write bloom filter version because + // we only support UTF8 for now. + BloomFilterVersion WriterOptions::getBloomFilterVersion() const { + return privateBits->bloomFilterVersion; + } + + Writer::~Writer() { + // PASS + } + + class WriterImpl : public Writer { + private: + std::unique_ptr<ColumnWriter> columnWriter; + std::unique_ptr<BufferedOutputStream> compressionStream; + std::unique_ptr<BufferedOutputStream> bufferedStream; + std::unique_ptr<StreamsFactory> streamsFactory; + OutputStream* outStream; + WriterOptions options; + const Type& type; + uint64_t stripeRows, totalRows, indexRows; + uint64_t currentOffset; + proto::Footer fileFooter; + proto::PostScript postScript; + proto::StripeInformation stripeInfo; + proto::Metadata metadata; + + static const char* magicId; + static const WriterId writerId; + + public: + WriterImpl( + const Type& type, + OutputStream* stream, + const WriterOptions& options); + + std::unique_ptr<ColumnVectorBatch> createRowBatch(uint64_t size) + const override; + + void add(ColumnVectorBatch& rowsToAdd) override; + + void close() override; + + void addUserMetadata(const std::string name, const std::string value) override; + + private: + void init(); + void initStripe(); + void writeStripe(); + void writeMetadata(); + void writeFileFooter(); + void writePostscript(); + void buildFooterType(const Type& t, proto::Footer& footer, uint32_t& index); + static proto::CompressionKind convertCompressionKind( + const CompressionKind& kind); + }; + + const char * WriterImpl::magicId = "ORC"; + + const WriterId WriterImpl::writerId = WriterId::ORC_CPP_WRITER; + + WriterImpl::WriterImpl( + const Type& t, + OutputStream* stream, + const WriterOptions& opts) : + outStream(stream), + options(opts), + type(t) { + streamsFactory = createStreamsFactory(options, outStream); + columnWriter = buildWriter(type, *streamsFactory, options); + stripeRows = totalRows = indexRows = 0; + currentOffset = 0; + + // compression stream for stripe footer, file footer and metadata + compressionStream = createCompressor( + options.getCompression(), + outStream, + options.getCompressionStrategy(), + 1 * 1024 * 1024, // buffer capacity: 1M + options.getCompressionBlockSize(), + *options.getMemoryPool()); + + // uncompressed stream for post script + bufferedStream.reset(new BufferedOutputStream( + *options.getMemoryPool(), + outStream, + 1024, // buffer capacity: 1024 bytes + options.getCompressionBlockSize())); + + init(); + } + + std::unique_ptr<ColumnVectorBatch> WriterImpl::createRowBatch(uint64_t size) + const { + return type.createRowBatch(size, *options.getMemoryPool()); + } + + void WriterImpl::add(ColumnVectorBatch& rowsToAdd) { + if (options.getEnableIndex()) { + uint64_t pos = 0; + uint64_t chunkSize = 0; + uint64_t rowIndexStride = options.getRowIndexStride(); + while (pos < rowsToAdd.numElements) { + chunkSize = std::min(rowsToAdd.numElements - pos, + rowIndexStride - indexRows); + columnWriter->add(rowsToAdd, pos, chunkSize, nullptr); + + pos += chunkSize; + indexRows += chunkSize; + stripeRows += chunkSize; + + if (indexRows >= rowIndexStride) { + columnWriter->createRowIndexEntry(); + indexRows = 0; + } + } + } else { + stripeRows += rowsToAdd.numElements; + columnWriter->add(rowsToAdd, 0, rowsToAdd.numElements, nullptr); + } + + if (columnWriter->getEstimatedSize() >= options.getStripeSize()) { + writeStripe(); + } + } + + void 
WriterImpl::close() { + if (stripeRows > 0) { + writeStripe(); + } + writeMetadata(); + writeFileFooter(); + writePostscript(); + outStream->close(); + } + + void WriterImpl::addUserMetadata(const std::string name, const std::string value){ + proto::UserMetadataItem* userMetadataItem = fileFooter.add_metadata(); + userMetadataItem->set_name(TString(name)); + userMetadataItem->set_value(TString(value)); + } + + void WriterImpl::init() { + // Write file header + const static size_t magicIdLength = strlen(WriterImpl::magicId); + outStream->write(WriterImpl::magicId, magicIdLength); + currentOffset += magicIdLength; + + // Initialize file footer + fileFooter.set_headerlength(currentOffset); + fileFooter.set_contentlength(0); + fileFooter.set_numberofrows(0); + fileFooter.set_rowindexstride( + static_cast<uint32_t>(options.getRowIndexStride())); + fileFooter.set_writer(writerId); fileFooter.set_softwareversion(ORC_VERSION); - - uint32_t index = 0; - buildFooterType(type, fileFooter, index); - - // Initialize post script - postScript.set_footerlength(0); - postScript.set_compression( - WriterImpl::convertCompressionKind(options.getCompression())); - postScript.set_compressionblocksize(options.getCompressionBlockSize()); - - postScript.add_version(options.getFileVersion().getMajor()); - postScript.add_version(options.getFileVersion().getMinor()); - - postScript.set_writerversion(WriterVersion_ORC_135); - postScript.set_magic("ORC"); - - // Initialize first stripe - initStripe(); - } - - void WriterImpl::initStripe() { - stripeInfo.set_offset(currentOffset); - stripeInfo.set_indexlength(0); - stripeInfo.set_datalength(0); - stripeInfo.set_footerlength(0); - stripeInfo.set_numberofrows(0); - - stripeRows = indexRows = 0; - } - - void WriterImpl::writeStripe() { - if (options.getEnableIndex() && indexRows != 0) { - columnWriter->createRowIndexEntry(); - indexRows = 0; - } else { - columnWriter->mergeRowGroupStatsIntoStripeStats(); - } - - // dictionary should be written before any stream is flushed - columnWriter->writeDictionary(); - - std::vector<proto::Stream> streams; - // write ROW_INDEX streams - if (options.getEnableIndex()) { - columnWriter->writeIndex(streams); - } - // write streams like PRESENT, DATA, etc. 
- columnWriter->flush(streams); - - // generate and write stripe footer - proto::StripeFooter stripeFooter; - for (uint32_t i = 0; i < streams.size(); ++i) { - *stripeFooter.add_streams() = streams[i]; - } - - std::vector<proto::ColumnEncoding> encodings; - columnWriter->getColumnEncoding(encodings); - - for (uint32_t i = 0; i < encodings.size(); ++i) { - *stripeFooter.add_columns() = encodings[i]; - } - - // use GMT to guarantee TimestampVectorBatch from reader can write - // same wall clock time - stripeFooter.set_writertimezone("GMT"); - - // add stripe statistics to metadata - proto::StripeStatistics* stripeStats = metadata.add_stripestats(); - std::vector<proto::ColumnStatistics> colStats; - columnWriter->getStripeStatistics(colStats); - for (uint32_t i = 0; i != colStats.size(); ++i) { - *stripeStats->add_colstats() = colStats[i]; - } - // merge stripe stats into file stats and clear stripe stats - columnWriter->mergeStripeStatsIntoFileStats(); - - if (!stripeFooter.SerializeToZeroCopyStream(compressionStream.get())) { - throw std::logic_error("Failed to write stripe footer."); - } - uint64_t footerLength = compressionStream->flush(); - - // calculate data length and index length - uint64_t dataLength = 0; - uint64_t indexLength = 0; - for (uint32_t i = 0; i < streams.size(); ++i) { - if (streams[i].kind() == proto::Stream_Kind_ROW_INDEX || - streams[i].kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8) { - indexLength += streams[i].length(); - } else { - dataLength += streams[i].length(); - } - } - - // update stripe info - stripeInfo.set_indexlength(indexLength); - stripeInfo.set_datalength(dataLength); - stripeInfo.set_footerlength(footerLength); - stripeInfo.set_numberofrows(stripeRows); - - *fileFooter.add_stripes() = stripeInfo; - - currentOffset = currentOffset + indexLength + dataLength + footerLength; - totalRows += stripeRows; - - columnWriter->reset(); - - initStripe(); - } - - void WriterImpl::writeMetadata() { - if (!metadata.SerializeToZeroCopyStream(compressionStream.get())) { - throw std::logic_error("Failed to write metadata."); - } - postScript.set_metadatalength(compressionStream.get()->flush()); - } - - void WriterImpl::writeFileFooter() { - fileFooter.set_contentlength(currentOffset - fileFooter.headerlength()); - fileFooter.set_numberofrows(totalRows); - - // update file statistics - std::vector<proto::ColumnStatistics> colStats; - columnWriter->getFileStatistics(colStats); - for (uint32_t i = 0; i != colStats.size(); ++i) { - *fileFooter.add_statistics() = colStats[i]; - } - - if (!fileFooter.SerializeToZeroCopyStream(compressionStream.get())) { - throw std::logic_error("Failed to write file footer."); - } - postScript.set_footerlength(compressionStream->flush()); - } - - void WriterImpl::writePostscript() { - if (!postScript.SerializeToZeroCopyStream(bufferedStream.get())) { - throw std::logic_error("Failed to write post script."); - } - unsigned char psLength = - static_cast<unsigned char>(bufferedStream->flush()); - outStream->write(&psLength, sizeof(unsigned char)); - } - - void WriterImpl::buildFooterType( - const Type& t, - proto::Footer& footer, - uint32_t & index) { - proto::Type protoType; - protoType.set_maximumlength(static_cast<uint32_t>(t.getMaximumLength())); - protoType.set_precision(static_cast<uint32_t>(t.getPrecision())); - protoType.set_scale(static_cast<uint32_t>(t.getScale())); - - switch (t.getKind()) { - case BOOLEAN: { - protoType.set_kind(proto::Type_Kind_BOOLEAN); - break; - } - case BYTE: { - protoType.set_kind(proto::Type_Kind_BYTE); - 
break; - } - case SHORT: { - protoType.set_kind(proto::Type_Kind_SHORT); - break; - } - case INT: { - protoType.set_kind(proto::Type_Kind_INT); - break; - } - case LONG: { - protoType.set_kind(proto::Type_Kind_LONG); - break; - } - case FLOAT: { - protoType.set_kind(proto::Type_Kind_FLOAT); - break; - } - case DOUBLE: { - protoType.set_kind(proto::Type_Kind_DOUBLE); - break; - } - case STRING: { - protoType.set_kind(proto::Type_Kind_STRING); - break; - } - case BINARY: { - protoType.set_kind(proto::Type_Kind_BINARY); - break; - } - case TIMESTAMP: { - protoType.set_kind(proto::Type_Kind_TIMESTAMP); - break; - } - case LIST: { - protoType.set_kind(proto::Type_Kind_LIST); - break; - } - case MAP: { - protoType.set_kind(proto::Type_Kind_MAP); - break; - } - case STRUCT: { - protoType.set_kind(proto::Type_Kind_STRUCT); - break; - } - case UNION: { - protoType.set_kind(proto::Type_Kind_UNION); - break; - } - case DECIMAL: { - protoType.set_kind(proto::Type_Kind_DECIMAL); - break; - } - case DATE: { - protoType.set_kind(proto::Type_Kind_DATE); - break; - } - case VARCHAR: { - protoType.set_kind(proto::Type_Kind_VARCHAR); - break; - } - case CHAR: { - protoType.set_kind(proto::Type_Kind_CHAR); - break; - } - default: - throw std::logic_error("Unknown type."); - } - - int pos = static_cast<int>(index); - *footer.add_types() = protoType; - - for (uint64_t i = 0; i < t.getSubtypeCount(); ++i) { - // only add subtypes' field names if this type is STRUCT - if (t.getKind() == STRUCT) { - footer.mutable_types(pos)->add_fieldnames(TString(t.getFieldName(i))); - } - footer.mutable_types(pos)->add_subtypes(++index); - buildFooterType(*t.getSubtype(i), footer, index); - } - } - - proto::CompressionKind WriterImpl::convertCompressionKind( - const CompressionKind& kind) { - return static_cast<proto::CompressionKind>(kind); - } - - std::unique_ptr<Writer> createWriter( - const Type& type, - OutputStream* stream, - const WriterOptions& options) { - return std::unique_ptr<Writer>( - new WriterImpl( - type, - stream, - options)); - } - -} - + + uint32_t index = 0; + buildFooterType(type, fileFooter, index); + + // Initialize post script + postScript.set_footerlength(0); + postScript.set_compression( + WriterImpl::convertCompressionKind(options.getCompression())); + postScript.set_compressionblocksize(options.getCompressionBlockSize()); + + postScript.add_version(options.getFileVersion().getMajor()); + postScript.add_version(options.getFileVersion().getMinor()); + + postScript.set_writerversion(WriterVersion_ORC_135); + postScript.set_magic("ORC"); + + // Initialize first stripe + initStripe(); + } + + void WriterImpl::initStripe() { + stripeInfo.set_offset(currentOffset); + stripeInfo.set_indexlength(0); + stripeInfo.set_datalength(0); + stripeInfo.set_footerlength(0); + stripeInfo.set_numberofrows(0); + + stripeRows = indexRows = 0; + } + + void WriterImpl::writeStripe() { + if (options.getEnableIndex() && indexRows != 0) { + columnWriter->createRowIndexEntry(); + indexRows = 0; + } else { + columnWriter->mergeRowGroupStatsIntoStripeStats(); + } + + // dictionary should be written before any stream is flushed + columnWriter->writeDictionary(); + + std::vector<proto::Stream> streams; + // write ROW_INDEX streams + if (options.getEnableIndex()) { + columnWriter->writeIndex(streams); + } + // write streams like PRESENT, DATA, etc. 
+ columnWriter->flush(streams); + + // generate and write stripe footer + proto::StripeFooter stripeFooter; + for (uint32_t i = 0; i < streams.size(); ++i) { + *stripeFooter.add_streams() = streams[i]; + } + + std::vector<proto::ColumnEncoding> encodings; + columnWriter->getColumnEncoding(encodings); + + for (uint32_t i = 0; i < encodings.size(); ++i) { + *stripeFooter.add_columns() = encodings[i]; + } + + // use GMT to guarantee TimestampVectorBatch from reader can write + // same wall clock time + stripeFooter.set_writertimezone("GMT"); + + // add stripe statistics to metadata + proto::StripeStatistics* stripeStats = metadata.add_stripestats(); + std::vector<proto::ColumnStatistics> colStats; + columnWriter->getStripeStatistics(colStats); + for (uint32_t i = 0; i != colStats.size(); ++i) { + *stripeStats->add_colstats() = colStats[i]; + } + // merge stripe stats into file stats and clear stripe stats + columnWriter->mergeStripeStatsIntoFileStats(); + + if (!stripeFooter.SerializeToZeroCopyStream(compressionStream.get())) { + throw std::logic_error("Failed to write stripe footer."); + } + uint64_t footerLength = compressionStream->flush(); + + // calculate data length and index length + uint64_t dataLength = 0; + uint64_t indexLength = 0; + for (uint32_t i = 0; i < streams.size(); ++i) { + if (streams[i].kind() == proto::Stream_Kind_ROW_INDEX || + streams[i].kind() == proto::Stream_Kind_BLOOM_FILTER_UTF8) { + indexLength += streams[i].length(); + } else { + dataLength += streams[i].length(); + } + } + + // update stripe info + stripeInfo.set_indexlength(indexLength); + stripeInfo.set_datalength(dataLength); + stripeInfo.set_footerlength(footerLength); + stripeInfo.set_numberofrows(stripeRows); + + *fileFooter.add_stripes() = stripeInfo; + + currentOffset = currentOffset + indexLength + dataLength + footerLength; + totalRows += stripeRows; + + columnWriter->reset(); + + initStripe(); + } + + void WriterImpl::writeMetadata() { + if (!metadata.SerializeToZeroCopyStream(compressionStream.get())) { + throw std::logic_error("Failed to write metadata."); + } + postScript.set_metadatalength(compressionStream.get()->flush()); + } + + void WriterImpl::writeFileFooter() { + fileFooter.set_contentlength(currentOffset - fileFooter.headerlength()); + fileFooter.set_numberofrows(totalRows); + + // update file statistics + std::vector<proto::ColumnStatistics> colStats; + columnWriter->getFileStatistics(colStats); + for (uint32_t i = 0; i != colStats.size(); ++i) { + *fileFooter.add_statistics() = colStats[i]; + } + + if (!fileFooter.SerializeToZeroCopyStream(compressionStream.get())) { + throw std::logic_error("Failed to write file footer."); + } + postScript.set_footerlength(compressionStream->flush()); + } + + void WriterImpl::writePostscript() { + if (!postScript.SerializeToZeroCopyStream(bufferedStream.get())) { + throw std::logic_error("Failed to write post script."); + } + unsigned char psLength = + static_cast<unsigned char>(bufferedStream->flush()); + outStream->write(&psLength, sizeof(unsigned char)); + } + + void WriterImpl::buildFooterType( + const Type& t, + proto::Footer& footer, + uint32_t & index) { + proto::Type protoType; + protoType.set_maximumlength(static_cast<uint32_t>(t.getMaximumLength())); + protoType.set_precision(static_cast<uint32_t>(t.getPrecision())); + protoType.set_scale(static_cast<uint32_t>(t.getScale())); + + switch (t.getKind()) { + case BOOLEAN: { + protoType.set_kind(proto::Type_Kind_BOOLEAN); + break; + } + case BYTE: { + protoType.set_kind(proto::Type_Kind_BYTE); + 
break; + } + case SHORT: { + protoType.set_kind(proto::Type_Kind_SHORT); + break; + } + case INT: { + protoType.set_kind(proto::Type_Kind_INT); + break; + } + case LONG: { + protoType.set_kind(proto::Type_Kind_LONG); + break; + } + case FLOAT: { + protoType.set_kind(proto::Type_Kind_FLOAT); + break; + } + case DOUBLE: { + protoType.set_kind(proto::Type_Kind_DOUBLE); + break; + } + case STRING: { + protoType.set_kind(proto::Type_Kind_STRING); + break; + } + case BINARY: { + protoType.set_kind(proto::Type_Kind_BINARY); + break; + } + case TIMESTAMP: { + protoType.set_kind(proto::Type_Kind_TIMESTAMP); + break; + } + case LIST: { + protoType.set_kind(proto::Type_Kind_LIST); + break; + } + case MAP: { + protoType.set_kind(proto::Type_Kind_MAP); + break; + } + case STRUCT: { + protoType.set_kind(proto::Type_Kind_STRUCT); + break; + } + case UNION: { + protoType.set_kind(proto::Type_Kind_UNION); + break; + } + case DECIMAL: { + protoType.set_kind(proto::Type_Kind_DECIMAL); + break; + } + case DATE: { + protoType.set_kind(proto::Type_Kind_DATE); + break; + } + case VARCHAR: { + protoType.set_kind(proto::Type_Kind_VARCHAR); + break; + } + case CHAR: { + protoType.set_kind(proto::Type_Kind_CHAR); + break; + } + default: + throw std::logic_error("Unknown type."); + } + + int pos = static_cast<int>(index); + *footer.add_types() = protoType; + + for (uint64_t i = 0; i < t.getSubtypeCount(); ++i) { + // only add subtypes' field names if this type is STRUCT + if (t.getKind() == STRUCT) { + footer.mutable_types(pos)->add_fieldnames(TString(t.getFieldName(i))); + } + footer.mutable_types(pos)->add_subtypes(++index); + buildFooterType(*t.getSubtype(i), footer, index); + } + } + + proto::CompressionKind WriterImpl::convertCompressionKind( + const CompressionKind& kind) { + return static_cast<proto::CompressionKind>(kind); + } + + std::unique_ptr<Writer> createWriter( + const Type& type, + OutputStream* stream, + const WriterOptions& options) { + return std::unique_ptr<Writer>( + new WriterImpl( + type, + stream, + options)); + } + +} + diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.cc b/contrib/libs/apache/orc/c++/src/io/InputStream.cc index 6e54b1412f..201f6f9c1d 100644 --- a/contrib/libs/apache/orc/c++/src/io/InputStream.cc +++ b/contrib/libs/apache/orc/c++/src/io/InputStream.cc @@ -1,222 +1,222 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "orc/Exceptions.hh" -#include "InputStream.hh" - -#include <algorithm> -#include <iomanip> - -namespace orc { - - void printBuffer(std::ostream& out, - const char *buffer, - uint64_t length) { - const uint64_t width = 24; - out << std::hex; - for(uint64_t line = 0; line < (length + width - 1) / width; ++line) { - out << std::setfill('0') << std::setw(7) << (line * width); - for(uint64_t byte = 0; - byte < width && line * width + byte < length; ++byte) { - out << " " << std::setfill('0') << std::setw(2) - << static_cast<uint64_t>(0xff & buffer[line * width + - byte]); - } - out << "\n"; - } - out << std::dec; - } - - PositionProvider::PositionProvider(const std::list<uint64_t>& posns) { - position = posns.begin(); - } - - uint64_t PositionProvider::next() { - uint64_t result = *position; - ++position; - return result; - } - - SeekableInputStream::~SeekableInputStream() { - // PASS - } - - SeekableArrayInputStream::~SeekableArrayInputStream() { - // PASS - } - - SeekableArrayInputStream::SeekableArrayInputStream - (const unsigned char* values, - uint64_t size, - uint64_t blkSize - ): data(reinterpret_cast<const char*>(values)) { - length = size; - position = 0; - blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize); - } - - SeekableArrayInputStream::SeekableArrayInputStream(const char* values, - uint64_t size, - uint64_t blkSize - ): data(values) { - length = size; - position = 0; - blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize); - } - - bool SeekableArrayInputStream::Next(const void** buffer, int*size) { - uint64_t currentSize = std::min(length - position, blockSize); - if (currentSize > 0) { - *buffer = data + position; - *size = static_cast<int>(currentSize); - position += currentSize; - return true; - } - *size = 0; - return false; - } - - void SeekableArrayInputStream::BackUp(int count) { - if (count >= 0) { - uint64_t unsignedCount = static_cast<uint64_t>(count); - if (unsignedCount <= blockSize && unsignedCount <= position) { - position -= unsignedCount; - } else { - throw std::logic_error("Can't backup that much!"); - } - } - } - - bool SeekableArrayInputStream::Skip(int count) { - if (count >= 0) { - uint64_t unsignedCount = static_cast<uint64_t>(count); - if (unsignedCount + position <= length) { - position += unsignedCount; - return true; - } else { - position = length; - } - } - return false; - } - +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Exceptions.hh" +#include "InputStream.hh" + +#include <algorithm> +#include <iomanip> + +namespace orc { + + void printBuffer(std::ostream& out, + const char *buffer, + uint64_t length) { + const uint64_t width = 24; + out << std::hex; + for(uint64_t line = 0; line < (length + width - 1) / width; ++line) { + out << std::setfill('0') << std::setw(7) << (line * width); + for(uint64_t byte = 0; + byte < width && line * width + byte < length; ++byte) { + out << " " << std::setfill('0') << std::setw(2) + << static_cast<uint64_t>(0xff & buffer[line * width + + byte]); + } + out << "\n"; + } + out << std::dec; + } + + PositionProvider::PositionProvider(const std::list<uint64_t>& posns) { + position = posns.begin(); + } + + uint64_t PositionProvider::next() { + uint64_t result = *position; + ++position; + return result; + } + + SeekableInputStream::~SeekableInputStream() { + // PASS + } + + SeekableArrayInputStream::~SeekableArrayInputStream() { + // PASS + } + + SeekableArrayInputStream::SeekableArrayInputStream + (const unsigned char* values, + uint64_t size, + uint64_t blkSize + ): data(reinterpret_cast<const char*>(values)) { + length = size; + position = 0; + blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize); + } + + SeekableArrayInputStream::SeekableArrayInputStream(const char* values, + uint64_t size, + uint64_t blkSize + ): data(values) { + length = size; + position = 0; + blockSize = blkSize == 0 ? length : static_cast<uint64_t>(blkSize); + } + + bool SeekableArrayInputStream::Next(const void** buffer, int*size) { + uint64_t currentSize = std::min(length - position, blockSize); + if (currentSize > 0) { + *buffer = data + position; + *size = static_cast<int>(currentSize); + position += currentSize; + return true; + } + *size = 0; + return false; + } + + void SeekableArrayInputStream::BackUp(int count) { + if (count >= 0) { + uint64_t unsignedCount = static_cast<uint64_t>(count); + if (unsignedCount <= blockSize && unsignedCount <= position) { + position -= unsignedCount; + } else { + throw std::logic_error("Can't backup that much!"); + } + } + } + + bool SeekableArrayInputStream::Skip(int count) { + if (count >= 0) { + uint64_t unsignedCount = static_cast<uint64_t>(count); + if (unsignedCount + position <= length) { + position += unsignedCount; + return true; + } else { + position = length; + } + } + return false; + } + int64_t SeekableArrayInputStream::ByteCount() const { - return static_cast<google::protobuf::int64>(position); - } - - void SeekableArrayInputStream::seek(PositionProvider& seekPosition) { - position = seekPosition.next(); - } - - std::string SeekableArrayInputStream::getName() const { - std::ostringstream result; - result << "SeekableArrayInputStream " << position << " of " << length; - return result.str(); - } - - static uint64_t computeBlock(uint64_t request, uint64_t length) { - return std::min(length, request == 0 ? 
256 * 1024 : request); - } - - SeekableFileInputStream::SeekableFileInputStream(InputStream* stream, - uint64_t offset, - uint64_t byteCount, - MemoryPool& _pool, - uint64_t _blockSize - ):pool(_pool), - input(stream), - start(offset), - length(byteCount), - blockSize(computeBlock - (_blockSize, - length)) { - - position = 0; - buffer.reset(new DataBuffer<char>(pool)); - pushBack = 0; - } - - SeekableFileInputStream::~SeekableFileInputStream() { - // PASS - } - - bool SeekableFileInputStream::Next(const void** data, int*size) { - uint64_t bytesRead; - if (pushBack != 0) { - *data = buffer->data() + (buffer->size() - pushBack); - bytesRead = pushBack; - } else { - bytesRead = std::min(length - position, blockSize); - buffer->resize(bytesRead); - if (bytesRead > 0) { - input->read(buffer->data(), bytesRead, start+position); - *data = static_cast<void*>(buffer->data()); - } - } - position += bytesRead; - pushBack = 0; - *size = static_cast<int>(bytesRead); - return bytesRead != 0; - } - - void SeekableFileInputStream::BackUp(int signedCount) { - if (signedCount < 0) { - throw std::logic_error("can't backup negative distances"); - } - uint64_t count = static_cast<uint64_t>(signedCount); - if (pushBack > 0) { - throw std::logic_error("can't backup unless we just called Next"); - } - if (count > blockSize || count > position) { - throw std::logic_error("can't backup that far"); - } - pushBack = static_cast<uint64_t>(count); - position -= pushBack; - } - - bool SeekableFileInputStream::Skip(int signedCount) { - if (signedCount < 0) { - return false; - } - uint64_t count = static_cast<uint64_t>(signedCount); - position = std::min(position + count, length); - pushBack = 0; - return position < length; - } - - int64_t SeekableFileInputStream::ByteCount() const { - return static_cast<int64_t>(position); - } - - void SeekableFileInputStream::seek(PositionProvider& location) { - position = location.next(); - if (position > length) { - position = length; - throw std::logic_error("seek too far"); - } - pushBack = 0; - } - - std::string SeekableFileInputStream::getName() const { - std::ostringstream result; - result << input->getName() << " from " << start << " for " - << length; - return result.str(); - } - -} + return static_cast<google::protobuf::int64>(position); + } + + void SeekableArrayInputStream::seek(PositionProvider& seekPosition) { + position = seekPosition.next(); + } + + std::string SeekableArrayInputStream::getName() const { + std::ostringstream result; + result << "SeekableArrayInputStream " << position << " of " << length; + return result.str(); + } + + static uint64_t computeBlock(uint64_t request, uint64_t length) { + return std::min(length, request == 0 ? 
256 * 1024 : request); + } + + SeekableFileInputStream::SeekableFileInputStream(InputStream* stream, + uint64_t offset, + uint64_t byteCount, + MemoryPool& _pool, + uint64_t _blockSize + ):pool(_pool), + input(stream), + start(offset), + length(byteCount), + blockSize(computeBlock + (_blockSize, + length)) { + + position = 0; + buffer.reset(new DataBuffer<char>(pool)); + pushBack = 0; + } + + SeekableFileInputStream::~SeekableFileInputStream() { + // PASS + } + + bool SeekableFileInputStream::Next(const void** data, int*size) { + uint64_t bytesRead; + if (pushBack != 0) { + *data = buffer->data() + (buffer->size() - pushBack); + bytesRead = pushBack; + } else { + bytesRead = std::min(length - position, blockSize); + buffer->resize(bytesRead); + if (bytesRead > 0) { + input->read(buffer->data(), bytesRead, start+position); + *data = static_cast<void*>(buffer->data()); + } + } + position += bytesRead; + pushBack = 0; + *size = static_cast<int>(bytesRead); + return bytesRead != 0; + } + + void SeekableFileInputStream::BackUp(int signedCount) { + if (signedCount < 0) { + throw std::logic_error("can't backup negative distances"); + } + uint64_t count = static_cast<uint64_t>(signedCount); + if (pushBack > 0) { + throw std::logic_error("can't backup unless we just called Next"); + } + if (count > blockSize || count > position) { + throw std::logic_error("can't backup that far"); + } + pushBack = static_cast<uint64_t>(count); + position -= pushBack; + } + + bool SeekableFileInputStream::Skip(int signedCount) { + if (signedCount < 0) { + return false; + } + uint64_t count = static_cast<uint64_t>(signedCount); + position = std::min(position + count, length); + pushBack = 0; + return position < length; + } + + int64_t SeekableFileInputStream::ByteCount() const { + return static_cast<int64_t>(position); + } + + void SeekableFileInputStream::seek(PositionProvider& location) { + position = location.next(); + if (position > length) { + position = length; + throw std::logic_error("seek too far"); + } + pushBack = 0; + } + + std::string SeekableFileInputStream::getName() const { + std::ostringstream result; + result << input->getName() << " from " << start << " for " + << length; + return result.str(); + } + +} diff --git a/contrib/libs/apache/orc/c++/src/io/InputStream.hh b/contrib/libs/apache/orc/c++/src/io/InputStream.hh index d8bd3d4d8c..797049a300 100644 --- a/contrib/libs/apache/orc/c++/src/io/InputStream.hh +++ b/contrib/libs/apache/orc/c++/src/io/InputStream.hh @@ -1,116 +1,116 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_INPUTSTREAM_HH -#define ORC_INPUTSTREAM_HH - -#include "Adaptor.hh" -#include "orc/OrcFile.hh" -#include "wrap/zero-copy-stream-wrapper.h" - -#include <list> -#include <fstream> -#include <iostream> -#include <sstream> -#include <vector> - -namespace orc { - - void printBuffer(std::ostream& out, - const char *buffer, - uint64_t length); - - class PositionProvider { - private: - std::list<uint64_t>::const_iterator position; - public: - PositionProvider(const std::list<uint64_t>& positions); - uint64_t next(); - }; - - /** - * A subclass of Google's ZeroCopyInputStream that supports seek. - * By extending Google's class, we get the ability to pass it directly - * to the protobuf readers. - */ - class SeekableInputStream: public google::protobuf::io::ZeroCopyInputStream { - public: - virtual ~SeekableInputStream(); - virtual void seek(PositionProvider& position) = 0; - virtual std::string getName() const = 0; - }; - - /** - * Create a seekable input stream based on a memory range. - */ - class SeekableArrayInputStream: public SeekableInputStream { - private: - const char* data; - uint64_t length; - uint64_t position; - uint64_t blockSize; - - public: - SeekableArrayInputStream(const unsigned char* list, - uint64_t length, - uint64_t block_size = 0); - SeekableArrayInputStream(const char* list, - uint64_t length, - uint64_t block_size = 0); - virtual ~SeekableArrayInputStream() override; - virtual bool Next(const void** data, int*size) override; - virtual void BackUp(int count) override; - virtual bool Skip(int count) override; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_INPUTSTREAM_HH +#define ORC_INPUTSTREAM_HH + +#include "Adaptor.hh" +#include "orc/OrcFile.hh" +#include "wrap/zero-copy-stream-wrapper.h" + +#include <list> +#include <fstream> +#include <iostream> +#include <sstream> +#include <vector> + +namespace orc { + + void printBuffer(std::ostream& out, + const char *buffer, + uint64_t length); + + class PositionProvider { + private: + std::list<uint64_t>::const_iterator position; + public: + PositionProvider(const std::list<uint64_t>& positions); + uint64_t next(); + }; + + /** + * A subclass of Google's ZeroCopyInputStream that supports seek. + * By extending Google's class, we get the ability to pass it directly + * to the protobuf readers. + */ + class SeekableInputStream: public google::protobuf::io::ZeroCopyInputStream { + public: + virtual ~SeekableInputStream(); + virtual void seek(PositionProvider& position) = 0; + virtual std::string getName() const = 0; + }; + + /** + * Create a seekable input stream based on a memory range. 
+ */ + class SeekableArrayInputStream: public SeekableInputStream { + private: + const char* data; + uint64_t length; + uint64_t position; + uint64_t blockSize; + + public: + SeekableArrayInputStream(const unsigned char* list, + uint64_t length, + uint64_t block_size = 0); + SeekableArrayInputStream(const char* list, + uint64_t length, + uint64_t block_size = 0); + virtual ~SeekableArrayInputStream() override; + virtual bool Next(const void** data, int*size) override; + virtual void BackUp(int count) override; + virtual bool Skip(int count) override; virtual int64_t ByteCount() const override; - virtual void seek(PositionProvider& position) override; - virtual std::string getName() const override; - }; - - /** - * Create a seekable input stream based on an input stream. - */ - class SeekableFileInputStream: public SeekableInputStream { - private: - MemoryPool& pool; - InputStream* const input; - const uint64_t start; - const uint64_t length; - const uint64_t blockSize; - std::unique_ptr<DataBuffer<char> > buffer; - uint64_t position; - uint64_t pushBack; - - public: - SeekableFileInputStream(InputStream* input, - uint64_t offset, - uint64_t byteCount, - MemoryPool& pool, - uint64_t blockSize = 0); - virtual ~SeekableFileInputStream() override; - - virtual bool Next(const void** data, int*size) override; - virtual void BackUp(int count) override; - virtual bool Skip(int count) override; - virtual int64_t ByteCount() const override; - virtual void seek(PositionProvider& position) override; - virtual std::string getName() const override; - }; - -} - -#endif //ORC_INPUTSTREAM_HH + virtual void seek(PositionProvider& position) override; + virtual std::string getName() const override; + }; + + /** + * Create a seekable input stream based on an input stream. + */ + class SeekableFileInputStream: public SeekableInputStream { + private: + MemoryPool& pool; + InputStream* const input; + const uint64_t start; + const uint64_t length; + const uint64_t blockSize; + std::unique_ptr<DataBuffer<char> > buffer; + uint64_t position; + uint64_t pushBack; + + public: + SeekableFileInputStream(InputStream* input, + uint64_t offset, + uint64_t byteCount, + MemoryPool& pool, + uint64_t blockSize = 0); + virtual ~SeekableFileInputStream() override; + + virtual bool Next(const void** data, int*size) override; + virtual void BackUp(int count) override; + virtual bool Skip(int count) override; + virtual int64_t ByteCount() const override; + virtual void seek(PositionProvider& position) override; + virtual std::string getName() const override; + }; + +} + +#endif //ORC_INPUTSTREAM_HH diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc index 11a21c0bd3..dd9327adf9 100644 --- a/contrib/libs/apache/orc/c++/src/io/OutputStream.cc +++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.cc @@ -1,147 +1,147 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "orc/Exceptions.hh" -#include "OutputStream.hh" - -#include <sstream> - -namespace orc { - - PositionRecorder::~PositionRecorder() { - // PASS - } - - BufferedOutputStream::BufferedOutputStream( - MemoryPool& pool, - OutputStream * outStream, - uint64_t capacity_, - uint64_t blockSize_) - : outputStream(outStream), - blockSize(blockSize_) { - dataBuffer.reset(new DataBuffer<char>(pool)); - dataBuffer->reserve(capacity_); - } - - BufferedOutputStream::~BufferedOutputStream() { - // PASS - } - - bool BufferedOutputStream::Next(void** buffer, int* size) { - *size = static_cast<int>(blockSize); - uint64_t oldSize = dataBuffer->size(); - uint64_t newSize = oldSize + blockSize; - uint64_t newCapacity = dataBuffer->capacity(); - while (newCapacity < newSize) { - newCapacity += dataBuffer->capacity(); - } - dataBuffer->reserve(newCapacity); - dataBuffer->resize(newSize); - *buffer = dataBuffer->data() + oldSize; - return true; - } - - void BufferedOutputStream::BackUp(int count) { - if (count >= 0) { - uint64_t unsignedCount = static_cast<uint64_t>(count); - if (unsignedCount <= dataBuffer->size()) { - dataBuffer->resize(dataBuffer->size() - unsignedCount); - } else { - throw std::logic_error("Can't backup that much!"); - } - } - } - +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "orc/Exceptions.hh" +#include "OutputStream.hh" + +#include <sstream> + +namespace orc { + + PositionRecorder::~PositionRecorder() { + // PASS + } + + BufferedOutputStream::BufferedOutputStream( + MemoryPool& pool, + OutputStream * outStream, + uint64_t capacity_, + uint64_t blockSize_) + : outputStream(outStream), + blockSize(blockSize_) { + dataBuffer.reset(new DataBuffer<char>(pool)); + dataBuffer->reserve(capacity_); + } + + BufferedOutputStream::~BufferedOutputStream() { + // PASS + } + + bool BufferedOutputStream::Next(void** buffer, int* size) { + *size = static_cast<int>(blockSize); + uint64_t oldSize = dataBuffer->size(); + uint64_t newSize = oldSize + blockSize; + uint64_t newCapacity = dataBuffer->capacity(); + while (newCapacity < newSize) { + newCapacity += dataBuffer->capacity(); + } + dataBuffer->reserve(newCapacity); + dataBuffer->resize(newSize); + *buffer = dataBuffer->data() + oldSize; + return true; + } + + void BufferedOutputStream::BackUp(int count) { + if (count >= 0) { + uint64_t unsignedCount = static_cast<uint64_t>(count); + if (unsignedCount <= dataBuffer->size()) { + dataBuffer->resize(dataBuffer->size() - unsignedCount); + } else { + throw std::logic_error("Can't backup that much!"); + } + } + } + int64_t BufferedOutputStream::ByteCount() const { - return static_cast<google::protobuf::int64>(dataBuffer->size()); - } - - bool BufferedOutputStream::WriteAliasedRaw(const void *, int) { - throw NotImplementedYet("WriteAliasedRaw is not supported."); - } - - bool BufferedOutputStream::AllowsAliasing() const { - return false; - } - - std::string BufferedOutputStream::getName() const { - std::ostringstream result; - result << "BufferedOutputStream " << dataBuffer->size() << " of " - << dataBuffer->capacity(); - return result.str(); - } - - uint64_t BufferedOutputStream::getSize() const { - return dataBuffer->size(); - } - - uint64_t BufferedOutputStream::flush() { - uint64_t dataSize = dataBuffer->size(); - outputStream->write(dataBuffer->data(), dataSize); - dataBuffer->resize(0); - return dataSize; - } - - void AppendOnlyBufferedStream::write(const char * data, size_t size) { - size_t dataOffset = 0; - while (size > 0) { - if (bufferOffset == bufferLength) { - if (!outStream->Next( - reinterpret_cast<void **>(&buffer), - &bufferLength)) { - throw std::logic_error("Failed to allocate buffer."); - } - bufferOffset = 0; - } - size_t len = std::min( - static_cast<size_t>(bufferLength - bufferOffset), - size); - memcpy(buffer + bufferOffset, data + dataOffset, len); - bufferOffset += static_cast<int>(len); - dataOffset += len; - size -= len; - } - } - - uint64_t AppendOnlyBufferedStream::getSize() const { - return outStream->getSize(); - } - - uint64_t AppendOnlyBufferedStream::flush() { - outStream->BackUp(bufferLength - bufferOffset); - bufferOffset = bufferLength = 0; - buffer = nullptr; - return outStream->flush(); - } - - void AppendOnlyBufferedStream::recordPosition(PositionRecorder* recorder) const { - uint64_t flushedSize = outStream->getSize(); - uint64_t unflushedSize = static_cast<uint64_t>(bufferOffset); - if (outStream->isCompressed()) { - // start of the compression chunk in the stream - recorder->add(flushedSize); - // number of decompressed bytes that need to be consumed - recorder->add(unflushedSize); - } else { - flushedSize -= static_cast<uint64_t>(bufferLength); - // byte offset of the start location - recorder->add(flushedSize + unflushedSize); - } - } - -} + return static_cast<google::protobuf::int64>(dataBuffer->size()); + } + + 
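// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch only, not part of the commit diff above.
// It shows how BufferedOutputStream is meant to be used as a protobuf
// ZeroCopyOutputStream target, mirroring WriterImpl::writePostscript() from
// Writer.cc in this diff: the message serializes into the in-memory buffer,
// then flush() copies the bytes to the underlying orc::OutputStream and
// returns how many were written. The include paths and the helper name
// `writePostScriptTo` are assumptions for this example; `sink` is any
// caller-provided orc::OutputStream*.
// ---------------------------------------------------------------------------
#include <stdexcept>
#include "OutputStream.hh"            // orc::BufferedOutputStream
#include "orc/MemoryPool.hh"          // orc::getDefaultPool()
#include "wrap/orc-proto-wrapper.hh"  // orc::proto::PostScript

static uint64_t writePostScriptTo(orc::OutputStream* sink) {
  // Small uncompressed buffer, as WriterImpl does for the post script
  // (1024-byte initial capacity; Next() grows the buffer block by block).
  orc::BufferedOutputStream buffered(*orc::getDefaultPool(), sink,
                                     1024 /* capacity */,
                                     1024 /* block size */);
  orc::proto::PostScript ps;
  ps.set_footerlength(0);
  ps.set_magic("ORC");
  if (!ps.SerializeToZeroCopyStream(&buffered)) {
    throw std::logic_error("Failed to write post script.");
  }
  return buffered.flush();  // bytes actually pushed to `sink`
}
// ---------------------------------------------------------------------------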
bool BufferedOutputStream::WriteAliasedRaw(const void *, int) { + throw NotImplementedYet("WriteAliasedRaw is not supported."); + } + + bool BufferedOutputStream::AllowsAliasing() const { + return false; + } + + std::string BufferedOutputStream::getName() const { + std::ostringstream result; + result << "BufferedOutputStream " << dataBuffer->size() << " of " + << dataBuffer->capacity(); + return result.str(); + } + + uint64_t BufferedOutputStream::getSize() const { + return dataBuffer->size(); + } + + uint64_t BufferedOutputStream::flush() { + uint64_t dataSize = dataBuffer->size(); + outputStream->write(dataBuffer->data(), dataSize); + dataBuffer->resize(0); + return dataSize; + } + + void AppendOnlyBufferedStream::write(const char * data, size_t size) { + size_t dataOffset = 0; + while (size > 0) { + if (bufferOffset == bufferLength) { + if (!outStream->Next( + reinterpret_cast<void **>(&buffer), + &bufferLength)) { + throw std::logic_error("Failed to allocate buffer."); + } + bufferOffset = 0; + } + size_t len = std::min( + static_cast<size_t>(bufferLength - bufferOffset), + size); + memcpy(buffer + bufferOffset, data + dataOffset, len); + bufferOffset += static_cast<int>(len); + dataOffset += len; + size -= len; + } + } + + uint64_t AppendOnlyBufferedStream::getSize() const { + return outStream->getSize(); + } + + uint64_t AppendOnlyBufferedStream::flush() { + outStream->BackUp(bufferLength - bufferOffset); + bufferOffset = bufferLength = 0; + buffer = nullptr; + return outStream->flush(); + } + + void AppendOnlyBufferedStream::recordPosition(PositionRecorder* recorder) const { + uint64_t flushedSize = outStream->getSize(); + uint64_t unflushedSize = static_cast<uint64_t>(bufferOffset); + if (outStream->isCompressed()) { + // start of the compression chunk in the stream + recorder->add(flushedSize); + // number of decompressed bytes that need to be consumed + recorder->add(unflushedSize); + } else { + flushedSize -= static_cast<uint64_t>(bufferLength); + // byte offset of the start location + recorder->add(flushedSize + unflushedSize); + } + } + +} diff --git a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh index 7ce9fafa24..e40263fdfb 100644 --- a/contrib/libs/apache/orc/c++/src/io/OutputStream.hh +++ b/contrib/libs/apache/orc/c++/src/io/OutputStream.hh @@ -1,96 +1,96 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef ORC_OUTPUTSTREAM_HH -#define ORC_OUTPUTSTREAM_HH - -#include "Adaptor.hh" -#include "orc/OrcFile.hh" -#include "wrap/zero-copy-stream-wrapper.h" - -namespace orc { - - /** - * Record write position for creating index stream - */ - class PositionRecorder { - public: - virtual ~PositionRecorder(); - virtual void add(uint64_t pos) = 0; - }; - - /** - * A subclass of Google's ZeroCopyOutputStream that supports output to memory - * buffer, and flushing to OutputStream. - * By extending Google's class, we get the ability to pass it directly - * to the protobuf writers. - */ - class BufferedOutputStream: public google::protobuf::io::ZeroCopyOutputStream { - private: - OutputStream * outputStream; - std::unique_ptr<DataBuffer<char> > dataBuffer; - uint64_t blockSize; - - public: - BufferedOutputStream(MemoryPool& pool, - OutputStream * outStream, - uint64_t capacity, - uint64_t block_size); - virtual ~BufferedOutputStream() override; - - virtual bool Next(void** data, int*size) override; - virtual void BackUp(int count) override; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ORC_OUTPUTSTREAM_HH +#define ORC_OUTPUTSTREAM_HH + +#include "Adaptor.hh" +#include "orc/OrcFile.hh" +#include "wrap/zero-copy-stream-wrapper.h" + +namespace orc { + + /** + * Record write position for creating index stream + */ + class PositionRecorder { + public: + virtual ~PositionRecorder(); + virtual void add(uint64_t pos) = 0; + }; + + /** + * A subclass of Google's ZeroCopyOutputStream that supports output to memory + * buffer, and flushing to OutputStream. + * By extending Google's class, we get the ability to pass it directly + * to the protobuf writers. + */ + class BufferedOutputStream: public google::protobuf::io::ZeroCopyOutputStream { + private: + OutputStream * outputStream; + std::unique_ptr<DataBuffer<char> > dataBuffer; + uint64_t blockSize; + + public: + BufferedOutputStream(MemoryPool& pool, + OutputStream * outStream, + uint64_t capacity, + uint64_t block_size); + virtual ~BufferedOutputStream() override; + + virtual bool Next(void** data, int*size) override; + virtual void BackUp(int count) override; virtual int64_t ByteCount() const override; - virtual bool WriteAliasedRaw(const void * data, int size) override; - virtual bool AllowsAliasing() const override; - - virtual std::string getName() const; - virtual uint64_t getSize() const; - virtual uint64_t flush(); - - virtual bool isCompressed() const { return false; } - }; - - /** - * An append only buffered stream that allows - * buffer, and flushing to OutputStream. - * By extending Google's class, we get the ability to pass it directly - * to the protobuf writers. 
- */ - class AppendOnlyBufferedStream { - private: - std::unique_ptr<BufferedOutputStream> outStream; - char * buffer; - int bufferOffset, bufferLength; - - public: - AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> _outStream) : - outStream(std::move(_outStream)) { - buffer = nullptr; - bufferOffset = bufferLength = 0; - } - - void write(const char * data, size_t size); - uint64_t getSize() const; - uint64_t flush(); - - void recordPosition(PositionRecorder* recorder) const; - }; -} - -#endif // ORC_OUTPUTSTREAM_HH + virtual bool WriteAliasedRaw(const void * data, int size) override; + virtual bool AllowsAliasing() const override; + + virtual std::string getName() const; + virtual uint64_t getSize() const; + virtual uint64_t flush(); + + virtual bool isCompressed() const { return false; } + }; + + /** + * An append only buffered stream that allows + * buffer, and flushing to OutputStream. + * By extending Google's class, we get the ability to pass it directly + * to the protobuf writers. + */ + class AppendOnlyBufferedStream { + private: + std::unique_ptr<BufferedOutputStream> outStream; + char * buffer; + int bufferOffset, bufferLength; + + public: + AppendOnlyBufferedStream(std::unique_ptr<BufferedOutputStream> _outStream) : + outStream(std::move(_outStream)) { + buffer = nullptr; + bufferOffset = bufferLength = 0; + } + + void write(const char * data, size_t size); + uint64_t getSize() const; + uint64_t flush(); + + void recordPosition(PositionRecorder* recorder) const; + }; +} + +#endif // ORC_OUTPUTSTREAM_HH diff --git a/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h index 605fbf826c..8d1eab50b4 100644 --- a/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h +++ b/contrib/libs/apache/orc/c++/src/wrap/coded-stream-wrapper.h @@ -1,35 +1,35 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef CODED_STREAM_WRAPPER_HH -#define CODED_STREAM_WRAPPER_HH - -#include "Adaptor.hh" - -DIAGNOSTIC_PUSH - -#ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") - DIAGNOSTIC_IGNORE("-Wreserved-id-macro") -#endif - -#if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wconversion") -#endif - +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CODED_STREAM_WRAPPER_HH +#define CODED_STREAM_WRAPPER_HH + +#include "Adaptor.hh" + +DIAGNOSTIC_PUSH + +#ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") + DIAGNOSTIC_IGNORE("-Wreserved-id-macro") +#endif + +#if defined(__GNUC__) || defined(__clang__) + DIAGNOSTIC_IGNORE("-Wconversion") +#endif + #include <google/protobuf/io/coded_stream.h> - -DIAGNOSTIC_POP - -#endif + +DIAGNOSTIC_POP + +#endif diff --git a/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh b/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh index 5c161660cc..dc8e9de7f6 100644 --- a/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh +++ b/contrib/libs/apache/orc/c++/src/wrap/orc-proto-wrapper.hh @@ -1,47 +1,47 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ORC_PROTO_WRAPPER_HH -#define ORC_PROTO_WRAPPER_HH - -#include "Adaptor.hh" - -DIAGNOSTIC_PUSH - -#if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wconversion") - DIAGNOSTIC_IGNORE("-Wdeprecated") - DIAGNOSTIC_IGNORE("-Wsign-conversion") - DIAGNOSTIC_IGNORE("-Wunused-parameter") -#endif - -#ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wnested-anon-types") - DIAGNOSTIC_IGNORE("-Wreserved-id-macro") - DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") - DIAGNOSTIC_IGNORE("-Wunknown-warning-option") - DIAGNOSTIC_IGNORE("-Wweak-vtables") - DIAGNOSTIC_IGNORE("-Wzero-as-null-pointer-constant") -#endif - -#if defined(_MSC_VER) - DIAGNOSTIC_IGNORE(4146) // unary minus operator applied to unsigned type, result still unsigned - DIAGNOSTIC_IGNORE(4800) // forcing value to bool 'true' or 'false' -#endif - +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ORC_PROTO_WRAPPER_HH +#define ORC_PROTO_WRAPPER_HH + +#include "Adaptor.hh" + +DIAGNOSTIC_PUSH + +#if defined(__GNUC__) || defined(__clang__) + DIAGNOSTIC_IGNORE("-Wconversion") + DIAGNOSTIC_IGNORE("-Wdeprecated") + DIAGNOSTIC_IGNORE("-Wsign-conversion") + DIAGNOSTIC_IGNORE("-Wunused-parameter") +#endif + +#ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wnested-anon-types") + DIAGNOSTIC_IGNORE("-Wreserved-id-macro") + DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") + DIAGNOSTIC_IGNORE("-Wunknown-warning-option") + DIAGNOSTIC_IGNORE("-Wweak-vtables") + DIAGNOSTIC_IGNORE("-Wzero-as-null-pointer-constant") +#endif + +#if defined(_MSC_VER) + DIAGNOSTIC_IGNORE(4146) // unary minus operator applied to unsigned type, result still unsigned + DIAGNOSTIC_IGNORE(4800) // forcing value to bool 'true' or 'false' +#endif + #include "contrib/libs/apache/orc/proto/orc_proto.pb.h" - -DIAGNOSTIC_POP - -#endif + +DIAGNOSTIC_POP + +#endif diff --git a/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h index aeab0f0033..497ae6f508 100644 --- a/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h +++ b/contrib/libs/apache/orc/c++/src/wrap/snappy-wrapper.h @@ -1,30 +1,30 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef SNAPPY_WRAPPER_HH -#define SNAPPY_WRAPPER_HH - -#include "Adaptor.hh" - -DIAGNOSTIC_PUSH - -#ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wreserved-id-macro") -#endif - -#include <snappy.h> - -DIAGNOSTIC_POP - -#endif +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SNAPPY_WRAPPER_HH +#define SNAPPY_WRAPPER_HH + +#include "Adaptor.hh" + +DIAGNOSTIC_PUSH + +#ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wreserved-id-macro") +#endif + +#include <snappy.h> + +DIAGNOSTIC_POP + +#endif diff --git a/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h b/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h index 1af0bd002d..7cf1491d3d 100644 --- a/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h +++ b/contrib/libs/apache/orc/c++/src/wrap/zero-copy-stream-wrapper.h @@ -1,36 +1,36 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef ZERO_COPY_STREAM_WRAPPER_HH -#define ZERO_COPY_STREAM_WRAPPER_HH - -#include "Adaptor.hh" - -DIAGNOSTIC_PUSH - -#if defined(__GNUC__) || defined(__clang__) - DIAGNOSTIC_IGNORE("-Wdeprecated") - DIAGNOSTIC_IGNORE("-Wpadded") - DIAGNOSTIC_IGNORE("-Wunused-parameter") -#endif - -#ifdef __clang__ - DIAGNOSTIC_IGNORE("-Wreserved-id-macro") -#endif - +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ZERO_COPY_STREAM_WRAPPER_HH +#define ZERO_COPY_STREAM_WRAPPER_HH + +#include "Adaptor.hh" + +DIAGNOSTIC_PUSH + +#if defined(__GNUC__) || defined(__clang__) + DIAGNOSTIC_IGNORE("-Wdeprecated") + DIAGNOSTIC_IGNORE("-Wpadded") + DIAGNOSTIC_IGNORE("-Wunused-parameter") +#endif + +#ifdef __clang__ + DIAGNOSTIC_IGNORE("-Wreserved-id-macro") +#endif + #include <google/protobuf/io/zero_copy_stream.h> - -DIAGNOSTIC_POP - -#endif + +DIAGNOSTIC_POP + +#endif diff --git a/contrib/libs/apache/orc/proto/orc_proto.proto b/contrib/libs/apache/orc/proto/orc_proto.proto index e8b84dbecd..44c780db50 100644 --- a/contrib/libs/apache/orc/proto/orc_proto.proto +++ b/contrib/libs/apache/orc/proto/orc_proto.proto @@ -1,348 +1,348 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -syntax = "proto2"; - -package orc.proto; - -option java_package = "org.apache.orc"; - -message IntegerStatistics { - optional sint64 minimum = 1; - optional sint64 maximum = 2; - optional sint64 sum = 3; -} - -message DoubleStatistics { - optional double minimum = 1; - optional double maximum = 2; - optional double sum = 3; -} - -message StringStatistics { - optional string minimum = 1; - optional string maximum = 2; - // sum will store the total length of all strings in a stripe - optional sint64 sum = 3; - // If the minimum or maximum value was longer than 1024 bytes, store a lower or upper - // bound instead of the minimum or maximum values above. - optional string lowerBound = 4; - optional string upperBound = 5; -} - -message BucketStatistics { - repeated uint64 count = 1 [packed=true]; -} - -message DecimalStatistics { - optional string minimum = 1; - optional string maximum = 2; - optional string sum = 3; -} - -message DateStatistics { - // min,max values saved as days since epoch - optional sint32 minimum = 1; - optional sint32 maximum = 2; -} - -message TimestampStatistics { - // min,max values saved as milliseconds since epoch - optional sint64 minimum = 1; - optional sint64 maximum = 2; - optional sint64 minimumUtc = 3; - optional sint64 maximumUtc = 4; +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto2"; + +package orc.proto; + +option java_package = "org.apache.orc"; + +message IntegerStatistics { + optional sint64 minimum = 1; + optional sint64 maximum = 2; + optional sint64 sum = 3; +} + +message DoubleStatistics { + optional double minimum = 1; + optional double maximum = 2; + optional double sum = 3; +} + +message StringStatistics { + optional string minimum = 1; + optional string maximum = 2; + // sum will store the total length of all strings in a stripe + optional sint64 sum = 3; + // If the minimum or maximum value was longer than 1024 bytes, store a lower or upper + // bound instead of the minimum or maximum values above. 
+ optional string lowerBound = 4; + optional string upperBound = 5; +} + +message BucketStatistics { + repeated uint64 count = 1 [packed=true]; +} + +message DecimalStatistics { + optional string minimum = 1; + optional string maximum = 2; + optional string sum = 3; +} + +message DateStatistics { + // min,max values saved as days since epoch + optional sint32 minimum = 1; + optional sint32 maximum = 2; +} + +message TimestampStatistics { + // min,max values saved as milliseconds since epoch + optional sint64 minimum = 1; + optional sint64 maximum = 2; + optional sint64 minimumUtc = 3; + optional sint64 maximumUtc = 4; // store the lower 6 TS digits for min/max to achieve nanosecond precision optional int32 minimumNanos = 5; optional int32 maximumNanos = 6; -} - -message BinaryStatistics { - // sum will store the total binary blob length in a stripe - optional sint64 sum = 1; -} - -// Statistics for list and map -message CollectionStatistics { - optional uint64 minChildren = 1; - optional uint64 maxChildren = 2; - optional uint64 totalChildren = 3; -} - -message ColumnStatistics { - optional uint64 numberOfValues = 1; - optional IntegerStatistics intStatistics = 2; - optional DoubleStatistics doubleStatistics = 3; - optional StringStatistics stringStatistics = 4; - optional BucketStatistics bucketStatistics = 5; - optional DecimalStatistics decimalStatistics = 6; - optional DateStatistics dateStatistics = 7; - optional BinaryStatistics binaryStatistics = 8; - optional TimestampStatistics timestampStatistics = 9; - optional bool hasNull = 10; - optional uint64 bytesOnDisk = 11; - optional CollectionStatistics collectionStatistics = 12; -} - -message RowIndexEntry { - repeated uint64 positions = 1 [packed=true]; - optional ColumnStatistics statistics = 2; -} - -message RowIndex { - repeated RowIndexEntry entry = 1; -} - -message BloomFilter { - optional uint32 numHashFunctions = 1; - repeated fixed64 bitset = 2; - optional bytes utf8bitset = 3; -} - -message BloomFilterIndex { - repeated BloomFilter bloomFilter = 1; -} - -message Stream { - // if you add new index stream kinds, you need to make sure to update - // StreamName to ensure it is added to the stripe in the right area - enum Kind { - PRESENT = 0; - DATA = 1; - LENGTH = 2; - DICTIONARY_DATA = 3; - DICTIONARY_COUNT = 4; - SECONDARY = 5; - ROW_INDEX = 6; - BLOOM_FILTER = 7; - BLOOM_FILTER_UTF8 = 8; - // Virtual stream kinds to allocate space for encrypted index and data. - ENCRYPTED_INDEX = 9; - ENCRYPTED_DATA = 10; - - // stripe statistics streams - STRIPE_STATISTICS = 100; - // A virtual stream kind that is used for setting the encryption IV. 
- FILE_STATISTICS = 101; - } - optional Kind kind = 1; - optional uint32 column = 2; - optional uint64 length = 3; -} - -message ColumnEncoding { - enum Kind { - DIRECT = 0; - DICTIONARY = 1; - DIRECT_V2 = 2; - DICTIONARY_V2 = 3; - } - optional Kind kind = 1; - optional uint32 dictionarySize = 2; - - // The encoding of the bloom filters for this column: - // 0 or missing = none or original - // 1 = ORC-135 (utc for timestamps) - optional uint32 bloomEncoding = 3; -} - -message StripeEncryptionVariant { - repeated Stream streams = 1; - repeated ColumnEncoding encoding = 2; -} - -// each stripe looks like: -// index streams -// unencrypted -// variant 1..N -// data streams -// unencrypted -// variant 1..N -// footer - -message StripeFooter { - repeated Stream streams = 1; - repeated ColumnEncoding columns = 2; - optional string writerTimezone = 3; - // one for each column encryption variant - repeated StripeEncryptionVariant encryption = 4; -} - -// the file tail looks like: -// encrypted stripe statistics: ColumnarStripeStatistics (order by variant) -// stripe statistics: Metadata -// footer: Footer -// postscript: PostScript -// psLen: byte - -message StringPair { - optional string key = 1; - optional string value = 2; -} - -message Type { - enum Kind { - BOOLEAN = 0; - BYTE = 1; - SHORT = 2; - INT = 3; - LONG = 4; - FLOAT = 5; - DOUBLE = 6; - STRING = 7; - BINARY = 8; - TIMESTAMP = 9; - LIST = 10; - MAP = 11; - STRUCT = 12; - UNION = 13; - DECIMAL = 14; - DATE = 15; - VARCHAR = 16; - CHAR = 17; - TIMESTAMP_INSTANT = 18; - } - optional Kind kind = 1; - repeated uint32 subtypes = 2 [packed=true]; - repeated string fieldNames = 3; - optional uint32 maximumLength = 4; - optional uint32 precision = 5; - optional uint32 scale = 6; - repeated StringPair attributes = 7; -} - -message StripeInformation { - // the global file offset of the start of the stripe - optional uint64 offset = 1; - // the number of bytes of index - optional uint64 indexLength = 2; - // the number of bytes of data - optional uint64 dataLength = 3; - // the number of bytes in the stripe footer - optional uint64 footerLength = 4; - // the number of rows in this stripe - optional uint64 numberOfRows = 5; - // If this is present, the reader should use this value for the encryption - // stripe id for setting the encryption IV. Otherwise, the reader should - // use one larger than the previous stripe's encryptStripeId. - // For unmerged ORC files, the first stripe will use 1 and the rest of the - // stripes won't have it set. For merged files, the stripe information - // will be copied from their original files and thus the first stripe of - // each of the input files will reset it to 1. - // Note that 1 was choosen, because protobuf v3 doesn't serialize - // primitive types that are the default (eg. 0). - optional uint64 encryptStripeId = 6; - // For each encryption variant, the new encrypted local key to use - // until we find a replacement. - repeated bytes encryptedLocalKeys = 7; -} - -message UserMetadataItem { - optional string name = 1; - optional bytes value = 2; -} - -// StripeStatistics (1 per a stripe), which each contain the -// ColumnStatistics for each column. -// This message type is only used in ORC v0 and v1. -message StripeStatistics { - repeated ColumnStatistics colStats = 1; -} - -// This message type is only used in ORC v0 and v1. 
-message Metadata { - repeated StripeStatistics stripeStats = 1; -} - -// In ORC v2 (and for encrypted columns in v1), each column has -// their column statistics written separately. -message ColumnarStripeStatistics { - // one value for each stripe in the file - repeated ColumnStatistics colStats = 1; -} - -enum EncryptionAlgorithm { - UNKNOWN_ENCRYPTION = 0; // used for detecting future algorithms - AES_CTR_128 = 1; - AES_CTR_256 = 2; -} - -message FileStatistics { - repeated ColumnStatistics column = 1; -} - -// How was the data masked? This isn't necessary for reading the file, but -// is documentation about how the file was written. -message DataMask { - // the kind of masking, which may include third party masks - optional string name = 1; - // parameters for the mask - repeated string maskParameters = 2; - // the unencrypted column roots this mask was applied to - repeated uint32 columns = 3 [packed = true]; -} - -// Information about the encryption keys. -message EncryptionKey { - optional string keyName = 1; - optional uint32 keyVersion = 2; - optional EncryptionAlgorithm algorithm = 3; -} - -// The description of an encryption variant. -// Each variant is a single subtype that is encrypted with a single key. -message EncryptionVariant { - // the column id of the root - optional uint32 root = 1; - // The master key that was used to encrypt the local key, referenced as - // an index into the Encryption.key list. - optional uint32 key = 2; - // the encrypted key for the file footer - optional bytes encryptedKey = 3; - // the stripe statistics for this variant - repeated Stream stripeStatistics = 4; - // encrypted file statistics as a FileStatistics - optional bytes fileStatistics = 5; -} - -// Which KeyProvider encrypted the local keys. -enum KeyProviderKind { - UNKNOWN = 0; - HADOOP = 1; - AWS = 2; - GCP = 3; - AZURE = 4; -} - -message Encryption { - // all of the masks used in this file - repeated DataMask mask = 1; - // all of the keys used in this file - repeated EncryptionKey key = 2; - // The encrypted variants. - // Readers should prefer the first variant that the user has access to - // the corresponding key. If they don't have access to any of the keys, - // they should get the unencrypted masked data. - repeated EncryptionVariant variants = 3; - // How are the local keys encrypted? 
- optional KeyProviderKind keyProvider = 4; -} - +} + +message BinaryStatistics { + // sum will store the total binary blob length in a stripe + optional sint64 sum = 1; +} + +// Statistics for list and map +message CollectionStatistics { + optional uint64 minChildren = 1; + optional uint64 maxChildren = 2; + optional uint64 totalChildren = 3; +} + +message ColumnStatistics { + optional uint64 numberOfValues = 1; + optional IntegerStatistics intStatistics = 2; + optional DoubleStatistics doubleStatistics = 3; + optional StringStatistics stringStatistics = 4; + optional BucketStatistics bucketStatistics = 5; + optional DecimalStatistics decimalStatistics = 6; + optional DateStatistics dateStatistics = 7; + optional BinaryStatistics binaryStatistics = 8; + optional TimestampStatistics timestampStatistics = 9; + optional bool hasNull = 10; + optional uint64 bytesOnDisk = 11; + optional CollectionStatistics collectionStatistics = 12; +} + +message RowIndexEntry { + repeated uint64 positions = 1 [packed=true]; + optional ColumnStatistics statistics = 2; +} + +message RowIndex { + repeated RowIndexEntry entry = 1; +} + +message BloomFilter { + optional uint32 numHashFunctions = 1; + repeated fixed64 bitset = 2; + optional bytes utf8bitset = 3; +} + +message BloomFilterIndex { + repeated BloomFilter bloomFilter = 1; +} + +message Stream { + // if you add new index stream kinds, you need to make sure to update + // StreamName to ensure it is added to the stripe in the right area + enum Kind { + PRESENT = 0; + DATA = 1; + LENGTH = 2; + DICTIONARY_DATA = 3; + DICTIONARY_COUNT = 4; + SECONDARY = 5; + ROW_INDEX = 6; + BLOOM_FILTER = 7; + BLOOM_FILTER_UTF8 = 8; + // Virtual stream kinds to allocate space for encrypted index and data. + ENCRYPTED_INDEX = 9; + ENCRYPTED_DATA = 10; + + // stripe statistics streams + STRIPE_STATISTICS = 100; + // A virtual stream kind that is used for setting the encryption IV. 
+ FILE_STATISTICS = 101; + } + optional Kind kind = 1; + optional uint32 column = 2; + optional uint64 length = 3; +} + +message ColumnEncoding { + enum Kind { + DIRECT = 0; + DICTIONARY = 1; + DIRECT_V2 = 2; + DICTIONARY_V2 = 3; + } + optional Kind kind = 1; + optional uint32 dictionarySize = 2; + + // The encoding of the bloom filters for this column: + // 0 or missing = none or original + // 1 = ORC-135 (utc for timestamps) + optional uint32 bloomEncoding = 3; +} + +message StripeEncryptionVariant { + repeated Stream streams = 1; + repeated ColumnEncoding encoding = 2; +} + +// each stripe looks like: +// index streams +// unencrypted +// variant 1..N +// data streams +// unencrypted +// variant 1..N +// footer + +message StripeFooter { + repeated Stream streams = 1; + repeated ColumnEncoding columns = 2; + optional string writerTimezone = 3; + // one for each column encryption variant + repeated StripeEncryptionVariant encryption = 4; +} + +// the file tail looks like: +// encrypted stripe statistics: ColumnarStripeStatistics (order by variant) +// stripe statistics: Metadata +// footer: Footer +// postscript: PostScript +// psLen: byte + +message StringPair { + optional string key = 1; + optional string value = 2; +} + +message Type { + enum Kind { + BOOLEAN = 0; + BYTE = 1; + SHORT = 2; + INT = 3; + LONG = 4; + FLOAT = 5; + DOUBLE = 6; + STRING = 7; + BINARY = 8; + TIMESTAMP = 9; + LIST = 10; + MAP = 11; + STRUCT = 12; + UNION = 13; + DECIMAL = 14; + DATE = 15; + VARCHAR = 16; + CHAR = 17; + TIMESTAMP_INSTANT = 18; + } + optional Kind kind = 1; + repeated uint32 subtypes = 2 [packed=true]; + repeated string fieldNames = 3; + optional uint32 maximumLength = 4; + optional uint32 precision = 5; + optional uint32 scale = 6; + repeated StringPair attributes = 7; +} + +message StripeInformation { + // the global file offset of the start of the stripe + optional uint64 offset = 1; + // the number of bytes of index + optional uint64 indexLength = 2; + // the number of bytes of data + optional uint64 dataLength = 3; + // the number of bytes in the stripe footer + optional uint64 footerLength = 4; + // the number of rows in this stripe + optional uint64 numberOfRows = 5; + // If this is present, the reader should use this value for the encryption + // stripe id for setting the encryption IV. Otherwise, the reader should + // use one larger than the previous stripe's encryptStripeId. + // For unmerged ORC files, the first stripe will use 1 and the rest of the + // stripes won't have it set. For merged files, the stripe information + // will be copied from their original files and thus the first stripe of + // each of the input files will reset it to 1. + // Note that 1 was choosen, because protobuf v3 doesn't serialize + // primitive types that are the default (eg. 0). + optional uint64 encryptStripeId = 6; + // For each encryption variant, the new encrypted local key to use + // until we find a replacement. + repeated bytes encryptedLocalKeys = 7; +} + +message UserMetadataItem { + optional string name = 1; + optional bytes value = 2; +} + +// StripeStatistics (1 per a stripe), which each contain the +// ColumnStatistics for each column. +// This message type is only used in ORC v0 and v1. +message StripeStatistics { + repeated ColumnStatistics colStats = 1; +} + +// This message type is only used in ORC v0 and v1. 
+message Metadata { + repeated StripeStatistics stripeStats = 1; +} + +// In ORC v2 (and for encrypted columns in v1), each column has +// their column statistics written separately. +message ColumnarStripeStatistics { + // one value for each stripe in the file + repeated ColumnStatistics colStats = 1; +} + +enum EncryptionAlgorithm { + UNKNOWN_ENCRYPTION = 0; // used for detecting future algorithms + AES_CTR_128 = 1; + AES_CTR_256 = 2; +} + +message FileStatistics { + repeated ColumnStatistics column = 1; +} + +// How was the data masked? This isn't necessary for reading the file, but +// is documentation about how the file was written. +message DataMask { + // the kind of masking, which may include third party masks + optional string name = 1; + // parameters for the mask + repeated string maskParameters = 2; + // the unencrypted column roots this mask was applied to + repeated uint32 columns = 3 [packed = true]; +} + +// Information about the encryption keys. +message EncryptionKey { + optional string keyName = 1; + optional uint32 keyVersion = 2; + optional EncryptionAlgorithm algorithm = 3; +} + +// The description of an encryption variant. +// Each variant is a single subtype that is encrypted with a single key. +message EncryptionVariant { + // the column id of the root + optional uint32 root = 1; + // The master key that was used to encrypt the local key, referenced as + // an index into the Encryption.key list. + optional uint32 key = 2; + // the encrypted key for the file footer + optional bytes encryptedKey = 3; + // the stripe statistics for this variant + repeated Stream stripeStatistics = 4; + // encrypted file statistics as a FileStatistics + optional bytes fileStatistics = 5; +} + +// Which KeyProvider encrypted the local keys. +enum KeyProviderKind { + UNKNOWN = 0; + HADOOP = 1; + AWS = 2; + GCP = 3; + AZURE = 4; +} + +message Encryption { + // all of the masks used in this file + repeated DataMask mask = 1; + // all of the keys used in this file + repeated EncryptionKey key = 2; + // The encrypted variants. + // Readers should prefer the first variant that the user has access to + // the corresponding key. If they don't have access to any of the keys, + // they should get the unencrypted masked data. + repeated EncryptionVariant variants = 3; + // How are the local keys encrypted? + optional KeyProviderKind keyProvider = 4; +} + enum CalendarKind { UNKNOWN_CALENDAR = 0; // A hybrid Julian/Gregorian calendar with a cutover point in October 1582. 
@@ -351,100 +351,100 @@ enum CalendarKind { PROLEPTIC_GREGORIAN = 2; } -message Footer { - optional uint64 headerLength = 1; - optional uint64 contentLength = 2; - repeated StripeInformation stripes = 3; - repeated Type types = 4; - repeated UserMetadataItem metadata = 5; - optional uint64 numberOfRows = 6; - repeated ColumnStatistics statistics = 7; - optional uint32 rowIndexStride = 8; - - // Each implementation that writes ORC files should register for a code - // 0 = ORC Java - // 1 = ORC C++ - // 2 = Presto - // 3 = Scritchley Go from https://github.com/scritchley/orc - optional uint32 writer = 9; - - // information about the encryption in this file - optional Encryption encryption = 10; +message Footer { + optional uint64 headerLength = 1; + optional uint64 contentLength = 2; + repeated StripeInformation stripes = 3; + repeated Type types = 4; + repeated UserMetadataItem metadata = 5; + optional uint64 numberOfRows = 6; + repeated ColumnStatistics statistics = 7; + optional uint32 rowIndexStride = 8; + + // Each implementation that writes ORC files should register for a code + // 0 = ORC Java + // 1 = ORC C++ + // 2 = Presto + // 3 = Scritchley Go from https://github.com/scritchley/orc + optional uint32 writer = 9; + + // information about the encryption in this file + optional Encryption encryption = 10; optional CalendarKind calendar = 11; // informative description about the version of the software that wrote // the file. It is assumed to be within a given writer, so for example // ORC 1.7.2 = "1.7.2". It may include suffixes, such as "-SNAPSHOT". optional string softwareVersion = 12; -} - -enum CompressionKind { - NONE = 0; - ZLIB = 1; - SNAPPY = 2; - LZO = 3; - LZ4 = 4; - ZSTD = 5; -} - -// Serialized length must be less that 255 bytes -message PostScript { - optional uint64 footerLength = 1; - optional CompressionKind compression = 2; - optional uint64 compressionBlockSize = 3; - // the version of the file format - // [0, 11] = Hive 0.11 - // [0, 12] = Hive 0.12 - repeated uint32 version = 4 [packed = true]; - optional uint64 metadataLength = 5; - - // The version of the writer that wrote the file. This number is - // updated when we make fixes or large changes to the writer so that - // readers can detect whether a given bug is present in the data. - // - // Only the Java ORC writer may use values under 6 (or missing) so that - // readers that predate ORC-202 treat the new writers correctly. Each - // writer should assign their own sequence of versions starting from 6. 
- // - // Version of the ORC Java writer: - // 0 = original - // 1 = HIVE-8732 fixed (fixed stripe/file maximum statistics & - // string statistics use utf8 for min/max) - // 2 = HIVE-4243 fixed (use real column names from Hive tables) - // 3 = HIVE-12055 added (vectorized writer implementation) - // 4 = HIVE-13083 fixed (decimals write present stream correctly) - // 5 = ORC-101 fixed (bloom filters use utf8 consistently) - // 6 = ORC-135 fixed (timestamp statistics use utc) - // 7 = ORC-517 fixed (decimal64 min/max incorrect) - // 8 = ORC-203 added (trim very long string statistics) - // 9 = ORC-14 added (column encryption) - // - // Version of the ORC C++ writer: - // 6 = original - // - // Version of the Presto writer: - // 6 = original - // - // Version of the Scritchley Go writer: - // 6 = original - // +} + +enum CompressionKind { + NONE = 0; + ZLIB = 1; + SNAPPY = 2; + LZO = 3; + LZ4 = 4; + ZSTD = 5; +} + +// Serialized length must be less that 255 bytes +message PostScript { + optional uint64 footerLength = 1; + optional CompressionKind compression = 2; + optional uint64 compressionBlockSize = 3; + // the version of the file format + // [0, 11] = Hive 0.11 + // [0, 12] = Hive 0.12 + repeated uint32 version = 4 [packed = true]; + optional uint64 metadataLength = 5; + + // The version of the writer that wrote the file. This number is + // updated when we make fixes or large changes to the writer so that + // readers can detect whether a given bug is present in the data. + // + // Only the Java ORC writer may use values under 6 (or missing) so that + // readers that predate ORC-202 treat the new writers correctly. Each + // writer should assign their own sequence of versions starting from 6. + // + // Version of the ORC Java writer: + // 0 = original + // 1 = HIVE-8732 fixed (fixed stripe/file maximum statistics & + // string statistics use utf8 for min/max) + // 2 = HIVE-4243 fixed (use real column names from Hive tables) + // 3 = HIVE-12055 added (vectorized writer implementation) + // 4 = HIVE-13083 fixed (decimals write present stream correctly) + // 5 = ORC-101 fixed (bloom filters use utf8 consistently) + // 6 = ORC-135 fixed (timestamp statistics use utc) + // 7 = ORC-517 fixed (decimal64 min/max incorrect) + // 8 = ORC-203 added (trim very long string statistics) + // 9 = ORC-14 added (column encryption) + // + // Version of the ORC C++ writer: + // 6 = original + // + // Version of the Presto writer: + // 6 = original + // + // Version of the Scritchley Go writer: + // 6 = original + // // Version of the Trino writer: // 6 = original // - optional uint32 writerVersion = 6; - - // the number of bytes in the encrypted stripe statistics - optional uint64 stripeStatisticsLength = 7; - - // Leave this last in the record - optional string magic = 8000; -} - -// The contents of the file tail that must be serialized. -// This gets serialized as part of OrcSplit, also used by footer cache. -message FileTail { - optional PostScript postscript = 1; - optional Footer footer = 2; - optional uint64 fileLength = 3; - optional uint64 postscriptLength = 4; -} + optional uint32 writerVersion = 6; + + // the number of bytes in the encrypted stripe statistics + optional uint64 stripeStatisticsLength = 7; + + // Leave this last in the record + optional string magic = 8000; +} + +// The contents of the file tail that must be serialized. +// This gets serialized as part of OrcSplit, also used by footer cache. 
+message FileTail { + optional PostScript postscript = 1; + optional Footer footer = 2; + optional uint64 fileLength = 3; + optional uint64 postscriptLength = 4; +} diff --git a/contrib/libs/apache/orc/ya.make b/contrib/libs/apache/orc/ya.make index 5672ba95db..cfe8d93b37 100644 --- a/contrib/libs/apache/orc/ya.make +++ b/contrib/libs/apache/orc/ya.make @@ -1,69 +1,69 @@ # Generated by devtools/yamaker from nixpkgs e392df43c9f302d4a0892caaadcad3cd693edf9e. - -LIBRARY() - + +LIBRARY() + OWNER( iaz1607 g:cpp-contrib ) - + VERSION(1.6.12) - + ORIGINAL_SOURCE(https://github.com/apache/orc/archive/rel/release-1.6.12.tar.gz) LICENSE(Apache-2.0) LICENSE_TEXTS(.yandex_meta/licenses.list.txt) -PEERDIR( - contrib/libs/lz4 - contrib/libs/snappy - contrib/libs/zlib - contrib/libs/zstd -) - -ADDINCL( +PEERDIR( + contrib/libs/lz4 + contrib/libs/snappy + contrib/libs/zlib + contrib/libs/zstd +) + +ADDINCL( contrib/libs/apache/orc/c++/include contrib/libs/apache/orc/c++/src contrib/libs/apache/orc/proto - contrib/libs/lz4 + contrib/libs/lz4 contrib/libs/zstd/include -) - -NO_COMPILER_WARNINGS() - -NO_UTIL() - -SRCS( - c++/src/Adaptor.cc - c++/src/BloomFilter.cc - c++/src/ByteRLE.cc - c++/src/ColumnPrinter.cc - c++/src/ColumnReader.cc - c++/src/ColumnWriter.cc - c++/src/Common.cc - c++/src/Compression.cc - c++/src/Exceptions.cc - c++/src/Int128.cc - c++/src/LzoDecompressor.cc - c++/src/MemoryPool.cc - c++/src/Murmur3.cc - c++/src/OrcFile.cc - c++/src/RLE.cc - c++/src/RLEV2Util.cc - c++/src/RLEv1.cc - c++/src/Reader.cc - c++/src/RleDecoderV2.cc - c++/src/RleEncoderV2.cc - c++/src/Statistics.cc - c++/src/StripeStream.cc - c++/src/Timezone.cc - c++/src/TypeImpl.cc - c++/src/Vector.cc - c++/src/Writer.cc - c++/src/io/InputStream.cc - c++/src/io/OutputStream.cc - proto/orc_proto.proto -) - -END() +) + +NO_COMPILER_WARNINGS() + +NO_UTIL() + +SRCS( + c++/src/Adaptor.cc + c++/src/BloomFilter.cc + c++/src/ByteRLE.cc + c++/src/ColumnPrinter.cc + c++/src/ColumnReader.cc + c++/src/ColumnWriter.cc + c++/src/Common.cc + c++/src/Compression.cc + c++/src/Exceptions.cc + c++/src/Int128.cc + c++/src/LzoDecompressor.cc + c++/src/MemoryPool.cc + c++/src/Murmur3.cc + c++/src/OrcFile.cc + c++/src/RLE.cc + c++/src/RLEV2Util.cc + c++/src/RLEv1.cc + c++/src/Reader.cc + c++/src/RleDecoderV2.cc + c++/src/RleEncoderV2.cc + c++/src/Statistics.cc + c++/src/StripeStream.cc + c++/src/Timezone.cc + c++/src/TypeImpl.cc + c++/src/Vector.cc + c++/src/Writer.cc + c++/src/io/InputStream.cc + c++/src/io/OutputStream.cc + proto/orc_proto.proto +) + +END() |
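
The pieces imported above fit together as follows: BufferedOutputStream (declared in io/OutputStream.hh) implements google::protobuf::io::ZeroCopyOutputStream, so messages generated from orc_proto.proto can be serialized through it directly and then flushed to an orc::OutputStream. Below is a minimal sketch of that flow, not part of this import: it assumes an in-tree build where the ADDINCL paths from ya.make make the internal src headers visible, and it uses the public orc::writeLocalFile() and orc::getDefaultPool() helpers from the ORC public headers, which are not shown in this diff.

/* Hypothetical sketch: serialize an orc::proto::PostScript through
 * BufferedOutputStream. Include paths and helper names are assumptions
 * based on ORC 1.6.12 and the ADDINCL settings in ya.make above. */
#include <iostream>
#include <memory>

#include "orc/OrcFile.hh"            // orc::OutputStream, orc::writeLocalFile()
#include "orc/MemoryPool.hh"         // orc::getDefaultPool()
#include "io/OutputStream.hh"        // orc::BufferedOutputStream (internal header)
#include "wrap/orc-proto-wrapper.hh" // pulls in the generated orc_proto.pb.h

int main() {
  // Open a destination file and wrap it in a buffered zero-copy stream.
  std::unique_ptr<orc::OutputStream> file = orc::writeLocalFile("example.tail");
  orc::BufferedOutputStream buffered(*orc::getDefaultPool(), file.get(),
                                     1024 * 1024 /* capacity */,
                                     64 * 1024   /* block size */);

  // Fill in a PostScript message as defined in orc_proto.proto.
  orc::proto::PostScript ps;
  ps.set_footerlength(0);               // placeholder value for the sketch
  ps.set_compression(orc::proto::NONE); // CompressionKind
  ps.add_version(0);                    // file format version [0, 12]
  ps.add_version(12);
  ps.set_writerversion(6);              // non-Java writers start at 6
  ps.set_magic("ORC");                  // field id 8000, kept last in the record

  // BufferedOutputStream is a ZeroCopyOutputStream, so the message can be
  // handed straight to protobuf, then flushed through to the file.
  if (!ps.SerializeToZeroCopyStream(&buffered)) {
    std::cerr << "failed to serialize PostScript" << std::endl;
    return 1;
  }
  buffered.flush();
  file->close();
  return 0;
}

Per the class description in io/OutputStream.hh, flush() is what moves the in-memory buffer to the underlying OutputStream, so nothing reaches the file until it is called.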