summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorrobot-contrib <[email protected]>2025-07-31 18:00:57 +0300
committerrobot-contrib <[email protected]>2025-07-31 18:24:59 +0300
commitf621c90694c03bdc1992bd3cb726ff315a9627d5 (patch)
treeaabd88e8bca000ac17fe21727b68d360189c6442
parent190037200407b9e5215a7af324af558d778588b5 (diff)
Update contrib/libs/apache/orc to 2.2.0
commit_hash:e0bdaabcbb3fe0971f373bd76ccaccb105aff7d3
-rw-r--r--contrib/libs/apache/orc/.yandex_meta/__init__.py35
-rw-r--r--contrib/libs/apache/orc/.yandex_meta/default.nix38
-rw-r--r--contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report20
-rw-r--r--contrib/libs/apache/orc/.yandex_meta/licenses.list.txt4
-rw-r--r--contrib/libs/apache/orc/README.md64
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Geospatial.hh196
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Int128.hh34
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Reader.hh6
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Statistics.hh30
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/Type.hh24
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/orc-config.hh2
-rw-r--r--contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh6
-rw-r--r--contrib/libs/apache/orc/c++/src/Adaptor-linux.hh6
-rw-r--r--contrib/libs/apache/orc/c++/src/BloomFilter.cc3
-rw-r--r--contrib/libs/apache/orc/c++/src/BloomFilter.hh1
-rw-r--r--contrib/libs/apache/orc/c++/src/ColumnPrinter.cc2
-rw-r--r--contrib/libs/apache/orc/c++/src/ColumnReader.cc9
-rw-r--r--contrib/libs/apache/orc/c++/src/ColumnWriter.cc215
-rw-r--r--contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc4
-rw-r--r--contrib/libs/apache/orc/c++/src/Dictionary.cc99
-rw-r--r--contrib/libs/apache/orc/c++/src/Dictionary.hh104
-rw-r--r--contrib/libs/apache/orc/c++/src/Geospatial.cc307
-rw-r--r--contrib/libs/apache/orc/c++/src/Geospatial.hh86
-rw-r--r--contrib/libs/apache/orc/c++/src/Int128.cc35
-rw-r--r--contrib/libs/apache/orc/c++/src/RLE.cc2
-rw-r--r--contrib/libs/apache/orc/c++/src/RLE.hh3
-rw-r--r--contrib/libs/apache/orc/c++/src/RLEv2.hh1
-rw-r--r--contrib/libs/apache/orc/c++/src/Reader.cc24
-rw-r--r--contrib/libs/apache/orc/c++/src/Reader.hh3
-rw-r--r--contrib/libs/apache/orc/c++/src/RleEncoderV2.cc2
-rw-r--r--contrib/libs/apache/orc/c++/src/SchemaEvolution.cc7
-rw-r--r--contrib/libs/apache/orc/c++/src/Statistics.cc62
-rw-r--r--contrib/libs/apache/orc/c++/src/Statistics.hh151
-rw-r--r--contrib/libs/apache/orc/c++/src/TypeImpl.cc139
-rw-r--r--contrib/libs/apache/orc/c++/src/TypeImpl.hh26
-rw-r--r--contrib/libs/apache/orc/c++/src/Writer.cc35
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc6
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc6
-rw-r--r--contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh6
-rw-r--r--contrib/libs/apache/orc/patches/darwin-byte-count.patch20
-rw-r--r--contrib/libs/apache/orc/patches/darwin-proto-int64.patch10
-rw-r--r--contrib/libs/apache/orc/patches/fix_strings.patch36
-rw-r--r--contrib/libs/apache/orc/patches/pr2345-fix-windows-build.patch26
-rw-r--r--contrib/libs/apache/orc/ya.make6
44 files changed, 1683 insertions, 218 deletions
diff --git a/contrib/libs/apache/orc/.yandex_meta/__init__.py b/contrib/libs/apache/orc/.yandex_meta/__init__.py
new file mode 100644
index 00000000000..fef25291ecd
--- /dev/null
+++ b/contrib/libs/apache/orc/.yandex_meta/__init__.py
@@ -0,0 +1,35 @@
+import os
+
+from devtools.yamaker.project import CMakeNinjaNixProject
+
+
+def apache_orc_post_install(self):  # post-install hook passed to CMakeNinjaNixProject below
+ with self.yamakes["."] as orc:  # edit the ya.make generated for the project root
+ proto_wrapper_source = "c++/src/wrap/orc-proto-wrapper.cc"
+ os.remove(f"{self.dstdir}/{proto_wrapper_source}")  # delete the wrapper source from the imported tree
+ orc.SRCS.remove(proto_wrapper_source)  # and stop listing it in SRCS
+
+ orc.PEERDIR.add("contrib/libs/apache/orc-format")  # presumably replaces the bundled orc-format copy; verify
+ orc.SRCS.remove("orc-format_ep-prefix/src/orc-format_ep/src/main/proto/orc/proto/orc_proto.proto")  # raises if the entry is absent
+ orc.CFLAGS.remove("-DPROTOBUF_USE_DLLS")  # NOTE(review): reason for dropping this define is not visible here
+
+
+apache_orc = CMakeNinjaNixProject(  # yamaker import description for contrib/libs/apache/orc
+ owners=["iaz1607", "g:cpp-contrib"],
+ arcdir="contrib/libs/apache/orc",
+ nixattr="apache-orc",  # attribute name defined in .yandex_meta/default.nix
+ ignore_commands=["cmake"],
+ install_targets=["orc"],
+ disable_includes=[
+ "BpackingAvx512.hh",
+ "sparsehash/dense_hash_map",
+ ],
+ addincl_global={".": {"./c++/include"}},
+ platform_dispatchers=[
+ "c++/src/Adaptor.hh",
+ ],
+ unbundle_from={  # NOTE(review): same prefix as the orc_proto.proto path removed in the hook; confirm yamaker semantics
+ "orc_format": "orc-format_ep-prefix",
+ },
+ post_install=apache_orc_post_install,  # hook defined above
+)
diff --git a/contrib/libs/apache/orc/.yandex_meta/default.nix b/contrib/libs/apache/orc/.yandex_meta/default.nix
new file mode 100644
index 00000000000..f464679c435
--- /dev/null
+++ b/contrib/libs/apache/orc/.yandex_meta/default.nix
@@ -0,0 +1,38 @@
+self: super: with self; {
+ apache-orc = stdenv.mkDerivation rec {  # rec: lets src.rev refer to version below
+ name = "orc";
+ version = "2.2.0";
+
+ src = fetchFromGitHub {
+ owner = "apache";
+ repo = "orc";
+ rev = "rel/release-${version}";  # tag name is derived from version
+ sha256 = "sha256-tRUCO7SPlHuGffXJx/rQXvQdEfgCCQAtUZP7OQcjqpU=";  # hash of the fetched source; update together with version
+ };
+
+ patches = [];
+
+ nativeBuildInputs = [
+ cmake
+ ninja
+ ];
+
+ buildInputs = [
+ lz4
+ protobuf
+ snappy
+ zlib
+ zstd
+ ];
+
+ cmakeFlags = [  # point each *_HOME at the matching buildInputs package
+ "-DLZ4_HOME=${lz4.dev}"
+ "-DPROTOBUF_HOME=${protobuf}"
+ "-DSNAPPY_HOME=${snappy.dev}"
+ "-DZLIB_HOME=${zlib.dev}"
+ "-DZSTD_HOME=${zstd.dev}"
+ "-DBUILD_JAVA=OFF"
+ "-DBUILD_CPP_TESTS=OFF"
+ ];
+ };
+}
diff --git a/contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report b/contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report
index a2e9c7ccd97..31414d3f709 100644
--- a/contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report
+++ b/contrib/libs/apache/orc/.yandex_meta/devtools.licenses.report
@@ -31,7 +31,7 @@
KEEP Apache-2.0 44dc743c95835a9e71d7b3cca63dcc7c
BELONGS ya.make
-FILE_INCLUDE NOTICE found in files: c++/include/orc/BloomFilter.hh at line 3, c++/include/orc/ColumnPrinter.hh at line 3, c++/include/orc/Common.hh at line 3, c++/include/orc/Exceptions.hh at line 3, c++/include/orc/Int128.hh at line 3, c++/include/orc/MemoryPool.hh at line 3, c++/include/orc/OrcFile.hh at line 3, c++/include/orc/Reader.hh at line 3, c++/include/orc/Statistics.hh at line 3, c++/include/orc/Type.hh at line 3, c++/include/orc/Vector.hh at line 3, c++/include/orc/Writer.hh at line 3, c++/include/orc/orc-config.hh at line 3, c++/include/orc/sargs/Literal.hh at line 3, c++/include/orc/sargs/SearchArgument.hh at line 3, c++/include/orc/sargs/TruthValue.hh at line 3, c++/src/Adaptor-linux.hh at line 3, c++/src/Adaptor.cc at line 3, c++/src/BlockBuffer.cc at line 3, c++/src/BlockBuffer.hh at line 3, c++/src/BloomFilter.cc at line 3, c++/src/BloomFilter.hh at line 3, c++/src/Bpacking.hh at line 3, c++/src/BpackingDefault.cc at line 3, c++/src/BpackingDefault.hh at line 3, c++/src/ByteRLE.cc at line 3, c++/src/ByteRLE.hh at line 3, c++/src/ColumnPrinter.cc at line 3, c++/src/ColumnReader.cc at line 3, c++/src/ColumnReader.hh at line 3, c++/src/ColumnWriter.cc at line 3, c++/src/ColumnWriter.hh at line 3, c++/src/Common.cc at line 3, c++/src/Compression.cc at line 3, c++/src/Compression.hh at line 3, c++/src/ConvertColumnReader.cc at line 3, c++/src/ConvertColumnReader.hh at line 3, c++/src/CpuInfoUtil.cc at line 3, c++/src/CpuInfoUtil.hh at line 3, c++/src/Dispatch.hh at line 3, c++/src/Exceptions.cc at line 3, c++/src/Int128.cc at line 3, c++/src/LzoDecompressor.hh at line 3, c++/src/MemoryPool.cc at line 3, c++/src/Murmur3.cc at line 3, c++/src/Murmur3.hh at line 3, c++/src/Options.hh at line 3, c++/src/OrcFile.cc at line 3, c++/src/RLE.cc at line 3, c++/src/RLE.hh at line 3, c++/src/RLEV2Util.hh at line 3, c++/src/RLEv1.cc at line 3, c++/src/RLEv1.hh at line 3, c++/src/RLEv2.hh at line 3, c++/src/Reader.cc at line 3, c++/src/Reader.hh at line 3, 
c++/src/RleDecoderV2.cc at line 3, c++/src/SchemaEvolution.cc at line 3, c++/src/SchemaEvolution.hh at line 3, c++/src/Statistics.cc at line 3, c++/src/Statistics.hh at line 3, c++/src/StripeStream.cc at line 3, c++/src/StripeStream.hh at line 3, c++/src/Timezone.cc at line 3, c++/src/Timezone.hh at line 3, c++/src/TypeImpl.cc at line 3, c++/src/TypeImpl.hh at line 3, c++/src/Utils.hh at line 3, c++/src/Vector.cc at line 3, c++/src/Writer.cc at line 3, c++/src/io/Cache.cc at line 3, c++/src/io/Cache.hh at line 3, c++/src/io/InputStream.cc at line 3, c++/src/io/InputStream.hh at line 3, c++/src/io/OutputStream.cc at line 3, c++/src/io/OutputStream.hh at line 3, c++/src/sargs/ExpressionTree.cc at line 3, c++/src/sargs/ExpressionTree.hh at line 3, c++/src/sargs/Literal.cc at line 3, c++/src/sargs/PredicateLeaf.cc at line 3, c++/src/sargs/PredicateLeaf.hh at line 3, c++/src/sargs/SargsApplier.cc at line 3, c++/src/sargs/SargsApplier.hh at line 3, c++/src/sargs/SearchArgument.cc at line 3, c++/src/sargs/SearchArgument.hh at line 3, c++/src/sargs/TruthValue.cc at line 3
+FILE_INCLUDE NOTICE found in files: c++/include/orc/BloomFilter.hh at line 3, c++/include/orc/ColumnPrinter.hh at line 3, c++/include/orc/Common.hh at line 3, c++/include/orc/Exceptions.hh at line 3, c++/include/orc/Geospatial.hh at line 3, c++/include/orc/Int128.hh at line 3, c++/include/orc/MemoryPool.hh at line 3, c++/include/orc/OrcFile.hh at line 3, c++/include/orc/Reader.hh at line 3, c++/include/orc/Statistics.hh at line 3, c++/include/orc/Type.hh at line 3, c++/include/orc/Vector.hh at line 3, c++/include/orc/Writer.hh at line 3, c++/include/orc/orc-config.hh at line 3, c++/include/orc/sargs/Literal.hh at line 3, c++/include/orc/sargs/SearchArgument.hh at line 3, c++/include/orc/sargs/TruthValue.hh at line 3, c++/src/Adaptor-linux.hh at line 3, c++/src/Adaptor.cc at line 3, c++/src/BlockBuffer.cc at line 3, c++/src/BlockBuffer.hh at line 3, c++/src/BloomFilter.cc at line 3, c++/src/BloomFilter.hh at line 3, c++/src/Bpacking.hh at line 3, c++/src/BpackingDefault.cc at line 3, c++/src/BpackingDefault.hh at line 3, c++/src/ByteRLE.cc at line 3, c++/src/ByteRLE.hh at line 3, c++/src/ColumnPrinter.cc at line 3, c++/src/ColumnReader.cc at line 3, c++/src/ColumnReader.hh at line 3, c++/src/ColumnWriter.cc at line 3, c++/src/ColumnWriter.hh at line 3, c++/src/Common.cc at line 3, c++/src/Compression.cc at line 3, c++/src/Compression.hh at line 3, c++/src/ConvertColumnReader.cc at line 3, c++/src/ConvertColumnReader.hh at line 3, c++/src/CpuInfoUtil.cc at line 3, c++/src/CpuInfoUtil.hh at line 3, c++/src/Dictionary.cc at line 3, c++/src/Dictionary.hh at line 3, c++/src/Dispatch.hh at line 3, c++/src/Exceptions.cc at line 3, c++/src/Geospatial.cc at line 3, c++/src/Geospatial.hh at line 4, c++/src/Int128.cc at line 3, c++/src/LzoDecompressor.hh at line 3, c++/src/MemoryPool.cc at line 3, c++/src/Murmur3.cc at line 3, c++/src/Murmur3.hh at line 3, c++/src/Options.hh at line 3, c++/src/OrcFile.cc at line 3, c++/src/RLE.cc at line 3, c++/src/RLE.hh at line 3, 
c++/src/RLEV2Util.hh at line 3, c++/src/RLEv1.cc at line 3, c++/src/RLEv1.hh at line 3, c++/src/RLEv2.hh at line 3, c++/src/Reader.cc at line 3, c++/src/Reader.hh at line 3, c++/src/RleDecoderV2.cc at line 3, c++/src/SchemaEvolution.cc at line 3, c++/src/SchemaEvolution.hh at line 3, c++/src/Statistics.cc at line 3, c++/src/Statistics.hh at line 3, c++/src/StripeStream.cc at line 3, c++/src/StripeStream.hh at line 3, c++/src/Timezone.cc at line 3, c++/src/Timezone.hh at line 3, c++/src/TypeImpl.cc at line 3, c++/src/TypeImpl.hh at line 3, c++/src/Utils.hh at line 3, c++/src/Vector.cc at line 3, c++/src/Writer.cc at line 3, c++/src/io/Cache.cc at line 3, c++/src/io/Cache.hh at line 3, c++/src/io/InputStream.cc at line 3, c++/src/io/InputStream.hh at line 3, c++/src/io/OutputStream.cc at line 3, c++/src/io/OutputStream.hh at line 3, c++/src/sargs/ExpressionTree.cc at line 3, c++/src/sargs/ExpressionTree.hh at line 3, c++/src/sargs/Literal.cc at line 3, c++/src/sargs/PredicateLeaf.cc at line 3, c++/src/sargs/PredicateLeaf.hh at line 3, c++/src/sargs/SargsApplier.cc at line 3, c++/src/sargs/SargsApplier.hh at line 3, c++/src/sargs/SearchArgument.cc at line 3, c++/src/sargs/SearchArgument.hh at line 3, c++/src/sargs/TruthValue.cc at line 3
Note: matched license text is too long. Read it in the source files.
Scancode info:
Original SPDX id: Apache-2.0
@@ -43,6 +43,7 @@ FILE_INCLUDE NOTICE found in files: c++/include/orc/BloomFilter.hh at line 3, c+
c++/include/orc/ColumnPrinter.hh [2:16]
c++/include/orc/Common.hh [2:16]
c++/include/orc/Exceptions.hh [2:16]
+ c++/include/orc/Geospatial.hh [2:16]
c++/include/orc/Int128.hh [2:16]
c++/include/orc/MemoryPool.hh [2:16]
c++/include/orc/OrcFile.hh [2:16]
@@ -78,8 +79,12 @@ FILE_INCLUDE NOTICE found in files: c++/include/orc/BloomFilter.hh at line 3, c+
c++/src/ConvertColumnReader.hh [2:16]
c++/src/CpuInfoUtil.cc [2:16]
c++/src/CpuInfoUtil.hh [2:16]
+ c++/src/Dictionary.cc [2:16]
+ c++/src/Dictionary.hh [2:16]
c++/src/Dispatch.hh [2:16]
c++/src/Exceptions.cc [2:16]
+ c++/src/Geospatial.cc [2:16]
+ c++/src/Geospatial.hh [3:17]
c++/src/Int128.cc [2:16]
c++/src/LzoDecompressor.hh [2:16]
c++/src/MemoryPool.cc [2:16]
@@ -169,6 +174,19 @@ BELONGS ya.make
Files with this license:
LICENSE [277:301]
+KEEP Apache-2.0 79a77c558a7457bc9605aaf522710501
+BELONGS ya.make
+ License text:
+ * The original code is licensed under the Apache License, Version 2.0.
+ Scancode info:
+ Original SPDX id: Apache-2.0
+ Score : 100.00
+ Match type : NOTICE
+ Links : http://www.apache.org/licenses/, http://www.apache.org/licenses/LICENSE-2.0, https://spdx.org/licenses/Apache-2.0
+ Files with this license:
+ c++/include/orc/Geospatial.hh [25:25]
+ c++/src/Geospatial.cc [25:25]
+
SKIP LicenseRef-scancode-unknown-license-reference 82b9c91a8f717463c6871d82aa2b1ff8
BELONGS ya.make
# notice about sub-components that have other licenses
diff --git a/contrib/libs/apache/orc/.yandex_meta/licenses.list.txt b/contrib/libs/apache/orc/.yandex_meta/licenses.list.txt
index 97b8f7dba6c..57bb43fcb21 100644
--- a/contrib/libs/apache/orc/.yandex_meta/licenses.list.txt
+++ b/contrib/libs/apache/orc/.yandex_meta/licenses.list.txt
@@ -240,6 +240,10 @@
====================Apache-2.0====================
+ * The original code is licensed under the Apache License, Version 2.0.
+
+
+====================Apache-2.0====================
This product includes software developed by The Apache Software
Foundation (http://www.apache.org/).
diff --git a/contrib/libs/apache/orc/README.md b/contrib/libs/apache/orc/README.md
index cf5c5d07938..2ddf0849b94 100644
--- a/contrib/libs/apache/orc/README.md
+++ b/contrib/libs/apache/orc/README.md
@@ -18,20 +18,21 @@ lists, maps, and unions.
This project includes both a Java library and a C++ library for reading and writing the _Optimized Row Columnar_ (ORC) file format. The C++ and Java libraries are completely independent of each other and will each read all versions of ORC files.
Releases:
-* Latest: <a href="https://orc.apache.org/releases">Apache ORC releases</a>
-* Maven Central: <a href="https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.orc%22">![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.orc/orc/badge.svg)</a>
-* Downloads: <a href="https://orc.apache.org/downloads">Apache ORC downloads</a>
-* Release tags: <a href="https://github.com/apache/orc/releases">Apache ORC release tags</a>
-* Plan: <a href="https://github.com/apache/orc/milestones">Apache ORC future release plan</a>
+
+* Latest: [Apache ORC releases](https://orc.apache.org/releases)
+* Maven Central: [![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.orc/orc/badge.svg)](https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.orc%22)
+* Downloads: [Apache ORC downloads](https://orc.apache.org/downloads)
+* Release tags: [Apache ORC release tags](https://github.com/apache/orc/releases)
+* Plan: [Apache ORC future release plan](https://github.com/apache/orc/milestones)
The current build status:
-* Main branch <a href="https://github.com/apache/orc/actions/workflows/build_and_test.yml?query=branch%3Amain">
-![main build status](https://github.com/apache/orc/actions/workflows/build_and_test.yml/badge.svg?branch=main)</a>
-Bug tracking: <a href="https://orc.apache.org/bugs">Apache Jira</a>
+* Main branch [![main build status](https://github.com/apache/orc/actions/workflows/build_and_test.yml/badge.svg?branch=main)](https://github.com/apache/orc/actions/workflows/build_and_test.yml?query=branch%3Amain)
+Bug tracking: [Apache Jira](https://orc.apache.org/bugs)
The subdirectories are:
+
* c++ - the c++ reader and writer
* cmake_modules - the cmake modules
* docker - docker scripts to build and test on various linuxes
@@ -47,6 +48,7 @@ The subdirectories are:
* Install cmake 3.12 or higher
To build a release version with debug information:
+
```shell
% mkdir build
% cd build
@@ -57,6 +59,7 @@ To build a release version with debug information:
```
To build a debug version:
+
```shell
% mkdir build
% cd build
@@ -67,6 +70,7 @@ To build a debug version:
```
To build a release version without debug information:
+
```shell
% mkdir build
% cd build
@@ -77,6 +81,7 @@ To build a release version without debug information:
```
To build only the Java library:
+
```shell
% cd java
% ./mvnw package
@@ -84,6 +89,7 @@ To build only the Java library:
```
To build only the C++ library:
+
```shell
% mkdir build
% cd build
@@ -94,6 +100,7 @@ To build only the C++ library:
```
To build the C++ library with AVX512 enabled:
+
```shell
export ORC_USER_SIMD_LEVEL=AVX512
% mkdir build
@@ -102,8 +109,49 @@ export ORC_USER_SIMD_LEVEL=AVX512
% make package
% make test-out
```
+
Cmake option BUILD_ENABLE_AVX512 can be set to "ON" or (default value)"OFF" at the compile time. At compile time, it defines the SIMD level(AVX512) to be compiled into the binaries.
Environment variable ORC_USER_SIMD_LEVEL can be set to "AVX512" or (default value)"NONE" at the run time. At run time, it defines the SIMD level to dispatch the code which can apply SIMD optimization.
Note that if ORC_USER_SIMD_LEVEL is set to "NONE" at run time, AVX512 will not take effect at run time even if BUILD_ENABLE_AVX512 is set to "ON" at compile time.
+
+### Building with Meson
+
+While CMake is the official build system for orc, there is unofficial support for using Meson to build select parts of the project. To build a debug version of the library and test it using Meson, from the project root you can run:
+
+```shell
+meson setup build
+meson compile -C build
+meson test -C build
+```
+
+By default, Meson will build unoptimized libraries with debug symbols. By contrast, the CMake build system generates release libraries by default. If you would like to create release libraries as CMake does, you should set the buildtype option. You must either remove the existing build directory before changing that setting, or alternatively pass the ``--reconfigure`` flag:
+
+```shell
+meson setup build -Dbuildtype=release --reconfigure
+meson compile -C build
+meson test -C build
+```
+
+Meson supports running your test suite through valgrind out of the box:
+
+```shell
+meson test -C build --wrap=valgrind
+```
+
+If you'd like to enable sanitizers, you can leverage the ``-Db_sanitize=`` option. For example, to enable both ASAN and UBSAN, you can run:
+
+```shell
+meson setup build -Dbuildtype=debug -Db_sanitize=address,undefined --reconfigure
+meson compile -C build
+meson test -C build
+```
+
+Meson takes care of detecting all dependencies on your system, and downloading missing ones as required through its [Wrap system](https://mesonbuild.com/Wrap-dependency-system-manual.html). The dependencies for the project are all stored in the ``subprojects`` directory in individual wrap files. The majority of these are system generated files created by running:
+
+```shell
+meson wrap install <dependency_name>
+```
+
+Run this command from the project root. If you are developing orc and need to add a new dependency in the future, be sure to check Meson's [WrapDB](https://mesonbuild.com/Wrapdb-projects.html) to see if a pre-configured wrap entry exists. If not, you may still manually configure the dependency as outlined in the aforementioned Wrap system documentation.
diff --git a/contrib/libs/apache/orc/c++/include/orc/Geospatial.hh b/contrib/libs/apache/orc/c++/include/orc/Geospatial.hh
new file mode 100644
index 00000000000..d3b9e282858
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/include/orc/Geospatial.hh
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This file contains code adapted from the Apache Arrow project.
+ *
+ * Original source:
+ * https://github.com/apache/arrow/blob/main/cpp/src/parquet/geospatial/statistics.h
+ *
+ * The original code is licensed under the Apache License, Version 2.0.
+ *
+ * Modifications may have been made from the original source.
+ */
+
+#ifndef ORC_GEOSPATIAL_HH
+#define ORC_GEOSPATIAL_HH
+
+#include <array>
+#include <cmath>
+#include <ostream>
+#include <string>
+
+namespace orc::geospatial {
+
+ constexpr double INF = std::numeric_limits<double>::infinity();  // NOTE(review): neither <limits> nor <algorithm> (std::min/std::max below) is included directly by this header
+ // The maximum number of dimensions supported (X, Y, Z, M); also the length of BoundingBox::min/max
+ inline constexpr int MAX_DIMENSIONS = 4;
+
+ // Supported combinations of geometry dimensions (X and Y are always present)
+ enum class Dimensions {
+ XY = 0, // X and Y only
+ XYZ = 1, // X, Y, and Z
+ XYM = 2, // X, Y, and M
+ XYZM = 3, // X, Y, Z, and M
+ VALUE_MIN = 0, // smallest enumerator value (same as XY)
+ VALUE_MAX = 3 // largest enumerator value (same as XYZM)
+ };
+
+ // Supported geometry types according to ISO WKB (numbering starts at 1)
+ enum class GeometryType {
+ POINT = 1,
+ LINESTRING = 2,
+ POLYGON = 3,
+ MULTIPOINT = 4,
+ MULTILINESTRING = 5,
+ MULTIPOLYGON = 6,
+ GEOMETRYCOLLECTION = 7,
+ VALUE_MIN = 1, // smallest enumerator value (same as POINT)
+ VALUE_MAX = 7 // largest enumerator value (same as GEOMETRYCOLLECTION)
+ };
+
+ // BoundingBox represents the minimum bounding rectangle (or box) for a geometry.
+ // It supports up to 4 dimensions, stored in the order X, Y, Z, M (indices 0..3 of min/max).
+ struct BoundingBox {
+ using XY = std::array<double, 2>;
+ using XYZ = std::array<double, 3>;
+ using XYM = std::array<double, 3>;  // third element is M, not Z (see updateXYM)
+ using XYZM = std::array<double, 4>;
+
+ // Default constructor: initializes to an empty bounding box (min = +INF, max = -INF).
+ BoundingBox() : min{INF, INF, INF, INF}, max{-INF, -INF, -INF, -INF} {}
+ // Constructor with explicit min/max values, each given in X, Y, Z, M order.
+ BoundingBox(const XYZM& mins, const XYZM& maxes) : min(mins), max(maxes) {}
+ BoundingBox(const BoundingBox& other) = default;
+ BoundingBox& operator=(const BoundingBox&) = default;
+
+ // Update the bounding box to include a 2D coordinate; the Z and M bounds are left untouched.
+ void updateXY(const XY& coord) {
+ updateInternal(coord);
+ }
+ // Update the bounding box to include a 3D coordinate (XYZ); the M bound is left untouched.
+ void updateXYZ(const XYZ& coord) {
+ updateInternal(coord);
+ }
+ // Update the bounding box to include a 3D coordinate (XYM); the Z bound is left untouched.
+ void updateXYM(const XYM& coord) {
+ std::array<int, 3> dims = {0, 1, 3};  // coord[2] is M, which is stored at index 3
+ for (int i = 0; i < 3; ++i) {
+ auto dim = dims[i];
+ if (!std::isnan(min[dim]) && !std::isnan(max[dim])) {  // skip dimensions whose bounds are NaN
+ min[dim] = std::min(min[dim], coord[i]);
+ max[dim] = std::max(max[dim], coord[i]);
+ }
+ }
+ }
+ // Update the bounding box to include a 4D coordinate (XYZM).
+ void updateXYZM(const XYZM& coord) {
+ updateInternal(coord);
+ }
+
+ // Reset the bounding box to its initial empty state (this also clears NaN bounds).
+ void reset() {
+ for (int i = 0; i < MAX_DIMENSIONS; ++i) {
+ min[i] = INF;
+ max[i] = -INF;
+ }
+ }
+
+ // Invalidate the bounding box (set all values to NaN); updates then have no effect until reset().
+ void invalidate() {
+ for (int i = 0; i < MAX_DIMENSIONS; ++i) {
+ min[i] = std::numeric_limits<double>::quiet_NaN();
+ max[i] = std::numeric_limits<double>::quiet_NaN();
+ }
+ }
+
+ // Check if the bound for a given dimension is empty: true when min - max is infinite, as in the initial +INF/-INF state.
+ bool boundEmpty(int dim) const {
+ return std::isinf(min[dim] - max[dim]);
+ }
+
+ // Check if the bound for a given dimension is valid (neither min nor max is NaN).
+ bool boundValid(int dim) const {
+ return !std::isnan(min[dim]) && !std::isnan(max[dim]);
+ }
+
+ // Get the lower bound (min values, in X, Y, Z, M order).
+ const XYZM& lowerBound() const {
+ return min;
+ }
+ // Get the upper bound (max values, in X, Y, Z, M order).
+ const XYZM& upperBound() const {
+ return max;
+ }
+
+ // Get validity for each dimension (boundValid for indices 0..3).
+ std::array<bool, MAX_DIMENSIONS> dimensionValid() const {
+ return {boundValid(0), boundValid(1), boundValid(2), boundValid(3)};
+ }
+ // Get emptiness for each dimension (boundEmpty for indices 0..3).
+ std::array<bool, MAX_DIMENSIONS> dimensionEmpty() const {
+ return {boundEmpty(0), boundEmpty(1), boundEmpty(2), boundEmpty(3)};
+ }
+
+ // Merge another bounding box into this one; a dimension that is NaN in either box becomes NaN.
+ void merge(const BoundingBox& other) {
+ for (int i = 0; i < MAX_DIMENSIONS; ++i) {
+ if (std::isnan(min[i]) || std::isnan(max[i]) || std::isnan(other.min[i]) ||
+ std::isnan(other.max[i])) {
+ min[i] = std::numeric_limits<double>::quiet_NaN();
+ max[i] = std::numeric_limits<double>::quiet_NaN();
+ } else {
+ min[i] = std::min(min[i], other.min[i]);
+ max[i] = std::max(max[i], other.max[i]);
+ }
+ }
+ }
+
+ // Convert the bounding box to a string representation (declared here, defined elsewhere).
+ std::string toString() const;
+
+ XYZM min; // Minimum values for each dimension
+ XYZM max; // Maximum values for each dimension
+
+ private:
+ // Internal update function for XY, XYZ, or XYZM coordinates: coord[i] maps to dimension i.
+ template <typename Coord>
+ void updateInternal(const Coord& coord) {
+ for (size_t i = 0; i < coord.size(); ++i) {
+ if (!std::isnan(min[i]) && !std::isnan(max[i])) {  // skip dimensions whose bounds are NaN
+ min[i] = std::min(min[i], coord[i]);
+ max[i] = std::max(max[i], coord[i]);
+ }
+ }
+ }
+ };
+
+ inline bool operator==(const BoundingBox& lhs, const BoundingBox& rhs) {  // element-wise compare of min and max
+ return lhs.min == rhs.min && lhs.max == rhs.max;  // NaN != NaN, so a box with NaN bounds never compares equal
+ }
+ inline bool operator!=(const BoundingBox& lhs, const BoundingBox& rhs) {
+ return !(lhs == rhs);
+ }
+ inline std::ostream& operator<<(std::ostream& os, const BoundingBox& obj) {  // streams obj.toString()
+ os << obj.toString();
+ return os;
+ }
+
+} // namespace orc::geospatial
+
+#endif // ORC_GEOSPATIAL_HH
diff --git a/contrib/libs/apache/orc/c++/include/orc/Int128.hh b/contrib/libs/apache/orc/c++/include/orc/Int128.hh
index 6954c771cf1..e728e70e7b4 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Int128.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Int128.hh
@@ -193,43 +193,13 @@ namespace orc {
* Shift left by the given number of bits.
* Values larger than 2**127 will shift into the sign bit.
*/
- Int128& operator<<=(uint32_t bits) {
- if (bits != 0) {
- if (bits < 64) {
- highbits_ <<= bits;
- highbits_ |= (lowbits_ >> (64 - bits));
- lowbits_ <<= bits;
- } else if (bits < 128) {
- highbits_ = static_cast<int64_t>(lowbits_) << (bits - 64);
- lowbits_ = 0;
- } else {
- highbits_ = 0;
- lowbits_ = 0;
- }
- }
- return *this;
- }
+ Int128& operator<<=(uint32_t bits);
/**
* Shift right by the given number of bits. Negative values will
* sign extend and fill with one bits.
*/
- Int128& operator>>=(uint32_t bits) {
- if (bits != 0) {
- if (bits < 64) {
- lowbits_ >>= bits;
- lowbits_ |= static_cast<uint64_t>(highbits_ << (64 - bits));
- highbits_ = static_cast<int64_t>(static_cast<uint64_t>(highbits_) >> bits);
- } else if (bits < 128) {
- lowbits_ = static_cast<uint64_t>(highbits_ >> (bits - 64));
- highbits_ = highbits_ >= 0 ? 0 : -1l;
- } else {
- highbits_ = highbits_ >= 0 ? 0 : -1l;
- lowbits_ = static_cast<uint64_t>(highbits_);
- }
- }
- return *this;
- }
+ Int128& operator>>=(uint32_t bits);
bool operator==(const Int128& right) const {
return highbits_ == right.highbits_ && lowbits_ == right.lowbits_;
diff --git a/contrib/libs/apache/orc/c++/include/orc/Reader.hh b/contrib/libs/apache/orc/c++/include/orc/Reader.hh
index b015b649104..e9f420f1139 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Reader.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Reader.hh
@@ -498,9 +498,11 @@ namespace orc {
/**
* Get the statistics about a stripe.
* @param stripeIndex the index of the stripe (0 to N-1) to get statistics about
- * @return the statistics about that stripe
+ * @param includeRowIndex whether the row index of the stripe is included
+ * @return the statistics about that stripe and row group index statistics
*/
- virtual std::unique_ptr<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const = 0;
+ virtual std::unique_ptr<StripeStatistics> getStripeStatistics(
+ uint64_t stripeIndex, bool includeRowIndex = true) const = 0;
/**
* Get the length of the data stripes in the file.
diff --git a/contrib/libs/apache/orc/c++/include/orc/Statistics.hh b/contrib/libs/apache/orc/c++/include/orc/Statistics.hh
index 4ba8c35f7d1..58169abe590 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Statistics.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Statistics.hh
@@ -19,12 +19,11 @@
#ifndef ORC_STATISTICS_HH
#define ORC_STATISTICS_HH
+#include "orc/Geospatial.hh"
#include "orc/Type.hh"
#include "orc/Vector.hh"
#include "orc/orc-config.hh"
-#include <sstream>
-
namespace orc {
/**
@@ -367,6 +366,33 @@ namespace orc {
virtual int32_t getMaximumNanos() const = 0;
};
+ /**
+ * Statistics for Geometry and Geography columns
+ */
+ class GeospatialColumnStatistics : public ColumnStatistics {
+ public:
+ virtual ~GeospatialColumnStatistics();
+
+ /**
+ * Get the bounding box
+ * @return the bounding box
+ */
+ virtual const geospatial::BoundingBox& getBoundingBox() const = 0;
+
+ /**
+ * Get the geospatial types
+ * @return a sorted vector of unique geometry type IDs
+ */
+ virtual std::vector<int32_t> getGeospatialTypes() const = 0;
+
+ /**
+ * Update the statistics with a new value
+ * @param value pointer to the new value
+ * @param length length of the value
+ */
+ virtual void update(const char* value, size_t length) = 0;
+ };
+
class Statistics {
public:
virtual ~Statistics();
diff --git a/contrib/libs/apache/orc/c++/include/orc/Type.hh b/contrib/libs/apache/orc/c++/include/orc/Type.hh
index 82e0e3cc86f..4bb794ff343 100644
--- a/contrib/libs/apache/orc/c++/include/orc/Type.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/Type.hh
@@ -25,6 +25,18 @@
namespace orc {
+ namespace geospatial {
+ enum EdgeInterpolationAlgorithm {  // returned by Type::getAlgorithm(); SPHERICAL is the createGeographyType default
+ SPHERICAL = 0,
+ VINCENTY = 1,
+ THOMAS = 2,
+ ANDOYER = 3,
+ KARNEY = 4
+ };
+ std::string AlgoToString(EdgeInterpolationAlgorithm algo);  // declared here, defined elsewhere
+ EdgeInterpolationAlgorithm AlgoFromString(const std::string& algo);  // NOTE(review): behavior on unknown names is not visible here
+ } // namespace geospatial
+
enum TypeKind {
BOOLEAN = 0,
BYTE = 1,
@@ -44,7 +56,9 @@ namespace orc {
DATE = 15,
VARCHAR = 16,
CHAR = 17,
- TIMESTAMP_INSTANT = 18
+ TIMESTAMP_INSTANT = 18,
+ GEOMETRY = 19,
+ GEOGRAPHY = 20
};
class Type {
@@ -59,6 +73,10 @@ namespace orc {
virtual uint64_t getMaximumLength() const = 0;
virtual uint64_t getPrecision() const = 0;
virtual uint64_t getScale() const = 0;
+ // for geospatial types only
+ virtual const std::string& getCrs() const = 0;
+ // for geography type only
+ virtual geospatial::EdgeInterpolationAlgorithm getAlgorithm() const = 0;
virtual Type& setAttribute(const std::string& key, const std::string& value) = 0;
virtual bool hasAttributeKey(const std::string& key) const = 0;
virtual Type& removeAttribute(const std::string& key) = 0;
@@ -115,6 +133,10 @@ namespace orc {
std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements);
std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key, std::unique_ptr<Type> value);
std::unique_ptr<Type> createUnionType();
+ std::unique_ptr<Type> createGeometryType(const std::string& crs = "OGC:CRS84");
+ std::unique_ptr<Type> createGeographyType(
+ const std::string& crs = "OGC:CRS84",
+ geospatial::EdgeInterpolationAlgorithm algo = geospatial::SPHERICAL);
} // namespace orc
#endif
diff --git a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh
index 00fd5ce66dd..2e062454d92 100644
--- a/contrib/libs/apache/orc/c++/include/orc/orc-config.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/orc-config.hh
@@ -19,7 +19,7 @@
#ifndef ORC_CONFIG_HH
#define ORC_CONFIG_HH
-#define ORC_VERSION "2.1.3"
+#define ORC_VERSION "2.2.0"
#define ORC_CXX_HAS_CSTDINT
diff --git a/contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh b/contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh
index 6493840a92c..2fa3ea04cb6 100644
--- a/contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh
+++ b/contrib/libs/apache/orc/c++/include/orc/sargs/SearchArgument.hh
@@ -251,6 +251,12 @@ namespace orc {
* @return the new SearchArgument
*/
virtual std::unique_ptr<SearchArgument> build() = 0;
+
+ /**
+ * Add a maybe leaf to the current item on the stack.
+ * @return this
+ */
+ virtual SearchArgumentBuilder& maybe() = 0;
};
/**
diff --git a/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh b/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh
index 00934ec2e42..b6a602493cf 100644
--- a/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh
+++ b/contrib/libs/apache/orc/c++/src/Adaptor-linux.hh
@@ -49,6 +49,12 @@ typedef SSIZE_T ssize_t;
ssize_t pread(int fd, void* buf, size_t count, off_t offset);
#endif
+// NO_SANITIZE_ATTR: function attribute that switches off the
+// "signed-integer-overflow" and "shift" sanitizer checks for the function it
+// precedes when compiling with GCC or Clang. On any other compiler it expands
+// to nothing.
+#if defined(__GNUC__) || defined(__clang__)
+ #define NO_SANITIZE_ATTR __attribute__((no_sanitize("signed-integer-overflow", "shift")))
+#else
+ #define NO_SANITIZE_ATTR
+#endif
+
#ifdef HAS_DIAGNOSTIC_PUSH
#ifdef __clang__
#define DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.cc b/contrib/libs/apache/orc/c++/src/BloomFilter.cc
index 887637223ac..025bdd8a033 100644
--- a/contrib/libs/apache/orc/c++/src/BloomFilter.cc
+++ b/contrib/libs/apache/orc/c++/src/BloomFilter.cc
@@ -208,7 +208,7 @@ namespace orc {
}
DIAGNOSTIC_POP
-
+ NO_SANITIZE_ATTR
void BloomFilterImpl::addHash(int64_t hash64) {
int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
// In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
@@ -226,6 +226,7 @@ namespace orc {
}
}
+ NO_SANITIZE_ATTR
bool BloomFilterImpl::testHash(int64_t hash64) const {
int32_t hash1 = static_cast<int32_t>(hash64 & 0xffffffff);
// In Java codes, we use "hash64 >>> 32" which is an unsigned shift op.
diff --git a/contrib/libs/apache/orc/c++/src/BloomFilter.hh b/contrib/libs/apache/orc/c++/src/BloomFilter.hh
index ebc4a5ee046..75fb02a0268 100644
--- a/contrib/libs/apache/orc/c++/src/BloomFilter.hh
+++ b/contrib/libs/apache/orc/c++/src/BloomFilter.hh
@@ -194,6 +194,7 @@ namespace orc {
// Thomas Wang's integer hash function
// http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
// Put this in header file so tests can use it as well.
+ NO_SANITIZE_ATTR
inline int64_t getLongHash(int64_t key) {
key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ (key >> 24);
diff --git a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc
index 8b16ecbd09d..6535c612ce8 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc
+++ b/contrib/libs/apache/orc/c++/src/ColumnPrinter.cc
@@ -254,6 +254,8 @@ namespace orc {
break;
case BINARY:
+ case GEOMETRY:
+ case GEOGRAPHY:
result = std::make_unique<BinaryColumnPrinter>(buffer, param);
break;
diff --git a/contrib/libs/apache/orc/c++/src/ColumnReader.cc b/contrib/libs/apache/orc/c++/src/ColumnReader.cc
index af434c37cad..89ff0e02452 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnReader.cc
+++ b/contrib/libs/apache/orc/c++/src/ColumnReader.cc
@@ -726,6 +726,9 @@ namespace orc {
if (totalBytes <= lastBufferLength_) {
// subtract the needed bytes from the ones left over
lastBufferLength_ -= totalBytes;
+ if (lastBuffer_ == nullptr) {
+ throw ParseError("StringDirectColumnReader::skip: lastBuffer_ is null");
+ }
lastBuffer_ += totalBytes;
} else {
// move the stream forward after accounting for the buffered bytes
@@ -780,7 +783,9 @@ namespace orc {
byteBatch.blob.resize(totalLength);
char* ptr = byteBatch.blob.data();
while (bytesBuffered + lastBufferLength_ < totalLength) {
- memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_);
+ if (lastBuffer_ != nullptr) {
+ memcpy(ptr + bytesBuffered, lastBuffer_, lastBufferLength_);
+ }
bytesBuffered += lastBufferLength_;
const void* readBuffer;
int readLength;
@@ -1742,6 +1747,8 @@ namespace orc {
case CHAR:
case STRING:
case VARCHAR:
+ case GEOMETRY:
+ case GEOGRAPHY:
switch (static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())) {
case proto::ColumnEncoding_Kind_DICTIONARY:
case proto::ColumnEncoding_Kind_DICTIONARY_V2:
diff --git a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc
index d31b1c65d4e..b9aac1a1214 100644
--- a/contrib/libs/apache/orc/c++/src/ColumnWriter.cc
+++ b/contrib/libs/apache/orc/c++/src/ColumnWriter.cc
@@ -17,10 +17,14 @@
*/
#include "orc/Int128.hh"
+#include "orc/Statistics.hh"
+#include "orc/Type.hh"
#include "orc/Writer.hh"
+#include <memory>
#include "ByteRLE.hh"
#include "ColumnWriter.hh"
+#include "Dictionary.hh"
#include "RLE.hh"
#include "Statistics.hh"
#include "Timezone.hh"
@@ -922,148 +926,6 @@ namespace orc {
ColumnWriter::finishStreams();
dataStream_->finishStream();
}
-
- /**
- * Implementation of increasing sorted string dictionary
- */
- class SortedStringDictionary {
- public:
- struct DictEntry {
- DictEntry(const char* str, size_t len) : data(str), length(len) {}
- const char* data;
- size_t length;
- };
-
- struct DictEntryWithIndex {
- DictEntryWithIndex(const char* str, size_t len, size_t index)
- : entry(str, len), index(index) {}
- DictEntry entry;
- size_t index;
- };
-
- SortedStringDictionary() : totalLength_(0) {}
-
- // insert a new string into dictionary, return its insertion order
- size_t insert(const char* str, size_t len);
-
- // write dictionary data & length to output buffer
- void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const;
-
- // reorder input index buffer from insertion order to dictionary order
- void reorder(std::vector<int64_t>& idxBuffer) const;
-
- // get dict entries in insertion order
- void getEntriesInInsertionOrder(std::vector<const DictEntry*>&) const;
-
- // return count of entries
- size_t size() const;
-
- // return total length of strings in the dictioanry
- uint64_t length() const;
-
- void clear();
-
- private:
- struct LessThan {
- bool operator()(const DictEntryWithIndex& l, const DictEntryWithIndex& r) {
- const auto& left = l.entry;
- const auto& right = r.entry;
- int ret = memcmp(left.data, right.data, std::min(left.length, right.length));
- if (ret != 0) {
- return ret < 0;
- }
- return left.length < right.length;
- }
- };
-
- mutable std::vector<DictEntryWithIndex> flatDict_;
- std::unordered_map<std::string, size_t> keyToIndex_;
- uint64_t totalLength_;
-
- // use friend class here to avoid being bothered by const function calls
- friend class StringColumnWriter;
- friend class CharColumnWriter;
- friend class VarCharColumnWriter;
- // store indexes of insertion order in the dictionary for not-null rows
- std::vector<int64_t> idxInDictBuffer_;
- };
-
- // insert a new string into dictionary, return its insertion order
- size_t SortedStringDictionary::insert(const char* str, size_t len) {
- size_t index = flatDict_.size();
- auto ret = keyToIndex_.emplace(std::string(str, len), index);
- if (ret.second) {
- flatDict_.emplace_back(ret.first->first.data(), ret.first->first.size(), index);
- totalLength_ += len;
- }
- return ret.first->second;
- }
-
- // write dictionary data & length to output buffer
- void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
- RleEncoder* lengthEncoder) const {
- std::sort(flatDict_.begin(), flatDict_.end(), LessThan());
-
- for (const auto& entryWithIndex : flatDict_) {
- const auto& entry = entryWithIndex.entry;
- dataStream->write(entry.data, entry.length);
- lengthEncoder->write(static_cast<int64_t>(entry.length));
- }
- }
-
- /**
- * Reorder input index buffer from insertion order to dictionary order
- *
- * We require this function because string values are buffered by indexes
- * in their insertion order. Until the entire dictionary is complete can
- * we get their sorted indexes in the dictionary in that ORC specification
- * demands dictionary should be ordered. Therefore this function transforms
- * the indexes from insertion order to dictionary value order for final
- * output.
- */
- void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const {
- // iterate the dictionary to get mapping from insertion order to value order
- std::vector<size_t> mapping(flatDict_.size());
- for (size_t i = 0; i < flatDict_.size(); ++i) {
- mapping[flatDict_[i].index] = i;
- }
-
- // do the transformation
- for (size_t i = 0; i != idxBuffer.size(); ++i) {
- idxBuffer[i] = static_cast<int64_t>(mapping[static_cast<size_t>(idxBuffer[i])]);
- }
- }
-
- // get dict entries in insertion order
- void SortedStringDictionary::getEntriesInInsertionOrder(
- std::vector<const DictEntry*>& entries) const {
- std::sort(flatDict_.begin(), flatDict_.end(),
- [](const DictEntryWithIndex& left, const DictEntryWithIndex& right) {
- return left.index < right.index;
- });
-
- entries.resize(flatDict_.size());
- for (size_t i = 0; i < flatDict_.size(); ++i) {
- entries[i] = &(flatDict_[i].entry);
- }
- }
-
- // return count of entries
- size_t SortedStringDictionary::size() const {
- return flatDict_.size();
- }
-
- // return total length of strings in the dictioanry
- uint64_t SortedStringDictionary::length() const {
- return totalLength_;
- }
-
- void SortedStringDictionary::clear() {
- totalLength_ = 0;
- keyToIndex_.clear();
- flatDict_.clear();
- }
-
class StringColumnWriter : public ColumnWriter {
public:
StringColumnWriter(const Type& type, const StreamsFactory& factory,
@@ -1413,12 +1275,11 @@ namespace orc {
dictionary.getEntriesInInsertionOrder(entries);
// store each length of the data into a vector
- const SortedStringDictionary::DictEntry* dictEntry = nullptr;
for (uint64_t i = 0; i != dictionary.idxInDictBuffer_.size(); ++i) {
// write one row data in direct encoding
- dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])];
- directDataStream->write(dictEntry->data, dictEntry->length);
- directLengthEncoder->write(static_cast<int64_t>(dictEntry->length));
+ const auto& dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])];
+ directDataStream->write(dictEntry->data->data(), dictEntry->data->size());
+ directLengthEncoder->write(static_cast<int64_t>(dictEntry->data->size()));
}
deleteDictStreams();
@@ -2871,6 +2732,65 @@ namespace orc {
}
}
+ // Column writer used for both GEOMETRY and GEOGRAPHY columns (see
+ // buildWriter). Values arrive as the raw bytes of a StringVectorBatch and
+ // are written to the data stream and length encoder; for GEOMETRY columns
+ // every non-null value is additionally fed into the geospatial column
+ // statistics.
+ class GeospatialColumnWriter : public BinaryColumnWriter {
+ public:
+ GeospatialColumnWriter(const Type& type, const StreamsFactory& factory,
+ const WriterOptions& options)
+ : BinaryColumnWriter(type, factory, options),
+ isGeometry_(type.getKind() == TypeKind::GEOMETRY) {}
+
+ // Append numValues rows of rowBatch, starting at row `offset`.
+ // Throws InvalidArgument when rowBatch is not a StringVectorBatch.
+ virtual void add(ColumnVectorBatch& rowBatch, uint64_t offset, uint64_t numValues,
+ const char* incomingMask) override {
+ // ColumnWriter::add is called directly (not BinaryColumnWriter::add); the
+ // per-value work is done by the loop below.
+ ColumnWriter::add(rowBatch, offset, numValues, incomingMask);
+
+ const StringVectorBatch* strBatch = dynamic_cast<const StringVectorBatch*>(&rowBatch);
+ if (strBatch == nullptr) {
+ throw InvalidArgument("Failed to cast to StringVectorBatch");
+ }
+ // Views of the batch that start at the first row to write.
+ auto data = &strBatch->data[offset];
+ auto length = &strBatch->length[offset];
+ // A null notNull pointer means that every row in the batch is present.
+ const char* notNull = strBatch->hasNulls ? strBatch->notNull.data() + offset : nullptr;
+
+ bool hasNull = false;
+ // Statistics are only looked up for GEOMETRY columns. For GEOGRAPHY columns
+ // geoStats stays null and all statistics updates below are skipped.
+ // NOTE(review): geoStats is also null if the dynamic_cast fails, in which
+ // case statistics are skipped silently - confirm that this is intended.
+ GeospatialColumnStatisticsImpl* geoStats = nullptr;
+ if (isGeometry_) {
+ geoStats = dynamic_cast<GeospatialColumnStatisticsImpl*>(colIndexStatistics.get());
+ }
+
+ // Number of non-null values seen while geoStats is available.
+ uint64_t count = 0;
+ for (uint64_t i = 0; i < numValues; ++i) {
+ if (notNull == nullptr || notNull[i]) {
+ uint64_t len = static_cast<uint64_t>(length[i]);
+ directDataStream->write(data[i], len);
+
+ // update stats
+ if (geoStats) {
+ ++count;
+ geoStats->update(data[i], len);
+ }
+
+ if (enableBloomFilter) {
+ bloomFilter->addBytes(data[i], length[i]);
+ }
+ } else if (!hasNull) {
+ // Only the first null row of the batch is reported to the statistics.
+ hasNull = true;
+ if (geoStats) {
+ geoStats->setHasNull(hasNull);
+ }
+ }
+ }
+
+ // Lengths are encoded for the whole range in one call, with notNull
+ // passed along as the mask.
+ directLengthEncoder->add(length, numValues, notNull);
+
+ if (geoStats) {
+ geoStats->increase(count);
+ }
+ }
+
+ private:
+ // True when the column kind is GEOMETRY, false otherwise (GEOGRAPHY).
+ bool isGeometry_;
+ };
+
std::unique_ptr<ColumnWriter> buildWriter(const Type& type, const StreamsFactory& factory,
const WriterOptions& options) {
switch (static_cast<int64_t>(type.getKind())) {
@@ -2941,6 +2861,9 @@ namespace orc {
return std::make_unique<MapColumnWriter>(type, factory, options);
case UNION:
return std::make_unique<UnionColumnWriter>(type, factory, options);
+ case GEOMETRY:
+ case GEOGRAPHY:
+ return std::make_unique<GeospatialColumnWriter>(type, factory, options);
default:
throw NotImplementedYet(
"Type is not supported yet for creating "
diff --git a/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc
index c0f88246e80..7db5b889548 100644
--- a/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc
+++ b/contrib/libs/apache/orc/c++/src/ConvertColumnReader.cc
@@ -126,13 +126,13 @@ namespace orc {
bool shouldThrow) {
constexpr bool isFileTypeFloatingPoint(std::is_floating_point<FileType>::value);
constexpr bool isReadTypeFloatingPoint(std::is_floating_point<ReadType>::value);
- int64_t longValue = static_cast<int64_t>(srcValue);
+
if (isFileTypeFloatingPoint) {
if (isReadTypeFloatingPoint) {
destValue = static_cast<ReadType>(srcValue);
} else {
if (!canFitInLong(static_cast<double>(srcValue)) ||
- !downCastToInteger(destValue, longValue)) {
+ !downCastToInteger(destValue, static_cast<int64_t>(srcValue))) {
handleOverflow<FileType, ReadType>(destBatch, idx, shouldThrow);
}
}
diff --git a/contrib/libs/apache/orc/c++/src/Dictionary.cc b/contrib/libs/apache/orc/c++/src/Dictionary.cc
new file mode 100644
index 00000000000..9eb60bb5bfa
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/Dictionary.cc
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Dictionary.hh"
+
+namespace orc {
+
+  // Insert a string into the dictionary and return its insertion-order index.
+  // A string that is already present returns the index it was given on its
+  // first insertion and leaves the dictionary unchanged.
+  size_t SortedStringDictionary::insert(const char* str, size_t len) {
+    const std::string_view probe{str, len};
+    const auto found = keyToIndex_.find(probe);
+    if (found != keyToIndex_.end()) {
+      return found->second;
+    }
+
+    // New value: its index is the number of entries stored so far.
+    const size_t newIndex = flatDict_.size();
+    flatDict_.emplace_back(str, len, newIndex);
+    totalLength_ += len;
+
+    // Key the map on the entry's own copy of the bytes, not on the caller's
+    // buffer.
+    const std::string& stored = *flatDict_.back().entry.data;
+    keyToIndex_.emplace(std::string_view{stored.data(), stored.size()}, newIndex);
+    return newIndex;
+  }
+
+  // Sort the dictionary into value order, then write every entry's bytes to
+  // dataStream and its length to lengthEncoder, in that order.
+  void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
+                                     RleEncoder* lengthEncoder) const {
+    // flatDict_ is declared mutable, so it can be sorted in place from this
+    // const member function.
+    std::sort(flatDict_.begin(), flatDict_.end(), LessThan());
+
+    for (const DictEntryWithIndex& item : flatDict_) {
+      const std::string& value = *item.entry.data;
+      dataStream->write(value.data(), value.size());
+      lengthEncoder->write(static_cast<int64_t>(value.size()));
+    }
+  }
+
+  /**
+   * Translate buffered indexes from insertion order to dictionary order.
+   *
+   * Values are buffered by the index they were given on insertion, but the
+   * ORC specification demands an ordered dictionary, and the sorted position
+   * of a value is only known once the dictionary is complete. This function
+   * rewrites every index in idxBuffer to the position the corresponding
+   * entry currently occupies in the dictionary.
+   */
+  void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const {
+    // insertionToCurrent[k] is the current position of the k-th inserted entry
+    std::vector<size_t> insertionToCurrent(flatDict_.size());
+    size_t position = 0;
+    for (const DictEntryWithIndex& item : flatDict_) {
+      insertionToCurrent[item.index] = position++;
+    }
+
+    // rewrite the buffered indexes in place
+    for (int64_t& idx : idxBuffer) {
+      idx = static_cast<int64_t>(insertionToCurrent[static_cast<size_t>(idx)]);
+    }
+  }
+
+  // Fill `entries` so that entries[k] points at the k-th inserted entry.
+  //
+  // Each entry is placed by its recorded insertion index instead of by its
+  // position in flatDict_, so the result is in insertion order whether or not
+  // flush() has already sorted flatDict_ into value order. The pointers refer
+  // to elements of flatDict_ and are only meaningful until flatDict_ is next
+  // modified or reordered (insert, flush, clear).
+  void SortedStringDictionary::getEntriesInInsertionOrder(
+      std::vector<const DictEntry*>& entries) const {
+    entries.assign(flatDict_.size(), nullptr);
+    for (const DictEntryWithIndex& item : flatDict_) {
+      // insert() hands out the indexes 0..size()-1 exactly once each, so
+      // item.index is always in range and every slot gets written.
+      entries[item.index] = &item.entry;
+    }
+  }
+
+ // return count of entries
+ // (insert() only adds an entry for a string that is not present yet, so this
+ // is the number of distinct strings held by the dictionary)
+ size_t SortedStringDictionary::size() const {
+ return flatDict_.size();
+ }
+
+ // return total length of strings in the dictionary
+ // (the sum of the byte lengths of the stored entries; a string that is
+ // inserted several times is counted once)
+ uint64_t SortedStringDictionary::length() const {
+ return totalLength_;
+ }
+
+ // Remove all entries and reset the accumulated length to zero.
+ // idxInDictBuffer_ is not touched by this function.
+ void SortedStringDictionary::clear() {
+ totalLength_ = 0;
+ keyToIndex_.clear();
+ flatDict_.clear();
+ }
+} // namespace orc \ No newline at end of file
diff --git a/contrib/libs/apache/orc/c++/src/Dictionary.hh b/contrib/libs/apache/orc/c++/src/Dictionary.hh
new file mode 100644
index 00000000000..c8d6eb198cb
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/Dictionary.hh
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#ifdef BUILD_SPARSEHASH
+#error #include <sparsehash/dense_hash_map>
+#else
+#include <unordered_map>
+#endif
+
+#include "RLE.hh"
+
+namespace orc {
+ /**
+ * Implementation of increasing sorted string dictionary
+ *
+ * Strings are collected with insert(), which assigns each distinct string an
+ * insertion-order index. flush() sorts the entries by value and writes them
+ * out; reorder() converts buffered insertion-order indexes to positions in
+ * the dictionary.
+ */
+ class SortedStringDictionary {
+ public:
+ // One dictionary value. The bytes are copied into a heap-allocated
+ // std::string owned through a unique_ptr; keyToIndex_ stores string_views
+ // that point into this string.
+ struct DictEntry {
+ DictEntry(const char* str, size_t len) : data(std::make_unique<std::string>(str, len)) {}
+
+ std::unique_ptr<std::string> data;
+ };
+
+ // A dictionary value together with its insertion-order index.
+ struct DictEntryWithIndex {
+ DictEntryWithIndex(const char* str, size_t len, size_t index)
+ : entry(str, len), index(index) {}
+
+ DictEntry entry;
+ size_t index;
+ };
+
+ SortedStringDictionary() : totalLength_(0) {
+#ifdef BUILD_SPARSEHASH
+ /// Need to set empty key otherwise dense_hash_map will not work correctly
+ keyToIndex_.set_empty_key(std::string_view{});
+#endif
+ }
+
+ // insert a new string into dictionary, return its insertion order
+ // (an already present string returns the index of its first insertion)
+ size_t insert(const char* str, size_t len);
+
+ // write dictionary data & length to output buffer
+ // (sorts the entries by value first, which is why flatDict_ is mutable)
+ void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const;
+
+ // reorder input index buffer from insertion order to dictionary order
+ void reorder(std::vector<int64_t>& idxBuffer) const;
+
+ // get dict entries in insertion order
+ void getEntriesInInsertionOrder(std::vector<const DictEntry*>&) const;
+
+ // return count of entries
+ size_t size() const;
+
+ // return total length of strings in the dictionary
+ uint64_t length() const;
+
+ // remove all entries and reset the total length to zero
+ void clear();
+
+ private:
+ // Orders entries by their string value.
+ struct LessThan {
+ bool operator()(const DictEntryWithIndex& l, const DictEntryWithIndex& r) {
+ return *l.entry.data < *r.entry.data; // use std::string's operator<
+ }
+ };
+ // store dictionary entries in insertion order
+ // (flush() re-sorts this vector by value; it is mutable so that the const
+ // flush() can sort it in place)
+ mutable std::vector<DictEntryWithIndex> flatDict_;
+
+#ifdef BUILD_SPARSEHASH
+ // map from string to its insertion order index
+ google::dense_hash_map<std::string_view, size_t> keyToIndex_;
+#else
+ // map from string to its insertion order index; the keys are views into
+ // the strings owned by the entries of flatDict_
+ std::unordered_map<std::string_view, size_t> keyToIndex_;
+#endif
+
+ // sum of the lengths of the stored strings
+ uint64_t totalLength_;
+
+ // use friend class here to avoid being bothered by const function calls
+ friend class StringColumnWriter;
+ friend class CharColumnWriter;
+ friend class VarCharColumnWriter;
+ // store indexes of insertion order in the dictionary for not-null rows
+ std::vector<int64_t> idxInDictBuffer_;
+ };
+
+} // namespace orc
diff --git a/contrib/libs/apache/orc/c++/src/Geospatial.cc b/contrib/libs/apache/orc/c++/src/Geospatial.cc
new file mode 100644
index 00000000000..2b110cacb63
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/Geospatial.cc
@@ -0,0 +1,307 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This file contains code adapted from the Apache Arrow project.
+ *
+ * Original source:
+ * https://github.com/apache/arrow/blob/main/cpp/src/parquet/geospatial/statistics.cc
+ *
+ * The original code is licensed under the Apache License, Version 2.0.
+ *
+ * Modifications may have been made from the original source.
+ */
+
+#include "orc/Geospatial.hh"
+#include "orc/Exceptions.hh"
+
+#include "Geospatial.hh"
+
+#include <algorithm>
+#include <cstring>
+#include <optional>
+#include <sstream>
+
+namespace orc::geospatial {
+
+ // Load a T from a possibly unaligned byte pointer by copying sizeof(T) bytes
+ // with memcpy. Only enabled for trivially copyable T. No bounds check is
+ // done here: the caller must make sure that sizeof(T) bytes are readable at
+ // `unaligned`.
+ template <typename T>
+ inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> safeLoadAs(const uint8_t* unaligned) {
+ std::remove_const_t<T> ret;
+ std::memcpy(&ret, unaligned, sizeof(T));
+ return ret;
+ }
+
+ // Return the bytes of `value` reinterpreted as a U (a byte-for-byte copy done
+ // with memcpy). Only enabled when T and U are both trivially copyable and
+ // have the same size.
+ template <typename U, typename T>
+ inline std::enable_if_t<std::is_trivially_copyable_v<T> && std::is_trivially_copyable_v<U> &&
+ sizeof(T) == sizeof(U),
+ U>
+ safeCopy(T value) {
+ std::remove_const_t<U> ret;
+ std::memcpy(&ret, static_cast<const void*>(&value), sizeof(T));
+ return ret;
+ }
+
+  // Report whether the host stores multi-byte integers least-significant
+  // byte first.
+  //
+  // The lowest-addressed byte of a known 32-bit pattern is inspected through
+  // memcpy instead of reading the inactive member of a union, which is
+  // undefined behaviour in C++.
+  static bool isLittleEndian() {
+    const uint32_t probe = 0x01020304;
+    unsigned char lowestAddressedByte = 0;
+    std::memcpy(&lowestAddressedByte, &probe, sizeof(lowestAddressedByte));
+    return lowestAddressedByte == 0x04;
+  }
+
+#if defined(_MSC_VER)
+#include <intrin.h> // IWYU pragma: keep
+#define ORC_BYTE_SWAP64 _byteswap_uint64
+#define ORC_BYTE_SWAP32 _byteswap_ulong
+#else
+#define ORC_BYTE_SWAP64 __builtin_bswap64
+#define ORC_BYTE_SWAP32 __builtin_bswap32
+#endif
+
+ // Swap the byte order (i.e. endianness)
+ // of a 32-bit unsigned integer, using the compiler intrinsic selected by
+ // ORC_BYTE_SWAP32.
+ static inline uint32_t byteSwap(uint32_t value) {
+ return static_cast<uint32_t>(ORC_BYTE_SWAP32(value));
+ }
+ // Swap the byte order of a double: its bytes are copied into a uint64_t,
+ // swapped with ORC_BYTE_SWAP64 and copied back, so the value is never
+ // converted numerically.
+ static inline double byteSwap(double value) {
+ const uint64_t swapped = ORC_BYTE_SWAP64(safeCopy<uint64_t>(value));
+ return safeCopy<double>(swapped);
+ }
+
+  // Render the box as "BoundingBox{xMin=..., xMax=..., ...}", listing the
+  // min/max pair stored at indexes 0..3 under the labels x, y, z and m.
+  std::string BoundingBox::toString() const {
+    std::ostringstream out;
+    out << "BoundingBox{xMin=" << min[0] << ", xMax=" << max[0];
+    out << ", yMin=" << min[1] << ", yMax=" << max[1];
+    out << ", zMin=" << min[2] << ", zMax=" << max[2];
+    out << ", mMin=" << min[3] << ", mMax=" << max[3] << "}";
+    return out.str();
+  }
+
+  /// \brief Cursor over the bytes of one well-known binary (WKB) geometry
+  ///
+  /// Briefly, ISO well-known binary is an endian byte (0x01 or 0x00), followed
+  /// by geometry type + dimensions encoded as a uint32_t, followed by
+  /// geometry-specific data. Coordinate sequences are represented by a
+  /// uint32_t (the number of coordinates) plus a sequence of doubles (number
+  /// of coordinates multiplied by the number of dimensions).
+  ///
+  /// Every read advances the cursor and shrinks size(). A read that would run
+  /// past the end of the buffer throws ParseError instead of touching memory
+  /// outside of it.
+  class WKBBuffer {
+   public:
+    WKBBuffer() : data_(nullptr), size_(0) {}
+
+    /// \param data first byte of the WKB blob; not owned by the buffer
+    /// \param size number of readable bytes at data; a negative value is
+    ///        treated as an empty buffer
+    WKBBuffer(const uint8_t* data, int64_t size)
+        : data_(data), size_(size < 0 ? 0 : static_cast<size_t>(size)) {}
+
+    /// Read one byte. Throws ParseError if the buffer is exhausted.
+    uint8_t readUInt8() {
+      return readChecked<uint8_t>();
+    }
+
+    /// Read a 32-bit unsigned integer, byte-swapping it when swap is true.
+    /// Throws ParseError if fewer than four bytes remain.
+    uint32_t readUInt32(bool swap) {
+      auto value = readChecked<uint32_t>();
+      return swap ? byteSwap(value) : value;
+    }
+
+    /// Read nCoords coordinates of type Coord and pass each one to visit.
+    /// \param nCoords number of coordinates to read
+    /// \param swap whether every component has to be byte-swapped
+    /// \param visit callable invoked once per coordinate, in buffer order
+    /// \throws ParseError if fewer than nCoords * sizeof(Coord) bytes remain;
+    ///         nothing is consumed and visit is not called in that case
+    template <typename Coord, typename Visit>
+    void readCoords(uint32_t nCoords, bool swap, Visit&& visit) {
+      // Widen before multiplying so the byte count cannot wrap on platforms
+      // where size_t is 32 bits wide.
+      const uint64_t totalBytes = static_cast<uint64_t>(nCoords) * sizeof(Coord);
+      if (size_ < totalBytes) {
+        std::stringstream ss;
+        ss << "Can't read " << totalBytes << " bytes of coordinates from WKBBuffer with " << size_
+           << " remaining";
+        throw ParseError(ss.str());
+      }
+
+      // The whole sequence was validated above, so the per-coordinate reads
+      // can skip the bounds check. visit is invoked repeatedly and is
+      // therefore called as an lvalue rather than forwarded on every
+      // iteration.
+      for (uint32_t i = 0; i < nCoords; i++) {
+        Coord coord = readUnchecked<Coord>();
+        if (swap) {
+          for (auto& c : coord) {
+            c = byteSwap(c);
+          }
+        }
+        visit(coord);
+      }
+    }
+
+    /// Number of bytes that have not been consumed yet.
+    size_t size() const {
+      return size_;
+    }
+
+   private:
+    const uint8_t* data_;  // next unread byte
+    size_t size_;          // bytes remaining at data_
+
+    /// Read a T after checking that sizeof(T) bytes remain.
+    template <typename T>
+    T readChecked() {
+      if (size_ < sizeof(T)) {
+        std::stringstream ss;
+        ss << "Can't read " << sizeof(T) << " bytes from WKBBuffer with " << size_ << " remaining";
+        throw ParseError(ss.str());
+      }
+
+      return readUnchecked<T>();
+    }
+
+    /// Read a T and advance the cursor. The caller must already have checked
+    /// that sizeof(T) bytes remain.
+    template <typename T>
+    T readUnchecked() {
+      T out = safeLoadAs<T>(data_);
+      data_ += sizeof(T);
+      size_ -= sizeof(T);
+      return out;
+    }
+  };
+
+ using GeometryTypeAndDimensions = std::pair<GeometryType, Dimensions>;
+
+ namespace {
+
+    // Split a WKB geometry type code into its geometry type and its
+    // dimensions. Returns std::nullopt when either part lies outside the
+    // [VALUE_MIN, VALUE_MAX] range of its enum.
+    std::optional<GeometryTypeAndDimensions> parseGeometryType(uint32_t wkbGeometryType) {
+      // WKB geometry types are constructed on purpose so that the last three
+      // decimal digits are the geometry type and the rest is the dimensions
+      // code (e.g., LINESTRING ZM maps to 3002).
+      const uint32_t typePart = wkbGeometryType % 1000;
+      const uint32_t dimPart = wkbGeometryType / 1000;
+
+      const bool typeInRange = typePart >= static_cast<uint32_t>(GeometryType::VALUE_MIN) &&
+                               typePart <= static_cast<uint32_t>(GeometryType::VALUE_MAX);
+      const bool dimInRange = dimPart >= static_cast<uint32_t>(Dimensions::VALUE_MIN) &&
+                              dimPart <= static_cast<uint32_t>(Dimensions::VALUE_MAX);
+      if (!typeInRange || !dimInRange) {
+        return std::nullopt;
+      }
+
+      return GeometryTypeAndDimensions{static_cast<GeometryType>(typePart),
+                                       static_cast<Dimensions>(dimPart)};
+    }
+
+ } // namespace
+
+ // Return the geometry type codes collected so far as a new vector sorted in
+ // ascending order. The collection itself is left unchanged.
+ std::vector<int32_t> WKBGeometryBounder::geometryTypes() const {
+ std::vector<int32_t> out(geospatialTypes_.begin(), geospatialTypes_.end());
+ std::sort(out.begin(), out.end());
+ return out;
+ }
+
+ // Convenience overload: merge the WKB geometry held in a string_view by
+ // forwarding its bytes to the pointer/size overload. Does nothing when the
+ // bounder is already invalid.
+ void WKBGeometryBounder::mergeGeometry(std::string_view bytesWkb) {
+ if (!isValid_) {
+ return;
+ }
+ mergeGeometry(reinterpret_cast<const uint8_t*>(bytesWkb.data()), bytesWkb.size());
+ }
+
+ // Parse one WKB geometry of bytesSize bytes and merge it into this bounder.
+ // Does nothing when the bounder is already invalid. A ParseError raised
+ // while parsing is caught here and invalidates the bounder instead of
+ // propagating to the caller.
+ void WKBGeometryBounder::mergeGeometry(const uint8_t* bytesWkb, size_t bytesSize) {
+ if (!isValid_) {
+ return;
+ }
+ WKBBuffer src{bytesWkb, static_cast<int64_t>(bytesSize)};
+ try {
+ mergeGeometryInternal(&src, /*record_wkb_type=*/true);
+ } catch (const ParseError&) {
+ invalidate();
+ return;
+ }
+ if (src.size() != 0) {
+ // Expected zero bytes to remain after consuming the WKB; trailing bytes
+ // make the input invalid.
+ invalidate();
+ }
+ }
+
+ // Consume one geometry from src and merge its coordinates into the bounder.
+ // Multi-part geometries and collections are handled by calling this function
+ // again for every part.
+ //
+ // src: buffer positioned at the endian byte of the geometry
+ // recordWkbType: when true, the geometry's WKB type code is added to
+ // geospatialTypes_
+ //
+ // An unrecognised type code invalidates the bounder and returns without
+ // consuming anything further. Reads from src can throw ParseError.
+ // NOTE(review): nothing in this function limits the nesting depth of
+ // collections - confirm whether the input can be nested deeply enough for the
+ // recursion to become a problem.
+ void WKBGeometryBounder::mergeGeometryInternal(WKBBuffer* src, bool recordWkbType) {
+ uint8_t endian = src->readUInt8();
+ // Endian byte 0x01 requires no swap on a little-endian host and 0x00
+ // requires no swap otherwise; any other combination is swapped.
+ bool swap = endian != 0x00;
+ if (isLittleEndian()) {
+ swap = endian != 0x01;
+ }
+
+ uint32_t wkbGeometryType = src->readUInt32(swap);
+ auto geometryTypeAndDimensions = parseGeometryType(wkbGeometryType);
+ if (!geometryTypeAndDimensions.has_value()) {
+ invalidate();
+ return;
+ }
+ auto& [geometry_type, dimensions] = geometryTypeAndDimensions.value();
+
+ // Keep track of geometry types encountered if at the top level
+ if (recordWkbType) {
+ geospatialTypes_.insert(static_cast<int32_t>(wkbGeometryType));
+ }
+
+ switch (geometry_type) {
+ case GeometryType::POINT:
+ // A point is a single coordinate without a count prefix.
+ mergeSequence(src, dimensions, 1, swap);
+ break;
+
+ case GeometryType::LINESTRING: {
+ uint32_t nCoords = src->readUInt32(swap);
+ mergeSequence(src, dimensions, nCoords, swap);
+ break;
+ }
+ case GeometryType::POLYGON: {
+ // A part count followed by one counted coordinate sequence per part.
+ uint32_t n_parts = src->readUInt32(swap);
+ for (uint32_t i = 0; i < n_parts; i++) {
+ uint32_t nCoords = src->readUInt32(swap);
+ mergeSequence(src, dimensions, nCoords, swap);
+ }
+ break;
+ }
+
+ // These are all encoded the same in WKB, even though this encoding would
+ // allow for parts to be of a different geometry type or different dimensions.
+ // For the purposes of bounding, this does not cause us problems. We pass
+ // recordWkbType = false because we do not want the child geometry to be
+ // added to the geometry types set (e.g., for a MultiPoint, we only want
+ // the code for MultiPoint to be added, not the code for Point).
+ case GeometryType::MULTIPOINT:
+ case GeometryType::MULTILINESTRING:
+ case GeometryType::MULTIPOLYGON:
+ case GeometryType::GEOMETRYCOLLECTION: {
+ uint32_t n_parts = src->readUInt32(swap);
+ for (uint32_t i = 0; i < n_parts; i++) {
+ mergeGeometryInternal(src, /*record_wkb_type*/ false);
+ }
+ break;
+ }
+ }
+ }
+
+ // Read nCoords coordinates from src and merge each one into the bounding box.
+ // The coordinate type that is read and the BoundingBox update function that
+ // is called are selected by `dimensions`; `swap` is passed on to
+ // WKBBuffer::readCoords. A dimensions value other than XY, XYZ, XYM or XYZM
+ // invalidates the bounder without reading from src.
+ void WKBGeometryBounder::mergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords,
+ bool swap) {
+ switch (dimensions) {
+ case Dimensions::XY:
+ src->readCoords<BoundingBox::XY>(nCoords, swap,
+ [&](BoundingBox::XY coord) { box_.updateXY(coord); });
+ break;
+ case Dimensions::XYZ:
+ src->readCoords<BoundingBox::XYZ>(nCoords, swap,
+ [&](BoundingBox::XYZ coord) { box_.updateXYZ(coord); });
+ break;
+ case Dimensions::XYM:
+ src->readCoords<BoundingBox::XYM>(nCoords, swap,
+ [&](BoundingBox::XYM coord) { box_.updateXYM(coord); });
+ break;
+ case Dimensions::XYZM:
+ src->readCoords<BoundingBox::XYZM>(
+ nCoords, swap, [&](BoundingBox::XYZM coord) { box_.updateXYZM(coord); });
+ break;
+ default:
+ invalidate();
+ }
+ }
+
+} // namespace orc::geospatial
diff --git a/contrib/libs/apache/orc/c++/src/Geospatial.hh b/contrib/libs/apache/orc/c++/src/Geospatial.hh
new file mode 100644
index 00000000000..aebb72747a7
--- /dev/null
+++ b/contrib/libs/apache/orc/c++/src/Geospatial.hh
@@ -0,0 +1,86 @@
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ORC_GEOSPATIAL_IMPL_HH
+#define ORC_GEOSPATIAL_IMPL_HH
+
+#include "orc/Geospatial.hh"
+
+#include <unordered_set>
+#include <vector>
+
+namespace orc {
+ namespace geospatial {
+ class WKBBuffer;
+
+ // Accumulates a bounding box and the set of geometry type codes over a series
+ // of geometries. The bounder carries a validity flag: once invalidated, the box
+ // is invalidated and the recorded types are dropped until reset() is called.
+ class WKBGeometryBounder {
+  public:
+   // Merge a single geometry supplied as a byte buffer (presumably WKB-encoded,
+   // per the parameter name; the definitions live in Geospatial.cc).
+   void mergeGeometry(std::string_view bytesWkb);
+   void mergeGeometry(const uint8_t* bytesWkb, size_t bytesSize);
+
+   // Merge an already computed bounding box into the accumulated box.
+   // The validity flag and the recorded types are not touched.
+   void mergeBox(const BoundingBox& box) {
+     box_.merge(box);
+   }
+
+   // Add the given geometry type codes to the recorded set. The element type is
+   // int32_t so that it matches geospatialTypes_ and geometryTypes().
+   void mergeGeometryTypes(const std::vector<int32_t>& geospatialTypes) {
+     geospatialTypes_.insert(geospatialTypes.begin(), geospatialTypes.end());
+   }
+
+   // Merge another bounder into this one. If either side is invalid the result
+   // is invalid; otherwise boxes are merged and the type sets are unioned.
+   void merge(const WKBGeometryBounder& other) {
+     if (!isValid() || !other.isValid()) {
+       invalidate();
+       return;
+     }
+     box_.merge(other.box_);
+     geospatialTypes_.insert(other.geospatialTypes_.begin(), other.geospatialTypes_.end());
+   }
+
+   // Get the bounding box for the merged geometries.
+   const BoundingBox& bounds() const {
+     return box_;
+   }
+
+   // Get the set of geometry types encountered during merging.
+   // Returns a sorted vector of geometry type IDs.
+   std::vector<int32_t> geometryTypes() const;
+
+   // Return to the initial state: valid, reset box, no recorded types.
+   void reset() {
+     isValid_ = true;
+     box_.reset();
+     geospatialTypes_.clear();
+   }
+
+   bool isValid() const {
+     return isValid_;
+   }
+
+   // Mark the bounder as invalid, invalidate the box and drop the recorded types.
+   void invalidate() {
+     isValid_ = false;
+     box_.invalidate();
+     geospatialTypes_.clear();
+   }
+
+  private:
+   BoundingBox box_;
+   std::unordered_set<int32_t> geospatialTypes_;
+   bool isValid_ = true;
+
+   void mergeGeometryInternal(WKBBuffer* src, bool recordWkbType);
+   void mergeSequence(WKBBuffer* src, Dimensions dimensions, uint32_t nCoords, bool swap);
+ };
+ } // namespace geospatial
+} // namespace orc
+
+#endif
diff --git a/contrib/libs/apache/orc/c++/src/Int128.cc b/contrib/libs/apache/orc/c++/src/Int128.cc
index 1e059fd4e22..0d4da78b5a7 100644
--- a/contrib/libs/apache/orc/c++/src/Int128.cc
+++ b/contrib/libs/apache/orc/c++/src/Int128.cc
@@ -25,6 +25,41 @@
#include <sstream>
namespace orc {
+ // In-place left shift of the 128-bit value (highbits_:lowbits_) by `bits`.
+ //  - bits == 0: no change.
+ //  - 0 < bits < 64: both words are shifted and the top `bits` bits of lowbits_
+ //    are carried into highbits_.
+ //  - 64 <= bits < 128: highbits_ becomes lowbits_ shifted by (bits - 64) and
+ //    lowbits_ becomes 0.
+ //  - bits >= 128: the value becomes 0.
+ // NO_SANITIZE_ATTR is defined elsewhere; presumably it silences sanitizer
+ // reports for the shifts below -- TODO confirm.
+ NO_SANITIZE_ATTR
+ Int128& Int128::operator<<=(uint32_t bits) {
+ if (bits != 0) {
+ if (bits < 64) {
+ highbits_ <<= bits;
+ highbits_ |= (lowbits_ >> (64 - bits));
+ lowbits_ <<= bits;
+ } else if (bits < 128) {
+ highbits_ = static_cast<int64_t>(lowbits_) << (bits - 64);
+ lowbits_ = 0;
+ } else {
+ highbits_ = 0;
+ lowbits_ = 0;
+ }
+ }
+ return *this;
+ }
+
+ // In-place right shift of the 128-bit value (highbits_:lowbits_) by `bits`.
+ //  - bits == 0: no change.
+ //  - 0 < bits < 64: lowbits_ is shifted right and receives the low `bits` bits
+ //    of highbits_; highbits_ is shifted as an unsigned value (zero-filled).
+ //  - 64 <= bits < 128: lowbits_ becomes highbits_ shifted right (as a signed
+ //    value) by (bits - 64); highbits_ becomes 0 or -1 depending on its sign.
+ //  - bits >= 128: both words become all-zeros or all-ones depending on the
+ //    sign of highbits_.
+ // NOTE(review): the < 64 branch zero-fills highbits_ while the other branches
+ // sign-fill it -- confirm this asymmetry is intended for negative values.
+ NO_SANITIZE_ATTR
+ Int128& Int128::operator>>=(uint32_t bits) {
+ if (bits != 0) {
+ if (bits < 64) {
+ lowbits_ >>= bits;
+ lowbits_ |= static_cast<uint64_t>(highbits_ << (64 - bits));
+ highbits_ = static_cast<int64_t>(static_cast<uint64_t>(highbits_) >> bits);
+ } else if (bits < 128) {
+ lowbits_ = static_cast<uint64_t>(highbits_ >> (bits - 64));
+ highbits_ = highbits_ >= 0 ? 0 : -1l;
+ } else {
+ highbits_ = highbits_ >= 0 ? 0 : -1l;
+ lowbits_ = static_cast<uint64_t>(highbits_);
+ }
+ }
+ return *this;
+ }
Int128 Int128::maximumValue() {
return Int128(0x7fffffffffffffff, 0xffffffffffffffff);
diff --git a/contrib/libs/apache/orc/c++/src/RLE.cc b/contrib/libs/apache/orc/c++/src/RLE.cc
index cb831c80f71..b29ee8706f4 100644
--- a/contrib/libs/apache/orc/c++/src/RLE.cc
+++ b/contrib/libs/apache/orc/c++/src/RLE.cc
@@ -78,7 +78,7 @@ namespace orc {
}
void RleEncoder::writeVslong(int64_t val) {
- writeVulong((val << 1) ^ (val >> 63));
+ writeVulong((static_cast<uint64_t>(val) << 1) ^ (val >> 63));
}
void RleEncoder::writeVulong(int64_t val) {
diff --git a/contrib/libs/apache/orc/c++/src/RLE.hh b/contrib/libs/apache/orc/c++/src/RLE.hh
index e46504e8850..42c1d5b0e1f 100644
--- a/contrib/libs/apache/orc/c++/src/RLE.hh
+++ b/contrib/libs/apache/orc/c++/src/RLE.hh
@@ -25,9 +25,8 @@
#include <memory>
namespace orc {
-
inline int64_t zigZag(int64_t value) {
- return (value << 1) ^ (value >> 63);
+ return ((static_cast<uint64_t>(value) << 1) ^ (value >> 63));
}
inline int64_t unZigZag(uint64_t value) {
diff --git a/contrib/libs/apache/orc/c++/src/RLEv2.hh b/contrib/libs/apache/orc/c++/src/RLEv2.hh
index 8ceb7f125b2..c2ce5aa8514 100644
--- a/contrib/libs/apache/orc/c++/src/RLEv2.hh
+++ b/contrib/libs/apache/orc/c++/src/RLEv2.hh
@@ -123,6 +123,7 @@ namespace orc {
int64_t* zigzagLiterals_;
int64_t* baseRedLiterals_;
int64_t* adjDeltas_;
+ static constexpr int64_t BASE_VALUE_LIMIT = int64_t(1) << 56;
uint32_t getOpCode(EncodingType encoding);
int64_t* prepareForDirectOrPatchedBase(EncodingOption& option);
diff --git a/contrib/libs/apache/orc/c++/src/Reader.cc b/contrib/libs/apache/orc/c++/src/Reader.cc
index f47c40ebbe0..7c35a834a5b 100644
--- a/contrib/libs/apache/orc/c++/src/Reader.cc
+++ b/contrib/libs/apache/orc/c++/src/Reader.cc
@@ -751,27 +751,35 @@ namespace orc {
return *(contents_->schema.get());
}
- std::unique_ptr<StripeStatistics> ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const {
+ std::unique_ptr<StripeStatistics> ReaderImpl::getStripeStatistics(uint64_t stripeIndex,
+ bool includeRowIndex) const {
if (!isMetadataLoaded_) {
readMetadata();
}
if (contents_->metadata == nullptr) {
throw std::logic_error("No stripe statistics in file");
}
- size_t num_cols = static_cast<size_t>(
- contents_->metadata->stripe_stats(static_cast<int>(stripeIndex)).col_stats_size());
- std::vector<std::vector<proto::ColumnStatistics>> indexStats(num_cols);
proto::StripeInformation currentStripeInfo = footer_->stripes(static_cast<int>(stripeIndex));
proto::StripeFooter currentStripeFooter = getStripeFooter(currentStripeInfo, *contents_.get());
- getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats);
-
const Timezone& writerTZ = currentStripeFooter.has_writer_timezone()
? getTimezoneByName(currentStripeFooter.writer_timezone())
: getLocalTimezone();
StatContext statContext(hasCorrectStatistics(), &writerTZ);
- return std::make_unique<StripeStatisticsImpl>(
+
+ if (!includeRowIndex) {
+ return std::make_unique<StripeStatisticsImpl>(
+ contents_->metadata->stripe_stats(static_cast<int>(stripeIndex)), statContext);
+ }
+
+ size_t num_cols = static_cast<size_t>(
+ contents_->metadata->stripe_stats(static_cast<int>(stripeIndex)).col_stats_size());
+ std::vector<std::vector<proto::ColumnStatistics>> indexStats(num_cols);
+
+ getRowIndexStatistics(currentStripeInfo, stripeIndex, currentStripeFooter, &indexStats);
+
+ return std::make_unique<StripeStatisticsWithRowGroupIndexImpl>(
contents_->metadata->stripe_stats(static_cast<int>(stripeIndex)), indexStats, statContext);
}
@@ -865,6 +873,8 @@ namespace orc {
case proto::Type_Kind_CHAR:
case proto::Type_Kind_STRING:
case proto::Type_Kind_VARCHAR:
+ case proto::Type_Kind_GEOMETRY:
+ case proto::Type_Kind_GEOGRAPHY:
return 4;
default:
return 0;
diff --git a/contrib/libs/apache/orc/c++/src/Reader.hh b/contrib/libs/apache/orc/c++/src/Reader.hh
index 39ca7396750..3d81d269203 100644
--- a/contrib/libs/apache/orc/c++/src/Reader.hh
+++ b/contrib/libs/apache/orc/c++/src/Reader.hh
@@ -330,7 +330,8 @@ namespace orc {
const std::string& getStreamName() const override;
- std::unique_ptr<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const override;
+ std::unique_ptr<StripeStatistics> getStripeStatistics(
+ uint64_t stripeIndex, bool includeRowIndex = true) const override;
std::unique_ptr<RowReader> createRowReader() const override;
diff --git a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc
index 1cda9ee91ea..91383bb5699 100644
--- a/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc
+++ b/contrib/libs/apache/orc/c++/src/RleEncoderV2.cc
@@ -423,7 +423,7 @@ namespace orc {
// fallback to DIRECT encoding.
// The decision to use patched base was based on zigzag values, but the
// actual patching is done on base reduced literals.
- if ((option.brBits100p - option.brBits95p) != 0) {
+ if ((option.brBits100p - option.brBits95p) != 0 && std::abs(option.min) < BASE_VALUE_LIMIT) {
option.encoding = PATCHED_BASE;
preparePatchedBlob(option);
return;
diff --git a/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc b/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc
index 7cf3b5c512d..442c43c228e 100644
--- a/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc
+++ b/contrib/libs/apache/orc/c++/src/SchemaEvolution.cc
@@ -18,6 +18,7 @@
#include "SchemaEvolution.hh"
#include "orc/Exceptions.hh"
+#include "orc/Type.hh"
namespace orc {
@@ -113,7 +114,9 @@ namespace orc {
case TIMESTAMP:
case TIMESTAMP_INSTANT:
case DATE:
- case BINARY: {
+ case BINARY:
+ case GEOMETRY:
+ case GEOGRAPHY: {
// Not support
break;
}
@@ -235,6 +238,8 @@ namespace orc {
case FLOAT:
case DOUBLE:
case BINARY:
+ case GEOMETRY:
+ case GEOGRAPHY:
case TIMESTAMP:
case LIST:
case MAP:
diff --git a/contrib/libs/apache/orc/c++/src/Statistics.cc b/contrib/libs/apache/orc/c++/src/Statistics.cc
index 76fd736b27e..a86247f1070 100644
--- a/contrib/libs/apache/orc/c++/src/Statistics.cc
+++ b/contrib/libs/apache/orc/c++/src/Statistics.cc
@@ -44,6 +44,8 @@ namespace orc {
return new DateColumnStatisticsImpl(s, statContext);
} else if (s.has_binary_statistics()) {
return new BinaryColumnStatisticsImpl(s, statContext);
+ } else if (s.has_geospatial_statistics()) {
+ return new GeospatialColumnStatisticsImpl(s);
} else {
return new ColumnStatisticsImpl(s);
}
@@ -81,11 +83,20 @@ namespace orc {
// PASS
}
- StripeStatisticsImpl::StripeStatisticsImpl(
+ // Builds stripe-level statistics only: columnStats_ is created from the
+ // serialized stripe statistics; no row-index statistics are stored here.
+ StripeStatisticsImpl::StripeStatisticsImpl(const proto::StripeStatistics& stripeStats,
+ const StatContext& statContext) {
+ columnStats_ = std::make_unique<StatisticsImpl>(stripeStats, statContext);
+ }
+
+ // Out-of-line destructor definition; the body is intentionally empty.
+ StripeStatisticsWithRowGroupIndexImpl::~StripeStatisticsWithRowGroupIndexImpl() {
+ // PASS
+ }
+
+ StripeStatisticsWithRowGroupIndexImpl::StripeStatisticsWithRowGroupIndexImpl(
const proto::StripeStatistics& stripeStats,
std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
- const StatContext& statContext) {
- columnStats_ = std::make_unique<StatisticsImpl>(stripeStats, statContext);
+ const StatContext& statContext)
+ : StripeStatisticsImpl(stripeStats, statContext) {
rowIndexStats_.resize(indexStats.size());
for (size_t i = 0; i < rowIndexStats_.size(); i++) {
for (size_t j = 0; j < indexStats[i].size(); j++) {
@@ -139,6 +150,10 @@ namespace orc {
// PASS
}
+ // Out-of-line destructor definition; the body is intentionally empty.
+ GeospatialColumnStatistics::~GeospatialColumnStatistics() {
+ // PASS
+ }
+
ColumnStatisticsImpl::~ColumnStatisticsImpl() {
// PASS
}
@@ -179,6 +194,10 @@ namespace orc {
// PASS
}
+ // Out-of-line destructor definition; the body is intentionally empty.
+ GeospatialColumnStatisticsImpl::~GeospatialColumnStatisticsImpl() {
+ // PASS
+ }
+
ColumnStatisticsImpl::ColumnStatisticsImpl(const proto::ColumnStatistics& pb) {
stats_.setNumberOfValues(pb.number_of_values());
stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
@@ -382,6 +401,40 @@ namespace orc {
}
}
+ // Builds geospatial column statistics from a serialized ColumnStatistics
+ // message.
+ //
+ // The generic counters (number of values, has-null) are copied from `pb` with
+ // the same defaulting as ColumnStatisticsImpl: a missing has_null is treated
+ // as true. If `pb` carries no geospatial statistics the bounder is invalidated.
+ // Otherwise the stored bounding box and geometry type codes are merged into
+ // the bounder; dimensions absent from the message stay NaN.
+ GeospatialColumnStatisticsImpl::GeospatialColumnStatisticsImpl(
+     const proto::ColumnStatistics& pb) {
+   reset();
+   // Without these the deserialized statistics would always report zero values.
+   stats_.setNumberOfValues(pb.number_of_values());
+   stats_.setHasNull(pb.has_has_null() ? pb.has_null() : true);
+
+   if (!pb.has_geospatial_statistics()) {
+     bounder_.invalidate();
+     return;
+   }
+
+   const proto::GeospatialStatistics& stats = pb.geospatial_statistics();
+   geospatial::BoundingBox::XYZM min;
+   geospatial::BoundingBox::XYZM max;
+   // Start from "unknown" in every dimension.
+   for (int i = 0; i < geospatial::MAX_DIMENSIONS; i++) {
+     min[i] = max[i] = std::numeric_limits<double>::quiet_NaN();
+   }
+   if (stats.has_bbox()) {
+     const auto& protoBBox = stats.bbox();
+     min[0] = protoBBox.xmin();
+     min[1] = protoBBox.ymin();
+     max[0] = protoBBox.xmax();
+     max[1] = protoBBox.ymax();
+     // Z and M are only taken when both ends of the range are present.
+     if (protoBBox.has_zmin() && protoBBox.has_zmax()) {
+       min[2] = protoBBox.zmin();
+       max[2] = protoBBox.zmax();
+     }
+     if (protoBBox.has_mmin() && protoBBox.has_mmax()) {
+       min[3] = protoBBox.mmin();
+       max[3] = protoBBox.mmax();
+     }
+   }
+   bounder_.mergeBox(geospatial::BoundingBox(min, max));
+   std::vector<int32_t> types(stats.geospatial_types().begin(),
+                              stats.geospatial_types().end());
+   bounder_.mergeGeometryTypes(types);
+ }
+
std::unique_ptr<MutableColumnStatistics> createColumnStatistics(const Type& type) {
switch (static_cast<int64_t>(type.getKind())) {
case BOOLEAN:
@@ -413,6 +466,9 @@ namespace orc {
return std::make_unique<TimestampColumnStatisticsImpl>();
case DECIMAL:
return std::make_unique<DecimalColumnStatisticsImpl>();
+ case GEOGRAPHY:
+ case GEOMETRY:
+ return std::make_unique<GeospatialColumnStatisticsImpl>();
default:
throw NotImplementedYet("Not supported type: " + type.toString());
}
diff --git a/contrib/libs/apache/orc/c++/src/Statistics.hh b/contrib/libs/apache/orc/c++/src/Statistics.hh
index 6f212c15ccd..94b1e5d2b2b 100644
--- a/contrib/libs/apache/orc/c++/src/Statistics.hh
+++ b/contrib/libs/apache/orc/c++/src/Statistics.hh
@@ -24,6 +24,7 @@
#include "orc/OrcFile.hh"
#include "orc/Reader.hh"
+#include "Geospatial.hh"
#include "Timezone.hh"
#include "TypeImpl.hh"
@@ -1683,6 +1684,127 @@ namespace orc {
}
};
+ // Column statistics for geospatial (GEOMETRY / GEOGRAPHY) columns.
+ // Combines the generic counters kept in stats_ (number of values, has-null)
+ // with a WKBGeometryBounder that tracks the bounding box and the set of
+ // geometry type codes of the values seen.
+ class GeospatialColumnStatisticsImpl : public GeospatialColumnStatistics,
+ public MutableColumnStatistics {
+ private:
+ // Bounding box / geometry type accumulator.
+ geospatial::WKBGeometryBounder bounder_;
+ // Generic counters; only the value count and has-null flag are used here.
+ InternalCharStatistics stats_;
+
+ public:
+ // Creates empty statistics (counters and bounder reset).
+ GeospatialColumnStatisticsImpl() {
+ reset();
+ }
+ // Creates statistics from a serialized protobuf message (defined in Statistics.cc).
+ explicit GeospatialColumnStatisticsImpl(const proto::ColumnStatistics& stats);
+ virtual ~GeospatialColumnStatisticsImpl();
+
+ uint64_t getNumberOfValues() const override {
+ return stats_.getNumberOfValues();
+ }
+
+ void setNumberOfValues(uint64_t value) override {
+ stats_.setNumberOfValues(value);
+ }
+
+ // Adds `count` to the number of values.
+ void increase(uint64_t count) override {
+ stats_.setNumberOfValues(stats_.getNumberOfValues() + count);
+ }
+
+ bool hasNull() const override {
+ return stats_.hasNull();
+ }
+
+ void setHasNull(bool hasNull) override {
+ stats_.setHasNull(hasNull);
+ }
+
+ // Merges another statistics object into this one. `other` must be a
+ // GeospatialColumnStatisticsImpl; the reference dynamic_cast throws
+ // std::bad_cast otherwise.
+ void merge(const MutableColumnStatistics& other) override {
+ const GeospatialColumnStatisticsImpl& geoStats =
+ dynamic_cast<const GeospatialColumnStatisticsImpl&>(other);
+ stats_.merge(geoStats.stats_);
+ bounder_.merge(geoStats.bounder_);
+ }
+
+ void reset() override {
+ stats_.reset();
+ bounder_.reset();
+ }
+
+ // Folds one value (passed as raw bytes) into the bounder. The value counter
+ // is not changed here; it is maintained through increase()/setNumberOfValues().
+ void update(const char* value, size_t length) override {
+ bounder_.mergeGeometry(std::string_view(value, length));
+ }
+
+ // Serializes the statistics. The geospatial_statistics sub-message is always
+ // created; the bbox is written only when both X and Y bounds are valid and
+ // non-empty, and Z / M ranges are added individually under the same condition.
+ void toProtoBuf(proto::ColumnStatistics& pbStats) const override {
+ pbStats.set_has_null(stats_.hasNull());
+ pbStats.set_number_of_values(stats_.getNumberOfValues());
+
+ proto::GeospatialStatistics* geoStats = pbStats.mutable_geospatial_statistics();
+ const auto& bbox = bounder_.bounds();
+ if (bbox.boundValid(0) && bbox.boundValid(1) && !bbox.boundEmpty(0) && !bbox.boundEmpty(1)) {
+ geoStats->mutable_bbox()->set_xmin(bbox.min[0]);
+ geoStats->mutable_bbox()->set_xmax(bbox.max[0]);
+ geoStats->mutable_bbox()->set_ymin(bbox.min[1]);
+ geoStats->mutable_bbox()->set_ymax(bbox.max[1]);
+ if (bbox.boundValid(2) && !bbox.boundEmpty(2)) {
+ geoStats->mutable_bbox()->set_zmin(bbox.min[2]);
+ geoStats->mutable_bbox()->set_zmax(bbox.max[2]);
+ }
+ if (bbox.boundValid(3) && !bbox.boundEmpty(3)) {
+ geoStats->mutable_bbox()->set_mmin(bbox.min[3]);
+ geoStats->mutable_bbox()->set_mmax(bbox.max[3]);
+ }
+ }
+ for (auto type : bounder_.geometryTypes()) {
+ geoStats->add_geospatial_types(type);
+ }
+ }
+
+ // Human-readable form: "<GeoStatistics> invalid" for an invalid bounder,
+ // otherwise one entry per dimension (x, y, z, m) printed as "invalid",
+ // "empty" or "[lower, upper]", followed by the list of geometry type codes.
+ std::string toString() const override {
+ if (!bounder_.isValid()) {
+ return "<GeoStatistics> invalid";
+ }
+
+ std::stringstream ss;
+ ss << "<GeoStatistics>";
+
+ std::string dim_label("xyzm");
+ const auto& bbox = bounder_.bounds();
+ auto dim_valid = bbox.dimensionValid();
+ auto dim_empty = bbox.dimensionEmpty();
+ auto lower = bbox.lowerBound();
+ auto upper = bbox.upperBound();
+
+ // Four dimensions, matching the four labels in dim_label.
+ for (int i = 0; i < 4; i++) {
+ ss << " " << dim_label[i] << ": ";
+ if (!dim_valid[i]) {
+ ss << "invalid";
+ } else if (dim_empty[i]) {
+ ss << "empty";
+ } else {
+ ss << "[" << lower[i] << ", " << upper[i] << "]";
+ }
+ }
+
+ std::vector<int32_t> maybe_geometry_types = bounder_.geometryTypes();
+ ss << " geometry_types: [";
+ // Separator is empty before the first element and ", " afterwards.
+ std::string sep("");
+ for (int32_t geometry_type : maybe_geometry_types) {
+ ss << sep << geometry_type;
+ sep = ", ";
+ }
+ ss << "]";
+
+ return ss.str();
+ }
+
+ const geospatial::BoundingBox& getBoundingBox() const override {
+ return bounder_.bounds();
+ }
+
+ std::vector<int32_t> getGeospatialTypes() const override {
+ return bounder_.geometryTypes();
+ }
+ };
+
ColumnStatistics* convertColumnStatistics(const proto::ColumnStatistics& s,
const StatContext& statContext);
@@ -1713,7 +1835,6 @@ namespace orc {
class StripeStatisticsImpl : public StripeStatistics {
private:
std::unique_ptr<StatisticsImpl> columnStats_;
- std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > rowIndexStats_;
// DELIBERATELY NOT IMPLEMENTED
StripeStatisticsImpl(const StripeStatisticsImpl&);
@@ -1721,7 +1842,6 @@ namespace orc {
public:
StripeStatisticsImpl(const proto::StripeStatistics& stripeStats,
- std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
const StatContext& statContext);
virtual const ColumnStatistics* getColumnStatistics(uint32_t columnId) const override {
@@ -1732,13 +1852,38 @@ namespace orc {
return columnStats_->getNumberOfColumns();
}
+ virtual const ColumnStatistics* getRowIndexStatistics(uint32_t, uint32_t) const override {
+ throw NotImplementedYet("set includeRowIndex true to get row index stats");
+ }
+
+ virtual ~StripeStatisticsImpl() override;
+
+ virtual uint32_t getNumberOfRowIndexStats(uint32_t) const override {
+ throw NotImplementedYet("set includeRowIndex true to get row index stats");
+ }
+ };
+
+ class StripeStatisticsWithRowGroupIndexImpl : public StripeStatisticsImpl {
+ private:
+ std::vector<std::vector<std::shared_ptr<const ColumnStatistics> > > rowIndexStats_;
+
+ // DELIBERATELY NOT IMPLEMENTED
+ StripeStatisticsWithRowGroupIndexImpl(const StripeStatisticsWithRowGroupIndexImpl&);
+ StripeStatisticsWithRowGroupIndexImpl& operator=(const StripeStatisticsWithRowGroupIndexImpl&);
+
+ public:
+ StripeStatisticsWithRowGroupIndexImpl(
+ const proto::StripeStatistics& stripeStats,
+ std::vector<std::vector<proto::ColumnStatistics> >& indexStats,
+ const StatContext& statContext);
+
virtual const ColumnStatistics* getRowIndexStatistics(uint32_t columnId,
uint32_t rowIndex) const override {
// check id indices are valid
return rowIndexStats_[columnId][rowIndex].get();
}
- virtual ~StripeStatisticsImpl() override;
+ virtual ~StripeStatisticsWithRowGroupIndexImpl() override;
uint32_t getNumberOfRowIndexStats(uint32_t columnId) const override {
return static_cast<uint32_t>(rowIndexStats_[columnId].size());
diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.cc b/contrib/libs/apache/orc/c++/src/TypeImpl.cc
index cbc7b82796d..18c4985ab1b 100644
--- a/contrib/libs/apache/orc/c++/src/TypeImpl.cc
+++ b/contrib/libs/apache/orc/c++/src/TypeImpl.cc
@@ -19,8 +19,10 @@
#include "TypeImpl.hh"
#include "Adaptor.hh"
#include "orc/Exceptions.hh"
+#include "orc/Type.hh"
#include <iostream>
+#include <memory>
#include <sstream>
namespace orc {
@@ -62,6 +64,33 @@ namespace orc {
subtypeCount_ = 0;
}
+ // Constructor for a type that carries only a CRS (used for GEOMETRY). The edge
+ // interpolation algorithm is set to SPHERICAL; every other field gets the same
+ // value as in the (kind, crs, algo) constructor, to which this one delegates.
+ TypeImpl::TypeImpl(TypeKind kind, const std::string& crs)
+     : TypeImpl(kind, crs, geospatial::EdgeInterpolationAlgorithm::SPHERICAL) {
+   // Nothing else to initialize.
+ }
+
+ // Constructor for a type that carries a CRS and an edge interpolation
+ // algorithm (used for GEOGRAPHY). The new type has no parent and no subtypes;
+ // column ids are set to -1 and the length/precision/scale fields to 0.
+ TypeImpl::TypeImpl(TypeKind kind, const std::string& crs,
+                    geospatial::EdgeInterpolationAlgorithm algo) {
+   // Geospatial payload.
+   kind_ = kind;
+   crs_ = crs;
+   edgeInterpolationAlgorithm_ = algo;
+
+   // Position in the schema tree.
+   parent_ = nullptr;
+   subtypeCount_ = 0;
+   columnId_ = -1;
+   maximumColumnId_ = -1;
+
+   // Attributes of other type kinds, zeroed here.
+   maxLength_ = 0;
+   precision_ = 0;
+   scale_ = 0;
+ }
+
uint64_t TypeImpl::assignIds(uint64_t root) const {
columnId_ = static_cast<int64_t>(root);
uint64_t current = root + 1;
@@ -120,6 +149,14 @@ namespace orc {
return scale_;
}
+ // Returns the CRS string stored in this type (empty unless one was supplied
+ // through a constructor taking `crs`).
+ const std::string& TypeImpl::getCrs() const {
+ return crs_;
+ }
+
+ // Returns the edge interpolation algorithm stored in this type.
+ geospatial::EdgeInterpolationAlgorithm TypeImpl::getAlgorithm() const {
+ return edgeInterpolationAlgorithm_;
+ }
+
Type& TypeImpl::setAttribute(const std::string& key, const std::string& value) {
attributes_[key] = value;
return *this;
@@ -189,6 +226,45 @@ namespace orc {
return true;
}
+ namespace geospatial {
+   // Returns the lower-case textual name of `algo`, as used in the string form
+   // of a geography type. Throws InvalidArgument for a value outside the enum.
+   std::string AlgoToString(EdgeInterpolationAlgorithm algo) {
+     switch (algo) {
+       case EdgeInterpolationAlgorithm::SPHERICAL:
+         return "spherical";
+       case VINCENTY:
+         return "vincenty";
+       case THOMAS:
+         return "thomas";
+       case ANDOYER:
+         return "andoyer";
+       case KARNEY:
+         return "karney";
+       default:
+         throw InvalidArgument("Unknown algo");
+     }
+   }
+
+   // Inverse of AlgoToString. Throws InvalidArgument for an unknown name.
+   EdgeInterpolationAlgorithm AlgoFromString(const std::string& algo) {
+     // "speherial" is the misspelling emitted by earlier revisions of
+     // AlgoToString; it is still accepted so that such type strings keep parsing.
+     if (algo == "spherical" || algo == "speherial") {
+       return EdgeInterpolationAlgorithm::SPHERICAL;
+     }
+     if (algo == "vincenty") {
+       return VINCENTY;
+     }
+     if (algo == "thomas") {
+       return THOMAS;
+     }
+     if (algo == "andoyer") {
+       return ANDOYER;
+     }
+     if (algo == "karney") {
+       return KARNEY;
+     }
+     throw InvalidArgument("Unknown algo: " + algo);
+   }
+
+ }  // namespace geospatial
+
std::string TypeImpl::toString() const {
switch (static_cast<int64_t>(kind_)) {
case BOOLEAN:
@@ -271,6 +347,17 @@ namespace orc {
result << "char(" << maxLength_ << ")";
return result.str();
}
+ case GEOMETRY: {
+ std::stringstream result;
+ result << "geometry(" << crs_ << ")";
+ return result.str();
+ }
+ case GEOGRAPHY: {
+ std::stringstream result;
+ result << "geography(" << crs_ << ","
+ << geospatial::AlgoToString(edgeInterpolationAlgorithm_) << ")";
+ return result.str();
+ }
default:
throw NotImplementedYet("Unknown type");
}
@@ -322,6 +409,8 @@ namespace orc {
case BINARY:
case CHAR:
case VARCHAR:
+ case GEOMETRY:
+ case GEOGRAPHY:
return encoded ? std::make_unique<EncodedStringVectorBatch>(capacity, memoryPool)
: std::make_unique<StringVectorBatch>(capacity, memoryPool);
@@ -419,6 +508,15 @@ namespace orc {
return std::make_unique<TypeImpl>(UNION);
}
+ // Creates a GEOMETRY type carrying the given CRS string.
+ std::unique_ptr<Type> createGeometryType(const std::string& crs) {
+ return std::make_unique<TypeImpl>(GEOMETRY, crs);
+ }
+
+ // Creates a GEOGRAPHY type carrying the given CRS string and edge
+ // interpolation algorithm.
+ std::unique_ptr<Type> createGeographyType(const std::string& crs,
+ geospatial::EdgeInterpolationAlgorithm algo) {
+ return std::make_unique<TypeImpl>(GEOGRAPHY, crs, algo);
+ }
+
std::string printProtobufMessage(const google::protobuf::Message& message);
std::unique_ptr<Type> convertType(const proto::Type& type, const proto::Footer& footer) {
std::unique_ptr<Type> ret;
@@ -443,6 +541,16 @@ namespace orc {
ret = std::make_unique<TypeImpl>(static_cast<TypeKind>(type.kind()), type.maximum_length());
break;
+ case proto::Type_Kind_GEOMETRY:
+ ret = std::make_unique<TypeImpl>(static_cast<TypeKind>(type.kind()), type.crs());
+ break;
+
+ case proto::Type_Kind_GEOGRAPHY:
+ ret = std::make_unique<TypeImpl>(
+ static_cast<TypeKind>(type.kind()), type.crs(),
+ static_cast<geospatial::EdgeInterpolationAlgorithm>(type.algorithm()));
+ break;
+
case proto::Type_Kind_DECIMAL:
ret = std::make_unique<TypeImpl>(DECIMAL, type.precision(), type.scale());
break;
@@ -523,6 +631,13 @@ namespace orc {
case CHAR:
result = std::make_unique<TypeImpl>(fileType->getKind(), fileType->getMaximumLength());
break;
+ case GEOMETRY:
+ result = std::make_unique<TypeImpl>(fileType->getKind(), fileType->getCrs());
+ break;
+ case GEOGRAPHY:
+ result = std::make_unique<TypeImpl>(fileType->getKind(), fileType->getCrs(),
+ fileType->getAlgorithm());
+ break;
case LIST:
result = std::make_unique<TypeImpl>(fileType->getKind());
@@ -710,6 +825,22 @@ namespace orc {
return std::make_unique<TypeImpl>(DECIMAL, precision, scale);
}
+ // Parses the "(crs,algorithm)" part of a geography type string.
+ // `start` must index the opening '('; the text between the first ',' and `end`
+ // is taken as the algorithm name and converted with geospatial::AlgoFromString.
+ // Throws std::logic_error when the '(' is missing or when no ',' followed by at
+ // least one character is found before `end`.
+ std::unique_ptr<Type> TypeImpl::parseGeographyType(const std::string& input, size_t start,
+                                                    size_t end) {
+   if (input[start] != '(') {
+     throw std::logic_error("Missing ( after geography.");
+   }
+
+   const size_t crsBegin = start + 1;
+   const size_t comma = input.find(',', crsBegin);
+   const bool hasAlgorithm = comma != std::string::npos && comma + 1 < end;
+   if (!hasAlgorithm) {
+     throw std::logic_error("Geography type must specify CRS.");
+   }
+
+   const size_t algoBegin = comma + 1;
+   std::string crsText = input.substr(crsBegin, comma - crsBegin);
+   std::string algoText = input.substr(algoBegin, end - algoBegin);
+   geospatial::EdgeInterpolationAlgorithm parsedAlgo = geospatial::AlgoFromString(algoText);
+   return std::make_unique<TypeImpl>(GEOGRAPHY, crsText, parsedAlgo);
+ }
+
void validatePrimitiveType(std::string category, const std::string& input, const size_t pos) {
if (input[pos] == '<' || input[pos] == '(') {
std::ostringstream oss;
@@ -780,6 +911,14 @@ namespace orc {
uint64_t maxLength =
static_cast<uint64_t>(atoi(input.substr(start + 1, end - start + 1).c_str()));
return std::make_unique<TypeImpl>(CHAR, maxLength);
+ } else if (category == "geometry") {
+   if (input[start] != '(') {
+     throw std::logic_error("Missing ( after geometry.");
+   }
+   // The CRS is the text strictly between '(' at `start` and `end`.
+   // parseGeographyType, which receives the same (start, end) pair, treats
+   // `end` as exclusive; a length of (end - start + 1) would pull the closing
+   // ')' and one more character into the CRS.
+   const size_t crsLength = end > start ? end - start - 1 : 0;
+   std::string crs = input.substr(start + 1, crsLength);
+   return std::make_unique<TypeImpl>(GEOMETRY, crs);
+ } else if (category == "geography") {
+   return parseGeographyType(input, start, end);
} else {
throw std::logic_error("Unknown type " + category);
}
diff --git a/contrib/libs/apache/orc/c++/src/TypeImpl.hh b/contrib/libs/apache/orc/c++/src/TypeImpl.hh
index 647d5a5d2c5..2db175aba67 100644
--- a/contrib/libs/apache/orc/c++/src/TypeImpl.hh
+++ b/contrib/libs/apache/orc/c++/src/TypeImpl.hh
@@ -24,6 +24,7 @@
#include "Adaptor.hh"
#include "wrap/orc-proto-wrapper.hh"
+#include <memory>
#include <vector>
namespace orc {
@@ -41,6 +42,9 @@ namespace orc {
uint64_t precision_;
uint64_t scale_;
std::map<std::string, std::string> attributes_;
+ std::string crs_;
+ geospatial::EdgeInterpolationAlgorithm edgeInterpolationAlgorithm_ =
+ geospatial::EdgeInterpolationAlgorithm::SPHERICAL;
public:
/**
@@ -58,6 +62,16 @@ namespace orc {
*/
TypeImpl(TypeKind kind, uint64_t precision, uint64_t scale);
+ /**
+ * Create geometry type.
+ */
+ TypeImpl(TypeKind kind, const std::string& crs);
+
+ /**
+ * Create geography type.
+ */
+ TypeImpl(TypeKind kind, const std::string& crs, geospatial::EdgeInterpolationAlgorithm algo);
+
uint64_t getColumnId() const override;
uint64_t getMaximumColumnId() const override;
@@ -76,6 +90,10 @@ namespace orc {
uint64_t getScale() const override;
+ const std::string& getCrs() const override;
+
+ geospatial::EdgeInterpolationAlgorithm getAlgorithm() const override;
+
Type& setAttribute(const std::string& key, const std::string& value) override;
bool hasAttributeKey(const std::string& key) const override;
@@ -177,6 +195,14 @@ namespace orc {
size_t end);
/**
+ * Parse geography type from string
+ * @param input the input string of a geography type
+ * @param start start position of the input string
+ * @param end end position of the input string
+ */
+ static std::unique_ptr<Type> parseGeographyType(const std::string& input, size_t start,
+ size_t end);
+ /**
* Parse type for a category
* @param category type name
* @param input the input string of the category
diff --git a/contrib/libs/apache/orc/c++/src/Writer.cc b/contrib/libs/apache/orc/c++/src/Writer.cc
index 775e6d24525..c235169ccaf 100644
--- a/contrib/libs/apache/orc/c++/src/Writer.cc
+++ b/contrib/libs/apache/orc/c++/src/Writer.cc
@@ -24,6 +24,7 @@
#include "Utils.hh"
#include <memory>
+#include <stdexcept>
namespace orc {
@@ -702,6 +703,40 @@ namespace orc {
protoType.set_kind(proto::Type_Kind_CHAR);
break;
}
+ case GEOMETRY: {
+ protoType.set_kind(proto::Type_Kind_GEOMETRY);
+ protoType.set_crs(t.getCrs());
+ break;
+ }
+ case GEOGRAPHY: {
+   // Serialize a geography type: kind, CRS and the edge interpolation
+   // algorithm, each in-memory enum value mapped to the protobuf value of the
+   // same name. An unrecognized value throws std::invalid_argument.
+   protoType.set_kind(proto::Type_Kind_GEOGRAPHY);
+   protoType.set_crs(t.getCrs());
+   switch (t.getAlgorithm()) {
+     case geospatial::EdgeInterpolationAlgorithm::SPHERICAL: {
+       protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_SPHERICAL);
+       break;
+     }
+     case geospatial::EdgeInterpolationAlgorithm::VINCENTY: {
+       protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_VINCENTY);
+       break;
+     }
+     case geospatial::EdgeInterpolationAlgorithm::THOMAS: {
+       // Was written as VINCENTY, so THOMAS did not survive a write/read cycle.
+       protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_THOMAS);
+       break;
+     }
+     case geospatial::EdgeInterpolationAlgorithm::ANDOYER: {
+       protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_ANDOYER);
+       break;
+     }
+     case geospatial::EdgeInterpolationAlgorithm::KARNEY: {
+       protoType.set_algorithm(proto::Type_EdgeInterpolationAlgorithm_KARNEY);
+       break;
+     }
+     default:
+       throw std::invalid_argument("Unknown Algorithm.");
+   }
+   break;
+ }
default:
throw std::logic_error("Unknown type.");
}
diff --git a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc
index e49bca4b776..58dd13817d6 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc
+++ b/contrib/libs/apache/orc/c++/src/sargs/ExpressionTree.cc
@@ -110,6 +110,9 @@ namespace orc {
return result;
}
case Operator::NOT:
+ if (children_.size() != 1) {
+ throw std::invalid_argument("NOT operator must have exactly one child");
+ }
return !children_.at(0)->evaluate(leaves);
case Operator::LEAF:
return leaves[leaf_];
@@ -159,6 +162,9 @@ namespace orc {
sstream << ')';
break;
case Operator::NOT:
+ if (children_.size() != 1) {
+ throw std::invalid_argument("NOT operator must have exactly one child");
+ }
sstream << "(not " << children_.at(0)->toString() << ')';
break;
case Operator::LEAF:
diff --git a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc
index 83d4af2435e..ff0ba1e2d5f 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc
+++ b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.cc
@@ -272,6 +272,12 @@ namespace orc {
return *this;
}
+ // Appends a constant YES_NO_NULL node as a child of the expression at the
+ // front of currTree_ and returns this builder for chaining.
+ SearchArgumentBuilder& SearchArgumentBuilderImpl::maybe() {
+   TreeNode& enclosing = currTree_.front();
+   auto maybeNode = std::make_shared<ExpressionTree>(TruthValue::YES_NO_NULL);
+   enclosing->addChild(maybeNode);
+   return *this;
+ }
+
/**
* Recursively explore the tree to find the leaves that are still reachable
* after optimizations.
diff --git a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh
index 1963c993d62..7d663f7349d 100644
--- a/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh
+++ b/contrib/libs/apache/orc/c++/src/sargs/SearchArgument.hh
@@ -275,6 +275,12 @@ namespace orc {
*/
std::unique_ptr<SearchArgument> build() override;
+ /**
+ * Add a maybe leaf to the current item on the stack.
+ * @return this
+ */
+ SearchArgumentBuilder& maybe() override;
+
private:
SearchArgumentBuilder& start(ExpressionTree::Operator op);
size_t addLeaf(PredicateLeaf leaf);
diff --git a/contrib/libs/apache/orc/patches/darwin-byte-count.patch b/contrib/libs/apache/orc/patches/darwin-byte-count.patch
new file mode 100644
index 00000000000..09f8fff6950
--- /dev/null
+++ b/contrib/libs/apache/orc/patches/darwin-byte-count.patch
@@ -0,0 +1,20 @@
+--- a/c++/src/io/InputStream.cc
++++ b/c++/src/io/InputStream.cc
+@@ -115,1 +115,1 @@ namespace orc {
+- google::protobuf::int64 SeekableArrayInputStream::ByteCount() const {
++ int64_t SeekableArrayInputStream::ByteCount() const {
+--- a/c++/src/io/InputStream.hh
++++ b/c++/src/io/InputStream.hh
+@@ -76,1 +76,1 @@ namespace orc {
+- virtual google::protobuf::int64 ByteCount() const override;
++ virtual int64_t ByteCount() const override;
+--- a/c++/src/io/OutputStream.cc
++++ b/c++/src/io/OutputStream.cc
+@@ -67,1 +67,1 @@ namespace orc {
+- google::protobuf::int64 BufferedOutputStream::ByteCount() const {
++ int64_t BufferedOutputStream::ByteCount() const {
+--- a/c++/src/io/OutputStream.hh
++++ b/c++/src/io/OutputStream.hh
+@@ -55,1 +55,1 @@ namespace orc {
+- virtual google::protobuf::int64 ByteCount() const override;
++ virtual int64_t ByteCount() const override;
diff --git a/contrib/libs/apache/orc/patches/darwin-proto-int64.patch b/contrib/libs/apache/orc/patches/darwin-proto-int64.patch
new file mode 100644
index 00000000000..8f591a5e3b1
--- /dev/null
+++ b/contrib/libs/apache/orc/patches/darwin-proto-int64.patch
@@ -0,0 +1,10 @@
+--- a/c++/src/sargs/PredicateLeaf.cc
++++ b/c++/src/sargs/PredicateLeaf.cc
+@@ -480,5 +480,5 @@ namespace orc {
+- static std::vector<int64_t> literal2Long(const std::vector<Literal>& values) {
+- std::vector<int64_t> result;
++ static std::vector<google::protobuf::int64> literal2Long(const std::vector<Literal>& values) {
++ std::vector<google::protobuf::int64> result;
+ std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) {
+ if (!val.isNull()) {
+ result.emplace_back(val.getLong());
diff --git a/contrib/libs/apache/orc/patches/fix_strings.patch b/contrib/libs/apache/orc/patches/fix_strings.patch
new file mode 100644
index 00000000000..60c4d78511a
--- /dev/null
+++ b/contrib/libs/apache/orc/patches/fix_strings.patch
@@ -0,0 +1,36 @@
+--- a/c++/include/orc/Common.hh
++++ b/c++/include/orc/Common.hh
+@@ -23,10 +23,14 @@
+ #include "orc/Type.hh"
+ #include "orc/Vector.hh"
+
++#include <google/protobuf/message.h>
++
+ #include <string>
+
+ namespace orc {
+
++ using TProtobufString = decltype(std::declval<::google::protobuf::MessageLite>().GetTypeName());
++
+ class FileVersion {
+ private:
+ uint32_t majorVersion;
+--- a/c++/src/Reader.cc
++++ b/c++/src/Reader.cc
+@@ -566,4 +566,4 @@ namespace orc {
+- std::string result;
++ TProtobufString result;
+ if (!tail.SerializeToString(&result)) {
+ throw ParseError("Failed to serialize file tail");
+ }
+--- a/c++/src/sargs/PredicateLeaf.cc
++++ b/c++/src/sargs/PredicateLeaf.cc
+@@ -533,5 +533,5 @@ namespace orc {
+- static std::vector<std::string> literal2String(const std::vector<Literal>& values) {
++ static std::vector<TProtobufString> literal2String(const std::vector<Literal>& values) {
+- std::vector<std::string> result;
++ std::vector<TProtobufString> result;
+ std::for_each(values.cbegin(), values.cend(), [&](const Literal& val) {
+ if (!val.isNull()) {
+- result.emplace_back(val.getString());
++ result.emplace_back(TProtobufString(val.getString()));
diff --git a/contrib/libs/apache/orc/patches/pr2345-fix-windows-build.patch b/contrib/libs/apache/orc/patches/pr2345-fix-windows-build.patch
new file mode 100644
index 00000000000..1cec51838e4
--- /dev/null
+++ b/contrib/libs/apache/orc/patches/pr2345-fix-windows-build.patch
@@ -0,0 +1,26 @@
+From a76249e13a6e364e0507a12cb71abaaf1647252e Mon Sep 17 00:00:00 2001
+From: Yuriy Chernyshov <[email protected]>
+Date: Thu, 31 Jul 2025 13:20:15 +0200
+Subject: [PATCH] Fix Windows build
+
+See
+https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/byteswap-uint64-byteswap-ulong-byteswap-ushort?view=msvc-170
+---
+ c++/src/Geospatial.cc | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/c++/src/Geospatial.cc b/c++/src/Geospatial.cc
+index 6d7d268703..2b110cacb6 100644
+--- a/c++/src/Geospatial.cc
++++ b/c++/src/Geospatial.cc
+@@ -66,8 +66,8 @@ namespace orc::geospatial {
+
+ #if defined(_MSC_VER)
+ #include <intrin.h> // IWYU pragma: keep
+-#define ORC_BYTE_SWAP64 _byteSwap_uint64
+-#define ORC_BYTE_SWAP32 _byteSwap_ulong
++#define ORC_BYTE_SWAP64 _byteswap_uint64
++#define ORC_BYTE_SWAP32 _byteswap_ulong
+ #else
+ #define ORC_BYTE_SWAP64 __builtin_bswap64
+ #define ORC_BYTE_SWAP32 __builtin_bswap32
diff --git a/contrib/libs/apache/orc/ya.make b/contrib/libs/apache/orc/ya.make
index c7edc68289e..af1230c805f 100644
--- a/contrib/libs/apache/orc/ya.make
+++ b/contrib/libs/apache/orc/ya.make
@@ -6,9 +6,9 @@ LICENSE(Apache-2.0)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-VERSION(2.1.3)
+VERSION(2.2.0)
-ORIGINAL_SOURCE(https://github.com/apache/orc/archive/rel/release-2.1.3.tar.gz)
+ORIGINAL_SOURCE(https://github.com/apache/orc/archive/rel/release-2.2.0.tar.gz)
PEERDIR(
contrib/libs/apache/orc-format
@@ -46,7 +46,9 @@ SRCS(
c++/src/Compression.cc
c++/src/ConvertColumnReader.cc
c++/src/CpuInfoUtil.cc
+ c++/src/Dictionary.cc
c++/src/Exceptions.cc
+ c++/src/Geospatial.cc
c++/src/Int128.cc
c++/src/LzoDecompressor.cc
c++/src/MemoryPool.cc