author    thegeorg <[email protected]> 2022-02-10 16:45:12 +0300
committer Daniil Cherednik <[email protected]> 2022-02-10 16:45:12 +0300
commit    49116032d905455a7b1c994e4a696afc885c1e71 (patch)
tree      be835aa92c6248212e705f25388ebafcf84bc7a1 /contrib/libs/apache/arrow/cpp
parent    4e839db24a3bbc9f1c610c43d6faaaa99824dcca (diff)
Restoring authorship annotation for <[email protected]>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/apache/arrow/cpp')
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc  1190
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h  362
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc  2138
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h  114
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc  30
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h  32
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc  12
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h  42
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc  32
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h  32
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc  394
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h  114
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h  22
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h  40
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc  318
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h  86
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc  28
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h  132
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc  70
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h  62
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h  68
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc  60
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h  98
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h  76
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h  102
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc  166
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc  48
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/data.h  28
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc  472
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/util.h  22
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc  668
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h  44
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/buffer.h  12
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h  132
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/builder.cc  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc  40
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc  56
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h  14
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compare.cc  1098
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compare.h  52
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc  292
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h  594
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc  754
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h  1410
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc  296
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h  342
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc  178
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h  68
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc  742
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h  62
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc  1646
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h  574
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc  2372
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h  538
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h  672
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc  536
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h  202
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc  3298
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h  1270
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc  476
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h  188
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc  1220
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h  344
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc  556
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h  342
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h  30
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc  382
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h  272
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc  226
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h  1252
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc  58
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h  168
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc  902
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h  236
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h  242
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc  644
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc  986
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc  328
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc  344
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc  316
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h  806
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc  2758
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc  3100
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc  886
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc  10
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc  252
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc  114
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h  26
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc  126
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc  364
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc  254
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc  156
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc  754
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc  162
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc  3460
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc  234
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc  288
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc  6768
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc  1326
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc  234
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc  42
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h  222
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc  340
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc  52
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc  1080
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc  766
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc  3270
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc  108
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h  20
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h  30
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h  26
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/config.cc  72
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/config.h  32
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/datum.cc  120
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/datum.h  58
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc  18
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc  308
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h  76
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc  20
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc  10
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/file.h  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc  116
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h  156
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc  14
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc  190
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h  164
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc  26
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h  8
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h  54
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h  22
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc  82
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h  8
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc  46
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h  122
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc  120
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h  12
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc  106
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h  34
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h  132
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc  956
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h  102
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc  150
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h  72
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc  570
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h  30
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc  140
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h  8
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc  122
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h  28
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/result.h  116
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc  142
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/scalar.h  48
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/status.cc  8
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/status.h  28
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h  292
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/table.cc  24
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/table.h  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc  174
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/tensor.h  24
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/type.cc  372
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/type.h  332
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h  126
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h  202
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h  66
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h  3228
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc  1376
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h  326
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc  20
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h  360
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h  706
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h  866
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc  112
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h  74
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc  20
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h  334
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h  24
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc  116
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h  98
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h  372
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h  176
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h  204
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc  14
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h  1252
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc  452
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h  204
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc  60
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h  46
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc  10
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc  94
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc  18
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc  204
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h  38
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc  662
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h  234
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc  104
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h  68
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h  362
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc  36
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h  32
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h  156
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc  252
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/future.h  1418
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h  30
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc  192
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h  42
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h  56
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc  336
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h  86
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h  316
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc  10
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc  44
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h  52
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h  2
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h  88
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h  58
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h  266
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h  1652
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h  196
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc  58
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/string.h  18
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc  168
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h  70
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc  834
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h  206
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc  272
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h  458
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h  8
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h  66
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h  82
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc  46
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h  18
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h  452
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc  10
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h  184
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h  830
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h  202
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h  14
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h  434
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h  4
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/visitor.h  6
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h  14
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp  34
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h  48
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp  14830
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h  5834
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/README  20
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc  1800
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h  310
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc  2496
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h  686
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc  1582
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h  244
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc  2174
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h  368
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc  444
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h  102
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc  964
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h  218
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc  324
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h  494
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_page.h  320
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc  3604
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h  752
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc  182
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h  524
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc  4134
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h  540
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc  5094
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encoding.h  920
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc  824
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h  1020
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h  232
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc  220
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc  480
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h  242
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc  340
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h  218
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/exception.cc  54
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/exception.h  316
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc  1736
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h  376
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc  1094
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h  468
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/hasher.h  144
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc  164
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h  80
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h  130
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc  366
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h  398
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h  714
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc  3566
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/metadata.h  968
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc  444
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h  108
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/platform.cc  82
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/platform.h  222
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/printer.cc  594
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/printer.h  92
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/properties.cc  128
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/properties.h  1626
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/schema.cc  1890
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/schema.h  988
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h  108
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc  1770
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/statistics.h  684
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc  1042
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h  598
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc  648
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h  486
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/symbols.map  80
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h  988
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h  86
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/types.cc  3134
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/types.h  1530
-rw-r--r--  contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h  60
337 files changed, 85425 insertions, 85425 deletions
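
Per the commit message and the balanced diffstat above (85425 insertions against 85425 deletions), this change deletes and re-adds every line verbatim; only the authorship annotation moves, not the code. Among the re-added lines of adapter.cc shown below are the ORC_THROW_NOT_OK and ORC_CATCH_NOT_OK macros, which bridge Arrow's Status-based error handling with liborc's exception-based API. The following is a minimal, self-contained sketch of that bridging technique; MiniStatus, MiniParseError, and the function names are hypothetical stand-ins, not Arrow or liborc identifiers.

    // Sketch of two-way Status <-> exception bridging, mirroring the
    // ORC_THROW_NOT_OK / ORC_CATCH_NOT_OK macros in the diff below.
    #include <iostream>
    #include <stdexcept>
    #include <string>

    struct MiniStatus {
      bool ok;
      std::string message;
    };

    struct MiniParseError : std::runtime_error {
      using std::runtime_error::runtime_error;
    };

    // Status -> exception: for code driven by the throwing library
    // (compare ORC_THROW_NOT_OK).
    #define MINI_THROW_NOT_OK(s)                      \
      do {                                            \
        MiniStatus _s = (s);                          \
        if (!_s.ok) throw MiniParseError(_s.message); \
      } while (0)

    // Exception -> Status: for the Status-returning API boundary
    // (compare ORC_CATCH_NOT_OK).
    #define MINI_CATCH_NOT_OK(expr)                   \
      try {                                           \
        (expr);                                       \
      } catch (const MiniParseError& e) {             \
        return MiniStatus{false, e.what()};           \
      }

    // A liborc-style function that reports failure by throwing.
    void ThrowingWork(bool fail) {
      if (fail) throw MiniParseError("short read");
    }

    // The Status-facing wrapper converts the exception back into a status.
    MiniStatus SafeWork(bool fail) {
      MINI_CATCH_NOT_OK(ThrowingWork(fail));
      return MiniStatus{true, ""};
    }

    // A callback the throwing library would drive: failed status becomes
    // the library's exception type.
    void LibraryDrivenCallback(const MiniStatus& s) { MINI_THROW_NOT_OK(s); }

    int main() {
      std::cout << SafeWork(true).message << "\n";  // prints "short read"
      try {
        LibraryDrivenCallback(MiniStatus{false, "bad status"});
      } catch (const MiniParseError& e) {
        std::cout << e.what() << "\n";              // prints "bad status"
      }
      return 0;
    }

Arrow's real macros are richer in one respect: ORC_END_CATCH_NOT_OK maps the distinct liborc exception types (ParseError, InvalidArgument, NotImplementedYet) to distinct Status codes, as the definitions in the diff show.
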
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc
index 33ac56ff816..2f74b40e40d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.cc
@@ -1,595 +1,595 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/adapters/orc/adapter.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <functional>
-#include <list>
-#include <memory>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "arrow/adapters/orc/adapter_util.h"
-#include "arrow/buffer.h"
-#include "arrow/builder.h"
-#include "arrow/io/interfaces.h"
-#include "arrow/memory_pool.h"
-#include "arrow/record_batch.h"
-#include "arrow/status.h"
-#include "arrow/table.h"
-#include "arrow/table_builder.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/decimal.h"
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/range.h"
-#include "arrow/util/visibility.h"
-#include "orc/Exceptions.hh"
-
-// alias to not interfere with nested orc namespace
-namespace liborc = orc;
-
-#define ORC_THROW_NOT_OK(s) \
- do { \
- Status _s = (s); \
- if (!_s.ok()) { \
- std::stringstream ss; \
- ss << "Arrow error: " << _s.ToString(); \
- throw liborc::ParseError(ss.str()); \
- } \
- } while (0)
-
-#define ORC_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
- auto status_name = (rexpr); \
- ORC_THROW_NOT_OK(status_name.status()); \
- lhs = std::move(status_name).ValueOrDie();
-
-#define ORC_ASSIGN_OR_THROW(lhs, rexpr) \
- ORC_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
- lhs, rexpr);
-
-#define ORC_BEGIN_CATCH_NOT_OK try {
-#define ORC_END_CATCH_NOT_OK \
- } \
- catch (const liborc::ParseError& e) { \
- return Status::IOError(e.what()); \
- } \
- catch (const liborc::InvalidArgument& e) { \
- return Status::Invalid(e.what()); \
- } \
- catch (const liborc::NotImplementedYet& e) { \
- return Status::NotImplemented(e.what()); \
- }
-
-#define ORC_CATCH_NOT_OK(_s) \
- ORC_BEGIN_CATCH_NOT_OK(_s); \
- ORC_END_CATCH_NOT_OK
-
-namespace arrow {
-namespace adapters {
-namespace orc {
-
-namespace {
-
-// The following are required by ORC to be uint64_t
-constexpr uint64_t kOrcWriterBatchSize = 128 * 1024;
-constexpr uint64_t kOrcNaturalWriteSize = 128 * 1024;
-
-using internal::checked_cast;
-
-class ArrowInputFile : public liborc::InputStream {
- public:
- explicit ArrowInputFile(const std::shared_ptr<io::RandomAccessFile>& file)
- : file_(file) {}
-
- uint64_t getLength() const override {
- ORC_ASSIGN_OR_THROW(int64_t size, file_->GetSize());
- return static_cast<uint64_t>(size);
- }
-
- uint64_t getNaturalReadSize() const override { return 128 * 1024; }
-
- void read(void* buf, uint64_t length, uint64_t offset) override {
- ORC_ASSIGN_OR_THROW(int64_t bytes_read, file_->ReadAt(offset, length, buf));
-
- if (static_cast<uint64_t>(bytes_read) != length) {
- throw liborc::ParseError("Short read from arrow input file");
- }
- }
-
- const std::string& getName() const override {
- static const std::string filename("ArrowInputFile");
- return filename;
- }
-
- private:
- std::shared_ptr<io::RandomAccessFile> file_;
-};
-
-struct StripeInformation {
- uint64_t offset;
- uint64_t length;
- uint64_t num_rows;
- uint64_t first_row_of_stripe;
-};
-
-// The number of rows to read in a ColumnVectorBatch
-constexpr int64_t kReadRowsBatch = 1000;
-
-class OrcStripeReader : public RecordBatchReader {
- public:
- OrcStripeReader(std::unique_ptr<liborc::RowReader> row_reader,
- std::shared_ptr<Schema> schema, int64_t batch_size, MemoryPool* pool)
- : row_reader_(std::move(row_reader)),
- schema_(schema),
- pool_(pool),
- batch_size_{batch_size} {}
-
- std::shared_ptr<Schema> schema() const override { return schema_; }
-
- Status ReadNext(std::shared_ptr<RecordBatch>* out) override {
- std::unique_ptr<liborc::ColumnVectorBatch> batch;
- ORC_CATCH_NOT_OK(batch = row_reader_->createRowBatch(batch_size_));
-
- const liborc::Type& type = row_reader_->getSelectedType();
- if (!row_reader_->next(*batch)) {
- out->reset();
- return Status::OK();
- }
-
- std::unique_ptr<RecordBatchBuilder> builder;
- RETURN_NOT_OK(RecordBatchBuilder::Make(schema_, pool_, batch->numElements, &builder));
-
- // The top-level type must be a struct to read into an arrow table
- const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
-
- for (int i = 0; i < builder->num_fields(); i++) {
- RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
- batch->numElements, builder->GetField(i)));
- }
-
- RETURN_NOT_OK(builder->Flush(out));
- return Status::OK();
- }
-
- private:
- std::unique_ptr<liborc::RowReader> row_reader_;
- std::shared_ptr<Schema> schema_;
- MemoryPool* pool_;
- int64_t batch_size_;
-};
-
-} // namespace
-
-class ORCFileReader::Impl {
- public:
- Impl() {}
- ~Impl() {}
-
- Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool) {
- std::unique_ptr<ArrowInputFile> io_wrapper(new ArrowInputFile(file));
- liborc::ReaderOptions options;
- std::unique_ptr<liborc::Reader> liborc_reader;
- ORC_CATCH_NOT_OK(liborc_reader = createReader(std::move(io_wrapper), options));
- pool_ = pool;
- reader_ = std::move(liborc_reader);
- current_row_ = 0;
-
- return Init();
- }
-
- Status Init() {
- int64_t nstripes = reader_->getNumberOfStripes();
- stripes_.resize(nstripes);
- std::unique_ptr<liborc::StripeInformation> stripe;
- uint64_t first_row_of_stripe = 0;
- for (int i = 0; i < nstripes; ++i) {
- stripe = reader_->getStripe(i);
- stripes_[i] = StripeInformation({stripe->getOffset(), stripe->getLength(),
- stripe->getNumberOfRows(), first_row_of_stripe});
- first_row_of_stripe += stripe->getNumberOfRows();
- }
- return Status::OK();
- }
-
- int64_t NumberOfStripes() { return stripes_.size(); }
-
- int64_t NumberOfRows() { return reader_->getNumberOfRows(); }
-
- Status ReadSchema(std::shared_ptr<Schema>* out) {
- const liborc::Type& type = reader_->getType();
- return GetArrowSchema(type, out);
- }
-
- Status ReadSchema(const liborc::RowReaderOptions& opts, std::shared_ptr<Schema>* out) {
- std::unique_ptr<liborc::RowReader> row_reader;
- ORC_CATCH_NOT_OK(row_reader = reader_->createRowReader(opts));
- const liborc::Type& type = row_reader->getSelectedType();
- return GetArrowSchema(type, out);
- }
-
- Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() {
- const std::list<std::string> keys = reader_->getMetadataKeys();
- auto metadata = std::make_shared<KeyValueMetadata>();
- for (const auto& key : keys) {
- metadata->Append(key, reader_->getMetadataValue(key));
- }
- return std::const_pointer_cast<const KeyValueMetadata>(metadata);
- }
-
- Status GetArrowSchema(const liborc::Type& type, std::shared_ptr<Schema>* out) {
- if (type.getKind() != liborc::STRUCT) {
- return Status::NotImplemented(
- "Only ORC files with a top-level struct "
- "can be handled");
- }
- int size = static_cast<int>(type.getSubtypeCount());
- std::vector<std::shared_ptr<Field>> fields;
- for (int child = 0; child < size; ++child) {
- std::shared_ptr<DataType> elemtype;
- RETURN_NOT_OK(GetArrowType(type.getSubtype(child), &elemtype));
- std::string name = type.getFieldName(child);
- fields.push_back(field(name, elemtype));
- }
- ARROW_ASSIGN_OR_RAISE(auto metadata, ReadMetadata());
- *out = std::make_shared<Schema>(std::move(fields), std::move(metadata));
- return Status::OK();
- }
-
- Status Read(std::shared_ptr<Table>* out) {
- liborc::RowReaderOptions opts;
- std::shared_ptr<Schema> schema;
- RETURN_NOT_OK(ReadSchema(opts, &schema));
- return ReadTable(opts, schema, out);
- }
-
- Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
- liborc::RowReaderOptions opts;
- return ReadTable(opts, schema, out);
- }
-
- Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
- liborc::RowReaderOptions opts;
- RETURN_NOT_OK(SelectIndices(&opts, include_indices));
- std::shared_ptr<Schema> schema;
- RETURN_NOT_OK(ReadSchema(opts, &schema));
- return ReadTable(opts, schema, out);
- }
-
- Status Read(const std::shared_ptr<Schema>& schema,
- const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
- liborc::RowReaderOptions opts;
- RETURN_NOT_OK(SelectIndices(&opts, include_indices));
- return ReadTable(opts, schema, out);
- }
-
- Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
- liborc::RowReaderOptions opts;
- RETURN_NOT_OK(SelectStripe(&opts, stripe));
- std::shared_ptr<Schema> schema;
- RETURN_NOT_OK(ReadSchema(opts, &schema));
- return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
- }
-
- Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
- std::shared_ptr<RecordBatch>* out) {
- liborc::RowReaderOptions opts;
- RETURN_NOT_OK(SelectIndices(&opts, include_indices));
- RETURN_NOT_OK(SelectStripe(&opts, stripe));
- std::shared_ptr<Schema> schema;
- RETURN_NOT_OK(ReadSchema(opts, &schema));
- return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
- }
-
- Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) {
- ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(),
- Status::Invalid("Out of bounds stripe: ", stripe));
-
- opts->range(stripes_[stripe].offset, stripes_[stripe].length);
- return Status::OK();
- }
-
- Status SelectStripeWithRowNumber(liborc::RowReaderOptions* opts, int64_t row_number,
- StripeInformation* out) {
- ARROW_RETURN_IF(row_number >= NumberOfRows(),
- Status::Invalid("Out of bounds row number: ", row_number));
-
- for (auto it = stripes_.begin(); it != stripes_.end(); it++) {
- if (static_cast<uint64_t>(row_number) >= it->first_row_of_stripe &&
- static_cast<uint64_t>(row_number) < it->first_row_of_stripe + it->num_rows) {
- opts->range(it->offset, it->length);
- *out = *it;
- return Status::OK();
- }
- }
-
- return Status::Invalid("Invalid row number", row_number);
- }
-
- Status SelectIndices(liborc::RowReaderOptions* opts,
- const std::vector<int>& include_indices) {
- std::list<uint64_t> include_indices_list;
- for (auto it = include_indices.begin(); it != include_indices.end(); ++it) {
- ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index"));
- include_indices_list.push_back(*it);
- }
- opts->includeTypes(include_indices_list);
- return Status::OK();
- }
-
- Status ReadTable(const liborc::RowReaderOptions& row_opts,
- const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
- liborc::RowReaderOptions opts(row_opts);
- std::vector<std::shared_ptr<RecordBatch>> batches(stripes_.size());
- for (size_t stripe = 0; stripe < stripes_.size(); stripe++) {
- opts.range(stripes_[stripe].offset, stripes_[stripe].length);
- RETURN_NOT_OK(ReadBatch(opts, schema, stripes_[stripe].num_rows, &batches[stripe]));
- }
- return Table::FromRecordBatches(schema, std::move(batches)).Value(out);
- }
-
- Status ReadBatch(const liborc::RowReaderOptions& opts,
- const std::shared_ptr<Schema>& schema, int64_t nrows,
- std::shared_ptr<RecordBatch>* out) {
- std::unique_ptr<liborc::RowReader> row_reader;
- std::unique_ptr<liborc::ColumnVectorBatch> batch;
-
- ORC_BEGIN_CATCH_NOT_OK
- row_reader = reader_->createRowReader(opts);
- batch = row_reader->createRowBatch(std::min(nrows, kReadRowsBatch));
- ORC_END_CATCH_NOT_OK
-
- std::unique_ptr<RecordBatchBuilder> builder;
- RETURN_NOT_OK(RecordBatchBuilder::Make(schema, pool_, nrows, &builder));
-
- // The top-level type must be a struct to read into an arrow table
- const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
-
- const liborc::Type& type = row_reader->getSelectedType();
- while (row_reader->next(*batch)) {
- for (int i = 0; i < builder->num_fields(); i++) {
- RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
- batch->numElements, builder->GetField(i)));
- }
- }
- RETURN_NOT_OK(builder->Flush(out));
- return Status::OK();
- }
-
- Status Seek(int64_t row_number) {
- ARROW_RETURN_IF(row_number >= NumberOfRows(),
- Status::Invalid("Out of bounds row number: ", row_number));
-
- current_row_ = row_number;
- return Status::OK();
- }
-
- Status NextStripeReader(int64_t batch_size, const std::vector<int>& include_indices,
- std::shared_ptr<RecordBatchReader>* out) {
- if (current_row_ >= NumberOfRows()) {
- out->reset();
- return Status::OK();
- }
-
- liborc::RowReaderOptions opts;
- if (!include_indices.empty()) {
- RETURN_NOT_OK(SelectIndices(&opts, include_indices));
- }
- StripeInformation stripe_info({0, 0, 0, 0});
- RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info));
- std::shared_ptr<Schema> schema;
- RETURN_NOT_OK(ReadSchema(opts, &schema));
- std::unique_ptr<liborc::RowReader> row_reader;
-
- ORC_BEGIN_CATCH_NOT_OK
- row_reader = reader_->createRowReader(opts);
- row_reader->seekToRow(current_row_);
- current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows;
- ORC_END_CATCH_NOT_OK
-
- *out = std::shared_ptr<RecordBatchReader>(
- new OrcStripeReader(std::move(row_reader), schema, batch_size, pool_));
- return Status::OK();
- }
-
- Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out) {
- return NextStripeReader(batch_size, {}, out);
- }
-
- private:
- MemoryPool* pool_;
- std::unique_ptr<liborc::Reader> reader_;
- std::vector<StripeInformation> stripes_;
- int64_t current_row_;
-};
-
-ORCFileReader::ORCFileReader() { impl_.reset(new ORCFileReader::Impl()); }
-
-ORCFileReader::~ORCFileReader() {}
-
-Status ORCFileReader::Open(const std::shared_ptr<io::RandomAccessFile>& file,
- MemoryPool* pool, std::unique_ptr<ORCFileReader>* reader) {
- auto result = std::unique_ptr<ORCFileReader>(new ORCFileReader());
- RETURN_NOT_OK(result->impl_->Open(file, pool));
- *reader = std::move(result);
- return Status::OK();
-}
-
-Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() {
- return impl_->ReadMetadata();
-}
-
-Status ORCFileReader::ReadSchema(std::shared_ptr<Schema>* out) {
- return impl_->ReadSchema(out);
-}
-
-Status ORCFileReader::Read(std::shared_ptr<Table>* out) { return impl_->Read(out); }
-
-Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
- std::shared_ptr<Table>* out) {
- return impl_->Read(schema, out);
-}
-
-Status ORCFileReader::Read(const std::vector<int>& include_indices,
- std::shared_ptr<Table>* out) {
- return impl_->Read(include_indices, out);
-}
-
-Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
- const std::vector<int>& include_indices,
- std::shared_ptr<Table>* out) {
- return impl_->Read(schema, include_indices, out);
-}
-
-Status ORCFileReader::ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
- return impl_->ReadStripe(stripe, out);
-}
-
-Status ORCFileReader::ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
- std::shared_ptr<RecordBatch>* out) {
- return impl_->ReadStripe(stripe, include_indices, out);
-}
-
-Status ORCFileReader::Seek(int64_t row_number) { return impl_->Seek(row_number); }
-
-Status ORCFileReader::NextStripeReader(int64_t batch_sizes,
- std::shared_ptr<RecordBatchReader>* out) {
- return impl_->NextStripeReader(batch_sizes, out);
-}
-
-Status ORCFileReader::NextStripeReader(int64_t batch_size,
- const std::vector<int>& include_indices,
- std::shared_ptr<RecordBatchReader>* out) {
- return impl_->NextStripeReader(batch_size, include_indices, out);
-}
-
-int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }
-
-int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
-
-namespace {
-
-class ArrowOutputStream : public liborc::OutputStream {
- public:
- explicit ArrowOutputStream(arrow::io::OutputStream& output_stream)
- : output_stream_(output_stream), length_(0) {}
-
- uint64_t getLength() const override { return length_; }
-
- uint64_t getNaturalWriteSize() const override { return kOrcNaturalWriteSize; }
-
- void write(const void* buf, size_t length) override {
- ORC_THROW_NOT_OK(output_stream_.Write(buf, static_cast<int64_t>(length)));
- length_ += static_cast<int64_t>(length);
- }
-
- // Mandatory due to us implementing an ORC virtual class.
- // Used by ORC for error messages, not used by Arrow
- const std::string& getName() const override {
- static const std::string filename("ArrowOutputFile");
- return filename;
- }
-
- void close() override {
- if (!output_stream_.closed()) {
- ORC_THROW_NOT_OK(output_stream_.Close());
- }
- }
-
- void set_length(int64_t length) { length_ = length; }
-
- private:
- arrow::io::OutputStream& output_stream_;
- int64_t length_;
-};
-
-} // namespace
-
-class ORCFileWriter::Impl {
- public:
- Status Open(arrow::io::OutputStream* output_stream) {
- out_stream_ = std::unique_ptr<liborc::OutputStream>(
- checked_cast<liborc::OutputStream*>(new ArrowOutputStream(*output_stream)));
- return Status::OK();
- }
-
- Status Write(const Table& table) {
- std::unique_ptr<liborc::WriterOptions> orc_options =
- std::unique_ptr<liborc::WriterOptions>(new liborc::WriterOptions());
- ARROW_ASSIGN_OR_RAISE(auto orc_schema, GetOrcType(*(table.schema())));
- ORC_CATCH_NOT_OK(
- writer_ = liborc::createWriter(*orc_schema, out_stream_.get(), *orc_options))
-
- int64_t num_rows = table.num_rows();
- const int num_cols_ = table.num_columns();
- std::vector<int64_t> arrow_index_offset(num_cols_, 0);
- std::vector<int> arrow_chunk_offset(num_cols_, 0);
- std::unique_ptr<liborc::ColumnVectorBatch> batch =
- writer_->createRowBatch(kOrcWriterBatchSize);
- liborc::StructVectorBatch* root =
- internal::checked_cast<liborc::StructVectorBatch*>(batch.get());
- while (num_rows > 0) {
- for (int i = 0; i < num_cols_; i++) {
- RETURN_NOT_OK(adapters::orc::WriteBatch(
- *(table.column(i)), kOrcWriterBatchSize, &(arrow_chunk_offset[i]),
- &(arrow_index_offset[i]), (root->fields)[i]));
- }
- root->numElements = (root->fields)[0]->numElements;
- writer_->add(*batch);
- batch->clear();
- num_rows -= kOrcWriterBatchSize;
- }
- return Status::OK();
- }
-
- Status Close() {
- writer_->close();
- return Status::OK();
- }
-
- private:
- std::unique_ptr<liborc::Writer> writer_;
- std::unique_ptr<liborc::OutputStream> out_stream_;
-};
-
-ORCFileWriter::~ORCFileWriter() {}
-
-ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); }
-
-Result<std::unique_ptr<ORCFileWriter>> ORCFileWriter::Open(
- io::OutputStream* output_stream) {
- std::unique_ptr<ORCFileWriter> result =
- std::unique_ptr<ORCFileWriter>(new ORCFileWriter());
- Status status = result->impl_->Open(output_stream);
- RETURN_NOT_OK(status);
- return std::move(result);
-}
-
-Status ORCFileWriter::Write(const Table& table) { return impl_->Write(table); }
-
-Status ORCFileWriter::Close() { return impl_->Close(); }
-
-} // namespace orc
-} // namespace adapters
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/adapters/orc/adapter.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <list>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/adapters/orc/adapter_util.h"
+#include "arrow/buffer.h"
+#include "arrow/builder.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/memory_pool.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/table_builder.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/range.h"
+#include "arrow/util/visibility.h"
+#include "orc/Exceptions.hh"
+
+// alias to not interfere with nested orc namespace
+namespace liborc = orc;
+
+#define ORC_THROW_NOT_OK(s) \
+ do { \
+ Status _s = (s); \
+ if (!_s.ok()) { \
+ std::stringstream ss; \
+ ss << "Arrow error: " << _s.ToString(); \
+ throw liborc::ParseError(ss.str()); \
+ } \
+ } while (0)
+
+#define ORC_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
+ auto status_name = (rexpr); \
+ ORC_THROW_NOT_OK(status_name.status()); \
+ lhs = std::move(status_name).ValueOrDie();
+
+#define ORC_ASSIGN_OR_THROW(lhs, rexpr) \
+ ORC_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
+ lhs, rexpr);
+
+#define ORC_BEGIN_CATCH_NOT_OK try {
+#define ORC_END_CATCH_NOT_OK \
+ } \
+ catch (const liborc::ParseError& e) { \
+ return Status::IOError(e.what()); \
+ } \
+ catch (const liborc::InvalidArgument& e) { \
+ return Status::Invalid(e.what()); \
+ } \
+ catch (const liborc::NotImplementedYet& e) { \
+ return Status::NotImplemented(e.what()); \
+ }
+
+#define ORC_CATCH_NOT_OK(_s) \
+ ORC_BEGIN_CATCH_NOT_OK(_s); \
+ ORC_END_CATCH_NOT_OK
+
+namespace arrow {
+namespace adapters {
+namespace orc {
+
+namespace {
+
+// The following are required by ORC to be uint64_t
+constexpr uint64_t kOrcWriterBatchSize = 128 * 1024;
+constexpr uint64_t kOrcNaturalWriteSize = 128 * 1024;
+
+using internal::checked_cast;
+
+class ArrowInputFile : public liborc::InputStream {
+ public:
+ explicit ArrowInputFile(const std::shared_ptr<io::RandomAccessFile>& file)
+ : file_(file) {}
+
+ uint64_t getLength() const override {
+ ORC_ASSIGN_OR_THROW(int64_t size, file_->GetSize());
+ return static_cast<uint64_t>(size);
+ }
+
+ uint64_t getNaturalReadSize() const override { return 128 * 1024; }
+
+ void read(void* buf, uint64_t length, uint64_t offset) override {
+ ORC_ASSIGN_OR_THROW(int64_t bytes_read, file_->ReadAt(offset, length, buf));
+
+ if (static_cast<uint64_t>(bytes_read) != length) {
+ throw liborc::ParseError("Short read from arrow input file");
+ }
+ }
+
+ const std::string& getName() const override {
+ static const std::string filename("ArrowInputFile");
+ return filename;
+ }
+
+ private:
+ std::shared_ptr<io::RandomAccessFile> file_;
+};
+
+struct StripeInformation {
+ uint64_t offset;
+ uint64_t length;
+ uint64_t num_rows;
+ uint64_t first_row_of_stripe;
+};
+
+// The number of rows to read in a ColumnVectorBatch
+constexpr int64_t kReadRowsBatch = 1000;
+
+class OrcStripeReader : public RecordBatchReader {
+ public:
+ OrcStripeReader(std::unique_ptr<liborc::RowReader> row_reader,
+ std::shared_ptr<Schema> schema, int64_t batch_size, MemoryPool* pool)
+ : row_reader_(std::move(row_reader)),
+ schema_(schema),
+ pool_(pool),
+ batch_size_{batch_size} {}
+
+ std::shared_ptr<Schema> schema() const override { return schema_; }
+
+ Status ReadNext(std::shared_ptr<RecordBatch>* out) override {
+ std::unique_ptr<liborc::ColumnVectorBatch> batch;
+ ORC_CATCH_NOT_OK(batch = row_reader_->createRowBatch(batch_size_));
+
+ const liborc::Type& type = row_reader_->getSelectedType();
+ if (!row_reader_->next(*batch)) {
+ out->reset();
+ return Status::OK();
+ }
+
+ std::unique_ptr<RecordBatchBuilder> builder;
+ RETURN_NOT_OK(RecordBatchBuilder::Make(schema_, pool_, batch->numElements, &builder));
+
+ // The top-level type must be a struct to read into an arrow table
+ const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
+
+ for (int i = 0; i < builder->num_fields(); i++) {
+ RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
+ batch->numElements, builder->GetField(i)));
+ }
+
+ RETURN_NOT_OK(builder->Flush(out));
+ return Status::OK();
+ }
+
+ private:
+ std::unique_ptr<liborc::RowReader> row_reader_;
+ std::shared_ptr<Schema> schema_;
+ MemoryPool* pool_;
+ int64_t batch_size_;
+};
+
+} // namespace
+
+class ORCFileReader::Impl {
+ public:
+ Impl() {}
+ ~Impl() {}
+
+ Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool) {
+ std::unique_ptr<ArrowInputFile> io_wrapper(new ArrowInputFile(file));
+ liborc::ReaderOptions options;
+ std::unique_ptr<liborc::Reader> liborc_reader;
+ ORC_CATCH_NOT_OK(liborc_reader = createReader(std::move(io_wrapper), options));
+ pool_ = pool;
+ reader_ = std::move(liborc_reader);
+ current_row_ = 0;
+
+ return Init();
+ }
+
+ Status Init() {
+ int64_t nstripes = reader_->getNumberOfStripes();
+ stripes_.resize(nstripes);
+ std::unique_ptr<liborc::StripeInformation> stripe;
+ uint64_t first_row_of_stripe = 0;
+ for (int i = 0; i < nstripes; ++i) {
+ stripe = reader_->getStripe(i);
+ stripes_[i] = StripeInformation({stripe->getOffset(), stripe->getLength(),
+ stripe->getNumberOfRows(), first_row_of_stripe});
+ first_row_of_stripe += stripe->getNumberOfRows();
+ }
+ return Status::OK();
+ }
+
+ int64_t NumberOfStripes() { return stripes_.size(); }
+
+ int64_t NumberOfRows() { return reader_->getNumberOfRows(); }
+
+ Status ReadSchema(std::shared_ptr<Schema>* out) {
+ const liborc::Type& type = reader_->getType();
+ return GetArrowSchema(type, out);
+ }
+
+ Status ReadSchema(const liborc::RowReaderOptions& opts, std::shared_ptr<Schema>* out) {
+ std::unique_ptr<liborc::RowReader> row_reader;
+ ORC_CATCH_NOT_OK(row_reader = reader_->createRowReader(opts));
+ const liborc::Type& type = row_reader->getSelectedType();
+ return GetArrowSchema(type, out);
+ }
+
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() {
+ const std::list<std::string> keys = reader_->getMetadataKeys();
+ auto metadata = std::make_shared<KeyValueMetadata>();
+ for (const auto& key : keys) {
+ metadata->Append(key, reader_->getMetadataValue(key));
+ }
+ return std::const_pointer_cast<const KeyValueMetadata>(metadata);
+ }
+
+ Status GetArrowSchema(const liborc::Type& type, std::shared_ptr<Schema>* out) {
+ if (type.getKind() != liborc::STRUCT) {
+ return Status::NotImplemented(
+ "Only ORC files with a top-level struct "
+ "can be handled");
+ }
+ int size = static_cast<int>(type.getSubtypeCount());
+ std::vector<std::shared_ptr<Field>> fields;
+ for (int child = 0; child < size; ++child) {
+ std::shared_ptr<DataType> elemtype;
+ RETURN_NOT_OK(GetArrowType(type.getSubtype(child), &elemtype));
+ std::string name = type.getFieldName(child);
+ fields.push_back(field(name, elemtype));
+ }
+ ARROW_ASSIGN_OR_RAISE(auto metadata, ReadMetadata());
+ *out = std::make_shared<Schema>(std::move(fields), std::move(metadata));
+ return Status::OK();
+ }
+
+ Status Read(std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadTable(opts, schema, out);
+ }
+
+ Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ return ReadTable(opts, schema, out);
+ }
+
+ Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadTable(opts, schema, out);
+ }
+
+ Status Read(const std::shared_ptr<Schema>& schema,
+ const std::vector<int>& include_indices, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts;
+ RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+ return ReadTable(opts, schema, out);
+ }
+
+ Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
+ liborc::RowReaderOptions opts;
+ RETURN_NOT_OK(SelectStripe(&opts, stripe));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
+ }
+
+ Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatch>* out) {
+ liborc::RowReaderOptions opts;
+ RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+ RETURN_NOT_OK(SelectStripe(&opts, stripe));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ return ReadBatch(opts, schema, stripes_[stripe].num_rows, out);
+ }
+
+ Status SelectStripe(liborc::RowReaderOptions* opts, int64_t stripe) {
+ ARROW_RETURN_IF(stripe < 0 || stripe >= NumberOfStripes(),
+ Status::Invalid("Out of bounds stripe: ", stripe));
+
+ opts->range(stripes_[stripe].offset, stripes_[stripe].length);
+ return Status::OK();
+ }
+
+ Status SelectStripeWithRowNumber(liborc::RowReaderOptions* opts, int64_t row_number,
+ StripeInformation* out) {
+ ARROW_RETURN_IF(row_number >= NumberOfRows(),
+ Status::Invalid("Out of bounds row number: ", row_number));
+
+ for (auto it = stripes_.begin(); it != stripes_.end(); it++) {
+ if (static_cast<uint64_t>(row_number) >= it->first_row_of_stripe &&
+ static_cast<uint64_t>(row_number) < it->first_row_of_stripe + it->num_rows) {
+ opts->range(it->offset, it->length);
+ *out = *it;
+ return Status::OK();
+ }
+ }
+
+ return Status::Invalid("Invalid row number", row_number);
+ }
+
+ Status SelectIndices(liborc::RowReaderOptions* opts,
+ const std::vector<int>& include_indices) {
+ std::list<uint64_t> include_indices_list;
+ for (auto it = include_indices.begin(); it != include_indices.end(); ++it) {
+ ARROW_RETURN_IF(*it < 0, Status::Invalid("Negative field index"));
+ include_indices_list.push_back(*it);
+ }
+ opts->includeTypes(include_indices_list);
+ return Status::OK();
+ }
+
+ Status ReadTable(const liborc::RowReaderOptions& row_opts,
+ const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out) {
+ liborc::RowReaderOptions opts(row_opts);
+ std::vector<std::shared_ptr<RecordBatch>> batches(stripes_.size());
+ for (size_t stripe = 0; stripe < stripes_.size(); stripe++) {
+ opts.range(stripes_[stripe].offset, stripes_[stripe].length);
+ RETURN_NOT_OK(ReadBatch(opts, schema, stripes_[stripe].num_rows, &batches[stripe]));
+ }
+ return Table::FromRecordBatches(schema, std::move(batches)).Value(out);
+ }
+
+ Status ReadBatch(const liborc::RowReaderOptions& opts,
+ const std::shared_ptr<Schema>& schema, int64_t nrows,
+ std::shared_ptr<RecordBatch>* out) {
+ std::unique_ptr<liborc::RowReader> row_reader;
+ std::unique_ptr<liborc::ColumnVectorBatch> batch;
+
+ ORC_BEGIN_CATCH_NOT_OK
+ row_reader = reader_->createRowReader(opts);
+ batch = row_reader->createRowBatch(std::min(nrows, kReadRowsBatch));
+ ORC_END_CATCH_NOT_OK
+
+ std::unique_ptr<RecordBatchBuilder> builder;
+ RETURN_NOT_OK(RecordBatchBuilder::Make(schema, pool_, nrows, &builder));
+
+ // The top-level ORC type must be a struct to read into an Arrow Table.
+ const auto& struct_batch = checked_cast<liborc::StructVectorBatch&>(*batch);
+
+ const liborc::Type& type = row_reader->getSelectedType();
+ while (row_reader->next(*batch)) {
+ for (int i = 0; i < builder->num_fields(); i++) {
+ RETURN_NOT_OK(AppendBatch(type.getSubtype(i), struct_batch.fields[i], 0,
+ batch->numElements, builder->GetField(i)));
+ }
+ }
+ RETURN_NOT_OK(builder->Flush(out));
+ return Status::OK();
+ }
+
+ Status Seek(int64_t row_number) {
+ ARROW_RETURN_IF(row_number < 0 || row_number >= NumberOfRows(),
+ Status::Invalid("Out of bounds row number: ", row_number));
+
+ current_row_ = row_number;
+ return Status::OK();
+ }
+
+ Status NextStripeReader(int64_t batch_size, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatchReader>* out) {
+ if (current_row_ >= NumberOfRows()) {
+ out->reset();
+ return Status::OK();
+ }
+
+ liborc::RowReaderOptions opts;
+ if (!include_indices.empty()) {
+ RETURN_NOT_OK(SelectIndices(&opts, include_indices));
+ }
+ StripeInformation stripe_info({0, 0, 0, 0});
+ RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ std::unique_ptr<liborc::RowReader> row_reader;
+
+ ORC_BEGIN_CATCH_NOT_OK
+ row_reader = reader_->createRowReader(opts);
+ row_reader->seekToRow(current_row_);
+ current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows;
+ ORC_END_CATCH_NOT_OK
+
+ *out = std::shared_ptr<RecordBatchReader>(
+ new OrcStripeReader(std::move(row_reader), schema, batch_size, pool_));
+ return Status::OK();
+ }
+
+ Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out) {
+ return NextStripeReader(batch_size, {}, out);
+ }
+
+ private:
+ MemoryPool* pool_;
+ std::unique_ptr<liborc::Reader> reader_;
+ std::vector<StripeInformation> stripes_;
+ int64_t current_row_;
+};
+
+ORCFileReader::ORCFileReader() { impl_.reset(new ORCFileReader::Impl()); }
+
+ORCFileReader::~ORCFileReader() {}
+
+Status ORCFileReader::Open(const std::shared_ptr<io::RandomAccessFile>& file,
+ MemoryPool* pool, std::unique_ptr<ORCFileReader>* reader) {
+ auto result = std::unique_ptr<ORCFileReader>(new ORCFileReader());
+ RETURN_NOT_OK(result->impl_->Open(file, pool));
+ *reader = std::move(result);
+ return Status::OK();
+}
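+
+// A minimal usage sketch (hypothetical names; `input` is an already-opened
+// io::RandomAccessFile):
+//
+// std::unique_ptr<ORCFileReader> reader;
+// ARROW_RETURN_NOT_OK(ORCFileReader::Open(input, default_memory_pool(), &reader));
+// std::shared_ptr<Table> table;
+// ARROW_RETURN_NOT_OK(reader->Read(&table));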
+
+Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() {
+ return impl_->ReadMetadata();
+}
+
+Status ORCFileReader::ReadSchema(std::shared_ptr<Schema>* out) {
+ return impl_->ReadSchema(out);
+}
+
+Status ORCFileReader::Read(std::shared_ptr<Table>* out) { return impl_->Read(out); }
+
+Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
+ std::shared_ptr<Table>* out) {
+ return impl_->Read(schema, out);
+}
+
+Status ORCFileReader::Read(const std::vector<int>& include_indices,
+ std::shared_ptr<Table>* out) {
+ return impl_->Read(include_indices, out);
+}
+
+Status ORCFileReader::Read(const std::shared_ptr<Schema>& schema,
+ const std::vector<int>& include_indices,
+ std::shared_ptr<Table>* out) {
+ return impl_->Read(schema, include_indices, out);
+}
+
+Status ORCFileReader::ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out) {
+ return impl_->ReadStripe(stripe, out);
+}
+
+Status ORCFileReader::ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatch>* out) {
+ return impl_->ReadStripe(stripe, include_indices, out);
+}
+
+Status ORCFileReader::Seek(int64_t row_number) { return impl_->Seek(row_number); }
+
+Status ORCFileReader::NextStripeReader(int64_t batch_size,
+ std::shared_ptr<RecordBatchReader>* out) {
+ return impl_->NextStripeReader(batch_size, out);
+}
+
+Status ORCFileReader::NextStripeReader(int64_t batch_size,
+ const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatchReader>* out) {
+ return impl_->NextStripeReader(batch_size, include_indices, out);
+}
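+
+// A hedged sketch of stripe-at-a-time iteration (names illustrative): each
+// call returns a RecordBatchReader over one stripe, and `out` is reset to
+// null once all rows in the file have been consumed.
+//
+// std::shared_ptr<RecordBatchReader> stripe_reader;
+// ARROW_RETURN_NOT_OK(reader->NextStripeReader(1024, &stripe_reader));
+// while (stripe_reader) {
+// std::shared_ptr<RecordBatch> batch;
+// ARROW_RETURN_NOT_OK(stripe_reader->ReadNext(&batch));
+// if (batch) { /* consume batch */ continue; }
+// ARROW_RETURN_NOT_OK(reader->NextStripeReader(1024, &stripe_reader));
+// }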
+
+int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }
+
+int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
+
+namespace {
+
+class ArrowOutputStream : public liborc::OutputStream {
+ public:
+ explicit ArrowOutputStream(arrow::io::OutputStream& output_stream)
+ : output_stream_(output_stream), length_(0) {}
+
+ uint64_t getLength() const override { return length_; }
+
+ uint64_t getNaturalWriteSize() const override { return kOrcNaturalWriteSize; }
+
+ void write(const void* buf, size_t length) override {
+ ORC_THROW_NOT_OK(output_stream_.Write(buf, static_cast<int64_t>(length)));
+ length_ += static_cast<int64_t>(length);
+ }
+
+ // Mandatory because we implement the liborc::OutputStream virtual interface.
+ // ORC uses this name in error messages; Arrow does not use it.
+ const std::string& getName() const override {
+ static const std::string filename("ArrowOutputFile");
+ return filename;
+ }
+
+ void close() override {
+ if (!output_stream_.closed()) {
+ ORC_THROW_NOT_OK(output_stream_.Close());
+ }
+ }
+
+ void set_length(int64_t length) { length_ = length; }
+
+ private:
+ arrow::io::OutputStream& output_stream_;
+ int64_t length_;
+};
+
+} // namespace
+
+class ORCFileWriter::Impl {
+ public:
+ Status Open(arrow::io::OutputStream* output_stream) {
+ out_stream_ = std::unique_ptr<liborc::OutputStream>(
+ checked_cast<liborc::OutputStream*>(new ArrowOutputStream(*output_stream)));
+ return Status::OK();
+ }
+
+ Status Write(const Table& table) {
+ std::unique_ptr<liborc::WriterOptions> orc_options =
+ std::unique_ptr<liborc::WriterOptions>(new liborc::WriterOptions());
+ ARROW_ASSIGN_OR_RAISE(auto orc_schema, GetOrcType(*(table.schema())));
+ ORC_CATCH_NOT_OK(
+ writer_ = liborc::createWriter(*orc_schema, out_stream_.get(), *orc_options))
+
+ int64_t num_rows = table.num_rows();
+ const int num_cols = table.num_columns();
+ std::vector<int64_t> arrow_index_offset(num_cols, 0);
+ std::vector<int> arrow_chunk_offset(num_cols, 0);
+ std::unique_ptr<liborc::ColumnVectorBatch> batch =
+ writer_->createRowBatch(kOrcWriterBatchSize);
+ liborc::StructVectorBatch* root =
+ internal::checked_cast<liborc::StructVectorBatch*>(batch.get());
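+ // Emit kOrcWriterBatchSize rows per ORC batch; the per-column chunk and
+ // intra-chunk offsets carry progress across iterations so chunked columns
+ // split correctly at batch boundaries.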
+ while (num_rows > 0) {
+ for (int i = 0; i < num_cols; i++) {
+ RETURN_NOT_OK(adapters::orc::WriteBatch(
+ *(table.column(i)), kOrcWriterBatchSize, &(arrow_chunk_offset[i]),
+ &(arrow_index_offset[i]), (root->fields)[i]));
+ }
+ root->numElements = (root->fields)[0]->numElements;
+ writer_->add(*batch);
+ batch->clear();
+ num_rows -= kOrcWriterBatchSize;
+ }
+ return Status::OK();
+ }
+
+ Status Close() {
+ writer_->close();
+ return Status::OK();
+ }
+
+ private:
+ std::unique_ptr<liborc::Writer> writer_;
+ std::unique_ptr<liborc::OutputStream> out_stream_;
+};
+
+ORCFileWriter::~ORCFileWriter() {}
+
+ORCFileWriter::ORCFileWriter() { impl_.reset(new ORCFileWriter::Impl()); }
+
+Result<std::unique_ptr<ORCFileWriter>> ORCFileWriter::Open(
+ io::OutputStream* output_stream) {
+ auto result = std::unique_ptr<ORCFileWriter>(new ORCFileWriter());
+ RETURN_NOT_OK(result->impl_->Open(output_stream));
+ return std::move(result);
+}
+
+Status ORCFileWriter::Write(const Table& table) { return impl_->Write(table); }
+
+Status ORCFileWriter::Close() { return impl_->Close(); }
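+
+// Writer usage, as a minimal sketch (hypothetical names; `sink` is an
+// io::OutputStream* and `table` a populated Table):
+//
+// ARROW_ASSIGN_OR_RAISE(auto writer, ORCFileWriter::Open(sink));
+// ARROW_RETURN_NOT_OK(writer->Write(table));
+// ARROW_RETURN_NOT_OK(writer->Close());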
+
+} // namespace orc
+} // namespace adapters
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h
index e6e406068a9..012c1701980 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter.h
@@ -1,181 +1,181 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <vector>
-
-#include "arrow/io/interfaces.h"
-#include "arrow/memory_pool.h"
-#include "arrow/record_batch.h"
-#include "arrow/status.h"
-#include "arrow/type.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-namespace adapters {
-namespace orc {
-
-/// \class ORCFileReader
-/// \brief Read an Arrow Table or RecordBatch from an ORC file.
-class ARROW_EXPORT ORCFileReader {
- public:
- ~ORCFileReader();
-
- /// \brief Creates a new ORC reader.
- ///
- /// \param[in] file the data source
- /// \param[in] pool a MemoryPool to use for buffer allocations
- /// \param[out] reader the returned reader object
- /// \return Status
- static Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool,
- std::unique_ptr<ORCFileReader>* reader);
-
- /// \brief Return the metadata read from the ORC file
- ///
- /// \return A KeyValueMetadata object containing the ORC metadata
- Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
-
- /// \brief Return the schema read from the ORC file
- ///
- /// \param[out] out the returned Schema object
- Status ReadSchema(std::shared_ptr<Schema>* out);
-
- /// \brief Read the file as a Table
- ///
- /// The table will be composed of one record batch per stripe.
- ///
- /// \param[out] out the returned Table
- Status Read(std::shared_ptr<Table>* out);
-
- /// \brief Read the file as a Table
- ///
- /// The table will be composed of one record batch per stripe.
- ///
- /// \param[in] schema the Table schema
- /// \param[out] out the returned Table
- Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out);
-
- /// \brief Read the file as a Table
- ///
- /// The table will be composed of one record batch per stripe.
- ///
- /// \param[in] include_indices the selected field indices to read
- /// \param[out] out the returned Table
- Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
-
- /// \brief Read the file as a Table
- ///
- /// The table will be composed of one record batch per stripe.
- ///
- /// \param[in] schema the Table schema
- /// \param[in] include_indices the selected field indices to read
- /// \param[out] out the returned Table
- Status Read(const std::shared_ptr<Schema>& schema,
- const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
-
- /// \brief Read a single stripe as a RecordBatch
- ///
- /// \param[in] stripe the stripe index
- /// \param[out] out the returned RecordBatch
- Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out);
-
- /// \brief Read a single stripe as a RecordBatch
- ///
- /// \param[in] stripe the stripe index
- /// \param[in] include_indices the selected field indices to read
- /// \param[out] out the returned RecordBatch
- Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
- std::shared_ptr<RecordBatch>* out);
-
- /// \brief Seek to the designated row. Invoking NextStripeReader() after a
- /// seek will return a stripe reader starting from the designated row.
- ///
- /// \param[in] row_number the row number to seek to
- Status Seek(int64_t row_number);
-
- /// \brief Get a stripe-level record batch iterator with the specified row
- /// count in each record batch. NextStripeReader serves as a fine-grained
- /// alternative to ReadStripe, which may cause OOM issues by loading
- /// whole stripes into memory.
- ///
- /// \param[in] batch_size the number of rows each record batch contains in
- /// record batch iteration.
- /// \param[out] out the returned stripe reader
- Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out);
-
- /// \brief Get a stripe-level record batch iterator with the specified row
- /// count in each record batch. NextStripeReader serves as a fine-grained
- /// alternative to ReadStripe, which may cause OOM issues by loading
- /// whole stripes into memory.
- ///
- /// \param[in] batch_size the number of rows each record batch contains in
- /// record batch iteration.
- /// \param[in] include_indices the selected field indices to read
- /// \param[out] out the returned stripe reader
- Status NextStripeReader(int64_t batch_size, const std::vector<int>& include_indices,
- std::shared_ptr<RecordBatchReader>* out);
-
- /// \brief The number of stripes in the file
- int64_t NumberOfStripes();
-
- /// \brief The number of rows in the file
- int64_t NumberOfRows();
-
- private:
- class Impl;
- std::unique_ptr<Impl> impl_;
- ORCFileReader();
-};
-
-/// \class ORCFileWriter
-/// \brief Write an Arrow Table or RecordBatch to an ORC file.
-class ARROW_EXPORT ORCFileWriter {
- public:
- ~ORCFileWriter();
- /// \brief Creates a new ORC writer.
- ///
- /// \param[in] output_stream a pointer to the io::OutputStream to write into
- /// \return the returned writer object
- static Result<std::unique_ptr<ORCFileWriter>> Open(io::OutputStream* output_stream);
-
- /// \brief Write a table
- ///
- /// \param[in] table the Arrow table from which data is extracted
- /// \return Status
- Status Write(const Table& table);
-
- /// \brief Close an ORC writer (orc::Writer)
- ///
- /// \return Status
- Status Close();
-
- private:
- class Impl;
- std::unique_ptr<Impl> impl_;
-
- private:
- ORCFileWriter();
-};
-
-} // namespace orc
-} // namespace adapters
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/memory_pool.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace adapters {
+namespace orc {
+
+/// \class ORCFileReader
+/// \brief Read an Arrow Table or RecordBatch from an ORC file.
+class ARROW_EXPORT ORCFileReader {
+ public:
+ ~ORCFileReader();
+
+ /// \brief Creates a new ORC reader.
+ ///
+ /// \param[in] file the data source
+ /// \param[in] pool a MemoryPool to use for buffer allocations
+ /// \param[out] reader the returned reader object
+ /// \return Status
+ static Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool,
+ std::unique_ptr<ORCFileReader>* reader);
+
+ /// \brief Return the metadata read from the ORC file
+ ///
+ /// \return A KeyValueMetadata object containing the ORC metadata
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
+
+ /// \brief Return the schema read from the ORC file
+ ///
+ /// \param[out] out the returned Schema object
+ Status ReadSchema(std::shared_ptr<Schema>* out);
+
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
+ /// \param[out] out the returned Table
+ Status Read(std::shared_ptr<Table>* out);
+
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
+ /// \param[in] schema the Table schema
+ /// \param[out] out the returned Table
+ Status Read(const std::shared_ptr<Schema>& schema, std::shared_ptr<Table>* out);
+
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
+ /// \param[in] include_indices the selected field indices to read
+ /// \param[out] out the returned Table
+ Status Read(const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
+
+ /// \brief Read the file as a Table
+ ///
+ /// The table will be composed of one record batch per stripe.
+ ///
+ /// \param[in] schema the Table schema
+ /// \param[in] include_indices the selected field indices to read
+ /// \param[out] out the returned Table
+ Status Read(const std::shared_ptr<Schema>& schema,
+ const std::vector<int>& include_indices, std::shared_ptr<Table>* out);
+
+ /// \brief Read a single stripe as a RecordBatch
+ ///
+ /// \param[in] stripe the stripe index
+ /// \param[out] out the returned RecordBatch
+ Status ReadStripe(int64_t stripe, std::shared_ptr<RecordBatch>* out);
+
+ /// \brief Read a single stripe as a RecordBatch
+ ///
+ /// \param[in] stripe the stripe index
+ /// \param[in] include_indices the selected field indices to read
+ /// \param[out] out the returned RecordBatch
+ Status ReadStripe(int64_t stripe, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatch>* out);
+
+ /// \brief Seek to the designated row. Invoking NextStripeReader() after a
+ /// seek will return a stripe reader starting from the designated row.
+ ///
+ /// \param[in] row_number the row number to seek to
+ Status Seek(int64_t row_number);
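+
+ // Hedged example of combining Seek with NextStripeReader (names illustrative):
+ //
+ // ARROW_RETURN_NOT_OK(reader->Seek(42));
+ // std::shared_ptr<RecordBatchReader> stripe_reader;
+ // ARROW_RETURN_NOT_OK(reader->NextStripeReader(1024, &stripe_reader));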
+
+ /// \brief Get a stripe-level record batch iterator with the specified row
+ /// count in each record batch. NextStripeReader serves as a fine-grained
+ /// alternative to ReadStripe, which may cause OOM issues by loading
+ /// whole stripes into memory.
+ ///
+ /// \param[in] batch_size the number of rows each record batch contains in
+ /// record batch iteration.
+ /// \param[out] out the returned stripe reader
+ Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out);
+
+ /// \brief Get a stripe-level record batch iterator with the specified row
+ /// count in each record batch. NextStripeReader serves as a fine-grained
+ /// alternative to ReadStripe, which may cause OOM issues by loading
+ /// whole stripes into memory.
+ ///
+ /// \param[in] batch_size the number of rows each record batch contains in
+ /// record batch iteration.
+ /// \param[in] include_indices the selected field indices to read
+ /// \param[out] out the returned stripe reader
+ Status NextStripeReader(int64_t batch_size, const std::vector<int>& include_indices,
+ std::shared_ptr<RecordBatchReader>* out);
+
+ /// \brief The number of stripes in the file
+ int64_t NumberOfStripes();
+
+ /// \brief The number of rows in the file
+ int64_t NumberOfRows();
+
+ private:
+ class Impl;
+ std::unique_ptr<Impl> impl_;
+ ORCFileReader();
+};
+
+/// \class ORCFileWriter
+/// \brief Write an Arrow Table or RecordBatch to an ORC file.
+class ARROW_EXPORT ORCFileWriter {
+ public:
+ ~ORCFileWriter();
+ /// \brief Creates a new ORC writer.
+ ///
+ /// \param[in] output_stream a pointer to the io::OutputStream to write into
+ /// \return the returned writer object
+ static Result<std::unique_ptr<ORCFileWriter>> Open(io::OutputStream* output_stream);
+
+ /// \brief Write a table
+ ///
+ /// \param[in] table the Arrow table from which data is extracted
+ /// \return Status
+ Status Write(const Table& table);
+
+ /// \brief Close an ORC writer (orc::Writer)
+ ///
+ /// \return Status
+ Status Close();
+
+ private:
+ class Impl;
+ std::unique_ptr<Impl> impl_;
+
+ private:
+ ORCFileWriter();
+};
+
+} // namespace orc
+} // namespace adapters
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc
index cbd29b3741b..f956a6f6217 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.cc
@@ -1,1069 +1,1069 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/adapters/orc/adapter_util.h"
-
-#include <cmath>
-#include <string>
-#include <vector>
-
-#include "arrow/array/builder_base.h"
-#include "arrow/builder.h"
-#include "arrow/chunked_array.h"
-#include "arrow/scalar.h"
-#include "arrow/status.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/decimal.h"
-#include "arrow/util/range.h"
-#include "arrow/util/string_view.h"
-#include "arrow/visitor_inline.h"
-#include "orc/Exceptions.hh"
-#include "orc/MemoryPool.hh"
-#include "orc/OrcFile.hh"
-
-// alias to not interfere with nested orc namespace
-namespace liborc = orc;
-
-namespace arrow {
-
-using internal::checked_cast;
-
-namespace adapters {
-namespace orc {
-
-namespace {
-
-// The number of milliseconds, microseconds and nanoseconds in a second
-constexpr int64_t kOneSecondMillis = 1000LL;
-constexpr int64_t kOneMicroNanos = 1000LL;
-constexpr int64_t kOneSecondMicros = 1000000LL;
-constexpr int64_t kOneMilliNanos = 1000000LL;
-constexpr int64_t kOneSecondNanos = 1000000000LL;
-
-Status AppendStructBatch(const liborc::Type* type,
- liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<StructBuilder*>(abuilder);
- auto batch = checked_cast<liborc::StructVectorBatch*>(column_vector_batch);
-
- const uint8_t* valid_bytes = nullptr;
- if (batch->hasNulls) {
- valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
- }
- RETURN_NOT_OK(builder->AppendValues(length, valid_bytes));
-
- for (int i = 0; i < builder->num_fields(); i++) {
- RETURN_NOT_OK(AppendBatch(type->getSubtype(i), batch->fields[i], offset, length,
- builder->field_builder(i)));
- }
- return Status::OK();
-}
-
-Status AppendListBatch(const liborc::Type* type,
- liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<ListBuilder*>(abuilder);
- auto batch = checked_cast<liborc::ListVectorBatch*>(column_vector_batch);
- liborc::ColumnVectorBatch* elements = batch->elements.get();
- const liborc::Type* elemtype = type->getSubtype(0);
-
- const bool has_nulls = batch->hasNulls;
- for (int64_t i = offset; i < length + offset; i++) {
- if (!has_nulls || batch->notNull[i]) {
- int64_t start = batch->offsets[i];
- int64_t end = batch->offsets[i + 1];
- RETURN_NOT_OK(builder->Append());
- RETURN_NOT_OK(
- AppendBatch(elemtype, elements, start, end - start, builder->value_builder()));
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- }
- }
- return Status::OK();
-}
-
-Status AppendMapBatch(const liborc::Type* type,
- liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<MapBuilder*>(abuilder);
- auto batch = checked_cast<liborc::MapVectorBatch*>(column_vector_batch);
- liborc::ColumnVectorBatch* keys = batch->keys.get();
- liborc::ColumnVectorBatch* items = batch->elements.get();
- const liborc::Type* key_type = type->getSubtype(0);
- const liborc::Type* item_type = type->getSubtype(1);
-
- const bool has_nulls = batch->hasNulls;
- for (int64_t i = offset; i < length + offset; i++) {
- if (!has_nulls || batch->notNull[i]) {
- int64_t start = batch->offsets[i];
- int64_t end = batch->offsets[i + 1];
- RETURN_NOT_OK(builder->Append());
- RETURN_NOT_OK(
- AppendBatch(key_type, keys, start, end - start, builder->key_builder()));
- RETURN_NOT_OK(
- AppendBatch(item_type, items, start, end - start, builder->item_builder()));
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- }
- }
- return Status::OK();
-}
-
-template <class BuilderType, class BatchType, class ElemType>
-Status AppendNumericBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<BuilderType*>(abuilder);
- auto batch = checked_cast<BatchType*>(column_vector_batch);
-
- if (length == 0) {
- return Status::OK();
- }
- const uint8_t* valid_bytes = nullptr;
- if (batch->hasNulls) {
- valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
- }
- const ElemType* source = batch->data.data() + offset;
- RETURN_NOT_OK(builder->AppendValues(source, length, valid_bytes));
- return Status::OK();
-}
-
-template <class BuilderType, class TargetType, class BatchType, class SourceType>
-Status AppendNumericBatchCast(liborc::ColumnVectorBatch* column_vector_batch,
- int64_t offset, int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<BuilderType*>(abuilder);
- auto batch = checked_cast<BatchType*>(column_vector_batch);
-
- if (length == 0) {
- return Status::OK();
- }
-
- const uint8_t* valid_bytes = nullptr;
- if (batch->hasNulls) {
- valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
- }
- const SourceType* source = batch->data.data() + offset;
- auto cast_iter = internal::MakeLazyRange(
- [&source](int64_t index) { return static_cast<TargetType>(source[index]); },
- length);
-
- RETURN_NOT_OK(builder->AppendValues(cast_iter.begin(), cast_iter.end(), valid_bytes));
-
- return Status::OK();
-}
-
-Status AppendBoolBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<BooleanBuilder*>(abuilder);
- auto batch = checked_cast<liborc::LongVectorBatch*>(column_vector_batch);
-
- if (length == 0) {
- return Status::OK();
- }
-
- const uint8_t* valid_bytes = nullptr;
- if (batch->hasNulls) {
- valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
- }
- const int64_t* source = batch->data.data() + offset;
-
- auto cast_iter = internal::MakeLazyRange(
- [&source](int64_t index) { return static_cast<bool>(source[index]); }, length);
-
- RETURN_NOT_OK(builder->AppendValues(cast_iter.begin(), cast_iter.end(), valid_bytes));
-
- return Status::OK();
-}
-
-Status AppendTimestampBatch(liborc::ColumnVectorBatch* column_vector_batch,
- int64_t offset, int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<TimestampBuilder*>(abuilder);
- auto batch = checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
-
- if (length == 0) {
- return Status::OK();
- }
-
- const uint8_t* valid_bytes = nullptr;
- if (batch->hasNulls) {
- valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
- }
-
- const int64_t* seconds = batch->data.data() + offset;
- const int64_t* nanos = batch->nanoseconds.data() + offset;
-
- auto transform_timestamp = [seconds, nanos](int64_t index) {
- return seconds[index] * kOneSecondNanos + nanos[index];
- };
-
- auto transform_range = internal::MakeLazyRange(transform_timestamp, length);
-
- RETURN_NOT_OK(
- builder->AppendValues(transform_range.begin(), transform_range.end(), valid_bytes));
- return Status::OK();
-}
-
-template <class BuilderType>
-Status AppendBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<BuilderType*>(abuilder);
- auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
-
- const bool has_nulls = batch->hasNulls;
- for (int64_t i = offset; i < length + offset; i++) {
- if (!has_nulls || batch->notNull[i]) {
- RETURN_NOT_OK(
- builder->Append(batch->data[i], static_cast<int32_t>(batch->length[i])));
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- }
- }
- return Status::OK();
-}
-
-Status AppendFixedBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch,
- int64_t offset, int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<FixedSizeBinaryBuilder*>(abuilder);
- auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
-
- const bool has_nulls = batch->hasNulls;
- for (int64_t i = offset; i < length + offset; i++) {
- if (!has_nulls || batch->notNull[i]) {
- RETURN_NOT_OK(builder->Append(batch->data[i]));
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- }
- }
- return Status::OK();
-}
-
-Status AppendDecimalBatch(const liborc::Type* type,
- liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
- int64_t length, ArrayBuilder* abuilder) {
- auto builder = checked_cast<Decimal128Builder*>(abuilder);
-
- const bool has_nulls = column_vector_batch->hasNulls;
- if (type->getPrecision() == 0 || type->getPrecision() > 18) {
- auto batch = checked_cast<liborc::Decimal128VectorBatch*>(column_vector_batch);
- for (int64_t i = offset; i < length + offset; i++) {
- if (!has_nulls || batch->notNull[i]) {
- RETURN_NOT_OK(builder->Append(
- Decimal128(batch->values[i].getHighBits(), batch->values[i].getLowBits())));
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- }
- }
- } else {
- auto batch = checked_cast<liborc::Decimal64VectorBatch*>(column_vector_batch);
- for (int64_t i = offset; i < length + offset; i++) {
- if (!has_nulls || batch->notNull[i]) {
- RETURN_NOT_OK(builder->Append(Decimal128(batch->values[i])));
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- }
- }
- }
- return Status::OK();
-}
-
-} // namespace
-
-Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch,
- int64_t offset, int64_t length, ArrayBuilder* builder) {
- if (type == nullptr) {
- return Status::OK();
- }
- liborc::TypeKind kind = type->getKind();
- switch (kind) {
- case liborc::STRUCT:
- return AppendStructBatch(type, batch, offset, length, builder);
- case liborc::LIST:
- return AppendListBatch(type, batch, offset, length, builder);
- case liborc::MAP:
- return AppendMapBatch(type, batch, offset, length, builder);
- case liborc::LONG:
- return AppendNumericBatch<Int64Builder, liborc::LongVectorBatch, int64_t>(
- batch, offset, length, builder);
- case liborc::INT:
- return AppendNumericBatchCast<Int32Builder, int32_t, liborc::LongVectorBatch,
- int64_t>(batch, offset, length, builder);
- case liborc::SHORT:
- return AppendNumericBatchCast<Int16Builder, int16_t, liborc::LongVectorBatch,
- int64_t>(batch, offset, length, builder);
- case liborc::BYTE:
- return AppendNumericBatchCast<Int8Builder, int8_t, liborc::LongVectorBatch,
- int64_t>(batch, offset, length, builder);
- case liborc::DOUBLE:
- return AppendNumericBatch<DoubleBuilder, liborc::DoubleVectorBatch, double>(
- batch, offset, length, builder);
- case liborc::FLOAT:
- return AppendNumericBatchCast<FloatBuilder, float, liborc::DoubleVectorBatch,
- double>(batch, offset, length, builder);
- case liborc::BOOLEAN:
- return AppendBoolBatch(batch, offset, length, builder);
- case liborc::VARCHAR:
- case liborc::STRING:
- return AppendBinaryBatch<StringBuilder>(batch, offset, length, builder);
- case liborc::BINARY:
- return AppendBinaryBatch<BinaryBuilder>(batch, offset, length, builder);
- case liborc::CHAR:
- return AppendFixedBinaryBatch(batch, offset, length, builder);
- case liborc::DATE:
- return AppendNumericBatchCast<Date32Builder, int32_t, liborc::LongVectorBatch,
- int64_t>(batch, offset, length, builder);
- case liborc::TIMESTAMP:
- return AppendTimestampBatch(batch, offset, length, builder);
- case liborc::DECIMAL:
- return AppendDecimalBatch(type, batch, offset, length, builder);
- default:
- return Status::NotImplemented("Not implemented type kind: ", kind);
- }
-}
-
-namespace {
-
-using internal::checked_cast;
-using internal::checked_pointer_cast;
-
-Status WriteBatch(const Array& parray, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch);
-
-// Make sure children of a StructArray carry appropriate null bitmaps
-// (the parent's nulls are propagated into each child).
-Result<std::shared_ptr<Array>> NormalizeArray(const std::shared_ptr<Array>& array) {
- Type::type kind = array->type_id();
- switch (kind) {
- case Type::type::STRUCT: {
- if (array->null_count() == 0) {
- return array;
- } else {
- auto struct_array = checked_pointer_cast<StructArray>(array);
- const std::shared_ptr<Buffer> bitmap = struct_array->null_bitmap();
- std::shared_ptr<DataType> struct_type = struct_array->type();
- std::size_t size = struct_type->fields().size();
- std::vector<std::shared_ptr<Array>> new_children(size, nullptr);
- for (std::size_t i = 0; i < size; i++) {
- std::shared_ptr<Array> child = struct_array->field(i);
- const std::shared_ptr<Buffer> child_bitmap = child->null_bitmap();
- std::shared_ptr<Buffer> final_child_bitmap;
- if (child_bitmap == nullptr) {
- final_child_bitmap = bitmap;
- } else {
- ARROW_ASSIGN_OR_RAISE(
- final_child_bitmap,
- internal::BitmapAnd(default_memory_pool(), bitmap->data(), 0,
- child_bitmap->data(), 0, struct_array->length(), 0));
- }
- std::shared_ptr<ArrayData> child_array_data = child->data();
- std::vector<std::shared_ptr<Buffer>> child_buffers = child_array_data->buffers;
- child_buffers[0] = final_child_bitmap;
- std::shared_ptr<ArrayData> new_child_array_data =
- ArrayData::Make(child->type(), child->length(), child_buffers,
- child_array_data->child_data, child_array_data->dictionary);
- ARROW_ASSIGN_OR_RAISE(new_children[i],
- NormalizeArray(MakeArray(new_child_array_data)));
- }
- return std::make_shared<StructArray>(struct_type, struct_array->length(),
- new_children, bitmap);
- }
- }
- case Type::type::LIST: {
- auto list_array = checked_pointer_cast<ListArray>(array);
- ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
- return std::make_shared<ListArray>(list_array->type(), list_array->length(),
- list_array->value_offsets(), value_array,
- list_array->null_bitmap());
- }
- case Type::type::LARGE_LIST: {
- auto list_array = checked_pointer_cast<LargeListArray>(array);
- ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
- return std::make_shared<LargeListArray>(list_array->type(), list_array->length(),
- list_array->value_offsets(), value_array,
- list_array->null_bitmap());
- }
- case Type::type::FIXED_SIZE_LIST: {
- auto list_array = checked_pointer_cast<FixedSizeListArray>(array);
- ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
- return std::make_shared<FixedSizeListArray>(list_array->type(),
- list_array->length(), value_array,
- list_array->null_bitmap());
- }
- case Type::type::MAP: {
- auto map_array = checked_pointer_cast<MapArray>(array);
- ARROW_ASSIGN_OR_RAISE(auto key_array, NormalizeArray(map_array->keys()));
- ARROW_ASSIGN_OR_RAISE(auto item_array, NormalizeArray(map_array->items()));
- return std::make_shared<MapArray>(map_array->type(), map_array->length(),
- map_array->value_offsets(), key_array, item_array,
- map_array->null_bitmap());
- }
- default: {
- return array;
- }
- }
-}
-
-template <class DataType, class BatchType, typename Enable = void>
-struct Appender {};
-
-// Types for long/double-like Appender, that is, numeric, boolean or date32
-template <typename T>
-using is_generic_type =
- std::integral_constant<bool, is_number_type<T>::value ||
- std::is_same<Date32Type, T>::value ||
- is_boolean_type<T>::value>;
-template <typename T, typename R = void>
-using enable_if_generic = enable_if_t<is_generic_type<T>::value, R>;
-
-// Number-like
-template <class DataType, class BatchType>
-struct Appender<DataType, BatchType, enable_if_generic<DataType>> {
- using ArrayType = typename TypeTraits<DataType>::ArrayType;
- using ValueType = typename TypeTraits<DataType>::CType;
- Status VisitNull() {
- batch->notNull[running_orc_offset] = false;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- Status VisitValue(ValueType v) {
- batch->data[running_orc_offset] = array.Value(running_arrow_offset);
- batch->notNull[running_orc_offset] = true;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- const ArrayType& array;
- BatchType* batch;
- int64_t running_orc_offset, running_arrow_offset;
-};
-
-// Binary
-template <class DataType>
-struct Appender<DataType, liborc::StringVectorBatch> {
- using ArrayType = typename TypeTraits<DataType>::ArrayType;
- using COffsetType = typename TypeTraits<DataType>::OffsetType::c_type;
- Status VisitNull() {
- batch->notNull[running_orc_offset] = false;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- Status VisitValue(util::string_view v) {
- batch->notNull[running_orc_offset] = true;
- COffsetType data_length = 0;
- batch->data[running_orc_offset] = reinterpret_cast<char*>(
- const_cast<uint8_t*>(array.GetValue(running_arrow_offset, &data_length)));
- batch->length[running_orc_offset] = data_length;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- const ArrayType& array;
- liborc::StringVectorBatch* batch;
- int64_t running_orc_offset, running_arrow_offset;
-};
-
-// Decimal
-template <>
-struct Appender<Decimal128Type, liborc::Decimal64VectorBatch> {
- Status VisitNull() {
- batch->notNull[running_orc_offset] = false;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- Status VisitValue(util::string_view v) {
- batch->notNull[running_orc_offset] = true;
- const Decimal128 dec_value(array.GetValue(running_arrow_offset));
- batch->values[running_orc_offset] = static_cast<int64_t>(dec_value.low_bits());
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- const Decimal128Array& array;
- liborc::Decimal64VectorBatch* batch;
- int64_t running_orc_offset, running_arrow_offset;
-};
-
-template <>
-struct Appender<Decimal128Type, liborc::Decimal128VectorBatch> {
- Status VisitNull() {
- batch->notNull[running_orc_offset] = false;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- Status VisitValue(util::string_view v) {
- batch->notNull[running_orc_offset] = true;
- const Decimal128 dec_value(array.GetValue(running_arrow_offset));
- batch->values[running_orc_offset] =
- liborc::Int128(dec_value.high_bits(), dec_value.low_bits());
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- const Decimal128Array& array;
- liborc::Decimal128VectorBatch* batch;
- int64_t running_orc_offset, running_arrow_offset;
-};
-
-// Date64 and Timestamp
-template <class DataType>
-struct TimestampAppender {
- using ArrayType = typename TypeTraits<DataType>::ArrayType;
- Status VisitNull() {
- batch->notNull[running_orc_offset] = false;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- Status VisitValue(int64_t v) {
- int64_t data = array.Value(running_arrow_offset);
- batch->notNull[running_orc_offset] = true;
- // Floor (not truncate) the seconds so pre-epoch values keep a
- // non-negative nanosecond remainder.
- batch->data[running_orc_offset] = static_cast<int64_t>(
- std::floor(static_cast<double>(data) / conversion_factor_from_second));
- batch->nanoseconds[running_orc_offset] =
- (data - conversion_factor_from_second * batch->data[running_orc_offset]) *
- conversion_factor_to_nano;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- const ArrayType& array;
- liborc::TimestampVectorBatch* batch;
- int64_t running_orc_offset, running_arrow_offset;
- int64_t conversion_factor_from_second, conversion_factor_to_nano;
-};
-
-// FSB
-struct FixedSizeBinaryAppender {
- Status VisitNull() {
- batch->notNull[running_orc_offset] = false;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- Status VisitValue(util::string_view v) {
- batch->notNull[running_orc_offset] = true;
- batch->data[running_orc_offset] = reinterpret_cast<char*>(
- const_cast<uint8_t*>(array.GetValue(running_arrow_offset)));
- batch->length[running_orc_offset] = data_length;
- running_orc_offset++;
- running_arrow_offset++;
- return Status::OK();
- }
- const FixedSizeBinaryArray& array;
- liborc::StringVectorBatch* batch;
- int64_t running_orc_offset, running_arrow_offset;
- const int32_t data_length;
-};
-
-// static_cast from int64_t or double to itself shouldn't introduce overhead
-// Please see
-// https://stackoverflow.com/questions/19106826/
-// can-static-cast-to-same-type-introduce-runtime-overhead
-template <class DataType, class BatchType>
-Status WriteGenericBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- using ArrayType = typename TypeTraits<DataType>::ArrayType;
- const ArrayType& array_(checked_cast<const ArrayType&>(array));
- auto batch = checked_cast<BatchType*>(column_vector_batch);
- if (array.null_count()) {
- batch->hasNulls = true;
- }
- Appender<DataType, BatchType> appender{array_, batch, orc_offset, 0};
- ArrayDataVisitor<DataType> visitor;
- RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
- return Status::OK();
-}
-
-template <class DataType>
-Status WriteTimestampBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch,
- const int64_t& conversion_factor_from_second,
- const int64_t& conversion_factor_to_nano) {
- using ArrayType = typename TypeTraits<DataType>::ArrayType;
- const ArrayType& array_(checked_cast<const ArrayType&>(array));
- auto batch = checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
- if (array.null_count()) {
- batch->hasNulls = true;
- }
- TimestampAppender<DataType> appender{array_,
- batch,
- orc_offset,
- 0,
- conversion_factor_from_second,
- conversion_factor_to_nano};
- ArrayDataVisitor<DataType> visitor;
- RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
- return Status::OK();
-}
-
-Status WriteFixedSizeBinaryBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- const FixedSizeBinaryArray& array_(checked_cast<const FixedSizeBinaryArray&>(array));
- auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
- if (array.null_count()) {
- batch->hasNulls = true;
- }
- FixedSizeBinaryAppender appender{array_, batch, orc_offset, 0, array_.byte_width()};
- ArrayDataVisitor<FixedSizeBinaryType> visitor;
- RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
- return Status::OK();
-}
-
-Status WriteStructBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- std::shared_ptr<Array> array_ = MakeArray(array.data());
- std::shared_ptr<StructArray> struct_array(checked_pointer_cast<StructArray>(array_));
- auto batch = checked_cast<liborc::StructVectorBatch*>(column_vector_batch);
- std::size_t size = array.type()->fields().size();
- int64_t arrow_length = array.length();
- int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
- // First fill fields of ColumnVectorBatch
- if (array.null_count()) {
- batch->hasNulls = true;
- }
- for (; running_arrow_offset < arrow_length;
- running_orc_offset++, running_arrow_offset++) {
- batch->notNull[running_orc_offset] = !array.IsNull(running_arrow_offset);
- }
- // Fill the fields
- for (std::size_t i = 0; i < size; i++) {
- batch->fields[i]->resize(orc_offset + arrow_length);
- RETURN_NOT_OK(WriteBatch(*(struct_array->field(i)), orc_offset, batch->fields[i]));
- }
- return Status::OK();
-}
-
-template <class ArrayType>
-Status WriteListBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- const ArrayType& list_array(checked_cast<const ArrayType&>(array));
- auto batch = checked_cast<liborc::ListVectorBatch*>(column_vector_batch);
- liborc::ColumnVectorBatch* element_batch = (batch->elements).get();
- int64_t arrow_length = array.length();
- int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
- if (orc_offset == 0) {
- batch->offsets[0] = 0;
- }
- if (array.null_count()) {
- batch->hasNulls = true;
- }
- for (; running_arrow_offset < arrow_length;
- running_orc_offset++, running_arrow_offset++) {
- if (array.IsNull(running_arrow_offset)) {
- batch->notNull[running_orc_offset] = false;
- batch->offsets[running_orc_offset + 1] = batch->offsets[running_orc_offset];
- } else {
- batch->notNull[running_orc_offset] = true;
- batch->offsets[running_orc_offset + 1] =
- batch->offsets[running_orc_offset] +
- list_array.value_offset(running_arrow_offset + 1) -
- list_array.value_offset(running_arrow_offset);
- element_batch->resize(batch->offsets[running_orc_offset + 1]);
- int64_t subarray_arrow_offset = list_array.value_offset(running_arrow_offset),
- subarray_orc_offset = batch->offsets[running_orc_offset],
- subarray_orc_length =
- batch->offsets[running_orc_offset + 1] - subarray_orc_offset;
- RETURN_NOT_OK(WriteBatch(
- *(list_array.values()->Slice(subarray_arrow_offset, subarray_orc_length)),
- subarray_orc_offset, element_batch));
- }
- }
- return Status::OK();
-}
-
-Status WriteMapBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- const MapArray& map_array(checked_cast<const MapArray&>(array));
- auto batch = checked_cast<liborc::MapVectorBatch*>(column_vector_batch);
- liborc::ColumnVectorBatch* key_batch = (batch->keys).get();
- liborc::ColumnVectorBatch* element_batch = (batch->elements).get();
- std::shared_ptr<Array> key_array = map_array.keys();
- std::shared_ptr<Array> element_array = map_array.items();
- int64_t arrow_length = array.length();
- int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
- if (orc_offset == 0) {
- batch->offsets[0] = 0;
- }
- if (array.null_count()) {
- batch->hasNulls = true;
- }
- for (; running_arrow_offset < arrow_length;
- running_orc_offset++, running_arrow_offset++) {
- if (array.IsNull(running_arrow_offset)) {
- batch->notNull[running_orc_offset] = false;
- batch->offsets[running_orc_offset + 1] = batch->offsets[running_orc_offset];
- } else {
- batch->notNull[running_orc_offset] = true;
- batch->offsets[running_orc_offset + 1] =
- batch->offsets[running_orc_offset] +
- map_array.value_offset(running_arrow_offset + 1) -
- map_array.value_offset(running_arrow_offset);
- int64_t subarray_arrow_offset = map_array.value_offset(running_arrow_offset),
- subarray_orc_offset = batch->offsets[running_orc_offset],
- new_subarray_orc_offset = batch->offsets[running_orc_offset + 1],
- subarray_orc_length = new_subarray_orc_offset - subarray_orc_offset;
- key_batch->resize(new_subarray_orc_offset);
- element_batch->resize(new_subarray_orc_offset);
- RETURN_NOT_OK(
- WriteBatch(*(key_array->Slice(subarray_arrow_offset, subarray_orc_length)),
- subarray_orc_offset, key_batch));
- RETURN_NOT_OK(
- WriteBatch(*(element_array->Slice(subarray_arrow_offset, subarray_orc_length)),
- subarray_orc_offset, element_batch));
- }
- }
- return Status::OK();
-}
-
-Status WriteBatch(const Array& array, int64_t orc_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- Type::type kind = array.type_id();
- column_vector_batch->numElements = orc_offset;
- switch (kind) {
- case Type::type::BOOL:
- return WriteGenericBatch<BooleanType, liborc::LongVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::INT8:
- return WriteGenericBatch<Int8Type, liborc::LongVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::INT16:
- return WriteGenericBatch<Int16Type, liborc::LongVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::INT32:
- return WriteGenericBatch<Int32Type, liborc::LongVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::INT64:
- return WriteGenericBatch<Int64Type, liborc::LongVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::FLOAT:
- return WriteGenericBatch<FloatType, liborc::DoubleVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::DOUBLE:
- return WriteGenericBatch<DoubleType, liborc::DoubleVectorBatch>(
- array, orc_offset, column_vector_batch);
- case Type::type::BINARY:
- return WriteGenericBatch<BinaryType, liborc::StringVectorBatch>(
- array, orc_offset, column_vector_batch);
- case Type::type::LARGE_BINARY:
- return WriteGenericBatch<LargeBinaryType, liborc::StringVectorBatch>(
- array, orc_offset, column_vector_batch);
- case Type::type::STRING:
- return WriteGenericBatch<StringType, liborc::StringVectorBatch>(
- array, orc_offset, column_vector_batch);
- case Type::type::LARGE_STRING:
- return WriteGenericBatch<LargeStringType, liborc::StringVectorBatch>(
- array, orc_offset, column_vector_batch);
- case Type::type::FIXED_SIZE_BINARY:
- return WriteFixedSizeBinaryBatch(array, orc_offset, column_vector_batch);
- case Type::type::DATE32:
- return WriteGenericBatch<Date32Type, liborc::LongVectorBatch>(array, orc_offset,
- column_vector_batch);
- case Type::type::DATE64:
- return WriteTimestampBatch<Date64Type>(array, orc_offset, column_vector_batch,
- kOneSecondMillis, kOneMilliNanos);
- case Type::type::TIMESTAMP: {
- switch (internal::checked_pointer_cast<TimestampType>(array.type())->unit()) {
- case TimeUnit::type::SECOND:
- return WriteTimestampBatch<TimestampType>(
- array, orc_offset, column_vector_batch, 1, kOneSecondNanos);
- case TimeUnit::type::MILLI:
- return WriteTimestampBatch<TimestampType>(
- array, orc_offset, column_vector_batch, kOneSecondMillis, kOneMilliNanos);
- case TimeUnit::type::MICRO:
- return WriteTimestampBatch<TimestampType>(
- array, orc_offset, column_vector_batch, kOneSecondMicros, kOneMicroNanos);
- case TimeUnit::type::NANO:
- return WriteTimestampBatch<TimestampType>(
- array, orc_offset, column_vector_batch, kOneSecondNanos, 1);
- default:
- return Status::TypeError("Unknown or unsupported Arrow type: ",
- array.type()->ToString());
- }
- }
- case Type::type::DECIMAL128: {
- int32_t precision = checked_pointer_cast<Decimal128Type>(array.type())->precision();
- if (precision > 18) {
- return WriteGenericBatch<Decimal128Type, liborc::Decimal128VectorBatch>(
- array, orc_offset, column_vector_batch);
- } else {
- return WriteGenericBatch<Decimal128Type, liborc::Decimal64VectorBatch>(
- array, orc_offset, column_vector_batch);
- }
- }
- case Type::type::STRUCT:
- return WriteStructBatch(array, orc_offset, column_vector_batch);
- case Type::type::LIST:
- return WriteListBatch<ListArray>(array, orc_offset, column_vector_batch);
- case Type::type::LARGE_LIST:
- return WriteListBatch<LargeListArray>(array, orc_offset, column_vector_batch);
- case Type::type::FIXED_SIZE_LIST:
- return WriteListBatch<FixedSizeListArray>(array, orc_offset, column_vector_batch);
- case Type::type::MAP:
- return WriteMapBatch(array, orc_offset, column_vector_batch);
- default: {
- return Status::NotImplemented("Unknown or unsupported Arrow type: ",
- array.type()->ToString());
- }
- }
- return Status::OK();
-}
-
-Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const DataType& type) {
- Type::type kind = type.id();
- switch (kind) {
- case Type::type::BOOL:
- return liborc::createPrimitiveType(liborc::TypeKind::BOOLEAN);
- case Type::type::INT8:
- return liborc::createPrimitiveType(liborc::TypeKind::BYTE);
- case Type::type::INT16:
- return liborc::createPrimitiveType(liborc::TypeKind::SHORT);
- case Type::type::INT32:
- return liborc::createPrimitiveType(liborc::TypeKind::INT);
- case Type::type::INT64:
- return liborc::createPrimitiveType(liborc::TypeKind::LONG);
- case Type::type::FLOAT:
- return liborc::createPrimitiveType(liborc::TypeKind::FLOAT);
- case Type::type::DOUBLE:
- return liborc::createPrimitiveType(liborc::TypeKind::DOUBLE);
- // Use STRING instead of VARCHAR for now, both use UTF-8
- case Type::type::STRING:
- case Type::type::LARGE_STRING:
- return liborc::createPrimitiveType(liborc::TypeKind::STRING);
- case Type::type::BINARY:
- case Type::type::LARGE_BINARY:
- case Type::type::FIXED_SIZE_BINARY:
- return liborc::createPrimitiveType(liborc::TypeKind::BINARY);
- case Type::type::DATE32:
- return liborc::createPrimitiveType(liborc::TypeKind::DATE);
- case Type::type::DATE64:
- case Type::type::TIMESTAMP:
- return liborc::createPrimitiveType(liborc::TypeKind::TIMESTAMP);
- case Type::type::DECIMAL128: {
- const uint64_t precision =
- static_cast<uint64_t>(checked_cast<const Decimal128Type&>(type).precision());
- const uint64_t scale =
- static_cast<uint64_t>(checked_cast<const Decimal128Type&>(type).scale());
- return liborc::createDecimalType(precision, scale);
- }
- case Type::type::LIST:
- case Type::type::FIXED_SIZE_LIST:
- case Type::type::LARGE_LIST: {
- std::shared_ptr<DataType> arrow_child_type =
- checked_cast<const BaseListType&>(type).value_type();
- ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
- return liborc::createListType(std::move(orc_subtype));
- }
- case Type::type::STRUCT: {
- ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createStructType();
- std::vector<std::shared_ptr<Field>> arrow_fields =
- checked_cast<const StructType&>(type).fields();
- for (const auto& arrow_field : arrow_fields) {
- std::string field_name = arrow_field->name();
- std::shared_ptr<DataType> arrow_child_type = arrow_field->type();
- ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
- out_type->addStructField(field_name, std::move(orc_subtype));
- }
- return std::move(out_type);
- }
- case Type::type::MAP: {
- std::shared_ptr<DataType> key_arrow_type =
- checked_cast<const MapType&>(type).key_type();
- std::shared_ptr<DataType> item_arrow_type =
- checked_cast<const MapType&>(type).item_type();
- ARROW_ASSIGN_OR_RAISE(auto key_orc_type, GetOrcType(*key_arrow_type));
- ARROW_ASSIGN_OR_RAISE(auto item_orc_type, GetOrcType(*item_arrow_type));
- return liborc::createMapType(std::move(key_orc_type), std::move(item_orc_type));
- }
- case Type::type::DENSE_UNION:
- case Type::type::SPARSE_UNION: {
- ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createUnionType();
- std::vector<std::shared_ptr<Field>> arrow_fields =
- checked_cast<const UnionType&>(type).fields();
- for (const auto& arrow_field : arrow_fields) {
- std::shared_ptr<DataType> arrow_child_type = arrow_field->type();
- ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
- out_type->addUnionChild(std::move(orc_subtype));
- }
- return std::move(out_type);
- }
- default: {
- return Status::NotImplemented("Unknown or unsupported Arrow type: ",
- type.ToString());
- }
- }
-}
-
-} // namespace
-
-Status WriteBatch(const ChunkedArray& chunked_array, int64_t length,
- int* arrow_chunk_offset, int64_t* arrow_index_offset,
- liborc::ColumnVectorBatch* column_vector_batch) {
- int num_batch = chunked_array.num_chunks();
- int64_t orc_offset = 0;
- while (*arrow_chunk_offset < num_batch && orc_offset < length) {
- ARROW_ASSIGN_OR_RAISE(auto array,
- NormalizeArray(chunked_array.chunk(*arrow_chunk_offset)));
- int64_t num_written_elements =
- std::min(length - orc_offset, array->length() - *arrow_index_offset);
- if (num_written_elements > 0) {
- RETURN_NOT_OK(WriteBatch(*(array->Slice(*arrow_index_offset, num_written_elements)),
- orc_offset, column_vector_batch));
- orc_offset += num_written_elements;
- *arrow_index_offset += num_written_elements;
- }
- if (orc_offset < length) { // Another Arrow Array done
- *arrow_index_offset = 0;
- (*arrow_chunk_offset)++;
- }
- }
- column_vector_batch->numElements = orc_offset;
- return Status::OK();
-}
-
-Status GetArrowType(const liborc::Type* type, std::shared_ptr<DataType>* out) {
- // When subselecting fields on read, liborc will set some nodes to nullptr,
- // so we need to check for nullptr before progressing
- if (type == nullptr) {
- *out = null();
- return Status::OK();
- }
- liborc::TypeKind kind = type->getKind();
- const int subtype_count = static_cast<int>(type->getSubtypeCount());
-
- switch (kind) {
- case liborc::BOOLEAN:
- *out = boolean();
- break;
- case liborc::BYTE:
- *out = int8();
- break;
- case liborc::SHORT:
- *out = int16();
- break;
- case liborc::INT:
- *out = int32();
- break;
- case liborc::LONG:
- *out = int64();
- break;
- case liborc::FLOAT:
- *out = float32();
- break;
- case liborc::DOUBLE:
- *out = float64();
- break;
- case liborc::VARCHAR:
- case liborc::STRING:
- *out = utf8();
- break;
- case liborc::BINARY:
- *out = binary();
- break;
- case liborc::CHAR:
- *out = fixed_size_binary(static_cast<int>(type->getMaximumLength()));
- break;
- case liborc::TIMESTAMP:
- *out = timestamp(TimeUnit::NANO);
- break;
- case liborc::DATE:
- *out = date32();
- break;
- case liborc::DECIMAL: {
- const int precision = static_cast<int>(type->getPrecision());
- const int scale = static_cast<int>(type->getScale());
- if (precision == 0) {
- // In HIVE 0.11/0.12 precision is set as 0, but means max precision
- *out = decimal128(38, 6);
- } else {
- *out = decimal128(precision, scale);
- }
- break;
- }
- case liborc::LIST: {
- if (subtype_count != 1) {
- return Status::TypeError("Invalid Orc List type");
- }
- std::shared_ptr<DataType> elemtype;
- RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &elemtype));
- *out = list(elemtype);
- break;
- }
- case liborc::MAP: {
- if (subtype_count != 2) {
- return Status::TypeError("Invalid Orc Map type");
- }
- std::shared_ptr<DataType> key_type, item_type;
- RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &key_type));
- RETURN_NOT_OK(GetArrowType(type->getSubtype(1), &item_type));
- *out = map(key_type, item_type);
- break;
- }
- case liborc::STRUCT: {
- std::vector<std::shared_ptr<Field>> fields;
- for (int child = 0; child < subtype_count; ++child) {
- std::shared_ptr<DataType> elem_type;
- RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elem_type));
- std::string name = type->getFieldName(child);
- fields.push_back(field(name, elem_type));
- }
- *out = struct_(fields);
- break;
- }
- case liborc::UNION: {
- std::vector<std::shared_ptr<Field>> fields;
- std::vector<int8_t> type_codes;
- for (int child = 0; child < subtype_count; ++child) {
- std::shared_ptr<DataType> elem_type;
- RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elem_type));
- fields.push_back(field("_union_" + std::to_string(child), elem_type));
- type_codes.push_back(static_cast<int8_t>(child));
- }
- *out = sparse_union(fields, type_codes);
- break;
- }
- default: {
- return Status::TypeError("Unknown Orc type kind: ", type->toString());
- }
- }
- return Status::OK();
-}
-
-Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const Schema& schema) {
- int numFields = schema.num_fields();
- ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createStructType();
- for (int i = 0; i < numFields; i++) {
- std::shared_ptr<Field> field = schema.field(i);
- std::string field_name = field->name();
- std::shared_ptr<DataType> arrow_child_type = field->type();
- ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
- out_type->addStructField(field_name, std::move(orc_subtype));
- }
- return std::move(out_type);
-}
-
-} // namespace orc
-} // namespace adapters
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/adapters/orc/adapter_util.h"
+
+#include <cmath>
+#include <string>
+#include <vector>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/builder.h"
+#include "arrow/chunked_array.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/range.h"
+#include "arrow/util/string_view.h"
+#include "arrow/visitor_inline.h"
+#include "orc/Exceptions.hh"
+#include "orc/MemoryPool.hh"
+#include "orc/OrcFile.hh"
+
+// Alias so the ORC library namespace does not clash with arrow::adapters::orc
+namespace liborc = orc;
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace adapters {
+namespace orc {
+
+namespace {
+
+// Conversion factors between seconds, milliseconds, microseconds and nanoseconds
+constexpr int64_t kOneSecondMillis = 1000LL;
+constexpr int64_t kOneMicroNanos = 1000LL;
+constexpr int64_t kOneSecondMicros = 1000000LL;
+constexpr int64_t kOneMilliNanos = 1000000LL;
+constexpr int64_t kOneSecondNanos = 1000000000LL;
+
+Status AppendStructBatch(const liborc::Type* type,
+ liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<StructBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::StructVectorBatch*>(column_vector_batch);
+
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+ RETURN_NOT_OK(builder->AppendValues(length, valid_bytes));
+
+ for (int i = 0; i < builder->num_fields(); i++) {
+ RETURN_NOT_OK(AppendBatch(type->getSubtype(i), batch->fields[i], offset, length,
+ builder->field_builder(i)));
+ }
+ return Status::OK();
+}
+
+Status AppendListBatch(const liborc::Type* type,
+ liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<ListBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::ListVectorBatch*>(column_vector_batch);
+ liborc::ColumnVectorBatch* elements = batch->elements.get();
+ const liborc::Type* elemtype = type->getSubtype(0);
+
+ const bool has_nulls = batch->hasNulls;
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ int64_t start = batch->offsets[i];
+ int64_t end = batch->offsets[i + 1];
+ RETURN_NOT_OK(builder->Append());
+ RETURN_NOT_OK(
+ AppendBatch(elemtype, elements, start, end - start, builder->value_builder()));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ return Status::OK();
+}
+
+Status AppendMapBatch(const liborc::Type* type,
+ liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<MapBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::MapVectorBatch*>(column_vector_batch);
+ liborc::ColumnVectorBatch* keys = batch->keys.get();
+ liborc::ColumnVectorBatch* items = batch->elements.get();
+ const liborc::Type* key_type = type->getSubtype(0);
+ const liborc::Type* item_type = type->getSubtype(1);
+
+ const bool has_nulls = batch->hasNulls;
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ int64_t start = batch->offsets[i];
+ int64_t end = batch->offsets[i + 1];
+ RETURN_NOT_OK(builder->Append());
+ RETURN_NOT_OK(
+ AppendBatch(key_type, keys, start, end - start, builder->key_builder()));
+ RETURN_NOT_OK(
+ AppendBatch(item_type, items, start, end - start, builder->item_builder()));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ return Status::OK();
+}
+
+template <class BuilderType, class BatchType, class ElemType>
+Status AppendNumericBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<BuilderType*>(abuilder);
+ auto batch = checked_cast<BatchType*>(column_vector_batch);
+
+ if (length == 0) {
+ return Status::OK();
+ }
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+ const ElemType* source = batch->data.data() + offset;
+ RETURN_NOT_OK(builder->AppendValues(source, length, valid_bytes));
+ return Status::OK();
+}
+
+template <class BuilderType, class TargetType, class BatchType, class SourceType>
+Status AppendNumericBatchCast(liborc::ColumnVectorBatch* column_vector_batch,
+ int64_t offset, int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<BuilderType*>(abuilder);
+ auto batch = checked_cast<BatchType*>(column_vector_batch);
+
+ if (length == 0) {
+ return Status::OK();
+ }
+
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+ const SourceType* source = batch->data.data() + offset;
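+  // MakeLazyRange applies the cast element by element as the builder
+  // consumes the iterators, so no temporary buffer of TargetType values is
+  // materialized.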
+ auto cast_iter = internal::MakeLazyRange(
+ [&source](int64_t index) { return static_cast<TargetType>(source[index]); },
+ length);
+
+ RETURN_NOT_OK(builder->AppendValues(cast_iter.begin(), cast_iter.end(), valid_bytes));
+
+ return Status::OK();
+}
+
+Status AppendBoolBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<BooleanBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::LongVectorBatch*>(column_vector_batch);
+
+ if (length == 0) {
+ return Status::OK();
+ }
+
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+ const int64_t* source = batch->data.data() + offset;
+
+ auto cast_iter = internal::MakeLazyRange(
+ [&source](int64_t index) { return static_cast<bool>(source[index]); }, length);
+
+ RETURN_NOT_OK(builder->AppendValues(cast_iter.begin(), cast_iter.end(), valid_bytes));
+
+ return Status::OK();
+}
+
+Status AppendTimestampBatch(liborc::ColumnVectorBatch* column_vector_batch,
+ int64_t offset, int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<TimestampBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
+
+ if (length == 0) {
+ return Status::OK();
+ }
+
+ const uint8_t* valid_bytes = nullptr;
+ if (batch->hasNulls) {
+ valid_bytes = reinterpret_cast<const uint8_t*>(batch->notNull.data()) + offset;
+ }
+
+ const int64_t* seconds = batch->data.data() + offset;
+ const int64_t* nanos = batch->nanoseconds.data() + offset;
+
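+  // ORC keeps a timestamp as separate seconds and nanoseconds, while the
+  // Arrow builder expects a single NANO value; combine them here. For
+  // example, (seconds = 1, nanos = 500) becomes 1 * 1000000000 + 500.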
+ auto transform_timestamp = [seconds, nanos](int64_t index) {
+ return seconds[index] * kOneSecondNanos + nanos[index];
+ };
+
+ auto transform_range = internal::MakeLazyRange(transform_timestamp, length);
+
+ RETURN_NOT_OK(
+ builder->AppendValues(transform_range.begin(), transform_range.end(), valid_bytes));
+ return Status::OK();
+}
+
+template <class BuilderType>
+Status AppendBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<BuilderType*>(abuilder);
+ auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
+
+ const bool has_nulls = batch->hasNulls;
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ RETURN_NOT_OK(
+ builder->Append(batch->data[i], static_cast<int32_t>(batch->length[i])));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ return Status::OK();
+}
+
+Status AppendFixedBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch,
+ int64_t offset, int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<FixedSizeBinaryBuilder*>(abuilder);
+ auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
+
+ const bool has_nulls = batch->hasNulls;
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ RETURN_NOT_OK(builder->Append(batch->data[i]));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ return Status::OK();
+}
+
+Status AppendDecimalBatch(const liborc::Type* type,
+ liborc::ColumnVectorBatch* column_vector_batch, int64_t offset,
+ int64_t length, ArrayBuilder* abuilder) {
+ auto builder = checked_cast<Decimal128Builder*>(abuilder);
+
+ const bool has_nulls = column_vector_batch->hasNulls;
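+  // liborc stores decimals with precision <= 18 in Decimal64VectorBatch and
+  // wider ones in Decimal128VectorBatch; precision 0 is the legacy HIVE
+  // "maximum precision" case and is also read through the 128-bit batch.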
+ if (type->getPrecision() == 0 || type->getPrecision() > 18) {
+ auto batch = checked_cast<liborc::Decimal128VectorBatch*>(column_vector_batch);
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ RETURN_NOT_OK(builder->Append(
+ Decimal128(batch->values[i].getHighBits(), batch->values[i].getLowBits())));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ } else {
+ auto batch = checked_cast<liborc::Decimal64VectorBatch*>(column_vector_batch);
+ for (int64_t i = offset; i < length + offset; i++) {
+ if (!has_nulls || batch->notNull[i]) {
+ RETURN_NOT_OK(builder->Append(Decimal128(batch->values[i])));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ }
+ return Status::OK();
+}
+
+} // namespace
+
+Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch,
+ int64_t offset, int64_t length, ArrayBuilder* builder) {
+ if (type == nullptr) {
+ return Status::OK();
+ }
+ liborc::TypeKind kind = type->getKind();
+ switch (kind) {
+ case liborc::STRUCT:
+ return AppendStructBatch(type, batch, offset, length, builder);
+ case liborc::LIST:
+ return AppendListBatch(type, batch, offset, length, builder);
+ case liborc::MAP:
+ return AppendMapBatch(type, batch, offset, length, builder);
+ case liborc::LONG:
+ return AppendNumericBatch<Int64Builder, liborc::LongVectorBatch, int64_t>(
+ batch, offset, length, builder);
+ case liborc::INT:
+ return AppendNumericBatchCast<Int32Builder, int32_t, liborc::LongVectorBatch,
+ int64_t>(batch, offset, length, builder);
+ case liborc::SHORT:
+ return AppendNumericBatchCast<Int16Builder, int16_t, liborc::LongVectorBatch,
+ int64_t>(batch, offset, length, builder);
+ case liborc::BYTE:
+ return AppendNumericBatchCast<Int8Builder, int8_t, liborc::LongVectorBatch,
+ int64_t>(batch, offset, length, builder);
+ case liborc::DOUBLE:
+ return AppendNumericBatch<DoubleBuilder, liborc::DoubleVectorBatch, double>(
+ batch, offset, length, builder);
+ case liborc::FLOAT:
+ return AppendNumericBatchCast<FloatBuilder, float, liborc::DoubleVectorBatch,
+ double>(batch, offset, length, builder);
+ case liborc::BOOLEAN:
+ return AppendBoolBatch(batch, offset, length, builder);
+ case liborc::VARCHAR:
+ case liborc::STRING:
+ return AppendBinaryBatch<StringBuilder>(batch, offset, length, builder);
+ case liborc::BINARY:
+ return AppendBinaryBatch<BinaryBuilder>(batch, offset, length, builder);
+ case liborc::CHAR:
+ return AppendFixedBinaryBatch(batch, offset, length, builder);
+ case liborc::DATE:
+ return AppendNumericBatchCast<Date32Builder, int32_t, liborc::LongVectorBatch,
+ int64_t>(batch, offset, length, builder);
+ case liborc::TIMESTAMP:
+ return AppendTimestampBatch(batch, offset, length, builder);
+ case liborc::DECIMAL:
+ return AppendDecimalBatch(type, batch, offset, length, builder);
+ default:
+ return Status::NotImplemented("Not implemented type kind: ", kind);
+ }
+}
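+
+// A minimal read-side sketch (hypothetical names, assuming a LONG ORC column
+// whose liborc::Type* and ColumnVectorBatch* were obtained from a reader):
+//
+//   std::unique_ptr<ArrayBuilder> builder;
+//   RETURN_NOT_OK(MakeBuilder(default_memory_pool(), int64(), &builder));
+//   RETURN_NOT_OK(AppendBatch(orc_type, batch, /*offset=*/0,
+//                             batch->numElements, builder.get()));
+//   ARROW_ASSIGN_OR_RAISE(auto array, builder->Finish());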
+
+namespace {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+Status WriteBatch(const Array& parray, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch);
+
+// Make sure the children of a StructArray carry null bitmaps that also
+// reflect the parent's nulls.
+Result<std::shared_ptr<Array>> NormalizeArray(const std::shared_ptr<Array>& array) {
+ Type::type kind = array->type_id();
+ switch (kind) {
+ case Type::type::STRUCT: {
+ if (array->null_count() == 0) {
+ return array;
+ } else {
+ auto struct_array = checked_pointer_cast<StructArray>(array);
+ const std::shared_ptr<Buffer> bitmap = struct_array->null_bitmap();
+ std::shared_ptr<DataType> struct_type = struct_array->type();
+ std::size_t size = struct_type->fields().size();
+ std::vector<std::shared_ptr<Array>> new_children(size, nullptr);
+ for (std::size_t i = 0; i < size; i++) {
+ std::shared_ptr<Array> child = struct_array->field(i);
+ const std::shared_ptr<Buffer> child_bitmap = child->null_bitmap();
+ std::shared_ptr<Buffer> final_child_bitmap;
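+          // A child value is valid only if both the parent struct slot and
+          // the child slot are valid, so AND the two bitmaps together when
+          // the child has its own bitmap.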
+ if (child_bitmap == nullptr) {
+ final_child_bitmap = bitmap;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ final_child_bitmap,
+ internal::BitmapAnd(default_memory_pool(), bitmap->data(), 0,
+ child_bitmap->data(), 0, struct_array->length(), 0));
+ }
+ std::shared_ptr<ArrayData> child_array_data = child->data();
+ std::vector<std::shared_ptr<Buffer>> child_buffers = child_array_data->buffers;
+ child_buffers[0] = final_child_bitmap;
+ std::shared_ptr<ArrayData> new_child_array_data =
+ ArrayData::Make(child->type(), child->length(), child_buffers,
+ child_array_data->child_data, child_array_data->dictionary);
+ ARROW_ASSIGN_OR_RAISE(new_children[i],
+ NormalizeArray(MakeArray(new_child_array_data)));
+ }
+ return std::make_shared<StructArray>(struct_type, struct_array->length(),
+ new_children, bitmap);
+ }
+ }
+ case Type::type::LIST: {
+ auto list_array = checked_pointer_cast<ListArray>(array);
+ ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
+ return std::make_shared<ListArray>(list_array->type(), list_array->length(),
+ list_array->value_offsets(), value_array,
+ list_array->null_bitmap());
+ }
+ case Type::type::LARGE_LIST: {
+ auto list_array = checked_pointer_cast<LargeListArray>(array);
+ ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
+ return std::make_shared<LargeListArray>(list_array->type(), list_array->length(),
+ list_array->value_offsets(), value_array,
+ list_array->null_bitmap());
+ }
+ case Type::type::FIXED_SIZE_LIST: {
+ auto list_array = checked_pointer_cast<FixedSizeListArray>(array);
+ ARROW_ASSIGN_OR_RAISE(auto value_array, NormalizeArray(list_array->values()));
+ return std::make_shared<FixedSizeListArray>(list_array->type(),
+ list_array->length(), value_array,
+ list_array->null_bitmap());
+ }
+ case Type::type::MAP: {
+ auto map_array = checked_pointer_cast<MapArray>(array);
+ ARROW_ASSIGN_OR_RAISE(auto key_array, NormalizeArray(map_array->keys()));
+ ARROW_ASSIGN_OR_RAISE(auto item_array, NormalizeArray(map_array->items()));
+ return std::make_shared<MapArray>(map_array->type(), map_array->length(),
+ map_array->value_offsets(), key_array, item_array,
+ map_array->null_bitmap());
+ }
+ default: {
+ return array;
+ }
+ }
+}
+
+template <class DataType, class BatchType, typename Enable = void>
+struct Appender {};
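+// Each Appender is driven by ArrayDataVisitor: Visit() invokes VisitNull() or
+// VisitValue() once per slot, and the Appender copies that slot into the ORC
+// batch while advancing the running Arrow and ORC offsets.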
+
+// Types for long/double-like Appender, that is, numeric, boolean or date32
+template <typename T>
+using is_generic_type =
+ std::integral_constant<bool, is_number_type<T>::value ||
+ std::is_same<Date32Type, T>::value ||
+ is_boolean_type<T>::value>;
+template <typename T, typename R = void>
+using enable_if_generic = enable_if_t<is_generic_type<T>::value, R>;
+
+// Number-like
+template <class DataType, class BatchType>
+struct Appender<DataType, BatchType, enable_if_generic<DataType>> {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ using ValueType = typename TypeTraits<DataType>::CType;
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(ValueType v) {
+ batch->data[running_orc_offset] = array.Value(running_arrow_offset);
+ batch->notNull[running_orc_offset] = true;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const ArrayType& array;
+ BatchType* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+};
+
+// Binary
+template <class DataType>
+struct Appender<DataType, liborc::StringVectorBatch> {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ using COffsetType = typename TypeTraits<DataType>::OffsetType::c_type;
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(util::string_view v) {
+ batch->notNull[running_orc_offset] = true;
+ COffsetType data_length = 0;
+ batch->data[running_orc_offset] = reinterpret_cast<char*>(
+ const_cast<uint8_t*>(array.GetValue(running_arrow_offset, &data_length)));
+ batch->length[running_orc_offset] = data_length;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const ArrayType& array;
+ liborc::StringVectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+};
+
+// Decimal
+template <>
+struct Appender<Decimal128Type, liborc::Decimal64VectorBatch> {
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(util::string_view v) {
+ batch->notNull[running_orc_offset] = true;
+ const Decimal128 dec_value(array.GetValue(running_arrow_offset));
+ batch->values[running_orc_offset] = static_cast<int64_t>(dec_value.low_bits());
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const Decimal128Array& array;
+ liborc::Decimal64VectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+};
+
+template <>
+struct Appender<Decimal128Type, liborc::Decimal128VectorBatch> {
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(util::string_view v) {
+ batch->notNull[running_orc_offset] = true;
+ const Decimal128 dec_value(array.GetValue(running_arrow_offset));
+ batch->values[running_orc_offset] =
+ liborc::Int128(dec_value.high_bits(), dec_value.low_bits());
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const Decimal128Array& array;
+ liborc::Decimal128VectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+};
+
+// Date64 and Timestamp
+template <class DataType>
+struct TimestampAppender {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
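+  // Split the raw value into ORC's (seconds, nanoseconds) pair. For example,
+  // with millisecond input (conversion_factor_from_second = 1000,
+  // conversion_factor_to_nano = 1000000), the value 1500 becomes
+  // seconds = 1 and nanoseconds = 500 * 1000000.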
+ Status VisitValue(int64_t v) {
+ int64_t data = array.Value(running_arrow_offset);
+ batch->notNull[running_orc_offset] = true;
+ batch->data[running_orc_offset] =
+ static_cast<int64_t>(std::floor(data / conversion_factor_from_second));
+ batch->nanoseconds[running_orc_offset] =
+ (data - conversion_factor_from_second * batch->data[running_orc_offset]) *
+ conversion_factor_to_nano;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const ArrayType& array;
+ liborc::TimestampVectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+ int64_t conversion_factor_from_second, conversion_factor_to_nano;
+};
+
+// Fixed-size binary
+struct FixedSizeBinaryAppender {
+ Status VisitNull() {
+ batch->notNull[running_orc_offset] = false;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ Status VisitValue(util::string_view v) {
+ batch->notNull[running_orc_offset] = true;
+ batch->data[running_orc_offset] = reinterpret_cast<char*>(
+ const_cast<uint8_t*>(array.GetValue(running_arrow_offset)));
+ batch->length[running_orc_offset] = data_length;
+ running_orc_offset++;
+ running_arrow_offset++;
+ return Status::OK();
+ }
+ const FixedSizeBinaryArray& array;
+ liborc::StringVectorBatch* batch;
+ int64_t running_orc_offset, running_arrow_offset;
+ const int32_t data_length;
+};
+
+// A static_cast from int64_t or double to the same type shouldn't introduce
+// any runtime overhead. Please see
+// https://stackoverflow.com/questions/19106826/
+// can-static-cast-to-same-type-introduce-runtime-overhead
+template <class DataType, class BatchType>
+Status WriteGenericBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ const ArrayType& array_(checked_cast<const ArrayType&>(array));
+ auto batch = checked_cast<BatchType*>(column_vector_batch);
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ Appender<DataType, BatchType> appender{array_, batch, orc_offset, 0};
+ ArrayDataVisitor<DataType> visitor;
+ RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
+ return Status::OK();
+}
+
+template <class DataType>
+Status WriteTimestampBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch,
+ const int64_t& conversion_factor_from_second,
+ const int64_t& conversion_factor_to_nano) {
+ using ArrayType = typename TypeTraits<DataType>::ArrayType;
+ const ArrayType& array_(checked_cast<const ArrayType&>(array));
+ auto batch = checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ TimestampAppender<DataType> appender{array_,
+ batch,
+ orc_offset,
+ 0,
+ conversion_factor_from_second,
+ conversion_factor_to_nano};
+ ArrayDataVisitor<DataType> visitor;
+ RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
+ return Status::OK();
+}
+
+Status WriteFixedSizeBinaryBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ const FixedSizeBinaryArray& array_(checked_cast<const FixedSizeBinaryArray&>(array));
+ auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ FixedSizeBinaryAppender appender{array_, batch, orc_offset, 0, array_.byte_width()};
+ ArrayDataVisitor<FixedSizeBinaryType> visitor;
+ RETURN_NOT_OK(visitor.Visit(*(array_.data()), &appender));
+ return Status::OK();
+}
+
+Status WriteStructBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ std::shared_ptr<Array> array_ = MakeArray(array.data());
+ std::shared_ptr<StructArray> struct_array(checked_pointer_cast<StructArray>(array_));
+ auto batch = checked_cast<liborc::StructVectorBatch*>(column_vector_batch);
+ std::size_t size = array.type()->fields().size();
+ int64_t arrow_length = array.length();
+ int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
+  // First fill the notNull flags of the ColumnVectorBatch
+  if (array.null_count()) {
+    batch->hasNulls = true;
+  }
+  for (; running_arrow_offset < arrow_length;
+       running_orc_offset++, running_arrow_offset++) {
+    batch->notNull[running_orc_offset] = !array.IsNull(running_arrow_offset);
+  }
+  // Then fill the child fields
+ for (std::size_t i = 0; i < size; i++) {
+ batch->fields[i]->resize(orc_offset + arrow_length);
+ RETURN_NOT_OK(WriteBatch(*(struct_array->field(i)), orc_offset, batch->fields[i]));
+ }
+ return Status::OK();
+}
+
+template <class ArrayType>
+Status WriteListBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ const ArrayType& list_array(checked_cast<const ArrayType&>(array));
+ auto batch = checked_cast<liborc::ListVectorBatch*>(column_vector_batch);
+ liborc::ColumnVectorBatch* element_batch = (batch->elements).get();
+ int64_t arrow_length = array.length();
+ int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
+ if (orc_offset == 0) {
+ batch->offsets[0] = 0;
+ }
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
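+  // ORC list batches use cumulative offsets: offsets[i + 1] - offsets[i] is
+  // the length of list i. Null entries get a zero-length slot so the running
+  // offsets stay consistent.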
+ for (; running_arrow_offset < arrow_length;
+ running_orc_offset++, running_arrow_offset++) {
+ if (array.IsNull(running_arrow_offset)) {
+ batch->notNull[running_orc_offset] = false;
+ batch->offsets[running_orc_offset + 1] = batch->offsets[running_orc_offset];
+ } else {
+ batch->notNull[running_orc_offset] = true;
+ batch->offsets[running_orc_offset + 1] =
+ batch->offsets[running_orc_offset] +
+ list_array.value_offset(running_arrow_offset + 1) -
+ list_array.value_offset(running_arrow_offset);
+ element_batch->resize(batch->offsets[running_orc_offset + 1]);
+ int64_t subarray_arrow_offset = list_array.value_offset(running_arrow_offset),
+ subarray_orc_offset = batch->offsets[running_orc_offset],
+ subarray_orc_length =
+ batch->offsets[running_orc_offset + 1] - subarray_orc_offset;
+ RETURN_NOT_OK(WriteBatch(
+ *(list_array.values()->Slice(subarray_arrow_offset, subarray_orc_length)),
+ subarray_orc_offset, element_batch));
+ }
+ }
+ return Status::OK();
+}
+
+Status WriteMapBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ const MapArray& map_array(checked_cast<const MapArray&>(array));
+ auto batch = checked_cast<liborc::MapVectorBatch*>(column_vector_batch);
+ liborc::ColumnVectorBatch* key_batch = (batch->keys).get();
+ liborc::ColumnVectorBatch* element_batch = (batch->elements).get();
+ std::shared_ptr<Array> key_array = map_array.keys();
+ std::shared_ptr<Array> element_array = map_array.items();
+ int64_t arrow_length = array.length();
+ int64_t running_arrow_offset = 0, running_orc_offset = orc_offset;
+ if (orc_offset == 0) {
+ batch->offsets[0] = 0;
+ }
+ if (array.null_count()) {
+ batch->hasNulls = true;
+ }
+ for (; running_arrow_offset < arrow_length;
+ running_orc_offset++, running_arrow_offset++) {
+ if (array.IsNull(running_arrow_offset)) {
+ batch->notNull[running_orc_offset] = false;
+ batch->offsets[running_orc_offset + 1] = batch->offsets[running_orc_offset];
+ } else {
+ batch->notNull[running_orc_offset] = true;
+ batch->offsets[running_orc_offset + 1] =
+ batch->offsets[running_orc_offset] +
+ map_array.value_offset(running_arrow_offset + 1) -
+ map_array.value_offset(running_arrow_offset);
+ int64_t subarray_arrow_offset = map_array.value_offset(running_arrow_offset),
+ subarray_orc_offset = batch->offsets[running_orc_offset],
+ new_subarray_orc_offset = batch->offsets[running_orc_offset + 1],
+ subarray_orc_length = new_subarray_orc_offset - subarray_orc_offset;
+ key_batch->resize(new_subarray_orc_offset);
+ element_batch->resize(new_subarray_orc_offset);
+ RETURN_NOT_OK(
+ WriteBatch(*(key_array->Slice(subarray_arrow_offset, subarray_orc_length)),
+ subarray_orc_offset, key_batch));
+ RETURN_NOT_OK(
+ WriteBatch(*(element_array->Slice(subarray_arrow_offset, subarray_orc_length)),
+ subarray_orc_offset, element_batch));
+ }
+ }
+ return Status::OK();
+}
+
+Status WriteBatch(const Array& array, int64_t orc_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ Type::type kind = array.type_id();
+ column_vector_batch->numElements = orc_offset;
+ switch (kind) {
+ case Type::type::BOOL:
+ return WriteGenericBatch<BooleanType, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::INT8:
+ return WriteGenericBatch<Int8Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::INT16:
+ return WriteGenericBatch<Int16Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::INT32:
+ return WriteGenericBatch<Int32Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::INT64:
+ return WriteGenericBatch<Int64Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::FLOAT:
+ return WriteGenericBatch<FloatType, liborc::DoubleVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::DOUBLE:
+ return WriteGenericBatch<DoubleType, liborc::DoubleVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::BINARY:
+ return WriteGenericBatch<BinaryType, liborc::StringVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::LARGE_BINARY:
+ return WriteGenericBatch<LargeBinaryType, liborc::StringVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::STRING:
+ return WriteGenericBatch<StringType, liborc::StringVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::LARGE_STRING:
+ return WriteGenericBatch<LargeStringType, liborc::StringVectorBatch>(
+ array, orc_offset, column_vector_batch);
+ case Type::type::FIXED_SIZE_BINARY:
+ return WriteFixedSizeBinaryBatch(array, orc_offset, column_vector_batch);
+ case Type::type::DATE32:
+ return WriteGenericBatch<Date32Type, liborc::LongVectorBatch>(array, orc_offset,
+ column_vector_batch);
+ case Type::type::DATE64:
+ return WriteTimestampBatch<Date64Type>(array, orc_offset, column_vector_batch,
+ kOneSecondMillis, kOneMilliNanos);
+ case Type::type::TIMESTAMP: {
+ switch (internal::checked_pointer_cast<TimestampType>(array.type())->unit()) {
+ case TimeUnit::type::SECOND:
+ return WriteTimestampBatch<TimestampType>(
+ array, orc_offset, column_vector_batch, 1, kOneSecondNanos);
+ case TimeUnit::type::MILLI:
+ return WriteTimestampBatch<TimestampType>(
+ array, orc_offset, column_vector_batch, kOneSecondMillis, kOneMilliNanos);
+ case TimeUnit::type::MICRO:
+ return WriteTimestampBatch<TimestampType>(
+ array, orc_offset, column_vector_batch, kOneSecondMicros, kOneMicroNanos);
+ case TimeUnit::type::NANO:
+ return WriteTimestampBatch<TimestampType>(
+ array, orc_offset, column_vector_batch, kOneSecondNanos, 1);
+ default:
+ return Status::TypeError("Unknown or unsupported Arrow type: ",
+ array.type()->ToString());
+ }
+ }
+ case Type::type::DECIMAL128: {
+ int32_t precision = checked_pointer_cast<Decimal128Type>(array.type())->precision();
+ if (precision > 18) {
+ return WriteGenericBatch<Decimal128Type, liborc::Decimal128VectorBatch>(
+ array, orc_offset, column_vector_batch);
+ } else {
+ return WriteGenericBatch<Decimal128Type, liborc::Decimal64VectorBatch>(
+ array, orc_offset, column_vector_batch);
+ }
+ }
+ case Type::type::STRUCT:
+ return WriteStructBatch(array, orc_offset, column_vector_batch);
+ case Type::type::LIST:
+ return WriteListBatch<ListArray>(array, orc_offset, column_vector_batch);
+ case Type::type::LARGE_LIST:
+ return WriteListBatch<LargeListArray>(array, orc_offset, column_vector_batch);
+ case Type::type::FIXED_SIZE_LIST:
+ return WriteListBatch<FixedSizeListArray>(array, orc_offset, column_vector_batch);
+ case Type::type::MAP:
+ return WriteMapBatch(array, orc_offset, column_vector_batch);
+ default: {
+ return Status::NotImplemented("Unknown or unsupported Arrow type: ",
+ array.type()->ToString());
+ }
+ }
+ return Status::OK();
+}
+
+Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const DataType& type) {
+ Type::type kind = type.id();
+ switch (kind) {
+ case Type::type::BOOL:
+ return liborc::createPrimitiveType(liborc::TypeKind::BOOLEAN);
+ case Type::type::INT8:
+ return liborc::createPrimitiveType(liborc::TypeKind::BYTE);
+ case Type::type::INT16:
+ return liborc::createPrimitiveType(liborc::TypeKind::SHORT);
+ case Type::type::INT32:
+ return liborc::createPrimitiveType(liborc::TypeKind::INT);
+ case Type::type::INT64:
+ return liborc::createPrimitiveType(liborc::TypeKind::LONG);
+ case Type::type::FLOAT:
+ return liborc::createPrimitiveType(liborc::TypeKind::FLOAT);
+ case Type::type::DOUBLE:
+ return liborc::createPrimitiveType(liborc::TypeKind::DOUBLE);
+    // Use STRING instead of VARCHAR for now; both use UTF-8
+ case Type::type::STRING:
+ case Type::type::LARGE_STRING:
+ return liborc::createPrimitiveType(liborc::TypeKind::STRING);
+ case Type::type::BINARY:
+ case Type::type::LARGE_BINARY:
+ case Type::type::FIXED_SIZE_BINARY:
+ return liborc::createPrimitiveType(liborc::TypeKind::BINARY);
+ case Type::type::DATE32:
+ return liborc::createPrimitiveType(liborc::TypeKind::DATE);
+ case Type::type::DATE64:
+ case Type::type::TIMESTAMP:
+ return liborc::createPrimitiveType(liborc::TypeKind::TIMESTAMP);
+ case Type::type::DECIMAL128: {
+ const uint64_t precision =
+ static_cast<uint64_t>(checked_cast<const Decimal128Type&>(type).precision());
+ const uint64_t scale =
+ static_cast<uint64_t>(checked_cast<const Decimal128Type&>(type).scale());
+ return liborc::createDecimalType(precision, scale);
+ }
+ case Type::type::LIST:
+ case Type::type::FIXED_SIZE_LIST:
+ case Type::type::LARGE_LIST: {
+ std::shared_ptr<DataType> arrow_child_type =
+ checked_cast<const BaseListType&>(type).value_type();
+ ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
+ return liborc::createListType(std::move(orc_subtype));
+ }
+ case Type::type::STRUCT: {
+ ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createStructType();
+ std::vector<std::shared_ptr<Field>> arrow_fields =
+ checked_cast<const StructType&>(type).fields();
+      for (const auto& field : arrow_fields) {
+        ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*field->type()));
+        out_type->addStructField(field->name(), std::move(orc_subtype));
+      }
+ return std::move(out_type);
+ }
+ case Type::type::MAP: {
+ std::shared_ptr<DataType> key_arrow_type =
+ checked_cast<const MapType&>(type).key_type();
+ std::shared_ptr<DataType> item_arrow_type =
+ checked_cast<const MapType&>(type).item_type();
+ ARROW_ASSIGN_OR_RAISE(auto key_orc_type, GetOrcType(*key_arrow_type));
+ ARROW_ASSIGN_OR_RAISE(auto item_orc_type, GetOrcType(*item_arrow_type));
+ return liborc::createMapType(std::move(key_orc_type), std::move(item_orc_type));
+ }
+ case Type::type::DENSE_UNION:
+ case Type::type::SPARSE_UNION: {
+ ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createUnionType();
+ std::vector<std::shared_ptr<Field>> arrow_fields =
+ checked_cast<const UnionType&>(type).fields();
+      for (const auto& field : arrow_fields) {
+        ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*field->type()));
+        out_type->addUnionChild(std::move(orc_subtype));
+      }
+ return std::move(out_type);
+ }
+ default: {
+ return Status::NotImplemented("Unknown or unsupported Arrow type: ",
+ type.ToString());
+ }
+ }
+}
+
+} // namespace
+
+Status WriteBatch(const ChunkedArray& chunked_array, int64_t length,
+ int* arrow_chunk_offset, int64_t* arrow_index_offset,
+ liborc::ColumnVectorBatch* column_vector_batch) {
+ int num_batch = chunked_array.num_chunks();
+ int64_t orc_offset = 0;
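+  // Resume from (*arrow_chunk_offset, *arrow_index_offset) and write until
+  // either `length` ORC slots are filled or the chunked array is exhausted;
+  // both offsets are updated so that the next call picks up where this one
+  // stopped.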
+ while (*arrow_chunk_offset < num_batch && orc_offset < length) {
+ ARROW_ASSIGN_OR_RAISE(auto array,
+ NormalizeArray(chunked_array.chunk(*arrow_chunk_offset)));
+ int64_t num_written_elements =
+ std::min(length - orc_offset, array->length() - *arrow_index_offset);
+ if (num_written_elements > 0) {
+ RETURN_NOT_OK(WriteBatch(*(array->Slice(*arrow_index_offset, num_written_elements)),
+ orc_offset, column_vector_batch));
+ orc_offset += num_written_elements;
+ *arrow_index_offset += num_written_elements;
+ }
+    if (orc_offset < length) {  // Finished the current chunk; advance to the next
+ *arrow_index_offset = 0;
+ (*arrow_chunk_offset)++;
+ }
+ }
+ column_vector_batch->numElements = orc_offset;
+ return Status::OK();
+}
+
+Status GetArrowType(const liborc::Type* type, std::shared_ptr<DataType>* out) {
+  // When subselecting fields on read, liborc will set some nodes to nullptr,
+  // so we need to check for nullptr before proceeding
+ if (type == nullptr) {
+ *out = null();
+ return Status::OK();
+ }
+ liborc::TypeKind kind = type->getKind();
+ const int subtype_count = static_cast<int>(type->getSubtypeCount());
+
+ switch (kind) {
+ case liborc::BOOLEAN:
+ *out = boolean();
+ break;
+ case liborc::BYTE:
+ *out = int8();
+ break;
+ case liborc::SHORT:
+ *out = int16();
+ break;
+ case liborc::INT:
+ *out = int32();
+ break;
+ case liborc::LONG:
+ *out = int64();
+ break;
+ case liborc::FLOAT:
+ *out = float32();
+ break;
+ case liborc::DOUBLE:
+ *out = float64();
+ break;
+ case liborc::VARCHAR:
+ case liborc::STRING:
+ *out = utf8();
+ break;
+ case liborc::BINARY:
+ *out = binary();
+ break;
+ case liborc::CHAR:
+ *out = fixed_size_binary(static_cast<int>(type->getMaximumLength()));
+ break;
+ case liborc::TIMESTAMP:
+ *out = timestamp(TimeUnit::NANO);
+ break;
+ case liborc::DATE:
+ *out = date32();
+ break;
+ case liborc::DECIMAL: {
+ const int precision = static_cast<int>(type->getPrecision());
+ const int scale = static_cast<int>(type->getScale());
+ if (precision == 0) {
+        // In HIVE 0.11/0.12 the precision is set to 0, which means the
+        // maximum precision
+ *out = decimal128(38, 6);
+ } else {
+ *out = decimal128(precision, scale);
+ }
+ break;
+ }
+ case liborc::LIST: {
+ if (subtype_count != 1) {
+ return Status::TypeError("Invalid Orc List type");
+ }
+ std::shared_ptr<DataType> elemtype;
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &elemtype));
+ *out = list(elemtype);
+ break;
+ }
+ case liborc::MAP: {
+ if (subtype_count != 2) {
+ return Status::TypeError("Invalid Orc Map type");
+ }
+ std::shared_ptr<DataType> key_type, item_type;
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(0), &key_type));
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(1), &item_type));
+ *out = map(key_type, item_type);
+ break;
+ }
+ case liborc::STRUCT: {
+ std::vector<std::shared_ptr<Field>> fields;
+ for (int child = 0; child < subtype_count; ++child) {
+ std::shared_ptr<DataType> elem_type;
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elem_type));
+ std::string name = type->getFieldName(child);
+ fields.push_back(field(name, elem_type));
+ }
+ *out = struct_(fields);
+ break;
+ }
+ case liborc::UNION: {
+ std::vector<std::shared_ptr<Field>> fields;
+ std::vector<int8_t> type_codes;
+ for (int child = 0; child < subtype_count; ++child) {
+ std::shared_ptr<DataType> elem_type;
+ RETURN_NOT_OK(GetArrowType(type->getSubtype(child), &elem_type));
+ fields.push_back(field("_union_" + std::to_string(child), elem_type));
+ type_codes.push_back(static_cast<int8_t>(child));
+ }
+ *out = sparse_union(fields, type_codes);
+ break;
+ }
+ default: {
+ return Status::TypeError("Unknown Orc type kind: ", type->toString());
+ }
+ }
+ return Status::OK();
+}
+
+Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const Schema& schema) {
+  const int num_fields = schema.num_fields();
+  ORC_UNIQUE_PTR<liborc::Type> out_type = liborc::createStructType();
+  for (int i = 0; i < num_fields; i++) {
+ std::shared_ptr<Field> field = schema.field(i);
+ std::string field_name = field->name();
+ std::shared_ptr<DataType> arrow_child_type = field->type();
+ ARROW_ASSIGN_OR_RAISE(auto orc_subtype, GetOrcType(*arrow_child_type));
+ out_type->addStructField(field_name, std::move(orc_subtype));
+ }
+ return std::move(out_type);
+}
+
+} // namespace orc
+} // namespace adapters
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h
index 8176715aa51..3e6d0fcc660 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/adapters/orc/adapter_util.h
@@ -1,57 +1,57 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-
-#include "arrow/array/builder_base.h"
-#include "arrow/status.h"
-#include "orc/OrcFile.hh"
-
-namespace liborc = orc;
-
-namespace arrow {
-
-namespace adapters {
-
-namespace orc {
-
-Status GetArrowType(const liborc::Type* type, std::shared_ptr<DataType>* out);
-
-Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const Schema& schema);
-
-Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch,
- int64_t offset, int64_t length, arrow::ArrayBuilder* builder);
-
-/// \brief Write a chunked array to an orc::ColumnVectorBatch
-///
-/// \param[in] chunked_array the chunked array
-/// \param[in] length the orc::ColumnVectorBatch size limit
-/// \param[in,out] arrow_chunk_offset The current chunk being processed
-/// \param[in,out] arrow_index_offset The index of the arrow_chunk_offset array
-/// before or after a process
-/// \param[in,out] column_vector_batch the orc::ColumnVectorBatch to be filled
-/// \return Status
-Status WriteBatch(const ChunkedArray& chunked_array, int64_t length,
- int* arrow_chunk_offset, int64_t* arrow_index_offset,
- liborc::ColumnVectorBatch* column_vector_batch);
-
-} // namespace orc
-} // namespace adapters
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/status.h"
+#include "orc/OrcFile.hh"
+
+namespace liborc = orc;
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+Status GetArrowType(const liborc::Type* type, std::shared_ptr<DataType>* out);
+
+Result<ORC_UNIQUE_PTR<liborc::Type>> GetOrcType(const Schema& schema);
+
+Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch,
+ int64_t offset, int64_t length, arrow::ArrayBuilder* builder);
+
+/// \brief Write a chunked array to an orc::ColumnVectorBatch
+///
+/// \param[in] chunked_array the chunked array
+/// \param[in] length the orc::ColumnVectorBatch size limit
+/// \param[in,out] arrow_chunk_offset The current chunk being processed
+/// \param[in,out] arrow_index_offset The element index within the current
+/// chunk, updated as values are written
+/// \param[in,out] column_vector_batch the orc::ColumnVectorBatch to be filled
+/// \return Status
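+///
+/// May be called repeatedly with the same offset variables to write a long
+/// chunked array as a sequence of fixed-size batches; each call resumes
+/// where the previous one stopped.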
+Status WriteBatch(const ChunkedArray& chunked_array, int64_t length,
+ int* arrow_chunk_offset, int64_t* arrow_index_offset,
+ liborc::ColumnVectorBatch* column_vector_batch);
+
+} // namespace orc
+} // namespace adapters
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc
index 5d731baa777..67c5ca84e1f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.cc
@@ -73,10 +73,10 @@ struct ScalarFromArraySlotImpl {
return Finish(Decimal128(a.GetValue(index_)));
}
- Status Visit(const Decimal256Array& a) {
- return Finish(Decimal256(a.GetValue(index_)));
- }
-
+ Status Visit(const Decimal256Array& a) {
+ return Finish(Decimal256(a.GetValue(index_)));
+ }
+
template <typename T>
Status Visit(const BaseBinaryArray<T>& a) {
return Finish(a.GetString(index_));
@@ -222,31 +222,31 @@ bool Array::ApproxEquals(const std::shared_ptr<Array>& arr,
}
bool Array::RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
- int64_t other_start_idx, const EqualOptions& opts) const {
- return ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, opts);
+ int64_t other_start_idx, const EqualOptions& opts) const {
+ return ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, opts);
}
bool Array::RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
- int64_t end_idx, int64_t other_start_idx,
- const EqualOptions& opts) const {
+ int64_t end_idx, int64_t other_start_idx,
+ const EqualOptions& opts) const {
if (!other) {
return false;
}
- return ArrayRangeEquals(*this, *other, start_idx, end_idx, other_start_idx, opts);
+ return ArrayRangeEquals(*this, *other, start_idx, end_idx, other_start_idx, opts);
}
bool Array::RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
- const Array& other, const EqualOptions& opts) const {
- return ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, opts);
+ const Array& other, const EqualOptions& opts) const {
+ return ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, opts);
}
bool Array::RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
- const std::shared_ptr<Array>& other,
- const EqualOptions& opts) const {
+ const std::shared_ptr<Array>& other,
+ const EqualOptions& opts) const {
if (!other) {
return false;
}
- return ArrayRangeEquals(*this, *other, start_idx, end_idx, other_start_idx, opts);
+ return ArrayRangeEquals(*this, *other, start_idx, end_idx, other_start_idx, opts);
}
std::shared_ptr<Array> Array::Slice(int64_t offset, int64_t length) const {
@@ -302,7 +302,7 @@ Status Array::Validate() const { return internal::ValidateArray(*this); }
Status Array::ValidateFull() const {
RETURN_NOT_OK(internal::ValidateArray(*this));
- return internal::ValidateArrayFull(*this);
+ return internal::ValidateArrayFull(*this);
}
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h
index 469ae94d2eb..2add572e7a4 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_base.h
@@ -56,17 +56,17 @@ class ARROW_EXPORT Array {
/// \brief Return true if value at index is null. Does not boundscheck
bool IsNull(int64_t i) const {
- return null_bitmap_data_ != NULLPTR
- ? !BitUtil::GetBit(null_bitmap_data_, i + data_->offset)
- : data_->null_count == data_->length;
+ return null_bitmap_data_ != NULLPTR
+ ? !BitUtil::GetBit(null_bitmap_data_, i + data_->offset)
+ : data_->null_count == data_->length;
}
/// \brief Return true if value at index is valid (not null). Does not
/// boundscheck
bool IsValid(int64_t i) const {
- return null_bitmap_data_ != NULLPTR
- ? BitUtil::GetBit(null_bitmap_data_, i + data_->offset)
- : data_->null_count != data_->length;
+ return null_bitmap_data_ != NULLPTR
+ ? BitUtil::GetBit(null_bitmap_data_, i + data_->offset)
+ : data_->null_count != data_->length;
}
/// \brief Return a Scalar containing the value of this array at i
@@ -93,7 +93,7 @@ class ARROW_EXPORT Array {
///
/// Note that for `null_count == 0` or for null type, this will be null.
/// This buffer does not account for any slice offset
- const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }
+ const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }
/// Raw pointer to the null bitmap.
///
@@ -121,17 +121,17 @@ class ARROW_EXPORT Array {
/// Compare if the range of slots specified are equal for the given array and
/// this array. end_idx exclusive. This methods does not bounds check.
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
- const Array& other,
- const EqualOptions& = EqualOptions::Defaults()) const;
+ const Array& other,
+ const EqualOptions& = EqualOptions::Defaults()) const;
bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
- const std::shared_ptr<Array>& other,
- const EqualOptions& = EqualOptions::Defaults()) const;
+ const std::shared_ptr<Array>& other,
+ const EqualOptions& = EqualOptions::Defaults()) const;
bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
- int64_t other_start_idx,
- const EqualOptions& = EqualOptions::Defaults()) const;
+ int64_t other_start_idx,
+ const EqualOptions& = EqualOptions::Defaults()) const;
bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
- int64_t end_idx, int64_t other_start_idx,
- const EqualOptions& = EqualOptions::Defaults()) const;
+ int64_t end_idx, int64_t other_start_idx,
+ const EqualOptions& = EqualOptions::Defaults()) const;
Status Accept(ArrayVisitor* visitor) const;
@@ -162,7 +162,7 @@ class ARROW_EXPORT Array {
/// Input-checking variant of Array::Slice
Result<std::shared_ptr<Array>> SliceSafe(int64_t offset) const;
- const std::shared_ptr<ArrayData>& data() const { return data_; }
+ const std::shared_ptr<ArrayData>& data() const { return data_; }
int num_fields() const { return static_cast<int>(data_->child_data.size()); }
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc
index 14a3a8ef961..9466b5a48f9 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.cc
@@ -21,9 +21,9 @@
#include <memory>
#include "arrow/array/array_base.h"
-#include "arrow/array/validate.h"
+#include "arrow/array/validate.h"
#include "arrow/type.h"
-#include "arrow/type_traits.h"
+#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
@@ -32,7 +32,7 @@ namespace arrow {
using internal::checked_cast;
BinaryArray::BinaryArray(const std::shared_ptr<ArrayData>& data) {
- ARROW_CHECK(is_binary_like(data->type->id()));
+ ARROW_CHECK(is_binary_like(data->type->id()));
SetData(data);
}
@@ -45,7 +45,7 @@ BinaryArray::BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_of
}
LargeBinaryArray::LargeBinaryArray(const std::shared_ptr<ArrayData>& data) {
- ARROW_CHECK(is_large_binary_like(data->type->id()));
+ ARROW_CHECK(is_large_binary_like(data->type->id()));
SetData(data);
}
@@ -71,7 +71,7 @@ StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_of
offset));
}
-Status StringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
+Status StringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
LargeStringArray::LargeStringArray(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), Type::LARGE_STRING);
@@ -87,7 +87,7 @@ LargeStringArray::LargeStringArray(int64_t length,
null_count, offset));
}
-Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
+Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }
FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) {
SetData(data);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h
index 735042f4a09..f8e8c4f8a44 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_binary.h
@@ -28,7 +28,7 @@
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
-#include "arrow/stl_iterator.h"
+#include "arrow/stl_iterator.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
@@ -47,7 +47,7 @@ class BaseBinaryArray : public FlatArray {
public:
using TypeClass = TYPE;
using offset_type = typename TypeClass::offset_type;
- using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
+ using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
/// Return the pointer to the given elements bytes
// XXX should GetValue(int64_t i) return a string_view?
@@ -71,13 +71,13 @@ class BaseBinaryArray : public FlatArray {
raw_value_offsets_[i + 1] - pos);
}
- /// \brief Get binary value as a string_view
- /// Provided for consistency with other arrays.
- ///
- /// \param i the value index
- /// \return the view over the selected value
- util::string_view Value(int64_t i) const { return GetView(i); }
-
+ /// \brief Get binary value as a string_view
+ /// Provided for consistency with other arrays.
+ ///
+ /// \param i the value index
+ /// \return the view over the selected value
+ util::string_view Value(int64_t i) const { return GetView(i); }
+
/// \brief Get binary value as a std::string
///
/// \param i the value index
@@ -124,13 +124,13 @@ class BaseBinaryArray : public FlatArray {
}
}
- IteratorType begin() const { return IteratorType(*this); }
-
- IteratorType end() const { return IteratorType(*this, length()); }
-
+ IteratorType begin() const { return IteratorType(*this); }
+
+ IteratorType end() const { return IteratorType(*this, length()); }
+
protected:
// For subclasses
- BaseBinaryArray() = default;
+ BaseBinaryArray() = default;
// Protected method for constructors
void SetData(const std::shared_ptr<ArrayData>& data) {
@@ -139,8 +139,8 @@ class BaseBinaryArray : public FlatArray {
raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
}
- const offset_type* raw_value_offsets_ = NULLPTR;
- const uint8_t* raw_data_ = NULLPTR;
+ const offset_type* raw_value_offsets_ = NULLPTR;
+ const uint8_t* raw_data_ = NULLPTR;
};
/// Concrete Array class for variable-size binary data
@@ -216,7 +216,7 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
public:
using TypeClass = FixedSizeBinaryType;
- using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;
+ using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;
explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);
@@ -238,10 +238,10 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; }
- IteratorType begin() const { return IteratorType(*this); }
-
- IteratorType end() const { return IteratorType(*this, length()); }
-
+ IteratorType begin() const { return IteratorType(*this); }
+
+ IteratorType end() const { return IteratorType(*this, length()); }
+
protected:
void SetData(const std::shared_ptr<ArrayData>& data) {
this->PrimitiveArray::SetData(data);
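
The IteratorType lines restored above give the binary arrays STL-style begin()/end() via stl::ArrayIterator. A sketch of range-for iteration, assuming that in this vintage of the library dereferencing yields a util::optional wrapping the GetView() result (empty for null slots):

#include <iostream>
#include <string>

#include "arrow/array/array_binary.h"

void PrintStrings(const arrow::StringArray& strings) {
  // Each slot is a util::optional<util::string_view>; an empty optional is a null.
  for (auto slot : strings) {
    if (slot.has_value()) {
      std::cout << std::string(slot->data(), slot->size()) << "\n";
    } else {
      std::cout << "(null)\n";
    }
  }
}
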
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc
index 58852a7b6c5..d65f6ee5356 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.cc
@@ -33,11 +33,11 @@ namespace arrow {
using internal::checked_cast;
// ----------------------------------------------------------------------
-// Decimal128
+// Decimal128
Decimal128Array::Decimal128Array(const std::shared_ptr<ArrayData>& data)
: FixedSizeBinaryArray(data) {
- ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL128);
+ ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL128);
}
std::string Decimal128Array::FormatValue(int64_t i) const {
@@ -46,18 +46,18 @@ std::string Decimal128Array::FormatValue(int64_t i) const {
return value.ToString(type_.scale());
}
-// ----------------------------------------------------------------------
-// Decimal256
-
-Decimal256Array::Decimal256Array(const std::shared_ptr<ArrayData>& data)
- : FixedSizeBinaryArray(data) {
- ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL256);
-}
-
-std::string Decimal256Array::FormatValue(int64_t i) const {
- const auto& type_ = checked_cast<const Decimal256Type&>(*type());
- const Decimal256 value(GetValue(i));
- return value.ToString(type_.scale());
-}
-
+// ----------------------------------------------------------------------
+// Decimal256
+
+Decimal256Array::Decimal256Array(const std::shared_ptr<ArrayData>& data)
+ : FixedSizeBinaryArray(data) {
+ ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL256);
+}
+
+std::string Decimal256Array::FormatValue(int64_t i) const {
+ const auto& type_ = checked_cast<const Decimal256Type&>(*type());
+ const Decimal256 value(GetValue(i));
+ return value.ToString(type_.scale());
+}
+
} // namespace arrow
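
Decimal256Array::FormatValue(), restored above, renders the raw 32-byte value through the type's scale. A sketch of the round trip, assuming the decimal256() factory and Decimal256's integral (unscaled-value) constructor behave as in stock Arrow; the precision, scale, and value are illustrative:

#include <memory>
#include <string>

#include "arrow/array/array_decimal.h"
#include "arrow/array/builder_decimal.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/decimal.h"

arrow::Status FormatOne(std::string* out) {
  auto type = arrow::decimal256(/*precision=*/40, /*scale=*/2);
  arrow::Decimal256Builder builder(type);
  // 12345 unscaled at scale 2 represents 123.45.
  ARROW_RETURN_NOT_OK(builder.Append(arrow::Decimal256(12345)));
  std::shared_ptr<arrow::Array> array;
  ARROW_RETURN_NOT_OK(builder.Finish(&array));
  const auto& decimals = static_cast<const arrow::Decimal256Array&>(*array);
  *out = decimals.FormatValue(0);  // "123.45"
  return arrow::Status::OK();
}
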
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h
index e32b9d26a35..8d7d1c59cd0 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_decimal.h
@@ -47,20 +47,20 @@ class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray {
// Backward compatibility
using DecimalArray = Decimal128Array;
-// ----------------------------------------------------------------------
-// Decimal256Array
-
-/// Concrete Array class for 256-bit decimal data
-class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray {
- public:
- using TypeClass = Decimal256Type;
-
- using FixedSizeBinaryArray::FixedSizeBinaryArray;
-
- /// \brief Construct Decimal256Array from ArrayData instance
- explicit Decimal256Array(const std::shared_ptr<ArrayData>& data);
-
- std::string FormatValue(int64_t i) const;
-};
-
+// ----------------------------------------------------------------------
+// Decimal256Array
+
+/// Concrete Array class for 256-bit decimal data
+class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray {
+ public:
+ using TypeClass = Decimal256Type;
+
+ using FixedSizeBinaryArray::FixedSizeBinaryArray;
+
+ /// \brief Construct Decimal256Array from ArrayData instance
+ explicit Decimal256Array(const std::shared_ptr<ArrayData>& data);
+
+ std::string FormatValue(int64_t i) const;
+};
+
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc
index ddb44b470f6..2fa95e9a176 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.cc
@@ -29,10 +29,10 @@
#include "arrow/array/dict_internal.h"
#include "arrow/array/util.h"
#include "arrow/buffer.h"
-#include "arrow/chunked_array.h"
-#include "arrow/datum.h"
+#include "arrow/chunked_array.h"
+#include "arrow/datum.h"
#include "arrow/status.h"
-#include "arrow/table.h"
+#include "arrow/table.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
@@ -144,88 +144,88 @@ bool DictionaryArray::CanCompareIndices(const DictionaryArray& other) const {
}
// ----------------------------------------------------------------------
-// Dictionary transposition
-
-namespace {
-
-inline bool IsTrivialTransposition(const int32_t* transpose_map,
- int64_t input_dict_size) {
- for (int64_t i = 0; i < input_dict_size; ++i) {
- if (transpose_map[i] != i) {
- return false;
- }
- }
- return true;
-}
-
-Result<std::shared_ptr<ArrayData>> TransposeDictIndices(
- const std::shared_ptr<ArrayData>& data, const std::shared_ptr<DataType>& in_type,
- const std::shared_ptr<DataType>& out_type,
- const std::shared_ptr<ArrayData>& dictionary, const int32_t* transpose_map,
- MemoryPool* pool) {
- // Note that in_type may be different from data->type if data is of type ExtensionType
- if (in_type->id() != Type::DICTIONARY || out_type->id() != Type::DICTIONARY) {
- return Status::TypeError("Expected dictionary type");
- }
- const int64_t in_dict_len = data->dictionary->length;
- const auto& in_dict_type = checked_cast<const DictionaryType&>(*in_type);
- const auto& out_dict_type = checked_cast<const DictionaryType&>(*out_type);
-
- const auto& in_index_type = *in_dict_type.index_type();
- const auto& out_index_type =
- checked_cast<const FixedWidthType&>(*out_dict_type.index_type());
-
- if (in_index_type.id() == out_index_type.id() &&
- IsTrivialTransposition(transpose_map, in_dict_len)) {
- // Index type and values will be identical => we can simply reuse
- // the existing buffers.
- auto out_data =
- ArrayData::Make(out_type, data->length, {data->buffers[0], data->buffers[1]},
- data->null_count, data->offset);
- out_data->dictionary = dictionary;
- return out_data;
- }
-
- // Default path: compute a buffer of transposed indices.
- ARROW_ASSIGN_OR_RAISE(
- auto out_buffer,
- AllocateBuffer(data->length * (out_index_type.bit_width() / CHAR_BIT), pool));
-
- // Shift null buffer if the original offset is non-zero
- std::shared_ptr<Buffer> null_bitmap;
- if (data->offset != 0 && data->null_count != 0) {
- ARROW_ASSIGN_OR_RAISE(null_bitmap, CopyBitmap(pool, data->buffers[0]->data(),
- data->offset, data->length));
- } else {
- null_bitmap = data->buffers[0];
- }
-
- auto out_data = ArrayData::Make(out_type, data->length,
- {null_bitmap, std::move(out_buffer)}, data->null_count);
- out_data->dictionary = dictionary;
- RETURN_NOT_OK(internal::TransposeInts(
- in_index_type, out_index_type, data->GetValues<uint8_t>(1, 0),
- out_data->GetMutableValues<uint8_t>(1, 0), data->offset, out_data->offset,
- data->length, transpose_map));
- return out_data;
-}
-
-} // namespace
-
-Result<std::shared_ptr<Array>> DictionaryArray::Transpose(
- const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
- const int32_t* transpose_map, MemoryPool* pool) const {
- ARROW_ASSIGN_OR_RAISE(auto transposed,
- TransposeDictIndices(data_, data_->type, type, dictionary->data(),
- transpose_map, pool));
- return MakeArray(std::move(transposed));
-}
-
-// ----------------------------------------------------------------------
-// Dictionary unification
-
-namespace {
-
+// Dictionary transposition
+
+namespace {
+
+inline bool IsTrivialTransposition(const int32_t* transpose_map,
+ int64_t input_dict_size) {
+ for (int64_t i = 0; i < input_dict_size; ++i) {
+ if (transpose_map[i] != i) {
+ return false;
+ }
+ }
+ return true;
+}
+
+Result<std::shared_ptr<ArrayData>> TransposeDictIndices(
+ const std::shared_ptr<ArrayData>& data, const std::shared_ptr<DataType>& in_type,
+ const std::shared_ptr<DataType>& out_type,
+ const std::shared_ptr<ArrayData>& dictionary, const int32_t* transpose_map,
+ MemoryPool* pool) {
+ // Note that in_type may be different from data->type if data is of type ExtensionType
+ if (in_type->id() != Type::DICTIONARY || out_type->id() != Type::DICTIONARY) {
+ return Status::TypeError("Expected dictionary type");
+ }
+ const int64_t in_dict_len = data->dictionary->length;
+ const auto& in_dict_type = checked_cast<const DictionaryType&>(*in_type);
+ const auto& out_dict_type = checked_cast<const DictionaryType&>(*out_type);
+
+ const auto& in_index_type = *in_dict_type.index_type();
+ const auto& out_index_type =
+ checked_cast<const FixedWidthType&>(*out_dict_type.index_type());
+
+ if (in_index_type.id() == out_index_type.id() &&
+ IsTrivialTransposition(transpose_map, in_dict_len)) {
+ // Index type and values will be identical => we can simply reuse
+ // the existing buffers.
+ auto out_data =
+ ArrayData::Make(out_type, data->length, {data->buffers[0], data->buffers[1]},
+ data->null_count, data->offset);
+ out_data->dictionary = dictionary;
+ return out_data;
+ }
+
+ // Default path: compute a buffer of transposed indices.
+ ARROW_ASSIGN_OR_RAISE(
+ auto out_buffer,
+ AllocateBuffer(data->length * (out_index_type.bit_width() / CHAR_BIT), pool));
+
+ // Shift null buffer if the original offset is non-zero
+ std::shared_ptr<Buffer> null_bitmap;
+ if (data->offset != 0 && data->null_count != 0) {
+ ARROW_ASSIGN_OR_RAISE(null_bitmap, CopyBitmap(pool, data->buffers[0]->data(),
+ data->offset, data->length));
+ } else {
+ null_bitmap = data->buffers[0];
+ }
+
+ auto out_data = ArrayData::Make(out_type, data->length,
+ {null_bitmap, std::move(out_buffer)}, data->null_count);
+ out_data->dictionary = dictionary;
+ RETURN_NOT_OK(internal::TransposeInts(
+ in_index_type, out_index_type, data->GetValues<uint8_t>(1, 0),
+ out_data->GetMutableValues<uint8_t>(1, 0), data->offset, out_data->offset,
+ data->length, transpose_map));
+ return out_data;
+}
+
+} // namespace
+
+Result<std::shared_ptr<Array>> DictionaryArray::Transpose(
+ const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
+ const int32_t* transpose_map, MemoryPool* pool) const {
+ ARROW_ASSIGN_OR_RAISE(auto transposed,
+ TransposeDictIndices(data_, data_->type, type, dictionary->data(),
+ transpose_map, pool));
+ return MakeArray(std::move(transposed));
+}
+
+// ----------------------------------------------------------------------
+// Dictionary unification
+
+namespace {
+
template <typename T>
class DictionaryUnifierImpl : public DictionaryUnifier {
public:
@@ -288,23 +288,23 @@ class DictionaryUnifierImpl : public DictionaryUnifier {
return Status::OK();
}
- Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
- std::shared_ptr<Array>* out_dict) override {
- int64_t dict_length = memo_table_.size();
- if (!internal::IntegersCanFit(Datum(dict_length), *index_type).ok()) {
- return Status::Invalid(
- "These dictionaries cannot be combined. The unified dictionary requires a "
- "larger index type.");
- }
-
- // Build unified dictionary array
- std::shared_ptr<ArrayData> data;
- RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_,
- 0 /* start_offset */, &data));
- *out_dict = MakeArray(data);
- return Status::OK();
- }
-
+ Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
+ std::shared_ptr<Array>* out_dict) override {
+ int64_t dict_length = memo_table_.size();
+ if (!internal::IntegersCanFit(Datum(dict_length), *index_type).ok()) {
+ return Status::Invalid(
+ "These dictionaries cannot be combined. The unified dictionary requires a "
+ "larger index type.");
+ }
+
+ // Build unified dictionary array
+ std::shared_ptr<ArrayData> data;
+ RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_,
+ 0 /* start_offset */, &data));
+ *out_dict = MakeArray(data);
+ return Status::OK();
+ }
+
private:
MemoryPool* pool_;
std::shared_ptr<DataType> value_type_;
@@ -322,7 +322,7 @@ struct MakeUnifier {
template <typename T>
enable_if_no_memoize<T, Status> Visit(const T&) {
// Default implementation for non-dictionary-supported datatypes
- return Status::NotImplemented("Unification of ", *value_type,
+ return Status::NotImplemented("Unification of ", *value_type,
" dictionaries is not implemented");
}
@@ -333,110 +333,110 @@ struct MakeUnifier {
}
};
-struct RecursiveUnifier {
- MemoryPool* pool;
-
- // Return true if any of the arrays was changed (including descendants)
- Result<bool> Unify(std::shared_ptr<DataType> type, ArrayDataVector* chunks) {
- DCHECK(!chunks->empty());
- bool changed = false;
- std::shared_ptr<DataType> ext_type = nullptr;
-
- if (type->id() == Type::EXTENSION) {
- ext_type = std::move(type);
- type = checked_cast<const ExtensionType&>(*ext_type).storage_type();
- }
-
- // Unify all child dictionaries (if any)
- if (type->num_fields() > 0) {
- ArrayDataVector children(chunks->size());
- for (int i = 0; i < type->num_fields(); ++i) {
- std::transform(chunks->begin(), chunks->end(), children.begin(),
- [i](const std::shared_ptr<ArrayData>& array) {
- return array->child_data[i];
- });
- ARROW_ASSIGN_OR_RAISE(bool child_changed,
- Unify(type->field(i)->type(), &children));
- if (child_changed) {
- // Only do this when unification actually occurred
- for (size_t j = 0; j < chunks->size(); ++j) {
- (*chunks)[j]->child_data[i] = std::move(children[j]);
- }
- changed = true;
- }
- }
+struct RecursiveUnifier {
+ MemoryPool* pool;
+
+ // Return true if any of the arrays was changed (including descendants)
+ Result<bool> Unify(std::shared_ptr<DataType> type, ArrayDataVector* chunks) {
+ DCHECK(!chunks->empty());
+ bool changed = false;
+ std::shared_ptr<DataType> ext_type = nullptr;
+
+ if (type->id() == Type::EXTENSION) {
+ ext_type = std::move(type);
+ type = checked_cast<const ExtensionType&>(*ext_type).storage_type();
}
- // Unify this dictionary
- if (type->id() == Type::DICTIONARY) {
- const auto& dict_type = checked_cast<const DictionaryType&>(*type);
- // XXX Ideally, we should unify dictionaries nested in value_type first,
- // but DictionaryUnifier doesn't supported nested dictionaries anyway,
- // so this will fail.
- ARROW_ASSIGN_OR_RAISE(auto unifier,
- DictionaryUnifier::Make(dict_type.value_type(), this->pool));
- // Unify all dictionary array chunks
- BufferVector transpose_maps(chunks->size());
- for (size_t j = 0; j < chunks->size(); ++j) {
- DCHECK_NE((*chunks)[j]->dictionary, nullptr);
- RETURN_NOT_OK(
- unifier->Unify(*MakeArray((*chunks)[j]->dictionary), &transpose_maps[j]));
- }
- std::shared_ptr<Array> dictionary;
- RETURN_NOT_OK(unifier->GetResultWithIndexType(dict_type.index_type(), &dictionary));
- for (size_t j = 0; j < chunks->size(); ++j) {
- ARROW_ASSIGN_OR_RAISE(
- (*chunks)[j],
- TransposeDictIndices(
- (*chunks)[j], type, type, dictionary->data(),
- reinterpret_cast<const int32_t*>(transpose_maps[j]->data()), this->pool));
- if (ext_type) {
- (*chunks)[j]->type = ext_type;
- }
- }
- changed = true;
- }
-
- return changed;
+ // Unify all child dictionaries (if any)
+ if (type->num_fields() > 0) {
+ ArrayDataVector children(chunks->size());
+ for (int i = 0; i < type->num_fields(); ++i) {
+ std::transform(chunks->begin(), chunks->end(), children.begin(),
+ [i](const std::shared_ptr<ArrayData>& array) {
+ return array->child_data[i];
+ });
+ ARROW_ASSIGN_OR_RAISE(bool child_changed,
+ Unify(type->field(i)->type(), &children));
+ if (child_changed) {
+ // Only do this when unification actually occurred
+ for (size_t j = 0; j < chunks->size(); ++j) {
+ (*chunks)[j]->child_data[i] = std::move(children[j]);
+ }
+ changed = true;
+ }
+ }
+ }
+
+ // Unify this dictionary
+ if (type->id() == Type::DICTIONARY) {
+ const auto& dict_type = checked_cast<const DictionaryType&>(*type);
+ // XXX Ideally, we should unify dictionaries nested in value_type first,
+ // but DictionaryUnifier doesn't support nested dictionaries anyway,
+ // so this will fail.
+ ARROW_ASSIGN_OR_RAISE(auto unifier,
+ DictionaryUnifier::Make(dict_type.value_type(), this->pool));
+ // Unify all dictionary array chunks
+ BufferVector transpose_maps(chunks->size());
+ for (size_t j = 0; j < chunks->size(); ++j) {
+ DCHECK_NE((*chunks)[j]->dictionary, nullptr);
+ RETURN_NOT_OK(
+ unifier->Unify(*MakeArray((*chunks)[j]->dictionary), &transpose_maps[j]));
+ }
+ std::shared_ptr<Array> dictionary;
+ RETURN_NOT_OK(unifier->GetResultWithIndexType(dict_type.index_type(), &dictionary));
+ for (size_t j = 0; j < chunks->size(); ++j) {
+ ARROW_ASSIGN_OR_RAISE(
+ (*chunks)[j],
+ TransposeDictIndices(
+ (*chunks)[j], type, type, dictionary->data(),
+ reinterpret_cast<const int32_t*>(transpose_maps[j]->data()), this->pool));
+ if (ext_type) {
+ (*chunks)[j]->type = ext_type;
+ }
+ }
+ changed = true;
+ }
+
+ return changed;
}
-};
+};
-} // namespace
+} // namespace
-Result<std::unique_ptr<DictionaryUnifier>> DictionaryUnifier::Make(
- std::shared_ptr<DataType> value_type, MemoryPool* pool) {
- MakeUnifier maker(pool, value_type);
- RETURN_NOT_OK(VisitTypeInline(*value_type, &maker));
- return std::move(maker.result);
-}
+Result<std::unique_ptr<DictionaryUnifier>> DictionaryUnifier::Make(
+ std::shared_ptr<DataType> value_type, MemoryPool* pool) {
+ MakeUnifier maker(pool, value_type);
+ RETURN_NOT_OK(VisitTypeInline(*value_type, &maker));
+ return std::move(maker.result);
+}
-Result<std::shared_ptr<ChunkedArray>> DictionaryUnifier::UnifyChunkedArray(
- const std::shared_ptr<ChunkedArray>& array, MemoryPool* pool) {
- if (array->num_chunks() <= 1) {
- return array;
+Result<std::shared_ptr<ChunkedArray>> DictionaryUnifier::UnifyChunkedArray(
+ const std::shared_ptr<ChunkedArray>& array, MemoryPool* pool) {
+ if (array->num_chunks() <= 1) {
+ return array;
}
- ArrayDataVector data_chunks(array->num_chunks());
- std::transform(array->chunks().begin(), array->chunks().end(), data_chunks.begin(),
- [](const std::shared_ptr<Array>& array) { return array->data(); });
- ARROW_ASSIGN_OR_RAISE(bool changed,
- RecursiveUnifier{pool}.Unify(array->type(), &data_chunks));
- if (!changed) {
- return array;
+ ArrayDataVector data_chunks(array->num_chunks());
+ std::transform(array->chunks().begin(), array->chunks().end(), data_chunks.begin(),
+ [](const std::shared_ptr<Array>& array) { return array->data(); });
+ ARROW_ASSIGN_OR_RAISE(bool changed,
+ RecursiveUnifier{pool}.Unify(array->type(), &data_chunks));
+ if (!changed) {
+ return array;
}
- ArrayVector chunks(array->num_chunks());
- std::transform(data_chunks.begin(), data_chunks.end(), chunks.begin(),
- [](const std::shared_ptr<ArrayData>& data) { return MakeArray(data); });
- return std::make_shared<ChunkedArray>(std::move(chunks), array->type());
-}
-
-Result<std::shared_ptr<Table>> DictionaryUnifier::UnifyTable(const Table& table,
- MemoryPool* pool) {
- ChunkedArrayVector columns = table.columns();
- for (auto& col : columns) {
- ARROW_ASSIGN_OR_RAISE(col, DictionaryUnifier::UnifyChunkedArray(col, pool));
+ ArrayVector chunks(array->num_chunks());
+ std::transform(data_chunks.begin(), data_chunks.end(), chunks.begin(),
+ [](const std::shared_ptr<ArrayData>& data) { return MakeArray(data); });
+ return std::make_shared<ChunkedArray>(std::move(chunks), array->type());
+}
+
+Result<std::shared_ptr<Table>> DictionaryUnifier::UnifyTable(const Table& table,
+ MemoryPool* pool) {
+ ChunkedArrayVector columns = table.columns();
+ for (auto& col : columns) {
+ ARROW_ASSIGN_OR_RAISE(col, DictionaryUnifier::UnifyChunkedArray(col, pool));
}
- return Table::Make(table.schema(), std::move(columns), table.num_rows());
+ return Table::Make(table.schema(), std::move(columns), table.num_rows());
}
} // namespace arrow
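
TransposeDictIndices above remaps each stored index i to transpose_map[i], or reuses the existing buffers when the map is the identity and the index types match. The public entry point is DictionaryArray::Transpose(); a sketch, assuming the new dictionary and transpose map were computed elsewhere (e.g. by a DictionaryUnifier):

#include <memory>
#include <vector>

#include "arrow/array/array_dict.h"
#include "arrow/memory_pool.h"
#include "arrow/result.h"

// transpose_map[i] is the index in new_dict of the value old index i referred to.
arrow::Result<std::shared_ptr<arrow::Array>> Remap(
    const arrow::DictionaryArray& array,
    const std::shared_ptr<arrow::Array>& new_dict,
    const std::vector<int32_t>& transpose_map) {
  return array.Transpose(array.type(), new_dict, transpose_map.data(),
                         arrow::default_memory_pool());
}
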
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h
index eb039331b51..8791eaa07db 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_dict.h
@@ -120,61 +120,61 @@ class ARROW_EXPORT DictionaryArray : public Array {
mutable std::shared_ptr<Array> dictionary_;
};
-/// \brief Helper class for incremental dictionary unification
-class ARROW_EXPORT DictionaryUnifier {
- public:
- virtual ~DictionaryUnifier() = default;
-
- /// \brief Construct a DictionaryUnifier
- /// \param[in] value_type the data type of the dictionaries
- /// \param[in] pool MemoryPool to use for memory allocations
- static Result<std::unique_ptr<DictionaryUnifier>> Make(
- std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
-
- /// \brief Unify dictionaries across array chunks
- ///
- /// The dictionaries in the array chunks will be unified, their indices
- /// accordingly transposed.
- ///
- /// Only dictionaries with a primitive value type are currently supported.
- /// However, dictionaries nested inside a more complex type are correctly unified.
- static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
- const std::shared_ptr<ChunkedArray>& array,
- MemoryPool* pool = default_memory_pool());
-
- /// \brief Unify dictionaries across the chunks of each table column
- ///
- /// The dictionaries in each table column will be unified, their indices
- /// accordingly transposed.
- ///
- /// Only dictionaries with a primitive value type are currently supported.
- /// However, dictionaries nested inside a more complex type are correctly unified.
- static Result<std::shared_ptr<Table>> UnifyTable(
- const Table& table, MemoryPool* pool = default_memory_pool());
-
- /// \brief Append dictionary to the internal memo
- virtual Status Unify(const Array& dictionary) = 0;
-
- /// \brief Append dictionary and compute transpose indices
- /// \param[in] dictionary the dictionary values to unify
- /// \param[out] out_transpose a Buffer containing computed transpose indices
- /// as int32_t values equal in length to the passed dictionary. The value in
- /// each slot corresponds to the new index value for each original index
- /// for a DictionaryArray with the old dictionary
- virtual Status Unify(const Array& dictionary,
- std::shared_ptr<Buffer>* out_transpose) = 0;
-
- /// \brief Return a result DictionaryType with the smallest possible index
- /// type to accommodate the unified dictionary. The unifier cannot be used
- /// after this is called
- virtual Status GetResult(std::shared_ptr<DataType>* out_type,
- std::shared_ptr<Array>* out_dict) = 0;
-
- /// \brief Return a unified dictionary with the given index type. If
- /// the index type is not large enough then an invalid status will be returned.
- /// The unifier cannot be used after this is called
- virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
- std::shared_ptr<Array>* out_dict) = 0;
-};
-
+/// \brief Helper class for incremental dictionary unification
+class ARROW_EXPORT DictionaryUnifier {
+ public:
+ virtual ~DictionaryUnifier() = default;
+
+ /// \brief Construct a DictionaryUnifier
+ /// \param[in] value_type the data type of the dictionaries
+ /// \param[in] pool MemoryPool to use for memory allocations
+ static Result<std::unique_ptr<DictionaryUnifier>> Make(
+ std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
+
+ /// \brief Unify dictionaries across array chunks
+ ///
+ /// The dictionaries in the array chunks will be unified, their indices
+ /// accordingly transposed.
+ ///
+ /// Only dictionaries with a primitive value type are currently supported.
+ /// However, dictionaries nested inside a more complex type are correctly unified.
+ static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
+ const std::shared_ptr<ChunkedArray>& array,
+ MemoryPool* pool = default_memory_pool());
+
+ /// \brief Unify dictionaries across the chunks of each table column
+ ///
+ /// The dictionaries in each table column will be unified, their indices
+ /// accordingly transposed.
+ ///
+ /// Only dictionaries with a primitive value type are currently supported.
+ /// However, dictionaries nested inside a more complex type are correctly unified.
+ static Result<std::shared_ptr<Table>> UnifyTable(
+ const Table& table, MemoryPool* pool = default_memory_pool());
+
+ /// \brief Append dictionary to the internal memo
+ virtual Status Unify(const Array& dictionary) = 0;
+
+ /// \brief Append dictionary and compute transpose indices
+ /// \param[in] dictionary the dictionary values to unify
+ /// \param[out] out_transpose a Buffer containing computed transpose indices
+ /// as int32_t values equal in length to the passed dictionary. The value in
+ /// each slot corresponds to the new index value for each original index
+ /// for a DictionaryArray with the old dictionary
+ virtual Status Unify(const Array& dictionary,
+ std::shared_ptr<Buffer>* out_transpose) = 0;
+
+ /// \brief Return a result DictionaryType with the smallest possible index
+ /// type to accommodate the unified dictionary. The unifier cannot be used
+ /// after this is called
+ virtual Status GetResult(std::shared_ptr<DataType>* out_type,
+ std::shared_ptr<Array>* out_dict) = 0;
+
+ /// \brief Return a unified dictionary with the given index type. If
+ /// the index type is not large enough then an invalid status will be returned.
+ /// The unifier cannot be used after this is called
+ virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
+ std::shared_ptr<Array>* out_dict) = 0;
+};
+
} // namespace arrow
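
The restored DictionaryUnifier interface is incremental: feed each dictionary through Unify(), then fetch the combined result once. A minimal sketch using only the methods declared above:

#include <memory>

#include "arrow/array/array_dict.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"

arrow::Status UnifyTwo(const arrow::Array& dict_a, const arrow::Array& dict_b) {
  ARROW_ASSIGN_OR_RAISE(auto unifier, arrow::DictionaryUnifier::Make(dict_a.type()));
  ARROW_RETURN_NOT_OK(unifier->Unify(dict_a));
  ARROW_RETURN_NOT_OK(unifier->Unify(dict_b));
  std::shared_ptr<arrow::DataType> out_type;  // smallest index type that fits
  std::shared_ptr<arrow::Array> out_dict;     // union of both dictionaries
  return unifier->GetResult(&out_type, &out_dict);
}
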
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc
index fdbc0eb8f3d..f967127c5f1 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_nested.cc
@@ -70,8 +70,8 @@ Status CleanListOffsets(const Array& offsets, MemoryPool* pool,
ARROW_ASSIGN_OR_RAISE(auto clean_offsets,
AllocateBuffer(num_offsets * sizeof(offset_type), pool));
- // Copy valid bits, ignoring the final offset (since for a length N list array,
- // we have N + 1 offsets)
+ // Copy valid bits, ignoring the final offset (since for a length N list array,
+ // we have N + 1 offsets)
ARROW_ASSIGN_OR_RAISE(
auto clean_valid_bits,
offsets.null_bitmap()->CopySlice(0, BitUtil::BytesForBits(num_offsets - 1)));
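
The comment re-annotated above documents the offsets invariant: a length-N list array carries N + 1 offsets, so slot i spans [offsets[i], offsets[i+1]). A one-liner that relies on it (value_length(i) is the stock shorthand):

#include <cstdint>

#include "arrow/array/array_nested.h"

int64_t SlotLength(const arrow::ListArray& lists, int64_t i) {
  // Difference of adjacent offsets; equivalent to lists.value_length(i).
  return lists.value_offset(i + 1) - lists.value_offset(i);
}
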
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h
index 3b8f769b7dc..b601eb770c3 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/array_primitive.h
@@ -25,7 +25,7 @@
#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
-#include "arrow/stl_iterator.h"
+#include "arrow/stl_iterator.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h" // IWYU pragma: export
#include "arrow/type_traits.h"
@@ -41,7 +41,7 @@ class NumericArray : public PrimitiveArray {
public:
using TypeClass = TYPE;
using value_type = typename TypeClass::c_type;
- using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;
+ using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;
explicit NumericArray(const std::shared_ptr<ArrayData>& data) : PrimitiveArray(data) {}
@@ -64,10 +64,10 @@ class NumericArray : public PrimitiveArray {
// For API compatibility with BinaryArray etc.
value_type GetView(int64_t i) const { return Value(i); }
- IteratorType begin() const { return IteratorType(*this); }
-
- IteratorType end() const { return IteratorType(*this, length()); }
-
+ IteratorType begin() const { return IteratorType(*this); }
+
+ IteratorType end() const { return IteratorType(*this, length()); }
+
protected:
using PrimitiveArray::PrimitiveArray;
};
@@ -76,7 +76,7 @@ class NumericArray : public PrimitiveArray {
class ARROW_EXPORT BooleanArray : public PrimitiveArray {
public:
using TypeClass = BooleanType;
- using IteratorType = stl::ArrayIterator<BooleanArray>;
+ using IteratorType = stl::ArrayIterator<BooleanArray>;
explicit BooleanArray(const std::shared_ptr<ArrayData>& data);
@@ -99,10 +99,10 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
/// values. Result is not cached.
int64_t true_count() const;
- IteratorType begin() const { return IteratorType(*this); }
-
- IteratorType end() const { return IteratorType(*this, length()); }
-
+ IteratorType begin() const { return IteratorType(*this); }
+
+ IteratorType end() const { return IteratorType(*this, length()); }
+
protected:
using PrimitiveArray::PrimitiveArray;
};
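
NumericArray and BooleanArray get the same stl::ArrayIterator treatment as the binary arrays. A sketch summing the valid slots of an Int64Array, under the same assumption that iteration yields one optional per slot:

#include <cstdint>

#include "arrow/array/array_primitive.h"

int64_t SumValid(const arrow::Int64Array& values) {
  int64_t total = 0;
  for (auto slot : values) {  // util::optional<int64_t>; empty means null
    if (slot.has_value()) {
      total += *slot;
    }
  }
  return total;
}
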
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h
index 0c2782e7466..c0df797256d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_adaptive.h
@@ -64,26 +64,26 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
return Status::OK();
}
- Status AppendEmptyValues(int64_t length) final {
- ARROW_RETURN_NOT_OK(CommitPendingData());
- ARROW_RETURN_NOT_OK(Reserve(length));
- memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
- UnsafeSetNotNull(length);
- return Status::OK();
- }
-
- Status AppendEmptyValue() final {
- pending_data_[pending_pos_] = 0;
- pending_valid_[pending_pos_] = 1;
- ++pending_pos_;
- ++length_;
-
- if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
- return CommitPendingData();
- }
- return Status::OK();
- }
-
+ Status AppendEmptyValues(int64_t length) final {
+ ARROW_RETURN_NOT_OK(CommitPendingData());
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
+ UnsafeSetNotNull(length);
+ return Status::OK();
+ }
+
+ Status AppendEmptyValue() final {
+ pending_data_[pending_pos_] = 0;
+ pending_valid_[pending_pos_] = 1;
+ ++pending_pos_;
+ ++length_;
+
+ if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
+ return CommitPendingData();
+ }
+ return Status::OK();
+ }
+
void Reset() override;
Status Resize(int64_t capacity) override;
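
AdaptiveIntBuilderBase::AppendEmptyValues(), restored above, zero-fills length slots at the current integer width and marks them valid, while AppendEmptyValue() goes through the pending-value batch. A usage sketch with the concrete AdaptiveIntBuilder (the appended values are illustrative):

#include <memory>

#include "arrow/array/builder_adaptive.h"
#include "arrow/status.h"

arrow::Status BuildAdaptive(std::shared_ptr<arrow::Array>* out) {
  arrow::AdaptiveIntBuilder builder;  // starts narrow, widens as values demand
  ARROW_RETURN_NOT_OK(builder.Append(1));
  ARROW_RETURN_NOT_OK(builder.AppendEmptyValues(3));  // three valid zero slots
  return builder.Finish(out);
}
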
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc
index ff11984790c..c892e3d664b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.cc
@@ -24,11 +24,11 @@
#include "arrow/array/data.h"
#include "arrow/array/util.h"
#include "arrow/buffer.h"
-#include "arrow/builder.h"
-#include "arrow/scalar.h"
+#include "arrow/builder.h"
+#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/util/logging.h"
-#include "arrow/visitor_inline.h"
+#include "arrow/visitor_inline.h"
namespace arrow {
@@ -95,162 +95,162 @@ Status ArrayBuilder::Advance(int64_t elements) {
return null_bitmap_builder_.Advance(elements);
}
-namespace {
-struct AppendScalarImpl {
- template <typename T>
- enable_if_t<has_c_type<T>::value || is_decimal_type<T>::value ||
- is_fixed_size_binary_type<T>::value,
- Status>
- Visit(const T&) {
- auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
- RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
-
- for (int64_t i = 0; i < n_repeats_; i++) {
- for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
- raw++) {
- auto scalar =
- internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
- if (scalar->is_valid) {
- builder->UnsafeAppend(scalar->value);
- } else {
- builder->UnsafeAppendNull();
- }
- }
- }
- return Status::OK();
- }
-
- template <typename T>
- enable_if_base_binary<T, Status> Visit(const T&) {
- int64_t data_size = 0;
- for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
- raw++) {
- auto scalar =
- internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
- if (scalar->is_valid) {
- data_size += scalar->value->size();
- }
- }
-
- auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
- RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
- RETURN_NOT_OK(builder->ReserveData(n_repeats_ * data_size));
-
- for (int64_t i = 0; i < n_repeats_; i++) {
- for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
- raw++) {
- auto scalar =
- internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
- if (scalar->is_valid) {
- builder->UnsafeAppend(util::string_view{*scalar->value});
- } else {
- builder->UnsafeAppendNull();
- }
- }
- }
- return Status::OK();
- }
-
- template <typename T>
- enable_if_list_like<T, Status> Visit(const T&) {
- auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
- int64_t num_children = 0;
- for (const std::shared_ptr<Scalar>* scalar = scalars_begin_; scalar != scalars_end_;
- scalar++) {
- if (!(*scalar)->is_valid) continue;
- num_children +=
- internal::checked_cast<const BaseListScalar&>(**scalar).value->length();
- }
- RETURN_NOT_OK(builder->value_builder()->Reserve(num_children * n_repeats_));
-
- for (int64_t i = 0; i < n_repeats_; i++) {
- for (const std::shared_ptr<Scalar>* scalar = scalars_begin_; scalar != scalars_end_;
- scalar++) {
- if ((*scalar)->is_valid) {
- RETURN_NOT_OK(builder->Append());
- const Array& list =
- *internal::checked_cast<const BaseListScalar&>(**scalar).value;
- for (int64_t i = 0; i < list.length(); i++) {
- ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i));
- RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar));
- }
- } else {
- RETURN_NOT_OK(builder_->AppendNull());
- }
- }
- }
- return Status::OK();
- }
-
- Status Visit(const StructType& type) {
- auto* builder = internal::checked_cast<StructBuilder*>(builder_);
- auto count = n_repeats_ * (scalars_end_ - scalars_begin_);
- RETURN_NOT_OK(builder->Reserve(count));
- for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
- RETURN_NOT_OK(builder->field_builder(field_index)->Reserve(count));
- }
- for (int64_t i = 0; i < n_repeats_; i++) {
- for (const std::shared_ptr<Scalar>* s = scalars_begin_; s != scalars_end_; s++) {
- const auto& scalar = internal::checked_cast<const StructScalar&>(**s);
- for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
- if (!scalar.is_valid || !scalar.value[field_index]) {
- RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull());
- } else {
- RETURN_NOT_OK(builder->field_builder(field_index)
- ->AppendScalar(*scalar.value[field_index]));
- }
- }
- RETURN_NOT_OK(builder->Append(scalar.is_valid));
- }
- }
- return Status::OK();
- }
-
- Status Visit(const DataType& type) {
- return Status::NotImplemented("AppendScalar for type ", type);
- }
-
- Status Convert() { return VisitTypeInline(*(*scalars_begin_)->type, this); }
-
- const std::shared_ptr<Scalar>* scalars_begin_;
- const std::shared_ptr<Scalar>* scalars_end_;
- int64_t n_repeats_;
- ArrayBuilder* builder_;
-};
-} // namespace
-
-Status ArrayBuilder::AppendScalar(const Scalar& scalar) {
- if (!scalar.type->Equals(type())) {
- return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
- " to builder for type ", type()->ToString());
- }
- std::shared_ptr<Scalar> shared{const_cast<Scalar*>(&scalar), [](Scalar*) {}};
- return AppendScalarImpl{&shared, &shared + 1, /*n_repeats=*/1, this}.Convert();
-}
-
-Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
- if (!scalar.type->Equals(type())) {
- return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
- " to builder for type ", type()->ToString());
- }
- std::shared_ptr<Scalar> shared{const_cast<Scalar*>(&scalar), [](Scalar*) {}};
- return AppendScalarImpl{&shared, &shared + 1, n_repeats, this}.Convert();
-}
-
-Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) {
- if (scalars.empty()) return Status::OK();
- const auto ty = type();
- for (const auto& scalar : scalars) {
- if (!scalar->type->Equals(ty)) {
- return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(),
- " to builder for type ", type()->ToString());
- }
- }
- return AppendScalarImpl{scalars.data(), scalars.data() + scalars.size(),
- /*n_repeats=*/1, this}
- .Convert();
-}
-
+namespace {
+struct AppendScalarImpl {
+ template <typename T>
+ enable_if_t<has_c_type<T>::value || is_decimal_type<T>::value ||
+ is_fixed_size_binary_type<T>::value,
+ Status>
+ Visit(const T&) {
+ auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
+ RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
+
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
+ raw++) {
+ auto scalar =
+ internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
+ if (scalar->is_valid) {
+ builder->UnsafeAppend(scalar->value);
+ } else {
+ builder->UnsafeAppendNull();
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_base_binary<T, Status> Visit(const T&) {
+ int64_t data_size = 0;
+ for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
+ raw++) {
+ auto scalar =
+ internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
+ if (scalar->is_valid) {
+ data_size += scalar->value->size();
+ }
+ }
+
+ auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
+ RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_)));
+ RETURN_NOT_OK(builder->ReserveData(n_repeats_ * data_size));
+
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr<Scalar>* raw = scalars_begin_; raw != scalars_end_;
+ raw++) {
+ auto scalar =
+ internal::checked_cast<const typename TypeTraits<T>::ScalarType*>(raw->get());
+ if (scalar->is_valid) {
+ builder->UnsafeAppend(util::string_view{*scalar->value});
+ } else {
+ builder->UnsafeAppendNull();
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_list_like<T, Status> Visit(const T&) {
+ auto builder = internal::checked_cast<typename TypeTraits<T>::BuilderType*>(builder_);
+ int64_t num_children = 0;
+ for (const std::shared_ptr<Scalar>* scalar = scalars_begin_; scalar != scalars_end_;
+ scalar++) {
+ if (!(*scalar)->is_valid) continue;
+ num_children +=
+ internal::checked_cast<const BaseListScalar&>(**scalar).value->length();
+ }
+ RETURN_NOT_OK(builder->value_builder()->Reserve(num_children * n_repeats_));
+
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr<Scalar>* scalar = scalars_begin_; scalar != scalars_end_;
+ scalar++) {
+ if ((*scalar)->is_valid) {
+ RETURN_NOT_OK(builder->Append());
+ const Array& list =
+ *internal::checked_cast<const BaseListScalar&>(**scalar).value;
+ for (int64_t i = 0; i < list.length(); i++) {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, list.GetScalar(i));
+ RETURN_NOT_OK(builder->value_builder()->AppendScalar(*scalar));
+ }
+ } else {
+ RETURN_NOT_OK(builder_->AppendNull());
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ auto* builder = internal::checked_cast<StructBuilder*>(builder_);
+ auto count = n_repeats_ * (scalars_end_ - scalars_begin_);
+ RETURN_NOT_OK(builder->Reserve(count));
+ for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
+ RETURN_NOT_OK(builder->field_builder(field_index)->Reserve(count));
+ }
+ for (int64_t i = 0; i < n_repeats_; i++) {
+ for (const std::shared_ptr<Scalar>* s = scalars_begin_; s != scalars_end_; s++) {
+ const auto& scalar = internal::checked_cast<const StructScalar&>(**s);
+ for (int field_index = 0; field_index < type.num_fields(); ++field_index) {
+ if (!scalar.is_valid || !scalar.value[field_index]) {
+ RETURN_NOT_OK(builder->field_builder(field_index)->AppendNull());
+ } else {
+ RETURN_NOT_OK(builder->field_builder(field_index)
+ ->AppendScalar(*scalar.value[field_index]));
+ }
+ }
+ RETURN_NOT_OK(builder->Append(scalar.is_valid));
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("AppendScalar for type ", type);
+ }
+
+ Status Convert() { return VisitTypeInline(*(*scalars_begin_)->type, this); }
+
+ const std::shared_ptr<Scalar>* scalars_begin_;
+ const std::shared_ptr<Scalar>* scalars_end_;
+ int64_t n_repeats_;
+ ArrayBuilder* builder_;
+};
+} // namespace
+
+Status ArrayBuilder::AppendScalar(const Scalar& scalar) {
+ if (!scalar.type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ std::shared_ptr<Scalar> shared{const_cast<Scalar*>(&scalar), [](Scalar*) {}};
+ return AppendScalarImpl{&shared, &shared + 1, /*n_repeats=*/1, this}.Convert();
+}
+
+Status ArrayBuilder::AppendScalar(const Scalar& scalar, int64_t n_repeats) {
+ if (!scalar.type->Equals(type())) {
+ return Status::Invalid("Cannot append scalar of type ", scalar.type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ std::shared_ptr<Scalar> shared{const_cast<Scalar*>(&scalar), [](Scalar*) {}};
+ return AppendScalarImpl{&shared, &shared + 1, n_repeats, this}.Convert();
+}
+
+Status ArrayBuilder::AppendScalars(const ScalarVector& scalars) {
+ if (scalars.empty()) return Status::OK();
+ const auto ty = type();
+ for (const auto& scalar : scalars) {
+ if (!scalar->type->Equals(ty)) {
+ return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(),
+ " to builder for type ", type()->ToString());
+ }
+ }
+ return AppendScalarImpl{scalars.data(), scalars.data() + scalars.size(),
+ /*n_repeats=*/1, this}
+ .Convert();
+}
+
Status ArrayBuilder::Finish(std::shared_ptr<Array>* out) {
std::shared_ptr<ArrayData> internal_data;
RETURN_NOT_OK(FinishInternal(&internal_data));
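
AppendScalarImpl above does no type checking itself; the AppendScalar wrappers reject scalars whose type differs from the builder's, then visit by type to bulk-reserve and append. A caller sketch with a primitive builder (the value 7 is illustrative):

#include <memory>

#include "arrow/array/builder_primitive.h"
#include "arrow/scalar.h"
#include "arrow/status.h"

arrow::Status AppendRepeated(std::shared_ptr<arrow::Array>* out) {
  arrow::Int32Builder builder;
  arrow::Int32Scalar value(7);
  // Appends four copies of 7; fails with Status::Invalid on a type mismatch.
  ARROW_RETURN_NOT_OK(builder.AppendScalar(value, /*n_repeats=*/4));
  return builder.Finish(out);
}
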
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h
index 7a1ad81998a..905b3c1b491 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_base.h
@@ -29,7 +29,7 @@
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"
-#include "arrow/type_fwd.h"
+#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
@@ -51,7 +51,7 @@ class ARROW_EXPORT ArrayBuilder {
explicit ArrayBuilder(MemoryPool* pool) : pool_(pool), null_bitmap_builder_(pool) {}
virtual ~ArrayBuilder() = default;
- ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);
+ ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);
/// For nested types. Since the objects are owned by this class instance, we
/// skip shared pointers and just return a raw pointer
@@ -98,30 +98,30 @@ class ARROW_EXPORT ArrayBuilder {
/// Reset the builder.
virtual void Reset();
- /// \brief Append a null value to builder
+ /// \brief Append a null value to builder
virtual Status AppendNull() = 0;
- /// \brief Append a number of null values to builder
+ /// \brief Append a number of null values to builder
virtual Status AppendNulls(int64_t length) = 0;
- /// \brief Append a non-null value to builder
- ///
- /// The appended value is an implementation detail, but the corresponding
- /// memory slot is guaranteed to be initialized.
- /// This method is useful when appending a null value to a parent nested type.
- virtual Status AppendEmptyValue() = 0;
-
- /// \brief Append a number of non-null values to builder
- ///
- /// The appended values are an implementation detail, but the corresponding
- /// memory slot is guaranteed to be initialized.
- /// This method is useful when appending null values to a parent nested type.
- virtual Status AppendEmptyValues(int64_t length) = 0;
-
- /// \brief Append a value from a scalar
- Status AppendScalar(const Scalar& scalar);
- Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
- Status AppendScalars(const ScalarVector& scalars);
-
+ /// \brief Append a non-null value to builder
+ ///
+ /// The appended value is an implementation detail, but the corresponding
+ /// memory slot is guaranteed to be initialized.
+ /// This method is useful when appending a null value to a parent nested type.
+ virtual Status AppendEmptyValue() = 0;
+
+ /// \brief Append a number of non-null values to builder
+ ///
+ /// The appended values are an implementation detail, but the corresponding
+ /// memory slot is guaranteed to be initialized.
+ /// This method is useful when appending null values to a parent nested type.
+ virtual Status AppendEmptyValues(int64_t length) = 0;
+
+ /// \brief Append a value from a scalar
+ Status AppendScalar(const Scalar& scalar);
+ Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
+ Status AppendScalars(const ScalarVector& scalars);
+
/// For cases where raw data was memcpy'd into the internal buffers, allows us
/// to advance the length of the builder. It is your responsibility to use
/// this function responsibly.
@@ -253,24 +253,24 @@ class ARROW_EXPORT ArrayBuilder {
ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
};
-/// \brief Construct an empty ArrayBuilder corresponding to the data
-/// type
-/// \param[in] pool the MemoryPool to use for allocations
-/// \param[in] type the data type to create the builder for
-/// \param[out] out the created ArrayBuilder
-ARROW_EXPORT
-Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
- std::unique_ptr<ArrayBuilder>* out);
-
-/// \brief Construct an empty DictionaryBuilder initialized optionally
-/// with a pre-existing dictionary
-/// \param[in] pool the MemoryPool to use for allocations
-/// \param[in] type the dictionary type to create the builder for
-/// \param[in] dictionary the initial dictionary, if any. May be nullptr
-/// \param[out] out the created ArrayBuilder
-ARROW_EXPORT
-Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
- const std::shared_ptr<Array>& dictionary,
- std::unique_ptr<ArrayBuilder>* out);
-
+/// \brief Construct an empty ArrayBuilder corresponding to the data
+/// type
+/// \param[in] pool the MemoryPool to use for allocations
+/// \param[in] type the data type to create the builder for
+/// \param[out] out the created ArrayBuilder
+ARROW_EXPORT
+Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ std::unique_ptr<ArrayBuilder>* out);
+
+/// \brief Construct an empty DictionaryBuilder initialized optionally
+/// with a pre-existing dictionary
+/// \param[in] pool the MemoryPool to use for allocations
+/// \param[in] type the dictionary type to create the builder for
+/// \param[in] dictionary the initial dictionary, if any. May be nullptr
+/// \param[out] out the created ArrayBuilder
+ARROW_EXPORT
+Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ const std::shared_ptr<Array>& dictionary,
+ std::unique_ptr<ArrayBuilder>* out);
+
} // namespace arrow
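
MakeBuilder, declared above, is the factory to reach for when the data type is only known at runtime. A sketch that builds a single-null array of an arbitrary type:

#include <memory>

#include "arrow/array/builder_base.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/type.h"

arrow::Status BuildForType(const std::shared_ptr<arrow::DataType>& type,
                           std::shared_ptr<arrow::Array>* out) {
  std::unique_ptr<arrow::ArrayBuilder> builder;
  ARROW_RETURN_NOT_OK(arrow::MakeBuilder(arrow::default_memory_pool(), type, &builder));
  ARROW_RETURN_NOT_OK(builder->AppendNull());
  return builder->Finish(out);
}
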
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc
index 26d6a7129f6..6822dc89903 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.cc
@@ -73,20 +73,20 @@ Status FixedSizeBinaryBuilder::AppendNulls(int64_t length) {
return Status::OK();
}
-Status FixedSizeBinaryBuilder::AppendEmptyValue() {
- RETURN_NOT_OK(Reserve(1));
- UnsafeAppendToBitmap(true);
- byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
- return Status::OK();
-}
-
-Status FixedSizeBinaryBuilder::AppendEmptyValues(int64_t length) {
- RETURN_NOT_OK(Reserve(length));
- UnsafeAppendToBitmap(length, true);
- byte_builder_.UnsafeAppend(/*num_copies=*/length * byte_width_, 0);
- return Status::OK();
-}
-
+Status FixedSizeBinaryBuilder::AppendEmptyValue() {
+ RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(true);
+ byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
+ return Status::OK();
+}
+
+Status FixedSizeBinaryBuilder::AppendEmptyValues(int64_t length) {
+ RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, true);
+ byte_builder_.UnsafeAppend(/*num_copies=*/length * byte_width_, 0);
+ return Status::OK();
+}
+
void FixedSizeBinaryBuilder::Reset() {
ArrayBuilder::Reset();
byte_builder_.Reset();
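
The restored FixedSizeBinaryBuilder::AppendEmptyValue() writes byte_width_ zero bytes and marks the slot valid; AppendNull() writes the same zeros but marks it null. A sketch contrasting the two (the 4-byte width and contents are illustrative):

#include <memory>

#include "arrow/array/builder_binary.h"
#include "arrow/status.h"
#include "arrow/type.h"

arrow::Status BuildFixed(std::shared_ptr<arrow::Array>* out) {
  arrow::FixedSizeBinaryBuilder builder(arrow::fixed_size_binary(4));
  ARROW_RETURN_NOT_OK(builder.Append("abcd"));      // valid, user bytes
  ARROW_RETURN_NOT_OK(builder.AppendEmptyValue());  // valid, four zero bytes
  ARROW_RETURN_NOT_OK(builder.AppendNull());        // null, four zero bytes
  return builder.Finish(out);
}
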
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h
index 346e90d25a0..62edc69fb8e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h
@@ -61,7 +61,7 @@ class BaseBinaryBuilder : public ArrayBuilder {
ARROW_RETURN_NOT_OK(AppendNextOffset());
// Safety check for UBSAN.
if (ARROW_PREDICT_TRUE(length > 0)) {
- ARROW_RETURN_NOT_OK(ValidateOverflow(length));
+ ARROW_RETURN_NOT_OK(ValidateOverflow(length));
ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
}
@@ -77,23 +77,23 @@ class BaseBinaryBuilder : public ArrayBuilder {
return Append(value.data(), static_cast<offset_type>(value.size()));
}
- /// Extend the last appended value by appending more data at the end
- ///
- /// Unlike Append, this does not create a new offset.
- Status ExtendCurrent(const uint8_t* value, offset_type length) {
- // Safety check for UBSAN.
- if (ARROW_PREDICT_TRUE(length > 0)) {
- ARROW_RETURN_NOT_OK(ValidateOverflow(length));
- ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
- }
- return Status::OK();
- }
-
- Status ExtendCurrent(util::string_view value) {
- return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
- static_cast<offset_type>(value.size()));
- }
-
+ /// Extend the last appended value by appending more data at the end
+ ///
+ /// Unlike Append, this does not create a new offset.
+ Status ExtendCurrent(const uint8_t* value, offset_type length) {
+ // Safety check for UBSAN.
+ if (ARROW_PREDICT_TRUE(length > 0)) {
+ ARROW_RETURN_NOT_OK(ValidateOverflow(length));
+ ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
+ }
+ return Status::OK();
+ }
+
+ Status ExtendCurrent(util::string_view value) {
+ return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
+ static_cast<offset_type>(value.size()));
+ }
+
Status AppendNulls(int64_t length) final {
const int64_t num_bytes = value_data_builder_.length();
ARROW_RETURN_NOT_OK(Reserve(length));
@@ -111,23 +111,23 @@ class BaseBinaryBuilder : public ArrayBuilder {
return Status::OK();
}
- Status AppendEmptyValue() final {
- ARROW_RETURN_NOT_OK(AppendNextOffset());
- ARROW_RETURN_NOT_OK(Reserve(1));
- UnsafeAppendToBitmap(true);
- return Status::OK();
- }
-
- Status AppendEmptyValues(int64_t length) final {
- const int64_t num_bytes = value_data_builder_.length();
- ARROW_RETURN_NOT_OK(Reserve(length));
- for (int64_t i = 0; i < length; ++i) {
- offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
- }
- UnsafeAppendToBitmap(length, true);
- return Status::OK();
- }
-
+ Status AppendEmptyValue() final {
+ ARROW_RETURN_NOT_OK(AppendNextOffset());
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(true);
+ return Status::OK();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ const int64_t num_bytes = value_data_builder_.length();
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ for (int64_t i = 0; i < length; ++i) {
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+ }
+ UnsafeAppendToBitmap(length, true);
+ return Status::OK();
+ }
+
/// \brief Append without checking capacity
///
/// Offsets and data should have been presized using Reserve() and
@@ -150,28 +150,28 @@ class BaseBinaryBuilder : public ArrayBuilder {
UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
}
- /// Like ExtendCurrent, but do not check capacity
- void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
- value_data_builder_.UnsafeAppend(value, length);
- }
-
- void UnsafeExtendCurrent(util::string_view value) {
- UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
- static_cast<offset_type>(value.size()));
- }
-
+ /// Like ExtendCurrent, but do not check capacity
+ void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
+ value_data_builder_.UnsafeAppend(value, length);
+ }
+
+ void UnsafeExtendCurrent(util::string_view value) {
+ UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
+ static_cast<offset_type>(value.size()));
+ }
+
void UnsafeAppendNull() {
const int64_t num_bytes = value_data_builder_.length();
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
UnsafeAppendToBitmap(false);
}
- void UnsafeAppendEmptyValue() {
- const int64_t num_bytes = value_data_builder_.length();
- offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
- UnsafeAppendToBitmap(true);
- }
-
+ void UnsafeAppendEmptyValue() {
+ const int64_t num_bytes = value_data_builder_.length();
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+ UnsafeAppendToBitmap(true);
+ }
+
/// \brief Append a sequence of strings in one shot.
///
/// \param[in] values a vector of strings
@@ -467,14 +467,14 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
return Status::OK();
}
- Status Append(const Buffer& s) {
- ARROW_RETURN_NOT_OK(Reserve(1));
- UnsafeAppend(util::string_view(s));
- return Status::OK();
- }
-
- Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }
-
+ Status Append(const Buffer& s) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppend(util::string_view(s));
+ return Status::OK();
+ }
+
+ Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }
+
template <size_t NBYTES>
Status Append(const std::array<uint8_t, NBYTES>& value) {
ARROW_RETURN_NOT_OK(Reserve(1));
@@ -489,9 +489,9 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
Status AppendNull() final;
Status AppendNulls(int64_t length) final;
- Status AppendEmptyValue() final;
- Status AppendEmptyValues(int64_t length) final;
-
+ Status AppendEmptyValue() final;
+ Status AppendEmptyValues(int64_t length) final;
+
void UnsafeAppend(const uint8_t* value) {
UnsafeAppendToBitmap(true);
if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
@@ -510,10 +510,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
}
- void UnsafeAppend(const Buffer& s) { UnsafeAppend(util::string_view(s)); }
-
- void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }
-
+ void UnsafeAppend(const Buffer& s) { UnsafeAppend(util::string_view(s)); }
+
+ void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }
+
void UnsafeAppendNull() {
UnsafeAppendToBitmap(false);
byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
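
ExtendCurrent(), restored above, appends bytes to the most recent value without opening a new offset, which suits decoders that emit one logical value in several pieces. A sketch on StringBuilder:

#include <memory>

#include "arrow/array/builder_binary.h"
#include "arrow/status.h"

arrow::Status BuildInPieces(std::shared_ptr<arrow::Array>* out) {
  arrow::StringBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Append("hello "));
  ARROW_RETURN_NOT_OK(builder.ExtendCurrent("world"));  // still one slot: "hello world"
  return builder.Finish(out);
}
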
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc
index 34c81f76c6f..bd7615a7309 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.cc
@@ -67,39 +67,39 @@ Status Decimal128Builder::FinishInternal(std::shared_ptr<ArrayData>* out) {
return Status::OK();
}
-// ----------------------------------------------------------------------
-// Decimal256Builder
-
-Decimal256Builder::Decimal256Builder(const std::shared_ptr<DataType>& type,
- MemoryPool* pool)
- : FixedSizeBinaryBuilder(type, pool),
- decimal_type_(internal::checked_pointer_cast<Decimal256Type>(type)) {}
-
-Status Decimal256Builder::Append(const Decimal256& value) {
- RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1));
- UnsafeAppend(value);
- return Status::OK();
-}
-
-void Decimal256Builder::UnsafeAppend(const Decimal256& value) {
- value.ToBytes(GetMutableValue(length()));
- byte_builder_.UnsafeAdvance(32);
- UnsafeAppendToBitmap(true);
-}
-
-void Decimal256Builder::UnsafeAppend(util::string_view value) {
- FixedSizeBinaryBuilder::UnsafeAppend(value);
-}
-
-Status Decimal256Builder::FinishInternal(std::shared_ptr<ArrayData>* out) {
- std::shared_ptr<Buffer> data;
- RETURN_NOT_OK(byte_builder_.Finish(&data));
- std::shared_ptr<Buffer> null_bitmap;
- RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
-
- *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
- capacity_ = length_ = null_count_ = 0;
- return Status::OK();
-}
-
+// ----------------------------------------------------------------------
+// Decimal256Builder
+
+Decimal256Builder::Decimal256Builder(const std::shared_ptr<DataType>& type,
+ MemoryPool* pool)
+ : FixedSizeBinaryBuilder(type, pool),
+ decimal_type_(internal::checked_pointer_cast<Decimal256Type>(type)) {}
+
+Status Decimal256Builder::Append(const Decimal256& value) {
+ RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1));
+ UnsafeAppend(value);
+ return Status::OK();
+}
+
+void Decimal256Builder::UnsafeAppend(const Decimal256& value) {
+ value.ToBytes(GetMutableValue(length()));
+ byte_builder_.UnsafeAdvance(32);
+ UnsafeAppendToBitmap(true);
+}
+
+void Decimal256Builder::UnsafeAppend(util::string_view value) {
+ FixedSizeBinaryBuilder::UnsafeAppend(value);
+}
+
+Status Decimal256Builder::FinishInternal(std::shared_ptr<ArrayData>* out) {
+ std::shared_ptr<Buffer> data;
+ RETURN_NOT_OK(byte_builder_.Finish(&data));
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+
+ *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
+ capacity_ = length_ = null_count_ = 0;
+ return Status::OK();
+}
+
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h
index 7fee4ab4c73..f48392ed001 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_decimal.h
@@ -32,7 +32,7 @@ namespace arrow {
class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
public:
using TypeClass = Decimal128Type;
- using ValueType = Decimal128;
+ using ValueType = Decimal128;
explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
MemoryPool* pool = default_memory_pool());
@@ -59,36 +59,36 @@ class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
std::shared_ptr<Decimal128Type> decimal_type_;
};
-class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
- public:
- using TypeClass = Decimal256Type;
- using ValueType = Decimal256;
-
- explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
- MemoryPool* pool = default_memory_pool());
-
- using FixedSizeBinaryBuilder::Append;
- using FixedSizeBinaryBuilder::AppendValues;
- using FixedSizeBinaryBuilder::Reset;
-
- Status Append(const Decimal256& val);
- void UnsafeAppend(const Decimal256& val);
- void UnsafeAppend(util::string_view val);
-
- Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
-
- /// \cond FALSE
- using ArrayBuilder::Finish;
- /// \endcond
-
- Status Finish(std::shared_ptr<Decimal256Array>* out) { return FinishTyped(out); }
-
- std::shared_ptr<DataType> type() const override { return decimal_type_; }
-
- protected:
- std::shared_ptr<Decimal256Type> decimal_type_;
-};
-
+class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
+ public:
+ using TypeClass = Decimal256Type;
+ using ValueType = Decimal256;
+
+ explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
+ MemoryPool* pool = default_memory_pool());
+
+ using FixedSizeBinaryBuilder::Append;
+ using FixedSizeBinaryBuilder::AppendValues;
+ using FixedSizeBinaryBuilder::Reset;
+
+ Status Append(const Decimal256& val);
+ void UnsafeAppend(const Decimal256& val);
+ void UnsafeAppend(util::string_view val);
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<Decimal256Array>* out) { return FinishTyped(out); }
+
+ std::shared_ptr<DataType> type() const override { return decimal_type_; }
+
+ protected:
+ std::shared_ptr<Decimal256Type> decimal_type_;
+};
+
using DecimalBuilder = Decimal128Builder;
} // namespace arrow
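
A hedged usage sketch of the Decimal256Builder declared above; the precision/scale and the literal value are arbitrary, and Decimal256::FromString is assumed available from the matching util/decimal.h.

    #include <arrow/array/builder_decimal.h>
    #include <arrow/result.h>
    #include <arrow/type.h>
    #include <arrow/util/decimal.h>

    arrow::Status BuildDecimal256(std::shared_ptr<arrow::Decimal256Array>* out) {
      arrow::Decimal256Builder builder(arrow::decimal256(/*precision=*/40, /*scale=*/2));
      ARROW_ASSIGN_OR_RAISE(auto value, arrow::Decimal256::FromString("123456.78"));
      ARROW_RETURN_NOT_OK(builder.Append(value));  // checked: reserves, then UnsafeAppend
      ARROW_RETURN_NOT_OK(builder.AppendNull());
      return builder.Finish(out);                  // typed Finish declared above
    }
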
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc
index 7bbb6b25499..b13f6a2db34 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.cc
@@ -45,7 +45,7 @@ class DictionaryMemoTable::DictionaryMemoTableImpl {
template <typename T>
enable_if_no_memoize<T, Status> Visit(const T&) {
- return Status::NotImplemented("Initialization of ", value_type_->ToString(),
+ return Status::NotImplemented("Initialization of ", value_type_->ToString(),
" memo table is not implemented");
}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h
index d5541db2e7c..eb96482dbf7 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_dict.h
@@ -29,7 +29,7 @@
#include "arrow/array/builder_primitive.h" // IWYU pragma: export
#include "arrow/array/data.h"
#include "arrow/array/util.h"
-#include "arrow/scalar.h"
+#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
@@ -241,20 +241,20 @@ class DictionaryBuilderBase : public ArrayBuilder {
/// \brief Append a decimal (only for Decimal128Type)
template <typename T1 = T>
- enable_if_decimal128<T1, Status> Append(const Decimal128& value) {
+ enable_if_decimal128<T1, Status> Append(const Decimal128& value) {
uint8_t data[16];
value.ToBytes(data);
return Append(data, 16);
}
- /// \brief Append a decimal (only for Decimal256Type)
- template <typename T1 = T>
- enable_if_decimal256<T1, Status> Append(const Decimal256& value) {
- uint8_t data[32];
- value.ToBytes(data);
- return Append(data, 32);
- }
-
+ /// \brief Append a decimal (only for Decimal256Type)
+ template <typename T1 = T>
+ enable_if_decimal256<T1, Status> Append(const Decimal256& value) {
+ uint8_t data[32];
+ value.ToBytes(data);
+ return Append(data, 32);
+ }
+
/// \brief Append a scalar null value
Status AppendNull() final {
length_ += 1;
@@ -270,18 +270,18 @@ class DictionaryBuilderBase : public ArrayBuilder {
return indices_builder_.AppendNulls(length);
}
- Status AppendEmptyValue() final {
- length_ += 1;
-
- return indices_builder_.AppendEmptyValue();
- }
-
- Status AppendEmptyValues(int64_t length) final {
- length_ += length;
-
- return indices_builder_.AppendEmptyValues(length);
- }
-
+ Status AppendEmptyValue() final {
+ length_ += 1;
+
+ return indices_builder_.AppendEmptyValue();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ length_ += length;
+
+ return indices_builder_.AppendEmptyValues(length);
+ }
+
/// \brief Insert values into the dictionary's memo, but do not append any
/// indices. Can be used to initialize a new builder with known dictionary
/// values
@@ -458,18 +458,18 @@ class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
return indices_builder_.AppendNulls(length);
}
- Status AppendEmptyValue() final {
- length_ += 1;
-
- return indices_builder_.AppendEmptyValue();
- }
-
- Status AppendEmptyValues(int64_t length) final {
- length_ += length;
-
- return indices_builder_.AppendEmptyValues(length);
- }
-
+ Status AppendEmptyValue() final {
+ length_ += 1;
+
+ return indices_builder_.AppendEmptyValue();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ length_ += length;
+
+ return indices_builder_.AppendEmptyValues(length);
+ }
+
/// \brief Append a whole dense array to the builder
Status AppendArray(const Array& array) {
#ifndef NDEBUG
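
The AppendNull/AppendEmptyValue overrides above only advance the indices builder; no dictionary entry is memoized for them. A sketch using the StringDictionaryBuilder alias, assumed default-constructible with the default memory pool:

    #include <arrow/array/builder_dict.h>

    arrow::Status BuildDictionary(std::shared_ptr<arrow::Array>* out) {
      arrow::StringDictionaryBuilder builder;
      ARROW_RETURN_NOT_OK(builder.Append("red"));
      ARROW_RETURN_NOT_OK(builder.Append("blue"));
      ARROW_RETURN_NOT_OK(builder.Append("red"));       // memoized: reuses the first index
      ARROW_RETURN_NOT_OK(builder.AppendNull());        // index slot only, no dictionary entry
      ARROW_RETURN_NOT_OK(builder.AppendEmptyValue());  // valid zero index, nothing memoized
      return builder.Finish(out);
    }
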
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc
index b49741d365f..a3bcde0381a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.cc
@@ -123,24 +123,24 @@ Status MapBuilder::AppendNulls(int64_t length) {
return Status::OK();
}
-Status MapBuilder::AppendEmptyValue() {
- DCHECK_EQ(item_builder_->length(), key_builder_->length());
- RETURN_NOT_OK(AdjustStructBuilderLength());
- RETURN_NOT_OK(list_builder_->AppendEmptyValue());
- length_ = list_builder_->length();
- null_count_ = list_builder_->null_count();
- return Status::OK();
-}
-
-Status MapBuilder::AppendEmptyValues(int64_t length) {
- DCHECK_EQ(item_builder_->length(), key_builder_->length());
- RETURN_NOT_OK(AdjustStructBuilderLength());
- RETURN_NOT_OK(list_builder_->AppendEmptyValues(length));
- length_ = list_builder_->length();
- null_count_ = list_builder_->null_count();
- return Status::OK();
-}
-
+Status MapBuilder::AppendEmptyValue() {
+ DCHECK_EQ(item_builder_->length(), key_builder_->length());
+ RETURN_NOT_OK(AdjustStructBuilderLength());
+ RETURN_NOT_OK(list_builder_->AppendEmptyValue());
+ length_ = list_builder_->length();
+ null_count_ = list_builder_->null_count();
+ return Status::OK();
+}
+
+Status MapBuilder::AppendEmptyValues(int64_t length) {
+ DCHECK_EQ(item_builder_->length(), key_builder_->length());
+ RETURN_NOT_OK(AdjustStructBuilderLength());
+ RETURN_NOT_OK(list_builder_->AppendEmptyValues(length));
+ length_ = list_builder_->length();
+ null_count_ = list_builder_->null_count();
+ return Status::OK();
+}
+
Status MapBuilder::AdjustStructBuilderLength() {
// If key/item builders have been appended, adjust struct builder length
// to match. Struct and key are non-nullable, append all valid values.
@@ -213,18 +213,18 @@ Status FixedSizeListBuilder::ValidateOverflow(int64_t new_elements) {
return Status::OK();
}
-Status FixedSizeListBuilder::AppendEmptyValue() {
- RETURN_NOT_OK(Reserve(1));
- UnsafeAppendToBitmap(true);
- return value_builder_->AppendEmptyValues(list_size_);
-}
-
-Status FixedSizeListBuilder::AppendEmptyValues(int64_t length) {
- RETURN_NOT_OK(Reserve(length));
- UnsafeAppendToBitmap(length, true);
- return value_builder_->AppendEmptyValues(list_size_ * length);
-}
-
+Status FixedSizeListBuilder::AppendEmptyValue() {
+ RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(true);
+ return value_builder_->AppendEmptyValues(list_size_);
+}
+
+Status FixedSizeListBuilder::AppendEmptyValues(int64_t length) {
+ RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, true);
+ return value_builder_->AppendEmptyValues(list_size_ * length);
+}
+
Status FixedSizeListBuilder::Resize(int64_t capacity) {
RETURN_NOT_OK(CheckCapacity(capacity));
return ArrayBuilder::Resize(capacity);
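
As restored above, FixedSizeListBuilder::AppendEmptyValues(length) marks length valid list slots and pushes list_size * length empty child values, keeping parent and child lengths consistent. A sketch, assuming the (pool, value_builder, list_size) constructor and Append() behave as declared in builder_nested.h:

    #include <arrow/array/builder_nested.h>
    #include <arrow/array/builder_primitive.h>

    arrow::Status BuildFixedSizeLists(std::shared_ptr<arrow::Array>* out) {
      auto pool = arrow::default_memory_pool();
      auto values = std::make_shared<arrow::Int32Builder>(pool);
      arrow::FixedSizeListBuilder builder(pool, values, /*list_size=*/3);
      ARROW_RETURN_NOT_OK(builder.Append());              // one explicit list slot
      ARROW_RETURN_NOT_OK(values->AppendValues({1, 2, 3}));
      ARROW_RETURN_NOT_OK(builder.AppendEmptyValues(2));  // two valid lists of three zeros
      return builder.Finish(out);
    }
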
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h
index 3acf421ef3e..12b999b786e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_nested.h
@@ -109,19 +109,19 @@ class BaseListBuilder : public ArrayBuilder {
return Status::OK();
}
- Status AppendEmptyValue() final { return Append(true); }
-
- Status AppendEmptyValues(int64_t length) final {
- ARROW_RETURN_NOT_OK(Reserve(length));
- ARROW_RETURN_NOT_OK(ValidateOverflow(0));
- UnsafeAppendToBitmap(length, true);
- const int64_t num_values = value_builder_->length();
- for (int64_t i = 0; i < length; ++i) {
- offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
- }
- return Status::OK();
- }
-
+ Status AppendEmptyValue() final { return Append(true); }
+
+ Status AppendEmptyValues(int64_t length) final {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ ARROW_RETURN_NOT_OK(ValidateOverflow(0));
+ UnsafeAppendToBitmap(length, true);
+ const int64_t num_values = value_builder_->length();
+ for (int64_t i = 0; i < length; ++i) {
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
+ }
+ return Status::OK();
+ }
+
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
ARROW_RETURN_NOT_OK(AppendNextOffset());
@@ -271,10 +271,10 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder {
Status AppendNulls(int64_t length) final;
- Status AppendEmptyValue() final;
-
- Status AppendEmptyValues(int64_t length) final;
-
+ Status AppendEmptyValue() final;
+
+ Status AppendEmptyValues(int64_t length) final;
+
/// \brief Get builder to append keys.
///
/// Append a key with this builder should be followed by appending
@@ -370,10 +370,10 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
Status ValidateOverflow(int64_t new_elements);
- Status AppendEmptyValue() final;
-
- Status AppendEmptyValues(int64_t length) final;
-
+ Status AppendEmptyValue() final;
+
+ Status AppendEmptyValues(int64_t length) final;
+
ArrayBuilder* value_builder() const { return value_builder_.get(); }
std::shared_ptr<DataType> type() const override {
@@ -431,42 +431,42 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder {
return Status::OK();
}
- /// \brief Append a null value. Automatically appends an empty value to each child
+ /// \brief Append a null value. Automatically appends an empty value to each child
/// builder.
Status AppendNull() final {
for (const auto& field : children_) {
- ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
+ ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
}
return Append(false);
}
- /// \brief Append multiple null values. Automatically appends empty values to each
+ /// \brief Append multiple null values. Automatically appends empty values to each
/// child builder.
- Status AppendNulls(int64_t length) final {
- for (const auto& field : children_) {
- ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
- }
- ARROW_RETURN_NOT_OK(Reserve(length));
- UnsafeAppendToBitmap(length, false);
- return Status::OK();
- }
-
- Status AppendEmptyValue() final {
- for (const auto& field : children_) {
- ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
- }
- return Append(true);
- }
-
- Status AppendEmptyValues(int64_t length) final {
- for (const auto& field : children_) {
- ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
- }
- ARROW_RETURN_NOT_OK(Reserve(length));
- UnsafeAppendToBitmap(length, true);
- return Status::OK();
- }
-
+ Status AppendNulls(int64_t length) final {
+ for (const auto& field : children_) {
+ ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
+ }
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, false);
+ return Status::OK();
+ }
+
+ Status AppendEmptyValue() final {
+ for (const auto& field : children_) {
+ ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
+ }
+ return Append(true);
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ for (const auto& field : children_) {
+ ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
+ }
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(length, true);
+ return Status::OK();
+ }
+
void Reset() override;
ArrayBuilder* field_builder(int i) const { return children_[i].get(); }
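
The invariant behind the restored StructBuilder comments: AppendNull and AppendEmptyValue(s) pad every child builder so all children stay the same length as the parent. A sketch; the two-field layout is illustrative and the (type, pool, field_builders) constructor is assumed:

    #include <arrow/array/builder_nested.h>
    #include <arrow/array/builder_primitive.h>
    #include <arrow/type.h>

    arrow::Status BuildStructs(std::shared_ptr<arrow::Array>* out) {
      auto pool = arrow::default_memory_pool();
      auto x = std::make_shared<arrow::Int32Builder>(pool);
      auto y = std::make_shared<arrow::DoubleBuilder>(pool);
      auto type = arrow::struct_({arrow::field("x", arrow::int32()),
                                  arrow::field("y", arrow::float64())});
      arrow::StructBuilder builder(type, pool, {x, y});
      ARROW_RETURN_NOT_OK(x->Append(1));
      ARROW_RETURN_NOT_OK(y->Append(2.5));
      ARROW_RETURN_NOT_OK(builder.Append());      // one valid struct slot
      ARROW_RETURN_NOT_OK(builder.AppendNull());  // children padded with empty values
      return builder.Finish(out);
    }
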
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc
index ef5c4d14f7f..e403c42411d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.cc
@@ -65,8 +65,8 @@ Status BooleanBuilder::Resize(int64_t capacity) {
}
Status BooleanBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
- ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_));
- ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
+ ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_));
+ ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
*out = ArrayData::Make(boolean(), length_, {null_bitmap, data}, null_count_);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h
index 3dd2370cddb..80cfc4061bb 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h
@@ -23,7 +23,7 @@
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
-#include "arrow/result.h"
+#include "arrow/result.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
@@ -47,10 +47,10 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder {
/// \brief Append a single null element
Status AppendNull() final { return AppendNulls(1); }
- Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); }
-
- Status AppendEmptyValue() final { return AppendEmptyValues(1); }
-
+ Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); }
+
+ Status AppendEmptyValue() final { return AppendEmptyValues(1); }
+
Status Append(std::nullptr_t) { return AppendNull(); }
Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
@@ -105,22 +105,22 @@ class NumericBuilder : public ArrayBuilder {
return Status::OK();
}
- /// \brief Append an empty element
- Status AppendEmptyValue() final {
- ARROW_RETURN_NOT_OK(Reserve(1));
- data_builder_.UnsafeAppend(value_type{}); // zero
- UnsafeAppendToBitmap(true);
- return Status::OK();
- }
-
- /// \brief Append several empty elements
- Status AppendEmptyValues(int64_t length) final {
- ARROW_RETURN_NOT_OK(Reserve(length));
- data_builder_.UnsafeAppend(length, value_type{}); // zero
- UnsafeSetNotNull(length);
- return Status::OK();
- }
-
+ /// \brief Append an empty element
+ Status AppendEmptyValue() final {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ data_builder_.UnsafeAppend(value_type{}); // zero
+ UnsafeAppendToBitmap(true);
+ return Status::OK();
+ }
+
+ /// \brief Append several empty elements
+ Status AppendEmptyValues(int64_t length) final {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(length, value_type{}); // zero
+ UnsafeSetNotNull(length);
+ return Status::OK();
+ }
+
value_type GetValue(int64_t index) const { return data_builder_.data()[index]; }
void Reset() override { data_builder_.Reset(); }
@@ -186,9 +186,9 @@ class NumericBuilder : public ArrayBuilder {
}
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
- ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
- null_bitmap_builder_.FinishWithLength(length_));
- ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
+ ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
+ null_bitmap_builder_.FinishWithLength(length_));
+ ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
*out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
capacity_ = length_ = null_count_ = 0;
return Status::OK();
@@ -318,20 +318,20 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
return Status::OK();
}
- Status AppendEmptyValue() final {
- ARROW_RETURN_NOT_OK(Reserve(1));
- data_builder_.UnsafeAppend(false);
- UnsafeSetNotNull(1);
- return Status::OK();
- }
-
- Status AppendEmptyValues(int64_t length) final {
- ARROW_RETURN_NOT_OK(Reserve(length));
- data_builder_.UnsafeAppend(length, false);
- UnsafeSetNotNull(length);
- return Status::OK();
- }
-
+ Status AppendEmptyValue() final {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ data_builder_.UnsafeAppend(false);
+ UnsafeSetNotNull(1);
+ return Status::OK();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ data_builder_.UnsafeAppend(length, false);
+ UnsafeSetNotNull(length);
+ return Status::OK();
+ }
+
/// Scalar append
Status Append(const bool val) {
ARROW_RETURN_NOT_OK(Reserve(1));
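
The distinction restored in builder_primitive.h above: AppendEmptyValue writes a zero-initialized value with the validity bit set, while AppendNull leaves the slot null. A minimal sketch:

    #include <arrow/array/builder_primitive.h>

    arrow::Status BuildInts(std::shared_ptr<arrow::Array>* out) {
      arrow::Int64Builder builder;
      ARROW_RETURN_NOT_OK(builder.Append(42));
      ARROW_RETURN_NOT_OK(builder.AppendEmptyValue());  // stores 0, marked valid
      ARROW_RETURN_NOT_OK(builder.AppendNull());        // no value, marked null
      return builder.Finish(out);                       // yields [42, 0, null]
    }
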
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc
index 0168646cf48..8617cb73fce 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.cc
@@ -65,8 +65,8 @@ BasicUnionBuilder::BasicUnionBuilder(
children_ = children;
type_id_to_children_.resize(union_type.max_type_code() + 1, nullptr);
- DCHECK_LE(
- type_id_to_children_.size() - 1,
+ DCHECK_LE(
+ type_id_to_children_.size() - 1,
static_cast<decltype(type_id_to_children_)::size_type>(UnionType::kMaxTypeCode));
for (size_t i = 0; i < children.size(); ++i) {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h
index 979b3f1effc..060be474fb8 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/builder_union.h
@@ -117,26 +117,26 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
return child_builder->AppendNull();
}
- Status AppendEmptyValue() final {
- const int8_t first_child_code = type_codes_[0];
- ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
- ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
- ARROW_RETURN_NOT_OK(
- offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
- // Append an empty value arbitrarily to the first child
- return child_builder->AppendEmptyValue();
- }
-
- Status AppendEmptyValues(int64_t length) final {
- const int8_t first_child_code = type_codes_[0];
- ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
- ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
- ARROW_RETURN_NOT_OK(
- offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
- // Append just a single empty value to the first child
- return child_builder->AppendEmptyValue();
- }
-
+ Status AppendEmptyValue() final {
+ const int8_t first_child_code = type_codes_[0];
+ ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
+ ARROW_RETURN_NOT_OK(
+ offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
+ // Append an empty value arbitrarily to the first child
+ return child_builder->AppendEmptyValue();
+ }
+
+ Status AppendEmptyValues(int64_t length) final {
+ const int8_t first_child_code = type_codes_[0];
+ ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
+ ARROW_RETURN_NOT_OK(
+ offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
+ // Append just a single empty value to the first child
+ return child_builder->AppendEmptyValue();
+ }
+
/// \brief Append an element to the UnionArray. This must be followed
/// by an append to the appropriate child builder.
///
@@ -179,45 +179,45 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
const std::shared_ptr<DataType>& type)
: BasicUnionBuilder(pool, children, type) {}
- /// \brief Append a null value.
- ///
- /// A null is appended to the first child, empty values to the other children.
+ /// \brief Append a null value.
+ ///
+ /// A null is appended to the first child, empty values to the other children.
Status AppendNull() final {
- const auto first_child_code = type_codes_[0];
- ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
- ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull());
- for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
- ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue());
- }
- return Status::OK();
- }
-
- /// \brief Append multiple null values.
- ///
- /// Nulls are appended to the first child, empty values to the other children.
- Status AppendNulls(int64_t length) final {
- const auto first_child_code = type_codes_[0];
- ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
- ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length));
- for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
- ARROW_RETURN_NOT_OK(
- type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length));
- }
- return Status::OK();
- }
-
- Status AppendEmptyValue() final {
+ const auto first_child_code = type_codes_[0];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
+ ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull());
+ for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
+ ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue());
+ }
+ return Status::OK();
+ }
+
+ /// \brief Append multiple null values.
+ ///
+ /// Nulls are appended to the first child, empty values to the other children.
+ Status AppendNulls(int64_t length) final {
+ const auto first_child_code = type_codes_[0];
+ ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
+ ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length));
+ for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
+ ARROW_RETURN_NOT_OK(
+ type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length));
+ }
+ return Status::OK();
+ }
+
+ Status AppendEmptyValue() final {
ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0]));
for (int8_t code : type_codes_) {
- ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue());
+ ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue());
}
return Status::OK();
}
- Status AppendEmptyValues(int64_t length) final {
+ Status AppendEmptyValues(int64_t length) final {
ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0]));
for (int8_t code : type_codes_) {
- ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length));
+ ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length));
}
return Status::OK();
}
@@ -228,7 +228,7 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
/// \param[in] next_type type_id of the child to which the next value will be appended.
///
/// The corresponding child builder must be appended to independently after this method
- /// is called, and all other child builders must have a null or empty value appended.
+ /// is called, and all other child builders must have a null or empty value appended.
Status Append(int8_t next_type) { return types_builder_.Append(next_type); }
};
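
Putting the sparse-union contract above together: after Append(type_code), exactly one child receives a value and every other child needs a null or empty value so all children stay as long as the union. A sketch; AppendChild returning the new type code is assumed from BasicUnionBuilder:

    #include <arrow/array/builder_primitive.h>
    #include <arrow/array/builder_union.h>

    arrow::Status BuildSparseUnion(std::shared_ptr<arrow::Array>* out) {
      auto pool = arrow::default_memory_pool();
      arrow::SparseUnionBuilder builder(pool);
      auto ints = std::make_shared<arrow::Int32Builder>(pool);
      auto doubles = std::make_shared<arrow::DoubleBuilder>(pool);
      const int8_t int_code = builder.AppendChild(ints, "i");
      builder.AppendChild(doubles, "d");
      ARROW_RETURN_NOT_OK(builder.Append(int_code));     // next value goes to "i"
      ARROW_RETURN_NOT_OK(ints->Append(7));
      ARROW_RETURN_NOT_OK(doubles->AppendEmptyValue());  // keep the other child aligned
      ARROW_RETURN_NOT_OK(builder.AppendNull());         // null in first child, empty elsewhere
      return builder.Finish(out);
    }
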
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc
index be9b5c3258c..32478783394 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/concatenate.cc
@@ -36,7 +36,7 @@
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
-#include "arrow/util/int_util.h"
+#include "arrow/util/int_util.h"
#include "arrow/util/int_util_internal.h"
#include "arrow/util/logging.h"
#include "arrow/visitor_inline.h"
@@ -45,7 +45,7 @@ namespace arrow {
using internal::SafeSignedAdd;
-namespace {
+namespace {
/// offset, length pair for representing a Range of a buffer or array
struct Range {
int64_t offset = -1, length = 0;
@@ -68,8 +68,8 @@ struct Bitmap {
};
// Allocate a buffer and concatenate bitmaps into it.
-Status ConcatenateBitmaps(const std::vector<Bitmap>& bitmaps, MemoryPool* pool,
- std::shared_ptr<Buffer>* out) {
+Status ConcatenateBitmaps(const std::vector<Bitmap>& bitmaps, MemoryPool* pool,
+ std::shared_ptr<Buffer>* out) {
int64_t out_length = 0;
for (const auto& bitmap : bitmaps) {
if (internal::AddWithOverflow(out_length, bitmap.range.length, &out_length)) {
@@ -96,15 +96,15 @@ Status ConcatenateBitmaps(const std::vector<Bitmap>& bitmaps, MemoryPool* pool,
// Write offsets in src into dst, adjusting them such that first_offset
// will be the first offset written.
template <typename Offset>
-Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offset* dst,
- Range* values_range);
+Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offset* dst,
+ Range* values_range);
// Concatenate buffers holding offsets into a single buffer of offsets,
// also computing the ranges of values spanned by each buffer of offsets.
template <typename Offset>
-Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool,
- std::shared_ptr<Buffer>* out,
- std::vector<Range>* values_ranges) {
+Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool,
+ std::shared_ptr<Buffer>* out,
+ std::vector<Range>* values_ranges) {
values_ranges->resize(buffers.size());
// allocate output buffer
@@ -132,8 +132,8 @@ Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool,
}
template <typename Offset>
-Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offset* dst,
- Range* values_range) {
+Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offset* dst,
+ Range* values_range) {
if (src->size() == 0) {
// It's allowed to have an empty offsets buffer for a 0-length array
// (see Array::Validate)
@@ -167,7 +167,7 @@ Status PutOffsets(const std::shared_ptr<Buffer>& src, Offset first_offset, Offse
class ConcatenateImpl {
public:
- ConcatenateImpl(const ArrayDataVector& in, MemoryPool* pool)
+ ConcatenateImpl(const ArrayDataVector& in, MemoryPool* pool)
: in_(std::move(in)), pool_(pool), out_(std::make_shared<ArrayData>()) {
out_->type = in[0]->type;
for (size_t i = 0; i < in_.size(); ++i) {
@@ -202,7 +202,7 @@ class ConcatenateImpl {
}
Status Visit(const FixedWidthType& fixed) {
- // Handles numbers, decimal128, decimal256, fixed_size_binary
+ // Handles numbers, decimal128, decimal256, fixed_size_binary
ARROW_ASSIGN_OR_RAISE(auto buffers, Buffers(1, fixed));
return ConcatenateBuffers(buffers, pool_).Value(&out_->buffers[1]);
}
@@ -243,8 +243,8 @@ class ConcatenateImpl {
return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]);
}
- Status Visit(const FixedSizeListType& fixed_size_list) {
- ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fixed_size_list.list_size()));
+ Status Visit(const FixedSizeListType& fixed_size_list) {
+ ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fixed_size_list.list_size()));
return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]);
}
@@ -256,47 +256,47 @@ class ConcatenateImpl {
return Status::OK();
}
- Result<BufferVector> UnifyDictionaries(const DictionaryType& d) {
- BufferVector new_index_lookup;
- ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(d.value_type()));
- new_index_lookup.resize(in_.size());
- for (size_t i = 0; i < in_.size(); i++) {
- auto item = in_[i];
- auto dictionary_array = MakeArray(item->dictionary);
- RETURN_NOT_OK(unifier->Unify(*dictionary_array, &new_index_lookup[i]));
- }
- std::shared_ptr<Array> out_dictionary;
- RETURN_NOT_OK(unifier->GetResultWithIndexType(d.index_type(), &out_dictionary));
- out_->dictionary = out_dictionary->data();
- return new_index_lookup;
- }
-
- // Transpose and concatenate dictionary indices
- Result<std::shared_ptr<Buffer>> ConcatenateDictionaryIndices(
- const DataType& index_type, const BufferVector& index_transpositions) {
- const auto index_width =
- internal::checked_cast<const FixedWidthType&>(index_type).bit_width() / 8;
- int64_t out_length = 0;
- for (const auto& data : in_) {
- out_length += data->length;
- }
- ARROW_ASSIGN_OR_RAISE(auto out, AllocateBuffer(out_length * index_width, pool_));
- uint8_t* out_data = out->mutable_data();
- for (size_t i = 0; i < in_.size(); i++) {
- const auto& data = in_[i];
- auto transpose_map =
- reinterpret_cast<const int32_t*>(index_transpositions[i]->data());
- RETURN_NOT_OK(internal::TransposeInts(index_type, index_type,
- /*src=*/data->GetValues<uint8_t>(1, 0),
- /*dest=*/out_data,
- /*src_offset=*/data->offset,
- /*dest_offset=*/0, /*length=*/data->length,
- transpose_map));
- out_data += data->length * index_width;
- }
- return std::move(out);
- }
-
+ Result<BufferVector> UnifyDictionaries(const DictionaryType& d) {
+ BufferVector new_index_lookup;
+ ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(d.value_type()));
+ new_index_lookup.resize(in_.size());
+ for (size_t i = 0; i < in_.size(); i++) {
+ auto item = in_[i];
+ auto dictionary_array = MakeArray(item->dictionary);
+ RETURN_NOT_OK(unifier->Unify(*dictionary_array, &new_index_lookup[i]));
+ }
+ std::shared_ptr<Array> out_dictionary;
+ RETURN_NOT_OK(unifier->GetResultWithIndexType(d.index_type(), &out_dictionary));
+ out_->dictionary = out_dictionary->data();
+ return new_index_lookup;
+ }
+
+ // Transpose and concatenate dictionary indices
+ Result<std::shared_ptr<Buffer>> ConcatenateDictionaryIndices(
+ const DataType& index_type, const BufferVector& index_transpositions) {
+ const auto index_width =
+ internal::checked_cast<const FixedWidthType&>(index_type).bit_width() / 8;
+ int64_t out_length = 0;
+ for (const auto& data : in_) {
+ out_length += data->length;
+ }
+ ARROW_ASSIGN_OR_RAISE(auto out, AllocateBuffer(out_length * index_width, pool_));
+ uint8_t* out_data = out->mutable_data();
+ for (size_t i = 0; i < in_.size(); i++) {
+ const auto& data = in_[i];
+ auto transpose_map =
+ reinterpret_cast<const int32_t*>(index_transpositions[i]->data());
+ RETURN_NOT_OK(internal::TransposeInts(index_type, index_type,
+ /*src=*/data->GetValues<uint8_t>(1, 0),
+ /*dest=*/out_data,
+ /*src_offset=*/data->offset,
+ /*dest_offset=*/0, /*length=*/data->length,
+ transpose_map));
+ out_data += data->length * index_width;
+ }
+ return std::move(out);
+ }
+
Status Visit(const DictionaryType& d) {
auto fixed = internal::checked_cast<const FixedWidthType*>(d.index_type().get());
@@ -311,15 +311,15 @@ class ConcatenateImpl {
}
}
- ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, *fixed));
+ ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, *fixed));
if (dictionaries_same) {
out_->dictionary = in_[0]->dictionary;
return ConcatenateBuffers(index_buffers, pool_).Value(&out_->buffers[1]);
} else {
- ARROW_ASSIGN_OR_RAISE(auto index_lookup, UnifyDictionaries(d));
- ARROW_ASSIGN_OR_RAISE(out_->buffers[1],
- ConcatenateDictionaryIndices(*fixed, index_lookup));
- return Status::OK();
+ ARROW_ASSIGN_OR_RAISE(auto index_lookup, UnifyDictionaries(d));
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[1],
+ ConcatenateDictionaryIndices(*fixed, index_lookup));
+ return Status::OK();
}
}
@@ -344,7 +344,7 @@ class ConcatenateImpl {
Result<BufferVector> Buffers(size_t index) {
BufferVector buffers;
buffers.reserve(in_.size());
- for (const auto& array_data : in_) {
+ for (const auto& array_data : in_) {
const auto& buffer = array_data->buffers[index];
if (buffer != nullptr) {
ARROW_ASSIGN_OR_RAISE(
@@ -386,7 +386,7 @@ class ConcatenateImpl {
Result<BufferVector> Buffers(size_t index, int byte_width) {
BufferVector buffers;
buffers.reserve(in_.size());
- for (const auto& array_data : in_) {
+ for (const auto& array_data : in_) {
const auto& buffer = array_data->buffers[index];
if (buffer != nullptr) {
ARROW_ASSIGN_OR_RAISE(auto sliced_buffer,
@@ -421,8 +421,8 @@ class ConcatenateImpl {
// Gather the index-th child_data of each input into a vector.
// Elements are sliced with that input's offset and length.
- Result<ArrayDataVector> ChildData(size_t index) {
- ArrayDataVector child_data(in_.size());
+ Result<ArrayDataVector> ChildData(size_t index) {
+ ArrayDataVector child_data(in_.size());
for (size_t i = 0; i < in_.size(); ++i) {
ARROW_ASSIGN_OR_RAISE(child_data[i], in_[i]->child_data[index]->SliceSafe(
in_[i]->offset, in_[i]->length));
@@ -431,22 +431,22 @@ class ConcatenateImpl {
}
// Gather the index-th child_data of each input into a vector.
- // Elements are sliced with that input's offset and length multiplied by multiplier.
- Result<ArrayDataVector> ChildData(size_t index, size_t multiplier) {
- ArrayDataVector child_data(in_.size());
- for (size_t i = 0; i < in_.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(
- child_data[i], in_[i]->child_data[index]->SliceSafe(
- in_[i]->offset * multiplier, in_[i]->length * multiplier));
- }
- return child_data;
- }
-
- // Gather the index-th child_data of each input into a vector.
+ // Elements are sliced with that input's offset and length multiplied by multiplier.
+ Result<ArrayDataVector> ChildData(size_t index, size_t multiplier) {
+ ArrayDataVector child_data(in_.size());
+ for (size_t i = 0; i < in_.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(
+ child_data[i], in_[i]->child_data[index]->SliceSafe(
+ in_[i]->offset * multiplier, in_[i]->length * multiplier));
+ }
+ return child_data;
+ }
+
+ // Gather the index-th child_data of each input into a vector.
// Elements are sliced with the explicitly passed ranges.
- Result<ArrayDataVector> ChildData(size_t index, const std::vector<Range>& ranges) {
+ Result<ArrayDataVector> ChildData(size_t index, const std::vector<Range>& ranges) {
DCHECK_EQ(in_.size(), ranges.size());
- ArrayDataVector child_data(in_.size());
+ ArrayDataVector child_data(in_.size());
for (size_t i = 0; i < in_.size(); ++i) {
ARROW_ASSIGN_OR_RAISE(child_data[i], in_[i]->child_data[index]->SliceSafe(
ranges[i].offset, ranges[i].length));
@@ -454,20 +454,20 @@ class ConcatenateImpl {
return child_data;
}
- const ArrayDataVector& in_;
+ const ArrayDataVector& in_;
MemoryPool* pool_;
std::shared_ptr<ArrayData> out_;
};
-} // namespace
-
+} // namespace
+
Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays, MemoryPool* pool) {
if (arrays.size() == 0) {
return Status::Invalid("Must pass at least one array");
}
// gather ArrayData of input arrays
- ArrayDataVector data(arrays.size());
+ ArrayDataVector data(arrays.size());
for (size_t i = 0; i < arrays.size(); ++i) {
if (!arrays[i]->type()->Equals(*arrays[0]->type())) {
return Status::Invalid("arrays to be concatenated must be identically typed, but ",
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc
index be30ff7d685..5a214473972 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/data.cc
@@ -37,13 +37,13 @@ namespace arrow {
using internal::CountSetBits;
-static inline void AdjustNonNullable(Type::type type_id, int64_t length,
+static inline void AdjustNonNullable(Type::type type_id, int64_t length,
std::vector<std::shared_ptr<Buffer>>* buffers,
int64_t* null_count) {
- if (type_id == Type::NA) {
- *null_count = length;
- (*buffers)[0] = nullptr;
- } else if (internal::HasValidityBitmap(type_id)) {
+ if (type_id == Type::NA) {
+ *null_count = length;
+ (*buffers)[0] = nullptr;
+ } else if (internal::HasValidityBitmap(type_id)) {
if (*null_count == 0) {
// In case there are no nulls, don't keep an allocated null bitmap around
(*buffers)[0] = nullptr;
@@ -56,39 +56,39 @@ static inline void AdjustNonNullable(Type::type type_id, int64_t length,
}
}
-std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
+std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
int64_t null_count, int64_t offset) {
- AdjustNonNullable(type->id(), length, &buffers, &null_count);
- return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
- null_count, offset);
+ AdjustNonNullable(type->id(), length, &buffers, &null_count);
+ return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
+ null_count, offset);
}
std::shared_ptr<ArrayData> ArrayData::Make(
- std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data, int64_t null_count,
int64_t offset) {
- AdjustNonNullable(type->id(), length, &buffers, &null_count);
- return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
+ AdjustNonNullable(type->id(), length, &buffers, &null_count);
+ return std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
std::move(child_data), null_count, offset);
}
std::shared_ptr<ArrayData> ArrayData::Make(
- std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
std::shared_ptr<ArrayData> dictionary, int64_t null_count, int64_t offset) {
- AdjustNonNullable(type->id(), length, &buffers, &null_count);
- auto data = std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
+ AdjustNonNullable(type->id(), length, &buffers, &null_count);
+ auto data = std::make_shared<ArrayData>(std::move(type), length, std::move(buffers),
std::move(child_data), null_count, offset);
data->dictionary = std::move(dictionary);
return data;
}
-std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
- int64_t null_count, int64_t offset) {
- return std::make_shared<ArrayData>(std::move(type), length, null_count, offset);
+std::shared_ptr<ArrayData> ArrayData::Make(std::shared_ptr<DataType> type, int64_t length,
+ int64_t null_count, int64_t offset) {
+ return std::make_shared<ArrayData>(std::move(type), length, null_count, offset);
}
std::shared_ptr<ArrayData> ArrayData::Slice(int64_t off, int64_t len) const {
@@ -213,7 +213,7 @@ struct ViewDataImpl {
Status MakeDataView(const std::shared_ptr<Field>& out_field,
std::shared_ptr<ArrayData>* out) {
- const auto& out_type = out_field->type();
+ const auto& out_type = out_field->type();
const auto out_layout = out_type->layout();
AdjustInputPointer();
@@ -249,11 +249,11 @@ struct ViewDataImpl {
} else {
// No null bitmap in input, append no-nulls bitmap
out_buffers.push_back(nullptr);
- if (out_type->id() == Type::NA) {
- out_null_count = out_length;
- } else {
- out_null_count = 0;
- }
+ if (out_type->id() == Type::NA) {
+ out_null_count = out_length;
+ } else {
+ out_null_count = 0;
+ }
}
// Process other buffers in output layout
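
The effect of AdjustNonNullable above, in sketch form: when null_count is 0, Make() drops the validity buffer entirely (and for NullType it forces null_count == length).

    #include <arrow/array/data.h>
    #include <arrow/buffer.h>
    #include <arrow/type.h>

    std::shared_ptr<arrow::ArrayData> MakeInt32Data(std::shared_ptr<arrow::Buffer> values,
                                                    int64_t length) {
      // null_count == 0, so the validity slot ends up nullptr after Make().
      return arrow::ArrayData::Make(arrow::int32(), length,
                                    {/*validity=*/nullptr, std::move(values)},
                                    /*null_count=*/0);
    }
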
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/data.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/data.h
index db166ffaa27..418d09def6b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/data.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/data.h
@@ -71,47 +71,47 @@ constexpr int64_t kUnknownNullCount = -1;
/// input array and replace them with newly-allocated data, changing the output
/// data type as well.
struct ARROW_EXPORT ArrayData {
- ArrayData() = default;
+ ArrayData() = default;
- ArrayData(std::shared_ptr<DataType> type, int64_t length,
+ ArrayData(std::shared_ptr<DataType> type, int64_t length,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
- : type(std::move(type)), length(length), null_count(null_count), offset(offset) {}
+ : type(std::move(type)), length(length), null_count(null_count), offset(offset) {}
- ArrayData(std::shared_ptr<DataType> type, int64_t length,
+ ArrayData(std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
- : ArrayData(std::move(type), length, null_count, offset) {
+ : ArrayData(std::move(type), length, null_count, offset) {
this->buffers = std::move(buffers);
}
- ArrayData(std::shared_ptr<DataType> type, int64_t length,
+ ArrayData(std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
- : ArrayData(std::move(type), length, null_count, offset) {
+ : ArrayData(std::move(type), length, null_count, offset) {
this->buffers = std::move(buffers);
this->child_data = std::move(child_data);
}
- static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
+ static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
int64_t null_count = kUnknownNullCount,
int64_t offset = 0);
static std::shared_ptr<ArrayData> Make(
- std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
static std::shared_ptr<ArrayData> Make(
- std::shared_ptr<DataType> type, int64_t length,
+ std::shared_ptr<DataType> type, int64_t length,
std::vector<std::shared_ptr<Buffer>> buffers,
std::vector<std::shared_ptr<ArrayData>> child_data,
std::shared_ptr<ArrayData> dictionary, int64_t null_count = kUnknownNullCount,
int64_t offset = 0);
- static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
+ static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
int64_t null_count = kUnknownNullCount,
int64_t offset = 0);
@@ -230,11 +230,11 @@ struct ARROW_EXPORT ArrayData {
}
std::shared_ptr<DataType> type;
- int64_t length = 0;
- mutable std::atomic<int64_t> null_count{0};
+ int64_t length = 0;
+ mutable std::atomic<int64_t> null_count{0};
// The logical start point into the physical buffers (in values, not bytes).
// Note that, for child data, this must be *added* to the child data's own offset.
- int64_t offset = 0;
+ int64_t offset = 0;
std::vector<std::shared_ptr<Buffer>> buffers;
std::vector<std::shared_ptr<ArrayData>> child_data;
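
A small sketch of the offset semantics restored above: Slice() only adjusts offset/length and shares the underlying buffers, so no data is copied.

    #include <arrow/array/data.h>

    std::shared_ptr<arrow::ArrayData> DropFirst(const std::shared_ptr<arrow::ArrayData>& d) {
      return d->Slice(/*off=*/1, /*len=*/d->length - 1);
    }
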
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc
index 41d7242a44f..ed26ecff4e0 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/util.cc
@@ -41,7 +41,7 @@
#include "arrow/util/bit_util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/visitor_inline.h"
@@ -52,7 +52,7 @@ using internal::checked_cast;
// ----------------------------------------------------------------------
// Loading from ArrayData
-namespace {
+namespace {
class ArrayDataWrapper {
public:
@@ -75,209 +75,209 @@ class ArrayDataWrapper {
std::shared_ptr<Array>* out_;
};
-class ArrayDataEndianSwapper {
- public:
- ArrayDataEndianSwapper(const std::shared_ptr<ArrayData>& data, int64_t length)
- : data_(data), length_(length) {
- out_ = data->Copy();
- }
-
- Status SwapType(const DataType& type) {
- RETURN_NOT_OK(VisitTypeInline(type, this));
- RETURN_NOT_OK(SwapChildren(type.fields()));
- if (internal::HasValidityBitmap(type.id())) {
- // Copy null bitmap
- out_->buffers[0] = data_->buffers[0];
- }
- return Status::OK();
- }
-
- Status SwapChildren(const FieldVector& child_fields) {
- for (size_t i = 0; i < child_fields.size(); i++) {
- ARROW_ASSIGN_OR_RAISE(out_->child_data[i],
- internal::SwapEndianArrayData(data_->child_data[i]));
- }
- return Status::OK();
- }
-
- template <typename T>
- Result<std::shared_ptr<Buffer>> ByteSwapBuffer(
- const std::shared_ptr<Buffer>& in_buffer) {
- if (sizeof(T) == 1) {
- // if the element size is 1 byte, no swap is needed; we can reuse the original buffer
- return in_buffer;
- }
- auto in_data = reinterpret_cast<const T*>(in_buffer->data());
- ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(in_buffer->size()));
- auto out_data = reinterpret_cast<T*>(out_buffer->mutable_data());
- int64_t length = in_buffer->size() / sizeof(T);
- for (int64_t i = 0; i < length; i++) {
- out_data[i] = BitUtil::ByteSwap(in_data[i]);
- }
- return std::move(out_buffer);
- }
-
- template <typename VALUE_TYPE>
- Status SwapOffsets(int index) {
- if (data_->buffers[index] == nullptr || data_->buffers[index]->size() == 0) {
- out_->buffers[index] = data_->buffers[index];
- return Status::OK();
- }
- // Except for unions, the offsets buffer has one more element than data->length
- ARROW_ASSIGN_OR_RAISE(out_->buffers[index],
- ByteSwapBuffer<VALUE_TYPE>(data_->buffers[index]));
- return Status::OK();
- }
-
- template <typename T>
- enable_if_t<std::is_base_of<FixedWidthType, T>::value &&
- !std::is_base_of<FixedSizeBinaryType, T>::value &&
- !std::is_base_of<DictionaryType, T>::value,
- Status>
- Visit(const T& type) {
- using value_type = typename T::c_type;
- ARROW_ASSIGN_OR_RAISE(out_->buffers[1],
- ByteSwapBuffer<value_type>(data_->buffers[1]));
- return Status::OK();
- }
-
- Status Visit(const Decimal128Type& type) {
- auto data = reinterpret_cast<const uint64_t*>(data_->buffers[1]->data());
- ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
- auto new_data = reinterpret_cast<uint64_t*>(new_buffer->mutable_data());
- int64_t length = length_;
- length = data_->buffers[1]->size() / (sizeof(uint64_t) * 2);
- for (int64_t i = 0; i < length; i++) {
- uint64_t tmp;
- auto idx = i * 2;
-#if ARROW_LITTLE_ENDIAN
- tmp = BitUtil::FromBigEndian(data[idx]);
- new_data[idx] = BitUtil::FromBigEndian(data[idx + 1]);
- new_data[idx + 1] = tmp;
-#else
- tmp = BitUtil::FromLittleEndian(data[idx]);
- new_data[idx] = BitUtil::FromLittleEndian(data[idx + 1]);
- new_data[idx + 1] = tmp;
-#endif
- }
- out_->buffers[1] = std::move(new_buffer);
- return Status::OK();
- }
-
- Status Visit(const Decimal256Type& type) {
- auto data = reinterpret_cast<const uint64_t*>(data_->buffers[1]->data());
- ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
- auto new_data = reinterpret_cast<uint64_t*>(new_buffer->mutable_data());
- int64_t length = length_;
- length = data_->buffers[1]->size() / (sizeof(uint64_t) * 4);
- for (int64_t i = 0; i < length; i++) {
- uint64_t tmp0, tmp1, tmp2;
- auto idx = i * 4;
-#if ARROW_LITTLE_ENDIAN
- tmp0 = BitUtil::FromBigEndian(data[idx]);
- tmp1 = BitUtil::FromBigEndian(data[idx + 1]);
- tmp2 = BitUtil::FromBigEndian(data[idx + 2]);
- new_data[idx] = BitUtil::FromBigEndian(data[idx + 3]);
- new_data[idx + 1] = tmp2;
- new_data[idx + 2] = tmp1;
- new_data[idx + 3] = tmp0;
-#else
- tmp0 = BitUtil::FromLittleEndian(data[idx]);
- tmp1 = BitUtil::FromLittleEndian(data[idx + 1]);
- tmp2 = BitUtil::FromLittleEndian(data[idx + 2]);
- new_data[idx] = BitUtil::FromLittleEndian(data[idx + 3]);
- new_data[idx + 1] = tmp2;
- new_data[idx + 2] = tmp1;
- new_data[idx + 3] = tmp0;
-#endif
- }
- out_->buffers[1] = std::move(new_buffer);
- return Status::OK();
- }
-
- Status Visit(const DayTimeIntervalType& type) {
- ARROW_ASSIGN_OR_RAISE(out_->buffers[1], ByteSwapBuffer<uint32_t>(data_->buffers[1]));
- return Status::OK();
- }
-
- Status Visit(const NullType& type) { return Status::OK(); }
- Status Visit(const BooleanType& type) { return Status::OK(); }
- Status Visit(const Int8Type& type) { return Status::OK(); }
- Status Visit(const UInt8Type& type) { return Status::OK(); }
- Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); }
- Status Visit(const FixedSizeListType& type) { return Status::OK(); }
- Status Visit(const StructType& type) { return Status::OK(); }
- Status Visit(const UnionType& type) {
- out_->buffers[1] = data_->buffers[1];
- if (type.mode() == UnionMode::DENSE) {
- RETURN_NOT_OK(SwapOffsets<int32_t>(2));
- }
- return Status::OK();
- }
-
- template <typename T>
- enable_if_t<std::is_same<BinaryType, T>::value || std::is_same<StringType, T>::value,
- Status>
- Visit(const T& type) {
- RETURN_NOT_OK(SwapOffsets<int32_t>(1));
- out_->buffers[2] = data_->buffers[2];
- return Status::OK();
- }
-
- template <typename T>
- enable_if_t<std::is_same<LargeBinaryType, T>::value ||
- std::is_same<LargeStringType, T>::value,
- Status>
- Visit(const T& type) {
- RETURN_NOT_OK(SwapOffsets<int64_t>(1));
- out_->buffers[2] = data_->buffers[2];
- return Status::OK();
- }
-
- Status Visit(const ListType& type) {
- RETURN_NOT_OK(SwapOffsets<int32_t>(1));
- return Status::OK();
- }
- Status Visit(const LargeListType& type) {
- RETURN_NOT_OK(SwapOffsets<int64_t>(1));
- return Status::OK();
- }
-
- Status Visit(const DictionaryType& type) {
- // dictionary was already swapped in ReadDictionary() in ipc/reader.cc
- RETURN_NOT_OK(SwapType(*type.index_type()));
- return Status::OK();
- }
-
- Status Visit(const ExtensionType& type) {
- RETURN_NOT_OK(SwapType(*type.storage_type()));
- return Status::OK();
- }
-
- const std::shared_ptr<ArrayData>& data_;
- int64_t length_;
- std::shared_ptr<ArrayData> out_;
-};
-
-} // namespace
-
-namespace internal {
-
-Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
- const std::shared_ptr<ArrayData>& data) {
- if (data->offset != 0) {
- return Status::Invalid("Unsupported data format: data.offset != 0");
- }
- ArrayDataEndianSwapper swapper(data, data->length);
- RETURN_NOT_OK(swapper.SwapType(*data->type));
- return std::move(swapper.out_);
-}
-
+class ArrayDataEndianSwapper {
+ public:
+ ArrayDataEndianSwapper(const std::shared_ptr<ArrayData>& data, int64_t length)
+ : data_(data), length_(length) {
+ out_ = data->Copy();
+ }
+
+ Status SwapType(const DataType& type) {
+ RETURN_NOT_OK(VisitTypeInline(type, this));
+ RETURN_NOT_OK(SwapChildren(type.fields()));
+ if (internal::HasValidityBitmap(type.id())) {
+ // Copy null bitmap
+ out_->buffers[0] = data_->buffers[0];
+ }
+ return Status::OK();
+ }
+
+ Status SwapChildren(const FieldVector& child_fields) {
+ for (size_t i = 0; i < child_fields.size(); i++) {
+ ARROW_ASSIGN_OR_RAISE(out_->child_data[i],
+ internal::SwapEndianArrayData(data_->child_data[i]));
+ }
+ return Status::OK();
+ }
+
+ template <typename T>
+ Result<std::shared_ptr<Buffer>> ByteSwapBuffer(
+ const std::shared_ptr<Buffer>& in_buffer) {
+ if (sizeof(T) == 1) {
+ // if the element size is 1 byte, no swap is needed; we can reuse the original buffer
+ return in_buffer;
+ }
+ auto in_data = reinterpret_cast<const T*>(in_buffer->data());
+ ARROW_ASSIGN_OR_RAISE(auto out_buffer, AllocateBuffer(in_buffer->size()));
+ auto out_data = reinterpret_cast<T*>(out_buffer->mutable_data());
+ int64_t length = in_buffer->size() / sizeof(T);
+ for (int64_t i = 0; i < length; i++) {
+ out_data[i] = BitUtil::ByteSwap(in_data[i]);
+ }
+ return std::move(out_buffer);
+ }
+
+ template <typename VALUE_TYPE>
+ Status SwapOffsets(int index) {
+ if (data_->buffers[index] == nullptr || data_->buffers[index]->size() == 0) {
+ out_->buffers[index] = data_->buffers[index];
+ return Status::OK();
+ }
+ // Except for unions, the offsets buffer has one more element than data->length
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[index],
+ ByteSwapBuffer<VALUE_TYPE>(data_->buffers[index]));
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<std::is_base_of<FixedWidthType, T>::value &&
+ !std::is_base_of<FixedSizeBinaryType, T>::value &&
+ !std::is_base_of<DictionaryType, T>::value,
+ Status>
+ Visit(const T& type) {
+ using value_type = typename T::c_type;
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[1],
+ ByteSwapBuffer<value_type>(data_->buffers[1]));
+ return Status::OK();
+ }
+
+ Status Visit(const Decimal128Type& type) {
+ auto data = reinterpret_cast<const uint64_t*>(data_->buffers[1]->data());
+ ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
+ auto new_data = reinterpret_cast<uint64_t*>(new_buffer->mutable_data());
+ int64_t length = length_;
+ length = data_->buffers[1]->size() / (sizeof(uint64_t) * 2);
+ for (int64_t i = 0; i < length; i++) {
+ uint64_t tmp;
+ auto idx = i * 2;
+#if ARROW_LITTLE_ENDIAN
+ tmp = BitUtil::FromBigEndian(data[idx]);
+ new_data[idx] = BitUtil::FromBigEndian(data[idx + 1]);
+ new_data[idx + 1] = tmp;
+#else
+ tmp = BitUtil::FromLittleEndian(data[idx]);
+ new_data[idx] = BitUtil::FromLittleEndian(data[idx + 1]);
+ new_data[idx + 1] = tmp;
+#endif
+ }
+ out_->buffers[1] = std::move(new_buffer);
+ return Status::OK();
+ }
+
+ Status Visit(const Decimal256Type& type) {
+ auto data = reinterpret_cast<const uint64_t*>(data_->buffers[1]->data());
+ ARROW_ASSIGN_OR_RAISE(auto new_buffer, AllocateBuffer(data_->buffers[1]->size()));
+ auto new_data = reinterpret_cast<uint64_t*>(new_buffer->mutable_data());
+ int64_t length = length_;
+ length = data_->buffers[1]->size() / (sizeof(uint64_t) * 4);
+ for (int64_t i = 0; i < length; i++) {
+ uint64_t tmp0, tmp1, tmp2;
+ auto idx = i * 4;
+#if ARROW_LITTLE_ENDIAN
+ tmp0 = BitUtil::FromBigEndian(data[idx]);
+ tmp1 = BitUtil::FromBigEndian(data[idx + 1]);
+ tmp2 = BitUtil::FromBigEndian(data[idx + 2]);
+ new_data[idx] = BitUtil::FromBigEndian(data[idx + 3]);
+ new_data[idx + 1] = tmp2;
+ new_data[idx + 2] = tmp1;
+ new_data[idx + 3] = tmp0;
+#else
+ tmp0 = BitUtil::FromLittleEndian(data[idx]);
+ tmp1 = BitUtil::FromLittleEndian(data[idx + 1]);
+ tmp2 = BitUtil::FromLittleEndian(data[idx + 2]);
+ new_data[idx] = BitUtil::FromLittleEndian(data[idx + 3]);
+ new_data[idx + 1] = tmp2;
+ new_data[idx + 2] = tmp1;
+ new_data[idx + 3] = tmp0;
+#endif
+ }
+ out_->buffers[1] = std::move(new_buffer);
+ return Status::OK();
+ }
+
+ Status Visit(const DayTimeIntervalType& type) {
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[1], ByteSwapBuffer<uint32_t>(data_->buffers[1]));
+ return Status::OK();
+ }
+
+ Status Visit(const NullType& type) { return Status::OK(); }
+ Status Visit(const BooleanType& type) { return Status::OK(); }
+ Status Visit(const Int8Type& type) { return Status::OK(); }
+ Status Visit(const UInt8Type& type) { return Status::OK(); }
+ Status Visit(const FixedSizeBinaryType& type) { return Status::OK(); }
+ Status Visit(const FixedSizeListType& type) { return Status::OK(); }
+ Status Visit(const StructType& type) { return Status::OK(); }
+ Status Visit(const UnionType& type) {
+ out_->buffers[1] = data_->buffers[1];
+ if (type.mode() == UnionMode::DENSE) {
+ RETURN_NOT_OK(SwapOffsets<int32_t>(2));
+ }
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<std::is_same<BinaryType, T>::value || std::is_same<StringType, T>::value,
+ Status>
+ Visit(const T& type) {
+ RETURN_NOT_OK(SwapOffsets<int32_t>(1));
+ out_->buffers[2] = data_->buffers[2];
+ return Status::OK();
+ }
+
+ template <typename T>
+ enable_if_t<std::is_same<LargeBinaryType, T>::value ||
+ std::is_same<LargeStringType, T>::value,
+ Status>
+ Visit(const T& type) {
+ RETURN_NOT_OK(SwapOffsets<int64_t>(1));
+ out_->buffers[2] = data_->buffers[2];
+ return Status::OK();
+ }
+
+ Status Visit(const ListType& type) {
+ RETURN_NOT_OK(SwapOffsets<int32_t>(1));
+ return Status::OK();
+ }
+ Status Visit(const LargeListType& type) {
+ RETURN_NOT_OK(SwapOffsets<int64_t>(1));
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryType& type) {
+    // The dictionary was already swapped in ReadDictionary() in ipc/reader.cc
+ RETURN_NOT_OK(SwapType(*type.index_type()));
+ return Status::OK();
+ }
+
+ Status Visit(const ExtensionType& type) {
+ RETURN_NOT_OK(SwapType(*type.storage_type()));
+ return Status::OK();
+ }
+
+ const std::shared_ptr<ArrayData>& data_;
+ int64_t length_;
+ std::shared_ptr<ArrayData> out_;
+};
+
+} // namespace
+
+namespace internal {
+
+Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
+ const std::shared_ptr<ArrayData>& data) {
+ if (data->offset != 0) {
+ return Status::Invalid("Unsupported data format: data.offset != 0");
+ }
+ ArrayDataEndianSwapper swapper(data, data->length);
+ RETURN_NOT_OK(swapper.SwapType(*data->type));
+ return std::move(swapper.out_);
+}
+
} // namespace internal
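
As a point of reference, a minimal usage sketch of the swapper restored above (illustrative only: SwapToOtherEndian is a hypothetical helper name, and the call assumes Arrow's internal headers are available):

    #include <arrow/array.h>
    #include <arrow/array/util.h>  // arrow::internal::SwapEndianArrayData, MakeArray
    #include <arrow/result.h>
    #include <memory>

    // Swap the byte order of every element, producing a new Array.
    // SwapEndianArrayData requires data->offset == 0, reuses the validity
    // bitmap as-is, and does not swap dictionaries (those are handled
    // separately, e.g. by ReadDictionary() in ipc/reader.cc).
    arrow::Result<std::shared_ptr<arrow::Array>> SwapToOtherEndian(
        const std::shared_ptr<arrow::Array>& array) {
      ARROW_ASSIGN_OR_RAISE(auto swapped,
                            arrow::internal::SwapEndianArrayData(array->data()));
      return arrow::MakeArray(swapped);
    }
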
std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data) {
std::shared_ptr<Array> out;
- ArrayDataWrapper wrapper_visitor(data, &out);
+ ArrayDataWrapper wrapper_visitor(data, &out);
DCHECK_OK(VisitTypeInline(*data->type, &wrapper_visitor));
DCHECK(out);
return out;
@@ -286,7 +286,7 @@ std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data) {
// ----------------------------------------------------------------------
// Misc APIs
-namespace {
+namespace {
// get the maximum buffer length required, then allocate a single zeroed buffer
// to use anywhere a buffer is required
@@ -496,9 +496,9 @@ class RepeatedArrayFactory {
return out_;
}
- Status Visit(const NullType& type) {
- DCHECK(false); // already forwarded to MakeArrayOfNull
- return Status::OK();
+ Status Visit(const NullType& type) {
+ DCHECK(false); // already forwarded to MakeArrayOfNull
+ return Status::OK();
}
Status Visit(const BooleanType&) {
@@ -510,29 +510,29 @@ class RepeatedArrayFactory {
}
template <typename T>
- enable_if_t<is_number_type<T>::value || is_temporal_type<T>::value, Status> Visit(
- const T&) {
+ enable_if_t<is_number_type<T>::value || is_temporal_type<T>::value, Status> Visit(
+ const T&) {
auto value = checked_cast<const typename TypeTraits<T>::ScalarType&>(scalar_).value;
return FinishFixedWidth(&value, sizeof(value));
}
- Status Visit(const FixedSizeBinaryType& type) {
- auto value = checked_cast<const FixedSizeBinaryScalar&>(scalar_).value;
- return FinishFixedWidth(value->data(), type.byte_width());
- }
-
- template <typename T>
- enable_if_decimal<T, Status> Visit(const T&) {
- using ScalarType = typename TypeTraits<T>::ScalarType;
- auto value = checked_cast<const ScalarType&>(scalar_).value.ToBytes();
+ Status Visit(const FixedSizeBinaryType& type) {
+ auto value = checked_cast<const FixedSizeBinaryScalar&>(scalar_).value;
+ return FinishFixedWidth(value->data(), type.byte_width());
+ }
+
+ template <typename T>
+ enable_if_decimal<T, Status> Visit(const T&) {
+ using ScalarType = typename TypeTraits<T>::ScalarType;
+ auto value = checked_cast<const ScalarType&>(scalar_).value.ToBytes();
+ return FinishFixedWidth(value.data(), value.size());
+ }
+
+ Status Visit(const Decimal256Type&) {
+ auto value = checked_cast<const Decimal256Scalar&>(scalar_).value.ToBytes();
return FinishFixedWidth(value.data(), value.size());
}
- Status Visit(const Decimal256Type&) {
- auto value = checked_cast<const Decimal256Scalar&>(scalar_).value.ToBytes();
- return FinishFixedWidth(value.data(), value.size());
- }
-
template <typename T>
enable_if_base_binary<T, Status> Visit(const T&) {
std::shared_ptr<Buffer> value =
@@ -613,18 +613,18 @@ class RepeatedArrayFactory {
return Status::OK();
}
- Status Visit(const ExtensionType& type) {
- return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
- }
-
- Status Visit(const DenseUnionType& type) {
- return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
- }
-
- Status Visit(const SparseUnionType& type) {
- return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
- }
-
+ Status Visit(const ExtensionType& type) {
+ return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
+ }
+
+ Status Visit(const DenseUnionType& type) {
+ return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
+ }
+
+ Status Visit(const SparseUnionType& type) {
+ return Status::NotImplemented("construction from scalar of type ", *scalar_.type);
+ }
+
template <typename OffsetType>
Status CreateOffsetsBuffer(OffsetType value_length, std::shared_ptr<Buffer>* out) {
TypedBufferBuilder<OffsetType> builder(pool_);
@@ -660,11 +660,11 @@ class RepeatedArrayFactory {
std::shared_ptr<Array> out_;
};
-} // namespace
+} // namespace
Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>& type,
int64_t length, MemoryPool* pool) {
- ARROW_ASSIGN_OR_RAISE(auto data, NullArrayFactory(pool, type, length).Create());
+ ARROW_ASSIGN_OR_RAISE(auto data, NullArrayFactory(pool, type, length).Create());
return MakeArray(data);
}
@@ -673,7 +673,7 @@ Result<std::shared_ptr<Array>> MakeArrayFromScalar(const Scalar& scalar, int64_t
if (!scalar.is_valid) {
return MakeArrayOfNull(scalar.type, length, pool);
}
- return RepeatedArrayFactory(pool, scalar, length).Create();
+ return RepeatedArrayFactory(pool, scalar, length).Create();
}
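
A short sketch of how the two factories compose (illustrative; Demo is a hypothetical name):

    #include <arrow/array/util.h>  // MakeArrayOfNull, MakeArrayFromScalar
    #include <arrow/result.h>
    #include <arrow/scalar.h>
    #include <arrow/type.h>

    arrow::Status Demo() {
      // An int64 array of length 5 whose slots are all null.
      ARROW_ASSIGN_OR_RAISE(auto nulls, arrow::MakeArrayOfNull(arrow::int64(), 5));
      // An int64 array of length 5 repeating the value 42; an invalid (null)
      // scalar would have been forwarded to MakeArrayOfNull instead, as above.
      arrow::Int64Scalar value(42);
      ARROW_ASSIGN_OR_RAISE(auto repeated, arrow::MakeArrayFromScalar(value, 5));
      return arrow::Status::OK();
    }
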
namespace internal {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/util.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/util.h
index ac71c6d8570..3ef4e08828f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/util.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/util.h
@@ -56,17 +56,17 @@ Result<std::shared_ptr<Array>> MakeArrayFromScalar(
namespace internal {
-/// \brief Swap endian of each element in a generic ArrayData
-///
-/// As dictionaries are often shared between different arrays, dictionaries
-/// are not swapped by this function and should be handled separately.
-///
-/// \param[in] data the array contents
-/// \return the resulting ArrayData whose elements were swapped
-ARROW_EXPORT
-Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
- const std::shared_ptr<ArrayData>& data);
-
+/// \brief Swap endian of each element in a generic ArrayData
+///
+/// As dictionaries are often shared between different arrays, dictionaries
+/// are not swapped by this function and should be handled separately.
+///
+/// \param[in] data the array contents
+/// \return the resulting ArrayData whose elements were swapped
+ARROW_EXPORT
+Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
+ const std::shared_ptr<ArrayData>& data);
+
/// Given a number of ArrayVectors, treat each ArrayVector as the
/// chunks of a chunked array. Then rechunk each ArrayVector such that
/// all ArrayVectors are chunked identically. It is mandatory that
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc b/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc
index 5adc18bd495..5cc3bacf282 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.cc
@@ -23,12 +23,12 @@
#include "arrow/extension_type.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
-#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_block_counter.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/int_util_internal.h"
#include "arrow/util/logging.h"
-#include "arrow/util/utf8.h"
+#include "arrow/util/utf8.h"
#include "arrow/visitor_inline.h"
namespace arrow {
@@ -39,172 +39,172 @@ namespace internal {
namespace {
-struct ValidateArrayImpl {
- const ArrayData& data;
+struct ValidateArrayImpl {
+ const ArrayData& data;
- Status Validate() { return ValidateWithType(*data.type); }
-
- Status ValidateWithType(const DataType& type) { return VisitTypeInline(type, this); }
-
- Status Visit(const NullType&) {
- if (data.null_count != data.length) {
- return Status::Invalid("Null array null_count unequal to its length");
+ Status Validate() { return ValidateWithType(*data.type); }
+
+ Status ValidateWithType(const DataType& type) { return VisitTypeInline(type, this); }
+
+ Status Visit(const NullType&) {
+ if (data.null_count != data.length) {
+ return Status::Invalid("Null array null_count unequal to its length");
}
return Status::OK();
}
- Status Visit(const FixedWidthType&) {
- if (data.length > 0) {
- if (!IsBufferValid(1)) {
- return Status::Invalid("Missing values buffer in non-empty array");
- }
+ Status Visit(const FixedWidthType&) {
+ if (data.length > 0) {
+ if (!IsBufferValid(1)) {
+ return Status::Invalid("Missing values buffer in non-empty array");
+ }
}
return Status::OK();
}
- Status Visit(const StringType& type) { return ValidateBinaryLike(type); }
+ Status Visit(const StringType& type) { return ValidateBinaryLike(type); }
- Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
+ Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
- Status Visit(const LargeStringType& type) { return ValidateBinaryLike(type); }
+ Status Visit(const LargeStringType& type) { return ValidateBinaryLike(type); }
- Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
+ Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
- Status Visit(const ListType& type) { return ValidateListLike(type); }
+ Status Visit(const ListType& type) { return ValidateListLike(type); }
- Status Visit(const LargeListType& type) { return ValidateListLike(type); }
+ Status Visit(const LargeListType& type) { return ValidateListLike(type); }
- Status Visit(const MapType& type) { return ValidateListLike(type); }
-
- Status Visit(const FixedSizeListType& type) {
- const ArrayData& values = *data.child_data[0];
- const int64_t list_size = type.list_size();
- if (list_size < 0) {
- return Status::Invalid("Fixed size list has negative list size");
+ Status Visit(const MapType& type) { return ValidateListLike(type); }
+
+ Status Visit(const FixedSizeListType& type) {
+ const ArrayData& values = *data.child_data[0];
+ const int64_t list_size = type.list_size();
+ if (list_size < 0) {
+ return Status::Invalid("Fixed size list has negative list size");
}
int64_t expected_values_length = -1;
- if (MultiplyWithOverflow(data.length, list_size, &expected_values_length) ||
- values.length != expected_values_length) {
- return Status::Invalid("Values length (", values.length,
- ") is not equal to the length (", data.length,
- ") multiplied by the value size (", list_size, ")");
- }
-
- const Status child_valid = ValidateArray(values);
- if (!child_valid.ok()) {
- return Status::Invalid("Fixed size list child array invalid: ",
- child_valid.ToString());
- }
-
+ if (MultiplyWithOverflow(data.length, list_size, &expected_values_length) ||
+ values.length != expected_values_length) {
+ return Status::Invalid("Values length (", values.length,
+ ") is not equal to the length (", data.length,
+ ") multiplied by the value size (", list_size, ")");
+ }
+
+ const Status child_valid = ValidateArray(values);
+ if (!child_valid.ok()) {
+ return Status::Invalid("Fixed size list child array invalid: ",
+ child_valid.ToString());
+ }
+
return Status::OK();
}
- Status Visit(const StructType& type) {
- for (int i = 0; i < type.num_fields(); ++i) {
- const auto& field_data = *data.child_data[i];
+ Status Visit(const StructType& type) {
+ for (int i = 0; i < type.num_fields(); ++i) {
+ const auto& field_data = *data.child_data[i];
- // Validate child first, to catch nonsensical length / offset etc.
- const Status field_valid = ValidateArray(field_data);
- if (!field_valid.ok()) {
+ // Validate child first, to catch nonsensical length / offset etc.
+ const Status field_valid = ValidateArray(field_data);
+ if (!field_valid.ok()) {
return Status::Invalid("Struct child array #", i,
- " invalid: ", field_valid.ToString());
+ " invalid: ", field_valid.ToString());
}
- if (field_data.length < data.length + data.offset) {
+ if (field_data.length < data.length + data.offset) {
return Status::Invalid("Struct child array #", i,
- " has length smaller than expected for struct array (",
- field_data.length, " < ", data.length + data.offset, ")");
+ " has length smaller than expected for struct array (",
+ field_data.length, " < ", data.length + data.offset, ")");
}
- const auto& field_type = type.field(i)->type();
- if (!field_data.type->Equals(*field_type)) {
- return Status::Invalid("Struct child array #", i, " does not match type field: ",
- field_data.type->ToString(), " vs ",
- field_type->ToString());
+ const auto& field_type = type.field(i)->type();
+ if (!field_data.type->Equals(*field_type)) {
+ return Status::Invalid("Struct child array #", i, " does not match type field: ",
+ field_data.type->ToString(), " vs ",
+ field_type->ToString());
}
}
return Status::OK();
}
- Status Visit(const UnionType& type) {
- for (int i = 0; i < type.num_fields(); ++i) {
- const auto& field_data = *data.child_data[i];
-
- // Validate child first, to catch nonsensical length / offset etc.
- const Status field_valid = ValidateArray(field_data);
- if (!field_valid.ok()) {
- return Status::Invalid("Union child array #", i,
- " invalid: ", field_valid.ToString());
+ Status Visit(const UnionType& type) {
+ for (int i = 0; i < type.num_fields(); ++i) {
+ const auto& field_data = *data.child_data[i];
+
+ // Validate child first, to catch nonsensical length / offset etc.
+ const Status field_valid = ValidateArray(field_data);
+ if (!field_valid.ok()) {
+ return Status::Invalid("Union child array #", i,
+ " invalid: ", field_valid.ToString());
}
- if (type.mode() == UnionMode::SPARSE &&
- field_data.length < data.length + data.offset) {
+ if (type.mode() == UnionMode::SPARSE &&
+ field_data.length < data.length + data.offset) {
return Status::Invalid("Sparse union child array #", i,
- " has length smaller than expected for union array (",
- field_data.length, " < ", data.length + data.offset, ")");
+ " has length smaller than expected for union array (",
+ field_data.length, " < ", data.length + data.offset, ")");
}
- const auto& field_type = type.field(i)->type();
- if (!field_data.type->Equals(*field_type)) {
- return Status::Invalid("Union child array #", i, " does not match type field: ",
- field_data.type->ToString(), " vs ",
- field_type->ToString());
+ const auto& field_type = type.field(i)->type();
+ if (!field_data.type->Equals(*field_type)) {
+ return Status::Invalid("Union child array #", i, " does not match type field: ",
+ field_data.type->ToString(), " vs ",
+ field_type->ToString());
}
}
return Status::OK();
}
- Status Visit(const DictionaryType& type) {
- Type::type index_type_id = type.index_type()->id();
+ Status Visit(const DictionaryType& type) {
+ Type::type index_type_id = type.index_type()->id();
if (!is_integer(index_type_id)) {
return Status::Invalid("Dictionary indices must be integer type");
}
- if (!data.dictionary) {
+ if (!data.dictionary) {
return Status::Invalid("Dictionary values must be non-null");
}
- const Status dict_valid = ValidateArray(*data.dictionary);
+ const Status dict_valid = ValidateArray(*data.dictionary);
if (!dict_valid.ok()) {
return Status::Invalid("Dictionary array invalid: ", dict_valid.ToString());
}
- // Visit indices
- return ValidateWithType(*type.index_type());
+ // Visit indices
+ return ValidateWithType(*type.index_type());
+ }
+
+ Status Visit(const ExtensionType& type) {
+ // Visit storage
+ return ValidateWithType(*type.storage_type());
}
- Status Visit(const ExtensionType& type) {
- // Visit storage
- return ValidateWithType(*type.storage_type());
- }
+ private:
+ bool IsBufferValid(int index) { return IsBufferValid(data, index); }
- private:
- bool IsBufferValid(int index) { return IsBufferValid(data, index); }
-
- static bool IsBufferValid(const ArrayData& data, int index) {
- return data.buffers[index] != nullptr && data.buffers[index]->address() != 0;
+ static bool IsBufferValid(const ArrayData& data, int index) {
+ return data.buffers[index] != nullptr && data.buffers[index]->address() != 0;
}
- template <typename BinaryType>
- Status ValidateBinaryLike(const BinaryType& type) {
- if (!IsBufferValid(2)) {
- return Status::Invalid("Value data buffer is null");
+ template <typename BinaryType>
+ Status ValidateBinaryLike(const BinaryType& type) {
+ if (!IsBufferValid(2)) {
+ return Status::Invalid("Value data buffer is null");
}
- // First validate offsets, to make sure the accesses below are valid
- RETURN_NOT_OK(ValidateOffsets(type));
+ // First validate offsets, to make sure the accesses below are valid
+ RETURN_NOT_OK(ValidateOffsets(type));
+
+ if (data.length > 0 && data.buffers[1]->is_cpu()) {
+ using offset_type = typename BinaryType::offset_type;
- if (data.length > 0 && data.buffers[1]->is_cpu()) {
- using offset_type = typename BinaryType::offset_type;
-
- const auto offsets = data.GetValues<offset_type>(1);
- const Buffer& values = *data.buffers[2];
-
- const auto first_offset = offsets[0];
- const auto last_offset = offsets[data.length];
+ const auto offsets = data.GetValues<offset_type>(1);
+ const Buffer& values = *data.buffers[2];
+
+ const auto first_offset = offsets[0];
+ const auto last_offset = offsets[data.length];
// This early test avoids undefined behaviour when computing `data_extent`
if (first_offset < 0 || last_offset < 0) {
return Status::Invalid("Negative offsets in binary array");
}
const auto data_extent = last_offset - first_offset;
- const auto values_length = values.size();
+ const auto values_length = values.size();
if (values_length < data_extent) {
return Status::Invalid("Length spanned by binary offsets (", data_extent,
") larger than values array (size ", values_length, ")");
@@ -221,27 +221,27 @@ struct ValidateArrayImpl {
return Status::OK();
}
- template <typename ListType>
- Status ValidateListLike(const ListType& type) {
+ template <typename ListType>
+ Status ValidateListLike(const ListType& type) {
// First validate offsets, to make sure the accesses below are valid
- RETURN_NOT_OK(ValidateOffsets(type));
+ RETURN_NOT_OK(ValidateOffsets(type));
+
+ const ArrayData& values = *data.child_data[0];
- const ArrayData& values = *data.child_data[0];
-
// An empty list array can have 0 offsets
- if (data.length > 0 && data.buffers[1]->is_cpu()) {
- using offset_type = typename ListType::offset_type;
-
- const auto offsets = data.GetValues<offset_type>(1);
-
- const auto first_offset = offsets[0];
- const auto last_offset = offsets[data.length];
+ if (data.length > 0 && data.buffers[1]->is_cpu()) {
+ using offset_type = typename ListType::offset_type;
+
+ const auto offsets = data.GetValues<offset_type>(1);
+
+ const auto first_offset = offsets[0];
+ const auto last_offset = offsets[data.length];
// This early test avoids undefined behaviour when computing `data_extent`
if (first_offset < 0 || last_offset < 0) {
return Status::Invalid("Negative offsets in list array");
}
const auto data_extent = last_offset - first_offset;
- const auto values_length = values.length;
+ const auto values_length = values.length;
if (values_length < data_extent) {
return Status::Invalid("Length spanned by list offsets (", data_extent,
") larger than values array (length ", values_length, ")");
@@ -256,32 +256,32 @@ struct ValidateArrayImpl {
}
}
- const Status child_valid = ValidateArray(values);
+ const Status child_valid = ValidateArray(values);
if (!child_valid.ok()) {
return Status::Invalid("List child array invalid: ", child_valid.ToString());
}
return Status::OK();
}
- template <typename TypeClass>
- Status ValidateOffsets(const TypeClass& type) {
- using offset_type = typename TypeClass::offset_type;
+ template <typename TypeClass>
+ Status ValidateOffsets(const TypeClass& type) {
+ using offset_type = typename TypeClass::offset_type;
- const Buffer* offsets = data.buffers[1].get();
- if (offsets == nullptr) {
- // For length 0, an empty offsets buffer seems accepted as a special case
- // (ARROW-544)
- if (data.length > 0) {
- return Status::Invalid("Non-empty array but offsets are null");
+ const Buffer* offsets = data.buffers[1].get();
+ if (offsets == nullptr) {
+ // For length 0, an empty offsets buffer seems accepted as a special case
+ // (ARROW-544)
+ if (data.length > 0) {
+ return Status::Invalid("Non-empty array but offsets are null");
}
return Status::OK();
}
// An empty list array can have 0 offsets
- auto required_offsets = (data.length > 0) ? data.length + data.offset + 1 : 0;
- if (offsets->size() / static_cast<int32_t>(sizeof(offset_type)) < required_offsets) {
- return Status::Invalid("Offsets buffer size (bytes): ", offsets->size(),
- " isn't large enough for length: ", data.length);
+ auto required_offsets = (data.length > 0) ? data.length + data.offset + 1 : 0;
+ if (offsets->size() / static_cast<int32_t>(sizeof(offset_type)) < required_offsets) {
+ return Status::Invalid("Offsets buffer size (bytes): ", offsets->size(),
+ " isn't large enough for length: ", data.length);
}
return Status::OK();
@@ -291,12 +291,12 @@ struct ValidateArrayImpl {
} // namespace
ARROW_EXPORT
-Status ValidateArray(const ArrayData& data) {
- // First check the data layout conforms to the spec
- const DataType& type = *data.type;
+Status ValidateArray(const ArrayData& data) {
+ // First check the data layout conforms to the spec
+ const DataType& type = *data.type;
const auto layout = type.layout();
- if (data.length < 0) {
+ if (data.length < 0) {
return Status::Invalid("Array length is negative");
}
@@ -306,14 +306,14 @@ Status ValidateArray(const ArrayData& data) {
"of type ",
type.ToString(), ", got ", data.buffers.size());
}
-
+
// This check is required to avoid addition overflow below
int64_t length_plus_offset = -1;
- if (AddWithOverflow(data.length, data.offset, &length_plus_offset)) {
+ if (AddWithOverflow(data.length, data.offset, &length_plus_offset)) {
return Status::Invalid("Array of type ", type.ToString(),
" has impossibly large length and offset");
}
-
+
for (int i = 0; i < static_cast<int>(data.buffers.size()); ++i) {
const auto& buffer = data.buffers[i];
const auto& spec = layout.buffers[i];
@@ -340,7 +340,7 @@ Status ValidateArray(const ArrayData& data) {
}
if (buffer->size() < min_buffer_size) {
return Status::Invalid("Buffer #", i, " too small in array of type ",
- type.ToString(), " and length ", data.length,
+ type.ToString(), " and length ", data.length,
": expected at least ", min_buffer_size, " byte(s), got ",
buffer->size());
}
@@ -352,12 +352,12 @@ Status ValidateArray(const ArrayData& data) {
// Check null_count() *after* validating the buffer sizes, to avoid
// reading out of bounds.
- if (data.null_count > data.length) {
+ if (data.null_count > data.length) {
return Status::Invalid("Null count exceeds array length");
}
- if (data.null_count < 0 && data.null_count != kUnknownNullCount) {
- return Status::Invalid("Negative null count");
- }
+ if (data.null_count < 0 && data.null_count != kUnknownNullCount) {
+ return Status::Invalid("Negative null count");
+ }
if (type.id() != Type::EXTENSION) {
if (data.child_data.size() != static_cast<size_t>(type.num_fields())) {
@@ -376,142 +376,142 @@ Status ValidateArray(const ArrayData& data) {
type.ToString());
}
- ValidateArrayImpl validator{data};
- return validator.Validate();
+ ValidateArrayImpl validator{data};
+ return validator.Validate();
}
-ARROW_EXPORT
-Status ValidateArray(const Array& array) { return ValidateArray(*array.data()); }
-
+ARROW_EXPORT
+Status ValidateArray(const Array& array) { return ValidateArray(*array.data()); }
+
///////////////////////////////////////////////////////////////////////////
-// ValidateArrayFull: expensive validation checks
+// ValidateArrayFull: expensive validation checks
namespace {
-struct UTF8DataValidator {
- const ArrayData& data;
+struct UTF8DataValidator {
+ const ArrayData& data;
- Status Visit(const DataType&) {
+ Status Visit(const DataType&) {
// Default, should be unreachable
return Status::NotImplemented("");
}
- template <typename StringType>
- enable_if_string<StringType, Status> Visit(const StringType&) {
- util::InitializeUTF8();
-
- int64_t i = 0;
- return VisitArrayDataInline<StringType>(
- data,
- [&](util::string_view v) {
- if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) {
- return Status::Invalid("Invalid UTF8 sequence at string index ", i);
- }
- ++i;
- return Status::OK();
- },
- [&]() {
- ++i;
- return Status::OK();
- });
+ template <typename StringType>
+ enable_if_string<StringType, Status> Visit(const StringType&) {
+ util::InitializeUTF8();
+
+ int64_t i = 0;
+ return VisitArrayDataInline<StringType>(
+ data,
+ [&](util::string_view v) {
+ if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) {
+ return Status::Invalid("Invalid UTF8 sequence at string index ", i);
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ });
}
};
-struct BoundsChecker {
- const ArrayData& data;
- int64_t min_value;
- int64_t max_value;
-
- Status Visit(const DataType&) {
- // Default, should be unreachable
- return Status::NotImplemented("");
- }
-
- template <typename IntegerType>
- enable_if_integer<IntegerType, Status> Visit(const IntegerType&) {
- using c_type = typename IntegerType::c_type;
-
- int64_t i = 0;
- return VisitArrayDataInline<IntegerType>(
- data,
- [&](c_type value) {
- const auto v = static_cast<int64_t>(value);
- if (ARROW_PREDICT_FALSE(v < min_value || v > max_value)) {
- return Status::Invalid("Value at position ", i, " out of bounds: ", v,
- " (should be in [", min_value, ", ", max_value, "])");
- }
- ++i;
- return Status::OK();
- },
- [&]() {
- ++i;
- return Status::OK();
- });
- }
-};
-
-struct ValidateArrayFullImpl {
- const ArrayData& data;
-
- Status Validate() { return ValidateWithType(*data.type); }
-
- Status ValidateWithType(const DataType& type) { return VisitTypeInline(type, this); }
-
- Status Visit(const NullType& type) { return Status::OK(); }
-
- Status Visit(const FixedWidthType& type) { return Status::OK(); }
-
- Status Visit(const StringType& type) {
- RETURN_NOT_OK(ValidateBinaryLike(type));
- return ValidateUTF8(data);
- }
-
- Status Visit(const LargeStringType& type) {
- RETURN_NOT_OK(ValidateBinaryLike(type));
- return ValidateUTF8(data);
- }
-
- Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
-
- Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
-
- Status Visit(const ListType& type) { return ValidateListLike(type); }
-
- Status Visit(const LargeListType& type) { return ValidateListLike(type); }
-
- Status Visit(const MapType& type) { return ValidateListLike(type); }
-
- Status Visit(const FixedSizeListType& type) {
- const ArrayData& child = *data.child_data[0];
- const Status child_valid = ValidateArrayFull(child);
- if (!child_valid.ok()) {
- return Status::Invalid("Fixed size list child array invalid: ",
- child_valid.ToString());
- }
- return Status::OK();
- }
-
- Status Visit(const StructType& type) {
- // Validate children
- for (int64_t i = 0; i < type.num_fields(); ++i) {
- const ArrayData& field = *data.child_data[i];
- const Status field_valid = ValidateArrayFull(field);
- if (!field_valid.ok()) {
- return Status::Invalid("Struct child array #", i,
- " invalid: ", field_valid.ToString());
+struct BoundsChecker {
+ const ArrayData& data;
+ int64_t min_value;
+ int64_t max_value;
+
+ Status Visit(const DataType&) {
+ // Default, should be unreachable
+ return Status::NotImplemented("");
+ }
+
+ template <typename IntegerType>
+ enable_if_integer<IntegerType, Status> Visit(const IntegerType&) {
+ using c_type = typename IntegerType::c_type;
+
+ int64_t i = 0;
+ return VisitArrayDataInline<IntegerType>(
+ data,
+ [&](c_type value) {
+ const auto v = static_cast<int64_t>(value);
+ if (ARROW_PREDICT_FALSE(v < min_value || v > max_value)) {
+ return Status::Invalid("Value at position ", i, " out of bounds: ", v,
+ " (should be in [", min_value, ", ", max_value, "])");
+ }
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ ++i;
+ return Status::OK();
+ });
+ }
+};
+
+struct ValidateArrayFullImpl {
+ const ArrayData& data;
+
+ Status Validate() { return ValidateWithType(*data.type); }
+
+ Status ValidateWithType(const DataType& type) { return VisitTypeInline(type, this); }
+
+ Status Visit(const NullType& type) { return Status::OK(); }
+
+ Status Visit(const FixedWidthType& type) { return Status::OK(); }
+
+ Status Visit(const StringType& type) {
+ RETURN_NOT_OK(ValidateBinaryLike(type));
+ return ValidateUTF8(data);
+ }
+
+ Status Visit(const LargeStringType& type) {
+ RETURN_NOT_OK(ValidateBinaryLike(type));
+ return ValidateUTF8(data);
+ }
+
+ Status Visit(const BinaryType& type) { return ValidateBinaryLike(type); }
+
+ Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); }
+
+ Status Visit(const ListType& type) { return ValidateListLike(type); }
+
+ Status Visit(const LargeListType& type) { return ValidateListLike(type); }
+
+ Status Visit(const MapType& type) { return ValidateListLike(type); }
+
+ Status Visit(const FixedSizeListType& type) {
+ const ArrayData& child = *data.child_data[0];
+ const Status child_valid = ValidateArrayFull(child);
+ if (!child_valid.ok()) {
+ return Status::Invalid("Fixed size list child array invalid: ",
+ child_valid.ToString());
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ // Validate children
+ for (int64_t i = 0; i < type.num_fields(); ++i) {
+ const ArrayData& field = *data.child_data[i];
+ const Status field_valid = ValidateArrayFull(field);
+ if (!field_valid.ok()) {
+ return Status::Invalid("Struct child array #", i,
+ " invalid: ", field_valid.ToString());
}
- }
- return Status::OK();
- }
-
- Status Visit(const UnionType& type) {
- const auto& child_ids = type.child_ids();
- const auto& type_codes_map = type.type_codes();
-
- const int8_t* type_codes = data.GetValues<int8_t>(1);
-
- for (int64_t i = 0; i < data.length; ++i) {
- // Note that union arrays never have top-level nulls
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const auto& type_codes_map = type.type_codes();
+
+ const int8_t* type_codes = data.GetValues<int8_t>(1);
+
+ for (int64_t i = 0; i < data.length; ++i) {
+ // Note that union arrays never have top-level nulls
const int32_t code = type_codes[i];
if (code < 0 || child_ids[code] == UnionType::kInvalidChildId) {
return Status::Invalid("Union value at position ", i, " has invalid type id ",
@@ -519,17 +519,17 @@ struct ValidateArrayFullImpl {
}
}
- if (type.mode() == UnionMode::DENSE) {
+ if (type.mode() == UnionMode::DENSE) {
// Map logical type id to child length
std::vector<int64_t> child_lengths(256);
- for (int child_id = 0; child_id < type.num_fields(); ++child_id) {
- child_lengths[type_codes_map[child_id]] = data.child_data[child_id]->length;
+ for (int child_id = 0; child_id < type.num_fields(); ++child_id) {
+ child_lengths[type_codes_map[child_id]] = data.child_data[child_id]->length;
}
- // Check offsets are in bounds
- std::vector<int64_t> last_child_offsets(256, 0);
- const int32_t* offsets = data.GetValues<int32_t>(2);
- for (int64_t i = 0; i < data.length; ++i) {
+ // Check offsets are in bounds
+ std::vector<int64_t> last_child_offsets(256, 0);
+ const int32_t* offsets = data.GetValues<int32_t>(2);
+ for (int64_t i = 0; i < data.length; ++i) {
const int32_t code = type_codes[i];
const int32_t offset = offsets[i];
if (offset < 0) {
@@ -542,78 +542,78 @@ struct ValidateArrayFullImpl {
"than child length (",
offset, " >= ", child_lengths[code], ")");
}
- if (offset < last_child_offsets[code]) {
- return Status::Invalid("Union value at position ", i,
- " has non-monotonic offset ", offset);
- }
- last_child_offsets[code] = offset;
+ if (offset < last_child_offsets[code]) {
+ return Status::Invalid("Union value at position ", i,
+ " has non-monotonic offset ", offset);
+ }
+ last_child_offsets[code] = offset;
+ }
+ }
+
+ // Validate children
+ for (int64_t i = 0; i < type.num_fields(); ++i) {
+ const ArrayData& field = *data.child_data[i];
+ const Status field_valid = ValidateArrayFull(field);
+ if (!field_valid.ok()) {
+ return Status::Invalid("Union child array #", i,
+ " invalid: ", field_valid.ToString());
}
}
-
- // Validate children
- for (int64_t i = 0; i < type.num_fields(); ++i) {
- const ArrayData& field = *data.child_data[i];
- const Status field_valid = ValidateArrayFull(field);
- if (!field_valid.ok()) {
- return Status::Invalid("Union child array #", i,
- " invalid: ", field_valid.ToString());
- }
- }
return Status::OK();
}
- Status Visit(const DictionaryType& type) {
+ Status Visit(const DictionaryType& type) {
const Status indices_status =
- CheckBounds(*type.index_type(), 0, data.dictionary->length - 1);
+ CheckBounds(*type.index_type(), 0, data.dictionary->length - 1);
if (!indices_status.ok()) {
return Status::Invalid("Dictionary indices invalid: ", indices_status.ToString());
}
- return ValidateArrayFull(*data.dictionary);
+ return ValidateArrayFull(*data.dictionary);
}
- Status Visit(const ExtensionType& type) {
- return ValidateWithType(*type.storage_type());
+ Status Visit(const ExtensionType& type) {
+ return ValidateWithType(*type.storage_type());
}
protected:
- template <typename BinaryType>
- Status ValidateBinaryLike(const BinaryType& type) {
- const auto& data_buffer = data.buffers[2];
- if (data_buffer == nullptr) {
- return Status::Invalid("Binary data buffer is null");
+ template <typename BinaryType>
+ Status ValidateBinaryLike(const BinaryType& type) {
+ const auto& data_buffer = data.buffers[2];
+ if (data_buffer == nullptr) {
+ return Status::Invalid("Binary data buffer is null");
}
- return ValidateOffsets(type, data_buffer->size());
+ return ValidateOffsets(type, data_buffer->size());
}
- template <typename ListType>
- Status ValidateListLike(const ListType& type) {
- const ArrayData& child = *data.child_data[0];
- const Status child_valid = ValidateArrayFull(child);
+ template <typename ListType>
+ Status ValidateListLike(const ListType& type) {
+ const ArrayData& child = *data.child_data[0];
+ const Status child_valid = ValidateArrayFull(child);
if (!child_valid.ok()) {
return Status::Invalid("List child array invalid: ", child_valid.ToString());
}
- return ValidateOffsets(type, child.offset + child.length);
+ return ValidateOffsets(type, child.offset + child.length);
}
- template <typename TypeClass>
- Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) {
- using offset_type = typename TypeClass::offset_type;
- if (data.length == 0) {
+ template <typename TypeClass>
+ Status ValidateOffsets(const TypeClass& type, int64_t offset_limit) {
+ using offset_type = typename TypeClass::offset_type;
+ if (data.length == 0) {
return Status::OK();
}
-
- const offset_type* offsets = data.GetValues<offset_type>(1);
- if (offsets == nullptr) {
- return Status::Invalid("Non-empty array but offsets are null");
+
+ const offset_type* offsets = data.GetValues<offset_type>(1);
+ if (offsets == nullptr) {
+ return Status::Invalid("Non-empty array but offsets are null");
}
- auto prev_offset = offsets[0];
+ auto prev_offset = offsets[0];
if (prev_offset < 0) {
- return Status::Invalid("Offset invariant failure: array starts at negative offset ",
- prev_offset);
+ return Status::Invalid("Offset invariant failure: array starts at negative offset ",
+ prev_offset);
}
- for (int64_t i = 1; i <= data.length; ++i) {
- const auto current_offset = offsets[i];
+ for (int64_t i = 1; i <= data.length; ++i) {
+ const auto current_offset = offsets[i];
if (current_offset < prev_offset) {
return Status::Invalid("Offset invariant failure: non-monotonic offset at slot ",
i, ": ", current_offset, " < ", prev_offset);
@@ -627,31 +627,31 @@ struct ValidateArrayFullImpl {
return Status::OK();
}
- Status CheckBounds(const DataType& type, int64_t min_value, int64_t max_value) {
- BoundsChecker checker{data, min_value, max_value};
- return VisitTypeInline(type, &checker);
+ Status CheckBounds(const DataType& type, int64_t min_value, int64_t max_value) {
+ BoundsChecker checker{data, min_value, max_value};
+ return VisitTypeInline(type, &checker);
}
};
} // namespace
ARROW_EXPORT
-Status ValidateArrayFull(const ArrayData& data) {
- return ValidateArrayFullImpl{data}.Validate();
+Status ValidateArrayFull(const ArrayData& data) {
+ return ValidateArrayFullImpl{data}.Validate();
+}
+
+ARROW_EXPORT
+Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.data()); }
+
+ARROW_EXPORT
+Status ValidateUTF8(const ArrayData& data) {
+ DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING);
+ UTF8DataValidator validator{data};
+ return VisitTypeInline(*data.type, &validator);
}
-ARROW_EXPORT
-Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.data()); }
-
-ARROW_EXPORT
-Status ValidateUTF8(const ArrayData& data) {
- DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING);
- UTF8DataValidator validator{data};
- return VisitTypeInline(*data.type, &validator);
-}
-
-ARROW_EXPORT
-Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); }
-
+ARROW_EXPORT
+Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); }
+
} // namespace internal
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h b/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h
index 7e07100e1fc..cae3e16b3c5 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/array/validate.h
@@ -18,7 +18,7 @@
#pragma once
#include "arrow/status.h"
-#include "arrow/type_fwd.h"
+#include "arrow/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -26,30 +26,30 @@ namespace internal {
// Internal functions implementing Array::Validate() and friends.
-// O(1) array metadata validation
-
+// O(1) array metadata validation
+
ARROW_EXPORT
Status ValidateArray(const Array& array);
ARROW_EXPORT
-Status ValidateArray(const ArrayData& data);
-
-// O(N) array data validation.
-// Note the "full" routines don't validate metadata. It should be done
-// beforehand using ValidateArray(), otherwise invalid memory accesses
-// may occur.
-
-ARROW_EXPORT
-Status ValidateArrayFull(const Array& array);
-
-ARROW_EXPORT
-Status ValidateArrayFull(const ArrayData& data);
-
-ARROW_EXPORT
-Status ValidateUTF8(const Array& array);
-
-ARROW_EXPORT
-Status ValidateUTF8(const ArrayData& data);
-
+Status ValidateArray(const ArrayData& data);
+
+// O(N) array data validation.
+// Note the "full" routines don't validate metadata. It should be done
+// beforehand using ValidateArray(), otherwise invalid memory accesses
+// may occur.
+
+ARROW_EXPORT
+Status ValidateArrayFull(const Array& array);
+
+ARROW_EXPORT
+Status ValidateArrayFull(const ArrayData& data);
+
+ARROW_EXPORT
+Status ValidateUTF8(const Array& array);
+
+ARROW_EXPORT
+Status ValidateUTF8(const ArrayData& data);
+
} // namespace internal
} // namespace arrow
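
Taken together, the comments above prescribe a two-step protocol: run the O(1) metadata checks first, and only then the O(N) data checks. A sketch (CheckArray is a hypothetical helper; internal headers assumed):

    #include <arrow/array.h>
    #include <arrow/array/validate.h>
    #include <arrow/status.h>

    arrow::Status CheckArray(const arrow::Array& array) {
      // O(1): buffer counts and sizes, length/offset/null_count sanity.
      ARROW_RETURN_NOT_OK(arrow::internal::ValidateArray(array));
      // O(N): offset monotonicity, UTF-8 well-formedness, dictionary index
      // bounds; only safe once the metadata checks above have passed.
      return arrow::internal::ValidateArrayFull(array);
    }
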
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/buffer.h b/contrib/libs/apache/arrow/cpp/src/arrow/buffer.h
index 7d71846d9ab..6c47a464b1d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/buffer.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/buffer.h
@@ -56,13 +56,13 @@ class ARROW_EXPORT Buffer {
///
/// \note The passed memory must be kept alive through some other means
Buffer(const uint8_t* data, int64_t size)
- : is_mutable_(false), is_cpu_(true), data_(data), size_(size), capacity_(size) {
+ : is_mutable_(false), is_cpu_(true), data_(data), size_(size), capacity_(size) {
SetMemoryManager(default_cpu_memory_manager());
}
Buffer(const uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm,
std::shared_ptr<Buffer> parent = NULLPTR)
- : is_mutable_(false), data_(data), size_(size), capacity_(size), parent_(parent) {
+ : is_mutable_(false), data_(data), size_(size), capacity_(size), parent_(parent) {
SetMemoryManager(std::move(mm));
}
@@ -121,7 +121,7 @@ class ARROW_EXPORT Buffer {
#endif
// A zero-capacity buffer can have a null data pointer
if (capacity_ != 0) {
- memset(mutable_data() + size_, 0, static_cast<size_t>(capacity_ - size_));
+ memset(mutable_data() + size_, 0, static_cast<size_t>(capacity_ - size_));
}
}
@@ -195,8 +195,8 @@ class ARROW_EXPORT Buffer {
CheckCPU();
CheckMutable();
#endif
- return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast<uint8_t*>(data_)
- : NULLPTR;
+ return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast<uint8_t*>(data_)
+ : NULLPTR;
}
/// \brief Return the device address of the buffer's data
@@ -210,7 +210,7 @@ class ARROW_EXPORT Buffer {
#ifndef NDEBUG
CheckMutable();
#endif
- return ARROW_PREDICT_TRUE(is_mutable_) ? reinterpret_cast<uintptr_t>(data_) : 0;
+ return ARROW_PREDICT_TRUE(is_mutable_) ? reinterpret_cast<uintptr_t>(data_) : 0;
}
/// \brief Return the buffer's size in bytes
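
A sketch of the non-owning constructor's contract (illustrative; per the note above, storage must be kept alive for as long as the Buffer is used):

    #include <arrow/buffer.h>
    #include <cstdint>
    #include <string>

    std::string storage = "hello arrow";
    // Wraps the bytes without copying them and without taking ownership.
    arrow::Buffer buf(reinterpret_cast<const uint8_t*>(storage.data()),
                      static_cast<int64_t>(storage.size()));
    // The wrapper is immutable: buf.is_mutable() is false and, as shown
    // above, buf.mutable_data() yields NULLPTR rather than a writable pointer.
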
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h b/contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h
index cebaa5db510..c6250ae2b76 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/buffer_builder.h
@@ -64,10 +64,10 @@ class ARROW_EXPORT BufferBuilder {
/// \brief Resize the buffer to the nearest multiple of 64 bytes
///
/// \param new_capacity the new capacity of the builder. Will be
- /// rounded up to a multiple of 64 bytes for padding
- /// \param shrink_to_fit if new capacity is smaller than the existing,
- /// reallocate internal buffer. Set to false to avoid reallocations when
- /// shrinking the builder.
+ /// rounded up to a multiple of 64 bytes for padding
+ /// \param shrink_to_fit if new capacity is smaller than the existing,
+ /// reallocate internal buffer. Set to false to avoid reallocations when
+ /// shrinking the builder.
/// \return Status
Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
if (buffer_ == NULLPTR) {
@@ -159,23 +159,23 @@ class ARROW_EXPORT BufferBuilder {
return Status::OK();
}
- Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
- std::shared_ptr<Buffer> out;
- ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
- return out;
- }
-
- /// \brief Like Finish, but override the final buffer size
- ///
- /// This is useful after writing data directly into the builder memory
- /// without calling the Append methods (basically, when using BufferBuilder
- /// mostly for memory allocation).
- Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
- bool shrink_to_fit = true) {
- size_ = final_length;
- return Finish(shrink_to_fit);
- }
-
+ Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
+ std::shared_ptr<Buffer> out;
+ ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
+ return out;
+ }
+
+ /// \brief Like Finish, but override the final buffer size
+ ///
+ /// This is useful after writing data directly into the builder memory
+ /// without calling the Append methods (basically, when using BufferBuilder
+ /// mostly for memory allocation).
+ Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
+ bool shrink_to_fit = true) {
+ size_ = final_length;
+ return Finish(shrink_to_fit);
+ }
+
void Reset() {
buffer_ = NULLPTR;
capacity_ = size_ = 0;
@@ -216,11 +216,11 @@ class TypedBufferBuilder<
MemoryPool* pool = default_memory_pool())
: bytes_builder_(std::move(buffer), pool) {}
- explicit TypedBufferBuilder(BufferBuilder builder)
- : bytes_builder_(std::move(builder)) {}
-
- BufferBuilder* bytes_builder() { return &bytes_builder_; }
-
+ explicit TypedBufferBuilder(BufferBuilder builder)
+ : bytes_builder_(std::move(builder)) {}
+
+ BufferBuilder* bytes_builder() { return &bytes_builder_; }
+
Status Append(T value) {
return bytes_builder_.Append(reinterpret_cast<uint8_t*>(&value), sizeof(T));
}
@@ -275,22 +275,22 @@ class TypedBufferBuilder<
return bytes_builder_.Finish(out, shrink_to_fit);
}
- Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
- std::shared_ptr<Buffer> out;
- ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
- return out;
- }
-
- /// \brief Like Finish, but override the final buffer size
- ///
- /// This is useful after writing data directly into the builder memory
- /// without calling the Append methods (basically, when using TypedBufferBuilder
- /// only for memory allocation).
- Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
- bool shrink_to_fit = true) {
- return bytes_builder_.FinishWithLength(final_length * sizeof(T), shrink_to_fit);
- }
-
+ Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
+ std::shared_ptr<Buffer> out;
+ ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
+ return out;
+ }
+
+ /// \brief Like Finish, but override the final buffer size
+ ///
+ /// This is useful after writing data directly into the builder memory
+ /// without calling the Append methods (basically, when using TypedBufferBuilder
+ /// only for memory allocation).
+ Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
+ bool shrink_to_fit = true) {
+ return bytes_builder_.FinishWithLength(final_length * sizeof(T), shrink_to_fit);
+ }
+
void Reset() { bytes_builder_.Reset(); }
int64_t length() const { return bytes_builder_.length() / sizeof(T); }
@@ -309,11 +309,11 @@ class TypedBufferBuilder<bool> {
explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool())
: bytes_builder_(pool) {}
- explicit TypedBufferBuilder(BufferBuilder builder)
- : bytes_builder_(std::move(builder)) {}
-
- BufferBuilder* bytes_builder() { return &bytes_builder_; }
-
+ explicit TypedBufferBuilder(BufferBuilder builder)
+ : bytes_builder_(std::move(builder)) {}
+
+ BufferBuilder* bytes_builder() { return &bytes_builder_; }
+
Status Append(bool value) {
ARROW_RETURN_NOT_OK(Reserve(1));
UnsafeAppend(value);
@@ -411,25 +411,25 @@ class TypedBufferBuilder<bool> {
return bytes_builder_.Finish(out, shrink_to_fit);
}
- Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
- std::shared_ptr<Buffer> out;
- ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
- return out;
- }
-
- /// \brief Like Finish, but override the final buffer size
- ///
- /// This is useful after writing data directly into the builder memory
- /// without calling the Append methods (basically, when using TypedBufferBuilder
- /// only for memory allocation).
- Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
- bool shrink_to_fit = true) {
- const auto final_byte_length = BitUtil::BytesForBits(final_length);
- bytes_builder_.UnsafeAdvance(final_byte_length - bytes_builder_.length());
- bit_length_ = false_count_ = 0;
- return bytes_builder_.FinishWithLength(final_byte_length, shrink_to_fit);
- }
-
+ Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
+ std::shared_ptr<Buffer> out;
+ ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
+ return out;
+ }
+
+ /// \brief Like Finish, but override the final buffer size
+ ///
+ /// This is useful after writing data directly into the builder memory
+ /// without calling the Append methods (basically, when using TypedBufferBuilder
+ /// only for memory allocation).
+ Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
+ bool shrink_to_fit = true) {
+ const auto final_byte_length = BitUtil::BytesForBits(final_length);
+ bytes_builder_.UnsafeAdvance(final_byte_length - bytes_builder_.length());
+ bit_length_ = false_count_ = 0;
+ return bytes_builder_.FinishWithLength(final_byte_length, shrink_to_fit);
+ }
+
void Reset() {
bytes_builder_.Reset();
bit_length_ = false_count_ = 0;
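
A sketch of the FinishWithLength() pattern described above: reserve capacity, write through the raw pointer instead of calling Append(), then stamp the final length (BuildDirect is a hypothetical helper):

    #include <arrow/buffer_builder.h>
    #include <arrow/result.h>

    arrow::Result<std::shared_ptr<arrow::Buffer>> BuildDirect(int64_t n) {
      arrow::TypedBufferBuilder<int32_t> builder;
      ARROW_RETURN_NOT_OK(builder.Reserve(n));
      // Write into the builder's memory directly, bypassing Append().
      auto* raw =
          reinterpret_cast<int32_t*>(builder.bytes_builder()->mutable_data());
      for (int64_t i = 0; i < n; ++i) raw[i] = static_cast<int32_t>(i);
      // No Append() calls were made, so override the length when finishing;
      // the typed builder scales it by sizeof(int32_t) internally.
      return builder.FinishWithLength(n);
    }
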
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/builder.cc b/contrib/libs/apache/arrow/cpp/src/arrow/builder.cc
index e46661b4b42..f22228a4588 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/builder.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/builder.cc
@@ -51,7 +51,7 @@ struct DictionaryBuilderCase {
}
Status Visit(const FixedSizeBinaryType&) { return CreateFor<FixedSizeBinaryType>(); }
Status Visit(const Decimal128Type&) { return CreateFor<Decimal128Type>(); }
- Status Visit(const Decimal256Type&) { return CreateFor<Decimal256Type>(); }
+ Status Visit(const Decimal256Type&) { return CreateFor<Decimal256Type>(); }
Status Visit(const DataType& value_type) { return NotImplemented(value_type); }
Status Visit(const HalfFloatType& value_type) { return NotImplemented(value_type); }
@@ -139,7 +139,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
BUILDER_CASE(LargeBinary);
BUILDER_CASE(FixedSizeBinary);
BUILDER_CASE(Decimal128);
- BUILDER_CASE(Decimal256);
+ BUILDER_CASE(Decimal256);
case Type::DICTIONARY: {
const auto& dict_type = static_cast<const DictionaryType&>(*type);
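
For context, a sketch of the dispatch this case list feeds, including the Decimal256 case restored above (BuildDecimal is a hypothetical helper):

    #include <arrow/builder.h>
    #include <arrow/memory_pool.h>
    #include <arrow/status.h>
    #include <arrow/type.h>
    #include <memory>

    arrow::Status BuildDecimal() {
      std::unique_ptr<arrow::ArrayBuilder> builder;
      // MakeBuilder dispatches on the runtime type id, so this resolves
      // to a Decimal256Builder.
      ARROW_RETURN_NOT_OK(arrow::MakeBuilder(arrow::default_memory_pool(),
                                             arrow::decimal256(40, 4), &builder));
      return builder->AppendNull();
    }
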
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc b/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc
index ccd780fa687..a43bf8104f2 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/c/bridge.cc
@@ -304,16 +304,16 @@ struct SchemaExporter {
return SetFormat("w:" + std::to_string(type.byte_width()));
}
- Status Visit(const DecimalType& type) {
- if (type.bit_width() == 128) {
- // 128 is the default bit-width
- return SetFormat("d:" + std::to_string(type.precision()) + "," +
- std::to_string(type.scale()));
- } else {
- return SetFormat("d:" + std::to_string(type.precision()) + "," +
- std::to_string(type.scale()) + "," +
- std::to_string(type.bit_width()));
- }
+ Status Visit(const DecimalType& type) {
+ if (type.bit_width() == 128) {
+ // 128 is the default bit-width
+ return SetFormat("d:" + std::to_string(type.precision()) + "," +
+ std::to_string(type.scale()));
+ } else {
+ return SetFormat("d:" + std::to_string(type.precision()) + "," +
+ std::to_string(type.scale()) + "," +
+ std::to_string(type.bit_width()));
+ }
}
Status Visit(const BinaryType& type) { return SetFormat("z"); }
@@ -980,20 +980,20 @@ struct SchemaImporter {
Status ProcessDecimal() {
RETURN_NOT_OK(f_parser_.CheckNext(':'));
ARROW_ASSIGN_OR_RAISE(auto prec_scale, f_parser_.ParseInts(f_parser_.Rest()));
- // 3 elements indicates bit width was communicated as well.
- if (prec_scale.size() != 2 && prec_scale.size() != 3) {
+    // 3 elements indicate that the bit width was communicated as well.
+ if (prec_scale.size() != 2 && prec_scale.size() != 3) {
return f_parser_.Invalid();
}
- if (prec_scale[0] <= 0) {
+ if (prec_scale[0] <= 0) {
+ return f_parser_.Invalid();
+ }
+ if (prec_scale.size() == 2 || prec_scale[2] == 128) {
+ type_ = decimal128(prec_scale[0], prec_scale[1]);
+ } else if (prec_scale[2] == 256) {
+ type_ = decimal256(prec_scale[0], prec_scale[1]);
+ } else {
return f_parser_.Invalid();
}
- if (prec_scale.size() == 2 || prec_scale[2] == 128) {
- type_ = decimal128(prec_scale[0], prec_scale[1]);
- } else if (prec_scale[2] == 256) {
- type_ = decimal256(prec_scale[0], prec_scale[1]);
- } else {
- return f_parser_.Invalid();
- }
return Status::OK();
}
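
The exporter and importer above agree on a d:precision,scale[,bitWidth] format string; the concrete mappings implied by the branches are:

    // decimal128(19, 4)  exports as  "d:19,4"     (128 is the default width)
    // "d:19,4,128"       also imports as decimal128(19, 4)
    // decimal256(40, 4)  round-trips as  "d:40,4,256"
    // any other width (e.g. "d:19,4,64") is rejected by the importer
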
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc b/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc
index 20c63c78959..142bd0d8c89 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.cc
@@ -118,33 +118,33 @@ bool ChunkedArray::Equals(const std::shared_ptr<ChunkedArray>& other) const {
return Equals(*other.get());
}
-bool ChunkedArray::ApproxEquals(const ChunkedArray& other,
- const EqualOptions& equal_options) const {
- if (length_ != other.length()) {
- return false;
- }
- if (null_count_ != other.null_count()) {
- return false;
- }
- // We cannot toggle check_metadata here yet, so we don't check it
- if (!type_->Equals(*other.type_, /*check_metadata=*/false)) {
- return false;
- }
-
- // Check contents of the underlying arrays. This checks for equality of
- // the underlying data independently of the chunk size.
- return internal::ApplyBinaryChunked(
- *this, other,
- [&](const Array& left_piece, const Array& right_piece,
- int64_t ARROW_ARG_UNUSED(position)) {
- if (!left_piece.ApproxEquals(right_piece, equal_options)) {
- return Status::Invalid("Unequal piece");
- }
- return Status::OK();
- })
- .ok();
-}
-
+bool ChunkedArray::ApproxEquals(const ChunkedArray& other,
+ const EqualOptions& equal_options) const {
+ if (length_ != other.length()) {
+ return false;
+ }
+ if (null_count_ != other.null_count()) {
+ return false;
+ }
+ // We cannot toggle check_metadata here yet, so we don't check it
+ if (!type_->Equals(*other.type_, /*check_metadata=*/false)) {
+ return false;
+ }
+
+ // Check contents of the underlying arrays. This checks for equality of
+ // the underlying data independently of the chunk size.
+ return internal::ApplyBinaryChunked(
+ *this, other,
+ [&](const Array& left_piece, const Array& right_piece,
+ int64_t ARROW_ARG_UNUSED(position)) {
+ if (!left_piece.ApproxEquals(right_piece, equal_options)) {
+ return Status::Invalid("Unequal piece");
+ }
+ return Status::OK();
+ })
+ .ok();
+}
+
std::shared_ptr<ChunkedArray> ChunkedArray::Slice(int64_t offset, int64_t length) const {
ARROW_CHECK_LE(offset, length_) << "Slice offset greater than array length";
bool offset_equals_length = offset == length_;
@@ -246,7 +246,7 @@ Status ChunkedArray::ValidateFull() const {
RETURN_NOT_OK(Validate());
for (size_t i = 0; i < chunks_.size(); ++i) {
const Array& chunk = *chunks_[i];
- const Status st = internal::ValidateArrayFull(chunk);
+ const Status st = internal::ValidateArrayFull(chunk);
if (!st.ok()) {
return Status::Invalid("In chunk ", i, ": ", st.ToString());
}
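
A sketch of the chunk-layout independence noted in the comments above (Demo is a hypothetical helper):

    #include <arrow/builder.h>
    #include <arrow/chunked_array.h>
    #include <arrow/result.h>
    #include <memory>

    arrow::Result<bool> Demo() {
      arrow::DoubleBuilder b;
      ARROW_RETURN_NOT_OK(b.AppendValues({1.0, 2.0, 3.0}));
      ARROW_ASSIGN_OR_RAISE(auto whole, b.Finish());
      auto one_chunk = std::make_shared<arrow::ChunkedArray>(whole);
      auto two_chunks = std::make_shared<arrow::ChunkedArray>(
          arrow::ArrayVector{whole->Slice(0, 1), whole->Slice(1, 2)});
      // true: the underlying values match even though the chunking differs.
      return one_chunk->ApproxEquals(*two_chunks);
    }
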
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h b/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h
index 892ae637545..2ace045c2bf 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/chunked_array.h
@@ -23,7 +23,7 @@
#include <utility>
#include <vector>
-#include "arrow/compare.h"
+#include "arrow/compare.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
@@ -73,9 +73,9 @@ class ARROW_EXPORT ChunkedArray {
/// data type.
explicit ChunkedArray(ArrayVector chunks);
- ChunkedArray(ChunkedArray&&) = default;
- ChunkedArray& operator=(ChunkedArray&&) = default;
-
+ ChunkedArray(ChunkedArray&&) = default;
+ ChunkedArray& operator=(ChunkedArray&&) = default;
+
/// \brief Construct a chunked array from a single Array
explicit ChunkedArray(std::shared_ptr<Array> chunk)
: ChunkedArray(ArrayVector{std::move(chunk)}) {}
@@ -137,9 +137,9 @@ class ARROW_EXPORT ChunkedArray {
bool Equals(const ChunkedArray& other) const;
/// \brief Determine if two chunked arrays are equal.
bool Equals(const std::shared_ptr<ChunkedArray>& other) const;
- /// \brief Determine if two chunked arrays approximately equal
- bool ApproxEquals(const ChunkedArray& other,
- const EqualOptions& = EqualOptions::Defaults()) const;
+ /// \brief Determine if two chunked arrays approximately equal
+ bool ApproxEquals(const ChunkedArray& other,
+ const EqualOptions& = EqualOptions::Defaults()) const;
/// \return PrettyPrint representation suitable for debugging
std::string ToString() const;
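
A small illustrative sketch (not from the diff) of the defaulted move operations declared above: a ChunkedArray can be returned and reassigned without copying its chunk vector. MakeChunked is a hypothetical helper.

#include <utility>

#include "arrow/chunked_array.h"

arrow::ChunkedArray MakeChunked(arrow::ArrayVector chunks) {
  // The defaulted move constructor makes this return cheap: the chunk
  // vector is moved into the result rather than deep-copied.
  return arrow::ChunkedArray(std::move(chunks));
}
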
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compare.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compare.cc
index 51fec14e768..4c6f97faf95 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compare.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compare.cc
@@ -38,10 +38,10 @@
#include "arrow/tensor.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
-#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/bitmap_reader.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
@@ -51,499 +51,499 @@
namespace arrow {
using internal::BitmapEquals;
-using internal::BitmapReader;
-using internal::BitmapUInt64Reader;
+using internal::BitmapReader;
+using internal::BitmapUInt64Reader;
using internal::checked_cast;
-using internal::OptionalBitmapEquals;
+using internal::OptionalBitmapEquals;
// ----------------------------------------------------------------------
// Public method implementations
namespace {
-// TODO also handle HALF_FLOAT NaNs
-
-enum FloatingEqualityFlags : int8_t { Approximate = 1, NansEqual = 2 };
-
-template <typename T, int8_t Flags>
-struct FloatingEquality {
- bool operator()(T x, T y) { return x == y; }
-};
-
-template <typename T>
-struct FloatingEquality<T, NansEqual> {
- bool operator()(T x, T y) { return (x == y) || (std::isnan(x) && std::isnan(y)); }
-};
-
-template <typename T>
-struct FloatingEquality<T, Approximate> {
- explicit FloatingEquality(const EqualOptions& options)
- : epsilon(static_cast<T>(options.atol())) {}
-
- bool operator()(T x, T y) { return (fabs(x - y) <= epsilon) || (x == y); }
-
- const T epsilon;
-};
-
-template <typename T>
-struct FloatingEquality<T, Approximate | NansEqual> {
- explicit FloatingEquality(const EqualOptions& options)
- : epsilon(static_cast<T>(options.atol())) {}
-
- bool operator()(T x, T y) {
- return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
- }
-
- const T epsilon;
-};
-
-template <typename T, typename Visitor>
-void VisitFloatingEquality(const EqualOptions& options, bool floating_approximate,
- Visitor&& visit) {
- if (options.nans_equal()) {
- if (floating_approximate) {
- visit(FloatingEquality<T, NansEqual | Approximate>{options});
- } else {
- visit(FloatingEquality<T, NansEqual>{});
+// TODO also handle HALF_FLOAT NaNs
+
+enum FloatingEqualityFlags : int8_t { Approximate = 1, NansEqual = 2 };
+
+template <typename T, int8_t Flags>
+struct FloatingEquality {
+ bool operator()(T x, T y) { return x == y; }
+};
+
+template <typename T>
+struct FloatingEquality<T, NansEqual> {
+ bool operator()(T x, T y) { return (x == y) || (std::isnan(x) && std::isnan(y)); }
+};
+
+template <typename T>
+struct FloatingEquality<T, Approximate> {
+ explicit FloatingEquality(const EqualOptions& options)
+ : epsilon(static_cast<T>(options.atol())) {}
+
+ bool operator()(T x, T y) { return (fabs(x - y) <= epsilon) || (x == y); }
+
+ const T epsilon;
+};
+
+template <typename T>
+struct FloatingEquality<T, Approximate | NansEqual> {
+ explicit FloatingEquality(const EqualOptions& options)
+ : epsilon(static_cast<T>(options.atol())) {}
+
+ bool operator()(T x, T y) {
+ return (fabs(x - y) <= epsilon) || (x == y) || (std::isnan(x) && std::isnan(y));
+ }
+
+ const T epsilon;
+};
+
+template <typename T, typename Visitor>
+void VisitFloatingEquality(const EqualOptions& options, bool floating_approximate,
+ Visitor&& visit) {
+ if (options.nans_equal()) {
+ if (floating_approximate) {
+ visit(FloatingEquality<T, NansEqual | Approximate>{options});
+ } else {
+ visit(FloatingEquality<T, NansEqual>{});
}
} else {
- if (floating_approximate) {
- visit(FloatingEquality<T, Approximate>{options});
- } else {
- visit(FloatingEquality<T, 0>{});
+ if (floating_approximate) {
+ visit(FloatingEquality<T, Approximate>{options});
+ } else {
+ visit(FloatingEquality<T, 0>{});
}
}
}
-inline bool IdentityImpliesEqualityNansNotEqual(const DataType& type) {
- if (type.id() == Type::FLOAT || type.id() == Type::DOUBLE) {
- return false;
+inline bool IdentityImpliesEqualityNansNotEqual(const DataType& type) {
+ if (type.id() == Type::FLOAT || type.id() == Type::DOUBLE) {
+ return false;
+ }
+ for (const auto& child : type.fields()) {
+ if (!IdentityImpliesEqualityNansNotEqual(*child->type())) {
+ return false;
+ }
}
- for (const auto& child : type.fields()) {
- if (!IdentityImpliesEqualityNansNotEqual(*child->type())) {
- return false;
- }
- }
- return true;
+ return true;
}
-inline bool IdentityImpliesEquality(const DataType& type, const EqualOptions& options) {
- if (options.nans_equal()) {
- return true;
+inline bool IdentityImpliesEquality(const DataType& type, const EqualOptions& options) {
+ if (options.nans_equal()) {
+ return true;
}
- return IdentityImpliesEqualityNansNotEqual(type);
+ return IdentityImpliesEqualityNansNotEqual(type);
}
-bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
- int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx, const EqualOptions& options,
- bool floating_approximate);
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate);
-class RangeDataEqualsImpl {
+class RangeDataEqualsImpl {
public:
- // PRE-CONDITIONS:
- // - the types are equal
- // - the ranges are in bounds
- RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
- const ArrayData& left, const ArrayData& right,
- int64_t left_start_idx, int64_t right_start_idx,
- int64_t range_length)
- : options_(options),
- floating_approximate_(floating_approximate),
- left_(left),
- right_(right),
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the ranges are in bounds
+ RangeDataEqualsImpl(const EqualOptions& options, bool floating_approximate,
+ const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t right_start_idx,
+ int64_t range_length)
+ : options_(options),
+ floating_approximate_(floating_approximate),
+ left_(left),
+ right_(right),
left_start_idx_(left_start_idx),
right_start_idx_(right_start_idx),
- range_length_(range_length),
+ range_length_(range_length),
result_(false) {}
- bool Compare() {
- // Compare null bitmaps
- if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
- range_length_ == right_.length) {
- // If we're comparing entire arrays, we can first compare the cached null counts
- if (left_.GetNullCount() != right_.GetNullCount()) {
- return false;
+ bool Compare() {
+ // Compare null bitmaps
+ if (left_start_idx_ == 0 && right_start_idx_ == 0 && range_length_ == left_.length &&
+ range_length_ == right_.length) {
+ // If we're comparing entire arrays, we can first compare the cached null counts
+ if (left_.GetNullCount() != right_.GetNullCount()) {
+ return false;
}
}
- if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
- right_.buffers[0], right_.offset + right_start_idx_,
- range_length_)) {
- return false;
- }
- // Compare values
- return CompareWithType(*left_.type);
- }
-
- bool CompareWithType(const DataType& type) {
- result_ = true;
- if (range_length_ != 0) {
- ARROW_CHECK_OK(VisitTypeInline(type, this));
+ if (!OptionalBitmapEquals(left_.buffers[0], left_.offset + left_start_idx_,
+ right_.buffers[0], right_.offset + right_start_idx_,
+ range_length_)) {
+ return false;
}
- return result_;
+ // Compare values
+ return CompareWithType(*left_.type);
}
- Status Visit(const NullType&) { return Status::OK(); }
+ bool CompareWithType(const DataType& type) {
+ result_ = true;
+ if (range_length_ != 0) {
+ ARROW_CHECK_OK(VisitTypeInline(type, this));
+ }
+ return result_;
+ }
- template <typename TypeClass>
- enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
- return ComparePrimitive(type);
+ Status Visit(const NullType&) { return Status::OK(); }
+
+ template <typename TypeClass>
+ enable_if_primitive_ctype<TypeClass, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
}
- template <typename TypeClass>
- enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
- return ComparePrimitive(type);
- }
+ template <typename TypeClass>
+ enable_if_t<is_temporal_type<TypeClass>::value, Status> Visit(const TypeClass& type) {
+ return ComparePrimitive(type);
+ }
- Status Visit(const BooleanType&) {
- const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
- const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
- auto compare_runs = [&](int64_t i, int64_t length) -> bool {
- if (length <= 8) {
- // Avoid the BitmapUInt64Reader overhead for very small runs
- for (int64_t j = i; j < i + length; ++j) {
- if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
- BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
- return false;
- }
- }
+ Status Visit(const BooleanType&) {
+ const uint8_t* left_bits = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_bits = right_.GetValues<uint8_t>(1, 0);
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ if (length <= 8) {
+ // Avoid the BitmapUInt64Reader overhead for very small runs
+ for (int64_t j = i; j < i + length; ++j) {
+ if (BitUtil::GetBit(left_bits, left_start_idx_ + left_.offset + j) !=
+ BitUtil::GetBit(right_bits, right_start_idx_ + right_.offset + j)) {
+ return false;
+ }
+ }
return true;
- } else if (length <= 1024) {
- BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
- length);
- BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
- length);
- while (left_reader.position() < length) {
- if (left_reader.NextWord() != right_reader.NextWord()) {
- return false;
- }
- }
- DCHECK_EQ(right_reader.position(), length);
- } else {
- // BitmapEquals is the fastest method on large runs
- return BitmapEquals(left_bits, left_start_idx_ + left_.offset + i, right_bits,
- right_start_idx_ + right_.offset + i, length);
+ } else if (length <= 1024) {
+ BitmapUInt64Reader left_reader(left_bits, left_start_idx_ + left_.offset + i,
+ length);
+ BitmapUInt64Reader right_reader(right_bits, right_start_idx_ + right_.offset + i,
+ length);
+ while (left_reader.position() < length) {
+ if (left_reader.NextWord() != right_reader.NextWord()) {
+ return false;
+ }
+ }
+ DCHECK_EQ(right_reader.position(), length);
+ } else {
+ // BitmapEquals is the fastest method on large runs
+ return BitmapEquals(left_bits, left_start_idx_ + left_.offset + i, right_bits,
+ right_start_idx_ + right_.offset + i, length);
}
- return true;
+ return true;
};
- VisitValidRuns(compare_runs);
- return Status::OK();
- }
-
- Status Visit(const FloatType& type) { return CompareFloating(type); }
-
- Status Visit(const DoubleType& type) { return CompareFloating(type); }
-
- // Also matches StringType
- Status Visit(const BinaryType& type) { return CompareBinary(type); }
-
- // Also matches LargeStringType
- Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
-
- Status Visit(const FixedSizeBinaryType& type) {
- const auto byte_width = type.byte_width();
- const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
- const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
-
- if (left_data != nullptr && right_data != nullptr) {
- auto compare_runs = [&](int64_t i, int64_t length) -> bool {
- return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
- right_data + (right_start_idx_ + right_.offset + i) * byte_width,
- length * byte_width) == 0;
- };
- VisitValidRuns(compare_runs);
- } else {
- auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
- VisitValidRuns(compare_runs);
+ VisitValidRuns(compare_runs);
+ return Status::OK();
+ }
+
+ Status Visit(const FloatType& type) { return CompareFloating(type); }
+
+ Status Visit(const DoubleType& type) { return CompareFloating(type); }
+
+ // Also matches StringType
+ Status Visit(const BinaryType& type) { return CompareBinary(type); }
+
+ // Also matches LargeStringType
+ Status Visit(const LargeBinaryType& type) { return CompareBinary(type); }
+
+ Status Visit(const FixedSizeBinaryType& type) {
+ const auto byte_width = type.byte_width();
+ const uint8_t* left_data = left_.GetValues<uint8_t>(1, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(1, 0);
+
+ if (left_data != nullptr && right_data != nullptr) {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ return memcmp(left_data + (left_start_idx_ + left_.offset + i) * byte_width,
+ right_data + (right_start_idx_ + right_.offset + i) * byte_width,
+ length * byte_width) == 0;
+ };
+ VisitValidRuns(compare_runs);
+ } else {
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool { return true; };
+ VisitValidRuns(compare_runs);
}
- return Status::OK();
- }
-
- // Also matches MapType
- Status Visit(const ListType& type) { return CompareList(type); }
-
- Status Visit(const LargeListType& type) { return CompareList(type); }
-
- Status Visit(const FixedSizeListType& type) {
- const auto list_size = type.list_size();
- const ArrayData& left_data = *left_.child_data[0];
- const ArrayData& right_data = *right_.child_data[0];
-
- auto compare_runs = [&](int64_t i, int64_t length) -> bool {
- RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
- (left_start_idx_ + left_.offset + i) * list_size,
- (right_start_idx_ + right_.offset + i) * list_size,
- length * list_size);
- return impl.Compare();
- };
- VisitValidRuns(compare_runs);
- return Status::OK();
- }
-
- Status Visit(const StructType& type) {
- const int32_t num_fields = type.num_fields();
-
- auto compare_runs = [&](int64_t i, int64_t length) -> bool {
- for (int32_t f = 0; f < num_fields; ++f) {
- RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
- *right_.child_data[f],
- left_start_idx_ + left_.offset + i,
- right_start_idx_ + right_.offset + i, length);
- if (!impl.Compare()) {
+ return Status::OK();
+ }
+
+ // Also matches MapType
+ Status Visit(const ListType& type) { return CompareList(type); }
+
+ Status Visit(const LargeListType& type) { return CompareList(type); }
+
+ Status Visit(const FixedSizeListType& type) {
+ const auto list_size = type.list_size();
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
+
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ (left_start_idx_ + left_.offset + i) * list_size,
+ (right_start_idx_ + right_.offset + i) * list_size,
+ length * list_size);
+ return impl.Compare();
+ };
+ VisitValidRuns(compare_runs);
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) {
+ const int32_t num_fields = type.num_fields();
+
+ auto compare_runs = [&](int64_t i, int64_t length) -> bool {
+ for (int32_t f = 0; f < num_fields; ++f) {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, *left_.child_data[f],
+ *right_.child_data[f],
+ left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, length);
+ if (!impl.Compare()) {
return false;
}
}
- return true;
- };
- VisitValidRuns(compare_runs);
+ return true;
+ };
+ VisitValidRuns(compare_runs);
return Status::OK();
}
- Status Visit(const SparseUnionType& type) {
- const auto& child_ids = type.child_ids();
- const int8_t* left_codes = left_.GetValues<int8_t>(1);
- const int8_t* right_codes = right_.GetValues<int8_t>(1);
+ Status Visit(const SparseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
- // Unions don't have a null bitmap
- for (int64_t i = 0; i < range_length_; ++i) {
- const auto type_id = left_codes[left_start_idx_ + i];
- if (type_id != right_codes[right_start_idx_ + i]) {
+ // Unions don't have a null bitmap
+ for (int64_t i = 0; i < range_length_; ++i) {
+ const auto type_id = left_codes[left_start_idx_ + i];
+ if (type_id != right_codes[right_start_idx_ + i]) {
result_ = false;
- break;
+ break;
}
- const auto child_num = child_ids[type_id];
- // XXX can we instead detect runs of same-child union values?
- RangeDataEqualsImpl impl(
- options_, floating_approximate_, *left_.child_data[child_num],
- *right_.child_data[child_num], left_start_idx_ + left_.offset + i,
- right_start_idx_ + right_.offset + i, 1);
- if (!impl.Compare()) {
+ const auto child_num = child_ids[type_id];
+ // XXX can we instead detect runs of same-child union values?
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_start_idx_ + left_.offset + i,
+ right_start_idx_ + right_.offset + i, 1);
+ if (!impl.Compare()) {
result_ = false;
- break;
+ break;
}
}
return Status::OK();
}
- Status Visit(const DenseUnionType& type) {
- const auto& child_ids = type.child_ids();
- const int8_t* left_codes = left_.GetValues<int8_t>(1);
- const int8_t* right_codes = right_.GetValues<int8_t>(1);
- const int32_t* left_offsets = left_.GetValues<int32_t>(2);
- const int32_t* right_offsets = right_.GetValues<int32_t>(2);
-
- for (int64_t i = 0; i < range_length_; ++i) {
- const auto type_id = left_codes[left_start_idx_ + i];
- if (type_id != right_codes[right_start_idx_ + i]) {
- result_ = false;
- break;
- }
- const auto child_num = child_ids[type_id];
- RangeDataEqualsImpl impl(
- options_, floating_approximate_, *left_.child_data[child_num],
- *right_.child_data[child_num], left_offsets[left_start_idx_ + i],
- right_offsets[right_start_idx_ + i], 1);
- if (!impl.Compare()) {
- result_ = false;
- break;
- }
- }
+ Status Visit(const DenseUnionType& type) {
+ const auto& child_ids = type.child_ids();
+ const int8_t* left_codes = left_.GetValues<int8_t>(1);
+ const int8_t* right_codes = right_.GetValues<int8_t>(1);
+ const int32_t* left_offsets = left_.GetValues<int32_t>(2);
+ const int32_t* right_offsets = right_.GetValues<int32_t>(2);
+
+ for (int64_t i = 0; i < range_length_; ++i) {
+ const auto type_id = left_codes[left_start_idx_ + i];
+ if (type_id != right_codes[right_start_idx_ + i]) {
+ result_ = false;
+ break;
+ }
+ const auto child_num = child_ids[type_id];
+ RangeDataEqualsImpl impl(
+ options_, floating_approximate_, *left_.child_data[child_num],
+ *right_.child_data[child_num], left_offsets[left_start_idx_ + i],
+ right_offsets[right_start_idx_ + i], 1);
+ if (!impl.Compare()) {
+ result_ = false;
+ break;
+ }
+ }
return Status::OK();
}
- Status Visit(const DictionaryType& type) {
- // Compare dictionaries
- result_ &= CompareArrayRanges(
- *left_.dictionary, *right_.dictionary,
- /*left_start_idx=*/0,
- /*left_end_idx=*/std::max(left_.dictionary->length, right_.dictionary->length),
- /*right_start_idx=*/0, options_, floating_approximate_);
- if (result_) {
- // Compare indices
- result_ &= CompareWithType(*type.index_type());
+ Status Visit(const DictionaryType& type) {
+ // Compare dictionaries
+ result_ &= CompareArrayRanges(
+ *left_.dictionary, *right_.dictionary,
+ /*left_start_idx=*/0,
+ /*left_end_idx=*/std::max(left_.dictionary->length, right_.dictionary->length),
+ /*right_start_idx=*/0, options_, floating_approximate_);
+ if (result_) {
+ // Compare indices
+ result_ &= CompareWithType(*type.index_type());
}
return Status::OK();
}
- Status Visit(const ExtensionType& type) {
- // Compare storages
- result_ &= CompareWithType(*type.storage_type());
+ Status Visit(const ExtensionType& type) {
+ // Compare storages
+ result_ &= CompareWithType(*type.storage_type());
return Status::OK();
}
protected:
- // For CompareFloating (templated local classes or lambdas not supported in C++11)
- template <typename CType>
- struct ComparatorVisitor {
- RangeDataEqualsImpl* impl;
- const CType* left_values;
- const CType* right_values;
-
- template <typename CompareFunction>
- void operator()(CompareFunction&& compare) {
- impl->VisitValues([&](int64_t i) {
- const CType x = left_values[i + impl->left_start_idx_];
- const CType y = right_values[i + impl->right_start_idx_];
- return compare(x, y);
- });
- }
- };
-
- template <typename CType>
- friend struct ComparatorVisitor;
-
- template <typename TypeClass, typename CType = typename TypeClass::c_type>
- Status ComparePrimitive(const TypeClass&) {
- const CType* left_values = left_.GetValues<CType>(1);
- const CType* right_values = right_.GetValues<CType>(1);
- VisitValidRuns([&](int64_t i, int64_t length) {
- return memcmp(left_values + left_start_idx_ + i,
- right_values + right_start_idx_ + i, length * sizeof(CType)) == 0;
- });
- return Status::OK();
- }
-
- template <typename TypeClass>
- Status CompareFloating(const TypeClass&) {
- using CType = typename TypeClass::c_type;
- const CType* left_values = left_.GetValues<CType>(1);
- const CType* right_values = right_.GetValues<CType>(1);
-
- ComparatorVisitor<CType> visitor{this, left_values, right_values};
- VisitFloatingEquality<CType>(options_, floating_approximate_, visitor);
+ // For CompareFloating (templated local classes or lambdas not supported in C++11)
+ template <typename CType>
+ struct ComparatorVisitor {
+ RangeDataEqualsImpl* impl;
+ const CType* left_values;
+ const CType* right_values;
+
+ template <typename CompareFunction>
+ void operator()(CompareFunction&& compare) {
+ impl->VisitValues([&](int64_t i) {
+ const CType x = left_values[i + impl->left_start_idx_];
+ const CType y = right_values[i + impl->right_start_idx_];
+ return compare(x, y);
+ });
+ }
+ };
+
+ template <typename CType>
+ friend struct ComparatorVisitor;
+
+ template <typename TypeClass, typename CType = typename TypeClass::c_type>
+ Status ComparePrimitive(const TypeClass&) {
+ const CType* left_values = left_.GetValues<CType>(1);
+ const CType* right_values = right_.GetValues<CType>(1);
+ VisitValidRuns([&](int64_t i, int64_t length) {
+ return memcmp(left_values + left_start_idx_ + i,
+ right_values + right_start_idx_ + i, length * sizeof(CType)) == 0;
+ });
+ return Status::OK();
+ }
+
+ template <typename TypeClass>
+ Status CompareFloating(const TypeClass&) {
+ using CType = typename TypeClass::c_type;
+ const CType* left_values = left_.GetValues<CType>(1);
+ const CType* right_values = right_.GetValues<CType>(1);
+
+ ComparatorVisitor<CType> visitor{this, left_values, right_values};
+ VisitFloatingEquality<CType>(options_, floating_approximate_, visitor);
return Status::OK();
}
- template <typename TypeClass>
- Status CompareBinary(const TypeClass&) {
- const uint8_t* left_data = left_.GetValues<uint8_t>(2, 0);
- const uint8_t* right_data = right_.GetValues<uint8_t>(2, 0);
+ template <typename TypeClass>
+ Status CompareBinary(const TypeClass&) {
+ const uint8_t* left_data = left_.GetValues<uint8_t>(2, 0);
+ const uint8_t* right_data = right_.GetValues<uint8_t>(2, 0);
- if (left_data != nullptr && right_data != nullptr) {
- const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
- int64_t length) -> bool {
- return memcmp(left_data + left_offset, right_data + right_offset, length) == 0;
- };
- CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
+ if (left_data != nullptr && right_data != nullptr) {
+ const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+ int64_t length) -> bool {
+ return memcmp(left_data + left_offset, right_data + right_offset, length) == 0;
+ };
+ CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
} else {
- // One of the arrays is an array of empty strings and nulls.
- // We just need to compare the offsets.
- // (note we must not call memcmp() with null data pointers)
- CompareWithOffsets<typename TypeClass::offset_type>(1, [](...) { return true; });
+ // One of the arrays is an array of empty strings and nulls.
+ // We just need to compare the offsets.
+ // (note we must not call memcmp() with null data pointers)
+ CompareWithOffsets<typename TypeClass::offset_type>(1, [](...) { return true; });
}
return Status::OK();
}
- template <typename TypeClass>
- Status CompareList(const TypeClass&) {
- const ArrayData& left_data = *left_.child_data[0];
- const ArrayData& right_data = *right_.child_data[0];
+ template <typename TypeClass>
+ Status CompareList(const TypeClass&) {
+ const ArrayData& left_data = *left_.child_data[0];
+ const ArrayData& right_data = *right_.child_data[0];
- const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
- int64_t length) -> bool {
- RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
- left_offset, right_offset, length);
- return impl.Compare();
- };
+ const auto compare_ranges = [&](int64_t left_offset, int64_t right_offset,
+ int64_t length) -> bool {
+ RangeDataEqualsImpl impl(options_, floating_approximate_, left_data, right_data,
+ left_offset, right_offset, length);
+ return impl.Compare();
+ };
- CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
+ CompareWithOffsets<typename TypeClass::offset_type>(1, compare_ranges);
return Status::OK();
}
- template <typename offset_type, typename CompareRanges>
- void CompareWithOffsets(int offsets_buffer_index, CompareRanges&& compare_ranges) {
- const offset_type* left_offsets =
- left_.GetValues<offset_type>(offsets_buffer_index) + left_start_idx_;
- const offset_type* right_offsets =
- right_.GetValues<offset_type>(offsets_buffer_index) + right_start_idx_;
+ template <typename offset_type, typename CompareRanges>
+ void CompareWithOffsets(int offsets_buffer_index, CompareRanges&& compare_ranges) {
+ const offset_type* left_offsets =
+ left_.GetValues<offset_type>(offsets_buffer_index) + left_start_idx_;
+ const offset_type* right_offsets =
+ right_.GetValues<offset_type>(offsets_buffer_index) + right_start_idx_;
- const auto compare_runs = [&](int64_t i, int64_t length) {
- for (int64_t j = i; j < i + length; ++j) {
- if (left_offsets[j + 1] - left_offsets[j] !=
- right_offsets[j + 1] - right_offsets[j]) {
+ const auto compare_runs = [&](int64_t i, int64_t length) {
+ for (int64_t j = i; j < i + length; ++j) {
+ if (left_offsets[j + 1] - left_offsets[j] !=
+ right_offsets[j + 1] - right_offsets[j]) {
return false;
}
}
- if (!compare_ranges(left_offsets[i], right_offsets[i],
- left_offsets[i + length] - left_offsets[i])) {
- return false;
- }
+ if (!compare_ranges(left_offsets[i], right_offsets[i],
+ left_offsets[i + length] - left_offsets[i])) {
+ return false;
+ }
return true;
- };
-
- VisitValidRuns(compare_runs);
- }
-
- template <typename CompareValues>
- void VisitValues(CompareValues&& compare_values) {
- internal::VisitSetBitRunsVoid(left_.buffers[0], left_.offset + left_start_idx_,
- range_length_, [&](int64_t position, int64_t length) {
- for (int64_t i = 0; i < length; ++i) {
- result_ &= compare_values(position + i);
- }
- });
- }
-
- // Visit and compare runs of non-null values
- template <typename CompareRuns>
- void VisitValidRuns(CompareRuns&& compare_runs) {
- const uint8_t* left_null_bitmap = left_.GetValues<uint8_t>(0, 0);
- if (left_null_bitmap == nullptr) {
- result_ = compare_runs(0, range_length_);
- return;
+ };
+
+ VisitValidRuns(compare_runs);
+ }
+
+ template <typename CompareValues>
+ void VisitValues(CompareValues&& compare_values) {
+ internal::VisitSetBitRunsVoid(left_.buffers[0], left_.offset + left_start_idx_,
+ range_length_, [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; ++i) {
+ result_ &= compare_values(position + i);
+ }
+ });
+ }
+
+ // Visit and compare runs of non-null values
+ template <typename CompareRuns>
+ void VisitValidRuns(CompareRuns&& compare_runs) {
+ const uint8_t* left_null_bitmap = left_.GetValues<uint8_t>(0, 0);
+ if (left_null_bitmap == nullptr) {
+ result_ = compare_runs(0, range_length_);
+ return;
}
- internal::SetBitRunReader reader(left_null_bitmap, left_.offset + left_start_idx_,
- range_length_);
- while (true) {
- const auto run = reader.NextRun();
- if (run.length == 0) {
- return;
+ internal::SetBitRunReader reader(left_null_bitmap, left_.offset + left_start_idx_,
+ range_length_);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ return;
}
- if (!compare_runs(run.position, run.length)) {
- result_ = false;
- return;
+ if (!compare_runs(run.position, run.length)) {
+ result_ = false;
+ return;
}
}
}
- const EqualOptions& options_;
- const bool floating_approximate_;
- const ArrayData& left_;
- const ArrayData& right_;
- const int64_t left_start_idx_;
- const int64_t right_start_idx_;
- const int64_t range_length_;
+ const EqualOptions& options_;
+ const bool floating_approximate_;
+ const ArrayData& left_;
+ const ArrayData& right_;
+ const int64_t left_start_idx_;
+ const int64_t right_start_idx_;
+ const int64_t range_length_;
- bool result_;
+ bool result_;
};
-bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
- int64_t left_start_idx, int64_t left_end_idx,
- int64_t right_start_idx, const EqualOptions& options,
- bool floating_approximate) {
- if (left.type->id() != right.type->id() ||
- !TypeEquals(*left.type, *right.type, false /* check_metadata */)) {
- return false;
+bool CompareArrayRanges(const ArrayData& left, const ArrayData& right,
+ int64_t left_start_idx, int64_t left_end_idx,
+ int64_t right_start_idx, const EqualOptions& options,
+ bool floating_approximate) {
+ if (left.type->id() != right.type->id() ||
+ !TypeEquals(*left.type, *right.type, false /* check_metadata */)) {
+ return false;
}
- const int64_t range_length = left_end_idx - left_start_idx;
- DCHECK_GE(range_length, 0);
- if (left_start_idx + range_length > left.length) {
- // Left range too small
+ const int64_t range_length = left_end_idx - left_start_idx;
+ DCHECK_GE(range_length, 0);
+ if (left_start_idx + range_length > left.length) {
+ // Left range too small
return false;
}
- if (right_start_idx + range_length > right.length) {
- // Right range too small
+ if (right_start_idx + range_length > right.length) {
+ // Right range too small
return false;
}
- if (&left == &right && left_start_idx == right_start_idx &&
- IdentityImpliesEquality(*left.type, options)) {
- return true;
+ if (&left == &right && left_start_idx == right_start_idx &&
+ IdentityImpliesEquality(*left.type, options)) {
+ return true;
}
- // Compare values
- RangeDataEqualsImpl impl(options, floating_approximate, left, right, left_start_idx,
- right_start_idx, range_length);
- return impl.Compare();
+ // Compare values
+ RangeDataEqualsImpl impl(options, floating_approximate, left, right, left_start_idx,
+ right_start_idx, range_length);
+ return impl.Compare();
}
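
The FloatingEquality/VisitFloatingEquality machinery above folds the two EqualOptions bits (approximate, NaNs-equal) into a compile-time flag, so each specialization's operator() carries no option branches inside the value loop. Below is a standalone sketch of the same dispatch pattern, simplified from the code above; it uses a C++14 generic lambda for brevity where the original sticks to C++11 and routes through a ComparatorVisitor struct instead.

#include <cmath>
#include <cstdint>
#include <iostream>

enum Flags : int8_t { kApproximate = 1, kNansEqual = 2 };

template <typename T, int8_t F>
struct Eq {  // primary template: exact comparison (F == 0)
  bool operator()(T x, T y) const { return x == y; }
};

template <typename T>
struct Eq<T, kNansEqual> {
  bool operator()(T x, T y) const {
    return x == y || (std::isnan(x) && std::isnan(y));
  }
};

template <typename T>
struct Eq<T, kApproximate> {
  T eps;
  bool operator()(T x, T y) const { return std::fabs(x - y) <= eps || x == y; }
};

template <typename T>
struct Eq<T, kApproximate | kNansEqual> {
  T eps;
  bool operator()(T x, T y) const {
    return std::fabs(x - y) <= eps || x == y || (std::isnan(x) && std::isnan(y));
  }
};

// Runtime options select one statically-typed comparator, once, up front.
template <typename T, typename Visitor>
void VisitEq(bool approx, bool nans_equal, T eps, Visitor&& visit) {
  if (nans_equal) {
    if (approx) {
      visit(Eq<T, kApproximate | kNansEqual>{eps});
    } else {
      visit(Eq<T, kNansEqual>{});
    }
  } else {
    if (approx) {
      visit(Eq<T, kApproximate>{eps});
    } else {
      visit(Eq<T, 0>{});
    }
  }
}

int main() {
  // NaN == NaN is false under exact comparison, true once kNansEqual is selected.
  VisitEq<double>(/*approx=*/false, /*nans_equal=*/true, 0.0, [](auto eq) {
    std::cout << eq(std::nan(""), std::nan("")) << "\n";  // prints 1
  });
}
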
class TypeEqualsVisitor {
@@ -611,12 +611,12 @@ class TypeEqualsVisitor {
return Status::OK();
}
- Status Visit(const Decimal256Type& left) {
- const auto& right = checked_cast<const Decimal256Type&>(right_);
- result_ = left.precision() == right.precision() && left.scale() == right.scale();
- return Status::OK();
- }
-
+ Status Visit(const Decimal256Type& left) {
+ const auto& right = checked_cast<const Decimal256Type&>(right_);
+ result_ = left.precision() == right.precision() && left.scale() == right.scale();
+ return Status::OK();
+ }
+
template <typename T>
enable_if_t<is_list_like_type<T>::value || is_struct_type<T>::value, Status> Visit(
const T& left) {
@@ -671,22 +671,22 @@ class TypeEqualsVisitor {
bool result_;
};
-bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts,
- bool floating_approximate);
-bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options,
- bool floating_approximate);
-
+bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts,
+ bool floating_approximate);
+bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options,
+ bool floating_approximate);
+
class ScalarEqualsVisitor {
public:
- // PRE-CONDITIONS:
- // - the types are equal
- // - the scalars are non-null
- explicit ScalarEqualsVisitor(const Scalar& right, const EqualOptions& opts,
- bool floating_approximate)
- : right_(right),
- options_(opts),
- floating_approximate_(floating_approximate),
- result_(false) {}
+ // PRE-CONDITIONS:
+ // - the types are equal
+ // - the scalars are non-null
+ explicit ScalarEqualsVisitor(const Scalar& right, const EqualOptions& opts,
+ bool floating_approximate)
+ : right_(right),
+ options_(opts),
+ floating_approximate_(floating_approximate),
+ result_(false) {}
Status Visit(const NullScalar& left) {
result_ = true;
@@ -700,8 +700,8 @@ class ScalarEqualsVisitor {
}
template <typename T>
- typename std::enable_if<(is_primitive_ctype<typename T::TypeClass>::value ||
- is_temporal_type<typename T::TypeClass>::value),
+ typename std::enable_if<(is_primitive_ctype<typename T::TypeClass>::value ||
+ is_temporal_type<typename T::TypeClass>::value),
Status>::type
Visit(const T& left_) {
const auto& right = checked_cast<const T&>(right_);
@@ -709,10 +709,10 @@ class ScalarEqualsVisitor {
return Status::OK();
}
- Status Visit(const FloatScalar& left) { return CompareFloating(left); }
-
- Status Visit(const DoubleScalar& left) { return CompareFloating(left); }
-
+ Status Visit(const FloatScalar& left) { return CompareFloating(left); }
+
+ Status Visit(const DoubleScalar& left) { return CompareFloating(left); }
+
template <typename T>
typename std::enable_if<std::is_base_of<BaseBinaryScalar, T>::value, Status>::type
Visit(const T& left) {
@@ -727,33 +727,33 @@ class ScalarEqualsVisitor {
return Status::OK();
}
- Status Visit(const Decimal256Scalar& left) {
- const auto& right = checked_cast<const Decimal256Scalar&>(right_);
- result_ = left.value == right.value;
- return Status::OK();
- }
-
+ Status Visit(const Decimal256Scalar& left) {
+ const auto& right = checked_cast<const Decimal256Scalar&>(right_);
+ result_ = left.value == right.value;
+ return Status::OK();
+ }
+
Status Visit(const ListScalar& left) {
const auto& right = checked_cast<const ListScalar&>(right_);
- result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
+ result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
return Status::OK();
}
Status Visit(const LargeListScalar& left) {
const auto& right = checked_cast<const LargeListScalar&>(right_);
- result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
+ result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
return Status::OK();
}
Status Visit(const MapScalar& left) {
const auto& right = checked_cast<const MapScalar&>(right_);
- result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
+ result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
return Status::OK();
}
Status Visit(const FixedSizeListScalar& left) {
const auto& right = checked_cast<const FixedSizeListScalar&>(right_);
- result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
+ result_ = ArrayEquals(*left.value, *right.value, options_, floating_approximate_);
return Status::OK();
}
@@ -765,8 +765,8 @@ class ScalarEqualsVisitor {
} else {
bool all_equals = true;
for (size_t i = 0; i < left.value.size() && all_equals; i++) {
- all_equals &= ScalarEquals(*left.value[i], *right.value[i], options_,
- floating_approximate_);
+ all_equals &= ScalarEquals(*left.value[i], *right.value[i], options_,
+ floating_approximate_);
}
result_ = all_equals;
}
@@ -777,7 +777,7 @@ class ScalarEqualsVisitor {
Status Visit(const UnionScalar& left) {
const auto& right = checked_cast<const UnionScalar&>(right_);
if (left.is_valid && right.is_valid) {
- result_ = ScalarEquals(*left.value, *right.value, options_, floating_approximate_);
+ result_ = ScalarEquals(*left.value, *right.value, options_, floating_approximate_);
} else if (!left.is_valid && !right.is_valid) {
result_ = true;
} else {
@@ -788,10 +788,10 @@ class ScalarEqualsVisitor {
Status Visit(const DictionaryScalar& left) {
const auto& right = checked_cast<const DictionaryScalar&>(right_);
- result_ = ScalarEquals(*left.value.index, *right.value.index, options_,
- floating_approximate_) &&
- ArrayEquals(*left.value.dictionary, *right.value.dictionary, options_,
- floating_approximate_);
+ result_ = ScalarEquals(*left.value.index, *right.value.index, options_,
+ floating_approximate_) &&
+ ArrayEquals(*left.value.dictionary, *right.value.dictionary, options_,
+ floating_approximate_);
return Status::OK();
}
@@ -802,40 +802,40 @@ class ScalarEqualsVisitor {
bool result() const { return result_; }
protected:
- // For CompareFloating (templated local classes or lambdas not supported in C++11)
- template <typename ScalarType>
- struct ComparatorVisitor {
- const ScalarType& left;
- const ScalarType& right;
- bool* result;
-
- template <typename CompareFunction>
- void operator()(CompareFunction&& compare) {
- *result = compare(left.value, right.value);
- }
- };
-
- template <typename ScalarType>
- Status CompareFloating(const ScalarType& left) {
- using CType = decltype(left.value);
-
- ComparatorVisitor<ScalarType> visitor{left, checked_cast<const ScalarType&>(right_),
- &result_};
- VisitFloatingEquality<CType>(options_, floating_approximate_, visitor);
- return Status::OK();
- }
-
+ // For CompareFloating (templated local classes or lambdas not supported in C++11)
+ template <typename ScalarType>
+ struct ComparatorVisitor {
+ const ScalarType& left;
+ const ScalarType& right;
+ bool* result;
+
+ template <typename CompareFunction>
+ void operator()(CompareFunction&& compare) {
+ *result = compare(left.value, right.value);
+ }
+ };
+
+ template <typename ScalarType>
+ Status CompareFloating(const ScalarType& left) {
+ using CType = decltype(left.value);
+
+ ComparatorVisitor<ScalarType> visitor{left, checked_cast<const ScalarType&>(right_),
+ &result_};
+ VisitFloatingEquality<CType>(options_, floating_approximate_, visitor);
+ return Status::OK();
+ }
+
const Scalar& right_;
- const EqualOptions options_;
- const bool floating_approximate_;
+ const EqualOptions options_;
+ const bool floating_approximate_;
bool result_;
};
-Status PrintDiff(const Array& left, const Array& right, std::ostream* os);
-
-Status PrintDiff(const Array& left, const Array& right, int64_t left_offset,
- int64_t left_length, int64_t right_offset, int64_t right_length,
- std::ostream* os) {
+Status PrintDiff(const Array& left, const Array& right, std::ostream* os);
+
+Status PrintDiff(const Array& left, const Array& right, int64_t left_offset,
+ int64_t left_length, int64_t right_offset, int64_t right_length,
+ std::ostream* os) {
if (os == nullptr) {
return Status::OK();
}
@@ -868,100 +868,100 @@ Status PrintDiff(const Array& left, const Array& right, int64_t left_offset,
return Status::OK();
}
- const auto left_slice = left.Slice(left_offset, left_length);
- const auto right_slice = right.Slice(right_offset, right_length);
- ARROW_ASSIGN_OR_RAISE(auto edits,
- Diff(*left_slice, *right_slice, default_memory_pool()));
+ const auto left_slice = left.Slice(left_offset, left_length);
+ const auto right_slice = right.Slice(right_offset, right_length);
+ ARROW_ASSIGN_OR_RAISE(auto edits,
+ Diff(*left_slice, *right_slice, default_memory_pool()));
ARROW_ASSIGN_OR_RAISE(auto formatter, MakeUnifiedDiffFormatter(*left.type(), os));
- return formatter(*edits, *left_slice, *right_slice);
+ return formatter(*edits, *left_slice, *right_slice);
}
-Status PrintDiff(const Array& left, const Array& right, std::ostream* os) {
- return PrintDiff(left, right, 0, left.length(), 0, right.length(), os);
-}
+Status PrintDiff(const Array& left, const Array& right, std::ostream* os) {
+ return PrintDiff(left, right, 0, left.length(), 0, right.length(), os);
+}
-bool ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx,
- int64_t left_end_idx, int64_t right_start_idx,
- const EqualOptions& options, bool floating_approximate) {
- bool are_equal =
- CompareArrayRanges(*left.data(), *right.data(), left_start_idx, left_end_idx,
- right_start_idx, options, floating_approximate);
+bool ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx,
+ int64_t left_end_idx, int64_t right_start_idx,
+ const EqualOptions& options, bool floating_approximate) {
+ bool are_equal =
+ CompareArrayRanges(*left.data(), *right.data(), left_start_idx, left_end_idx,
+ right_start_idx, options, floating_approximate);
if (!are_equal) {
- ARROW_IGNORE_EXPR(PrintDiff(
- left, right, left_start_idx, left_end_idx, right_start_idx,
- right_start_idx + (left_end_idx - left_start_idx), options.diff_sink()));
+ ARROW_IGNORE_EXPR(PrintDiff(
+ left, right, left_start_idx, left_end_idx, right_start_idx,
+ right_start_idx + (left_end_idx - left_start_idx), options.diff_sink()));
}
return are_equal;
}
-bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts,
- bool floating_approximate) {
- if (left.length() != right.length()) {
- ARROW_IGNORE_EXPR(PrintDiff(left, right, opts.diff_sink()));
- return false;
+bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts,
+ bool floating_approximate) {
+ if (left.length() != right.length()) {
+ ARROW_IGNORE_EXPR(PrintDiff(left, right, opts.diff_sink()));
+ return false;
+ }
+ return ArrayRangeEquals(left, right, 0, left.length(), 0, opts, floating_approximate);
+}
+
+bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options,
+ bool floating_approximate) {
+ if (&left == &right && IdentityImpliesEquality(*left.type, options)) {
+ return true;
+ }
+ if (!left.type->Equals(right.type)) {
+ return false;
+ }
+ if (left.is_valid != right.is_valid) {
+ return false;
+ }
+ if (!left.is_valid) {
+ return true;
}
- return ArrayRangeEquals(left, right, 0, left.length(), 0, opts, floating_approximate);
+ ScalarEqualsVisitor visitor(right, options, floating_approximate);
+ auto error = VisitScalarInline(left, &visitor);
+ DCHECK_OK(error);
+ return visitor.result();
+}
+
+} // namespace
+
+bool ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx,
+ int64_t left_end_idx, int64_t right_start_idx,
+ const EqualOptions& options) {
+ const bool floating_approximate = false;
+ return ArrayRangeEquals(left, right, left_start_idx, left_end_idx, right_start_idx,
+ options, floating_approximate);
+}
+
+bool ArrayRangeApproxEquals(const Array& left, const Array& right, int64_t left_start_idx,
+ int64_t left_end_idx, int64_t right_start_idx,
+ const EqualOptions& options) {
+ const bool floating_approximate = true;
+ return ArrayRangeEquals(left, right, left_start_idx, left_end_idx, right_start_idx,
+ options, floating_approximate);
+}
+
+bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts) {
+ const bool floating_approximate = false;
+ return ArrayEquals(left, right, opts, floating_approximate);
+}
+
+bool ArrayApproxEquals(const Array& left, const Array& right, const EqualOptions& opts) {
+ const bool floating_approximate = true;
+ return ArrayEquals(left, right, opts, floating_approximate);
+}
+
+bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options) {
+ const bool floating_approximate = false;
+ return ScalarEquals(left, right, options, floating_approximate);
}
-bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options,
- bool floating_approximate) {
- if (&left == &right && IdentityImpliesEquality(*left.type, options)) {
- return true;
- }
- if (!left.type->Equals(right.type)) {
- return false;
- }
- if (left.is_valid != right.is_valid) {
- return false;
- }
- if (!left.is_valid) {
- return true;
- }
- ScalarEqualsVisitor visitor(right, options, floating_approximate);
- auto error = VisitScalarInline(left, &visitor);
- DCHECK_OK(error);
- return visitor.result();
+bool ScalarApproxEquals(const Scalar& left, const Scalar& right,
+ const EqualOptions& options) {
+ const bool floating_approximate = true;
+ return ScalarEquals(left, right, options, floating_approximate);
}
-} // namespace
-
-bool ArrayRangeEquals(const Array& left, const Array& right, int64_t left_start_idx,
- int64_t left_end_idx, int64_t right_start_idx,
- const EqualOptions& options) {
- const bool floating_approximate = false;
- return ArrayRangeEquals(left, right, left_start_idx, left_end_idx, right_start_idx,
- options, floating_approximate);
-}
-
-bool ArrayRangeApproxEquals(const Array& left, const Array& right, int64_t left_start_idx,
- int64_t left_end_idx, int64_t right_start_idx,
- const EqualOptions& options) {
- const bool floating_approximate = true;
- return ArrayRangeEquals(left, right, left_start_idx, left_end_idx, right_start_idx,
- options, floating_approximate);
-}
-
-bool ArrayEquals(const Array& left, const Array& right, const EqualOptions& opts) {
- const bool floating_approximate = false;
- return ArrayEquals(left, right, opts, floating_approximate);
-}
-
-bool ArrayApproxEquals(const Array& left, const Array& right, const EqualOptions& opts) {
- const bool floating_approximate = true;
- return ArrayEquals(left, right, opts, floating_approximate);
-}
-
-bool ScalarEquals(const Scalar& left, const Scalar& right, const EqualOptions& options) {
- const bool floating_approximate = false;
- return ScalarEquals(left, right, options, floating_approximate);
-}
-
-bool ScalarApproxEquals(const Scalar& left, const Scalar& right,
- const EqualOptions& options) {
- const bool floating_approximate = true;
- return ScalarEquals(left, right, options, floating_approximate);
-}
-
namespace {
bool StridedIntegerTensorContentEquals(const int dim_index, int64_t left_offset,
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compare.h b/contrib/libs/apache/arrow/cpp/src/arrow/compare.h
index 3acd6b1b33e..6769b23867b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compare.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compare.h
@@ -71,7 +71,7 @@ class EqualOptions {
return res;
}
- static EqualOptions Defaults() { return {}; }
+ static EqualOptions Defaults() { return {}; }
protected:
double atol_ = kDefaultAbsoluteTolerance;
@@ -88,25 +88,25 @@ bool ARROW_EXPORT ArrayEquals(const Array& left, const Array& right,
bool ARROW_EXPORT ArrayApproxEquals(const Array& left, const Array& right,
const EqualOptions& = EqualOptions::Defaults());
-/// Returns true if indicated equal-length segment of arrays are exactly equal
+/// Returns true if indicated equal-length segment of arrays are exactly equal
bool ARROW_EXPORT ArrayRangeEquals(const Array& left, const Array& right,
int64_t start_idx, int64_t end_idx,
- int64_t other_start_idx,
- const EqualOptions& = EqualOptions::Defaults());
-
-/// Returns true if indicated equal-length segment of arrays are approximately equal
-bool ARROW_EXPORT ArrayRangeApproxEquals(const Array& left, const Array& right,
- int64_t start_idx, int64_t end_idx,
- int64_t other_start_idx,
- const EqualOptions& = EqualOptions::Defaults());
-
-bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right,
- const EqualOptions& = EqualOptions::Defaults());
-
-/// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal
-bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, const SparseTensor& right,
- const EqualOptions& = EqualOptions::Defaults());
-
+ int64_t other_start_idx,
+ const EqualOptions& = EqualOptions::Defaults());
+
+/// Returns true if indicated equal-length segment of arrays are approximately equal
+bool ARROW_EXPORT ArrayRangeApproxEquals(const Array& left, const Array& right,
+ int64_t start_idx, int64_t end_idx,
+ int64_t other_start_idx,
+ const EqualOptions& = EqualOptions::Defaults());
+
+bool ARROW_EXPORT TensorEquals(const Tensor& left, const Tensor& right,
+ const EqualOptions& = EqualOptions::Defaults());
+
+/// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal
+bool ARROW_EXPORT SparseTensorEquals(const SparseTensor& left, const SparseTensor& right,
+ const EqualOptions& = EqualOptions::Defaults());
+
/// Returns true if the type metadata are exactly equal
/// \param[in] left a DataType
/// \param[in] right a DataType
@@ -122,12 +122,12 @@ bool ARROW_EXPORT TypeEquals(const DataType& left, const DataType& right,
bool ARROW_EXPORT ScalarEquals(const Scalar& left, const Scalar& right,
const EqualOptions& options = EqualOptions::Defaults());
-/// Returns true if scalars are approximately equal
-/// \param[in] left a Scalar
-/// \param[in] right a Scalar
-/// \param[in] options comparison options
-bool ARROW_EXPORT
-ScalarApproxEquals(const Scalar& left, const Scalar& right,
- const EqualOptions& options = EqualOptions::Defaults());
-
+/// Returns true if scalars are approximately equal
+/// \param[in] left a Scalar
+/// \param[in] right a Scalar
+/// \param[in] options comparison options
+bool ARROW_EXPORT
+ScalarApproxEquals(const Scalar& left, const Scalar& right,
+ const EqualOptions& options = EqualOptions::Defaults());
+
} // namespace arrow
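
A usage sketch of the exact vs. approximate entry points declared above; per compare.cc, the Approx variants forward to the same comparison core with floating_approximate = true. The builder construction and the Demo wrapper are illustrative assumptions.

#include <iostream>

#include "arrow/api.h"
#include "arrow/compare.h"

arrow::Status Demo() {
  arrow::DoubleBuilder b1, b2;
  ARROW_RETURN_NOT_OK(b1.AppendValues({1.0, 2.0}));
  ARROW_RETURN_NOT_OK(b2.AppendValues({1.0, 2.0 + 1e-7}));
  ARROW_ASSIGN_OR_RAISE(auto left, b1.Finish());
  ARROW_ASSIGN_OR_RAISE(auto right, b2.Finish());

  auto opts = arrow::EqualOptions::Defaults().atol(1e-5);
  // Exact comparison ignores atol; the approximate variant honors it.
  std::cout << arrow::ArrayEquals(*left, *right, opts) << "\n";        // 0
  std::cout << arrow::ArrayApproxEquals(*left, *right, opts) << "\n";  // 1

  arrow::DoubleScalar s1(1.0), s2(1.0 + 1e-7);
  std::cout << arrow::ScalarApproxEquals(s1, s2, opts) << "\n";        // 1
  return arrow::Status::OK();
}
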
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc
index 2f26520c22a..1b00c366bfd 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.cc
@@ -18,157 +18,157 @@
#include "arrow/compute/api_aggregate.h"
#include "arrow/compute/exec.h"
-#include "arrow/compute/function_internal.h"
-#include "arrow/compute/registry.h"
-#include "arrow/compute/util_internal.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/logging.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/registry.h"
+#include "arrow/compute/util_internal.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
namespace arrow {
-
-namespace internal {
-template <>
-struct EnumTraits<compute::QuantileOptions::Interpolation>
- : BasicEnumTraits<compute::QuantileOptions::Interpolation,
- compute::QuantileOptions::LINEAR, compute::QuantileOptions::LOWER,
- compute::QuantileOptions::HIGHER, compute::QuantileOptions::NEAREST,
- compute::QuantileOptions::MIDPOINT> {
- static std::string name() { return "QuantileOptions::Interpolation"; }
- static std::string value_name(compute::QuantileOptions::Interpolation value) {
- switch (value) {
- case compute::QuantileOptions::LINEAR:
- return "LINEAR";
- case compute::QuantileOptions::LOWER:
- return "LOWER";
- case compute::QuantileOptions::HIGHER:
- return "HIGHER";
- case compute::QuantileOptions::NEAREST:
- return "NEAREST";
- case compute::QuantileOptions::MIDPOINT:
- return "MIDPOINT";
- }
- return "<INVALID>";
- }
-};
-} // namespace internal
-
+
+namespace internal {
+template <>
+struct EnumTraits<compute::QuantileOptions::Interpolation>
+ : BasicEnumTraits<compute::QuantileOptions::Interpolation,
+ compute::QuantileOptions::LINEAR, compute::QuantileOptions::LOWER,
+ compute::QuantileOptions::HIGHER, compute::QuantileOptions::NEAREST,
+ compute::QuantileOptions::MIDPOINT> {
+ static std::string name() { return "QuantileOptions::Interpolation"; }
+ static std::string value_name(compute::QuantileOptions::Interpolation value) {
+ switch (value) {
+ case compute::QuantileOptions::LINEAR:
+ return "LINEAR";
+ case compute::QuantileOptions::LOWER:
+ return "LOWER";
+ case compute::QuantileOptions::HIGHER:
+ return "HIGHER";
+ case compute::QuantileOptions::NEAREST:
+ return "NEAREST";
+ case compute::QuantileOptions::MIDPOINT:
+ return "MIDPOINT";
+ }
+ return "<INVALID>";
+ }
+};
+} // namespace internal
+
namespace compute {
// ----------------------------------------------------------------------
-// Function options
-
-using ::arrow::internal::checked_cast;
-
-namespace internal {
-namespace {
-using ::arrow::internal::DataMember;
-static auto kScalarAggregateOptionsType = GetFunctionOptionsType<ScalarAggregateOptions>(
- DataMember("skip_nulls", &ScalarAggregateOptions::skip_nulls),
- DataMember("min_count", &ScalarAggregateOptions::min_count));
-static auto kModeOptionsType =
- GetFunctionOptionsType<ModeOptions>(DataMember("n", &ModeOptions::n));
-static auto kVarianceOptionsType =
- GetFunctionOptionsType<VarianceOptions>(DataMember("ddof", &VarianceOptions::ddof));
-static auto kQuantileOptionsType = GetFunctionOptionsType<QuantileOptions>(
- DataMember("q", &QuantileOptions::q),
- DataMember("interpolation", &QuantileOptions::interpolation));
-static auto kTDigestOptionsType = GetFunctionOptionsType<TDigestOptions>(
- DataMember("q", &TDigestOptions::q), DataMember("delta", &TDigestOptions::delta),
- DataMember("buffer_size", &TDigestOptions::buffer_size));
-static auto kIndexOptionsType =
- GetFunctionOptionsType<IndexOptions>(DataMember("value", &IndexOptions::value));
-} // namespace
-} // namespace internal
-
-ScalarAggregateOptions::ScalarAggregateOptions(bool skip_nulls, uint32_t min_count)
- : FunctionOptions(internal::kScalarAggregateOptionsType),
- skip_nulls(skip_nulls),
- min_count(min_count) {}
-constexpr char ScalarAggregateOptions::kTypeName[];
-
-ModeOptions::ModeOptions(int64_t n) : FunctionOptions(internal::kModeOptionsType), n(n) {}
-constexpr char ModeOptions::kTypeName[];
-
-VarianceOptions::VarianceOptions(int ddof)
- : FunctionOptions(internal::kVarianceOptionsType), ddof(ddof) {}
-constexpr char VarianceOptions::kTypeName[];
-
-QuantileOptions::QuantileOptions(double q, enum Interpolation interpolation)
- : FunctionOptions(internal::kQuantileOptionsType),
- q{q},
- interpolation{interpolation} {}
-QuantileOptions::QuantileOptions(std::vector<double> q, enum Interpolation interpolation)
- : FunctionOptions(internal::kQuantileOptionsType),
- q{std::move(q)},
- interpolation{interpolation} {}
-constexpr char QuantileOptions::kTypeName[];
-
-TDigestOptions::TDigestOptions(double q, uint32_t delta, uint32_t buffer_size)
- : FunctionOptions(internal::kTDigestOptionsType),
- q{q},
- delta{delta},
- buffer_size{buffer_size} {}
-TDigestOptions::TDigestOptions(std::vector<double> q, uint32_t delta,
- uint32_t buffer_size)
- : FunctionOptions(internal::kTDigestOptionsType),
- q{std::move(q)},
- delta{delta},
- buffer_size{buffer_size} {}
-constexpr char TDigestOptions::kTypeName[];
-
-IndexOptions::IndexOptions(std::shared_ptr<Scalar> value)
- : FunctionOptions(internal::kIndexOptionsType), value{std::move(value)} {}
-IndexOptions::IndexOptions() : IndexOptions(std::make_shared<NullScalar>()) {}
-constexpr char IndexOptions::kTypeName[];
-
-namespace internal {
-void RegisterAggregateOptions(FunctionRegistry* registry) {
- DCHECK_OK(registry->AddFunctionOptionsType(kScalarAggregateOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kModeOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kVarianceOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kQuantileOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kTDigestOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kIndexOptionsType));
-}
-} // namespace internal
-
-// ----------------------------------------------------------------------
+// Function options
+
+using ::arrow::internal::checked_cast;
+
+namespace internal {
+namespace {
+using ::arrow::internal::DataMember;
+static auto kScalarAggregateOptionsType = GetFunctionOptionsType<ScalarAggregateOptions>(
+ DataMember("skip_nulls", &ScalarAggregateOptions::skip_nulls),
+ DataMember("min_count", &ScalarAggregateOptions::min_count));
+static auto kModeOptionsType =
+ GetFunctionOptionsType<ModeOptions>(DataMember("n", &ModeOptions::n));
+static auto kVarianceOptionsType =
+ GetFunctionOptionsType<VarianceOptions>(DataMember("ddof", &VarianceOptions::ddof));
+static auto kQuantileOptionsType = GetFunctionOptionsType<QuantileOptions>(
+ DataMember("q", &QuantileOptions::q),
+ DataMember("interpolation", &QuantileOptions::interpolation));
+static auto kTDigestOptionsType = GetFunctionOptionsType<TDigestOptions>(
+ DataMember("q", &TDigestOptions::q), DataMember("delta", &TDigestOptions::delta),
+ DataMember("buffer_size", &TDigestOptions::buffer_size));
+static auto kIndexOptionsType =
+ GetFunctionOptionsType<IndexOptions>(DataMember("value", &IndexOptions::value));
+} // namespace
+} // namespace internal
+
+ScalarAggregateOptions::ScalarAggregateOptions(bool skip_nulls, uint32_t min_count)
+ : FunctionOptions(internal::kScalarAggregateOptionsType),
+ skip_nulls(skip_nulls),
+ min_count(min_count) {}
+constexpr char ScalarAggregateOptions::kTypeName[];
+
+ModeOptions::ModeOptions(int64_t n) : FunctionOptions(internal::kModeOptionsType), n(n) {}
+constexpr char ModeOptions::kTypeName[];
+
+VarianceOptions::VarianceOptions(int ddof)
+ : FunctionOptions(internal::kVarianceOptionsType), ddof(ddof) {}
+constexpr char VarianceOptions::kTypeName[];
+
+QuantileOptions::QuantileOptions(double q, enum Interpolation interpolation)
+ : FunctionOptions(internal::kQuantileOptionsType),
+ q{q},
+ interpolation{interpolation} {}
+QuantileOptions::QuantileOptions(std::vector<double> q, enum Interpolation interpolation)
+ : FunctionOptions(internal::kQuantileOptionsType),
+ q{std::move(q)},
+ interpolation{interpolation} {}
+constexpr char QuantileOptions::kTypeName[];
+
+TDigestOptions::TDigestOptions(double q, uint32_t delta, uint32_t buffer_size)
+ : FunctionOptions(internal::kTDigestOptionsType),
+ q{q},
+ delta{delta},
+ buffer_size{buffer_size} {}
+TDigestOptions::TDigestOptions(std::vector<double> q, uint32_t delta,
+ uint32_t buffer_size)
+ : FunctionOptions(internal::kTDigestOptionsType),
+ q{std::move(q)},
+ delta{delta},
+ buffer_size{buffer_size} {}
+constexpr char TDigestOptions::kTypeName[];
+
+IndexOptions::IndexOptions(std::shared_ptr<Scalar> value)
+ : FunctionOptions(internal::kIndexOptionsType), value{std::move(value)} {}
+IndexOptions::IndexOptions() : IndexOptions(std::make_shared<NullScalar>()) {}
+constexpr char IndexOptions::kTypeName[];
+
+namespace internal {
+void RegisterAggregateOptions(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunctionOptionsType(kScalarAggregateOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kModeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kVarianceOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kQuantileOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kTDigestOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kIndexOptionsType));
+}
+} // namespace internal
+
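
The DataMember descriptors above are what make these option types self-describing: generic equality, printing, and serialization are derived from the registered member list instead of being hand-written per class. A minimal sketch of what that buys callers (the printed format is illustrative, not verbatim):

    #include <iostream>
    #include <arrow/compute/api_aggregate.h>

    int main() {
      arrow::compute::ScalarAggregateOptions a(/*skip_nulls=*/true, /*min_count=*/1);
      arrow::compute::ScalarAggregateOptions b(/*skip_nulls=*/false, /*min_count=*/3);
      // ToString() and Equals() are generated from the DataMember list above.
      std::cout << a.ToString() << "\n";  // e.g. "ScalarAggregateOptions(skip_nulls=true, min_count=1)"
      std::cout << a.Equals(b) << "\n";   // 0: the members differ
      return 0;
    }
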
+// ----------------------------------------------------------------------
// Scalar aggregates
-Result<Datum> Count(const Datum& value, const ScalarAggregateOptions& options,
- ExecContext* ctx) {
+Result<Datum> Count(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
return CallFunction("count", {value}, &options, ctx);
}
-Result<Datum> Mean(const Datum& value, const ScalarAggregateOptions& options,
- ExecContext* ctx) {
- return CallFunction("mean", {value}, &options, ctx);
+Result<Datum> Mean(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("mean", {value}, &options, ctx);
}
-Result<Datum> Sum(const Datum& value, const ScalarAggregateOptions& options,
- ExecContext* ctx) {
- return CallFunction("sum", {value}, &options, ctx);
+Result<Datum> Sum(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("sum", {value}, &options, ctx);
}
-Result<Datum> MinMax(const Datum& value, const ScalarAggregateOptions& options,
- ExecContext* ctx) {
+Result<Datum> MinMax(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
return CallFunction("min_max", {value}, &options, ctx);
}
-Result<Datum> Any(const Datum& value, const ScalarAggregateOptions& options,
- ExecContext* ctx) {
- return CallFunction("any", {value}, &options, ctx);
+Result<Datum> Any(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("any", {value}, &options, ctx);
+}
+
+Result<Datum> All(const Datum& value, const ScalarAggregateOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("all", {value}, &options, ctx);
+}
+
+Result<Datum> Mode(const Datum& value, const ModeOptions& options, ExecContext* ctx) {
+ return CallFunction("mode", {value}, &options, ctx);
}
-Result<Datum> All(const Datum& value, const ScalarAggregateOptions& options,
- ExecContext* ctx) {
- return CallFunction("all", {value}, &options, ctx);
-}
-
-Result<Datum> Mode(const Datum& value, const ModeOptions& options, ExecContext* ctx) {
- return CallFunction("mode", {value}, &options, ctx);
-}
-
Result<Datum> Stddev(const Datum& value, const VarianceOptions& options,
ExecContext* ctx) {
return CallFunction("stddev", {value}, &options, ctx);
@@ -179,19 +179,19 @@ Result<Datum> Variance(const Datum& value, const VarianceOptions& options,
return CallFunction("variance", {value}, &options, ctx);
}
-Result<Datum> Quantile(const Datum& value, const QuantileOptions& options,
- ExecContext* ctx) {
- return CallFunction("quantile", {value}, &options, ctx);
-}
-
-Result<Datum> TDigest(const Datum& value, const TDigestOptions& options,
- ExecContext* ctx) {
- return CallFunction("tdigest", {value}, &options, ctx);
-}
-
-Result<Datum> Index(const Datum& value, const IndexOptions& options, ExecContext* ctx) {
- return CallFunction("index", {value}, &options, ctx);
-}
-
+Result<Datum> Quantile(const Datum& value, const QuantileOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("quantile", {value}, &options, ctx);
+}
+
+Result<Datum> TDigest(const Datum& value, const TDigestOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("tdigest", {value}, &options, ctx);
+}
+
+Result<Datum> Index(const Datum& value, const IndexOptions& options, ExecContext* ctx) {
+ return CallFunction("index", {value}, &options, ctx);
+}
+
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h
index 37296779b2f..7a6c44bd923 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_aggregate.h
@@ -40,108 +40,108 @@ class ExecContext;
/// \addtogroup compute-concrete-options
/// @{
-/// \brief Control general scalar aggregate kernel behavior
-///
-/// By default, null values are ignored
-class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
- public:
- explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
- constexpr static char const kTypeName[] = "ScalarAggregateOptions";
- static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
-
- bool skip_nulls;
- uint32_t min_count;
-};
-
-/// \brief Control Mode kernel behavior
-///
-/// Returns top-n common values and counts.
-/// By default, returns the most common value and count.
-class ARROW_EXPORT ModeOptions : public FunctionOptions {
- public:
- explicit ModeOptions(int64_t n = 1);
- constexpr static char const kTypeName[] = "ModeOptions";
- static ModeOptions Defaults() { return ModeOptions{}; }
-
- int64_t n = 1;
+/// \brief Control general scalar aggregate kernel behavior
+///
+/// By default, null values are ignored
+class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
+ public:
+ explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
+ constexpr static char const kTypeName[] = "ScalarAggregateOptions";
+ static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
+
+ bool skip_nulls;
+ uint32_t min_count;
};
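
For illustration, a minimal sketch of how skip_nulls and min_count interact with an aggregation; the data values are invented:

    #include <iostream>
    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    int main() {
      arrow::Int64Builder builder;
      (void)builder.AppendValues({1, 2, 3});
      (void)builder.AppendNull();
      auto array = builder.Finish().ValueOrDie();

      // min_count = 5 exceeds the number of non-null values (3),
      // so Sum yields a null scalar instead of 6.
      arrow::compute::ScalarAggregateOptions opts(/*skip_nulls=*/true, /*min_count=*/5);
      auto sum = arrow::compute::Sum(array, opts).ValueOrDie();
      std::cout << sum.scalar()->ToString() << "\n";  // prints a null scalar
      return 0;
    }
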
-/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
-///
-/// The divisor used in calculations is N - ddof, where N is the number of elements.
-/// By default, ddof is zero, and population variance or stddev is returned.
-class ARROW_EXPORT VarianceOptions : public FunctionOptions {
- public:
- explicit VarianceOptions(int ddof = 0);
- constexpr static char const kTypeName[] = "VarianceOptions";
- static VarianceOptions Defaults() { return VarianceOptions{}; }
-
- int ddof = 0;
-};
-
-/// \brief Control Quantile kernel behavior
-///
-/// By default, returns the median value.
-class ARROW_EXPORT QuantileOptions : public FunctionOptions {
- public:
- /// Interpolation method to use when quantile lies between two data points
- enum Interpolation {
- LINEAR = 0,
- LOWER,
- HIGHER,
- NEAREST,
- MIDPOINT,
+/// \brief Control Mode kernel behavior
+///
+/// Returns top-n common values and counts.
+/// By default, returns the most common value and count.
+class ARROW_EXPORT ModeOptions : public FunctionOptions {
+ public:
+ explicit ModeOptions(int64_t n = 1);
+ constexpr static char const kTypeName[] = "ModeOptions";
+ static ModeOptions Defaults() { return ModeOptions{}; }
+
+ int64_t n = 1;
+};
+
+/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
+///
+/// The divisor used in calculations is N - ddof, where N is the number of elements.
+/// By default, ddof is zero, and population variance or stddev is returned.
+class ARROW_EXPORT VarianceOptions : public FunctionOptions {
+ public:
+ explicit VarianceOptions(int ddof = 0);
+ constexpr static char const kTypeName[] = "VarianceOptions";
+ static VarianceOptions Defaults() { return VarianceOptions{}; }
+
+ int ddof = 0;
+};
+
+/// \brief Control Quantile kernel behavior
+///
+/// By default, returns the median value.
+class ARROW_EXPORT QuantileOptions : public FunctionOptions {
+ public:
+ /// Interpolation method to use when quantile lies between two data points
+ enum Interpolation {
+ LINEAR = 0,
+ LOWER,
+ HIGHER,
+ NEAREST,
+ MIDPOINT,
};
- explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR);
+ explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR);
+
+ explicit QuantileOptions(std::vector<double> q,
+ enum Interpolation interpolation = LINEAR);
+
+ constexpr static char const kTypeName[] = "QuantileOptions";
+ static QuantileOptions Defaults() { return QuantileOptions{}; }
- explicit QuantileOptions(std::vector<double> q,
- enum Interpolation interpolation = LINEAR);
+ /// quantile must be between 0 and 1 inclusive
+ std::vector<double> q;
+ enum Interpolation interpolation;
+};
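
A sketch of requesting several quantiles in one call with a non-default interpolation; input values are invented and the printed result is what MIDPOINT should produce for them:

    #include <iostream>
    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    int main() {
      arrow::DoubleBuilder builder;
      (void)builder.AppendValues({1.0, 2.0, 3.0, 4.0});
      auto array = builder.Finish().ValueOrDie();

      // Three quantiles at once; MIDPOINT averages the two bracketing values.
      arrow::compute::QuantileOptions opts(
          std::vector<double>{0.25, 0.5, 0.75},
          arrow::compute::QuantileOptions::MIDPOINT);
      auto result = arrow::compute::Quantile(array, opts).ValueOrDie();
      std::cout << result.make_array()->ToString() << "\n";  // [1.5, 2.5, 3.5]
      return 0;
    }
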
+
+/// \brief Control TDigest approximate quantile kernel behavior
+///
+/// By default, returns the median value.
+class ARROW_EXPORT TDigestOptions : public FunctionOptions {
+ public:
+ explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
+ uint32_t buffer_size = 500);
+ explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
+ uint32_t buffer_size = 500);
+ constexpr static char const kTypeName[] = "TDigestOptions";
+ static TDigestOptions Defaults() { return TDigestOptions{}; }
- constexpr static char const kTypeName[] = "QuantileOptions";
- static QuantileOptions Defaults() { return QuantileOptions{}; }
-
- /// quantile must be between 0 and 1 inclusive
- std::vector<double> q;
- enum Interpolation interpolation;
+ /// quantile must be between 0 and 1 inclusive
+ std::vector<double> q;
+ /// compression parameter, default 100
+ uint32_t delta;
+ /// input buffer size, default 500
+ uint32_t buffer_size;
};
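
A sketch of tuning the t-digest: a larger delta keeps more centroids (better accuracy, more memory), while buffer_size only controls input batching. The data is invented and the result is approximate by construction:

    #include <iostream>
    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    int main() {
      arrow::DoubleBuilder builder;
      for (int i = 0; i < 1000; ++i) (void)builder.Append(i * 0.1);
      auto array = builder.Finish().ValueOrDie();

      // Double the default compression for a tighter tail estimate.
      arrow::compute::TDigestOptions opts(/*q=*/0.99, /*delta=*/200,
                                          /*buffer_size=*/500);
      auto result = arrow::compute::TDigest(array, opts).ValueOrDie();
      std::cout << result.make_array()->ToString() << "\n";  // approximately [98.9]
      return 0;
    }
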
-/// \brief Control TDigest approximate quantile kernel behavior
-///
-/// By default, returns the median value.
-class ARROW_EXPORT TDigestOptions : public FunctionOptions {
- public:
- explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
- uint32_t buffer_size = 500);
- explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
- uint32_t buffer_size = 500);
- constexpr static char const kTypeName[] = "TDigestOptions";
- static TDigestOptions Defaults() { return TDigestOptions{}; }
-
- /// quantile must be between 0 and 1 inclusive
- std::vector<double> q;
- /// compression parameter, default 100
- uint32_t delta;
- /// input buffer size, default 500
- uint32_t buffer_size;
-};
-
-/// \brief Control Index kernel behavior
-class ARROW_EXPORT IndexOptions : public FunctionOptions {
- public:
- explicit IndexOptions(std::shared_ptr<Scalar> value);
- // Default constructor for serialization
- IndexOptions();
- constexpr static char const kTypeName[] = "IndexOptions";
-
- std::shared_ptr<Scalar> value;
+/// \brief Control Index kernel behavior
+class ARROW_EXPORT IndexOptions : public FunctionOptions {
+ public:
+ explicit IndexOptions(std::shared_ptr<Scalar> value);
+ // Default constructor for serialization
+ IndexOptions();
+ constexpr static char const kTypeName[] = "IndexOptions";
+
+ std::shared_ptr<Scalar> value;
};
/// @}
/// \brief Count non-null (or null) values in an array.
///
-/// \param[in] options counting options, see ScalarAggregateOptions for more information
+/// \param[in] options counting options, see ScalarAggregateOptions for more information
/// \param[in] datum to count
/// \param[in] ctx the function execution context, optional
/// \return out resulting datum
@@ -149,40 +149,40 @@ class ARROW_EXPORT IndexOptions : public FunctionOptions {
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> Count(
- const Datum& datum,
- const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
+Result<Datum> Count(
+ const Datum& datum,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
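
In this snapshot Count is configured through ScalarAggregateOptions: with skip_nulls = true it counts non-null values, and with skip_nulls = false it counts the nulls instead. A minimal sketch (data invented):

    #include <iostream>
    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    int main() {
      arrow::Int64Builder builder;
      (void)builder.AppendValues({1, 2, 3});
      (void)builder.AppendNull();
      auto array = builder.Finish().ValueOrDie();

      arrow::compute::ScalarAggregateOptions non_nulls(/*skip_nulls=*/true);
      arrow::compute::ScalarAggregateOptions nulls(/*skip_nulls=*/false);
      std::cout << arrow::compute::Count(array, non_nulls).ValueOrDie()
                       .scalar()->ToString() << "\n";  // 3
      std::cout << arrow::compute::Count(array, nulls).ValueOrDie()
                       .scalar()->ToString() << "\n";  // 1
      return 0;
    }
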
/// \brief Compute the mean of a numeric array.
///
/// \param[in] value datum to compute the mean, expecting Array
-/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed mean as a DoubleScalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> Mean(
- const Datum& value,
- const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
+Result<Datum> Mean(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
/// \brief Sum values of a numeric array.
///
/// \param[in] value datum to sum, expecting Array or ChunkedArray
-/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return datum of the computed sum as a Scalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> Sum(
- const Datum& value,
- const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
+Result<Datum> Sum(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
/// \brief Calculate the min / max of a numeric array
///
@@ -190,78 +190,78 @@ Result<Datum> Sum(
/// struct<min: T, max: T>, where T is the input type
///
/// \param[in] value input datum, expecting Array or ChunkedArray
-/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] options see ScalarAggregateOptions for more information
/// \param[in] ctx the function execution context, optional
/// \return resulting datum as a struct<min: T, max: T> scalar
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> MinMax(
- const Datum& value,
- const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Test whether any element in a boolean array evaluates to true.
-///
-/// This function returns true if any element in the array evaluates to true,
-/// and false otherwise. Null values are ignored by default. If null values
-/// are taken into account (by setting the ScalarAggregateOptions parameter
-/// skip_nulls = false), Kleene logic is used.
-/// See KleeneOr for more details on Kleene logic.
-///
-/// \param[in] value input datum, expecting a boolean array
-/// \param[in] options see ScalarAggregateOptions for more information
-/// \param[in] ctx the function execution context, optional
-/// \return resulting datum as a BooleanScalar
-///
-/// \since 3.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Any(
- const Datum& value,
- const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Test whether all elements in a boolean array evaluate to true.
-///
-/// This function returns true if all elements in the array evaluate to true,
-/// and false otherwise. Null values are ignored by default. If null values
-/// are taken into account (by setting the ScalarAggregateOptions parameter
-/// skip_nulls = false), Kleene logic is used.
-/// See KleeneAnd for more details on Kleene logic.
-///
-/// \param[in] value input datum, expecting a boolean array
-/// \param[in] options see ScalarAggregateOptions for more information
-/// \param[in] ctx the function execution context, optional
-/// \return resulting datum as a BooleanScalar
-///
-/// \since 3.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> All(
- const Datum& value,
- const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
+Result<Datum> MinMax(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Test whether any element in a boolean array evaluates to true.
+///
+/// This function returns true if any element in the array evaluates to true,
+/// and false otherwise. Null values are ignored by default. If null values
+/// are taken into account (by setting the ScalarAggregateOptions parameter
+/// skip_nulls = false), Kleene logic is used.
+/// See KleeneOr for more details on Kleene logic.
+///
+/// \param[in] value input datum, expecting a boolean array
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as a BooleanScalar
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Any(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Test whether all elements in a boolean array evaluate to true.
+///
+/// This function returns true if all elements in the array evaluate to true,
+/// and false otherwise. Null values are ignored by default. If null values
+/// are taken into account (by setting the ScalarAggregateOptions parameter
+/// skip_nulls = false), Kleene logic is used.
+/// See KleeneAnd for more details on Kleene logic.
+///
+/// \param[in] value input datum, expecting a boolean array
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as a BooleanScalar
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> All(
+ const Datum& value,
+ const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
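
A sketch of the Kleene behavior described above: with skip_nulls = false, a null cannot change the outcome of Any once a true is present, but it does make All indeterminate. Data invented:

    #include <iostream>
    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    int main() {
      arrow::BooleanBuilder builder;
      (void)builder.Append(true);
      (void)builder.AppendNull();
      auto array = builder.Finish().ValueOrDie();

      arrow::compute::ScalarAggregateOptions kleene(/*skip_nulls=*/false);
      // Kleene logic: true OR null == true, but true AND null == null.
      std::cout << arrow::compute::Any(array, kleene).ValueOrDie()
                       .scalar()->ToString() << "\n";  // true
      std::cout << arrow::compute::All(array, kleene).ValueOrDie()
                       .scalar()->ToString() << "\n";  // null
      return 0;
    }
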
/// \brief Calculate the modal (most common) value of a numeric array
///
-/// This function returns the top-n most common values and the number of times they
-/// occur as an array of `struct<mode: T, count: int64>`, where T is the input type.
-/// Values with larger counts are returned before smaller ones.
-/// If more than one value has the same count, the smaller value is returned first.
+/// This function returns the top-n most common values and the number of times they
+/// occur as an array of `struct<mode: T, count: int64>`, where T is the input type.
+/// Values with larger counts are returned before smaller ones.
+/// If more than one value has the same count, the smaller value is returned first.
///
/// \param[in] value input datum, expecting Array or ChunkedArray
-/// \param[in] options see ModeOptions for more information
+/// \param[in] options see ModeOptions for more information
/// \param[in] ctx the function execution context, optional
-/// \return resulting datum as an array of struct<mode: T, count: int64>
+/// \return resulting datum as an array of struct<mode: T, count: int64>
///
/// \since 2.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> Mode(const Datum& value,
- const ModeOptions& options = ModeOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
+Result<Datum> Mode(const Datum& value,
+ const ModeOptions& options = ModeOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
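
A minimal sketch of asking for the two most common values; data invented, output abbreviated:

    #include <iostream>
    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    int main() {
      arrow::Int64Builder builder;
      (void)builder.AppendValues({5, 1, 5, 1, 5, 3});
      auto array = builder.Finish().ValueOrDie();

      // n = 2: 5 appears three times, 1 twice.
      auto result = arrow::compute::Mode(array, arrow::compute::ModeOptions(2))
                        .ValueOrDie();
      std::cout << result.make_array()->ToString() << "\n";
      // roughly: [{mode: 5, count: 3}, {mode: 1, count: 2}]
      return 0;
    }
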
/// \brief Calculate the standard deviation of a numeric array
///
@@ -291,143 +291,143 @@ Result<Datum> Variance(const Datum& value,
const VarianceOptions& options = VarianceOptions::Defaults(),
ExecContext* ctx = NULLPTR);
-/// \brief Calculate the quantiles of a numeric array
-///
-/// \param[in] value input datum, expecting Array or ChunkedArray
-/// \param[in] options see QuantileOptions for more information
-/// \param[in] ctx the function execution context, optional
-/// \return resulting datum as an array
-///
-/// \since 4.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Quantile(const Datum& value,
- const QuantileOptions& options = QuantileOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm
-///
-/// \param[in] value input datum, expecting Array or ChunkedArray
-/// \param[in] options see TDigestOptions for more information
-/// \param[in] ctx the function execution context, optional
-/// \return resulting datum as an array
-///
-/// \since 4.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> TDigest(const Datum& value,
- const TDigestOptions& options = TDigestOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Find the first index of a value in an array.
-///
-/// \param[in] value The array to search.
-/// \param[in] options The value to search for, wrapped in IndexOptions.
-/// \param[in] ctx the function execution context, optional
-/// \return a Scalar containing the index (or -1 if not found).
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Index(const Datum& value, const IndexOptions& options,
- ExecContext* ctx = NULLPTR);
-
-namespace internal {
-
-/// Internal use only: streaming group identifier.
-/// Consumes batches of keys and yields batches of the group ids.
-class ARROW_EXPORT Grouper {
- public:
- virtual ~Grouper() = default;
-
- /// Construct a Grouper which receives the specified key types
- static Result<std::unique_ptr<Grouper>> Make(const std::vector<ValueDescr>& descrs,
- ExecContext* ctx = default_exec_context());
-
- /// Consume a batch of keys, producing the corresponding group ids as an integer array.
- /// Currently only uint32 indices will be produced, eventually the bit width will only
- /// be as wide as necessary.
- virtual Result<Datum> Consume(const ExecBatch& batch) = 0;
-
- /// Get current unique keys. May be called multiple times.
- virtual Result<ExecBatch> GetUniques() = 0;
-
- /// Get the current number of groups.
- virtual uint32_t num_groups() const = 0;
-
- /// \brief Assemble lists of indices of identical elements.
- ///
- /// \param[in] ids An unsigned, all-valid integral array which will be
- /// used as grouping criteria.
- /// \param[in] num_groups An upper bound for the elements of ids
- /// \return A num_groups-long ListArray where the slot at i contains a
- /// list of indices where i appears in ids.
- ///
- /// MakeGroupings([
- /// 2,
- /// 2,
- /// 5,
- /// 5,
- /// 2,
- /// 3
- /// ], 8) == [
- /// [],
- /// [],
- /// [0, 1, 4],
- /// [5],
- /// [],
- /// [2, 3],
- /// [],
- /// []
- /// ]
- static Result<std::shared_ptr<ListArray>> MakeGroupings(
- const UInt32Array& ids, uint32_t num_groups,
- ExecContext* ctx = default_exec_context());
-
- /// \brief Produce a ListArray whose slots are selections of `array` which correspond to
- /// the provided groupings.
- ///
- /// For example,
- /// ApplyGroupings([
- /// [],
- /// [],
- /// [0, 1, 4],
- /// [5],
- /// [],
- /// [2, 3],
- /// [],
- /// []
- /// ], [2, 2, 5, 5, 2, 3]) == [
- /// [],
- /// [],
- /// [2, 2, 2],
- /// [3],
- /// [],
- /// [5, 5],
- /// [],
- /// []
- /// ]
- static Result<std::shared_ptr<ListArray>> ApplyGroupings(
- const ListArray& groupings, const Array& array,
- ExecContext* ctx = default_exec_context());
-};
-
-/// \brief Configure a grouped aggregation
-struct ARROW_EXPORT Aggregate {
- /// the name of the aggregation function
- std::string function;
-
- /// options for the aggregation function
- const FunctionOptions* options;
-};
-
-/// Internal use only: helper function for testing HashAggregateKernels.
-/// This will be replaced by streaming execution operators.
-ARROW_EXPORT
-Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
- const std::vector<Aggregate>& aggregates,
- ExecContext* ctx = default_exec_context());
-
-} // namespace internal
+/// \brief Calculate the quantiles of a numeric array
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see QuantileOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as an array
+///
+/// \since 4.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Quantile(const Datum& value,
+ const QuantileOptions& options = QuantileOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see TDigestOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as an array
+///
+/// \since 4.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> TDigest(const Datum& value,
+ const TDigestOptions& options = TDigestOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Find the first index of a value in an array.
+///
+/// \param[in] value The array to search.
+/// \param[in] options The value to search for, wrapped in IndexOptions.
+/// \param[in] ctx the function execution context, optional
+/// \return a Scalar containing the index (or -1 if not found).
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Index(const Datum& value, const IndexOptions& options,
+ ExecContext* ctx = NULLPTR);
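
A minimal sketch of Index: the searched-for value is wrapped in a Scalar via IndexOptions. Data invented:

    #include <iostream>
    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    int main() {
      arrow::Int64Builder builder;
      (void)builder.AppendValues({7, 8, 9, 8});
      auto array = builder.Finish().ValueOrDie();

      // First occurrence of 8 is at index 1; a missing value would yield -1.
      arrow::compute::IndexOptions opts(arrow::MakeScalar(int64_t(8)));
      auto result = arrow::compute::Index(array, opts).ValueOrDie();
      std::cout << result.scalar()->ToString() << "\n";  // 1
      return 0;
    }
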
+
+namespace internal {
+
+/// Internal use only: streaming group identifier.
+/// Consumes batches of keys and yields batches of the group ids.
+class ARROW_EXPORT Grouper {
+ public:
+ virtual ~Grouper() = default;
+
+ /// Construct a Grouper which receives the specified key types
+ static Result<std::unique_ptr<Grouper>> Make(const std::vector<ValueDescr>& descrs,
+ ExecContext* ctx = default_exec_context());
+
+ /// Consume a batch of keys, producing the corresponding group ids as an integer array.
+ /// Currently only uint32 indices will be produced, eventually the bit width will only
+ /// be as wide as necessary.
+ virtual Result<Datum> Consume(const ExecBatch& batch) = 0;
+
+ /// Get current unique keys. May be called multiple times.
+ virtual Result<ExecBatch> GetUniques() = 0;
+
+ /// Get the current number of groups.
+ virtual uint32_t num_groups() const = 0;
+
+ /// \brief Assemble lists of indices of identical elements.
+ ///
+ /// \param[in] ids An unsigned, all-valid integral array which will be
+ /// used as grouping criteria.
+ /// \param[in] num_groups An upper bound for the elements of ids
+ /// \return A num_groups-long ListArray where the slot at i contains a
+ /// list of indices where i appears in ids.
+ ///
+ /// MakeGroupings([
+ /// 2,
+ /// 2,
+ /// 5,
+ /// 5,
+ /// 2,
+ /// 3
+ /// ], 8) == [
+ /// [],
+ /// [],
+ /// [0, 1, 4],
+ /// [5],
+ /// [],
+ /// [2, 3],
+ /// [],
+ /// []
+ /// ]
+ static Result<std::shared_ptr<ListArray>> MakeGroupings(
+ const UInt32Array& ids, uint32_t num_groups,
+ ExecContext* ctx = default_exec_context());
+
+ /// \brief Produce a ListArray whose slots are selections of `array` which correspond to
+ /// the provided groupings.
+ ///
+ /// For example,
+ /// ApplyGroupings([
+ /// [],
+ /// [],
+ /// [0, 1, 4],
+ /// [5],
+ /// [],
+ /// [2, 3],
+ /// [],
+ /// []
+ /// ], [2, 2, 5, 5, 2, 3]) == [
+ /// [],
+ /// [],
+ /// [2, 2, 2],
+ /// [3],
+ /// [],
+ /// [5, 5],
+ /// [],
+ /// []
+ /// ]
+ static Result<std::shared_ptr<ListArray>> ApplyGroupings(
+ const ListArray& groupings, const Array& array,
+ ExecContext* ctx = default_exec_context());
+};
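
The MakeGroupings example above, written out as code. Grouper is an internal API, so this is an illustrative sketch against this snapshot's headers, not a stable interface:

    #include <iostream>
    #include <arrow/api.h>
    #include <arrow/compute/api_aggregate.h>

    int main() {
      arrow::UInt32Builder ids;
      (void)ids.AppendValues(std::vector<uint32_t>{2, 2, 5, 5, 2, 3});
      auto id_array = std::static_pointer_cast<arrow::UInt32Array>(
          ids.Finish().ValueOrDie());

      // 8 groups; slot 2 collects indices [0, 1, 4], slot 5 collects [2, 3].
      auto groupings = arrow::compute::internal::Grouper::MakeGroupings(
                           *id_array, /*num_groups=*/8)
                           .ValueOrDie();
      std::cout << groupings->ToString() << "\n";
      return 0;
    }
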
+
+/// \brief Configure a grouped aggregation
+struct ARROW_EXPORT Aggregate {
+ /// the name of the aggregation function
+ std::string function;
+
+ /// options for the aggregation function
+ const FunctionOptions* options;
+};
+
+/// Internal use only: helper function for testing HashAggregateKernels.
+/// This will be replaced by streaming execution operators.
+ARROW_EXPORT
+Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
+ const std::vector<Aggregate>& aggregates,
+ ExecContext* ctx = default_exec_context());
+
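
A hedged sketch of the test-only helper above. The "hash_sum" kernel name and the shape of the returned Datum are assumptions about this snapshot's registry, and the API is internal:

    #include <iostream>
    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    int main() {
      arrow::Int64Builder v, k;
      (void)v.AppendValues({10, 20, 30, 40});
      (void)k.AppendValues({1, 2, 1, 2});
      auto values = v.Finish().ValueOrDie();
      auto keys = k.Finish().ValueOrDie();

      // Sum `values` per distinct key: key 1 -> 40, key 2 -> 60.
      arrow::compute::ScalarAggregateOptions opts;
      auto grouped = arrow::compute::internal::GroupBy(
                         {values}, {keys}, {{"hash_sum", &opts}})
                         .ValueOrDie();
      std::cout << grouped.make_array()->ToString() << "\n";
      return 0;
    }
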
+} // namespace internal
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc
index 989ca2b3937..1feb4e7eee0 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.cc
@@ -21,287 +21,287 @@
#include <sstream>
#include <string>
-#include "arrow/array/array_base.h"
+#include "arrow/array/array_base.h"
#include "arrow/compute/exec.h"
-#include "arrow/compute/function_internal.h"
-#include "arrow/compute/registry.h"
-#include "arrow/compute/util_internal.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/registry.h"
+#include "arrow/compute/util_internal.h"
#include "arrow/status.h"
#include "arrow/type.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/logging.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
namespace arrow {
-
-namespace internal {
-template <>
-struct EnumTraits<compute::JoinOptions::NullHandlingBehavior>
- : BasicEnumTraits<compute::JoinOptions::NullHandlingBehavior,
- compute::JoinOptions::NullHandlingBehavior::EMIT_NULL,
- compute::JoinOptions::NullHandlingBehavior::SKIP,
- compute::JoinOptions::NullHandlingBehavior::REPLACE> {
- static std::string name() { return "JoinOptions::NullHandlingBehavior"; }
- static std::string value_name(compute::JoinOptions::NullHandlingBehavior value) {
- switch (value) {
- case compute::JoinOptions::NullHandlingBehavior::EMIT_NULL:
- return "EMIT_NULL";
- case compute::JoinOptions::NullHandlingBehavior::SKIP:
- return "SKIP";
- case compute::JoinOptions::NullHandlingBehavior::REPLACE:
- return "REPLACE";
- }
- return "<INVALID>";
- }
-};
-template <>
-struct EnumTraits<TimeUnit::type>
- : BasicEnumTraits<TimeUnit::type, TimeUnit::type::SECOND, TimeUnit::type::MILLI,
- TimeUnit::type::MICRO, TimeUnit::type::NANO> {
- static std::string name() { return "TimeUnit::type"; }
- static std::string value_name(TimeUnit::type value) {
- switch (value) {
- case TimeUnit::type::SECOND:
- return "SECOND";
- case TimeUnit::type::MILLI:
- return "MILLI";
- case TimeUnit::type::MICRO:
- return "MICRO";
- case TimeUnit::type::NANO:
- return "NANO";
- }
- return "<INVALID>";
- }
-};
-template <>
-struct EnumTraits<compute::CompareOperator>
- : BasicEnumTraits<
- compute::CompareOperator, compute::CompareOperator::EQUAL,
- compute::CompareOperator::NOT_EQUAL, compute::CompareOperator::GREATER,
- compute::CompareOperator::GREATER_EQUAL, compute::CompareOperator::LESS,
- compute::CompareOperator::LESS_EQUAL> {
- static std::string name() { return "compute::CompareOperator"; }
- static std::string value_name(compute::CompareOperator value) {
- switch (value) {
- case compute::CompareOperator::EQUAL:
- return "EQUAL";
- case compute::CompareOperator::NOT_EQUAL:
- return "NOT_EQUAL";
- case compute::CompareOperator::GREATER:
- return "GREATER";
- case compute::CompareOperator::GREATER_EQUAL:
- return "GREATER_EQUAL";
- case compute::CompareOperator::LESS:
- return "LESS";
- case compute::CompareOperator::LESS_EQUAL:
- return "LESS_EQUAL";
- }
- return "<INVALID>";
- }
-};
-} // namespace internal
-
+
+namespace internal {
+template <>
+struct EnumTraits<compute::JoinOptions::NullHandlingBehavior>
+ : BasicEnumTraits<compute::JoinOptions::NullHandlingBehavior,
+ compute::JoinOptions::NullHandlingBehavior::EMIT_NULL,
+ compute::JoinOptions::NullHandlingBehavior::SKIP,
+ compute::JoinOptions::NullHandlingBehavior::REPLACE> {
+ static std::string name() { return "JoinOptions::NullHandlingBehavior"; }
+ static std::string value_name(compute::JoinOptions::NullHandlingBehavior value) {
+ switch (value) {
+ case compute::JoinOptions::NullHandlingBehavior::EMIT_NULL:
+ return "EMIT_NULL";
+ case compute::JoinOptions::NullHandlingBehavior::SKIP:
+ return "SKIP";
+ case compute::JoinOptions::NullHandlingBehavior::REPLACE:
+ return "REPLACE";
+ }
+ return "<INVALID>";
+ }
+};
+template <>
+struct EnumTraits<TimeUnit::type>
+ : BasicEnumTraits<TimeUnit::type, TimeUnit::type::SECOND, TimeUnit::type::MILLI,
+ TimeUnit::type::MICRO, TimeUnit::type::NANO> {
+ static std::string name() { return "TimeUnit::type"; }
+ static std::string value_name(TimeUnit::type value) {
+ switch (value) {
+ case TimeUnit::type::SECOND:
+ return "SECOND";
+ case TimeUnit::type::MILLI:
+ return "MILLI";
+ case TimeUnit::type::MICRO:
+ return "MICRO";
+ case TimeUnit::type::NANO:
+ return "NANO";
+ }
+ return "<INVALID>";
+ }
+};
+template <>
+struct EnumTraits<compute::CompareOperator>
+ : BasicEnumTraits<
+ compute::CompareOperator, compute::CompareOperator::EQUAL,
+ compute::CompareOperator::NOT_EQUAL, compute::CompareOperator::GREATER,
+ compute::CompareOperator::GREATER_EQUAL, compute::CompareOperator::LESS,
+ compute::CompareOperator::LESS_EQUAL> {
+ static std::string name() { return "compute::CompareOperator"; }
+ static std::string value_name(compute::CompareOperator value) {
+ switch (value) {
+ case compute::CompareOperator::EQUAL:
+ return "EQUAL";
+ case compute::CompareOperator::NOT_EQUAL:
+ return "NOT_EQUAL";
+ case compute::CompareOperator::GREATER:
+ return "GREATER";
+ case compute::CompareOperator::GREATER_EQUAL:
+ return "GREATER_EQUAL";
+ case compute::CompareOperator::LESS:
+ return "LESS";
+ case compute::CompareOperator::LESS_EQUAL:
+ return "LESS_EQUAL";
+ }
+ return "<INVALID>";
+ }
+};
+} // namespace internal
+
namespace compute {
-// ----------------------------------------------------------------------
-// Function options
-
-using ::arrow::internal::checked_cast;
-
-namespace internal {
-namespace {
-using ::arrow::internal::DataMember;
-static auto kArithmeticOptionsType = GetFunctionOptionsType<ArithmeticOptions>(
- DataMember("check_overflow", &ArithmeticOptions::check_overflow));
-static auto kElementWiseAggregateOptionsType =
- GetFunctionOptionsType<ElementWiseAggregateOptions>(
- DataMember("skip_nulls", &ElementWiseAggregateOptions::skip_nulls));
-static auto kJoinOptionsType = GetFunctionOptionsType<JoinOptions>(
- DataMember("null_handling", &JoinOptions::null_handling),
- DataMember("null_replacement", &JoinOptions::null_replacement));
-static auto kMatchSubstringOptionsType = GetFunctionOptionsType<MatchSubstringOptions>(
- DataMember("pattern", &MatchSubstringOptions::pattern),
- DataMember("ignore_case", &MatchSubstringOptions::ignore_case));
-static auto kSplitOptionsType = GetFunctionOptionsType<SplitOptions>(
- DataMember("max_splits", &SplitOptions::max_splits),
- DataMember("reverse", &SplitOptions::reverse));
-static auto kSplitPatternOptionsType = GetFunctionOptionsType<SplitPatternOptions>(
- DataMember("pattern", &SplitPatternOptions::pattern),
- DataMember("max_splits", &SplitPatternOptions::max_splits),
- DataMember("reverse", &SplitPatternOptions::reverse));
-static auto kReplaceSliceOptionsType = GetFunctionOptionsType<ReplaceSliceOptions>(
- DataMember("start", &ReplaceSliceOptions::start),
- DataMember("stop", &ReplaceSliceOptions::stop),
- DataMember("replacement", &ReplaceSliceOptions::replacement));
-static auto kReplaceSubstringOptionsType =
- GetFunctionOptionsType<ReplaceSubstringOptions>(
- DataMember("pattern", &ReplaceSubstringOptions::pattern),
- DataMember("replacement", &ReplaceSubstringOptions::replacement),
- DataMember("max_replacements", &ReplaceSubstringOptions::max_replacements));
-static auto kExtractRegexOptionsType = GetFunctionOptionsType<ExtractRegexOptions>(
- DataMember("pattern", &ExtractRegexOptions::pattern));
-static auto kSetLookupOptionsType = GetFunctionOptionsType<SetLookupOptions>(
- DataMember("value_set", &SetLookupOptions::value_set),
- DataMember("skip_nulls", &SetLookupOptions::skip_nulls));
-static auto kStrptimeOptionsType = GetFunctionOptionsType<StrptimeOptions>(
- DataMember("format", &StrptimeOptions::format),
- DataMember("unit", &StrptimeOptions::unit));
-static auto kPadOptionsType = GetFunctionOptionsType<PadOptions>(
- DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding));
-static auto kTrimOptionsType = GetFunctionOptionsType<TrimOptions>(
- DataMember("characters", &TrimOptions::characters));
-static auto kSliceOptionsType = GetFunctionOptionsType<SliceOptions>(
- DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop),
- DataMember("step", &SliceOptions::step));
-static auto kMakeStructOptionsType = GetFunctionOptionsType<MakeStructOptions>(
- DataMember("field_names", &MakeStructOptions::field_names),
- DataMember("field_nullability", &MakeStructOptions::field_nullability),
- DataMember("field_metadata", &MakeStructOptions::field_metadata));
-static auto kDayOfWeekOptionsType = GetFunctionOptionsType<DayOfWeekOptions>(
- DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering),
- DataMember("week_start", &DayOfWeekOptions::week_start));
-} // namespace
-} // namespace internal
-
-ArithmeticOptions::ArithmeticOptions(bool check_overflow)
- : FunctionOptions(internal::kArithmeticOptionsType), check_overflow(check_overflow) {}
-constexpr char ArithmeticOptions::kTypeName[];
-
-ElementWiseAggregateOptions::ElementWiseAggregateOptions(bool skip_nulls)
- : FunctionOptions(internal::kElementWiseAggregateOptionsType),
- skip_nulls(skip_nulls) {}
-constexpr char ElementWiseAggregateOptions::kTypeName[];
-
-JoinOptions::JoinOptions(NullHandlingBehavior null_handling, std::string null_replacement)
- : FunctionOptions(internal::kJoinOptionsType),
- null_handling(null_handling),
- null_replacement(std::move(null_replacement)) {}
-constexpr char JoinOptions::kTypeName[];
-
-MatchSubstringOptions::MatchSubstringOptions(std::string pattern, bool ignore_case)
- : FunctionOptions(internal::kMatchSubstringOptionsType),
- pattern(std::move(pattern)),
- ignore_case(ignore_case) {}
-MatchSubstringOptions::MatchSubstringOptions() : MatchSubstringOptions("", false) {}
-constexpr char MatchSubstringOptions::kTypeName[];
-
-SplitOptions::SplitOptions(int64_t max_splits, bool reverse)
- : FunctionOptions(internal::kSplitOptionsType),
- max_splits(max_splits),
- reverse(reverse) {}
-constexpr char SplitOptions::kTypeName[];
-
-SplitPatternOptions::SplitPatternOptions(std::string pattern, int64_t max_splits,
- bool reverse)
- : FunctionOptions(internal::kSplitPatternOptionsType),
- pattern(std::move(pattern)),
- max_splits(max_splits),
- reverse(reverse) {}
-SplitPatternOptions::SplitPatternOptions() : SplitPatternOptions("", -1, false) {}
-constexpr char SplitPatternOptions::kTypeName[];
-
-ReplaceSliceOptions::ReplaceSliceOptions(int64_t start, int64_t stop,
- std::string replacement)
- : FunctionOptions(internal::kReplaceSliceOptionsType),
- start(start),
- stop(stop),
- replacement(std::move(replacement)) {}
-ReplaceSliceOptions::ReplaceSliceOptions() : ReplaceSliceOptions(0, 0, "") {}
-constexpr char ReplaceSliceOptions::kTypeName[];
-
-ReplaceSubstringOptions::ReplaceSubstringOptions(std::string pattern,
- std::string replacement,
- int64_t max_replacements)
- : FunctionOptions(internal::kReplaceSubstringOptionsType),
- pattern(std::move(pattern)),
- replacement(std::move(replacement)),
- max_replacements(max_replacements) {}
-ReplaceSubstringOptions::ReplaceSubstringOptions()
- : ReplaceSubstringOptions("", "", -1) {}
-constexpr char ReplaceSubstringOptions::kTypeName[];
-
-ExtractRegexOptions::ExtractRegexOptions(std::string pattern)
- : FunctionOptions(internal::kExtractRegexOptionsType), pattern(std::move(pattern)) {}
-ExtractRegexOptions::ExtractRegexOptions() : ExtractRegexOptions("") {}
-constexpr char ExtractRegexOptions::kTypeName[];
-
-SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls)
- : FunctionOptions(internal::kSetLookupOptionsType),
- value_set(std::move(value_set)),
- skip_nulls(skip_nulls) {}
-SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, false) {}
-constexpr char SetLookupOptions::kTypeName[];
-
-StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit)
- : FunctionOptions(internal::kStrptimeOptionsType),
- format(std::move(format)),
- unit(unit) {}
-StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::SECOND) {}
-constexpr char StrptimeOptions::kTypeName[];
-
-PadOptions::PadOptions(int64_t width, std::string padding)
- : FunctionOptions(internal::kPadOptionsType),
- width(width),
- padding(std::move(padding)) {}
-PadOptions::PadOptions() : PadOptions(0, " ") {}
-constexpr char PadOptions::kTypeName[];
-
-TrimOptions::TrimOptions(std::string characters)
- : FunctionOptions(internal::kTrimOptionsType), characters(std::move(characters)) {}
-TrimOptions::TrimOptions() : TrimOptions("") {}
-constexpr char TrimOptions::kTypeName[];
-
-SliceOptions::SliceOptions(int64_t start, int64_t stop, int64_t step)
- : FunctionOptions(internal::kSliceOptionsType),
- start(start),
- stop(stop),
- step(step) {}
-SliceOptions::SliceOptions() : SliceOptions(0, 0, 1) {}
-constexpr char SliceOptions::kTypeName[];
-
-MakeStructOptions::MakeStructOptions(
- std::vector<std::string> n, std::vector<bool> r,
- std::vector<std::shared_ptr<const KeyValueMetadata>> m)
- : FunctionOptions(internal::kMakeStructOptionsType),
- field_names(std::move(n)),
- field_nullability(std::move(r)),
- field_metadata(std::move(m)) {}
-
-MakeStructOptions::MakeStructOptions(std::vector<std::string> n)
- : FunctionOptions(internal::kMakeStructOptionsType),
- field_names(std::move(n)),
- field_nullability(field_names.size(), true),
- field_metadata(field_names.size(), NULLPTR) {}
-
-MakeStructOptions::MakeStructOptions() : MakeStructOptions(std::vector<std::string>()) {}
-constexpr char MakeStructOptions::kTypeName[];
-
-DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start)
- : FunctionOptions(internal::kDayOfWeekOptionsType),
- one_based_numbering(one_based_numbering),
- week_start(week_start) {}
-constexpr char DayOfWeekOptions::kTypeName[];
-
-namespace internal {
-void RegisterScalarOptions(FunctionRegistry* registry) {
- DCHECK_OK(registry->AddFunctionOptionsType(kArithmeticOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kElementWiseAggregateOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kJoinOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kMatchSubstringOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kSplitOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kSplitPatternOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSliceOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSubstringOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kSetLookupOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kStrptimeOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kTrimOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType));
-}
-} // namespace internal
-
+// ----------------------------------------------------------------------
+// Function options
+
+using ::arrow::internal::checked_cast;
+
+namespace internal {
+namespace {
+using ::arrow::internal::DataMember;
+static auto kArithmeticOptionsType = GetFunctionOptionsType<ArithmeticOptions>(
+ DataMember("check_overflow", &ArithmeticOptions::check_overflow));
+static auto kElementWiseAggregateOptionsType =
+ GetFunctionOptionsType<ElementWiseAggregateOptions>(
+ DataMember("skip_nulls", &ElementWiseAggregateOptions::skip_nulls));
+static auto kJoinOptionsType = GetFunctionOptionsType<JoinOptions>(
+ DataMember("null_handling", &JoinOptions::null_handling),
+ DataMember("null_replacement", &JoinOptions::null_replacement));
+static auto kMatchSubstringOptionsType = GetFunctionOptionsType<MatchSubstringOptions>(
+ DataMember("pattern", &MatchSubstringOptions::pattern),
+ DataMember("ignore_case", &MatchSubstringOptions::ignore_case));
+static auto kSplitOptionsType = GetFunctionOptionsType<SplitOptions>(
+ DataMember("max_splits", &SplitOptions::max_splits),
+ DataMember("reverse", &SplitOptions::reverse));
+static auto kSplitPatternOptionsType = GetFunctionOptionsType<SplitPatternOptions>(
+ DataMember("pattern", &SplitPatternOptions::pattern),
+ DataMember("max_splits", &SplitPatternOptions::max_splits),
+ DataMember("reverse", &SplitPatternOptions::reverse));
+static auto kReplaceSliceOptionsType = GetFunctionOptionsType<ReplaceSliceOptions>(
+ DataMember("start", &ReplaceSliceOptions::start),
+ DataMember("stop", &ReplaceSliceOptions::stop),
+ DataMember("replacement", &ReplaceSliceOptions::replacement));
+static auto kReplaceSubstringOptionsType =
+ GetFunctionOptionsType<ReplaceSubstringOptions>(
+ DataMember("pattern", &ReplaceSubstringOptions::pattern),
+ DataMember("replacement", &ReplaceSubstringOptions::replacement),
+ DataMember("max_replacements", &ReplaceSubstringOptions::max_replacements));
+static auto kExtractRegexOptionsType = GetFunctionOptionsType<ExtractRegexOptions>(
+ DataMember("pattern", &ExtractRegexOptions::pattern));
+static auto kSetLookupOptionsType = GetFunctionOptionsType<SetLookupOptions>(
+ DataMember("value_set", &SetLookupOptions::value_set),
+ DataMember("skip_nulls", &SetLookupOptions::skip_nulls));
+static auto kStrptimeOptionsType = GetFunctionOptionsType<StrptimeOptions>(
+ DataMember("format", &StrptimeOptions::format),
+ DataMember("unit", &StrptimeOptions::unit));
+static auto kPadOptionsType = GetFunctionOptionsType<PadOptions>(
+ DataMember("width", &PadOptions::width), DataMember("padding", &PadOptions::padding));
+static auto kTrimOptionsType = GetFunctionOptionsType<TrimOptions>(
+ DataMember("characters", &TrimOptions::characters));
+static auto kSliceOptionsType = GetFunctionOptionsType<SliceOptions>(
+ DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop),
+ DataMember("step", &SliceOptions::step));
+static auto kMakeStructOptionsType = GetFunctionOptionsType<MakeStructOptions>(
+ DataMember("field_names", &MakeStructOptions::field_names),
+ DataMember("field_nullability", &MakeStructOptions::field_nullability),
+ DataMember("field_metadata", &MakeStructOptions::field_metadata));
+static auto kDayOfWeekOptionsType = GetFunctionOptionsType<DayOfWeekOptions>(
+ DataMember("one_based_numbering", &DayOfWeekOptions::one_based_numbering),
+ DataMember("week_start", &DayOfWeekOptions::week_start));
+} // namespace
+} // namespace internal
+
+ArithmeticOptions::ArithmeticOptions(bool check_overflow)
+ : FunctionOptions(internal::kArithmeticOptionsType), check_overflow(check_overflow) {}
+constexpr char ArithmeticOptions::kTypeName[];
+
+ElementWiseAggregateOptions::ElementWiseAggregateOptions(bool skip_nulls)
+ : FunctionOptions(internal::kElementWiseAggregateOptionsType),
+ skip_nulls(skip_nulls) {}
+constexpr char ElementWiseAggregateOptions::kTypeName[];
+
+JoinOptions::JoinOptions(NullHandlingBehavior null_handling, std::string null_replacement)
+ : FunctionOptions(internal::kJoinOptionsType),
+ null_handling(null_handling),
+ null_replacement(std::move(null_replacement)) {}
+constexpr char JoinOptions::kTypeName[];
+
+MatchSubstringOptions::MatchSubstringOptions(std::string pattern, bool ignore_case)
+ : FunctionOptions(internal::kMatchSubstringOptionsType),
+ pattern(std::move(pattern)),
+ ignore_case(ignore_case) {}
+MatchSubstringOptions::MatchSubstringOptions() : MatchSubstringOptions("", false) {}
+constexpr char MatchSubstringOptions::kTypeName[];
+
+SplitOptions::SplitOptions(int64_t max_splits, bool reverse)
+ : FunctionOptions(internal::kSplitOptionsType),
+ max_splits(max_splits),
+ reverse(reverse) {}
+constexpr char SplitOptions::kTypeName[];
+
+SplitPatternOptions::SplitPatternOptions(std::string pattern, int64_t max_splits,
+ bool reverse)
+ : FunctionOptions(internal::kSplitPatternOptionsType),
+ pattern(std::move(pattern)),
+ max_splits(max_splits),
+ reverse(reverse) {}
+SplitPatternOptions::SplitPatternOptions() : SplitPatternOptions("", -1, false) {}
+constexpr char SplitPatternOptions::kTypeName[];
+
+ReplaceSliceOptions::ReplaceSliceOptions(int64_t start, int64_t stop,
+ std::string replacement)
+ : FunctionOptions(internal::kReplaceSliceOptionsType),
+ start(start),
+ stop(stop),
+ replacement(std::move(replacement)) {}
+ReplaceSliceOptions::ReplaceSliceOptions() : ReplaceSliceOptions(0, 0, "") {}
+constexpr char ReplaceSliceOptions::kTypeName[];
+
+ReplaceSubstringOptions::ReplaceSubstringOptions(std::string pattern,
+ std::string replacement,
+ int64_t max_replacements)
+ : FunctionOptions(internal::kReplaceSubstringOptionsType),
+ pattern(std::move(pattern)),
+ replacement(std::move(replacement)),
+ max_replacements(max_replacements) {}
+ReplaceSubstringOptions::ReplaceSubstringOptions()
+ : ReplaceSubstringOptions("", "", -1) {}
+constexpr char ReplaceSubstringOptions::kTypeName[];
+
+ExtractRegexOptions::ExtractRegexOptions(std::string pattern)
+ : FunctionOptions(internal::kExtractRegexOptionsType), pattern(std::move(pattern)) {}
+ExtractRegexOptions::ExtractRegexOptions() : ExtractRegexOptions("") {}
+constexpr char ExtractRegexOptions::kTypeName[];
+
+SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls)
+ : FunctionOptions(internal::kSetLookupOptionsType),
+ value_set(std::move(value_set)),
+ skip_nulls(skip_nulls) {}
+SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, false) {}
+constexpr char SetLookupOptions::kTypeName[];
+
+StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit)
+ : FunctionOptions(internal::kStrptimeOptionsType),
+ format(std::move(format)),
+ unit(unit) {}
+StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::SECOND) {}
+constexpr char StrptimeOptions::kTypeName[];
+
+PadOptions::PadOptions(int64_t width, std::string padding)
+ : FunctionOptions(internal::kPadOptionsType),
+ width(width),
+ padding(std::move(padding)) {}
+PadOptions::PadOptions() : PadOptions(0, " ") {}
+constexpr char PadOptions::kTypeName[];
+
+TrimOptions::TrimOptions(std::string characters)
+ : FunctionOptions(internal::kTrimOptionsType), characters(std::move(characters)) {}
+TrimOptions::TrimOptions() : TrimOptions("") {}
+constexpr char TrimOptions::kTypeName[];
+
+SliceOptions::SliceOptions(int64_t start, int64_t stop, int64_t step)
+ : FunctionOptions(internal::kSliceOptionsType),
+ start(start),
+ stop(stop),
+ step(step) {}
+SliceOptions::SliceOptions() : SliceOptions(0, 0, 1) {}
+constexpr char SliceOptions::kTypeName[];
+
+MakeStructOptions::MakeStructOptions(
+ std::vector<std::string> n, std::vector<bool> r,
+ std::vector<std::shared_ptr<const KeyValueMetadata>> m)
+ : FunctionOptions(internal::kMakeStructOptionsType),
+ field_names(std::move(n)),
+ field_nullability(std::move(r)),
+ field_metadata(std::move(m)) {}
+
+MakeStructOptions::MakeStructOptions(std::vector<std::string> n)
+ : FunctionOptions(internal::kMakeStructOptionsType),
+ field_names(std::move(n)),
+ field_nullability(field_names.size(), true),
+ field_metadata(field_names.size(), NULLPTR) {}
+
+MakeStructOptions::MakeStructOptions() : MakeStructOptions(std::vector<std::string>()) {}
+constexpr char MakeStructOptions::kTypeName[];
+
+DayOfWeekOptions::DayOfWeekOptions(bool one_based_numbering, uint32_t week_start)
+ : FunctionOptions(internal::kDayOfWeekOptionsType),
+ one_based_numbering(one_based_numbering),
+ week_start(week_start) {}
+constexpr char DayOfWeekOptions::kTypeName[];
+
+namespace internal {
+void RegisterScalarOptions(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunctionOptionsType(kArithmeticOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kElementWiseAggregateOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kJoinOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kMatchSubstringOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSplitOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSplitPatternOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSliceOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kReplaceSubstringOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kExtractRegexOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSetLookupOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kStrptimeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kPadOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kTrimOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSliceOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kMakeStructOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kDayOfWeekOptionsType));
+}
+} // namespace internal
+
#define SCALAR_EAGER_UNARY(NAME, REGISTRY_NAME) \
Result<Datum> NAME(const Datum& value, ExecContext* ctx) { \
return CallFunction(REGISTRY_NAME, {value}, ctx); \
@@ -315,26 +315,26 @@ void RegisterScalarOptions(FunctionRegistry* registry) {
// ----------------------------------------------------------------------
// Arithmetic
-#define SCALAR_ARITHMETIC_UNARY(NAME, REGISTRY_NAME, REGISTRY_CHECKED_NAME) \
- Result<Datum> NAME(const Datum& arg, ArithmeticOptions options, ExecContext* ctx) { \
- auto func_name = (options.check_overflow) ? REGISTRY_CHECKED_NAME : REGISTRY_NAME; \
- return CallFunction(func_name, {arg}, ctx); \
- }
-
-SCALAR_ARITHMETIC_UNARY(AbsoluteValue, "abs", "abs_checked")
-SCALAR_ARITHMETIC_UNARY(Negate, "negate", "negate_checked")
-SCALAR_EAGER_UNARY(Sign, "sign")
-SCALAR_ARITHMETIC_UNARY(Sin, "sin", "sin_checked")
-SCALAR_ARITHMETIC_UNARY(Cos, "cos", "cos_checked")
-SCALAR_ARITHMETIC_UNARY(Asin, "asin", "asin_checked")
-SCALAR_ARITHMETIC_UNARY(Acos, "acos", "acos_checked")
-SCALAR_ARITHMETIC_UNARY(Tan, "tan", "tan_checked")
-SCALAR_EAGER_UNARY(Atan, "atan")
-SCALAR_ARITHMETIC_UNARY(Ln, "ln", "ln_checked")
-SCALAR_ARITHMETIC_UNARY(Log10, "log10", "log10_checked")
-SCALAR_ARITHMETIC_UNARY(Log2, "log2", "log2_checked")
-SCALAR_ARITHMETIC_UNARY(Log1p, "log1p", "log1p_checked")
-
+#define SCALAR_ARITHMETIC_UNARY(NAME, REGISTRY_NAME, REGISTRY_CHECKED_NAME) \
+ Result<Datum> NAME(const Datum& arg, ArithmeticOptions options, ExecContext* ctx) { \
+ auto func_name = (options.check_overflow) ? REGISTRY_CHECKED_NAME : REGISTRY_NAME; \
+ return CallFunction(func_name, {arg}, ctx); \
+ }
+
+SCALAR_ARITHMETIC_UNARY(AbsoluteValue, "abs", "abs_checked")
+SCALAR_ARITHMETIC_UNARY(Negate, "negate", "negate_checked")
+SCALAR_EAGER_UNARY(Sign, "sign")
+SCALAR_ARITHMETIC_UNARY(Sin, "sin", "sin_checked")
+SCALAR_ARITHMETIC_UNARY(Cos, "cos", "cos_checked")
+SCALAR_ARITHMETIC_UNARY(Asin, "asin", "asin_checked")
+SCALAR_ARITHMETIC_UNARY(Acos, "acos", "acos_checked")
+SCALAR_ARITHMETIC_UNARY(Tan, "tan", "tan_checked")
+SCALAR_EAGER_UNARY(Atan, "atan")
+SCALAR_ARITHMETIC_UNARY(Ln, "ln", "ln_checked")
+SCALAR_ARITHMETIC_UNARY(Log10, "log10", "log10_checked")
+SCALAR_ARITHMETIC_UNARY(Log2, "log2", "log2_checked")
+SCALAR_ARITHMETIC_UNARY(Log1p, "log1p", "log1p_checked")
+
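
A sketch of the checked/unchecked dispatch this macro encodes: the same wrapper calls "ln" or "ln_checked" depending on ArithmeticOptions::check_overflow. The exact error message is illustrative:

    #include <iostream>
    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    int main() {
      arrow::DoubleBuilder builder;
      (void)builder.AppendValues({1.0, 0.0});
      auto array = builder.Finish().ValueOrDie();

      // Unchecked: ln(0) quietly becomes -inf.
      auto lax = arrow::compute::Ln(array, arrow::compute::ArithmeticOptions());
      std::cout << lax.ValueOrDie().make_array()->ToString() << "\n";

      // Checked: the "ln_checked" kernel returns an error Status instead.
      arrow::compute::ArithmeticOptions strict(/*check_overflow=*/true);
      auto checked = arrow::compute::Ln(array, strict);
      std::cout << checked.status().ToString() << "\n";  // e.g. Invalid: logarithm of zero
      return 0;
    }
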
#define SCALAR_ARITHMETIC_BINARY(NAME, REGISTRY_NAME, REGISTRY_CHECKED_NAME) \
Result<Datum> NAME(const Datum& left, const Datum& right, ArithmeticOptions options, \
ExecContext* ctx) { \
@@ -346,65 +346,65 @@ SCALAR_ARITHMETIC_BINARY(Add, "add", "add_checked")
SCALAR_ARITHMETIC_BINARY(Subtract, "subtract", "subtract_checked")
SCALAR_ARITHMETIC_BINARY(Multiply, "multiply", "multiply_checked")
SCALAR_ARITHMETIC_BINARY(Divide, "divide", "divide_checked")
-SCALAR_ARITHMETIC_BINARY(Power, "power", "power_checked")
-SCALAR_ARITHMETIC_BINARY(ShiftLeft, "shift_left", "shift_left_checked")
-SCALAR_ARITHMETIC_BINARY(ShiftRight, "shift_right", "shift_right_checked")
-SCALAR_EAGER_BINARY(Atan2, "atan2")
-SCALAR_EAGER_UNARY(Floor, "floor")
-SCALAR_EAGER_UNARY(Ceil, "ceil")
-SCALAR_EAGER_UNARY(Trunc, "trunc")
-
-Result<Datum> MaxElementWise(const std::vector<Datum>& args,
- ElementWiseAggregateOptions options, ExecContext* ctx) {
- return CallFunction("max_element_wise", args, &options, ctx);
-}
-
-Result<Datum> MinElementWise(const std::vector<Datum>& args,
- ElementWiseAggregateOptions options, ExecContext* ctx) {
- return CallFunction("min_element_wise", args, &options, ctx);
-}
-
+SCALAR_ARITHMETIC_BINARY(Power, "power", "power_checked")
+SCALAR_ARITHMETIC_BINARY(ShiftLeft, "shift_left", "shift_left_checked")
+SCALAR_ARITHMETIC_BINARY(ShiftRight, "shift_right", "shift_right_checked")
+SCALAR_EAGER_BINARY(Atan2, "atan2")
+SCALAR_EAGER_UNARY(Floor, "floor")
+SCALAR_EAGER_UNARY(Ceil, "ceil")
+SCALAR_EAGER_UNARY(Trunc, "trunc")
+
+Result<Datum> MaxElementWise(const std::vector<Datum>& args,
+ ElementWiseAggregateOptions options, ExecContext* ctx) {
+ return CallFunction("max_element_wise", args, &options, ctx);
+}
+
+Result<Datum> MinElementWise(const std::vector<Datum>& args,
+ ElementWiseAggregateOptions options, ExecContext* ctx) {
+ return CallFunction("min_element_wise", args, &options, ctx);
+}
+
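// Illustrative sketch, not from the diff: the element-wise aggregates defined
// just above accept any number of equal-length inputs. (PairwiseMax is a
// hypothetical helper name.)
#include <arrow/compute/api_scalar.h>

arrow::Result<arrow::Datum> PairwiseMax(const arrow::Datum& a,
                                        const arrow::Datum& b) {
  // With the default skip_nulls=true, a null in one input defers to the
  // other input's value instead of forcing a null result.
  return arrow::compute::MaxElementWise(
      {a, b}, arrow::compute::ElementWiseAggregateOptions::Defaults());
}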
// ----------------------------------------------------------------------
// Set-related operations
static Result<Datum> ExecSetLookup(const std::string& func_name, const Datum& data,
- const SetLookupOptions& options, ExecContext* ctx) {
- if (!options.value_set.is_arraylike()) {
+ const SetLookupOptions& options, ExecContext* ctx) {
+ if (!options.value_set.is_arraylike()) {
return Status::Invalid("Set lookup value set must be Array or ChunkedArray");
}
- std::shared_ptr<DataType> data_type;
- if (data.type()->id() == Type::DICTIONARY) {
- data_type =
- arrow::internal::checked_pointer_cast<DictionaryType>(data.type())->value_type();
- } else {
- data_type = data.type();
- }
-
- if (options.value_set.length() > 0 && !data_type->Equals(options.value_set.type())) {
+ std::shared_ptr<DataType> data_type;
+ if (data.type()->id() == Type::DICTIONARY) {
+ data_type =
+ arrow::internal::checked_pointer_cast<DictionaryType>(data.type())->value_type();
+ } else {
+ data_type = data.type();
+ }
+
+ if (options.value_set.length() > 0 && !data_type->Equals(options.value_set.type())) {
std::stringstream ss;
- ss << "Array type didn't match type of values set: " << data_type->ToString()
- << " vs " << options.value_set.type()->ToString();
+ ss << "Array type didn't match type of values set: " << data_type->ToString()
+ << " vs " << options.value_set.type()->ToString();
return Status::Invalid(ss.str());
}
return CallFunction(func_name, {data}, &options, ctx);
}
-Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
- ExecContext* ctx) {
- return ExecSetLookup("is_in", values, options, ctx);
-}
-
+Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
+ ExecContext* ctx) {
+ return ExecSetLookup("is_in", values, options, ctx);
+}
+
Result<Datum> IsIn(const Datum& values, const Datum& value_set, ExecContext* ctx) {
- return ExecSetLookup("is_in", values, SetLookupOptions{value_set}, ctx);
+ return ExecSetLookup("is_in", values, SetLookupOptions{value_set}, ctx);
+}
+
+Result<Datum> IndexIn(const Datum& values, const SetLookupOptions& options,
+ ExecContext* ctx) {
+ return ExecSetLookup("index_in", values, options, ctx);
}
-Result<Datum> IndexIn(const Datum& values, const SetLookupOptions& options,
- ExecContext* ctx) {
- return ExecSetLookup("index_in", values, options, ctx);
-}
-
Result<Datum> IndexIn(const Datum& values, const Datum& value_set, ExecContext* ctx) {
- return ExecSetLookup("index_in", values, SetLookupOptions{value_set}, ctx);
+ return ExecSetLookup("index_in", values, SetLookupOptions{value_set}, ctx);
}
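// Illustrative sketch, not from the diff: both wrappers above funnel into
// ExecSetLookup, which validates that value_set is array-like and
// type-compatible before dispatching. (MembershipMask is a hypothetical
// helper name.)
#include <arrow/compute/api_scalar.h>

arrow::Result<arrow::Datum> MembershipMask(const arrow::Datum& values,
                                           const arrow::Datum& value_set) {
  // skip_nulls=false keeps nulls in `values` as nulls in the boolean output.
  arrow::compute::SetLookupOptions options(value_set, /*skip_nulls=*/false);
  return arrow::compute::IsIn(values, options);
}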
// ----------------------------------------------------------------------
@@ -416,8 +416,8 @@ SCALAR_EAGER_BINARY(KleeneAnd, "and_kleene")
SCALAR_EAGER_BINARY(Or, "or")
SCALAR_EAGER_BINARY(KleeneOr, "or_kleene")
SCALAR_EAGER_BINARY(Xor, "xor")
-SCALAR_EAGER_BINARY(AndNot, "and_not")
-SCALAR_EAGER_BINARY(KleeneAndNot, "and_not_kleene")
+SCALAR_EAGER_BINARY(AndNot, "and_not")
+SCALAR_EAGER_BINARY(KleeneAndNot, "and_not_kleene")
// ----------------------------------------------------------------------
@@ -444,7 +444,7 @@ Result<Datum> Compare(const Datum& left, const Datum& right, CompareOptions opti
func_name = "less_equal";
break;
}
- return CallFunction(func_name, {left, right}, nullptr, ctx);
+ return CallFunction(func_name, {left, right}, nullptr, ctx);
}
// ----------------------------------------------------------------------
@@ -452,47 +452,47 @@ Result<Datum> Compare(const Datum& left, const Datum& right, CompareOptions opti
SCALAR_EAGER_UNARY(IsValid, "is_valid")
SCALAR_EAGER_UNARY(IsNull, "is_null")
-SCALAR_EAGER_UNARY(IsNan, "is_nan")
+SCALAR_EAGER_UNARY(IsNan, "is_nan")
Result<Datum> FillNull(const Datum& values, const Datum& fill_value, ExecContext* ctx) {
return CallFunction("fill_null", {values, fill_value}, ctx);
}
-Result<Datum> IfElse(const Datum& cond, const Datum& if_true, const Datum& if_false,
- ExecContext* ctx) {
- return CallFunction("if_else", {cond, if_true, if_false}, ctx);
-}
-
-Result<Datum> CaseWhen(const Datum& cond, const std::vector<Datum>& cases,
- ExecContext* ctx) {
- std::vector<Datum> args = {cond};
- args.reserve(cases.size() + 1);
- args.insert(args.end(), cases.begin(), cases.end());
- return CallFunction("case_when", args, ctx);
-}
-
-// ----------------------------------------------------------------------
-// Temporal functions
-
-SCALAR_EAGER_UNARY(Year, "year")
-SCALAR_EAGER_UNARY(Month, "month")
-SCALAR_EAGER_UNARY(Day, "day")
-SCALAR_EAGER_UNARY(DayOfYear, "day_of_year")
-SCALAR_EAGER_UNARY(ISOYear, "iso_year")
-SCALAR_EAGER_UNARY(ISOWeek, "iso_week")
-SCALAR_EAGER_UNARY(ISOCalendar, "iso_calendar")
-SCALAR_EAGER_UNARY(Quarter, "quarter")
-SCALAR_EAGER_UNARY(Hour, "hour")
-SCALAR_EAGER_UNARY(Minute, "minute")
-SCALAR_EAGER_UNARY(Second, "second")
-SCALAR_EAGER_UNARY(Millisecond, "millisecond")
-SCALAR_EAGER_UNARY(Microsecond, "microsecond")
-SCALAR_EAGER_UNARY(Nanosecond, "nanosecond")
-SCALAR_EAGER_UNARY(Subsecond, "subsecond")
-
-Result<Datum> DayOfWeek(const Datum& arg, DayOfWeekOptions options, ExecContext* ctx) {
- return CallFunction("day_of_week", {arg}, &options, ctx);
-}
-
+Result<Datum> IfElse(const Datum& cond, const Datum& if_true, const Datum& if_false,
+ ExecContext* ctx) {
+ return CallFunction("if_else", {cond, if_true, if_false}, ctx);
+}
+
+Result<Datum> CaseWhen(const Datum& cond, const std::vector<Datum>& cases,
+ ExecContext* ctx) {
+ std::vector<Datum> args = {cond};
+ args.reserve(cases.size() + 1);
+ args.insert(args.end(), cases.begin(), cases.end());
+ return CallFunction("case_when", args, ctx);
+}
+
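// Illustrative sketch, not from the diff: CaseWhen above merely prepends the
// condition datum to the case values before dispatching to "case_when";
// IfElse is the simpler two-branch form. (TwoBranch is a hypothetical helper
// name.)
#include <arrow/compute/api_scalar.h>

arrow::Result<arrow::Datum> TwoBranch(const arrow::Datum& cond,
                                      const arrow::Datum& if_true,
                                      const arrow::Datum& if_false) {
  // Nulls in `cond` propagate to the output rather than picking a branch.
  return arrow::compute::IfElse(cond, if_true, if_false);
}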
+// ----------------------------------------------------------------------
+// Temporal functions
+
+SCALAR_EAGER_UNARY(Year, "year")
+SCALAR_EAGER_UNARY(Month, "month")
+SCALAR_EAGER_UNARY(Day, "day")
+SCALAR_EAGER_UNARY(DayOfYear, "day_of_year")
+SCALAR_EAGER_UNARY(ISOYear, "iso_year")
+SCALAR_EAGER_UNARY(ISOWeek, "iso_week")
+SCALAR_EAGER_UNARY(ISOCalendar, "iso_calendar")
+SCALAR_EAGER_UNARY(Quarter, "quarter")
+SCALAR_EAGER_UNARY(Hour, "hour")
+SCALAR_EAGER_UNARY(Minute, "minute")
+SCALAR_EAGER_UNARY(Second, "second")
+SCALAR_EAGER_UNARY(Millisecond, "millisecond")
+SCALAR_EAGER_UNARY(Microsecond, "microsecond")
+SCALAR_EAGER_UNARY(Nanosecond, "nanosecond")
+SCALAR_EAGER_UNARY(Subsecond, "subsecond")
+
+Result<Datum> DayOfWeek(const Datum& arg, DayOfWeekOptions options, ExecContext* ctx) {
+ return CallFunction("day_of_week", {arg}, &options, ctx);
+}
+
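// Illustrative sketch, not from the diff: DayOfWeek above is the one temporal
// wrapper here that threads options to its kernel. (IsoDayNumbers is a
// hypothetical helper name.)
#include <arrow/compute/api_scalar.h>

arrow::Result<arrow::Datum> IsoDayNumbers(const arrow::Datum& timestamps) {
  // one_based_numbering=true with week_start=1 (Monday) yields ISO numbering:
  // Monday=1 .. Sunday=7.
  arrow::compute::DayOfWeekOptions options(/*one_based_numbering=*/true,
                                           /*week_start=*/1);
  return arrow::compute::DayOfWeek(timestamps, options);
}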
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h
index 8486cb0126f..e07e41569a1 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_scalar.h
@@ -37,125 +37,125 @@ namespace compute {
///
/// @{
-class ARROW_EXPORT ArithmeticOptions : public FunctionOptions {
- public:
- explicit ArithmeticOptions(bool check_overflow = false);
- constexpr static char const kTypeName[] = "ArithmeticOptions";
+class ARROW_EXPORT ArithmeticOptions : public FunctionOptions {
+ public:
+ explicit ArithmeticOptions(bool check_overflow = false);
+ constexpr static char const kTypeName[] = "ArithmeticOptions";
bool check_overflow;
};
-class ARROW_EXPORT ElementWiseAggregateOptions : public FunctionOptions {
- public:
- explicit ElementWiseAggregateOptions(bool skip_nulls = true);
- constexpr static char const kTypeName[] = "ElementWiseAggregateOptions";
- static ElementWiseAggregateOptions Defaults() { return ElementWiseAggregateOptions{}; }
-
- bool skip_nulls;
-};
-
-/// Options for var_args_join.
-class ARROW_EXPORT JoinOptions : public FunctionOptions {
- public:
- /// How to handle null values. (A null separator always results in a null output.)
- enum NullHandlingBehavior {
- /// A null in any input results in a null in the output.
- EMIT_NULL,
- /// Nulls in inputs are skipped.
- SKIP,
- /// Nulls in inputs are replaced with the replacement string.
- REPLACE,
- };
- explicit JoinOptions(NullHandlingBehavior null_handling = EMIT_NULL,
- std::string null_replacement = "");
- constexpr static char const kTypeName[] = "JoinOptions";
- static JoinOptions Defaults() { return JoinOptions(); }
- NullHandlingBehavior null_handling;
- std::string null_replacement;
-};
-
-class ARROW_EXPORT MatchSubstringOptions : public FunctionOptions {
- public:
- explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false);
- MatchSubstringOptions();
- constexpr static char const kTypeName[] = "MatchSubstringOptions";
-
- /// The exact substring (or regex, depending on kernel) to look for inside input values.
+class ARROW_EXPORT ElementWiseAggregateOptions : public FunctionOptions {
+ public:
+ explicit ElementWiseAggregateOptions(bool skip_nulls = true);
+ constexpr static char const kTypeName[] = "ElementWiseAggregateOptions";
+ static ElementWiseAggregateOptions Defaults() { return ElementWiseAggregateOptions{}; }
+
+ bool skip_nulls;
+};
+
+/// Options for var_args_join.
+class ARROW_EXPORT JoinOptions : public FunctionOptions {
+ public:
+ /// How to handle null values. (A null separator always results in a null output.)
+ enum NullHandlingBehavior {
+ /// A null in any input results in a null in the output.
+ EMIT_NULL,
+ /// Nulls in inputs are skipped.
+ SKIP,
+ /// Nulls in inputs are replaced with the replacement string.
+ REPLACE,
+ };
+ explicit JoinOptions(NullHandlingBehavior null_handling = EMIT_NULL,
+ std::string null_replacement = "");
+ constexpr static char const kTypeName[] = "JoinOptions";
+ static JoinOptions Defaults() { return JoinOptions(); }
+ NullHandlingBehavior null_handling;
+ std::string null_replacement;
+};
+
+class ARROW_EXPORT MatchSubstringOptions : public FunctionOptions {
+ public:
+ explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false);
+ MatchSubstringOptions();
+ constexpr static char const kTypeName[] = "MatchSubstringOptions";
+
+ /// The exact substring (or regex, depending on kernel) to look for inside input values.
+ std::string pattern;
+ /// Whether to perform a case-insensitive match.
+ bool ignore_case = false;
+};
+
+class ARROW_EXPORT SplitOptions : public FunctionOptions {
+ public:
+ explicit SplitOptions(int64_t max_splits = -1, bool reverse = false);
+ constexpr static char const kTypeName[] = "SplitOptions";
+
+ /// Maximum number of splits allowed, or unlimited when -1
+ int64_t max_splits;
+ /// Start splitting from the end of the string (only relevant when max_splits != -1)
+ bool reverse;
+};
+
+class ARROW_EXPORT SplitPatternOptions : public FunctionOptions {
+ public:
+ explicit SplitPatternOptions(std::string pattern, int64_t max_splits = -1,
+ bool reverse = false);
+ SplitPatternOptions();
+ constexpr static char const kTypeName[] = "SplitPatternOptions";
+
+ /// The exact substring to split on.
+ std::string pattern;
+ /// Maximum number of splits allowed, or unlimited when -1
+ int64_t max_splits;
+ /// Start splitting from the end of the string (only relevant when max_splits != -1)
+ bool reverse;
+};
+
+class ARROW_EXPORT ReplaceSliceOptions : public FunctionOptions {
+ public:
+ explicit ReplaceSliceOptions(int64_t start, int64_t stop, std::string replacement);
+ ReplaceSliceOptions();
+ constexpr static char const kTypeName[] = "ReplaceSliceOptions";
+
+ /// Index to start slicing at
+ int64_t start;
+ /// Index to stop slicing at
+ int64_t stop;
+ /// String to replace the slice with
+ std::string replacement;
+};
+
+class ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions {
+ public:
+ explicit ReplaceSubstringOptions(std::string pattern, std::string replacement,
+ int64_t max_replacements = -1);
+ ReplaceSubstringOptions();
+ constexpr static char const kTypeName[] = "ReplaceSubstringOptions";
+
+ /// Pattern to match, literal, or regular expression depending on which kernel is used
+ std::string pattern;
+ /// String to replace the pattern with
+ std::string replacement;
+ /// Max number of substrings to replace (-1 means unbounded)
+ int64_t max_replacements;
+};
+
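// Illustrative sketch, not from the diff: options classes such as
// ReplaceSubstringOptions above are passed by pointer to the generic
// CallFunction entry point when no typed wrapper exists. (StripDashes is a
// hypothetical helper name.)
#include <arrow/compute/api.h>

arrow::Result<arrow::Datum> StripDashes(const arrow::Datum& strings) {
  // max_replacements=-1 replaces every occurrence of "-" with "".
  arrow::compute::ReplaceSubstringOptions options("-", "",
                                                  /*max_replacements=*/-1);
  return arrow::compute::CallFunction("replace_substring", {strings}, &options);
}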
+class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions {
+ public:
+ explicit ExtractRegexOptions(std::string pattern);
+ ExtractRegexOptions();
+ constexpr static char const kTypeName[] = "ExtractRegexOptions";
+
+ /// Regular expression with named capture fields
std::string pattern;
- /// Whether to perform a case-insensitive match.
- bool ignore_case = false;
};
-class ARROW_EXPORT SplitOptions : public FunctionOptions {
- public:
- explicit SplitOptions(int64_t max_splits = -1, bool reverse = false);
- constexpr static char const kTypeName[] = "SplitOptions";
-
- /// Maximum number of splits allowed, or unlimited when -1
- int64_t max_splits;
- /// Start splitting from the end of the string (only relevant when max_splits != -1)
- bool reverse;
-};
-
-class ARROW_EXPORT SplitPatternOptions : public FunctionOptions {
- public:
- explicit SplitPatternOptions(std::string pattern, int64_t max_splits = -1,
- bool reverse = false);
- SplitPatternOptions();
- constexpr static char const kTypeName[] = "SplitPatternOptions";
-
- /// The exact substring to split on.
- std::string pattern;
- /// Maximum number of splits allowed, or unlimited when -1
- int64_t max_splits;
- /// Start splitting from the end of the string (only relevant when max_splits != -1)
- bool reverse;
-};
-
-class ARROW_EXPORT ReplaceSliceOptions : public FunctionOptions {
- public:
- explicit ReplaceSliceOptions(int64_t start, int64_t stop, std::string replacement);
- ReplaceSliceOptions();
- constexpr static char const kTypeName[] = "ReplaceSliceOptions";
-
- /// Index to start slicing at
- int64_t start;
- /// Index to stop slicing at
- int64_t stop;
- /// String to replace the slice with
- std::string replacement;
-};
-
-class ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions {
- public:
- explicit ReplaceSubstringOptions(std::string pattern, std::string replacement,
- int64_t max_replacements = -1);
- ReplaceSubstringOptions();
- constexpr static char const kTypeName[] = "ReplaceSubstringOptions";
-
- /// Pattern to match, literal, or regular expression depending on which kernel is used
- std::string pattern;
- /// String to replace the pattern with
- std::string replacement;
- /// Max number of substrings to replace (-1 means unbounded)
- int64_t max_replacements;
-};
-
-class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions {
- public:
- explicit ExtractRegexOptions(std::string pattern);
- ExtractRegexOptions();
- constexpr static char const kTypeName[] = "ExtractRegexOptions";
-
- /// Regular expression with named capture fields
- std::string pattern;
-};
-
/// Options for IsIn and IndexIn functions
-class ARROW_EXPORT SetLookupOptions : public FunctionOptions {
- public:
- explicit SetLookupOptions(Datum value_set, bool skip_nulls = false);
- SetLookupOptions();
- constexpr static char const kTypeName[] = "SetLookupOptions";
+class ARROW_EXPORT SetLookupOptions : public FunctionOptions {
+ public:
+ explicit SetLookupOptions(Datum value_set, bool skip_nulls = false);
+ SetLookupOptions();
+ constexpr static char const kTypeName[] = "SetLookupOptions";
/// The set of values to look up input values into.
Datum value_set;
@@ -168,47 +168,47 @@ class ARROW_EXPORT SetLookupOptions : public FunctionOptions {
bool skip_nulls;
};
-class ARROW_EXPORT StrptimeOptions : public FunctionOptions {
- public:
- explicit StrptimeOptions(std::string format, TimeUnit::type unit);
- StrptimeOptions();
- constexpr static char const kTypeName[] = "StrptimeOptions";
+class ARROW_EXPORT StrptimeOptions : public FunctionOptions {
+ public:
+ explicit StrptimeOptions(std::string format, TimeUnit::type unit);
+ StrptimeOptions();
+ constexpr static char const kTypeName[] = "StrptimeOptions";
std::string format;
TimeUnit::type unit;
};
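// Illustrative sketch, not from the diff: StrptimeOptions above drives the
// "strptime" kernel through CallFunction. (ParseIsoDates is a hypothetical
// helper name.)
#include <arrow/compute/api.h>

arrow::Result<arrow::Datum> ParseIsoDates(const arrow::Datum& strings) {
  // Parses "2021-07-01"-style strings into second-resolution timestamps.
  arrow::compute::StrptimeOptions options("%Y-%m-%d", arrow::TimeUnit::SECOND);
  return arrow::compute::CallFunction("strptime", {strings}, &options);
}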
-class ARROW_EXPORT PadOptions : public FunctionOptions {
- public:
- explicit PadOptions(int64_t width, std::string padding = " ");
- PadOptions();
- constexpr static char const kTypeName[] = "PadOptions";
-
- /// The desired string length.
- int64_t width;
- /// What to pad the string with. Should be one codepoint (Unicode)/byte (ASCII).
- std::string padding;
-};
-
-class ARROW_EXPORT TrimOptions : public FunctionOptions {
- public:
- explicit TrimOptions(std::string characters);
- TrimOptions();
- constexpr static char const kTypeName[] = "TrimOptions";
-
- /// The individual characters that can be trimmed from the string.
- std::string characters;
-};
-
-class ARROW_EXPORT SliceOptions : public FunctionOptions {
- public:
- explicit SliceOptions(int64_t start, int64_t stop = std::numeric_limits<int64_t>::max(),
- int64_t step = 1);
- SliceOptions();
- constexpr static char const kTypeName[] = "SliceOptions";
- int64_t start, stop, step;
-};
-
+class ARROW_EXPORT PadOptions : public FunctionOptions {
+ public:
+ explicit PadOptions(int64_t width, std::string padding = " ");
+ PadOptions();
+ constexpr static char const kTypeName[] = "PadOptions";
+
+ /// The desired string length.
+ int64_t width;
+ /// What to pad the string with. Should be one codepoint (Unicode)/byte (ASCII).
+ std::string padding;
+};
+
+class ARROW_EXPORT TrimOptions : public FunctionOptions {
+ public:
+ explicit TrimOptions(std::string characters);
+ TrimOptions();
+ constexpr static char const kTypeName[] = "TrimOptions";
+
+ /// The individual characters that can be trimmed from the string.
+ std::string characters;
+};
+
+class ARROW_EXPORT SliceOptions : public FunctionOptions {
+ public:
+ explicit SliceOptions(int64_t start, int64_t stop = std::numeric_limits<int64_t>::max(),
+ int64_t step = 1);
+ SliceOptions();
+ constexpr static char const kTypeName[] = "SliceOptions";
+ int64_t start, stop, step;
+};
+
enum CompareOperator : int8_t {
EQUAL,
NOT_EQUAL,
@@ -218,57 +218,57 @@ enum CompareOperator : int8_t {
LESS_EQUAL,
};
-struct ARROW_EXPORT CompareOptions {
+struct ARROW_EXPORT CompareOptions {
explicit CompareOptions(CompareOperator op) : op(op) {}
- CompareOptions() : CompareOptions(CompareOperator::EQUAL) {}
+ CompareOptions() : CompareOptions(CompareOperator::EQUAL) {}
enum CompareOperator op;
};
-class ARROW_EXPORT MakeStructOptions : public FunctionOptions {
- public:
- MakeStructOptions(std::vector<std::string> n, std::vector<bool> r,
- std::vector<std::shared_ptr<const KeyValueMetadata>> m);
- explicit MakeStructOptions(std::vector<std::string> n);
- MakeStructOptions();
- constexpr static char const kTypeName[] = "MakeStructOptions";
-
- /// Names for wrapped columns
- std::vector<std::string> field_names;
-
- /// Nullability bits for wrapped columns
- std::vector<bool> field_nullability;
-
- /// Metadata attached to wrapped columns
- std::vector<std::shared_ptr<const KeyValueMetadata>> field_metadata;
-};
-
-struct ARROW_EXPORT DayOfWeekOptions : public FunctionOptions {
- public:
- explicit DayOfWeekOptions(bool one_based_numbering = false, uint32_t week_start = 1);
- constexpr static char const kTypeName[] = "DayOfWeekOptions";
- static DayOfWeekOptions Defaults() { return DayOfWeekOptions{}; }
-
- /// Number days from 1 if true and from 0 if false
- bool one_based_numbering;
- /// What day does the week start with (Monday=1, Sunday=7)
- uint32_t week_start;
-};
-
+class ARROW_EXPORT MakeStructOptions : public FunctionOptions {
+ public:
+ MakeStructOptions(std::vector<std::string> n, std::vector<bool> r,
+ std::vector<std::shared_ptr<const KeyValueMetadata>> m);
+ explicit MakeStructOptions(std::vector<std::string> n);
+ MakeStructOptions();
+ constexpr static char const kTypeName[] = "MakeStructOptions";
+
+ /// Names for wrapped columns
+ std::vector<std::string> field_names;
+
+ /// Nullability bits for wrapped columns
+ std::vector<bool> field_nullability;
+
+ /// Metadata attached to wrapped columns
+ std::vector<std::shared_ptr<const KeyValueMetadata>> field_metadata;
+};
+
+struct ARROW_EXPORT DayOfWeekOptions : public FunctionOptions {
+ public:
+ explicit DayOfWeekOptions(bool one_based_numbering = false, uint32_t week_start = 1);
+ constexpr static char const kTypeName[] = "DayOfWeekOptions";
+ static DayOfWeekOptions Defaults() { return DayOfWeekOptions{}; }
+
+ /// Number days from 1 if true and from 0 if false
+ bool one_based_numbering;
+ /// What day does the week start with (Monday=1, Sunday=7)
+ uint32_t week_start;
+};
+
/// @}
-/// \brief Get the absolute value of a value.
-///
-/// If argument is null the result will be null.
-///
-/// \param[in] arg the value transformed
-/// \param[in] options arithmetic options (overflow handling), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise absolute value
-ARROW_EXPORT
-Result<Datum> AbsoluteValue(const Datum& arg,
- ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
+/// \brief Get the absolute value of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the value transformed
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise absolute value
+ARROW_EXPORT
+Result<Datum> AbsoluteValue(const Datum& arg,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
/// \brief Add two values together. Array values must be the same length. If
/// either addend is null the result will be null.
///
@@ -322,233 +322,233 @@ Result<Datum> Divide(const Datum& left, const Datum& right,
ArithmeticOptions options = ArithmeticOptions(),
ExecContext* ctx = NULLPTR);
-/// \brief Negate values.
-///
-/// If argument is null the result will be null.
-///
-/// \param[in] arg the value negated
-/// \param[in] options arithmetic options (overflow handling), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise negation
-ARROW_EXPORT
-Result<Datum> Negate(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Raise the values of base array to the power of the exponent array values.
-/// Array values must be the same length. If either base or exponent is null the result
-/// will be null.
-///
-/// \param[in] left the base
-/// \param[in] right the exponent
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise base value raised to the power of exponent
-ARROW_EXPORT
-Result<Datum> Power(const Datum& left, const Datum& right,
- ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Left shift the left array by the right array. Array values must be the
-/// same length. If either operand is null, the result will be null.
-///
-/// \param[in] left the value to shift
-/// \param[in] right the value to shift by
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise left value shifted left by the right value
-ARROW_EXPORT
-Result<Datum> ShiftLeft(const Datum& left, const Datum& right,
- ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Right shift the left array by the right array. Array values must be the
-/// same length. If either operand is null, the result will be null. Performs a
-/// logical shift for unsigned values, and an arithmetic shift for signed values.
-///
-/// \param[in] left the value to shift
-/// \param[in] right the value to shift by
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise left value shifted right by the right value
-ARROW_EXPORT
-Result<Datum> ShiftRight(const Datum& left, const Datum& right,
- ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the sine of the array values.
-/// \param[in] arg The values to compute the sine for.
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise sine of the values
-ARROW_EXPORT
-Result<Datum> Sin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the cosine of the array values.
-/// \param[in] arg The values to compute the cosine for.
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise cosine of the values
-ARROW_EXPORT
-Result<Datum> Cos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the inverse sine (arcsine) of the array values.
-/// \param[in] arg The values to compute the inverse sine for.
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise inverse sine of the values
-ARROW_EXPORT
-Result<Datum> Asin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the inverse cosine (arccosine) of the array values.
-/// \param[in] arg The values to compute the inverse cosine for.
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise inverse cosine of the values
-ARROW_EXPORT
-Result<Datum> Acos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the tangent of the array values.
-/// \param[in] arg The values to compute the tangent for.
-/// \param[in] options arithmetic options (enable/disable overflow checking), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise tangent of the values
-ARROW_EXPORT
-Result<Datum> Tan(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the inverse tangent (arctangent) of the array values.
-/// \param[in] arg The values to compute the inverse tangent for.
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise inverse tangent of the values
-ARROW_EXPORT
-Result<Datum> Atan(const Datum& arg, ExecContext* ctx = NULLPTR);
-
-/// \brief Compute the inverse tangent (arctangent) of y/x, using the
-/// argument signs to determine the correct quadrant.
-/// \param[in] y The y-values to compute the inverse tangent for.
-/// \param[in] x The x-values to compute the inverse tangent for.
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise inverse tangent of the values
-ARROW_EXPORT
-Result<Datum> Atan2(const Datum& y, const Datum& x, ExecContext* ctx = NULLPTR);
-
-/// \brief Get the natural log of a value.
-///
-/// If argument is null the result will be null.
-///
-/// \param[in] arg The values to compute the logarithm for.
-/// \param[in] options arithmetic options (overflow handling), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise natural log
-ARROW_EXPORT
-Result<Datum> Ln(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Get the log base 10 of a value.
-///
-/// If argument is null the result will be null.
-///
-/// \param[in] arg The values to compute the logarithm for.
-/// \param[in] options arithmetic options (overflow handling), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise log base 10
-ARROW_EXPORT
-Result<Datum> Log10(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Get the log base 2 of a value.
-///
-/// If argument is null the result will be null.
-///
-/// \param[in] arg The values to compute the logarithm for.
-/// \param[in] options arithmetic options (overflow handling), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise log base 2
-ARROW_EXPORT
-Result<Datum> Log2(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Get the natural log of (1 + value).
-///
-/// If argument is null the result will be null.
-/// This function may be more accurate than Log(1 + value) for values close to zero.
-///
-/// \param[in] arg The values to compute the logarithm for.
-/// \param[in] options arithmetic options (overflow handling), optional
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise natural log
-ARROW_EXPORT
-Result<Datum> Log1p(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Round to the nearest integer less than or equal to the
-/// argument. Array values can be of arbitrary length. If argument is null the
-/// result will be null.
-///
-/// \param[in] arg the value to round
-/// \param[in] ctx the function execution context, optional
-/// \return the rounded value
-ARROW_EXPORT
-Result<Datum> Floor(const Datum& arg, ExecContext* ctx = NULLPTR);
-
-/// \brief Round to the nearest integer greater than or equal to the
-/// argument. Array values can be of arbitrary length. If argument is null the
-/// result will be null.
-///
-/// \param[in] arg the value to round
-/// \param[in] ctx the function execution context, optional
-/// \return the rounded value
-ARROW_EXPORT
-Result<Datum> Ceil(const Datum& arg, ExecContext* ctx = NULLPTR);
-
-/// \brief Get the integral part without fractional digits. Array values can be
-/// of arbitrary length. If argument is null the result will be null.
-///
-/// \param[in] arg the value to truncate
-/// \param[in] ctx the function execution context, optional
-/// \return the truncated value
-ARROW_EXPORT
-Result<Datum> Trunc(const Datum& arg, ExecContext* ctx = NULLPTR);
-
-/// \brief Find the element-wise maximum of any number of arrays or scalars.
-/// Array values must be the same length.
-///
-/// \param[in] args arrays or scalars to operate on.
-/// \param[in] options options for handling nulls, optional
-/// \param[in] ctx the function execution context, optional
-/// \return the element-wise maximum
-ARROW_EXPORT
-Result<Datum> MaxElementWise(
- const std::vector<Datum>& args,
- ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Find the element-wise minimum of any number of arrays or scalars.
-/// Array values must be the same length.
-///
-/// \param[in] args arrays or scalars to operate on.
-/// \param[in] options options for handling nulls, optional
-/// \param[in] ctx the function execution context, optional
-/// \return the element-wise minimum
-ARROW_EXPORT
-Result<Datum> MinElementWise(
- const std::vector<Datum>& args,
- ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief Get the sign of a value. Array values can be of arbitrary length. If argument
-/// is null the result will be null.
-///
-/// \param[in] arg the value to extract sign from
-/// \param[in] ctx the function execution context, optional
-/// \return the elementwise sign function
-ARROW_EXPORT
-Result<Datum> Sign(const Datum& arg, ExecContext* ctx = NULLPTR);
-
+/// \brief Negate values.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the value negated
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise negation
+ARROW_EXPORT
+Result<Datum> Negate(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Raise the values of base array to the power of the exponent array values.
+/// Array values must be the same length. If either base or exponent is null the result
+/// will be null.
+///
+/// \param[in] left the base
+/// \param[in] right the exponent
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise base value raised to the power of exponent
+ARROW_EXPORT
+Result<Datum> Power(const Datum& left, const Datum& right,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Left shift the left array by the right array. Array values must be the
+/// same length. If either operand is null, the result will be null.
+///
+/// \param[in] left the value to shift
+/// \param[in] right the value to shift by
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise left value shifted left by the right value
+ARROW_EXPORT
+Result<Datum> ShiftLeft(const Datum& left, const Datum& right,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Right shift the left array by the right array. Array values must be the
+/// same length. If either operand is null, the result will be null. Performs a
+/// logical shift for unsigned values, and an arithmetic shift for signed values.
+///
+/// \param[in] left the value to shift
+/// \param[in] right the value to shift by
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise left value shifted right by the right value
+ARROW_EXPORT
+Result<Datum> ShiftRight(const Datum& left, const Datum& right,
+ ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the sine of the array values.
+/// \param[in] arg The values to compute the sine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise sine of the values
+ARROW_EXPORT
+Result<Datum> Sin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the cosine of the array values.
+/// \param[in] arg The values to compute the cosine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise cosine of the values
+ARROW_EXPORT
+Result<Datum> Cos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse sine (arcsine) of the array values.
+/// \param[in] arg The values to compute the inverse sine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse sine of the values
+ARROW_EXPORT
+Result<Datum> Asin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse cosine (arccosine) of the array values.
+/// \param[in] arg The values to compute the inverse cosine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse cosine of the values
+ARROW_EXPORT
+Result<Datum> Acos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the tangent of the array values.
+/// \param[in] arg The values to compute the tangent for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise tangent of the values
+ARROW_EXPORT
+Result<Datum> Tan(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse tangent (arctangent) of the array values.
+/// \param[in] arg The values to compute the inverse tangent for.
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse tangent of the values
+ARROW_EXPORT
+Result<Datum> Atan(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse tangent (arctangent) of y/x, using the
+/// argument signs to determine the correct quadrant.
+/// \param[in] y The y-values to compute the inverse tangent for.
+/// \param[in] x The x-values to compute the inverse tangent for.
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse tangent of the values
+ARROW_EXPORT
+Result<Datum> Atan2(const Datum& y, const Datum& x, ExecContext* ctx = NULLPTR);
+
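// Illustrative sketch, not from the diff: contrasting the two arctangent
// entry points declared above. (Heading is a hypothetical helper name.)
#include <arrow/compute/api_scalar.h>

arrow::Result<arrow::Datum> Heading(const arrow::Datum& y, const arrow::Datum& x) {
  // Unlike Atan(y/x), Atan2 sees both signs, so quadrant information
  // survives: (y=-1, x=-1) maps to -3*pi/4, not pi/4.
  return arrow::compute::Atan2(y, x);
}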
+/// \brief Get the natural log of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise natural log
+ARROW_EXPORT
+Result<Datum> Ln(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Get the log base 10 of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise log base 10
+ARROW_EXPORT
+Result<Datum> Log10(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Get the log base 2 of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise log base 2
+ARROW_EXPORT
+Result<Datum> Log2(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Get the natural log of (1 + value).
+///
+/// If argument is null the result will be null.
+/// This function may be more accurate than Log(1 + value) for values close to zero.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise natural log
+ARROW_EXPORT
+Result<Datum> Log1p(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Round to the nearest integer less than or equal to the
+/// argument. Array values can be of arbitrary length. If argument is null the
+/// result will be null.
+///
+/// \param[in] arg the value to round
+/// \param[in] ctx the function execution context, optional
+/// \return the rounded value
+ARROW_EXPORT
+Result<Datum> Floor(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Round to the nearest integer greater than or equal to the
+/// argument. Array values can be of arbitrary length. If argument is null the
+/// result will be null.
+///
+/// \param[in] arg the value to round
+/// \param[in] ctx the function execution context, optional
+/// \return the rounded value
+ARROW_EXPORT
+Result<Datum> Ceil(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Get the integral part without fractional digits. Array values can be
+/// of arbitrary length. If argument is null the result will be null.
+///
+/// \param[in] arg the value to truncate
+/// \param[in] ctx the function execution context, optional
+/// \return the truncated value
+ARROW_EXPORT
+Result<Datum> Trunc(const Datum& arg, ExecContext* ctx = NULLPTR);
+
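// Illustrative sketch, not from the diff: the three rounding functions
// declared above differ only in direction; on -2.5 they yield Floor -> -3,
// Ceil -> -2, Trunc -> -2. (RoundTowardZero is a hypothetical helper name.)
#include <arrow/compute/api_scalar.h>

arrow::Result<arrow::Datum> RoundTowardZero(const arrow::Datum& values) {
  // Trunc keeps the integral part, i.e. rounds toward zero for both signs.
  return arrow::compute::Trunc(values);
}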
+/// \brief Find the element-wise maximum of any number of arrays or scalars.
+/// Array values must be the same length.
+///
+/// \param[in] args arrays or scalars to operate on.
+/// \param[in] options options for handling nulls, optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise maximum
+ARROW_EXPORT
+Result<Datum> MaxElementWise(
+ const std::vector<Datum>& args,
+ ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Find the element-wise minimum of any number of arrays or scalars.
+/// Array values must be the same length.
+///
+/// \param[in] args arrays or scalars to operate on.
+/// \param[in] options options for handling nulls, optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise minimum
+ARROW_EXPORT
+Result<Datum> MinElementWise(
+ const std::vector<Datum>& args,
+ ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Get the sign of a value. Array values can be of arbitrary length. If argument
+/// is null the result will be null.
+///
+/// \param[in] arg the value to extract sign from
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise sign function
+ARROW_EXPORT
+Result<Datum> Sign(const Datum& arg, ExecContext* ctx = NULLPTR);
+
/// \brief Compare a numeric array with a scalar.
///
/// \param[in] left datum to compare, must be an Array
@@ -562,10 +562,10 @@ Result<Datum> Sign(const Datum& arg, ExecContext* ctx = NULLPTR);
///
/// \since 1.0.0
/// \note API not yet finalized
-ARROW_DEPRECATED("Deprecated in 5.0.0. Use each compare function directly")
+ARROW_DEPRECATED("Deprecated in 5.0.0. Use each compare function directly")
ARROW_EXPORT
-Result<Datum> Compare(const Datum& left, const Datum& right, CompareOptions options,
- ExecContext* ctx = NULLPTR);
+Result<Datum> Compare(const Datum& left, const Datum& right, CompareOptions options,
+ ExecContext* ctx = NULLPTR);
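// Illustrative sketch, not from the diff: with Compare deprecated, callers
// invoke the named kernels directly, as the implementation above does.
// (LessEqualMask is a hypothetical helper name.)
#include <arrow/compute/api.h>

arrow::Result<arrow::Datum> LessEqualMask(const arrow::Datum& left,
                                          const arrow::Datum& right) {
  // Same result as Compare(left, right, CompareOptions(LESS_EQUAL)).
  return arrow::compute::CallFunction("less_equal", {left, right});
}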
/// \brief Invert the values of a boolean datum
/// \param[in] value datum to invert
@@ -580,8 +580,8 @@ Result<Datum> Invert(const Datum& value, ExecContext* ctx = NULLPTR);
/// \brief Element-wise AND of two boolean datums which always propagates nulls
/// (null and false is null).
///
-/// \param[in] left left operand
-/// \param[in] right right operand
+/// \param[in] left left operand
+/// \param[in] right right operand
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
@@ -593,8 +593,8 @@ Result<Datum> And(const Datum& left, const Datum& right, ExecContext* ctx = NULL
/// \brief Element-wise AND of two boolean datums with a Kleene truth table
/// (null and false is false).
///
-/// \param[in] left left operand
-/// \param[in] right right operand
+/// \param[in] left left operand
+/// \param[in] right right operand
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
@@ -607,8 +607,8 @@ Result<Datum> KleeneAnd(const Datum& left, const Datum& right,
/// \brief Element-wise OR of two boolean datums which always propagates nulls
/// (null or true is null).
///
-/// \param[in] left left operand
-/// \param[in] right right operand
+/// \param[in] left left operand
+/// \param[in] right right operand
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
@@ -620,8 +620,8 @@ Result<Datum> Or(const Datum& left, const Datum& right, ExecContext* ctx = NULLP
/// \brief Element-wise OR of two boolean datums with a Kleene truth table
/// (null or true is true).
///
-/// \param[in] left left operand
-/// \param[in] right right operand
+/// \param[in] left left operand
+/// \param[in] right right operand
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
@@ -631,8 +631,8 @@ ARROW_EXPORT
Result<Datum> KleeneOr(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
/// \brief Element-wise XOR of two boolean datums
-/// \param[in] left left operand
-/// \param[in] right right operand
+/// \param[in] left left operand
+/// \param[in] right right operand
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
@@ -641,49 +641,49 @@ Result<Datum> KleeneOr(const Datum& left, const Datum& right, ExecContext* ctx =
ARROW_EXPORT
Result<Datum> Xor(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
-/// \brief Element-wise AND NOT of two boolean datums which always propagates nulls
-/// (null and not true is null).
-///
-/// \param[in] left left operand
-/// \param[in] right right operand
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 3.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> AndNot(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
-
-/// \brief Element-wise AND NOT of two boolean datums with a Kleene truth table
-/// (false and not null is false, null and not true is false).
-///
-/// \param[in] left left operand
-/// \param[in] right right operand
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 3.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> KleeneAndNot(const Datum& left, const Datum& right,
- ExecContext* ctx = NULLPTR);
-
+/// \brief Element-wise AND NOT of two boolean datums which always propagates nulls
+/// (null and not true is null).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> AndNot(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise AND NOT of two boolean datums with a Kleene truth table
+/// (false and not null is false, null and not true is false).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> KleeneAndNot(const Datum& left, const Datum& right,
+ ExecContext* ctx = NULLPTR);
+
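// Illustrative sketch, not from the diff: the two and_not flavours declared
// above differ only in null handling. (KeepUnlessDropped is a hypothetical
// helper name.)
#include <arrow/compute/api_scalar.h>

arrow::Result<arrow::Datum> KeepUnlessDropped(const arrow::Datum& keep,
                                              const arrow::Datum& drop) {
  // Kleene semantics: once the outcome is decided (keep == false or
  // drop == true), a null on the other side no longer yields a null result.
  return arrow::compute::KleeneAndNot(keep, drop);
}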
/// \brief IsIn returns true for each element of `values` that is contained in
/// `value_set`
///
-/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
+/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
///
/// \param[in] values array-like input to look up in value_set
-/// \param[in] options SetLookupOptions
+/// \param[in] options SetLookupOptions
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
- ExecContext* ctx = NULLPTR);
-ARROW_EXPORT
+Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
+ ExecContext* ctx = NULLPTR);
+ARROW_EXPORT
Result<Datum> IsIn(const Datum& values, const Datum& value_set,
ExecContext* ctx = NULLPTR);
@@ -695,19 +695,19 @@ Result<Datum> IsIn(const Datum& values, const Datum& value_set,
/// For example given values = [99, 42, 3, null] and
/// value_set = [3, 3, 99], the output will be = [1, null, 0, null]
///
-/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
+/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
///
/// \param[in] values array-like input
-/// \param[in] options SetLookupOptions
+/// \param[in] options SetLookupOptions
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> IndexIn(const Datum& values, const SetLookupOptions& options,
- ExecContext* ctx = NULLPTR);
-ARROW_EXPORT
+Result<Datum> IndexIn(const Datum& values, const SetLookupOptions& options,
+ ExecContext* ctx = NULLPTR);
+ARROW_EXPORT
Result<Datum> IndexIn(const Datum& values, const Datum& value_set,
ExecContext* ctx = NULLPTR);
@@ -735,18 +735,18 @@ Result<Datum> IsValid(const Datum& values, ExecContext* ctx = NULLPTR);
ARROW_EXPORT
Result<Datum> IsNull(const Datum& values, ExecContext* ctx = NULLPTR);
-/// \brief IsNan returns true for each element of `values` that is NaN,
-/// false otherwise
-///
-/// \param[in] values input to look for NaN
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 3.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> IsNan(const Datum& values, ExecContext* ctx = NULLPTR);
-
+/// \brief IsNan returns true for each element of `values` that is NaN,
+/// false otherwise
+///
+/// \param[in] values input to look for NaN
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IsNan(const Datum& values, ExecContext* ctx = NULLPTR);
+
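// Illustrative sketch, not from the diff: IsNan complements IsNull above,
// since NaN is a valid (non-null) floating-point value. (NanMask is a
// hypothetical helper name.)
#include <arrow/compute/api_scalar.h>

arrow::Result<arrow::Datum> NanMask(const arrow::Datum& values) {
  // Emits true exactly for the NaN slots of a floating-point input.
  return arrow::compute::IsNan(values);
}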
/// \brief FillNull replaces each null element in `values`
/// with `fill_value`
///
@@ -762,228 +762,228 @@ ARROW_EXPORT
Result<Datum> FillNull(const Datum& values, const Datum& fill_value,
ExecContext* ctx = NULLPTR);
-/// \brief IfElse returns elements chosen from `left` or `right`
-/// depending on `cond`. `null` values in `cond` will be propagated to the result
-///
-/// \param[in] cond `Boolean` condition Scalar/Array
-/// \param[in] left Scalar/Array
-/// \param[in] right Scalar/Array
-/// \param[in] ctx the function execution context, optional
-///
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> IfElse(const Datum& cond, const Datum& left, const Datum& right,
- ExecContext* ctx = NULLPTR);
-
-/// \brief CaseWhen behaves like a switch/case or if-else if-else statement: for
-/// each row, select the first value for which the corresponding condition is
-/// true, or (if given) select the 'else' value, else emit null. Note that a
-/// null condition is the same as false.
-///
-/// \param[in] cond Conditions (Boolean)
-/// \param[in] cases Values (any type), along with an optional 'else' value.
-/// \param[in] ctx the function execution context, optional
-///
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> CaseWhen(const Datum& cond, const std::vector<Datum>& cases,
- ExecContext* ctx = NULLPTR);
-
-/// \brief Year returns year for each element of `values`
-///
-/// \param[in] values input to extract year from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Year(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Month returns month for each element of `values`.
-/// Month is encoded as January=1, December=12
-///
-/// \param[in] values input to extract month from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Month(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Day returns day number for each element of `values`
-///
-/// \param[in] values input to extract day from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Day(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief DayOfWeek returns the day of the week number for each element of
-/// `values`.
-///
-/// By default the week starts on Monday, denoted by 0, and ends on Sunday,
-/// denoted by 6. The start day of the week (Monday=1, Sunday=7) and the
-/// numbering base (0 or 1) can be set using DayOfWeekOptions.
-///
-/// \param[in] values input to extract number of the day of the week from
-/// \param[in] options for setting start of the week and day numbering
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT Result<Datum> DayOfWeek(const Datum& values,
- DayOfWeekOptions options = DayOfWeekOptions(),
- ExecContext* ctx = NULLPTR);
-
-/// \brief DayOfYear returns the day of the year number for each element of `values`.
-/// January 1st maps to day number 1, February 1st to 32, etc.
-///
-/// \param[in] values input to extract number of day of the year from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT Result<Datum> DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief ISOYear returns ISO year number for each element of `values`.
-/// First week of an ISO year has the majority (4 or more) of its days in January.
-///
-/// \param[in] values input to extract ISO year from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> ISOYear(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief ISOWeek returns ISO week of year number for each element of `values`.
-/// First ISO week has the majority (4 or more) of its days in January.
-/// Week of the year starts with 1 and can run up to 53.
-///
-/// \param[in] values input to extract ISO week of year from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT Result<Datum> ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief ISOCalendar returns a (ISO year, ISO week, ISO day of week) struct for
-/// each element of `values`.
-/// ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.
-///
-/// \param[in] values input to extract the ISO calendar struct from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT Result<Datum> ISOCalendar(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Quarter returns the quarter of year number for each element of `values`
-/// First quarter maps to 1 and fourth quarter maps to 4.
-///
-/// \param[in] values input to extract quarter of year from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT Result<Datum> Quarter(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Hour returns hour value for each element of `values`
-///
-/// \param[in] values input to extract hour from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Hour(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Minute returns minutes value for each element of `values`
-///
-/// \param[in] values input to extract minutes from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Minute(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Second returns seconds value for each element of `values`
-///
-/// \param[in] values input to extract seconds from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Second(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Millisecond returns number of milliseconds since the last full second
-/// for each element of `values`
-///
-/// \param[in] values input to extract milliseconds from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Millisecond(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Microsecond returns number of microseconds since the last full millisecond
-/// for each element of `values`
-///
-/// \param[in] values input to extract microseconds from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Microsecond(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Nanosecond returns number of nanoseconds since the last full microsecond
-/// for each element of `values`
-///
-/// \param[in] values input to extract nanoseconds from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> Nanosecond(const Datum& values, ExecContext* ctx = NULLPTR);
-
-/// \brief Subsecond returns the fraction of the second elapsed since the last
-/// full second as a float for each element of `values`
-///
-/// \param[in] values input to extract subsecond from
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT Result<Datum> Subsecond(const Datum& values, ExecContext* ctx = NULLPTR);
-
+/// \brief IfElse returns elements chosen from `left` or `right`
+/// depending on `cond`. `null` values in `cond` yield `null` in the result
+///
+/// \param[in] cond `Boolean` condition Scalar/Array
+/// \param[in] left Scalar/Array
+/// \param[in] right Scalar/Array
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IfElse(const Datum& cond, const Datum& left, const Datum& right,
+ ExecContext* ctx = NULLPTR);
+
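
A minimal usage sketch (not part of this diff; assumes the builder API declared
elsewhere in this tree and the usual Arrow Status/Result error plumbing):

    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Status IfElseExample() {
      arrow::BooleanBuilder cond;
      arrow::DoubleBuilder left, right;
      ARROW_RETURN_NOT_OK(cond.AppendValues({true, false, true}));
      ARROW_RETURN_NOT_OK(left.AppendValues({1.0, 2.0, 3.0}));
      ARROW_RETURN_NOT_OK(right.AppendValues({-1.0, -2.0, -3.0}));
      ARROW_ASSIGN_OR_RAISE(auto c, cond.Finish());
      ARROW_ASSIGN_OR_RAISE(auto l, left.Finish());
      ARROW_ASSIGN_OR_RAISE(auto r, right.Finish());
      // Takes `left` where cond is true, `right` where false: [1, -2, 3]
      ARROW_ASSIGN_OR_RAISE(arrow::Datum out, arrow::compute::IfElse(c, l, r));
      return arrow::Status::OK();
    }
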
+/// \brief CaseWhen behaves like a switch/case or if-else if-else statement: for
+/// each row, select the first value for which the corresponding condition is
+/// true, or (if given) select the 'else' value, else emit null. Note that a
+/// null condition is the same as false.
+///
+/// \param[in] cond Conditions (Boolean)
+/// \param[in] cases Values (any type), along with an optional 'else' value.
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> CaseWhen(const Datum& cond, const std::vector<Datum>& cases,
+ ExecContext* ctx = NULLPTR);
+
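
A hedged sketch of the calling convention: the first argument is a struct whose
Boolean children are the branch conditions in priority order. All names below
(is_small, is_large, small_vals, large_vals, other_vals) are hypothetical
std::shared_ptr<arrow::Array> inputs of equal length:

    ARROW_ASSIGN_OR_RAISE(auto conds, arrow::StructArray::Make(
        {is_small, is_large}, std::vector<std::string>{"small", "large"}));
    // Two branch values plus a trailing optional 'else' value.
    ARROW_ASSIGN_OR_RAISE(arrow::Datum out, arrow::compute::CaseWhen(
        arrow::Datum(conds), {arrow::Datum(small_vals), arrow::Datum(large_vals),
                              arrow::Datum(other_vals)}));
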
+/// \brief Year returns year for each element of `values`
+///
+/// \param[in] values input to extract year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Year(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Month returns month for each element of `values`.
+/// Month is encoded as January=1, December=12
+///
+/// \param[in] values input to extract month from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Month(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Day returns day number for each element of `values`
+///
+/// \param[in] values input to extract day from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Day(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief DayOfWeek returns the day of the week number for each element of
+/// `values`.
+///
+/// By default week starts on Monday denoted by 0 and ends on Sunday denoted
+/// by 6. Start day of the week (Monday=1, Sunday=7) and numbering base (0 or 1) can be
+/// set using DayOfWeekOptions
+///
+/// \param[in] values input to extract number of the day of the week from
+/// \param[in] options for setting start of the week and day numbering
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> DayOfWeek(const Datum& values,
+ DayOfWeekOptions options = DayOfWeekOptions(),
+ ExecContext* ctx = NULLPTR);
+
+/// \brief DayOfYear returns the day of the year number for each element of `values`.
+/// January 1st maps to day number 1, February 1st to 32, etc.
+///
+/// \param[in] values input to extract number of day of the year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief ISOYear returns ISO year number for each element of `values`.
+/// First week of an ISO year has the majority (4 or more) of its days in January.
+///
+/// \param[in] values input to extract ISO year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> ISOYear(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief ISOWeek returns ISO week of year number for each element of `values`.
+/// First ISO week has the majority (4 or more) of its days in January.
+/// Week of the year starts with 1 and can run up to 53.
+///
+/// \param[in] values input to extract ISO week of year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief ISOCalendar returns a (ISO year, ISO week, ISO day of week) struct for
+/// each element of `values`.
+/// ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.
+///
+/// \param[in] values input to extract the ISO calendar struct from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> ISOCalendar(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Quarter returns the quarter of the year number for each element of `values`.
+/// First quarter maps to 1 and fourth quarter maps to 4.
+///
+/// \param[in] values input to extract quarter of year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> Quarter(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Hour returns hour value for each element of `values`
+///
+/// \param[in] values input to extract hour from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Hour(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Minute returns minutes value for each element of `values`
+///
+/// \param[in] values input to extract minutes from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Minute(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Second returns seconds value for each element of `values`
+///
+/// \param[in] values input to extract seconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Second(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Millisecond returns number of milliseconds since the last full second
+/// for each element of `values`
+///
+/// \param[in] values input to extract milliseconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Millisecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Microsecond returns number of microseconds since the last full millisecond
+/// for each element of `values`
+///
+/// \param[in] values input to extract microseconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Microsecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Nanosecond returns number of nanoseconds since the last full microsecond
+/// for each element of `values`
+///
+/// \param[in] values input to extract nanoseconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Nanosecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Subsecond returns the fraction of the second elapsed since the last
+/// full second as a float for each element of `values`
+///
+/// \param[in] values input to extract subsecond from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> Subsecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
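
All of these extractors share the same one-shot call shape; a sketch assuming
`ts` is a std::shared_ptr<arrow::Array> of timestamp type and this version's
DayOfWeekOptions(one_based_numbering, week_start) constructor:

    ARROW_ASSIGN_OR_RAISE(arrow::Datum years, arrow::compute::Year(ts));
    ARROW_ASSIGN_OR_RAISE(arrow::Datum quarters, arrow::compute::Quarter(ts));
    // Monday=1 .. Sunday=7 numbering:
    arrow::compute::DayOfWeekOptions opts(/*one_based_numbering=*/true,
                                          /*week_start=*/1);
    ARROW_ASSIGN_OR_RAISE(arrow::Datum dow, arrow::compute::DayOfWeek(ts, opts));
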
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc
index 4b875ddaf04..a68969b2ee5 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.cc
@@ -18,140 +18,140 @@
#include "arrow/compute/api_vector.h"
#include <memory>
-#include <sstream>
+#include <sstream>
#include <utility>
#include <vector>
#include "arrow/array/array_nested.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/compute/exec.h"
-#include "arrow/compute/function_internal.h"
-#include "arrow/compute/registry.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/registry.h"
#include "arrow/datum.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/util/checked_cast.h"
-#include "arrow/util/logging.h"
+#include "arrow/util/logging.h"
namespace arrow {
-using internal::checked_cast;
+using internal::checked_cast;
using internal::checked_pointer_cast;
-namespace internal {
-using compute::DictionaryEncodeOptions;
-using compute::FilterOptions;
-template <>
-struct EnumTraits<FilterOptions::NullSelectionBehavior>
- : BasicEnumTraits<FilterOptions::NullSelectionBehavior, FilterOptions::DROP,
- FilterOptions::EMIT_NULL> {
- static std::string name() { return "FilterOptions::NullSelectionBehavior"; }
- static std::string value_name(FilterOptions::NullSelectionBehavior value) {
- switch (value) {
- case FilterOptions::DROP:
- return "DROP";
- case FilterOptions::EMIT_NULL:
- return "EMIT_NULL";
- }
- return "<INVALID>";
- }
-};
-template <>
-struct EnumTraits<DictionaryEncodeOptions::NullEncodingBehavior>
- : BasicEnumTraits<DictionaryEncodeOptions::NullEncodingBehavior,
- DictionaryEncodeOptions::ENCODE, DictionaryEncodeOptions::MASK> {
- static std::string name() { return "DictionaryEncodeOptions::NullEncodingBehavior"; }
- static std::string value_name(DictionaryEncodeOptions::NullEncodingBehavior value) {
- switch (value) {
- case DictionaryEncodeOptions::ENCODE:
- return "ENCODE";
- case DictionaryEncodeOptions::MASK:
- return "MASK";
- }
- return "<INVALID>";
- }
-};
-} // namespace internal
-
+namespace internal {
+using compute::DictionaryEncodeOptions;
+using compute::FilterOptions;
+template <>
+struct EnumTraits<FilterOptions::NullSelectionBehavior>
+ : BasicEnumTraits<FilterOptions::NullSelectionBehavior, FilterOptions::DROP,
+ FilterOptions::EMIT_NULL> {
+ static std::string name() { return "FilterOptions::NullSelectionBehavior"; }
+ static std::string value_name(FilterOptions::NullSelectionBehavior value) {
+ switch (value) {
+ case FilterOptions::DROP:
+ return "DROP";
+ case FilterOptions::EMIT_NULL:
+ return "EMIT_NULL";
+ }
+ return "<INVALID>";
+ }
+};
+template <>
+struct EnumTraits<DictionaryEncodeOptions::NullEncodingBehavior>
+ : BasicEnumTraits<DictionaryEncodeOptions::NullEncodingBehavior,
+ DictionaryEncodeOptions::ENCODE, DictionaryEncodeOptions::MASK> {
+ static std::string name() { return "DictionaryEncodeOptions::NullEncodingBehavior"; }
+ static std::string value_name(DictionaryEncodeOptions::NullEncodingBehavior value) {
+ switch (value) {
+ case DictionaryEncodeOptions::ENCODE:
+ return "ENCODE";
+ case DictionaryEncodeOptions::MASK:
+ return "MASK";
+ }
+ return "<INVALID>";
+ }
+};
+} // namespace internal
+
namespace compute {
// ----------------------------------------------------------------------
-// Function options
-
-bool SortKey::Equals(const SortKey& other) const {
- return name == other.name && order == other.order;
-}
-std::string SortKey::ToString() const {
- std::stringstream ss;
- ss << name << ' ';
- switch (order) {
- case SortOrder::Ascending:
- ss << "ASC";
- break;
- case SortOrder::Descending:
- ss << "DESC";
- break;
- }
- return ss.str();
-}
-
-namespace internal {
-namespace {
-using ::arrow::internal::DataMember;
-static auto kFilterOptionsType = GetFunctionOptionsType<FilterOptions>(
- DataMember("null_selection_behavior", &FilterOptions::null_selection_behavior));
-static auto kTakeOptionsType = GetFunctionOptionsType<TakeOptions>(
- DataMember("boundscheck", &TakeOptions::boundscheck));
-static auto kDictionaryEncodeOptionsType =
- GetFunctionOptionsType<DictionaryEncodeOptions>(DataMember(
- "null_encoding_behavior", &DictionaryEncodeOptions::null_encoding_behavior));
-static auto kArraySortOptionsType = GetFunctionOptionsType<ArraySortOptions>(
- DataMember("order", &ArraySortOptions::order));
-static auto kSortOptionsType =
- GetFunctionOptionsType<SortOptions>(DataMember("sort_keys", &SortOptions::sort_keys));
-static auto kPartitionNthOptionsType = GetFunctionOptionsType<PartitionNthOptions>(
- DataMember("pivot", &PartitionNthOptions::pivot));
-} // namespace
-} // namespace internal
-
-FilterOptions::FilterOptions(NullSelectionBehavior null_selection)
- : FunctionOptions(internal::kFilterOptionsType),
- null_selection_behavior(null_selection) {}
-constexpr char FilterOptions::kTypeName[];
-
-TakeOptions::TakeOptions(bool boundscheck)
- : FunctionOptions(internal::kTakeOptionsType), boundscheck(boundscheck) {}
-constexpr char TakeOptions::kTypeName[];
-
-DictionaryEncodeOptions::DictionaryEncodeOptions(NullEncodingBehavior null_encoding)
- : FunctionOptions(internal::kDictionaryEncodeOptionsType),
- null_encoding_behavior(null_encoding) {}
-constexpr char DictionaryEncodeOptions::kTypeName[];
-
-ArraySortOptions::ArraySortOptions(SortOrder order)
- : FunctionOptions(internal::kArraySortOptionsType), order(order) {}
-constexpr char ArraySortOptions::kTypeName[];
-
-SortOptions::SortOptions(std::vector<SortKey> sort_keys)
- : FunctionOptions(internal::kSortOptionsType), sort_keys(std::move(sort_keys)) {}
-constexpr char SortOptions::kTypeName[];
-
-PartitionNthOptions::PartitionNthOptions(int64_t pivot)
- : FunctionOptions(internal::kPartitionNthOptionsType), pivot(pivot) {}
-constexpr char PartitionNthOptions::kTypeName[];
-
-namespace internal {
-void RegisterVectorOptions(FunctionRegistry* registry) {
- DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kTakeOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kDictionaryEncodeOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kArraySortOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kSortOptionsType));
- DCHECK_OK(registry->AddFunctionOptionsType(kPartitionNthOptionsType));
-}
-} // namespace internal
-
-// ----------------------------------------------------------------------
+// Function options
+
+bool SortKey::Equals(const SortKey& other) const {
+ return name == other.name && order == other.order;
+}
+std::string SortKey::ToString() const {
+ std::stringstream ss;
+ ss << name << ' ';
+ switch (order) {
+ case SortOrder::Ascending:
+ ss << "ASC";
+ break;
+ case SortOrder::Descending:
+ ss << "DESC";
+ break;
+ }
+ return ss.str();
+}
+
+namespace internal {
+namespace {
+using ::arrow::internal::DataMember;
+static auto kFilterOptionsType = GetFunctionOptionsType<FilterOptions>(
+ DataMember("null_selection_behavior", &FilterOptions::null_selection_behavior));
+static auto kTakeOptionsType = GetFunctionOptionsType<TakeOptions>(
+ DataMember("boundscheck", &TakeOptions::boundscheck));
+static auto kDictionaryEncodeOptionsType =
+ GetFunctionOptionsType<DictionaryEncodeOptions>(DataMember(
+ "null_encoding_behavior", &DictionaryEncodeOptions::null_encoding_behavior));
+static auto kArraySortOptionsType = GetFunctionOptionsType<ArraySortOptions>(
+ DataMember("order", &ArraySortOptions::order));
+static auto kSortOptionsType =
+ GetFunctionOptionsType<SortOptions>(DataMember("sort_keys", &SortOptions::sort_keys));
+static auto kPartitionNthOptionsType = GetFunctionOptionsType<PartitionNthOptions>(
+ DataMember("pivot", &PartitionNthOptions::pivot));
+} // namespace
+} // namespace internal
+
+FilterOptions::FilterOptions(NullSelectionBehavior null_selection)
+ : FunctionOptions(internal::kFilterOptionsType),
+ null_selection_behavior(null_selection) {}
+constexpr char FilterOptions::kTypeName[];
+
+TakeOptions::TakeOptions(bool boundscheck)
+ : FunctionOptions(internal::kTakeOptionsType), boundscheck(boundscheck) {}
+constexpr char TakeOptions::kTypeName[];
+
+DictionaryEncodeOptions::DictionaryEncodeOptions(NullEncodingBehavior null_encoding)
+ : FunctionOptions(internal::kDictionaryEncodeOptionsType),
+ null_encoding_behavior(null_encoding) {}
+constexpr char DictionaryEncodeOptions::kTypeName[];
+
+ArraySortOptions::ArraySortOptions(SortOrder order)
+ : FunctionOptions(internal::kArraySortOptionsType), order(order) {}
+constexpr char ArraySortOptions::kTypeName[];
+
+SortOptions::SortOptions(std::vector<SortKey> sort_keys)
+ : FunctionOptions(internal::kSortOptionsType), sort_keys(std::move(sort_keys)) {}
+constexpr char SortOptions::kTypeName[];
+
+PartitionNthOptions::PartitionNthOptions(int64_t pivot)
+ : FunctionOptions(internal::kPartitionNthOptionsType), pivot(pivot) {}
+constexpr char PartitionNthOptions::kTypeName[];
+
+namespace internal {
+void RegisterVectorOptions(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kTakeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kDictionaryEncodeOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kArraySortOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kSortOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kPartitionNthOptionsType));
+}
+} // namespace internal
+
+// ----------------------------------------------------------------------
// Direct exec interface to kernels
Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
@@ -162,42 +162,42 @@ Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
return result.make_array();
}
-Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
- const Datum& replacements, ExecContext* ctx) {
- return CallFunction("replace_with_mask", {values, mask, replacements}, ctx);
-}
-
-Result<std::shared_ptr<Array>> SortIndices(const Array& values, SortOrder order,
- ExecContext* ctx) {
- ArraySortOptions options(order);
- ARROW_ASSIGN_OR_RAISE(
- Datum result, CallFunction("array_sort_indices", {Datum(values)}, &options, ctx));
+Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
+ const Datum& replacements, ExecContext* ctx) {
+ return CallFunction("replace_with_mask", {values, mask, replacements}, ctx);
+}
+
+Result<std::shared_ptr<Array>> SortIndices(const Array& values, SortOrder order,
+ ExecContext* ctx) {
+ ArraySortOptions options(order);
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result, CallFunction("array_sort_indices", {Datum(values)}, &options, ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
+ SortOrder order, ExecContext* ctx) {
+ SortOptions options({SortKey("not-used", order)});
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result, CallFunction("sort_indices", {Datum(chunked_array)}, &options, ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
+ ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum result,
+ CallFunction("sort_indices", {datum}, &options, ctx));
return result.make_array();
}
-Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
- SortOrder order, ExecContext* ctx) {
- SortOptions options({SortKey("not-used", order)});
- ARROW_ASSIGN_OR_RAISE(
- Datum result, CallFunction("sort_indices", {Datum(chunked_array)}, &options, ctx));
- return result.make_array();
-}
-
-Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
- ExecContext* ctx) {
- ARROW_ASSIGN_OR_RAISE(Datum result,
- CallFunction("sort_indices", {datum}, &options, ctx));
- return result.make_array();
-}
-
Result<std::shared_ptr<Array>> Unique(const Datum& value, ExecContext* ctx) {
ARROW_ASSIGN_OR_RAISE(Datum result, CallFunction("unique", {value}, ctx));
return result.make_array();
}
-Result<Datum> DictionaryEncode(const Datum& value, const DictionaryEncodeOptions& options,
- ExecContext* ctx) {
- return CallFunction("dictionary_encode", {value}, &options, ctx);
+Result<Datum> DictionaryEncode(const Datum& value, const DictionaryEncodeOptions& options,
+ ExecContext* ctx) {
+ return CallFunction("dictionary_encode", {value}, &options, ctx);
}
const char kValuesFieldName[] = "values";
@@ -275,9 +275,9 @@ Result<std::shared_ptr<Table>> Take(const Table& table, const ChunkedArray& indi
return result.table();
}
-Result<std::shared_ptr<Array>> SortToIndices(const Array& values, ExecContext* ctx) {
- return SortIndices(values, SortOrder::Ascending, ctx);
-}
-
+Result<std::shared_ptr<Array>> SortToIndices(const Array& values, ExecContext* ctx) {
+ return SortIndices(values, SortOrder::Ascending, ctx);
+}
+
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h
index c3a81542b76..9d8d4271db8 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/api_vector.h
@@ -32,8 +32,8 @@ class ExecContext;
/// \addtogroup compute-concrete-options
/// @{
-class ARROW_EXPORT FilterOptions : public FunctionOptions {
- public:
+class ARROW_EXPORT FilterOptions : public FunctionOptions {
+ public:
/// Configure the action taken when a slot of the selection mask is null
enum NullSelectionBehavior {
/// the corresponding filtered value will be removed in the output
@@ -42,89 +42,89 @@ class ARROW_EXPORT FilterOptions : public FunctionOptions {
EMIT_NULL,
};
- explicit FilterOptions(NullSelectionBehavior null_selection = DROP);
- constexpr static char const kTypeName[] = "FilterOptions";
+ explicit FilterOptions(NullSelectionBehavior null_selection = DROP);
+ constexpr static char const kTypeName[] = "FilterOptions";
static FilterOptions Defaults() { return FilterOptions(); }
NullSelectionBehavior null_selection_behavior = DROP;
};
-class ARROW_EXPORT TakeOptions : public FunctionOptions {
- public:
- explicit TakeOptions(bool boundscheck = true);
- constexpr static char const kTypeName[] = "TakeOptions";
+class ARROW_EXPORT TakeOptions : public FunctionOptions {
+ public:
+ explicit TakeOptions(bool boundscheck = true);
+ constexpr static char const kTypeName[] = "TakeOptions";
static TakeOptions BoundsCheck() { return TakeOptions(true); }
static TakeOptions NoBoundsCheck() { return TakeOptions(false); }
static TakeOptions Defaults() { return BoundsCheck(); }
-
- bool boundscheck = true;
+
+ bool boundscheck = true;
+};
+
+/// \brief Options for the dictionary encode function
+class ARROW_EXPORT DictionaryEncodeOptions : public FunctionOptions {
+ public:
+ /// Configure how null values will be encoded
+ enum NullEncodingBehavior {
+ /// the null value will be added to the dictionary with a proper index
+ ENCODE,
+ /// the null value will be masked in the indices array
+ MASK
+ };
+
+ explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK);
+ constexpr static char const kTypeName[] = "DictionaryEncodeOptions";
+ static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); }
+
+ NullEncodingBehavior null_encoding_behavior = MASK;
+};
+
+enum class SortOrder {
+ Ascending,
+ Descending,
+};
+
+/// \brief One sort key for PartitionNthIndices (TODO) and SortIndices
+class ARROW_EXPORT SortKey : public util::EqualityComparable<SortKey> {
+ public:
+ explicit SortKey(std::string name, SortOrder order = SortOrder::Ascending)
+ : name(name), order(order) {}
+
+ using util::EqualityComparable<SortKey>::Equals;
+ using util::EqualityComparable<SortKey>::operator==;
+ using util::EqualityComparable<SortKey>::operator!=;
+ bool Equals(const SortKey& other) const;
+ std::string ToString() const;
+
+ /// The name of the sort column.
+ std::string name;
+ /// How to order by this sort key.
+ SortOrder order;
+};
+
+class ARROW_EXPORT ArraySortOptions : public FunctionOptions {
+ public:
+ explicit ArraySortOptions(SortOrder order = SortOrder::Ascending);
+ constexpr static char const kTypeName[] = "ArraySortOptions";
+ static ArraySortOptions Defaults() { return ArraySortOptions{}; }
+
+ SortOrder order;
+};
+
+class ARROW_EXPORT SortOptions : public FunctionOptions {
+ public:
+ explicit SortOptions(std::vector<SortKey> sort_keys = {});
+ constexpr static char const kTypeName[] = "SortOptions";
+ static SortOptions Defaults() { return SortOptions{}; }
+
+ std::vector<SortKey> sort_keys;
};
-/// \brief Options for the dictionary encode function
-class ARROW_EXPORT DictionaryEncodeOptions : public FunctionOptions {
- public:
- /// Configure how null values will be encoded
- enum NullEncodingBehavior {
- /// the null value will be added to the dictionary with a proper index
- ENCODE,
- /// the null value will be masked in the indices array
- MASK
- };
-
- explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK);
- constexpr static char const kTypeName[] = "DictionaryEncodeOptions";
- static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); }
-
- NullEncodingBehavior null_encoding_behavior = MASK;
-};
-
-enum class SortOrder {
- Ascending,
- Descending,
-};
-
-/// \brief One sort key for PartitionNthIndices (TODO) and SortIndices
-class ARROW_EXPORT SortKey : public util::EqualityComparable<SortKey> {
- public:
- explicit SortKey(std::string name, SortOrder order = SortOrder::Ascending)
- : name(name), order(order) {}
-
- using util::EqualityComparable<SortKey>::Equals;
- using util::EqualityComparable<SortKey>::operator==;
- using util::EqualityComparable<SortKey>::operator!=;
- bool Equals(const SortKey& other) const;
- std::string ToString() const;
-
- /// The name of the sort column.
- std::string name;
- /// How to order by this sort key.
- SortOrder order;
-};
-
-class ARROW_EXPORT ArraySortOptions : public FunctionOptions {
- public:
- explicit ArraySortOptions(SortOrder order = SortOrder::Ascending);
- constexpr static char const kTypeName[] = "ArraySortOptions";
- static ArraySortOptions Defaults() { return ArraySortOptions{}; }
-
- SortOrder order;
-};
-
-class ARROW_EXPORT SortOptions : public FunctionOptions {
- public:
- explicit SortOptions(std::vector<SortKey> sort_keys = {});
- constexpr static char const kTypeName[] = "SortOptions";
- static SortOptions Defaults() { return SortOptions{}; }
-
- std::vector<SortKey> sort_keys;
-};
-
/// \brief Partitioning options for NthToIndices
-class ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
- public:
- explicit PartitionNthOptions(int64_t pivot);
- PartitionNthOptions() : PartitionNthOptions(0) {}
- constexpr static char const kTypeName[] = "PartitionNthOptions";
+class ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
+ public:
+ explicit PartitionNthOptions(int64_t pivot);
+ PartitionNthOptions() : PartitionNthOptions(0) {}
+ constexpr static char const kTypeName[] = "PartitionNthOptions";
/// The index into the equivalent sorted array of the partition pivot element.
int64_t pivot;
@@ -171,23 +171,23 @@ Result<std::shared_ptr<ArrayData>> GetTakeIndices(
} // namespace internal
-/// \brief ReplaceWithMask replaces each value in the array corresponding
-/// to a true value in the mask with the next element from `replacements`.
-///
-/// \param[in] values Array input to replace
-/// \param[in] mask Array or Scalar of Boolean mask values
-/// \param[in] replacements The replacement values to draw from. There must
-/// be as many replacement values as true values in the mask.
-/// \param[in] ctx the function execution context, optional
-///
-/// \return the resulting datum
-///
-/// \since 5.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
- const Datum& replacements, ExecContext* ctx = NULLPTR);
-
+/// \brief ReplaceWithMask replaces each value in the array corresponding
+/// to a true value in the mask with the next element from `replacements`.
+///
+/// \param[in] values Array input to replace
+/// \param[in] mask Array or Scalar of Boolean mask values
+/// \param[in] replacements The replacement values to draw from. There must
+/// be as many replacement values as true values in the mask.
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
+ const Datum& replacements, ExecContext* ctx = NULLPTR);
+
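
A sketch of the documented semantics (hypothetical arrays; replacements are
consumed left to right, one per true slot in the mask):

    // values:       [1, 2, 3, 4]
    // mask:         [false, true, false, true]
    // replacements: [20, 40]
    // result:       [1, 20, 3, 40]
    ARROW_ASSIGN_OR_RAISE(arrow::Datum patched,
        arrow::compute::ReplaceWithMask(values, mask, replacements));
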
/// \brief Take from an array of values at indices in another array
///
/// The output array will be of the same type as the input values
@@ -233,73 +233,73 @@ ARROW_EXPORT
Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
ExecContext* ctx = NULLPTR);
-/// \brief Returns the indices that would sort an array in the
-/// specified order.
+/// \brief Returns the indices that would sort an array in the
+/// specified order.
///
/// Perform an indirect sort of array. The output array will contain
/// indices that would sort an array, which would be the same length
-/// as input. Nulls will be stably partitioned to the end of the output
-/// regardless of order.
+/// as input. Nulls will be stably partitioned to the end of the output
+/// regardless of order.
///
-/// For example given array = [null, 1, 3.3, null, 2, 5.3] and order
-/// = SortOrder::DESCENDING, the output will be [5, 2, 4, 1, 0,
-/// 3].
+/// For example given array = [null, 1, 3.3, null, 2, 5.3] and order
+/// = SortOrder::DESCENDING, the output will be [5, 2, 4, 1, 0,
+/// 3].
///
-/// \param[in] array array to sort
-/// \param[in] order ascending or descending
+/// \param[in] array array to sort
+/// \param[in] order ascending or descending
/// \param[in] ctx the function execution context, optional
/// \return offsets indices that would sort an array
ARROW_EXPORT
-Result<std::shared_ptr<Array>> SortIndices(const Array& array,
- SortOrder order = SortOrder::Ascending,
- ExecContext* ctx = NULLPTR);
-
-/// \brief Returns the indices that would sort a chunked array in the
-/// specified order.
-///
-/// Perform an indirect sort of chunked array. The output array will
-/// contain indices that would sort a chunked array, which would be
-/// the same length as input. Nulls will be stably partitioned to the
-/// end of the output regardless of order.
-///
-/// For example given chunked_array = [[null, 1], [3.3], [null, 2,
-/// 5.3]] and order = SortOrder::DESCENDING, the output will be [5, 2,
-/// 4, 1, 0, 3].
-///
-/// \param[in] chunked_array chunked array to sort
-/// \param[in] order ascending or descending
-/// \param[in] ctx the function execution context, optional
-/// \return offsets indices that would sort an array
-ARROW_EXPORT
-Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
- SortOrder order = SortOrder::Ascending,
- ExecContext* ctx = NULLPTR);
-
-/// \brief Returns the indices that would sort an input in the
-/// specified order. Input is one of array, chunked array, record batch
-/// or table.
-///
-/// Perform an indirect sort of input. The output array will contain
-/// indices that would sort an input, which would be the same length
-/// as input. Nulls will be stably partitioned to the end of the
-/// output regardless of order.
-///
-/// For example given input (table) = {
-/// "column1": [[null, 1], [ 3, null, 2, 1]],
-/// "column2": [[ 5], [3, null, null, 5, 5]],
-/// } and options = {
-/// {"column1", SortOrder::Ascending},
-/// {"column2", SortOrder::Descending},
-/// }, the output will be [5, 1, 4, 2, 0, 3].
-///
-/// \param[in] datum array, chunked array, record batch or table to sort
-/// \param[in] options options
-/// \param[in] ctx the function execution context, optional
-/// \return offsets indices that would sort a table
-ARROW_EXPORT
-Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
- ExecContext* ctx = NULLPTR);
-
+Result<std::shared_ptr<Array>> SortIndices(const Array& array,
+ SortOrder order = SortOrder::Ascending,
+ ExecContext* ctx = NULLPTR);
+
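
The returned indices are typically fed straight into Take() to materialize the
sorted array; a sketch with `array` a hypothetical std::shared_ptr<arrow::Array>:

    ARROW_ASSIGN_OR_RAISE(auto indices, arrow::compute::SortIndices(
        *array, arrow::compute::SortOrder::Descending));
    ARROW_ASSIGN_OR_RAISE(arrow::Datum sorted, arrow::compute::Take(array, indices));
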
+/// \brief Returns the indices that would sort a chunked array in the
+/// specified order.
+///
+/// Perform an indirect sort of chunked array. The output array will
+/// contain indices that would sort a chunked array, which would be
+/// the same length as input. Nulls will be stably partitioned to the
+/// end of the output regardless of order.
+///
+/// For example given chunked_array = [[null, 1], [3.3], [null, 2,
+/// 5.3]] and order = SortOrder::DESCENDING, the output will be [5, 2,
+/// 4, 1, 0, 3].
+///
+/// \param[in] chunked_array chunked array to sort
+/// \param[in] order ascending or descending
+/// \param[in] ctx the function execution context, optional
+/// \return offsets indices that would sort an array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
+ SortOrder order = SortOrder::Ascending,
+ ExecContext* ctx = NULLPTR);
+
+/// \brief Returns the indices that would sort an input in the
+/// specified order. Input is one of array, chunked array, record batch
+/// or table.
+///
+/// Perform an indirect sort of input. The output array will contain
+/// indices that would sort an input, which would be the same length
+/// as input. Nulls will be stably partitioned to the end of the
+/// output regardless of order.
+///
+/// For example given input (table) = {
+/// "column1": [[null, 1], [ 3, null, 2, 1]],
+/// "column2": [[ 5], [3, null, null, 5, 5]],
+/// } and options = {
+/// {"column1", SortOrder::Ascending},
+/// {"column2", SortOrder::Descending},
+/// }, the output will be [5, 1, 4, 2, 0, 3].
+///
+/// \param[in] datum array, chunked array, record batch or table to sort
+/// \param[in] options options
+/// \param[in] ctx the function execution context, optional
+/// \return offsets indices that would sort a table
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
+ ExecContext* ctx = NULLPTR);
+
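
A multi-key table sort matching the example in the comment above (assuming
`table` is a std::shared_ptr<arrow::Table> with those columns):

    arrow::compute::SortOptions options(
        {arrow::compute::SortKey("column1", arrow::compute::SortOrder::Ascending),
         arrow::compute::SortKey("column2", arrow::compute::SortOrder::Descending)});
    ARROW_ASSIGN_OR_RAISE(auto indices,
        arrow::compute::SortIndices(arrow::Datum(table), options));
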
/// \brief Compute unique elements from an array-like object
///
/// Note if a null occurs in the input it will NOT be included in the output.
@@ -338,29 +338,29 @@ Result<std::shared_ptr<StructArray>> ValueCounts(const Datum& value,
ExecContext* ctx = NULLPTR);
/// \brief Dictionary-encode values in an array-like object
-///
-/// Any nulls encountered in the dictionary will be handled according to the
-/// specified null encoding behavior.
-///
-/// For example, given values ["a", "b", null, "a", null] the output will be
-/// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null]
-/// (null_encoding == MASK) Indices: [0, 1, null, 0, null] / Dict: ["a", "b"]
-///
-/// If the input is already dictionary encoded this function is a no-op unless
-/// it needs to modify the null_encoding (TODO)
-///
+///
+/// Any nulls encountered in the dictionary will be handled according to the
+/// specified null encoding behavior.
+///
+/// For example, given values ["a", "b", null, "a", null] the output will be
+/// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null]
+/// (null_encoding == MASK) Indices: [0, 1, null, 0, null] / Dict: ["a", "b"]
+///
+/// If the input is already dictionary encoded this function is a no-op unless
+/// it needs to modify the null_encoding (TODO)
+///
/// \param[in] data array-like input
/// \param[in] ctx the function execution context, optional
-/// \param[in] options configures null encoding behavior
+/// \param[in] options configures null encoding behavior
/// \return result with same shape and type as input
///
/// \since 1.0.0
/// \note API not yet finalized
ARROW_EXPORT
-Result<Datum> DictionaryEncode(
- const Datum& data,
- const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(),
- ExecContext* ctx = NULLPTR);
+Result<Datum> DictionaryEncode(
+ const Datum& data,
+ const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(),
+ ExecContext* ctx = NULLPTR);
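
With the default MASK behavior this reproduces the second line of the example
above (`values` being a hypothetical string array):

    // ["a", "b", null, "a", null] -> indices [0, 1, null, 0, null],
    //                                dictionary ["a", "b"]
    ARROW_ASSIGN_OR_RAISE(arrow::Datum encoded,
        arrow::compute::DictionaryEncode(values));
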
// ----------------------------------------------------------------------
// Deprecated functions
@@ -401,10 +401,10 @@ Result<std::shared_ptr<Table>> Take(const Table& table, const ChunkedArray& indi
const TakeOptions& options = TakeOptions::Defaults(),
ExecContext* context = NULLPTR);
-ARROW_DEPRECATED("Deprecated in 3.0.0. Use SortIndices()")
-ARROW_EXPORT
-Result<std::shared_ptr<Array>> SortToIndices(const Array& values,
- ExecContext* ctx = NULLPTR);
-
+ARROW_DEPRECATED("Deprecated in 3.0.0. Use SortIndices()")
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortToIndices(const Array& values,
+ ExecContext* ctx = NULLPTR);
+
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc
index db3b2e05da4..4de68ba8d90 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.cc
@@ -18,7 +18,7 @@
#include "arrow/compute/cast.h"
#include <mutex>
-#include <sstream>
+#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
@@ -27,12 +27,12 @@
#include "arrow/compute/cast_internal.h"
#include "arrow/compute/exec.h"
-#include "arrow/compute/function_internal.h"
+#include "arrow/compute/function_internal.h"
#include "arrow/compute/kernel.h"
#include "arrow/compute/kernels/codegen_internal.h"
#include "arrow/compute/registry.h"
#include "arrow/util/logging.h"
-#include "arrow/util/reflection_internal.h"
+#include "arrow/util/reflection_internal.h"
namespace arrow {
@@ -41,13 +41,13 @@ using internal::ToTypeName;
namespace compute {
namespace internal {
-// ----------------------------------------------------------------------
-// Function options
-
-namespace {
-
+// ----------------------------------------------------------------------
+// Function options
+
+namespace {
+
std::unordered_map<int, std::shared_ptr<CastFunction>> g_cast_table;
-std::once_flag cast_table_initialized;
+std::once_flag cast_table_initialized;
void AddCastFunctions(const std::vector<std::shared_ptr<CastFunction>>& funcs) {
for (const auto& func : funcs) {
@@ -61,7 +61,7 @@ void InitCastTable() {
AddCastFunctions(GetNestedCasts());
AddCastFunctions(GetNumericCasts());
AddCastFunctions(GetTemporalCasts());
- AddCastFunctions(GetDictionaryCasts());
+ AddCastFunctions(GetDictionaryCasts());
}
void EnsureInitCastTable() { std::call_once(cast_table_initialized, InitCastTable); }
@@ -85,17 +85,17 @@ Result<std::shared_ptr<CastFunction>> GetCastFunctionInternal(
return it->second;
}
-const FunctionDoc cast_doc{"Cast values to another data type",
- ("Behavior when values wouldn't fit in the target type\n"
- "can be controlled through CastOptions."),
- {"input"},
- "CastOptions"};
+const FunctionDoc cast_doc{"Cast values to another data type",
+ ("Behavior when values wouldn't fit in the target type\n"
+ "can be controlled through CastOptions."),
+ {"input"},
+ "CastOptions"};
-// Metafunction for dispatching to appropriate CastFunction. This corresponds
+// Metafunction for dispatching to appropriate CastFunction. This corresponds
// to the standard SQL CAST(expr AS target_type)
class CastMetaFunction : public MetaFunction {
public:
- CastMetaFunction() : MetaFunction("cast", Arity::Unary(), &cast_doc) {}
+ CastMetaFunction() : MetaFunction("cast", Arity::Unary(), &cast_doc) {}
Result<const CastOptions*> ValidateOptions(const FunctionOptions* options) const {
auto cast_options = static_cast<const CastOptions*>(options);
@@ -123,44 +123,44 @@ class CastMetaFunction : public MetaFunction {
}
};
-static auto kCastOptionsType = GetFunctionOptionsType<CastOptions>(
- arrow::internal::DataMember("to_type", &CastOptions::to_type),
- arrow::internal::DataMember("allow_int_overflow", &CastOptions::allow_int_overflow),
- arrow::internal::DataMember("allow_time_truncate", &CastOptions::allow_time_truncate),
- arrow::internal::DataMember("allow_time_overflow", &CastOptions::allow_time_overflow),
- arrow::internal::DataMember("allow_decimal_truncate",
- &CastOptions::allow_decimal_truncate),
- arrow::internal::DataMember("allow_float_truncate",
- &CastOptions::allow_float_truncate),
- arrow::internal::DataMember("allow_invalid_utf8", &CastOptions::allow_invalid_utf8));
-} // namespace
-
+static auto kCastOptionsType = GetFunctionOptionsType<CastOptions>(
+ arrow::internal::DataMember("to_type", &CastOptions::to_type),
+ arrow::internal::DataMember("allow_int_overflow", &CastOptions::allow_int_overflow),
+ arrow::internal::DataMember("allow_time_truncate", &CastOptions::allow_time_truncate),
+ arrow::internal::DataMember("allow_time_overflow", &CastOptions::allow_time_overflow),
+ arrow::internal::DataMember("allow_decimal_truncate",
+ &CastOptions::allow_decimal_truncate),
+ arrow::internal::DataMember("allow_float_truncate",
+ &CastOptions::allow_float_truncate),
+ arrow::internal::DataMember("allow_invalid_utf8", &CastOptions::allow_invalid_utf8));
+} // namespace
+
void RegisterScalarCast(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::make_shared<CastMetaFunction>()));
- DCHECK_OK(registry->AddFunctionOptionsType(kCastOptionsType));
+ DCHECK_OK(registry->AddFunctionOptionsType(kCastOptionsType));
}
} // namespace internal
-CastOptions::CastOptions(bool safe)
- : FunctionOptions(internal::kCastOptionsType),
- allow_int_overflow(!safe),
- allow_time_truncate(!safe),
- allow_time_overflow(!safe),
- allow_decimal_truncate(!safe),
- allow_float_truncate(!safe),
- allow_invalid_utf8(!safe) {}
+CastOptions::CastOptions(bool safe)
+ : FunctionOptions(internal::kCastOptionsType),
+ allow_int_overflow(!safe),
+ allow_time_truncate(!safe),
+ allow_time_overflow(!safe),
+ allow_decimal_truncate(!safe),
+ allow_float_truncate(!safe),
+ allow_invalid_utf8(!safe) {}
-constexpr char CastOptions::kTypeName[];
+constexpr char CastOptions::kTypeName[];
-CastFunction::CastFunction(std::string name, Type::type out_type_id)
- : ScalarFunction(std::move(name), Arity::Unary(), /*doc=*/nullptr),
- out_type_id_(out_type_id) {}
+CastFunction::CastFunction(std::string name, Type::type out_type_id)
+ : ScalarFunction(std::move(name), Arity::Unary(), /*doc=*/nullptr),
+ out_type_id_(out_type_id) {}
Status CastFunction::AddKernel(Type::type in_type_id, ScalarKernel kernel) {
// We use the same KernelInit for every cast
kernel.init = internal::CastState::Init;
RETURN_NOT_OK(ScalarFunction::AddKernel(kernel));
- in_type_ids_.push_back(in_type_id);
+ in_type_ids_.push_back(in_type_id);
return Status::OK();
}
@@ -176,9 +176,9 @@ Status CastFunction::AddKernel(Type::type in_type_id, std::vector<InputType> in_
return AddKernel(in_type_id, std::move(kernel));
}
-Result<const Kernel*> CastFunction::DispatchExact(
+Result<const Kernel*> CastFunction::DispatchExact(
const std::vector<ValueDescr>& values) const {
- RETURN_NOT_OK(CheckArity(values));
+ RETURN_NOT_OK(CheckArity(values));
std::vector<const ScalarKernel*> candidate_kernels;
for (const auto& kernel : kernels_) {
@@ -189,28 +189,28 @@ Result<const Kernel*> CastFunction::DispatchExact(
if (candidate_kernels.size() == 0) {
return Status::NotImplemented("Unsupported cast from ", values[0].type->ToString(),
- " to ", ToTypeName(out_type_id_), " using function ",
+ " to ", ToTypeName(out_type_id_), " using function ",
this->name());
- }
-
- if (candidate_kernels.size() == 1) {
+ }
+
+ if (candidate_kernels.size() == 1) {
// One match, return it
return candidate_kernels[0];
- }
-
-  // Now we are in a casting scenario where we may have both an EXACT_TYPE and
-  // a SAME_TYPE_ID. So we will see if there is an exact match among the
-  // candidate kernels and, if not, just return the first one
- for (auto kernel : candidate_kernels) {
- const InputType& arg0 = kernel->signature->in_types()[0];
- if (arg0.kind() == InputType::EXACT_TYPE) {
- // Bingo. Return it
- return kernel;
+ }
+
+  // Now we are in a casting scenario where we may have both an EXACT_TYPE and
+  // a SAME_TYPE_ID. So we will see if there is an exact match among the
+  // candidate kernels and, if not, just return the first one
+ for (auto kernel : candidate_kernels) {
+ const InputType& arg0 = kernel->signature->in_types()[0];
+ if (arg0.kind() == InputType::EXACT_TYPE) {
+ // Bingo. Return it
+ return kernel;
}
}
-
- // We didn't find an exact match. So just return some kernel that matches
- return candidate_kernels[0];
+
+ // We didn't find an exact match. So just return some kernel that matches
+ return candidate_kernels[0];
}
Result<Datum> Cast(const Datum& value, const CastOptions& options, ExecContext* ctx) {
@@ -237,37 +237,37 @@ Result<std::shared_ptr<CastFunction>> GetCastFunction(
bool CanCast(const DataType& from_type, const DataType& to_type) {
internal::EnsureInitCastTable();
- auto it = internal::g_cast_table.find(static_cast<int>(to_type.id()));
+ auto it = internal::g_cast_table.find(static_cast<int>(to_type.id()));
if (it == internal::g_cast_table.end()) {
return false;
}
-
- const CastFunction* function = it->second.get();
- DCHECK_EQ(function->out_type_id(), to_type.id());
-
- for (auto from_id : function->in_type_ids()) {
- // XXX should probably check the output type as well
- if (from_type.id() == from_id) return true;
- }
-
- return false;
+
+ const CastFunction* function = it->second.get();
+ DCHECK_EQ(function->out_type_id(), to_type.id());
+
+ for (auto from_id : function->in_type_ids()) {
+ // XXX should probably check the output type as well
+ if (from_type.id() == from_id) return true;
+ }
+
+ return false;
+}
+
+Result<std::vector<Datum>> Cast(std::vector<Datum> datums, std::vector<ValueDescr> descrs,
+ ExecContext* ctx) {
+ for (size_t i = 0; i != datums.size(); ++i) {
+ if (descrs[i] != datums[i].descr()) {
+ if (descrs[i].shape != datums[i].shape()) {
+ return Status::NotImplemented("casting between Datum shapes");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(datums[i],
+ Cast(datums[i], CastOptions::Safe(descrs[i].type), ctx));
+ }
+ }
+
+ return datums;
}
-Result<std::vector<Datum>> Cast(std::vector<Datum> datums, std::vector<ValueDescr> descrs,
- ExecContext* ctx) {
- for (size_t i = 0; i != datums.size(); ++i) {
- if (descrs[i] != datums[i].descr()) {
- if (descrs[i].shape != datums[i].shape()) {
- return Status::NotImplemented("casting between Datum shapes");
- }
-
- ARROW_ASSIGN_OR_RAISE(datums[i],
- Cast(datums[i], CastOptions::Safe(descrs[i].type), ctx));
- }
- }
-
- return datums;
-}
-
} // namespace compute
} // namespace arrow
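
CanCast() is the cheap capability probe in front of this lookup machinery: it
only checks that a kernel is registered for the type pair, so a per-value cast
can still fail at execution time. A sketch:

    if (arrow::compute::CanCast(*arrow::utf8(), *arrow::int64())) {
      // a string -> int64 cast kernel exists; Cast() parses each value
    }
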
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h
index 5a2afd86845..131f57f892f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast.h
@@ -41,22 +41,22 @@ class ExecContext;
/// \addtogroup compute-concrete-options
/// @{
-class ARROW_EXPORT CastOptions : public FunctionOptions {
- public:
- explicit CastOptions(bool safe = true);
-
- constexpr static char const kTypeName[] = "CastOptions";
- static CastOptions Safe(std::shared_ptr<DataType> to_type = NULLPTR) {
- CastOptions safe(true);
- safe.to_type = std::move(to_type);
- return safe;
- }
-
- static CastOptions Unsafe(std::shared_ptr<DataType> to_type = NULLPTR) {
- CastOptions unsafe(false);
- unsafe.to_type = std::move(to_type);
- return unsafe;
- }
+class ARROW_EXPORT CastOptions : public FunctionOptions {
+ public:
+ explicit CastOptions(bool safe = true);
+
+ constexpr static char const kTypeName[] = "CastOptions";
+ static CastOptions Safe(std::shared_ptr<DataType> to_type = NULLPTR) {
+ CastOptions safe(true);
+ safe.to_type = std::move(to_type);
+ return safe;
+ }
+
+ static CastOptions Unsafe(std::shared_ptr<DataType> to_type = NULLPTR) {
+ CastOptions unsafe(false);
+ unsafe.to_type = std::move(to_type);
+ return unsafe;
+ }
// Type being casted to. May be passed separate to eager function
// compute::Cast
@@ -78,10 +78,10 @@ class ARROW_EXPORT CastOptions : public FunctionOptions {
// the same execution machinery
class CastFunction : public ScalarFunction {
public:
- CastFunction(std::string name, Type::type out_type_id);
+ CastFunction(std::string name, Type::type out_type_id);
- Type::type out_type_id() const { return out_type_id_; }
- const std::vector<Type::type>& in_type_ids() const { return in_type_ids_; }
+ Type::type out_type_id() const { return out_type_id_; }
+ const std::vector<Type::type>& in_type_ids() const { return in_type_ids_; }
Status AddKernel(Type::type in_type_id, std::vector<InputType> in_types,
OutputType out_type, ArrayKernelExec exec,
@@ -92,12 +92,12 @@ class CastFunction : public ScalarFunction {
// function to CastInit
Status AddKernel(Type::type in_type_id, ScalarKernel kernel);
- Result<const Kernel*> DispatchExact(
+ Result<const Kernel*> DispatchExact(
const std::vector<ValueDescr>& values) const override;
private:
- std::vector<Type::type> in_type_ids_;
- const Type::type out_type_id_;
+ std::vector<Type::type> in_type_ids_;
+ const Type::type out_type_id_;
};
ARROW_EXPORT
@@ -151,17 +151,17 @@ Result<Datum> Cast(const Datum& value, std::shared_ptr<DataType> to_type,
const CastOptions& options = CastOptions::Safe(),
ExecContext* ctx = NULLPTR);
-/// \brief Cast several values simultaneously. Safe cast options are used.
-/// \param[in] values datums to cast
-/// \param[in] descrs ValueDescrs to cast to
-/// \param[in] ctx the function execution context, optional
-/// \return the resulting datums
-///
-/// \since 4.0.0
-/// \note API not yet finalized
-ARROW_EXPORT
-Result<std::vector<Datum>> Cast(std::vector<Datum> values, std::vector<ValueDescr> descrs,
- ExecContext* ctx = NULLPTR);
-
+/// \brief Cast several values simultaneously. Safe cast options are used.
+/// \param[in] values datums to cast
+/// \param[in] descrs ValueDescrs to cast to
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datums
+///
+/// \since 4.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<std::vector<Datum>> Cast(std::vector<Datum> values, std::vector<ValueDescr> descrs,
+ ExecContext* ctx = NULLPTR);
+
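
The scalar and vector forms compose the same way; a sketch with hypothetical
inputs `int_array`, `a`, and `b` (all std::shared_ptr<arrow::Array>), casting
to float64 under the safe defaults:

    ARROW_ASSIGN_OR_RAISE(arrow::Datum as_double,
        arrow::compute::Cast(int_array, arrow::float64()));
    // Vector form: one ValueDescr per input datum, Safe options throughout.
    ARROW_ASSIGN_OR_RAISE(auto casted, arrow::compute::Cast(
        {arrow::Datum(a), arrow::Datum(b)},
        {arrow::ValueDescr::Array(arrow::float64()),
         arrow::ValueDescr::Array(arrow::float64())}));
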
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h
index 7e784a0b61a..0105d08a573 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/cast_internal.h
@@ -36,7 +36,7 @@ std::vector<std::shared_ptr<CastFunction>> GetNumericCasts();
std::vector<std::shared_ptr<CastFunction>> GetTemporalCasts();
std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts();
std::vector<std::shared_ptr<CastFunction>> GetNestedCasts();
-std::vector<std::shared_ptr<CastFunction>> GetDictionaryCasts();
+std::vector<std::shared_ptr<CastFunction>> GetDictionaryCasts();
} // namespace internal
} // namespace compute
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc
index 8998df465e5..63f8d39f551 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.cc
@@ -36,8 +36,8 @@
#include "arrow/compute/registry.h"
#include "arrow/compute/util_internal.h"
#include "arrow/datum.h"
-#include "arrow/pretty_print.h"
-#include "arrow/record_batch.h"
+#include "arrow/pretty_print.h"
+#include "arrow/record_batch.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/type.h"
@@ -47,8 +47,8 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/cpu_info.h"
#include "arrow/util/logging.h"
-#include "arrow/util/make_unique.h"
-#include "arrow/util/vector.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/vector.h"
namespace arrow {
@@ -59,104 +59,104 @@ using internal::CpuInfo;
namespace compute {
-ExecContext* default_exec_context() {
- static ExecContext default_ctx;
- return &default_ctx;
-}
-
-ExecBatch::ExecBatch(const RecordBatch& batch)
- : values(batch.num_columns()), length(batch.num_rows()) {
- auto columns = batch.column_data();
- std::move(columns.begin(), columns.end(), values.begin());
-}
-
-bool ExecBatch::Equals(const ExecBatch& other) const {
- return guarantee == other.guarantee && values == other.values;
-}
-
-void PrintTo(const ExecBatch& batch, std::ostream* os) {
- *os << "ExecBatch\n";
-
- static const std::string indent = " ";
-
- *os << indent << "# Rows: " << batch.length << "\n";
- if (batch.guarantee != literal(true)) {
- *os << indent << "Guarantee: " << batch.guarantee.ToString() << "\n";
- }
-
- int i = 0;
- for (const Datum& value : batch.values) {
- *os << indent << "" << i++ << ": ";
-
- if (value.is_scalar()) {
- *os << "Scalar[" << value.scalar()->ToString() << "]\n";
- continue;
- }
-
- auto array = value.make_array();
- PrettyPrintOptions options;
- options.skip_new_lines = true;
- *os << "Array";
- ARROW_CHECK_OK(PrettyPrint(*array, options, os));
- *os << "\n";
- }
-}
-
-ExecBatch ExecBatch::Slice(int64_t offset, int64_t length) const {
- ExecBatch out = *this;
- for (auto& value : out.values) {
- if (value.is_scalar()) continue;
- value = value.array()->Slice(offset, length);
- }
- out.length = length;
- return out;
-}
-
-Result<ExecBatch> ExecBatch::Make(std::vector<Datum> values) {
- if (values.empty()) {
- return Status::Invalid("Cannot infer ExecBatch length without at least one value");
- }
-
- int64_t length = -1;
- for (const auto& value : values) {
- if (value.is_scalar()) {
- continue;
- }
-
- if (length == -1) {
- length = value.length();
- continue;
- }
-
- if (length != value.length()) {
- return Status::Invalid(
- "Arrays used to construct an ExecBatch must have equal length");
- }
- }
-
- if (length == -1) {
- length = 1;
- }
-
- return ExecBatch(std::move(values), length);
-}
-
-Result<std::shared_ptr<RecordBatch>> ExecBatch::ToRecordBatch(
- std::shared_ptr<Schema> schema, MemoryPool* pool) const {
- ArrayVector columns(schema->num_fields());
-
- for (size_t i = 0; i < columns.size(); ++i) {
- const Datum& value = values[i];
- if (value.is_array()) {
- columns[i] = value.make_array();
- continue;
- }
- ARROW_ASSIGN_OR_RAISE(columns[i], MakeArrayFromScalar(*value.scalar(), length, pool));
- }
-
- return RecordBatch::Make(std::move(schema), length, std::move(columns));
-}
-
+ExecContext* default_exec_context() {
+ static ExecContext default_ctx;
+ return &default_ctx;
+}
+
+ExecBatch::ExecBatch(const RecordBatch& batch)
+ : values(batch.num_columns()), length(batch.num_rows()) {
+ auto columns = batch.column_data();
+ std::move(columns.begin(), columns.end(), values.begin());
+}
+
+bool ExecBatch::Equals(const ExecBatch& other) const {
+ return guarantee == other.guarantee && values == other.values;
+}
+
+void PrintTo(const ExecBatch& batch, std::ostream* os) {
+ *os << "ExecBatch\n";
+
+ static const std::string indent = " ";
+
+ *os << indent << "# Rows: " << batch.length << "\n";
+ if (batch.guarantee != literal(true)) {
+ *os << indent << "Guarantee: " << batch.guarantee.ToString() << "\n";
+ }
+
+ int i = 0;
+ for (const Datum& value : batch.values) {
+ *os << indent << "" << i++ << ": ";
+
+ if (value.is_scalar()) {
+ *os << "Scalar[" << value.scalar()->ToString() << "]\n";
+ continue;
+ }
+
+ auto array = value.make_array();
+ PrettyPrintOptions options;
+ options.skip_new_lines = true;
+ *os << "Array";
+ ARROW_CHECK_OK(PrettyPrint(*array, options, os));
+ *os << "\n";
+ }
+}
+
+ExecBatch ExecBatch::Slice(int64_t offset, int64_t length) const {
+ ExecBatch out = *this;
+ for (auto& value : out.values) {
+ if (value.is_scalar()) continue;
+ value = value.array()->Slice(offset, length);
+ }
+ out.length = length;
+ return out;
+}
+
+Result<ExecBatch> ExecBatch::Make(std::vector<Datum> values) {
+ if (values.empty()) {
+ return Status::Invalid("Cannot infer ExecBatch length without at least one value");
+ }
+
+ int64_t length = -1;
+ for (const auto& value : values) {
+ if (value.is_scalar()) {
+ continue;
+ }
+
+ if (length == -1) {
+ length = value.length();
+ continue;
+ }
+
+ if (length != value.length()) {
+ return Status::Invalid(
+ "Arrays used to construct an ExecBatch must have equal length");
+ }
+ }
+
+ if (length == -1) {
+ length = 1;
+ }
+
+ return ExecBatch(std::move(values), length);
+}
+
+Result<std::shared_ptr<RecordBatch>> ExecBatch::ToRecordBatch(
+ std::shared_ptr<Schema> schema, MemoryPool* pool) const {
+ ArrayVector columns(schema->num_fields());
+
+ for (size_t i = 0; i < columns.size(); ++i) {
+ const Datum& value = values[i];
+ if (value.is_array()) {
+ columns[i] = value.make_array();
+ continue;
+ }
+ ARROW_ASSIGN_OR_RAISE(columns[i], MakeArrayFromScalar(*value.scalar(), length, pool));
+ }
+
+ return RecordBatch::Make(std::move(schema), length, std::move(columns));
+}
+
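A usage sketch for the two helpers re-added above (function and variable names hypothetical): Make infers the batch length from the array arguments, treating scalars as broadcast (an all-scalar batch gets length 1), and ToRecordBatch materializes any scalar values to full-length arrays.

    #include "arrow/api.h"
    #include "arrow/compute/exec.h"

    arrow::Status RoundTrip(const std::shared_ptr<arrow::Array>& arr) {
      using arrow::compute::ExecBatch;
      auto seven = std::make_shared<arrow::Int64Scalar>(7);
      ARROW_ASSIGN_OR_RAISE(
          ExecBatch batch,
          ExecBatch::Make({arrow::Datum(arr), arrow::Datum(seven)}));
      // batch.length == arr->length(); the scalar is logically broadcast.
      auto schema = arrow::schema({arrow::field("a", arr->type()),
                                   arrow::field("b", arrow::int64())});
      ARROW_ASSIGN_OR_RAISE(auto rb, batch.ToRecordBatch(schema));
      return rb->ValidateFull();  // column "b" is now a length-n array of 7s
    }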
namespace {
Result<std::shared_ptr<Buffer>> AllocateDataBuffer(KernelContext* ctx, int64_t length,
@@ -164,57 +164,57 @@ Result<std::shared_ptr<Buffer>> AllocateDataBuffer(KernelContext* ctx, int64_t l
if (bit_width == 1) {
return ctx->AllocateBitmap(length);
} else {
- int64_t buffer_size = BitUtil::BytesForBits(length * bit_width);
+ int64_t buffer_size = BitUtil::BytesForBits(length * bit_width);
return ctx->Allocate(buffer_size);
}
}
-struct BufferPreallocation {
- explicit BufferPreallocation(int bit_width = -1, int added_length = 0)
- : bit_width(bit_width), added_length(added_length) {}
-
- int bit_width;
- int added_length;
-};
-
-void ComputeDataPreallocate(const DataType& type,
- std::vector<BufferPreallocation>* widths) {
- if (is_fixed_width(type.id()) && type.id() != Type::NA) {
- widths->emplace_back(checked_cast<const FixedWidthType&>(type).bit_width());
- return;
+struct BufferPreallocation {
+ explicit BufferPreallocation(int bit_width = -1, int added_length = 0)
+ : bit_width(bit_width), added_length(added_length) {}
+
+ int bit_width;
+ int added_length;
+};
+
+void ComputeDataPreallocate(const DataType& type,
+ std::vector<BufferPreallocation>* widths) {
+ if (is_fixed_width(type.id()) && type.id() != Type::NA) {
+ widths->emplace_back(checked_cast<const FixedWidthType&>(type).bit_width());
+ return;
+ }
+ // Preallocate binary and list offsets
+ switch (type.id()) {
+ case Type::BINARY:
+ case Type::STRING:
+ case Type::LIST:
+ case Type::MAP:
+ widths->emplace_back(32, /*added_length=*/1);
+ return;
+ case Type::LARGE_BINARY:
+ case Type::LARGE_STRING:
+ case Type::LARGE_LIST:
+ widths->emplace_back(64, /*added_length=*/1);
+ return;
+ default:
+ break;
}
- // Preallocate binary and list offsets
- switch (type.id()) {
- case Type::BINARY:
- case Type::STRING:
- case Type::LIST:
- case Type::MAP:
- widths->emplace_back(32, /*added_length=*/1);
- return;
- case Type::LARGE_BINARY:
- case Type::LARGE_STRING:
- case Type::LARGE_LIST:
- widths->emplace_back(64, /*added_length=*/1);
- return;
- default:
- break;
- }
}
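A quick note on the added_length of 1 above: n variable-length values need n + 1 offsets, because the final offset marks the end of the last value. The resulting buffer size for 32-bit offsets, as a sketch:

    #include <cstdint>

    // Bytes needed for the offsets buffer of n binary/string/list values.
    int64_t OffsetsBufferBytes32(int64_t num_values) {
      return (num_values + 1) * static_cast<int64_t>(sizeof(int32_t));
    }
    // e.g. 3 strings -> 4 offsets -> 16 bytes.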
} // namespace
namespace detail {
-Status CheckAllValues(const std::vector<Datum>& values) {
- for (const auto& value : values) {
- if (!value.is_value()) {
- return Status::Invalid("Tried executing function with non-value type: ",
- value.ToString());
- }
- }
- return Status::OK();
-}
-
+Status CheckAllValues(const std::vector<Datum>& values) {
+ for (const auto& value : values) {
+ if (!value.is_value()) {
+ return Status::Invalid("Tried executing function with non-value type: ",
+ value.ToString());
+ }
+ }
+ return Status::OK();
+}
+
ExecBatchIterator::ExecBatchIterator(std::vector<Datum> args, int64_t length,
int64_t max_chunksize)
: args_(std::move(args)),
@@ -311,35 +311,35 @@ bool ExecBatchIterator::Next(ExecBatch* batch) {
return true;
}
-namespace {
-
-struct NullGeneralization {
- enum type { PERHAPS_NULL, ALL_VALID, ALL_NULL };
-
- static type Get(const Datum& datum) {
- if (datum.type()->id() == Type::NA) {
- return ALL_NULL;
- }
-
- if (datum.is_scalar()) {
- return datum.scalar()->is_valid ? ALL_VALID : ALL_NULL;
- }
-
- const auto& arr = *datum.array();
-
+namespace {
+
+struct NullGeneralization {
+ enum type { PERHAPS_NULL, ALL_VALID, ALL_NULL };
+
+ static type Get(const Datum& datum) {
+ if (datum.type()->id() == Type::NA) {
+ return ALL_NULL;
+ }
+
+ if (datum.is_scalar()) {
+ return datum.scalar()->is_valid ? ALL_VALID : ALL_NULL;
+ }
+
+ const auto& arr = *datum.array();
+
// Do not count the bits if they haven't been counted already
- const int64_t known_null_count = arr.null_count.load();
- if ((known_null_count == 0) || (arr.buffers[0] == NULLPTR)) {
- return ALL_VALID;
- }
-
- if (known_null_count == arr.length) {
- return ALL_NULL;
- }
-
- return PERHAPS_NULL;
+ const int64_t known_null_count = arr.null_count.load();
+ if ((known_null_count == 0) || (arr.buffers[0] == NULLPTR)) {
+ return ALL_VALID;
+ }
+
+ if (known_null_count == arr.length) {
+ return ALL_NULL;
+ }
+
+ return PERHAPS_NULL;
}
-};
+};
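This three-way classification is what enables the executor's fast paths: all-valid inputs need no bitmap work at all, and a single all-null input short-circuits the whole batch. The same logic as a standalone sketch (hypothetical names); note that an unknown null count, stored as -1, falls through to the "perhaps" case.

    #include <cstdint>

    enum class NullState { kPerhapsNull, kAllValid, kAllNull };

    NullState Classify(int64_t known_null_count, int64_t length,
                       bool has_validity_buffer) {
      if (known_null_count == 0 || !has_validity_buffer) return NullState::kAllValid;
      if (known_null_count == length) return NullState::kAllNull;
      return NullState::kPerhapsNull;  // includes unknown (-1) null counts
    }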
// Null propagation implementation that deals both with preallocated bitmaps
// and maybe-to-be allocated bitmaps
@@ -356,17 +356,17 @@ class NullPropagator {
public:
NullPropagator(KernelContext* ctx, const ExecBatch& batch, ArrayData* output)
: ctx_(ctx), batch_(batch), output_(output) {
- for (const Datum& datum : batch_.values) {
- auto null_generalization = NullGeneralization::Get(datum);
-
- if (null_generalization == NullGeneralization::ALL_NULL) {
- is_all_null_ = true;
+ for (const Datum& datum : batch_.values) {
+ auto null_generalization = NullGeneralization::Get(datum);
+
+ if (null_generalization == NullGeneralization::ALL_NULL) {
+ is_all_null_ = true;
+ }
+
+ if (null_generalization != NullGeneralization::ALL_VALID &&
+ datum.kind() == Datum::ARRAY) {
+ arrays_with_nulls_.push_back(datum.array().get());
}
-
- if (null_generalization != NullGeneralization::ALL_VALID &&
- datum.kind() == Datum::ARRAY) {
- arrays_with_nulls_.push_back(datum.array().get());
- }
}
if (output->buffers[0] != nullptr) {
@@ -386,33 +386,33 @@ class NullPropagator {
return Status::OK();
}
- Status AllNullShortCircuit() {
- // OK, the output should be all null
- output_->null_count = output_->length;
+ Status AllNullShortCircuit() {
+ // OK, the output should be all null
+ output_->null_count = output_->length;
+
+ if (bitmap_preallocated_) {
+ BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, false);
+ return Status::OK();
+ }
- if (bitmap_preallocated_) {
- BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, false);
- return Status::OK();
- }
-
// Walk all the values with nulls instead of breaking on the first in case
// we find a bitmap that can be reused in the non-preallocated case
- for (const ArrayData* arr : arrays_with_nulls_) {
- if (arr->null_count.load() == arr->length && arr->buffers[0] != nullptr) {
- // Reuse this all null bitmap
- output_->buffers[0] = arr->buffers[0];
- return Status::OK();
+ for (const ArrayData* arr : arrays_with_nulls_) {
+ if (arr->null_count.load() == arr->length && arr->buffers[0] != nullptr) {
+ // Reuse this all null bitmap
+ output_->buffers[0] = arr->buffers[0];
+ return Status::OK();
}
}
- RETURN_NOT_OK(EnsureAllocated());
- BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, false);
- return Status::OK();
+ RETURN_NOT_OK(EnsureAllocated());
+ BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, false);
+ return Status::OK();
}
Status PropagateSingle() {
// One array
- const ArrayData& arr = *arrays_with_nulls_[0];
+ const ArrayData& arr = *arrays_with_nulls_[0];
const std::shared_ptr<Buffer>& arr_bitmap = arr.buffers[0];
// Reuse the null count if it's known
@@ -420,27 +420,27 @@ class NullPropagator {
if (bitmap_preallocated_) {
CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_, output_->offset);
- return Status::OK();
- }
-
- // Two cases when memory was not pre-allocated:
- //
- // * Offset is zero: we reuse the bitmap as is
- // * Offset is nonzero but a multiple of 8: we can slice the bitmap
- // * Offset is not a multiple of 8: we must allocate and use CopyBitmap
- //
- // Keep in mind that output_->offset is not permitted to be nonzero when
- // the bitmap is not preallocated, and that precondition is asserted
- // higher in the call stack.
- if (arr.offset == 0) {
- output_->buffers[0] = arr_bitmap;
- } else if (arr.offset % 8 == 0) {
- output_->buffers[0] =
- SliceBuffer(arr_bitmap, arr.offset / 8, BitUtil::BytesForBits(arr.length));
+ return Status::OK();
+ }
+
+  // Three cases when memory was not pre-allocated:
+ //
+ // * Offset is zero: we reuse the bitmap as is
+ // * Offset is nonzero but a multiple of 8: we can slice the bitmap
+ // * Offset is not a multiple of 8: we must allocate and use CopyBitmap
+ //
+ // Keep in mind that output_->offset is not permitted to be nonzero when
+ // the bitmap is not preallocated, and that precondition is asserted
+ // higher in the call stack.
+ if (arr.offset == 0) {
+ output_->buffers[0] = arr_bitmap;
+ } else if (arr.offset % 8 == 0) {
+ output_->buffers[0] =
+ SliceBuffer(arr_bitmap, arr.offset / 8, BitUtil::BytesForBits(arr.length));
} else {
- RETURN_NOT_OK(EnsureAllocated());
- CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_,
- /*dst_offset=*/0);
+ RETURN_NOT_OK(EnsureAllocated());
+ CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_,
+ /*dst_offset=*/0);
}
return Status::OK();
}
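The three branches above hinge on bit-level addressing: a validity bitmap can be shared zero-copy only when the slice starts on a byte boundary. The arithmetic, as a sketch using nothing beyond the C++ standard library:

    #include <cstdint>

    // Round a bit count up to whole bytes (what BitUtil::BytesForBits does).
    int64_t BytesForBits(int64_t bits) { return (bits + 7) / 8; }

    // offset 0  -> share the buffer as-is
    // offset 16 -> share a slice starting at byte 2 (16 / 8)
    // offset 3  -> not byte-aligned, so bits must be copied out
    bool CanZeroCopySlice(int64_t bit_offset) { return bit_offset % 8 == 0; }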
@@ -459,27 +459,27 @@ class NullPropagator {
output_->buffers[0]->mutable_data());
};
- DCHECK_GT(arrays_with_nulls_.size(), 1);
+ DCHECK_GT(arrays_with_nulls_.size(), 1);
// Seed the output bitmap with the & of the first two bitmaps
- Accumulate(*arrays_with_nulls_[0], *arrays_with_nulls_[1]);
+ Accumulate(*arrays_with_nulls_[0], *arrays_with_nulls_[1]);
// Accumulate the rest
- for (size_t i = 2; i < arrays_with_nulls_.size(); ++i) {
- Accumulate(*output_, *arrays_with_nulls_[i]);
+ for (size_t i = 2; i < arrays_with_nulls_.size(); ++i) {
+ Accumulate(*output_, *arrays_with_nulls_[i]);
}
return Status::OK();
}
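PropagateMultiple folds the inputs' bitmaps together pairwise; conceptually, NullHandling::INTERSECTION is a bitwise AND of validity bitmaps, since a row is valid only when it is valid in every input. A byte-aligned sketch (the real Accumulate handles arbitrary bit offsets):

    #include <cstddef>
    #include <cstdint>

    // out[i] = left[i] & right[i] over whole bytes; a set bit means "valid".
    void AndValidityBytes(const uint8_t* left, const uint8_t* right,
                          uint8_t* out, size_t nbytes) {
      for (size_t i = 0; i < nbytes; ++i) out[i] = left[i] & right[i];
    }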
Status Execute() {
- if (is_all_null_) {
- // An all-null value (scalar null or all-null array) gives us a short
- // circuit opportunity
- return AllNullShortCircuit();
+ if (is_all_null_) {
+ // An all-null value (scalar null or all-null array) gives us a short
+ // circuit opportunity
+ return AllNullShortCircuit();
}
// At this point, by construction we know that all of the values in
- // arrays_with_nulls_ are arrays that are not all null. So there are a
+ // arrays_with_nulls_ are arrays that are not all null. So there are a
// few cases:
//
// * No arrays. This is a no-op w/o preallocation but when the bitmap is
@@ -494,27 +494,27 @@ class NullPropagator {
output_->null_count = kUnknownNullCount;
- if (arrays_with_nulls_.empty()) {
+ if (arrays_with_nulls_.empty()) {
// No arrays with nulls case
output_->null_count = 0;
if (bitmap_preallocated_) {
BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, true);
}
return Status::OK();
- }
-
- if (arrays_with_nulls_.size() == 1) {
+ }
+
+ if (arrays_with_nulls_.size() == 1) {
return PropagateSingle();
}
-
- return PropagateMultiple();
+
+ return PropagateMultiple();
}
private:
KernelContext* ctx_;
const ExecBatch& batch_;
- std::vector<const ArrayData*> arrays_with_nulls_;
- bool is_all_null_ = false;
+ std::vector<const ArrayData*> arrays_with_nulls_;
+ bool is_all_null_ = false;
ArrayData* output_;
uint8_t* bitmap_;
bool bitmap_preallocated_ = false;
@@ -523,15 +523,15 @@ class NullPropagator {
std::shared_ptr<ChunkedArray> ToChunkedArray(const std::vector<Datum>& values,
const std::shared_ptr<DataType>& type) {
std::vector<std::shared_ptr<Array>> arrays;
- arrays.reserve(values.size());
- for (const Datum& val : values) {
- if (val.length() == 0) {
+ arrays.reserve(values.size());
+ for (const Datum& val : values) {
+ if (val.length() == 0) {
// Skip empty chunks
continue;
}
- arrays.emplace_back(val.make_array());
+ arrays.emplace_back(val.make_array());
}
- return std::make_shared<ChunkedArray>(std::move(arrays), type);
+ return std::make_shared<ChunkedArray>(std::move(arrays), type);
}
bool HaveChunkedArray(const std::vector<Datum>& values) {
@@ -543,25 +543,25 @@ bool HaveChunkedArray(const std::vector<Datum>& values) {
return false;
}
-template <typename KernelType>
-class KernelExecutorImpl : public KernelExecutor {
+template <typename KernelType>
+class KernelExecutorImpl : public KernelExecutor {
public:
- Status Init(KernelContext* kernel_ctx, KernelInitArgs args) override {
- kernel_ctx_ = kernel_ctx;
- kernel_ = static_cast<const KernelType*>(args.kernel);
+ Status Init(KernelContext* kernel_ctx, KernelInitArgs args) override {
+ kernel_ctx_ = kernel_ctx;
+ kernel_ = static_cast<const KernelType*>(args.kernel);
- // Resolve the output descriptor for this kernel
- ARROW_ASSIGN_OR_RAISE(
- output_descr_, kernel_->signature->out_type().Resolve(kernel_ctx_, args.inputs));
+ // Resolve the output descriptor for this kernel
+ ARROW_ASSIGN_OR_RAISE(
+ output_descr_, kernel_->signature->out_type().Resolve(kernel_ctx_, args.inputs));
return Status::OK();
}
- protected:
+ protected:
// This is overridden by the VectorExecutor
virtual Status SetupArgIteration(const std::vector<Datum>& args) {
- ARROW_ASSIGN_OR_RAISE(
- batch_iterator_, ExecBatchIterator::Make(args, exec_context()->exec_chunksize()));
+ ARROW_ASSIGN_OR_RAISE(
+ batch_iterator_, ExecBatchIterator::Make(args, exec_context()->exec_chunksize()));
return Status::OK();
}
@@ -570,29 +570,29 @@ class KernelExecutorImpl : public KernelExecutor {
out->buffers.resize(output_num_buffers_);
if (validity_preallocated_) {
- ARROW_ASSIGN_OR_RAISE(out->buffers[0], kernel_ctx_->AllocateBitmap(length));
+ ARROW_ASSIGN_OR_RAISE(out->buffers[0], kernel_ctx_->AllocateBitmap(length));
}
- if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) {
- out->null_count = 0;
+ if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) {
+ out->null_count = 0;
+ }
+ for (size_t i = 0; i < data_preallocated_.size(); ++i) {
+ const auto& prealloc = data_preallocated_[i];
+ if (prealloc.bit_width >= 0) {
+ ARROW_ASSIGN_OR_RAISE(
+ out->buffers[i + 1],
+ AllocateDataBuffer(kernel_ctx_, length + prealloc.added_length,
+ prealloc.bit_width));
+ }
}
- for (size_t i = 0; i < data_preallocated_.size(); ++i) {
- const auto& prealloc = data_preallocated_[i];
- if (prealloc.bit_width >= 0) {
- ARROW_ASSIGN_OR_RAISE(
- out->buffers[i + 1],
- AllocateDataBuffer(kernel_ctx_, length + prealloc.added_length,
- prealloc.bit_width));
- }
- }
return out;
}
- ExecContext* exec_context() { return kernel_ctx_->exec_context(); }
- KernelState* state() { return kernel_ctx_->state(); }
+ ExecContext* exec_context() { return kernel_ctx_->exec_context(); }
+ KernelState* state() { return kernel_ctx_->state(); }
// Not all of these members are used for every executor type
- KernelContext* kernel_ctx_;
+ KernelContext* kernel_ctx_;
const KernelType* kernel_;
std::unique_ptr<ExecBatchIterator> batch_iterator_;
ValueDescr output_descr_;
@@ -602,13 +602,13 @@ class KernelExecutorImpl : public KernelExecutor {
// If true, then memory is preallocated for the validity bitmap with the same
// strategy as the data buffer(s).
bool validity_preallocated_ = false;
-
- // The kernel writes into data buffers preallocated for these bit widths
- // (0 indicates no preallocation);
- std::vector<BufferPreallocation> data_preallocated_;
+
+ // The kernel writes into data buffers preallocated for these bit widths
+  // (a negative bit width indicates no preallocation).
+ std::vector<BufferPreallocation> data_preallocated_;
};
-class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
+class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
public:
Status Execute(const std::vector<Datum>& args, ExecListener* listener) override {
RETURN_NOT_OK(PrepareExecute(args));
@@ -646,9 +646,9 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
} else {
// XXX: In the case where no outputs are emitted, is returning a 0-length
// array always the correct move?
- return MakeArrayOfNull(output_descr_.type, /*length=*/0,
- exec_context()->memory_pool())
- .ValueOrDie();
+ return MakeArrayOfNull(output_descr_.type, /*length=*/0,
+ exec_context()->memory_pool())
+ .ValueOrDie();
}
}
}
@@ -661,7 +661,7 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
if (output_descr_.shape == ValueDescr::ARRAY) {
ArrayData* out_arr = out.mutable_array();
if (kernel_->null_handling == NullHandling::INTERSECTION) {
- RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out_arr));
+ RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out_arr));
} else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) {
out_arr->null_count = 0;
}
@@ -676,7 +676,7 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
}
}
- RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out));
+ RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out));
if (!preallocate_contiguous_) {
// If we are producing chunked output rather than one big array, then
// emit each chunk as soon as it's available
@@ -686,7 +686,7 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
}
Status PrepareExecute(const std::vector<Datum>& args) {
- RETURN_NOT_OK(this->SetupArgIteration(args));
+ RETURN_NOT_OK(this->SetupArgIteration(args));
if (output_descr_.shape == ValueDescr::ARRAY) {
// If the executor is configured to produce a single large Array output for
@@ -749,26 +749,26 @@ class ScalarExecutor : public KernelExecutorImpl<ScalarKernel> {
// Decide if we need to preallocate memory for this kernel
validity_preallocated_ =
(kernel_->null_handling != NullHandling::COMPUTED_NO_PREALLOCATE &&
- kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL &&
- output_descr_.type->id() != Type::NA);
- if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) {
- ComputeDataPreallocate(*output_descr_.type, &data_preallocated_);
- }
-
- // Contiguous preallocation only possible on non-nested types if all
- // buffers are preallocated. Otherwise, we must go chunk-by-chunk.
+ kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL &&
+ output_descr_.type->id() != Type::NA);
+ if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) {
+ ComputeDataPreallocate(*output_descr_.type, &data_preallocated_);
+ }
+
+ // Contiguous preallocation only possible on non-nested types if all
+ // buffers are preallocated. Otherwise, we must go chunk-by-chunk.
//
- // Some kernels are also unable to write into sliced outputs, so we respect the
- // kernel's attributes.
+ // Some kernels are also unable to write into sliced outputs, so we respect the
+ // kernel's attributes.
preallocate_contiguous_ =
- (exec_context()->preallocate_contiguous() && kernel_->can_write_into_slices &&
- validity_preallocated_ && !is_nested(output_descr_.type->id()) &&
- !is_dictionary(output_descr_.type->id()) &&
- data_preallocated_.size() == static_cast<size_t>(output_num_buffers_ - 1) &&
- std::all_of(data_preallocated_.begin(), data_preallocated_.end(),
- [](const BufferPreallocation& prealloc) {
- return prealloc.bit_width >= 0;
- }));
+ (exec_context()->preallocate_contiguous() && kernel_->can_write_into_slices &&
+ validity_preallocated_ && !is_nested(output_descr_.type->id()) &&
+ !is_dictionary(output_descr_.type->id()) &&
+ data_preallocated_.size() == static_cast<size_t>(output_num_buffers_ - 1) &&
+ std::all_of(data_preallocated_.begin(), data_preallocated_.end(),
+ [](const BufferPreallocation& prealloc) {
+ return prealloc.bit_width >= 0;
+ }));
if (preallocate_contiguous_) {
ARROW_ASSIGN_OR_RAISE(preallocated_, PrepareOutput(total_length));
}
@@ -790,7 +790,7 @@ Status PackBatchNoChunks(const std::vector<Datum>& args, ExecBatch* out) {
switch (arg.kind()) {
case Datum::SCALAR:
case Datum::ARRAY:
- case Datum::CHUNKED_ARRAY:
+ case Datum::CHUNKED_ARRAY:
length = std::max(arg.length(), length);
break;
default:
@@ -803,7 +803,7 @@ Status PackBatchNoChunks(const std::vector<Datum>& args, ExecBatch* out) {
return Status::OK();
}
-class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
+class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
public:
Status Execute(const std::vector<Datum>& args, ExecListener* listener) override {
RETURN_NOT_OK(PrepareExecute(args));
@@ -823,15 +823,15 @@ class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
const std::vector<Datum>& outputs) override {
// If execution yielded multiple chunks (because large arrays were split
// based on the ExecContext parameters), then the result is a ChunkedArray
- if (kernel_->output_chunked && (HaveChunkedArray(inputs) || outputs.size() > 1)) {
- return ToChunkedArray(outputs, output_descr_.type);
- } else if (outputs.size() == 1) {
- // Outputs have just one element
- return outputs[0];
+ if (kernel_->output_chunked && (HaveChunkedArray(inputs) || outputs.size() > 1)) {
+ return ToChunkedArray(outputs, output_descr_.type);
+ } else if (outputs.size() == 1) {
+ // Outputs have just one element
+ return outputs[0];
} else {
- // XXX: In the case where no outputs are omitted, is returning a 0-length
- // array always the correct move?
- return MakeArrayOfNull(output_descr_.type, /*length=*/0).ValueOrDie();
+      // XXX: In the case where no outputs are emitted, is returning a 0-length
+ // array always the correct move?
+ return MakeArrayOfNull(output_descr_.type, /*length=*/0).ValueOrDie();
}
}
@@ -851,9 +851,9 @@ class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
if (kernel_->null_handling == NullHandling::INTERSECTION &&
output_descr_.shape == ValueDescr::ARRAY) {
- RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out.mutable_array()));
+ RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out.mutable_array()));
}
- RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out));
+ RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out));
if (!kernel_->finalize) {
// If there is no result finalizer (e.g. for hash-based functions), we can
// emit the processed batch right away rather than waiting
@@ -868,7 +868,7 @@ class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
if (kernel_->finalize) {
// Intermediate results require post-processing after the execution is
// completed (possibly involving some accumulated state)
- RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &results_));
+ RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &results_));
for (const auto& result : results_) {
RETURN_NOT_OK(listener->OnResult(result));
}
@@ -878,39 +878,39 @@ class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
Status SetupArgIteration(const std::vector<Datum>& args) override {
if (kernel_->can_execute_chunkwise) {
- ARROW_ASSIGN_OR_RAISE(batch_iterator_, ExecBatchIterator::Make(
- args, exec_context()->exec_chunksize()));
+ ARROW_ASSIGN_OR_RAISE(batch_iterator_, ExecBatchIterator::Make(
+ args, exec_context()->exec_chunksize()));
}
return Status::OK();
}
Status PrepareExecute(const std::vector<Datum>& args) {
- RETURN_NOT_OK(this->SetupArgIteration(args));
+ RETURN_NOT_OK(this->SetupArgIteration(args));
output_num_buffers_ = static_cast<int>(output_descr_.type->layout().buffers.size());
// Decide if we need to preallocate memory for this kernel
validity_preallocated_ =
(kernel_->null_handling != NullHandling::COMPUTED_NO_PREALLOCATE &&
kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL);
- if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) {
- ComputeDataPreallocate(*output_descr_.type, &data_preallocated_);
- }
+ if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) {
+ ComputeDataPreallocate(*output_descr_.type, &data_preallocated_);
+ }
return Status::OK();
}
std::vector<Datum> results_;
};
-class ScalarAggExecutor : public KernelExecutorImpl<ScalarAggregateKernel> {
+class ScalarAggExecutor : public KernelExecutorImpl<ScalarAggregateKernel> {
public:
- Status Init(KernelContext* ctx, KernelInitArgs args) override {
- input_descrs_ = &args.inputs;
- options_ = args.options;
- return KernelExecutorImpl<ScalarAggregateKernel>::Init(ctx, args);
- }
+ Status Init(KernelContext* ctx, KernelInitArgs args) override {
+ input_descrs_ = &args.inputs;
+ options_ = args.options;
+ return KernelExecutorImpl<ScalarAggregateKernel>::Init(ctx, args);
+ }
Status Execute(const std::vector<Datum>& args, ExecListener* listener) override {
- RETURN_NOT_OK(this->SetupArgIteration(args));
+ RETURN_NOT_OK(this->SetupArgIteration(args));
ExecBatch batch;
while (batch_iterator_->Next(&batch)) {
@@ -921,7 +921,7 @@ class ScalarAggExecutor : public KernelExecutorImpl<ScalarAggregateKernel> {
}
Datum out;
- RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &out));
+ RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &out));
RETURN_NOT_OK(listener->OnResult(std::move(out)));
return Status::OK();
}
@@ -934,78 +934,78 @@ class ScalarAggExecutor : public KernelExecutorImpl<ScalarAggregateKernel> {
private:
Status Consume(const ExecBatch& batch) {
- // FIXME(ARROW-11840) don't merge *any* aggegates for every batch
- ARROW_ASSIGN_OR_RAISE(
- auto batch_state,
- kernel_->init(kernel_ctx_, {kernel_, *input_descrs_, options_}));
+    // FIXME(ARROW-11840) don't merge *any* aggregates for every batch
+ ARROW_ASSIGN_OR_RAISE(
+ auto batch_state,
+ kernel_->init(kernel_ctx_, {kernel_, *input_descrs_, options_}));
if (batch_state == nullptr) {
- return Status::Invalid("ScalarAggregation requires non-null kernel state");
+ return Status::Invalid("ScalarAggregation requires non-null kernel state");
}
- KernelContext batch_ctx(exec_context());
+ KernelContext batch_ctx(exec_context());
batch_ctx.SetState(batch_state.get());
- RETURN_NOT_OK(kernel_->consume(&batch_ctx, batch));
- RETURN_NOT_OK(kernel_->merge(kernel_ctx_, std::move(*batch_state), state()));
+ RETURN_NOT_OK(kernel_->consume(&batch_ctx, batch));
+ RETURN_NOT_OK(kernel_->merge(kernel_ctx_, std::move(*batch_state), state()));
return Status::OK();
}
-
- const std::vector<ValueDescr>* input_descrs_;
- const FunctionOptions* options_;
+
+ const std::vector<ValueDescr>* input_descrs_;
+ const FunctionOptions* options_;
};
template <typename ExecutorType,
typename FunctionType = typename ExecutorType::FunctionType>
-Result<std::unique_ptr<KernelExecutor>> MakeExecutor(ExecContext* ctx,
- const Function* func,
- const FunctionOptions* options) {
+Result<std::unique_ptr<KernelExecutor>> MakeExecutor(ExecContext* ctx,
+ const Function* func,
+ const FunctionOptions* options) {
DCHECK_EQ(ExecutorType::function_kind, func->kind());
auto typed_func = checked_cast<const FunctionType*>(func);
- return std::unique_ptr<KernelExecutor>(new ExecutorType(ctx, typed_func, options));
+ return std::unique_ptr<KernelExecutor>(new ExecutorType(ctx, typed_func, options));
}
-} // namespace
-
-Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) {
- DCHECK_NE(nullptr, output);
- DCHECK_GT(output->buffers.size(), 0);
-
- if (output->type->id() == Type::NA) {
- // Null output type is a no-op (rare when this would happen but we at least
- // will test for it)
- return Status::OK();
+} // namespace
+
+Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) {
+ DCHECK_NE(nullptr, output);
+ DCHECK_GT(output->buffers.size(), 0);
+
+ if (output->type->id() == Type::NA) {
+  // Null output type is a no-op (it would rarely happen, but we at least
+  // test for it)
+ return Status::OK();
+ }
+
+ // This function is ONLY able to write into output with non-zero offset
+ // when the bitmap is preallocated. This could be a DCHECK but returning
+ // error Status for now for emphasis
+ if (output->offset != 0 && output->buffers[0] == nullptr) {
+ return Status::Invalid(
+ "Can only propagate nulls into pre-allocated memory "
+ "when the output offset is non-zero");
}
-
- // This function is ONLY able to write into output with non-zero offset
- // when the bitmap is preallocated. This could be a DCHECK but returning
- // error Status for now for emphasis
- if (output->offset != 0 && output->buffers[0] == nullptr) {
- return Status::Invalid(
- "Can only propagate nulls into pre-allocated memory "
- "when the output offset is non-zero");
- }
- NullPropagator propagator(ctx, batch, output);
- return propagator.Execute();
+ NullPropagator propagator(ctx, batch, output);
+ return propagator.Execute();
+}
+
+std::unique_ptr<KernelExecutor> KernelExecutor::MakeScalar() {
+ return ::arrow::internal::make_unique<detail::ScalarExecutor>();
+}
+
+std::unique_ptr<KernelExecutor> KernelExecutor::MakeVector() {
+ return ::arrow::internal::make_unique<detail::VectorExecutor>();
+}
+
+std::unique_ptr<KernelExecutor> KernelExecutor::MakeScalarAggregate() {
+ return ::arrow::internal::make_unique<detail::ScalarAggExecutor>();
}
-std::unique_ptr<KernelExecutor> KernelExecutor::MakeScalar() {
- return ::arrow::internal::make_unique<detail::ScalarExecutor>();
-}
-
-std::unique_ptr<KernelExecutor> KernelExecutor::MakeVector() {
- return ::arrow::internal::make_unique<detail::VectorExecutor>();
-}
-
-std::unique_ptr<KernelExecutor> KernelExecutor::MakeScalarAggregate() {
- return ::arrow::internal::make_unique<detail::ScalarAggExecutor>();
-}
-
} // namespace detail
-ExecContext::ExecContext(MemoryPool* pool, ::arrow::internal::Executor* executor,
- FunctionRegistry* func_registry)
- : pool_(pool), executor_(executor) {
+ExecContext::ExecContext(MemoryPool* pool, ::arrow::internal::Executor* executor,
+ FunctionRegistry* func_registry)
+ : pool_(pool), executor_(executor) {
this->func_registry_ = func_registry == nullptr ? GetFunctionRegistry() : func_registry;
}
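A small usage sketch for the context plumbing above (the executor and registry take their defaults; as the constructor shows, a null registry falls back to GetFunctionRegistry):

    #include "arrow/compute/exec.h"
    #include "arrow/memory_pool.h"

    // An ExecContext with an explicit pool; executor/registry stay default.
    arrow::compute::ExecContext MakeContext() {
      return arrow::compute::ExecContext(arrow::default_memory_pool());
    }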
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h
index 90fb291dbb8..de1b695de48 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec.h
@@ -28,13 +28,13 @@
#include <vector>
#include "arrow/array/data.h"
-#include "arrow/compute/exec/expression.h"
+#include "arrow/compute/exec/expression.h"
#include "arrow/datum.h"
#include "arrow/memory_pool.h"
#include "arrow/result.h"
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
-#include "arrow/util/type_fwd.h"
+#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -46,7 +46,7 @@ class CpuInfo;
namespace compute {
-class FunctionOptions;
+class FunctionOptions;
class FunctionRegistry;
// It seems like 64K might be a good default chunksize to use for execution
@@ -61,7 +61,7 @@ class ARROW_EXPORT ExecContext {
public:
// If no function registry passed, the default is used.
explicit ExecContext(MemoryPool* pool = default_memory_pool(),
- ::arrow::internal::Executor* executor = NULLPTR,
+ ::arrow::internal::Executor* executor = NULLPTR,
FunctionRegistry* func_registry = NULLPTR);
/// \brief The MemoryPool used for allocations, default is
@@ -70,9 +70,9 @@ class ARROW_EXPORT ExecContext {
::arrow::internal::CpuInfo* cpu_info() const;
- /// \brief An Executor which may be used to parallelize execution.
- ::arrow::internal::Executor* executor() const { return executor_; }
-
+ /// \brief An Executor which may be used to parallelize execution.
+ ::arrow::internal::Executor* executor() const { return executor_; }
+
/// \brief The FunctionRegistry for looking up functions by name and
/// selecting kernels for execution. Defaults to the library-global function
/// registry provided by GetFunctionRegistry.
@@ -119,15 +119,15 @@ class ARROW_EXPORT ExecContext {
private:
MemoryPool* pool_;
- ::arrow::internal::Executor* executor_;
+ ::arrow::internal::Executor* executor_;
FunctionRegistry* func_registry_;
int64_t exec_chunksize_ = std::numeric_limits<int64_t>::max();
bool preallocate_contiguous_ = true;
bool use_threads_ = true;
};
-ARROW_EXPORT ExecContext* default_exec_context();
-
+ARROW_EXPORT ExecContext* default_exec_context();
+
// TODO: Consider standardizing on uint16 selection vectors and only use them
// when we can ensure that each value is 64K length or smaller
@@ -173,18 +173,18 @@ class ARROW_EXPORT SelectionVector {
/// TODO: Datum uses arrow/util/variant.h which may be a bit heavier-weight
/// than is desirable for this class. Microbenchmarks would help determine for
/// sure. See ARROW-8928.
-struct ARROW_EXPORT ExecBatch {
- ExecBatch() = default;
+struct ARROW_EXPORT ExecBatch {
+ ExecBatch() = default;
ExecBatch(std::vector<Datum> values, int64_t length)
: values(std::move(values)), length(length) {}
- explicit ExecBatch(const RecordBatch& batch);
-
- static Result<ExecBatch> Make(std::vector<Datum> values);
-
- Result<std::shared_ptr<RecordBatch>> ToRecordBatch(
- std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool()) const;
-
+ explicit ExecBatch(const RecordBatch& batch);
+
+ static Result<ExecBatch> Make(std::vector<Datum> values);
+
+ Result<std::shared_ptr<RecordBatch>> ToRecordBatch(
+ std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool()) const;
+
/// The values representing positional arguments to be passed to a kernel's
/// exec function for processing.
std::vector<Datum> values;
@@ -196,9 +196,9 @@ struct ARROW_EXPORT ExecBatch {
/// ExecBatch::length is equal to the length of this array.
std::shared_ptr<SelectionVector> selection_vector;
- /// A predicate Expression guaranteed to evaluate to true for all rows in this batch.
- Expression guarantee = literal(true);
-
+ /// A predicate Expression guaranteed to evaluate to true for all rows in this batch.
+ Expression guarantee = literal(true);
+
/// The semantic length of the ExecBatch. When the values are all scalars,
/// the length should be set to 1, otherwise the length is taken from the
/// array values, except when there is a selection vector. When there is a
@@ -216,13 +216,13 @@ struct ARROW_EXPORT ExecBatch {
return values[i];
}
- bool Equals(const ExecBatch& other) const;
-
+ bool Equals(const ExecBatch& other) const;
+
/// \brief A convenience for the number of values / arguments.
int num_values() const { return static_cast<int>(values.size()); }
- ExecBatch Slice(int64_t offset, int64_t length) const;
-
+ ExecBatch Slice(int64_t offset, int64_t length) const;
+
/// \brief A convenience for returning the ValueDescr objects (types and
/// shapes) from the batch.
std::vector<ValueDescr> GetDescriptors() const {
@@ -232,13 +232,13 @@ struct ARROW_EXPORT ExecBatch {
}
return result;
}
-
- ARROW_EXPORT friend void PrintTo(const ExecBatch&, std::ostream*);
+
+ ARROW_EXPORT friend void PrintTo(const ExecBatch&, std::ostream*);
};
-inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); }
-inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); }
-
+inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); }
+inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); }
+
/// \defgroup compute-call-function One-shot calls to compute functions
///
/// @{
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc
index aec7805ceea..433e895c243 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.cc
@@ -1,823 +1,823 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/exec_plan.h"
-
-#include <mutex>
-#include <thread>
-#include <unordered_map>
-#include <unordered_set>
-
-#include "arrow/array/util.h"
-#include "arrow/compute/api_vector.h"
-#include "arrow/compute/exec.h"
-#include "arrow/compute/exec/expression.h"
-#include "arrow/compute/registry.h"
-#include "arrow/datum.h"
-#include "arrow/record_batch.h"
-#include "arrow/result.h"
-#include "arrow/util/async_generator.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/optional.h"
-
-namespace arrow {
-
-using internal::checked_cast;
-using internal::checked_pointer_cast;
-
-namespace compute {
-
-namespace {
-
-struct ExecPlanImpl : public ExecPlan {
- explicit ExecPlanImpl(ExecContext* exec_context) : ExecPlan(exec_context) {}
-
- ~ExecPlanImpl() override {
- if (started_ && !finished_.is_finished()) {
- ARROW_LOG(WARNING) << "Plan was destroyed before finishing";
- StopProducing();
- finished().Wait();
- }
- }
-
- ExecNode* AddNode(std::unique_ptr<ExecNode> node) {
- if (node->num_inputs() == 0) {
- sources_.push_back(node.get());
- }
- if (node->num_outputs() == 0) {
- sinks_.push_back(node.get());
- }
- nodes_.push_back(std::move(node));
- return nodes_.back().get();
- }
-
- Status Validate() const {
- if (nodes_.empty()) {
- return Status::Invalid("ExecPlan has no node");
- }
- for (const auto& node : nodes_) {
- RETURN_NOT_OK(node->Validate());
- }
- return Status::OK();
- }
-
- Status StartProducing() {
- if (started_) {
- return Status::Invalid("restarted ExecPlan");
- }
- started_ = true;
-
- // producers precede consumers
- sorted_nodes_ = TopoSort();
-
- std::vector<Future<>> futures;
-
- Status st = Status::OK();
-
- using rev_it = std::reverse_iterator<NodeVector::iterator>;
- for (rev_it it(sorted_nodes_.end()), end(sorted_nodes_.begin()); it != end; ++it) {
- auto node = *it;
-
- st = node->StartProducing();
- if (!st.ok()) {
- // Stop nodes that successfully started, in reverse order
- stopped_ = true;
- StopProducingImpl(it.base(), sorted_nodes_.end());
- break;
- }
-
- futures.push_back(node->finished());
- }
-
- finished_ = AllComplete(std::move(futures));
- return st;
- }
-
- void StopProducing() {
- DCHECK(started_) << "stopped an ExecPlan which never started";
- stopped_ = true;
-
- StopProducingImpl(sorted_nodes_.begin(), sorted_nodes_.end());
- }
-
- template <typename It>
- void StopProducingImpl(It begin, It end) {
- for (auto it = begin; it != end; ++it) {
- auto node = *it;
- node->StopProducing();
- }
- }
-
- NodeVector TopoSort() {
- struct Impl {
- const std::vector<std::unique_ptr<ExecNode>>& nodes;
- std::unordered_set<ExecNode*> visited;
- NodeVector sorted;
-
- explicit Impl(const std::vector<std::unique_ptr<ExecNode>>& nodes) : nodes(nodes) {
- visited.reserve(nodes.size());
- sorted.resize(nodes.size());
-
- for (const auto& node : nodes) {
- Visit(node.get());
- }
-
- DCHECK_EQ(visited.size(), nodes.size());
- }
-
- void Visit(ExecNode* node) {
- if (visited.count(node) != 0) return;
-
- for (auto input : node->inputs()) {
- // Ensure that producers are inserted before this consumer
- Visit(input);
- }
-
- sorted[visited.size()] = node;
- visited.insert(node);
- }
- };
-
- return std::move(Impl{nodes_}.sorted);
- }
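The TopoSort above is a depth-first post-order: each node is appended only after all of its inputs, which is exactly the producers-before-consumers order StartProducing needs (it then walks the result in reverse). The same idea in a standalone sketch over a hypothetical Node type:

    #include <unordered_set>
    #include <vector>

    struct Node { std::vector<Node*> inputs; };

    // Appends producers before consumers; assumes the graph is acyclic.
    void Visit(Node* n, std::unordered_set<Node*>* seen,
               std::vector<Node*>* sorted) {
      if (!seen->insert(n).second) return;  // already visited
      for (Node* in : n->inputs) Visit(in, seen, sorted);
      sorted->push_back(n);
    }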
-
- Future<> finished_ = Future<>::MakeFinished();
- bool started_ = false, stopped_ = false;
- std::vector<std::unique_ptr<ExecNode>> nodes_;
- NodeVector sources_, sinks_;
- NodeVector sorted_nodes_;
-};
-
-ExecPlanImpl* ToDerived(ExecPlan* ptr) { return checked_cast<ExecPlanImpl*>(ptr); }
-
-const ExecPlanImpl* ToDerived(const ExecPlan* ptr) {
- return checked_cast<const ExecPlanImpl*>(ptr);
-}
-
-util::optional<int> GetNodeIndex(const std::vector<ExecNode*>& nodes,
- const ExecNode* node) {
- for (int i = 0; i < static_cast<int>(nodes.size()); ++i) {
- if (nodes[i] == node) return i;
- }
- return util::nullopt;
-}
-
-} // namespace
-
-Result<std::shared_ptr<ExecPlan>> ExecPlan::Make(ExecContext* ctx) {
- return std::shared_ptr<ExecPlan>(new ExecPlanImpl{ctx});
-}
-
-ExecNode* ExecPlan::AddNode(std::unique_ptr<ExecNode> node) {
- return ToDerived(this)->AddNode(std::move(node));
-}
-
-const ExecPlan::NodeVector& ExecPlan::sources() const {
- return ToDerived(this)->sources_;
-}
-
-const ExecPlan::NodeVector& ExecPlan::sinks() const { return ToDerived(this)->sinks_; }
-
-Status ExecPlan::Validate() { return ToDerived(this)->Validate(); }
-
-Status ExecPlan::StartProducing() { return ToDerived(this)->StartProducing(); }
-
-void ExecPlan::StopProducing() { ToDerived(this)->StopProducing(); }
-
-Future<> ExecPlan::finished() { return ToDerived(this)->finished_; }
-
-ExecNode::ExecNode(ExecPlan* plan, std::string label, NodeVector inputs,
- std::vector<std::string> input_labels,
- std::shared_ptr<Schema> output_schema, int num_outputs)
- : plan_(plan),
- label_(std::move(label)),
- inputs_(std::move(inputs)),
- input_labels_(std::move(input_labels)),
- output_schema_(std::move(output_schema)),
- num_outputs_(num_outputs) {
- for (auto input : inputs_) {
- input->outputs_.push_back(this);
- }
-}
-
-Status ExecNode::Validate() const {
- if (inputs_.size() != input_labels_.size()) {
- return Status::Invalid("Invalid number of inputs for '", label(), "' (expected ",
- num_inputs(), ", actual ", input_labels_.size(), ")");
- }
-
- if (static_cast<int>(outputs_.size()) != num_outputs_) {
- return Status::Invalid("Invalid number of outputs for '", label(), "' (expected ",
- num_outputs(), ", actual ", outputs_.size(), ")");
- }
-
- for (auto out : outputs_) {
- auto input_index = GetNodeIndex(out->inputs(), this);
- if (!input_index) {
- return Status::Invalid("Node '", label(), "' outputs to node '", out->label(),
- "' but is not listed as an input.");
- }
- }
-
- return Status::OK();
-}
-
-struct SourceNode : ExecNode {
- SourceNode(ExecPlan* plan, std::string label, std::shared_ptr<Schema> output_schema,
- AsyncGenerator<util::optional<ExecBatch>> generator)
- : ExecNode(plan, std::move(label), {}, {}, std::move(output_schema),
- /*num_outputs=*/1),
- generator_(std::move(generator)) {}
-
- const char* kind_name() override { return "SourceNode"; }
-
- [[noreturn]] static void NoInputs() {
- DCHECK(false) << "no inputs; this should never be called";
- std::abort();
- }
- [[noreturn]] void InputReceived(ExecNode*, int, ExecBatch) override { NoInputs(); }
- [[noreturn]] void ErrorReceived(ExecNode*, Status) override { NoInputs(); }
- [[noreturn]] void InputFinished(ExecNode*, int) override { NoInputs(); }
-
- Status StartProducing() override {
- DCHECK(!stop_requested_) << "Restarted SourceNode";
-
- CallbackOptions options;
- if (auto executor = plan()->exec_context()->executor()) {
- // These options will transfer execution to the desired Executor if necessary.
- // This can happen for in-memory scans where batches didn't require
-      // any CPU work to decode. Otherwise, parsing etc. should already have
-      // placed us on the desired Executor and no queues will be pushed to.
- options.executor = executor;
- options.should_schedule = ShouldSchedule::IfDifferentExecutor;
- }
-
- finished_ = Loop([this, options] {
- std::unique_lock<std::mutex> lock(mutex_);
- int seq = batch_count_++;
- if (stop_requested_) {
- return Future<ControlFlow<int>>::MakeFinished(Break(seq));
- }
- lock.unlock();
-
- return generator_().Then(
- [=](const util::optional<ExecBatch>& batch) -> ControlFlow<int> {
- std::unique_lock<std::mutex> lock(mutex_);
- if (IsIterationEnd(batch) || stop_requested_) {
- stop_requested_ = true;
- return Break(seq);
- }
- lock.unlock();
-
- outputs_[0]->InputReceived(this, seq, *batch);
- return Continue();
- },
- [=](const Status& error) -> ControlFlow<int> {
- // NB: ErrorReceived is independent of InputFinished, but
- // ErrorReceived will usually prompt StopProducing which will
- // prompt InputFinished. ErrorReceived may still be called from a
- // node which was requested to stop (indeed, the request to stop
- // may prompt an error).
- std::unique_lock<std::mutex> lock(mutex_);
- stop_requested_ = true;
- lock.unlock();
- outputs_[0]->ErrorReceived(this, error);
- return Break(seq);
- },
- options);
- }).Then([&](int seq) { outputs_[0]->InputFinished(this, seq); });
-
- return Status::OK();
- }
-
- void PauseProducing(ExecNode* output) override {}
-
- void ResumeProducing(ExecNode* output) override {}
-
- void StopProducing(ExecNode* output) override {
- DCHECK_EQ(output, outputs_[0]);
- StopProducing();
- }
-
- void StopProducing() override {
- std::unique_lock<std::mutex> lock(mutex_);
- stop_requested_ = true;
- }
-
- Future<> finished() override { return finished_; }
-
- private:
- std::mutex mutex_;
- bool stop_requested_{false};
- int batch_count_{0};
- Future<> finished_ = Future<>::MakeFinished();
- AsyncGenerator<util::optional<ExecBatch>> generator_;
-};
-
-ExecNode* MakeSourceNode(ExecPlan* plan, std::string label,
- std::shared_ptr<Schema> output_schema,
- AsyncGenerator<util::optional<ExecBatch>> generator) {
- return plan->EmplaceNode<SourceNode>(plan, std::move(label), std::move(output_schema),
- std::move(generator));
-}
-
-struct FilterNode : ExecNode {
- FilterNode(ExecNode* input, std::string label, Expression filter)
- : ExecNode(input->plan(), std::move(label), {input}, {"target"},
- /*output_schema=*/input->output_schema(),
- /*num_outputs=*/1),
- filter_(std::move(filter)) {}
-
- const char* kind_name() override { return "FilterNode"; }
-
- Result<ExecBatch> DoFilter(const ExecBatch& target) {
- ARROW_ASSIGN_OR_RAISE(Expression simplified_filter,
- SimplifyWithGuarantee(filter_, target.guarantee));
-
- ARROW_ASSIGN_OR_RAISE(Datum mask, ExecuteScalarExpression(simplified_filter, target,
- plan()->exec_context()));
-
- if (mask.is_scalar()) {
- const auto& mask_scalar = mask.scalar_as<BooleanScalar>();
- if (mask_scalar.is_valid && mask_scalar.value) {
- return target;
- }
-
- return target.Slice(0, 0);
- }
-
- // if the values are all scalar then the mask must also be
- DCHECK(!std::all_of(target.values.begin(), target.values.end(),
- [](const Datum& value) { return value.is_scalar(); }));
-
- auto values = target.values;
- for (auto& value : values) {
- if (value.is_scalar()) continue;
- ARROW_ASSIGN_OR_RAISE(value, Filter(value, mask, FilterOptions::Defaults()));
- }
- return ExecBatch::Make(std::move(values));
- }
-
- void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
- DCHECK_EQ(input, inputs_[0]);
-
- auto maybe_filtered = DoFilter(std::move(batch));
- if (!maybe_filtered.ok()) {
- outputs_[0]->ErrorReceived(this, maybe_filtered.status());
- return;
- }
-
- maybe_filtered->guarantee = batch.guarantee;
- outputs_[0]->InputReceived(this, seq, maybe_filtered.MoveValueUnsafe());
- }
-
- void ErrorReceived(ExecNode* input, Status error) override {
- DCHECK_EQ(input, inputs_[0]);
- outputs_[0]->ErrorReceived(this, std::move(error));
- }
-
- void InputFinished(ExecNode* input, int seq) override {
- DCHECK_EQ(input, inputs_[0]);
- outputs_[0]->InputFinished(this, seq);
- }
-
- Status StartProducing() override { return Status::OK(); }
-
- void PauseProducing(ExecNode* output) override {}
-
- void ResumeProducing(ExecNode* output) override {}
-
- void StopProducing(ExecNode* output) override {
- DCHECK_EQ(output, outputs_[0]);
- StopProducing();
- }
-
- void StopProducing() override { inputs_[0]->StopProducing(this); }
-
- Future<> finished() override { return inputs_[0]->finished(); }
-
- private:
- Expression filter_;
-};
-
-Result<ExecNode*> MakeFilterNode(ExecNode* input, std::string label, Expression filter) {
- if (!filter.IsBound()) {
- ARROW_ASSIGN_OR_RAISE(filter, filter.Bind(*input->output_schema()));
- }
-
- if (filter.type()->id() != Type::BOOL) {
- return Status::TypeError("Filter expression must evaluate to bool, but ",
- filter.ToString(), " evaluates to ",
- filter.type()->ToString());
- }
-
- return input->plan()->EmplaceNode<FilterNode>(input, std::move(label),
- std::move(filter));
-}
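Taken together, the source/filter/sink factories in this file compose into a plan. A hedged sketch in the 4.0-era API shown in this diff, with the schema, batch generator, and predicate all hypothetical:

    #include "arrow/compute/api.h"
    #include "arrow/compute/exec/exec_plan.h"
    #include "arrow/compute/exec/expression.h"

    arrow::Status RunPlan(
        std::shared_ptr<arrow::Schema> schema,
        arrow::AsyncGenerator<arrow::util::optional<arrow::compute::ExecBatch>> gen) {
      using namespace arrow::compute;
      ARROW_ASSIGN_OR_RAISE(auto plan, ExecPlan::Make(default_exec_context()));
      ExecNode* source =
          MakeSourceNode(plan.get(), "source", schema, std::move(gen));
      ARROW_ASSIGN_OR_RAISE(
          ExecNode* filtered,
          MakeFilterNode(source, "filter",
                         call("greater", {field_ref("x"), literal(3)})));
      auto sink_gen = MakeSinkNode(filtered, "sink");
      RETURN_NOT_OK(plan->Validate());
      RETURN_NOT_OK(plan->StartProducing());
      // ... drain sink_gen, e.g. via MakeGeneratorReader ...
      return plan->finished().status();  // blocks until all nodes finish
    }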
-
-struct ProjectNode : ExecNode {
- ProjectNode(ExecNode* input, std::string label, std::shared_ptr<Schema> output_schema,
- std::vector<Expression> exprs)
- : ExecNode(input->plan(), std::move(label), {input}, {"target"},
- /*output_schema=*/std::move(output_schema),
- /*num_outputs=*/1),
- exprs_(std::move(exprs)) {}
-
- const char* kind_name() override { return "ProjectNode"; }
-
- Result<ExecBatch> DoProject(const ExecBatch& target) {
- std::vector<Datum> values{exprs_.size()};
- for (size_t i = 0; i < exprs_.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(Expression simplified_expr,
- SimplifyWithGuarantee(exprs_[i], target.guarantee));
-
- ARROW_ASSIGN_OR_RAISE(values[i], ExecuteScalarExpression(simplified_expr, target,
- plan()->exec_context()));
- }
- return ExecBatch{std::move(values), target.length};
- }
-
- void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
- DCHECK_EQ(input, inputs_[0]);
-
- auto maybe_projected = DoProject(std::move(batch));
- if (!maybe_projected.ok()) {
- outputs_[0]->ErrorReceived(this, maybe_projected.status());
- return;
- }
-
- maybe_projected->guarantee = batch.guarantee;
- outputs_[0]->InputReceived(this, seq, maybe_projected.MoveValueUnsafe());
- }
-
- void ErrorReceived(ExecNode* input, Status error) override {
- DCHECK_EQ(input, inputs_[0]);
- outputs_[0]->ErrorReceived(this, std::move(error));
- }
-
- void InputFinished(ExecNode* input, int seq) override {
- DCHECK_EQ(input, inputs_[0]);
- outputs_[0]->InputFinished(this, seq);
- }
-
- Status StartProducing() override { return Status::OK(); }
-
- void PauseProducing(ExecNode* output) override {}
-
- void ResumeProducing(ExecNode* output) override {}
-
- void StopProducing(ExecNode* output) override {
- DCHECK_EQ(output, outputs_[0]);
- StopProducing();
- }
-
- void StopProducing() override { inputs_[0]->StopProducing(this); }
-
- Future<> finished() override { return inputs_[0]->finished(); }
-
- private:
- std::vector<Expression> exprs_;
-};
-
-Result<ExecNode*> MakeProjectNode(ExecNode* input, std::string label,
- std::vector<Expression> exprs,
- std::vector<std::string> names) {
- FieldVector fields(exprs.size());
-
- if (names.size() == 0) {
- names.resize(exprs.size());
- for (size_t i = 0; i < exprs.size(); ++i) {
- names[i] = exprs[i].ToString();
- }
- }
-
- int i = 0;
- for (auto& expr : exprs) {
- if (!expr.IsBound()) {
- ARROW_ASSIGN_OR_RAISE(expr, expr.Bind(*input->output_schema()));
- }
- fields[i] = field(std::move(names[i]), expr.type());
- ++i;
- }
-
- return input->plan()->EmplaceNode<ProjectNode>(
- input, std::move(label), schema(std::move(fields)), std::move(exprs));
-}
-
-struct SinkNode : ExecNode {
- SinkNode(ExecNode* input, std::string label,
- AsyncGenerator<util::optional<ExecBatch>>* generator)
- : ExecNode(input->plan(), std::move(label), {input}, {"collected"}, {},
- /*num_outputs=*/0),
- producer_(MakeProducer(generator)) {}
-
- static PushGenerator<util::optional<ExecBatch>>::Producer MakeProducer(
- AsyncGenerator<util::optional<ExecBatch>>* out_gen) {
- PushGenerator<util::optional<ExecBatch>> gen;
- auto out = gen.producer();
- *out_gen = std::move(gen);
- return out;
- }
-
- const char* kind_name() override { return "SinkNode"; }
-
- Status StartProducing() override {
- finished_ = Future<>::Make();
- return Status::OK();
- }
-
- // sink nodes have no outputs from which to feel backpressure
- [[noreturn]] static void NoOutputs() {
- DCHECK(false) << "no outputs; this should never be called";
- std::abort();
- }
- [[noreturn]] void ResumeProducing(ExecNode* output) override { NoOutputs(); }
- [[noreturn]] void PauseProducing(ExecNode* output) override { NoOutputs(); }
- [[noreturn]] void StopProducing(ExecNode* output) override { NoOutputs(); }
-
- void StopProducing() override {
- Finish();
- inputs_[0]->StopProducing(this);
- }
-
- Future<> finished() override { return finished_; }
-
- void InputReceived(ExecNode* input, int seq_num, ExecBatch batch) override {
- DCHECK_EQ(input, inputs_[0]);
-
- std::unique_lock<std::mutex> lock(mutex_);
- if (finished_.is_finished()) return;
-
- ++num_received_;
- if (num_received_ == emit_stop_) {
- lock.unlock();
- producer_.Push(std::move(batch));
- Finish();
- return;
- }
-
- if (emit_stop_ != -1) {
- DCHECK_LE(seq_num, emit_stop_);
- }
-
- lock.unlock();
- producer_.Push(std::move(batch));
- }
-
- void ErrorReceived(ExecNode* input, Status error) override {
- DCHECK_EQ(input, inputs_[0]);
- producer_.Push(std::move(error));
- Finish();
- inputs_[0]->StopProducing(this);
- }
-
- void InputFinished(ExecNode* input, int seq_stop) override {
- std::unique_lock<std::mutex> lock(mutex_);
- emit_stop_ = seq_stop;
- if (num_received_ == emit_stop_) {
- lock.unlock();
- Finish();
- }
- }
-
- private:
- void Finish() {
- if (producer_.Close()) {
- finished_.MarkFinished();
- }
- }
-
- std::mutex mutex_;
-
- int num_received_ = 0;
- int emit_stop_ = -1;
- Future<> finished_ = Future<>::MakeFinished();
-
- PushGenerator<util::optional<ExecBatch>>::Producer producer_;
-};
-
-AsyncGenerator<util::optional<ExecBatch>> MakeSinkNode(ExecNode* input,
- std::string label) {
- AsyncGenerator<util::optional<ExecBatch>> out;
- (void)input->plan()->EmplaceNode<SinkNode>(input, std::move(label), &out);
- return out;
-}
-
-std::shared_ptr<RecordBatchReader> MakeGeneratorReader(
- std::shared_ptr<Schema> schema,
- std::function<Future<util::optional<ExecBatch>>()> gen, MemoryPool* pool) {
- struct Impl : RecordBatchReader {
- std::shared_ptr<Schema> schema() const override { return schema_; }
-
- Status ReadNext(std::shared_ptr<RecordBatch>* record_batch) override {
- ARROW_ASSIGN_OR_RAISE(auto batch, iterator_.Next());
- if (batch) {
- ARROW_ASSIGN_OR_RAISE(*record_batch, batch->ToRecordBatch(schema_, pool_));
- } else {
- *record_batch = IterationEnd<std::shared_ptr<RecordBatch>>();
- }
- return Status::OK();
- }
-
- MemoryPool* pool_;
- std::shared_ptr<Schema> schema_;
- Iterator<util::optional<ExecBatch>> iterator_;
- };
-
- auto out = std::make_shared<Impl>();
- out->pool_ = pool;
- out->schema_ = std::move(schema);
- out->iterator_ = MakeGeneratorIterator(std::move(gen));
- return out;
-}
-
-struct ScalarAggregateNode : ExecNode {
- ScalarAggregateNode(ExecNode* input, std::string label,
- std::shared_ptr<Schema> output_schema,
- std::vector<const ScalarAggregateKernel*> kernels,
- std::vector<std::vector<std::unique_ptr<KernelState>>> states)
- : ExecNode(input->plan(), std::move(label), {input}, {"target"},
- /*output_schema=*/std::move(output_schema),
- /*num_outputs=*/1),
- kernels_(std::move(kernels)),
- states_(std::move(states)) {}
-
- const char* kind_name() override { return "ScalarAggregateNode"; }
-
- Status DoConsume(const ExecBatch& batch, size_t thread_index) {
- for (size_t i = 0; i < kernels_.size(); ++i) {
- KernelContext batch_ctx{plan()->exec_context()};
- batch_ctx.SetState(states_[i][thread_index].get());
- ExecBatch single_column_batch{{batch.values[i]}, batch.length};
- RETURN_NOT_OK(kernels_[i]->consume(&batch_ctx, single_column_batch));
- }
- return Status::OK();
- }
-
- void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
- DCHECK_EQ(input, inputs_[0]);
-
- std::unique_lock<std::mutex> lock(mutex_);
- auto it =
- thread_indices_.emplace(std::this_thread::get_id(), thread_indices_.size()).first;
- auto thread_index = it->second;
-
- lock.unlock();
-
- Status st = DoConsume(std::move(batch), thread_index);
- if (!st.ok()) {
- outputs_[0]->ErrorReceived(this, std::move(st));
- return;
- }
-
- lock.lock();
- ++num_received_;
- st = MaybeFinish(&lock);
- if (!st.ok()) {
- outputs_[0]->ErrorReceived(this, std::move(st));
- }
- }
-
- void ErrorReceived(ExecNode* input, Status error) override {
- DCHECK_EQ(input, inputs_[0]);
- outputs_[0]->ErrorReceived(this, std::move(error));
- }
-
- void InputFinished(ExecNode* input, int seq) override {
- DCHECK_EQ(input, inputs_[0]);
- std::unique_lock<std::mutex> lock(mutex_);
- num_total_ = seq;
- Status st = MaybeFinish(&lock);
-
- if (!st.ok()) {
- outputs_[0]->ErrorReceived(this, std::move(st));
- }
- }
-
- Status StartProducing() override {
- finished_ = Future<>::Make();
- // Scalar aggregates will only output a single batch
- outputs_[0]->InputFinished(this, 1);
- return Status::OK();
- }
-
- void PauseProducing(ExecNode* output) override {}
-
- void ResumeProducing(ExecNode* output) override {}
-
- void StopProducing(ExecNode* output) override {
- DCHECK_EQ(output, outputs_[0]);
- StopProducing();
- }
-
- void StopProducing() override {
- inputs_[0]->StopProducing(this);
- finished_.MarkFinished();
- }
-
- Future<> finished() override { return finished_; }
-
- private:
- Status MaybeFinish(std::unique_lock<std::mutex>* lock) {
- if (num_received_ != num_total_) return Status::OK();
-
- if (states_.empty()) return Status::OK();
-
- ExecBatch batch{{}, 1};
- batch.values.resize(kernels_.size());
-
- for (size_t i = 0; i < kernels_.size(); ++i) {
- KernelContext ctx{plan()->exec_context()};
- ARROW_ASSIGN_OR_RAISE(auto merged, ScalarAggregateKernel::MergeAll(
- kernels_[i], &ctx, std::move(states_[i])));
- RETURN_NOT_OK(kernels_[i]->finalize(&ctx, &batch.values[i]));
- }
- states_.clear();
- lock->unlock();
-
- outputs_[0]->InputReceived(this, 0, batch);
-
- finished_.MarkFinished();
- return Status::OK();
- }
-
- Future<> finished_ = Future<>::MakeFinished();
- std::vector<const ScalarAggregateKernel*> kernels_;
- std::vector<std::vector<std::unique_ptr<KernelState>>> states_;
- std::unordered_map<std::thread::id, size_t> thread_indices_;
- std::mutex mutex_;
- int num_received_ = 0, num_total_ = -1;
-};
-
-Result<ExecNode*> MakeScalarAggregateNode(ExecNode* input, std::string label,
- std::vector<internal::Aggregate> aggregates) {
- if (input->output_schema()->num_fields() != static_cast<int>(aggregates.size())) {
- return Status::Invalid("Provided ", aggregates.size(),
- " aggregates, expected one for each field of ",
- input->output_schema()->ToString());
- }
-
- auto exec_ctx = input->plan()->exec_context();
-
- std::vector<const ScalarAggregateKernel*> kernels(aggregates.size());
- std::vector<std::vector<std::unique_ptr<KernelState>>> states(kernels.size());
- FieldVector fields(kernels.size());
-
- for (size_t i = 0; i < kernels.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(auto function,
- exec_ctx->func_registry()->GetFunction(aggregates[i].function));
-
- if (function->kind() != Function::SCALAR_AGGREGATE) {
- return Status::Invalid("Provided non ScalarAggregateFunction ",
- aggregates[i].function);
- }
-
- auto in_type = ValueDescr::Array(input->output_schema()->fields()[i]->type());
-
- ARROW_ASSIGN_OR_RAISE(const Kernel* kernel, function->DispatchExact({in_type}));
- kernels[i] = static_cast<const ScalarAggregateKernel*>(kernel);
-
- if (aggregates[i].options == nullptr) {
- aggregates[i].options = function->default_options();
- }
-
- KernelContext kernel_ctx{exec_ctx};
- states[i].resize(exec_ctx->executor() ? exec_ctx->executor()->GetCapacity() : 1);
- RETURN_NOT_OK(Kernel::InitAll(&kernel_ctx,
- KernelInitArgs{kernels[i],
- {
- in_type,
- },
- aggregates[i].options},
- &states[i]));
-
- // pick one to resolve the kernel signature
- kernel_ctx.SetState(states[i][0].get());
- ARROW_ASSIGN_OR_RAISE(
- auto descr, kernels[i]->signature->out_type().Resolve(&kernel_ctx, {in_type}));
-
- fields[i] = field(aggregates[i].function, std::move(descr.type));
- }
-
- return input->plan()->EmplaceNode<ScalarAggregateNode>(
- input, std::move(label), schema(std::move(fields)), std::move(kernels),
- std::move(states));
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/exec_plan.h"
+
+#include <mutex>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "arrow/array/util.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/exec/expression.h"
+#include "arrow/compute/registry.h"
+#include "arrow/datum.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/util/async_generator.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+namespace compute {
+
+namespace {
+
+struct ExecPlanImpl : public ExecPlan {
+ explicit ExecPlanImpl(ExecContext* exec_context) : ExecPlan(exec_context) {}
+
+ ~ExecPlanImpl() override {
+ if (started_ && !finished_.is_finished()) {
+ ARROW_LOG(WARNING) << "Plan was destroyed before finishing";
+ StopProducing();
+ finished().Wait();
+ }
+ }
+
+ ExecNode* AddNode(std::unique_ptr<ExecNode> node) {
+ if (node->num_inputs() == 0) {
+ sources_.push_back(node.get());
+ }
+ if (node->num_outputs() == 0) {
+ sinks_.push_back(node.get());
+ }
+ nodes_.push_back(std::move(node));
+ return nodes_.back().get();
+ }
+
+ Status Validate() const {
+ if (nodes_.empty()) {
+ return Status::Invalid("ExecPlan has no node");
+ }
+ for (const auto& node : nodes_) {
+ RETURN_NOT_OK(node->Validate());
+ }
+ return Status::OK();
+ }
+
+ Status StartProducing() {
+ if (started_) {
+ return Status::Invalid("restarted ExecPlan");
+ }
+ started_ = true;
+
+ // producers precede consumers
+ sorted_nodes_ = TopoSort();
+
+ std::vector<Future<>> futures;
+
+ Status st = Status::OK();
+
+ using rev_it = std::reverse_iterator<NodeVector::iterator>;
+ for (rev_it it(sorted_nodes_.end()), end(sorted_nodes_.begin()); it != end; ++it) {
+ auto node = *it;
+
+ st = node->StartProducing();
+ if (!st.ok()) {
+ // Stop nodes that successfully started, in reverse order
+ stopped_ = true;
+ StopProducingImpl(it.base(), sorted_nodes_.end());
+ break;
+ }
+
+ futures.push_back(node->finished());
+ }
+
+ finished_ = AllComplete(std::move(futures));
+ return st;
+ }
+
+ void StopProducing() {
+ DCHECK(started_) << "stopped an ExecPlan which never started";
+ stopped_ = true;
+
+ StopProducingImpl(sorted_nodes_.begin(), sorted_nodes_.end());
+ }
+
+ template <typename It>
+ void StopProducingImpl(It begin, It end) {
+ for (auto it = begin; it != end; ++it) {
+ auto node = *it;
+ node->StopProducing();
+ }
+ }
+
+ NodeVector TopoSort() {
+ struct Impl {
+ const std::vector<std::unique_ptr<ExecNode>>& nodes;
+ std::unordered_set<ExecNode*> visited;
+ NodeVector sorted;
+
+ explicit Impl(const std::vector<std::unique_ptr<ExecNode>>& nodes) : nodes(nodes) {
+ visited.reserve(nodes.size());
+ sorted.resize(nodes.size());
+
+ for (const auto& node : nodes) {
+ Visit(node.get());
+ }
+
+ DCHECK_EQ(visited.size(), nodes.size());
+ }
+
+ void Visit(ExecNode* node) {
+ if (visited.count(node) != 0) return;
+
+ for (auto input : node->inputs()) {
+ // Ensure that producers are inserted before this consumer
+ Visit(input);
+ }
+
+ sorted[visited.size()] = node;
+ visited.insert(node);
+ }
+ };
+
+ return std::move(Impl{nodes_}.sorted);
+ }
+
+ Future<> finished_ = Future<>::MakeFinished();
+ bool started_ = false, stopped_ = false;
+ std::vector<std::unique_ptr<ExecNode>> nodes_;
+ NodeVector sources_, sinks_;
+ NodeVector sorted_nodes_;
+};
+
+ExecPlanImpl* ToDerived(ExecPlan* ptr) { return checked_cast<ExecPlanImpl*>(ptr); }
+
+const ExecPlanImpl* ToDerived(const ExecPlan* ptr) {
+ return checked_cast<const ExecPlanImpl*>(ptr);
+}
+
+util::optional<int> GetNodeIndex(const std::vector<ExecNode*>& nodes,
+ const ExecNode* node) {
+ for (int i = 0; i < static_cast<int>(nodes.size()); ++i) {
+ if (nodes[i] == node) return i;
+ }
+ return util::nullopt;
+}
+
+} // namespace
+
+Result<std::shared_ptr<ExecPlan>> ExecPlan::Make(ExecContext* ctx) {
+ return std::shared_ptr<ExecPlan>(new ExecPlanImpl{ctx});
+}
+
+ExecNode* ExecPlan::AddNode(std::unique_ptr<ExecNode> node) {
+ return ToDerived(this)->AddNode(std::move(node));
+}
+
+const ExecPlan::NodeVector& ExecPlan::sources() const {
+ return ToDerived(this)->sources_;
+}
+
+const ExecPlan::NodeVector& ExecPlan::sinks() const { return ToDerived(this)->sinks_; }
+
+Status ExecPlan::Validate() { return ToDerived(this)->Validate(); }
+
+Status ExecPlan::StartProducing() { return ToDerived(this)->StartProducing(); }
+
+void ExecPlan::StopProducing() { ToDerived(this)->StopProducing(); }
+
+Future<> ExecPlan::finished() { return ToDerived(this)->finished_; }
+
+ExecNode::ExecNode(ExecPlan* plan, std::string label, NodeVector inputs,
+ std::vector<std::string> input_labels,
+ std::shared_ptr<Schema> output_schema, int num_outputs)
+ : plan_(plan),
+ label_(std::move(label)),
+ inputs_(std::move(inputs)),
+ input_labels_(std::move(input_labels)),
+ output_schema_(std::move(output_schema)),
+ num_outputs_(num_outputs) {
+ for (auto input : inputs_) {
+ input->outputs_.push_back(this);
+ }
+}
+
+Status ExecNode::Validate() const {
+ if (inputs_.size() != input_labels_.size()) {
+ return Status::Invalid("Invalid number of inputs for '", label(), "' (expected ",
+ num_inputs(), ", actual ", input_labels_.size(), ")");
+ }
+
+ if (static_cast<int>(outputs_.size()) != num_outputs_) {
+ return Status::Invalid("Invalid number of outputs for '", label(), "' (expected ",
+ num_outputs(), ", actual ", outputs_.size(), ")");
+ }
+
+ for (auto out : outputs_) {
+ auto input_index = GetNodeIndex(out->inputs(), this);
+ if (!input_index) {
+ return Status::Invalid("Node '", label(), "' outputs to node '", out->label(),
+ "' but is not listed as an input.");
+ }
+ }
+
+ return Status::OK();
+}
+
+struct SourceNode : ExecNode {
+ SourceNode(ExecPlan* plan, std::string label, std::shared_ptr<Schema> output_schema,
+ AsyncGenerator<util::optional<ExecBatch>> generator)
+ : ExecNode(plan, std::move(label), {}, {}, std::move(output_schema),
+ /*num_outputs=*/1),
+ generator_(std::move(generator)) {}
+
+ const char* kind_name() override { return "SourceNode"; }
+
+ [[noreturn]] static void NoInputs() {
+ DCHECK(false) << "no inputs; this should never be called";
+ std::abort();
+ }
+ [[noreturn]] void InputReceived(ExecNode*, int, ExecBatch) override { NoInputs(); }
+ [[noreturn]] void ErrorReceived(ExecNode*, Status) override { NoInputs(); }
+ [[noreturn]] void InputFinished(ExecNode*, int) override { NoInputs(); }
+
+ Status StartProducing() override {
+ DCHECK(!stop_requested_) << "Restarted SourceNode";
+
+ CallbackOptions options;
+ if (auto executor = plan()->exec_context()->executor()) {
+ // These options will transfer execution to the desired Executor if necessary.
+ // This can happen for in-memory scans where batches didn't require
+      // any CPU work to decode. Otherwise, parsing etc. should already have
+      // placed us on the desired Executor and no queues will be pushed to.
+ options.executor = executor;
+ options.should_schedule = ShouldSchedule::IfDifferentExecutor;
+ }
+
+ finished_ = Loop([this, options] {
+ std::unique_lock<std::mutex> lock(mutex_);
+ int seq = batch_count_++;
+ if (stop_requested_) {
+ return Future<ControlFlow<int>>::MakeFinished(Break(seq));
+ }
+ lock.unlock();
+
+ return generator_().Then(
+ [=](const util::optional<ExecBatch>& batch) -> ControlFlow<int> {
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (IsIterationEnd(batch) || stop_requested_) {
+ stop_requested_ = true;
+ return Break(seq);
+ }
+ lock.unlock();
+
+ outputs_[0]->InputReceived(this, seq, *batch);
+ return Continue();
+ },
+ [=](const Status& error) -> ControlFlow<int> {
+ // NB: ErrorReceived is independent of InputFinished, but
+ // ErrorReceived will usually prompt StopProducing which will
+ // prompt InputFinished. ErrorReceived may still be called from a
+ // node which was requested to stop (indeed, the request to stop
+ // may prompt an error).
+ std::unique_lock<std::mutex> lock(mutex_);
+ stop_requested_ = true;
+ lock.unlock();
+ outputs_[0]->ErrorReceived(this, error);
+ return Break(seq);
+ },
+ options);
+ }).Then([&](int seq) { outputs_[0]->InputFinished(this, seq); });
+
+ return Status::OK();
+ }
+
+ void PauseProducing(ExecNode* output) override {}
+
+ void ResumeProducing(ExecNode* output) override {}
+
+ void StopProducing(ExecNode* output) override {
+ DCHECK_EQ(output, outputs_[0]);
+ StopProducing();
+ }
+
+ void StopProducing() override {
+ std::unique_lock<std::mutex> lock(mutex_);
+ stop_requested_ = true;
+ }
+
+ Future<> finished() override { return finished_; }
+
+ private:
+ std::mutex mutex_;
+ bool stop_requested_{false};
+ int batch_count_{0};
+ Future<> finished_ = Future<>::MakeFinished();
+ AsyncGenerator<util::optional<ExecBatch>> generator_;
+};
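+
+// A compact illustration (a sketch, not referenced elsewhere in this file) of
+// the Loop/ControlFlow idiom used by SourceNode::StartProducing above: Loop()
+// re-invokes the callable until it resolves to Break(value), then completes
+// with that value.
+namespace example {
+inline Future<int> CountTo(int limit) {
+  auto i = std::make_shared<int>(0);
+  return Loop([i, limit]() -> Future<ControlFlow<int>> {
+    if (*i >= limit) {
+      // Stop iterating and resolve the outer Future<int> with *i.
+      return Future<ControlFlow<int>>::MakeFinished(Break(*i));
+    }
+    ++*i;
+    // Ask Loop to invoke the callable again.
+    return Future<ControlFlow<int>>::MakeFinished(Continue());
+  });
+}
+}  // namespace example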
+
+ExecNode* MakeSourceNode(ExecPlan* plan, std::string label,
+ std::shared_ptr<Schema> output_schema,
+ AsyncGenerator<util::optional<ExecBatch>> generator) {
+ return plan->EmplaceNode<SourceNode>(plan, std::move(label), std::move(output_schema),
+ std::move(generator));
+}
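+
+// Usage sketch, with hypothetical names: feed a source node from in-memory
+// batches. MakeVectorGenerator comes from arrow/util/async_generator.h; the
+// plan, schema and batches are assumed to be prepared by the caller.
+namespace example {
+inline ExecNode* SourceFromBatches(ExecPlan* plan, std::shared_ptr<Schema> schema,
+                                   std::vector<util::optional<ExecBatch>> batches) {
+  // The generator yields each batch in turn, then signals end-of-iteration.
+  auto gen = MakeVectorGenerator(std::move(batches));
+  return MakeSourceNode(plan, "example_source", std::move(schema), std::move(gen));
+}
+}  // namespace example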
+
+struct FilterNode : ExecNode {
+ FilterNode(ExecNode* input, std::string label, Expression filter)
+ : ExecNode(input->plan(), std::move(label), {input}, {"target"},
+ /*output_schema=*/input->output_schema(),
+ /*num_outputs=*/1),
+ filter_(std::move(filter)) {}
+
+ const char* kind_name() override { return "FilterNode"; }
+
+ Result<ExecBatch> DoFilter(const ExecBatch& target) {
+ ARROW_ASSIGN_OR_RAISE(Expression simplified_filter,
+ SimplifyWithGuarantee(filter_, target.guarantee));
+
+ ARROW_ASSIGN_OR_RAISE(Datum mask, ExecuteScalarExpression(simplified_filter, target,
+ plan()->exec_context()));
+
+ if (mask.is_scalar()) {
+ const auto& mask_scalar = mask.scalar_as<BooleanScalar>();
+ if (mask_scalar.is_valid && mask_scalar.value) {
+ return target;
+ }
+
+ return target.Slice(0, 0);
+ }
+
+ // if the values are all scalar then the mask must also be
+ DCHECK(!std::all_of(target.values.begin(), target.values.end(),
+ [](const Datum& value) { return value.is_scalar(); }));
+
+ auto values = target.values;
+ for (auto& value : values) {
+ if (value.is_scalar()) continue;
+ ARROW_ASSIGN_OR_RAISE(value, Filter(value, mask, FilterOptions::Defaults()));
+ }
+ return ExecBatch::Make(std::move(values));
+ }
+
+ void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
+ DCHECK_EQ(input, inputs_[0]);
+
+ auto maybe_filtered = DoFilter(std::move(batch));
+ if (!maybe_filtered.ok()) {
+ outputs_[0]->ErrorReceived(this, maybe_filtered.status());
+ return;
+ }
+
+ maybe_filtered->guarantee = batch.guarantee;
+ outputs_[0]->InputReceived(this, seq, maybe_filtered.MoveValueUnsafe());
+ }
+
+ void ErrorReceived(ExecNode* input, Status error) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->ErrorReceived(this, std::move(error));
+ }
+
+ void InputFinished(ExecNode* input, int seq) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->InputFinished(this, seq);
+ }
+
+ Status StartProducing() override { return Status::OK(); }
+
+ void PauseProducing(ExecNode* output) override {}
+
+ void ResumeProducing(ExecNode* output) override {}
+
+ void StopProducing(ExecNode* output) override {
+ DCHECK_EQ(output, outputs_[0]);
+ StopProducing();
+ }
+
+ void StopProducing() override { inputs_[0]->StopProducing(this); }
+
+ Future<> finished() override { return inputs_[0]->finished(); }
+
+ private:
+ Expression filter_;
+};
+
+Result<ExecNode*> MakeFilterNode(ExecNode* input, std::string label, Expression filter) {
+ if (!filter.IsBound()) {
+ ARROW_ASSIGN_OR_RAISE(filter, filter.Bind(*input->output_schema()));
+ }
+
+ if (filter.type()->id() != Type::BOOL) {
+ return Status::TypeError("Filter expression must evaluate to bool, but ",
+ filter.ToString(), " evaluates to ",
+ filter.type()->ToString());
+ }
+
+ return input->plan()->EmplaceNode<FilterNode>(input, std::move(label),
+ std::move(filter));
+}
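+
+// Usage sketch, with a hypothetical column "x": keep only rows where x > 3.
+// call(), field_ref() and literal() are declared in
+// arrow/compute/exec/expression.h; MakeFilterNode binds the predicate against
+// the input's schema if it is not already bound.
+namespace example {
+inline Result<ExecNode*> FilterGreaterThanThree(ExecNode* input) {
+  Expression predicate = call("greater", {field_ref("x"), literal(3)});
+  return MakeFilterNode(input, "example_filter", std::move(predicate));
+}
+}  // namespace example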
+
+struct ProjectNode : ExecNode {
+ ProjectNode(ExecNode* input, std::string label, std::shared_ptr<Schema> output_schema,
+ std::vector<Expression> exprs)
+ : ExecNode(input->plan(), std::move(label), {input}, {"target"},
+ /*output_schema=*/std::move(output_schema),
+ /*num_outputs=*/1),
+ exprs_(std::move(exprs)) {}
+
+ const char* kind_name() override { return "ProjectNode"; }
+
+ Result<ExecBatch> DoProject(const ExecBatch& target) {
+ std::vector<Datum> values{exprs_.size()};
+ for (size_t i = 0; i < exprs_.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(Expression simplified_expr,
+ SimplifyWithGuarantee(exprs_[i], target.guarantee));
+
+ ARROW_ASSIGN_OR_RAISE(values[i], ExecuteScalarExpression(simplified_expr, target,
+ plan()->exec_context()));
+ }
+ return ExecBatch{std::move(values), target.length};
+ }
+
+ void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
+ DCHECK_EQ(input, inputs_[0]);
+
+ auto maybe_projected = DoProject(std::move(batch));
+ if (!maybe_projected.ok()) {
+ outputs_[0]->ErrorReceived(this, maybe_projected.status());
+ return;
+ }
+
+ maybe_projected->guarantee = batch.guarantee;
+ outputs_[0]->InputReceived(this, seq, maybe_projected.MoveValueUnsafe());
+ }
+
+ void ErrorReceived(ExecNode* input, Status error) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->ErrorReceived(this, std::move(error));
+ }
+
+ void InputFinished(ExecNode* input, int seq) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->InputFinished(this, seq);
+ }
+
+ Status StartProducing() override { return Status::OK(); }
+
+ void PauseProducing(ExecNode* output) override {}
+
+ void ResumeProducing(ExecNode* output) override {}
+
+ void StopProducing(ExecNode* output) override {
+ DCHECK_EQ(output, outputs_[0]);
+ StopProducing();
+ }
+
+ void StopProducing() override { inputs_[0]->StopProducing(this); }
+
+ Future<> finished() override { return inputs_[0]->finished(); }
+
+ private:
+ std::vector<Expression> exprs_;
+};
+
+Result<ExecNode*> MakeProjectNode(ExecNode* input, std::string label,
+ std::vector<Expression> exprs,
+ std::vector<std::string> names) {
+ FieldVector fields(exprs.size());
+
+ if (names.size() == 0) {
+ names.resize(exprs.size());
+ for (size_t i = 0; i < exprs.size(); ++i) {
+ names[i] = exprs[i].ToString();
+ }
+ }
+
+ int i = 0;
+ for (auto& expr : exprs) {
+ if (!expr.IsBound()) {
+ ARROW_ASSIGN_OR_RAISE(expr, expr.Bind(*input->output_schema()));
+ }
+ fields[i] = field(std::move(names[i]), expr.type());
+ ++i;
+ }
+
+ return input->plan()->EmplaceNode<ProjectNode>(
+ input, std::move(label), schema(std::move(fields)), std::move(exprs));
+}
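+
+// Usage sketch, with hypothetical column names: emit a single computed
+// column. If the names vector were omitted, the output field would be
+// labeled with the expression's ToString(), as implemented above.
+namespace example {
+inline Result<ExecNode*> ProjectXPlusOne(ExecNode* input) {
+  return MakeProjectNode(input, "example_project",
+                         {call("add", {field_ref("x"), literal(1)})}, {"x_plus_one"});
+}
+}  // namespace example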
+
+struct SinkNode : ExecNode {
+ SinkNode(ExecNode* input, std::string label,
+ AsyncGenerator<util::optional<ExecBatch>>* generator)
+ : ExecNode(input->plan(), std::move(label), {input}, {"collected"}, {},
+ /*num_outputs=*/0),
+ producer_(MakeProducer(generator)) {}
+
+ static PushGenerator<util::optional<ExecBatch>>::Producer MakeProducer(
+ AsyncGenerator<util::optional<ExecBatch>>* out_gen) {
+ PushGenerator<util::optional<ExecBatch>> gen;
+ auto out = gen.producer();
+ *out_gen = std::move(gen);
+ return out;
+ }
+
+ const char* kind_name() override { return "SinkNode"; }
+
+ Status StartProducing() override {
+ finished_ = Future<>::Make();
+ return Status::OK();
+ }
+
+ // sink nodes have no outputs from which to feel backpressure
+ [[noreturn]] static void NoOutputs() {
+ DCHECK(false) << "no outputs; this should never be called";
+ std::abort();
+ }
+ [[noreturn]] void ResumeProducing(ExecNode* output) override { NoOutputs(); }
+ [[noreturn]] void PauseProducing(ExecNode* output) override { NoOutputs(); }
+ [[noreturn]] void StopProducing(ExecNode* output) override { NoOutputs(); }
+
+ void StopProducing() override {
+ Finish();
+ inputs_[0]->StopProducing(this);
+ }
+
+ Future<> finished() override { return finished_; }
+
+ void InputReceived(ExecNode* input, int seq_num, ExecBatch batch) override {
+ DCHECK_EQ(input, inputs_[0]);
+
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (finished_.is_finished()) return;
+
+ ++num_received_;
+ if (num_received_ == emit_stop_) {
+ lock.unlock();
+ producer_.Push(std::move(batch));
+ Finish();
+ return;
+ }
+
+ if (emit_stop_ != -1) {
+ DCHECK_LE(seq_num, emit_stop_);
+ }
+
+ lock.unlock();
+ producer_.Push(std::move(batch));
+ }
+
+ void ErrorReceived(ExecNode* input, Status error) override {
+ DCHECK_EQ(input, inputs_[0]);
+ producer_.Push(std::move(error));
+ Finish();
+ inputs_[0]->StopProducing(this);
+ }
+
+ void InputFinished(ExecNode* input, int seq_stop) override {
+ std::unique_lock<std::mutex> lock(mutex_);
+ emit_stop_ = seq_stop;
+ if (num_received_ == emit_stop_) {
+ lock.unlock();
+ Finish();
+ }
+ }
+
+ private:
+ void Finish() {
+ if (producer_.Close()) {
+ finished_.MarkFinished();
+ }
+ }
+
+ std::mutex mutex_;
+
+ int num_received_ = 0;
+ int emit_stop_ = -1;
+ Future<> finished_ = Future<>::MakeFinished();
+
+ PushGenerator<util::optional<ExecBatch>>::Producer producer_;
+};
+
+AsyncGenerator<util::optional<ExecBatch>> MakeSinkNode(ExecNode* input,
+ std::string label) {
+ AsyncGenerator<util::optional<ExecBatch>> out;
+ (void)input->plan()->EmplaceNode<SinkNode>(input, std::move(label), &out);
+ return out;
+}
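+
+// Usage sketch: drain a sink into a vector once the plan has started.
+// CollectAsyncGenerator comes from arrow/util/async_generator.h; note that
+// the collected batches carry no ordering guarantee.
+namespace example {
+inline Future<std::vector<util::optional<ExecBatch>>> CollectAll(ExecNode* input) {
+  auto gen = MakeSinkNode(input, "example_sink");
+  return CollectAsyncGenerator(std::move(gen));
+}
+}  // namespace example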
+
+std::shared_ptr<RecordBatchReader> MakeGeneratorReader(
+ std::shared_ptr<Schema> schema,
+ std::function<Future<util::optional<ExecBatch>>()> gen, MemoryPool* pool) {
+ struct Impl : RecordBatchReader {
+ std::shared_ptr<Schema> schema() const override { return schema_; }
+
+ Status ReadNext(std::shared_ptr<RecordBatch>* record_batch) override {
+ ARROW_ASSIGN_OR_RAISE(auto batch, iterator_.Next());
+ if (batch) {
+ ARROW_ASSIGN_OR_RAISE(*record_batch, batch->ToRecordBatch(schema_, pool_));
+ } else {
+ *record_batch = IterationEnd<std::shared_ptr<RecordBatch>>();
+ }
+ return Status::OK();
+ }
+
+ MemoryPool* pool_;
+ std::shared_ptr<Schema> schema_;
+ Iterator<util::optional<ExecBatch>> iterator_;
+ };
+
+ auto out = std::make_shared<Impl>();
+ out->pool_ = pool;
+ out->schema_ = std::move(schema);
+ out->iterator_ = MakeGeneratorIterator(std::move(gen));
+ return out;
+}
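+
+// Usage sketch: adapt a sink's generator to the synchronous
+// RecordBatchReader interface and drain it. The schema is assumed to match
+// the batches the sink will emit.
+namespace example {
+inline Status ReadAll(ExecNode* input, std::shared_ptr<Schema> schema,
+                      MemoryPool* pool) {
+  auto reader =
+      MakeGeneratorReader(std::move(schema), MakeSinkNode(input, "example_sink"), pool);
+  std::shared_ptr<RecordBatch> batch;
+  do {
+    RETURN_NOT_OK(reader->ReadNext(&batch));
+    // ... consume batch; a null batch signals the end of the stream.
+  } while (batch != nullptr);
+  return Status::OK();
+}
+}  // namespace example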
+
+struct ScalarAggregateNode : ExecNode {
+ ScalarAggregateNode(ExecNode* input, std::string label,
+ std::shared_ptr<Schema> output_schema,
+ std::vector<const ScalarAggregateKernel*> kernels,
+ std::vector<std::vector<std::unique_ptr<KernelState>>> states)
+ : ExecNode(input->plan(), std::move(label), {input}, {"target"},
+ /*output_schema=*/std::move(output_schema),
+ /*num_outputs=*/1),
+ kernels_(std::move(kernels)),
+ states_(std::move(states)) {}
+
+ const char* kind_name() override { return "ScalarAggregateNode"; }
+
+ Status DoConsume(const ExecBatch& batch, size_t thread_index) {
+ for (size_t i = 0; i < kernels_.size(); ++i) {
+ KernelContext batch_ctx{plan()->exec_context()};
+ batch_ctx.SetState(states_[i][thread_index].get());
+ ExecBatch single_column_batch{{batch.values[i]}, batch.length};
+ RETURN_NOT_OK(kernels_[i]->consume(&batch_ctx, single_column_batch));
+ }
+ return Status::OK();
+ }
+
+ void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
+ DCHECK_EQ(input, inputs_[0]);
+
+ std::unique_lock<std::mutex> lock(mutex_);
+ auto it =
+ thread_indices_.emplace(std::this_thread::get_id(), thread_indices_.size()).first;
+ auto thread_index = it->second;
+
+ lock.unlock();
+
+ Status st = DoConsume(std::move(batch), thread_index);
+ if (!st.ok()) {
+ outputs_[0]->ErrorReceived(this, std::move(st));
+ return;
+ }
+
+ lock.lock();
+ ++num_received_;
+ st = MaybeFinish(&lock);
+ if (!st.ok()) {
+ outputs_[0]->ErrorReceived(this, std::move(st));
+ }
+ }
+
+ void ErrorReceived(ExecNode* input, Status error) override {
+ DCHECK_EQ(input, inputs_[0]);
+ outputs_[0]->ErrorReceived(this, std::move(error));
+ }
+
+ void InputFinished(ExecNode* input, int seq) override {
+ DCHECK_EQ(input, inputs_[0]);
+ std::unique_lock<std::mutex> lock(mutex_);
+ num_total_ = seq;
+ Status st = MaybeFinish(&lock);
+
+ if (!st.ok()) {
+ outputs_[0]->ErrorReceived(this, std::move(st));
+ }
+ }
+
+ Status StartProducing() override {
+ finished_ = Future<>::Make();
+ // Scalar aggregates will only output a single batch
+ outputs_[0]->InputFinished(this, 1);
+ return Status::OK();
+ }
+
+ void PauseProducing(ExecNode* output) override {}
+
+ void ResumeProducing(ExecNode* output) override {}
+
+ void StopProducing(ExecNode* output) override {
+ DCHECK_EQ(output, outputs_[0]);
+ StopProducing();
+ }
+
+ void StopProducing() override {
+ inputs_[0]->StopProducing(this);
+ finished_.MarkFinished();
+ }
+
+ Future<> finished() override { return finished_; }
+
+ private:
+ Status MaybeFinish(std::unique_lock<std::mutex>* lock) {
+ if (num_received_ != num_total_) return Status::OK();
+
+ if (states_.empty()) return Status::OK();
+
+ ExecBatch batch{{}, 1};
+ batch.values.resize(kernels_.size());
+
+ for (size_t i = 0; i < kernels_.size(); ++i) {
+ KernelContext ctx{plan()->exec_context()};
+ ARROW_ASSIGN_OR_RAISE(auto merged, ScalarAggregateKernel::MergeAll(
+ kernels_[i], &ctx, std::move(states_[i])));
+ RETURN_NOT_OK(kernels_[i]->finalize(&ctx, &batch.values[i]));
+ }
+ states_.clear();
+ lock->unlock();
+
+ outputs_[0]->InputReceived(this, 0, batch);
+
+ finished_.MarkFinished();
+ return Status::OK();
+ }
+
+ Future<> finished_ = Future<>::MakeFinished();
+ std::vector<const ScalarAggregateKernel*> kernels_;
+ std::vector<std::vector<std::unique_ptr<KernelState>>> states_;
+ std::unordered_map<std::thread::id, size_t> thread_indices_;
+ std::mutex mutex_;
+ int num_received_ = 0, num_total_ = -1;
+};
+
+Result<ExecNode*> MakeScalarAggregateNode(ExecNode* input, std::string label,
+ std::vector<internal::Aggregate> aggregates) {
+ if (input->output_schema()->num_fields() != static_cast<int>(aggregates.size())) {
+ return Status::Invalid("Provided ", aggregates.size(),
+ " aggregates, expected one for each field of ",
+ input->output_schema()->ToString());
+ }
+
+ auto exec_ctx = input->plan()->exec_context();
+
+ std::vector<const ScalarAggregateKernel*> kernels(aggregates.size());
+ std::vector<std::vector<std::unique_ptr<KernelState>>> states(kernels.size());
+ FieldVector fields(kernels.size());
+
+ for (size_t i = 0; i < kernels.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(auto function,
+ exec_ctx->func_registry()->GetFunction(aggregates[i].function));
+
+ if (function->kind() != Function::SCALAR_AGGREGATE) {
+ return Status::Invalid("Provided non ScalarAggregateFunction ",
+ aggregates[i].function);
+ }
+
+ auto in_type = ValueDescr::Array(input->output_schema()->fields()[i]->type());
+
+ ARROW_ASSIGN_OR_RAISE(const Kernel* kernel, function->DispatchExact({in_type}));
+ kernels[i] = static_cast<const ScalarAggregateKernel*>(kernel);
+
+ if (aggregates[i].options == nullptr) {
+ aggregates[i].options = function->default_options();
+ }
+
+ KernelContext kernel_ctx{exec_ctx};
+ states[i].resize(exec_ctx->executor() ? exec_ctx->executor()->GetCapacity() : 1);
+ RETURN_NOT_OK(Kernel::InitAll(&kernel_ctx,
+ KernelInitArgs{kernels[i],
+ {
+ in_type,
+ },
+ aggregates[i].options},
+ &states[i]));
+
+ // pick one to resolve the kernel signature
+ kernel_ctx.SetState(states[i][0].get());
+ ARROW_ASSIGN_OR_RAISE(
+ auto descr, kernels[i]->signature->out_type().Resolve(&kernel_ctx, {in_type}));
+
+ fields[i] = field(aggregates[i].function, std::move(descr.type));
+ }
+
+ return input->plan()->EmplaceNode<ScalarAggregateNode>(
+ input, std::move(label), schema(std::move(fields)), std::move(kernels),
+ std::move(states));
+}
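+
+// Usage sketch: sum every input column. internal::Aggregate (from
+// arrow/compute/api_aggregate.h) pairs a registered scalar aggregate
+// function name with optional FunctionOptions; passing nullptr selects the
+// function's defaults, as handled above. Exactly one aggregate per input
+// field is required.
+namespace example {
+inline Result<ExecNode*> SumAllColumns(ExecNode* input) {
+  std::vector<internal::Aggregate> aggregates(
+      input->output_schema()->num_fields(), internal::Aggregate{"sum", nullptr});
+  return MakeScalarAggregateNode(input, "example_agg", std::move(aggregates));
+}
+}  // namespace example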
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h
index 0df78fecd7c..c36c174af05 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/exec_plan.h
@@ -1,287 +1,287 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/compute/api_aggregate.h"
-#include "arrow/compute/exec.h"
-#include "arrow/compute/type_fwd.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/optional.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-namespace compute {
-
-class ARROW_EXPORT ExecPlan : public std::enable_shared_from_this<ExecPlan> {
- public:
- using NodeVector = std::vector<ExecNode*>;
-
- virtual ~ExecPlan() = default;
-
- ExecContext* exec_context() const { return exec_context_; }
-
- /// Make an empty exec plan
- static Result<std::shared_ptr<ExecPlan>> Make(ExecContext* = default_exec_context());
-
- ExecNode* AddNode(std::unique_ptr<ExecNode> node);
-
- template <typename Node, typename... Args>
- Node* EmplaceNode(Args&&... args) {
- std::unique_ptr<Node> node{new Node{std::forward<Args>(args)...}};
- auto out = node.get();
- AddNode(std::move(node));
- return out;
- }
-
- /// The initial inputs
- const NodeVector& sources() const;
-
- /// The final outputs
- const NodeVector& sinks() const;
-
- Status Validate();
-
- /// \brief Start producing on all nodes
- ///
- /// Nodes are started in reverse topological order, such that any node
- /// is started before all of its inputs.
- Status StartProducing();
-
- /// \brief Stop producing on all nodes
- ///
- /// Nodes are stopped in topological order, such that any node
- /// is stopped before all of its outputs.
- void StopProducing();
-
- /// \brief A future which will be marked finished when all nodes have stopped producing.
- Future<> finished();
-
- protected:
- ExecContext* exec_context_;
- explicit ExecPlan(ExecContext* exec_context) : exec_context_(exec_context) {}
-};
-
-class ARROW_EXPORT ExecNode {
- public:
- using NodeVector = std::vector<ExecNode*>;
-
- virtual ~ExecNode() = default;
-
- virtual const char* kind_name() = 0;
-
- // The number of inputs/outputs expected by this node
- int num_inputs() const { return static_cast<int>(inputs_.size()); }
- int num_outputs() const { return num_outputs_; }
-
- /// This node's predecessors in the exec plan
- const NodeVector& inputs() const { return inputs_; }
-
- /// \brief Labels identifying the function of each input.
- const std::vector<std::string>& input_labels() const { return input_labels_; }
-
- /// This node's successors in the exec plan
- const NodeVector& outputs() const { return outputs_; }
-
- /// The datatypes for batches produced by this node
- const std::shared_ptr<Schema>& output_schema() const { return output_schema_; }
-
- /// This node's exec plan
- ExecPlan* plan() { return plan_; }
-
- /// \brief An optional label, for display and debugging
- ///
- /// There is no guarantee that this value is non-empty or unique.
- const std::string& label() const { return label_; }
-
- Status Validate() const;
-
- /// Upstream API:
- /// These functions are called by input nodes that want to inform this node
-  /// about an updated condition (a new input batch, an error, an impending
- /// end of stream).
- ///
- /// Implementation rules:
- /// - these may be called anytime after StartProducing() has succeeded
- /// (and even during or after StopProducing())
- /// - these may be called concurrently
- /// - these are allowed to call back into PauseProducing(), ResumeProducing()
- /// and StopProducing()
-
- /// Transfer input batch to ExecNode
- virtual void InputReceived(ExecNode* input, int seq_num, ExecBatch batch) = 0;
-
- /// Signal error to ExecNode
- virtual void ErrorReceived(ExecNode* input, Status error) = 0;
-
- /// Mark the inputs finished after the given number of batches.
- ///
- /// This may be called before all inputs are received. This simply fixes
- /// the total number of incoming batches for an input, so that the ExecNode
- /// knows when it has received all input, regardless of order.
- virtual void InputFinished(ExecNode* input, int seq_stop) = 0;
-
- /// Lifecycle API:
- /// - start / stop to initiate and terminate production
- /// - pause / resume to apply backpressure
- ///
- /// Implementation rules:
- /// - StartProducing() should not recurse into the inputs, as it is
- /// handled by ExecPlan::StartProducing()
- /// - PauseProducing(), ResumeProducing(), StopProducing() may be called
- /// concurrently (but only after StartProducing() has returned successfully)
- /// - PauseProducing(), ResumeProducing(), StopProducing() may be called
- /// by the downstream nodes' InputReceived(), ErrorReceived(), InputFinished()
- /// methods
- /// - StopProducing() should recurse into the inputs
- /// - StopProducing() must be idempotent
-
- // XXX What happens if StartProducing() calls an output's InputReceived()
- // synchronously, and InputReceived() decides to call back into StopProducing()
- // (or PauseProducing()) because it received enough data?
- //
- // Right now, since synchronous calls happen in both directions (input to
- // output and then output to input), a node must be careful to be reentrant
- // against synchronous calls from its output, *and* also concurrent calls from
- // other threads. The most reliable solution is to update the internal state
- // first, and notify outputs only at the end.
- //
- // Alternate rules:
- // - StartProducing(), ResumeProducing() can call synchronously into
-  //     its outputs' consuming methods (InputReceived() etc.)
- // - InputReceived(), ErrorReceived(), InputFinished() can call asynchronously
- // into its inputs' PauseProducing(), StopProducing()
- //
- // Alternate API:
- // - InputReceived(), ErrorReceived(), InputFinished() return a ProductionHint
- // enum: either None (default), PauseProducing, ResumeProducing, StopProducing
- // - A method allows passing a ProductionHint asynchronously from an output node
- // (replacing PauseProducing(), ResumeProducing(), StopProducing())
-
- /// \brief Start producing
- ///
- /// This must only be called once. If this fails, then other lifecycle
- /// methods must not be called.
- ///
- /// This is typically called automatically by ExecPlan::StartProducing().
- virtual Status StartProducing() = 0;
-
- /// \brief Pause producing temporarily
- ///
- /// This call is a hint that an output node is currently not willing
- /// to receive data.
- ///
- /// This may be called any number of times after StartProducing() succeeds.
- /// However, the node is still free to produce data (which may be difficult
- /// to prevent anyway if data is produced using multiple threads).
- virtual void PauseProducing(ExecNode* output) = 0;
-
- /// \brief Resume producing after a temporary pause
- ///
- /// This call is a hint that an output node is willing to receive data again.
- ///
- /// This may be called any number of times after StartProducing() succeeds.
- /// This may also be called concurrently with PauseProducing(), which suggests
- /// the implementation may use an atomic counter.
- virtual void ResumeProducing(ExecNode* output) = 0;
-
- /// \brief Stop producing definitively to a single output
- ///
- /// This call is a hint that an output node has completed and is not willing
- /// to receive any further data.
- virtual void StopProducing(ExecNode* output) = 0;
-
- /// \brief Stop producing definitively to all outputs
- virtual void StopProducing() = 0;
-
- /// \brief A future which will be marked finished when this node has stopped producing.
- virtual Future<> finished() = 0;
-
- protected:
- ExecNode(ExecPlan* plan, std::string label, NodeVector inputs,
- std::vector<std::string> input_labels, std::shared_ptr<Schema> output_schema,
- int num_outputs);
-
- ExecPlan* plan_;
- std::string label_;
-
- NodeVector inputs_;
- std::vector<std::string> input_labels_;
-
- std::shared_ptr<Schema> output_schema_;
- int num_outputs_;
- NodeVector outputs_;
-};
-
-/// \brief Adapt an AsyncGenerator<ExecBatch> as a source node
-///
-/// plan->exec_context()->executor() is used to parallelize pushing to
-/// outputs, if provided.
-ARROW_EXPORT
-ExecNode* MakeSourceNode(ExecPlan* plan, std::string label,
- std::shared_ptr<Schema> output_schema,
- std::function<Future<util::optional<ExecBatch>>()>);
-
-/// \brief Add a sink node which forwards to an AsyncGenerator<ExecBatch>
-///
-/// Emitted batches will not be ordered.
-ARROW_EXPORT
-std::function<Future<util::optional<ExecBatch>>()> MakeSinkNode(ExecNode* input,
- std::string label);
-
-/// \brief Wrap an ExecBatch generator in a RecordBatchReader.
-///
-/// The RecordBatchReader does not impose any ordering on emitted batches.
-ARROW_EXPORT
-std::shared_ptr<RecordBatchReader> MakeGeneratorReader(
- std::shared_ptr<Schema>, std::function<Future<util::optional<ExecBatch>>()>,
- MemoryPool*);
-
-/// \brief Make a node which excludes some rows from batches passed through it
-///
-/// The filter Expression will be evaluated against each batch which is pushed to
-/// this node. Any rows for which the filter does not evaluate to `true` will be excluded
-/// from the batch emitted by this node.
-///
-/// If the filter is not already bound, it will be bound against the input's schema.
-ARROW_EXPORT
-Result<ExecNode*> MakeFilterNode(ExecNode* input, std::string label, Expression filter);
-
-/// \brief Make a node which executes expressions on input batches, producing new batches.
-///
-/// Each expression will be evaluated against each batch which is pushed to
-/// this node to produce a corresponding output column.
-///
-/// If exprs are not already bound, they will be bound against the input's schema.
-/// If names are not provided, the string representations of exprs will be used.
-ARROW_EXPORT
-Result<ExecNode*> MakeProjectNode(ExecNode* input, std::string label,
- std::vector<Expression> exprs,
- std::vector<std::string> names = {});
-
-ARROW_EXPORT
-Result<ExecNode*> MakeScalarAggregateNode(ExecNode* input, std::string label,
- std::vector<internal::Aggregate> aggregates);
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+class ARROW_EXPORT ExecPlan : public std::enable_shared_from_this<ExecPlan> {
+ public:
+ using NodeVector = std::vector<ExecNode*>;
+
+ virtual ~ExecPlan() = default;
+
+ ExecContext* exec_context() const { return exec_context_; }
+
+ /// Make an empty exec plan
+ static Result<std::shared_ptr<ExecPlan>> Make(ExecContext* = default_exec_context());
+
+ ExecNode* AddNode(std::unique_ptr<ExecNode> node);
+
+ template <typename Node, typename... Args>
+ Node* EmplaceNode(Args&&... args) {
+ std::unique_ptr<Node> node{new Node{std::forward<Args>(args)...}};
+ auto out = node.get();
+ AddNode(std::move(node));
+ return out;
+ }
+
+ /// The initial inputs
+ const NodeVector& sources() const;
+
+ /// The final outputs
+ const NodeVector& sinks() const;
+
+ Status Validate();
+
+ /// \brief Start producing on all nodes
+ ///
+ /// Nodes are started in reverse topological order, such that any node
+ /// is started before all of its inputs.
+ Status StartProducing();
+
+ /// \brief Stop producing on all nodes
+ ///
+ /// Nodes are stopped in topological order, such that any node
+ /// is stopped before all of its outputs.
+ void StopProducing();
+
+ /// \brief A future which will be marked finished when all nodes have stopped producing.
+ Future<> finished();
+
+ protected:
+ ExecContext* exec_context_;
+ explicit ExecPlan(ExecContext* exec_context) : exec_context_(exec_context) {}
+};
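+
+// A typical ExecPlan lifecycle, as a sketch (error handling elided; `schema`
+// and `generator` are assumed to be prepared by the caller; the node
+// factories are declared at the bottom of this header):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto plan, ExecPlan::Make());
+//   ExecNode* source = MakeSourceNode(plan.get(), "source", schema, generator);
+//   auto sink_gen = MakeSinkNode(source, "sink");
+//   RETURN_NOT_OK(plan->Validate());
+//   RETURN_NOT_OK(plan->StartProducing());
+//   // ... drain sink_gen ...
+//   plan->finished().Wait();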
+
+class ARROW_EXPORT ExecNode {
+ public:
+ using NodeVector = std::vector<ExecNode*>;
+
+ virtual ~ExecNode() = default;
+
+ virtual const char* kind_name() = 0;
+
+ // The number of inputs/outputs expected by this node
+ int num_inputs() const { return static_cast<int>(inputs_.size()); }
+ int num_outputs() const { return num_outputs_; }
+
+ /// This node's predecessors in the exec plan
+ const NodeVector& inputs() const { return inputs_; }
+
+ /// \brief Labels identifying the function of each input.
+ const std::vector<std::string>& input_labels() const { return input_labels_; }
+
+ /// This node's successors in the exec plan
+ const NodeVector& outputs() const { return outputs_; }
+
+ /// The datatypes for batches produced by this node
+ const std::shared_ptr<Schema>& output_schema() const { return output_schema_; }
+
+ /// This node's exec plan
+ ExecPlan* plan() { return plan_; }
+
+ /// \brief An optional label, for display and debugging
+ ///
+ /// There is no guarantee that this value is non-empty or unique.
+ const std::string& label() const { return label_; }
+
+ Status Validate() const;
+
+ /// Upstream API:
+ /// These functions are called by input nodes that want to inform this node
+  /// about an updated condition (a new input batch, an error, an impending
+ /// end of stream).
+ ///
+ /// Implementation rules:
+ /// - these may be called anytime after StartProducing() has succeeded
+ /// (and even during or after StopProducing())
+ /// - these may be called concurrently
+ /// - these are allowed to call back into PauseProducing(), ResumeProducing()
+ /// and StopProducing()
+
+ /// Transfer input batch to ExecNode
+ virtual void InputReceived(ExecNode* input, int seq_num, ExecBatch batch) = 0;
+
+ /// Signal error to ExecNode
+ virtual void ErrorReceived(ExecNode* input, Status error) = 0;
+
+ /// Mark the inputs finished after the given number of batches.
+ ///
+ /// This may be called before all inputs are received. This simply fixes
+ /// the total number of incoming batches for an input, so that the ExecNode
+ /// knows when it has received all input, regardless of order.
+ virtual void InputFinished(ExecNode* input, int seq_stop) = 0;
+
+ /// Lifecycle API:
+ /// - start / stop to initiate and terminate production
+ /// - pause / resume to apply backpressure
+ ///
+ /// Implementation rules:
+ /// - StartProducing() should not recurse into the inputs, as it is
+ /// handled by ExecPlan::StartProducing()
+ /// - PauseProducing(), ResumeProducing(), StopProducing() may be called
+ /// concurrently (but only after StartProducing() has returned successfully)
+ /// - PauseProducing(), ResumeProducing(), StopProducing() may be called
+ /// by the downstream nodes' InputReceived(), ErrorReceived(), InputFinished()
+ /// methods
+ /// - StopProducing() should recurse into the inputs
+ /// - StopProducing() must be idempotent
+
+ // XXX What happens if StartProducing() calls an output's InputReceived()
+ // synchronously, and InputReceived() decides to call back into StopProducing()
+ // (or PauseProducing()) because it received enough data?
+ //
+ // Right now, since synchronous calls happen in both directions (input to
+ // output and then output to input), a node must be careful to be reentrant
+ // against synchronous calls from its output, *and* also concurrent calls from
+ // other threads. The most reliable solution is to update the internal state
+ // first, and notify outputs only at the end.
+ //
+ // Alternate rules:
+ // - StartProducing(), ResumeProducing() can call synchronously into
+  //     its outputs' consuming methods (InputReceived() etc.)
+ // - InputReceived(), ErrorReceived(), InputFinished() can call asynchronously
+ // into its inputs' PauseProducing(), StopProducing()
+ //
+ // Alternate API:
+ // - InputReceived(), ErrorReceived(), InputFinished() return a ProductionHint
+ // enum: either None (default), PauseProducing, ResumeProducing, StopProducing
+ // - A method allows passing a ProductionHint asynchronously from an output node
+ // (replacing PauseProducing(), ResumeProducing(), StopProducing())
+
+ /// \brief Start producing
+ ///
+ /// This must only be called once. If this fails, then other lifecycle
+ /// methods must not be called.
+ ///
+ /// This is typically called automatically by ExecPlan::StartProducing().
+ virtual Status StartProducing() = 0;
+
+ /// \brief Pause producing temporarily
+ ///
+ /// This call is a hint that an output node is currently not willing
+ /// to receive data.
+ ///
+ /// This may be called any number of times after StartProducing() succeeds.
+ /// However, the node is still free to produce data (which may be difficult
+ /// to prevent anyway if data is produced using multiple threads).
+ virtual void PauseProducing(ExecNode* output) = 0;
+
+ /// \brief Resume producing after a temporary pause
+ ///
+ /// This call is a hint that an output node is willing to receive data again.
+ ///
+ /// This may be called any number of times after StartProducing() succeeds.
+ /// This may also be called concurrently with PauseProducing(), which suggests
+ /// the implementation may use an atomic counter.
+ virtual void ResumeProducing(ExecNode* output) = 0;
+
+ /// \brief Stop producing definitively to a single output
+ ///
+ /// This call is a hint that an output node has completed and is not willing
+ /// to receive any further data.
+ virtual void StopProducing(ExecNode* output) = 0;
+
+ /// \brief Stop producing definitively to all outputs
+ virtual void StopProducing() = 0;
+
+ /// \brief A future which will be marked finished when this node has stopped producing.
+ virtual Future<> finished() = 0;
+
+ protected:
+ ExecNode(ExecPlan* plan, std::string label, NodeVector inputs,
+ std::vector<std::string> input_labels, std::shared_ptr<Schema> output_schema,
+ int num_outputs);
+
+ ExecPlan* plan_;
+ std::string label_;
+
+ NodeVector inputs_;
+ std::vector<std::string> input_labels_;
+
+ std::shared_ptr<Schema> output_schema_;
+ int num_outputs_;
+ NodeVector outputs_;
+};
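+
+// Sketch of a minimal single-input, single-output pass-through node obeying
+// the rules above (constructor and kind_name() elided; compare FilterNode in
+// exec_plan.cc, which forwards in exactly this way):
+//
+//   struct PassThruNode : ExecNode {
+//     void InputReceived(ExecNode* input, int seq, ExecBatch batch) override {
+//       outputs_[0]->InputReceived(this, seq, std::move(batch));
+//     }
+//     void ErrorReceived(ExecNode* input, Status error) override {
+//       outputs_[0]->ErrorReceived(this, std::move(error));
+//     }
+//     void InputFinished(ExecNode* input, int seq_stop) override {
+//       outputs_[0]->InputFinished(this, seq_stop);
+//     }
+//     Status StartProducing() override { return Status::OK(); }
+//     void PauseProducing(ExecNode* output) override {}
+//     void ResumeProducing(ExecNode* output) override {}
+//     void StopProducing(ExecNode* output) override { StopProducing(); }
+//     void StopProducing() override { inputs_[0]->StopProducing(this); }
+//     Future<> finished() override { return inputs_[0]->finished(); }
+//   };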
+
+/// \brief Adapt an AsyncGenerator<ExecBatch> as a source node
+///
+/// plan->exec_context()->executor() is used to parallelize pushing to
+/// outputs, if provided.
+ARROW_EXPORT
+ExecNode* MakeSourceNode(ExecPlan* plan, std::string label,
+ std::shared_ptr<Schema> output_schema,
+ std::function<Future<util::optional<ExecBatch>>()>);
+
+/// \brief Add a sink node which forwards to an AsyncGenerator<ExecBatch>
+///
+/// Emitted batches will not be ordered.
+ARROW_EXPORT
+std::function<Future<util::optional<ExecBatch>>()> MakeSinkNode(ExecNode* input,
+ std::string label);
+
+/// \brief Wrap an ExecBatch generator in a RecordBatchReader.
+///
+/// The RecordBatchReader does not impose any ordering on emitted batches.
+ARROW_EXPORT
+std::shared_ptr<RecordBatchReader> MakeGeneratorReader(
+ std::shared_ptr<Schema>, std::function<Future<util::optional<ExecBatch>>()>,
+ MemoryPool*);
+
+/// \brief Make a node which excludes some rows from batches passed through it
+///
+/// The filter Expression will be evaluated against each batch which is pushed to
+/// this node. Any rows for which the filter does not evaluate to `true` will be excluded
+/// from the batch emitted by this node.
+///
+/// If the filter is not already bound, it will be bound against the input's schema.
+ARROW_EXPORT
+Result<ExecNode*> MakeFilterNode(ExecNode* input, std::string label, Expression filter);
+
+/// \brief Make a node which executes expressions on input batches, producing new batches.
+///
+/// Each expression will be evaluated against each batch which is pushed to
+/// this node to produce a corresponding output column.
+///
+/// If exprs are not already bound, they will be bound against the input's schema.
+/// If names are not provided, the string representations of exprs will be used.
+ARROW_EXPORT
+Result<ExecNode*> MakeProjectNode(ExecNode* input, std::string label,
+ std::vector<Expression> exprs,
+ std::vector<std::string> names = {});
+
+ARROW_EXPORT
+Result<ExecNode*> MakeScalarAggregateNode(ExecNode* input, std::string label,
+ std::vector<internal::Aggregate> aggregates);
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc
index 44fb7cf1104..4aab64a46a4 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.cc
@@ -1,1186 +1,1186 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/expression.h"
-
-#include <unordered_map>
-#include <unordered_set>
-
-#include "arrow/chunked_array.h"
-#include "arrow/compute/api_vector.h"
-#include "arrow/compute/exec/expression_internal.h"
-#include "arrow/compute/exec_internal.h"
-#include "arrow/compute/function_internal.h"
-#include "arrow/io/memory.h"
-#include "arrow/ipc/reader.h"
-#include "arrow/ipc/writer.h"
-#include "arrow/util/hash_util.h"
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/optional.h"
-#include "arrow/util/string.h"
-#include "arrow/util/value_parsing.h"
-
-namespace arrow {
-
-using internal::checked_cast;
-using internal::checked_pointer_cast;
-
-namespace compute {
-
-void Expression::Call::ComputeHash() {
- hash = std::hash<std::string>{}(function_name);
- for (const auto& arg : arguments) {
- arrow::internal::hash_combine(hash, arg.hash());
- }
-}
-
-Expression::Expression(Call call) {
- call.ComputeHash();
- impl_ = std::make_shared<Impl>(std::move(call));
-}
-
-Expression::Expression(Datum literal)
- : impl_(std::make_shared<Impl>(std::move(literal))) {}
-
-Expression::Expression(Parameter parameter)
- : impl_(std::make_shared<Impl>(std::move(parameter))) {}
-
-Expression literal(Datum lit) { return Expression(std::move(lit)); }
-
-Expression field_ref(FieldRef ref) {
- return Expression(Expression::Parameter{std::move(ref), ValueDescr{}, -1});
-}
-
-Expression call(std::string function, std::vector<Expression> arguments,
- std::shared_ptr<compute::FunctionOptions> options) {
- Expression::Call call;
- call.function_name = std::move(function);
- call.arguments = std::move(arguments);
- call.options = std::move(options);
- return Expression(std::move(call));
-}
-
-const Datum* Expression::literal() const { return util::get_if<Datum>(impl_.get()); }
-
-const Expression::Parameter* Expression::parameter() const {
- return util::get_if<Parameter>(impl_.get());
-}
-
-const FieldRef* Expression::field_ref() const {
- if (auto parameter = this->parameter()) {
- return &parameter->ref;
- }
- return nullptr;
-}
-
-const Expression::Call* Expression::call() const {
- return util::get_if<Call>(impl_.get());
-}
-
-ValueDescr Expression::descr() const {
- if (impl_ == nullptr) return {};
-
- if (auto lit = literal()) {
- return lit->descr();
- }
-
- if (auto parameter = this->parameter()) {
- return parameter->descr;
- }
-
- return CallNotNull(*this)->descr;
-}
-
-namespace {
-
-std::string PrintDatum(const Datum& datum) {
- if (datum.is_scalar()) {
- if (!datum.scalar()->is_valid) return "null";
-
- switch (datum.type()->id()) {
- case Type::STRING:
- case Type::LARGE_STRING:
- return '"' +
- Escape(util::string_view(*datum.scalar_as<BaseBinaryScalar>().value)) +
- '"';
-
- case Type::BINARY:
- case Type::FIXED_SIZE_BINARY:
- case Type::LARGE_BINARY:
- return '"' + datum.scalar_as<BaseBinaryScalar>().value->ToHexString() + '"';
-
- default:
- break;
- }
-
- return datum.scalar()->ToString();
- }
- return datum.ToString();
-}
-
-} // namespace
-
-std::string Expression::ToString() const {
- if (auto lit = literal()) {
- return PrintDatum(*lit);
- }
-
- if (auto ref = field_ref()) {
- if (auto name = ref->name()) {
- return *name;
- }
- if (auto path = ref->field_path()) {
- return path->ToString();
- }
- return ref->ToString();
- }
-
- auto call = CallNotNull(*this);
- auto binary = [&](std::string op) {
- return "(" + call->arguments[0].ToString() + " " + op + " " +
- call->arguments[1].ToString() + ")";
- };
-
- if (auto cmp = Comparison::Get(call->function_name)) {
- return binary(Comparison::GetOp(*cmp));
- }
-
- constexpr util::string_view kleene = "_kleene";
- if (util::string_view{call->function_name}.ends_with(kleene)) {
- auto op = call->function_name.substr(0, call->function_name.size() - kleene.size());
- return binary(std::move(op));
- }
-
- if (auto options = GetMakeStructOptions(*call)) {
- std::string out = "{";
- auto argument = call->arguments.begin();
- for (const auto& field_name : options->field_names) {
- out += field_name + "=" + argument++->ToString() + ", ";
- }
- out.resize(out.size() - 1);
- out.back() = '}';
- return out;
- }
-
- std::string out = call->function_name + "(";
- for (const auto& arg : call->arguments) {
- out += arg.ToString() + ", ";
- }
-
- if (call->options) {
- out += call->options->ToString();
- out.resize(out.size() + 1);
- } else {
- out.resize(out.size() - 1);
- }
- out.back() = ')';
- return out;
-}
-
-void PrintTo(const Expression& expr, std::ostream* os) {
- *os << expr.ToString();
- if (expr.IsBound()) {
- *os << "[bound]";
- }
-}
-
-bool Expression::Equals(const Expression& other) const {
- if (Identical(*this, other)) return true;
-
- if (impl_->index() != other.impl_->index()) {
- return false;
- }
-
- if (auto lit = literal()) {
- return lit->Equals(*other.literal());
- }
-
- if (auto ref = field_ref()) {
- return ref->Equals(*other.field_ref());
- }
-
- auto call = CallNotNull(*this);
- auto other_call = CallNotNull(other);
-
- if (call->function_name != other_call->function_name ||
- call->kernel != other_call->kernel) {
- return false;
- }
-
- for (size_t i = 0; i < call->arguments.size(); ++i) {
- if (!call->arguments[i].Equals(other_call->arguments[i])) {
- return false;
- }
- }
-
- if (call->options == other_call->options) return true;
- if (call->options && other_call->options) {
- return call->options->Equals(other_call->options);
- }
- return false;
-}
-
-bool Identical(const Expression& l, const Expression& r) { return l.impl_ == r.impl_; }
-
-size_t Expression::hash() const {
- if (auto lit = literal()) {
- if (lit->is_scalar()) {
- return lit->scalar()->hash();
- }
- return 0;
- }
-
- if (auto ref = field_ref()) {
- return ref->hash();
- }
-
- return CallNotNull(*this)->hash;
-}
-
-bool Expression::IsBound() const {
- if (type() == nullptr) return false;
-
- if (auto call = this->call()) {
- if (call->kernel == nullptr) return false;
-
- for (const Expression& arg : call->arguments) {
- if (!arg.IsBound()) return false;
- }
- }
-
- return true;
-}
-
-bool Expression::IsScalarExpression() const {
- if (auto lit = literal()) {
- return lit->is_scalar();
- }
-
- if (field_ref()) return true;
-
- auto call = CallNotNull(*this);
-
- for (const Expression& arg : call->arguments) {
- if (!arg.IsScalarExpression()) return false;
- }
-
- if (call->function) {
- return call->function->kind() == compute::Function::SCALAR;
- }
-
- // this expression is not bound; make a best guess based on
- // the default function registry
- if (auto function = compute::GetFunctionRegistry()
- ->GetFunction(call->function_name)
- .ValueOr(nullptr)) {
- return function->kind() == compute::Function::SCALAR;
- }
-
- // unknown function or other error; conservatively return false
- return false;
-}
-
-bool Expression::IsNullLiteral() const {
- if (auto lit = literal()) {
- if (lit->null_count() == lit->length()) {
- return true;
- }
- }
-
- return false;
-}
-
-bool Expression::IsSatisfiable() const {
- if (type() && type()->id() == Type::NA) {
- return false;
- }
-
- if (auto lit = literal()) {
- if (lit->null_count() == lit->length()) {
- return false;
- }
-
- if (lit->is_scalar() && lit->type()->id() == Type::BOOL) {
- return lit->scalar_as<BooleanScalar>().value;
- }
- }
-
- return true;
-}
-
-namespace {
-
-// Produce a bound Expression from unbound Call and bound arguments.
-Result<Expression> BindNonRecursive(Expression::Call call, bool insert_implicit_casts,
- compute::ExecContext* exec_context) {
- DCHECK(std::all_of(call.arguments.begin(), call.arguments.end(),
- [](const Expression& argument) { return argument.IsBound(); }));
-
- auto descrs = GetDescriptors(call.arguments);
- ARROW_ASSIGN_OR_RAISE(call.function, GetFunction(call, exec_context));
-
- if (!insert_implicit_casts) {
- ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchExact(descrs));
- } else {
- ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchBest(&descrs));
-
- for (size_t i = 0; i < descrs.size(); ++i) {
- if (descrs[i] == call.arguments[i].descr()) continue;
-
- if (descrs[i].shape != call.arguments[i].descr().shape) {
- return Status::NotImplemented(
- "Automatic broadcasting of scalars arguments to arrays in ",
- Expression(std::move(call)).ToString());
- }
-
- if (auto lit = call.arguments[i].literal()) {
- ARROW_ASSIGN_OR_RAISE(Datum new_lit, compute::Cast(*lit, descrs[i].type));
- call.arguments[i] = literal(std::move(new_lit));
- continue;
- }
-
- // construct an implicit cast Expression with which to replace this argument
- Expression::Call implicit_cast;
- implicit_cast.function_name = "cast";
- implicit_cast.arguments = {std::move(call.arguments[i])};
- implicit_cast.options = std::make_shared<compute::CastOptions>(
- compute::CastOptions::Safe(descrs[i].type));
-
- ARROW_ASSIGN_OR_RAISE(
- call.arguments[i],
- BindNonRecursive(std::move(implicit_cast),
- /*insert_implicit_casts=*/false, exec_context));
- }
- }
-
- compute::KernelContext kernel_context(exec_context);
- if (call.kernel->init) {
- ARROW_ASSIGN_OR_RAISE(
- call.kernel_state,
- call.kernel->init(&kernel_context, {call.kernel, descrs, call.options.get()}));
-
- kernel_context.SetState(call.kernel_state.get());
- }
-
- ARROW_ASSIGN_OR_RAISE(
- call.descr, call.kernel->signature->out_type().Resolve(&kernel_context, descrs));
-
- return Expression(std::move(call));
-}
-
-template <typename TypeOrSchema>
-Result<Expression> BindImpl(Expression expr, const TypeOrSchema& in,
- ValueDescr::Shape shape, compute::ExecContext* exec_context) {
- if (exec_context == nullptr) {
- compute::ExecContext exec_context;
- return BindImpl(std::move(expr), in, shape, &exec_context);
- }
-
- if (expr.literal()) return expr;
-
- if (auto ref = expr.field_ref()) {
- if (ref->IsNested()) {
- return Status::NotImplemented("nested field references");
- }
-
- ARROW_ASSIGN_OR_RAISE(auto path, ref->FindOne(in));
-
- auto bound = *expr.parameter();
- bound.index = path[0];
- ARROW_ASSIGN_OR_RAISE(auto field, path.Get(in));
- bound.descr.type = field->type();
- bound.descr.shape = shape;
- return Expression{std::move(bound)};
- }
-
- auto call = *CallNotNull(expr);
- for (auto& argument : call.arguments) {
- ARROW_ASSIGN_OR_RAISE(argument,
- BindImpl(std::move(argument), in, shape, exec_context));
- }
- return BindNonRecursive(std::move(call),
- /*insert_implicit_casts=*/true, exec_context);
-}
-
-} // namespace
-
-Result<Expression> Expression::Bind(const ValueDescr& in,
- compute::ExecContext* exec_context) const {
- return BindImpl(*this, *in.type, in.shape, exec_context);
-}
-
-Result<Expression> Expression::Bind(const Schema& in_schema,
- compute::ExecContext* exec_context) const {
- return BindImpl(*this, in_schema, ValueDescr::ARRAY, exec_context);
-}
-
-Result<ExecBatch> MakeExecBatch(const Schema& full_schema, const Datum& partial) {
- ExecBatch out;
-
- if (partial.kind() == Datum::RECORD_BATCH) {
- const auto& partial_batch = *partial.record_batch();
- out.length = partial_batch.num_rows();
-
- for (const auto& field : full_schema.fields()) {
- ARROW_ASSIGN_OR_RAISE(auto column,
- FieldRef(field->name()).GetOneOrNone(partial_batch));
-
- if (column) {
- if (!column->type()->Equals(field->type())) {
- // Referenced field was present but didn't have the expected type.
- // This *should* be handled by readers, and will just be an error in the future.
- ARROW_ASSIGN_OR_RAISE(
- auto converted,
- compute::Cast(column, field->type(), compute::CastOptions::Safe()));
- column = converted.make_array();
- }
- out.values.emplace_back(std::move(column));
- } else {
- out.values.emplace_back(MakeNullScalar(field->type()));
- }
- }
- return out;
- }
-
- // wasteful but useful for testing:
- if (partial.type()->id() == Type::STRUCT) {
- if (partial.is_array()) {
- ARROW_ASSIGN_OR_RAISE(auto partial_batch,
- RecordBatch::FromStructArray(partial.make_array()));
-
- return MakeExecBatch(full_schema, partial_batch);
- }
-
- if (partial.is_scalar()) {
- ARROW_ASSIGN_OR_RAISE(auto partial_array,
- MakeArrayFromScalar(*partial.scalar(), 1));
- ARROW_ASSIGN_OR_RAISE(auto out, MakeExecBatch(full_schema, partial_array));
-
- for (Datum& value : out.values) {
- if (value.is_scalar()) continue;
- ARROW_ASSIGN_OR_RAISE(value, value.make_array()->GetScalar(0));
- }
- return out;
- }
- }
-
- return Status::NotImplemented("MakeExecBatch from ", PrintDatum(partial));
-}
-
-Result<Datum> ExecuteScalarExpression(const Expression& expr, const Schema& full_schema,
- const Datum& partial_input,
- compute::ExecContext* exec_context) {
- ARROW_ASSIGN_OR_RAISE(auto input, MakeExecBatch(full_schema, partial_input));
- return ExecuteScalarExpression(expr, input, exec_context);
-}
-
-Result<Datum> ExecuteScalarExpression(const Expression& expr, const ExecBatch& input,
- compute::ExecContext* exec_context) {
- if (exec_context == nullptr) {
- compute::ExecContext exec_context;
- return ExecuteScalarExpression(expr, input, &exec_context);
- }
-
- if (!expr.IsBound()) {
- return Status::Invalid("Cannot Execute unbound expression.");
- }
-
- if (!expr.IsScalarExpression()) {
- return Status::Invalid(
- "ExecuteScalarExpression cannot Execute non-scalar expression ", expr.ToString());
- }
-
- if (auto lit = expr.literal()) return *lit;
-
- if (auto param = expr.parameter()) {
- if (param->descr.type->id() == Type::NA) {
- return MakeNullScalar(null());
- }
-
- const Datum& field = input[param->index];
- if (!field.type()->Equals(param->descr.type)) {
- return Status::Invalid("Referenced field ", expr.ToString(), " was ",
- field.type()->ToString(), " but should have been ",
- param->descr.type->ToString());
- }
-
- return field;
- }
-
- auto call = CallNotNull(expr);
-
- std::vector<Datum> arguments(call->arguments.size());
- for (size_t i = 0; i < arguments.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(
- arguments[i], ExecuteScalarExpression(call->arguments[i], input, exec_context));
- }
-
- auto executor = compute::detail::KernelExecutor::MakeScalar();
-
- compute::KernelContext kernel_context(exec_context);
- kernel_context.SetState(call->kernel_state.get());
-
- auto kernel = call->kernel;
- auto descrs = GetDescriptors(arguments);
- auto options = call->options.get();
- RETURN_NOT_OK(executor->Init(&kernel_context, {kernel, descrs, options}));
-
- auto listener = std::make_shared<compute::detail::DatumAccumulator>();
- RETURN_NOT_OK(executor->Execute(arguments, listener.get()));
- return executor->WrapResults(arguments, listener->values());
-}
-
-namespace {
-
-std::array<std::pair<const Expression&, const Expression&>, 2>
-ArgumentsAndFlippedArguments(const Expression::Call& call) {
- DCHECK_EQ(call.arguments.size(), 2);
- return {std::pair<const Expression&, const Expression&>{call.arguments[0],
- call.arguments[1]},
- std::pair<const Expression&, const Expression&>{call.arguments[1],
- call.arguments[0]}};
-}
-
-template <typename BinOp, typename It,
- typename Out = typename std::iterator_traits<It>::value_type>
-util::optional<Out> FoldLeft(It begin, It end, const BinOp& bin_op) {
- if (begin == end) return util::nullopt;
-
- Out folded = std::move(*begin++);
- while (begin != end) {
- folded = bin_op(std::move(folded), std::move(*begin++));
- }
- return folded;
-}
-
-util::optional<compute::NullHandling::type> GetNullHandling(
- const Expression::Call& call) {
- if (call.function && call.function->kind() == compute::Function::SCALAR) {
- return static_cast<const compute::ScalarKernel*>(call.kernel)->null_handling;
- }
- return util::nullopt;
-}
-
-} // namespace
-
-std::vector<FieldRef> FieldsInExpression(const Expression& expr) {
- if (expr.literal()) return {};
-
- if (auto ref = expr.field_ref()) {
- return {*ref};
- }
-
- std::vector<FieldRef> fields;
- for (const Expression& arg : CallNotNull(expr)->arguments) {
- auto argument_fields = FieldsInExpression(arg);
- std::move(argument_fields.begin(), argument_fields.end(), std::back_inserter(fields));
- }
- return fields;
-}
-
-bool ExpressionHasFieldRefs(const Expression& expr) {
- if (expr.literal()) return false;
-
- if (expr.field_ref()) return true;
-
- for (const Expression& arg : CallNotNull(expr)->arguments) {
- if (ExpressionHasFieldRefs(arg)) return true;
- }
- return false;
-}
-
-Result<Expression> FoldConstants(Expression expr) {
- return Modify(
- std::move(expr), [](Expression expr) { return expr; },
- [](Expression expr, ...) -> Result<Expression> {
- auto call = CallNotNull(expr);
- if (std::all_of(call->arguments.begin(), call->arguments.end(),
- [](const Expression& argument) { return argument.literal(); })) {
- // all arguments are literal; we can evaluate this subexpression *now*
- static const ExecBatch ignored_input = ExecBatch{};
- ARROW_ASSIGN_OR_RAISE(Datum constant,
- ExecuteScalarExpression(expr, ignored_input));
-
- return literal(std::move(constant));
- }
-
- // XXX the following should probably be in a registry of passes instead
- // of inline
-
- if (GetNullHandling(*call) == compute::NullHandling::INTERSECTION) {
- // kernels which always produce intersected validity can be resolved
- // to null *now* if any of their inputs is a null literal
- for (const auto& argument : call->arguments) {
- if (argument.IsNullLiteral()) {
- return argument;
- }
- }
- }
-
- if (call->function_name == "and_kleene") {
- for (auto args : ArgumentsAndFlippedArguments(*call)) {
- // true and x == x
- if (args.first == literal(true)) return args.second;
-
- // false and x == false
- if (args.first == literal(false)) return args.first;
-
- // x and x == x
- if (args.first == args.second) return args.first;
- }
- return expr;
- }
-
- if (call->function_name == "or_kleene") {
- for (auto args : ArgumentsAndFlippedArguments(*call)) {
- // false or x == x
- if (args.first == literal(false)) return args.second;
-
- // true or x == true
- if (args.first == literal(true)) return args.first;
-
- // x or x == x
- if (args.first == args.second) return args.first;
- }
- return expr;
- }
-
- return expr;
- });
-}
-
-namespace {
-
-std::vector<Expression> GuaranteeConjunctionMembers(
- const Expression& guaranteed_true_predicate) {
- auto guarantee = guaranteed_true_predicate.call();
- if (!guarantee || guarantee->function_name != "and_kleene") {
- return {guaranteed_true_predicate};
- }
- return FlattenedAssociativeChain(guaranteed_true_predicate).fringe;
-}
-
-// Conjunction members which are represented in known_values are erased from
-// conjunction_members
-Status ExtractKnownFieldValuesImpl(
- std::vector<Expression>* conjunction_members,
- std::unordered_map<FieldRef, Datum, FieldRef::Hash>* known_values) {
- auto unconsumed_end =
- std::partition(conjunction_members->begin(), conjunction_members->end(),
- [](const Expression& expr) {
-                         // search for equality conditions between a field and a literal
- auto call = expr.call();
- if (!call) return true;
-
- if (call->function_name == "equal") {
- auto ref = call->arguments[0].field_ref();
- auto lit = call->arguments[1].literal();
- return !(ref && lit);
- }
-
- if (call->function_name == "is_null") {
- auto ref = call->arguments[0].field_ref();
- return !ref;
- }
-
- return true;
- });
-
- for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) {
- auto call = CallNotNull(*it);
-
- if (call->function_name == "equal") {
- auto ref = call->arguments[0].field_ref();
- auto lit = call->arguments[1].literal();
- known_values->emplace(*ref, *lit);
- } else if (call->function_name == "is_null") {
- auto ref = call->arguments[0].field_ref();
- known_values->emplace(*ref, Datum(std::make_shared<NullScalar>()));
- }
- }
-
- conjunction_members->erase(unconsumed_end, conjunction_members->end());
-
- return Status::OK();
-}
-
-} // namespace
-
-Result<KnownFieldValues> ExtractKnownFieldValues(
- const Expression& guaranteed_true_predicate) {
- auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate);
- KnownFieldValues known_values;
- RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values.map));
- return known_values;
-}
-
-Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
- Expression expr) {
- if (!expr.IsBound()) {
- return Status::Invalid(
- "ReplaceFieldsWithKnownValues called on an unbound Expression");
- }
-
- return Modify(
- std::move(expr),
- [&known_values](Expression expr) -> Result<Expression> {
- if (auto ref = expr.field_ref()) {
- auto it = known_values.map.find(*ref);
- if (it != known_values.map.end()) {
- Datum lit = it->second;
- if (lit.descr() == expr.descr()) return literal(std::move(lit));
- // type mismatch, try casting the known value to the correct type
-
- if (expr.type()->id() == Type::DICTIONARY &&
- lit.type()->id() != Type::DICTIONARY) {
- // the known value must be dictionary encoded
-
- const auto& dict_type = checked_cast<const DictionaryType&>(*expr.type());
- if (!lit.type()->Equals(dict_type.value_type())) {
- ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, dict_type.value_type()));
- }
-
- if (lit.is_scalar()) {
- ARROW_ASSIGN_OR_RAISE(auto dictionary,
- MakeArrayFromScalar(*lit.scalar(), 1));
-
- lit = Datum{DictionaryScalar::Make(MakeScalar<int32_t>(0),
- std::move(dictionary))};
- }
- }
-
- ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, expr.type()));
- return literal(std::move(lit));
- }
- }
- return expr;
- },
- [](Expression expr, ...) { return expr; });
-}
-
-namespace {
-
-bool IsBinaryAssociativeCommutative(const Expression::Call& call) {
- static std::unordered_set<std::string> binary_associative_commutative{
- "and", "or", "and_kleene", "or_kleene", "xor",
- "multiply", "add", "multiply_checked", "add_checked"};
-
- auto it = binary_associative_commutative.find(call.function_name);
- return it != binary_associative_commutative.end();
-}
-
-} // namespace
-
-Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_context) {
- if (exec_context == nullptr) {
- compute::ExecContext exec_context;
- return Canonicalize(std::move(expr), &exec_context);
- }
-
- // If potentially reconstructing more deeply than a call's immediate arguments
- // (for example, when reorganizing an associative chain), add expressions to this set to
- // avoid unnecessary work
- struct {
- std::unordered_set<Expression, Expression::Hash> set_;
-
- bool operator()(const Expression& expr) const {
- return set_.find(expr) != set_.end();
- }
-
- void Add(std::vector<Expression> exprs) {
- std::move(exprs.begin(), exprs.end(), std::inserter(set_, set_.end()));
- }
- } AlreadyCanonicalized;
-
- return Modify(
- std::move(expr),
- [&AlreadyCanonicalized, exec_context](Expression expr) -> Result<Expression> {
- auto call = expr.call();
- if (!call) return expr;
-
- if (AlreadyCanonicalized(expr)) return expr;
-
- if (IsBinaryAssociativeCommutative(*call)) {
- struct {
- int Priority(const Expression& operand) const {
- // order literals first, starting with nulls
- if (operand.IsNullLiteral()) return 0;
- if (operand.literal()) return 1;
- return 2;
- }
- bool operator()(const Expression& l, const Expression& r) const {
- return Priority(l) < Priority(r);
- }
- } CanonicalOrdering;
-
- FlattenedAssociativeChain chain(expr);
- if (chain.was_left_folded &&
- std::is_sorted(chain.fringe.begin(), chain.fringe.end(),
- CanonicalOrdering)) {
- AlreadyCanonicalized.Add(std::move(chain.exprs));
- return expr;
- }
-
- std::stable_sort(chain.fringe.begin(), chain.fringe.end(), CanonicalOrdering);
-
- // fold the chain back up
- auto folded =
- FoldLeft(chain.fringe.begin(), chain.fringe.end(),
- [call, &AlreadyCanonicalized](Expression l, Expression r) {
- auto canonicalized_call = *call;
- canonicalized_call.arguments = {std::move(l), std::move(r)};
- Expression expr(std::move(canonicalized_call));
- AlreadyCanonicalized.Add({expr});
- return expr;
- });
- return std::move(*folded);
- }
-
- if (auto cmp = Comparison::Get(call->function_name)) {
- if (call->arguments[0].literal() && !call->arguments[1].literal()) {
- // ensure that literals are on comparisons' RHS
- auto flipped_call = *call;
-
- std::swap(flipped_call.arguments[0], flipped_call.arguments[1]);
- flipped_call.function_name =
- Comparison::GetName(Comparison::GetFlipped(*cmp));
-
- return BindNonRecursive(flipped_call,
- /*insert_implicit_casts=*/false, exec_context);
- }
- }
-
- return expr;
- },
- [](Expression expr, ...) { return expr; });
-}
-
-namespace {
-
-Result<Expression> DirectComparisonSimplification(Expression expr,
- const Expression::Call& guarantee) {
- return Modify(
- std::move(expr), [](Expression expr) { return expr; },
- [&guarantee](Expression expr, ...) -> Result<Expression> {
- auto call = expr.call();
- if (!call) return expr;
-
- // Ensure both calls are comparisons with equal LHS and scalar RHS
- auto cmp = Comparison::Get(expr);
- auto cmp_guarantee = Comparison::Get(guarantee.function_name);
-
- if (!cmp) return expr;
- if (!cmp_guarantee) return expr;
-
- const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
- const auto& guarantee_lhs = guarantee.arguments[0];
- if (lhs != guarantee_lhs) return expr;
-
- auto rhs = call->arguments[1].literal();
- auto guarantee_rhs = guarantee.arguments[1].literal();
-
- if (!rhs) return expr;
- if (!rhs->is_scalar()) return expr;
-
- if (!guarantee_rhs) return expr;
- if (!guarantee_rhs->is_scalar()) return expr;
-
- ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
- Comparison::Execute(*rhs, *guarantee_rhs));
- DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
-
- if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
- // RHS of filter is equal to RHS of guarantee
-
- if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
- // guarantee is a subset of filter, so all data will be included
- // x > 1, x >= 1, x != 1 guaranteed by x > 1
- return literal(true);
- }
-
- if ((*cmp & *cmp_guarantee) == 0) {
- // guarantee disjoint with filter, so all data will be excluded
- // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
- return literal(false);
- }
-
- return expr;
- }
-
- if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
- // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
- return expr;
- }
-
- if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
- // x > 1, x >= 1, x != 1 guaranteed by x >= 3
- return literal(true);
- } else {
- // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
- return literal(false);
- }
- });
-}
-
-} // namespace
-
-Result<Expression> SimplifyWithGuarantee(Expression expr,
- const Expression& guaranteed_true_predicate) {
- auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate);
-
- KnownFieldValues known_values;
- RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values.map));
-
- ARROW_ASSIGN_OR_RAISE(expr,
- ReplaceFieldsWithKnownValues(known_values, std::move(expr)));
-
- auto CanonicalizeAndFoldConstants = [&expr] {
- ARROW_ASSIGN_OR_RAISE(expr, Canonicalize(std::move(expr)));
- ARROW_ASSIGN_OR_RAISE(expr, FoldConstants(std::move(expr)));
- return Status::OK();
- };
- RETURN_NOT_OK(CanonicalizeAndFoldConstants());
-
- for (const auto& guarantee : conjunction_members) {
- if (Comparison::Get(guarantee) && guarantee.call()->arguments[1].literal()) {
- ARROW_ASSIGN_OR_RAISE(
- auto simplified, DirectComparisonSimplification(expr, *CallNotNull(guarantee)));
-
- if (Identical(simplified, expr)) continue;
-
- expr = std::move(simplified);
- RETURN_NOT_OK(CanonicalizeAndFoldConstants());
- }
- }
-
- return expr;
-}
-
-// Serialization is accomplished by converting expressions to KeyValueMetadata and storing
-// this in the schema of a RecordBatch. Embedded arrays and scalars are stored in its
-// columns. Finally, the RecordBatch is written to an IPC file.
-Result<std::shared_ptr<Buffer>> Serialize(const Expression& expr) {
- struct {
- std::shared_ptr<KeyValueMetadata> metadata_ = std::make_shared<KeyValueMetadata>();
- ArrayVector columns_;
-
- Result<std::string> AddScalar(const Scalar& scalar) {
- auto ret = columns_.size();
- ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(scalar, 1));
- columns_.push_back(std::move(array));
- return std::to_string(ret);
- }
-
- Status Visit(const Expression& expr) {
- if (auto lit = expr.literal()) {
- if (!lit->is_scalar()) {
- return Status::NotImplemented("Serialization of non-scalar literals");
- }
- ARROW_ASSIGN_OR_RAISE(auto value, AddScalar(*lit->scalar()));
- metadata_->Append("literal", std::move(value));
- return Status::OK();
- }
-
- if (auto ref = expr.field_ref()) {
- if (!ref->name()) {
- return Status::NotImplemented("Serialization of non-name field_refs");
- }
- metadata_->Append("field_ref", *ref->name());
- return Status::OK();
- }
-
- auto call = CallNotNull(expr);
- metadata_->Append("call", call->function_name);
-
- for (const auto& argument : call->arguments) {
- RETURN_NOT_OK(Visit(argument));
- }
-
- if (call->options) {
- ARROW_ASSIGN_OR_RAISE(auto options_scalar,
- internal::FunctionOptionsToStructScalar(*call->options));
- ARROW_ASSIGN_OR_RAISE(auto value, AddScalar(*options_scalar));
- metadata_->Append("options", std::move(value));
- }
-
- metadata_->Append("end", call->function_name);
- return Status::OK();
- }
-
- Result<std::shared_ptr<RecordBatch>> operator()(const Expression& expr) {
- RETURN_NOT_OK(Visit(expr));
- FieldVector fields(columns_.size());
- for (size_t i = 0; i < fields.size(); ++i) {
- fields[i] = field("", columns_[i]->type());
- }
- return RecordBatch::Make(schema(std::move(fields), std::move(metadata_)), 1,
- std::move(columns_));
- }
- } ToRecordBatch;
-
- ARROW_ASSIGN_OR_RAISE(auto batch, ToRecordBatch(expr));
- ARROW_ASSIGN_OR_RAISE(auto stream, io::BufferOutputStream::Create());
- ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(stream, batch->schema()));
- RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
- RETURN_NOT_OK(writer->Close());
- return stream->Finish();
-}
-
-Result<Expression> Deserialize(std::shared_ptr<Buffer> buffer) {
- io::BufferReader stream(std::move(buffer));
- ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(&stream));
- ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0));
- if (batch->schema()->metadata() == nullptr) {
- return Status::Invalid("serialized Expression's batch repr had null metadata");
- }
- if (batch->num_rows() != 1) {
- return Status::Invalid(
- "serialized Expression's batch repr was not a single row - had ",
- batch->num_rows());
- }
-
- struct FromRecordBatch {
- const RecordBatch& batch_;
- int index_;
-
- const KeyValueMetadata& metadata() { return *batch_.schema()->metadata(); }
-
- Result<std::shared_ptr<Scalar>> GetScalar(const std::string& i) {
- int32_t column_index;
- if (!::arrow::internal::ParseValue<Int32Type>(i.data(), i.length(),
- &column_index)) {
- return Status::Invalid("Couldn't parse column_index");
- }
- if (column_index >= batch_.num_columns()) {
- return Status::Invalid("column_index out of bounds");
- }
- return batch_.column(column_index)->GetScalar(0);
- }
-
- Result<Expression> GetOne() {
- if (index_ >= metadata().size()) {
- return Status::Invalid("unterminated serialized Expression");
- }
-
- const std::string& key = metadata().key(index_);
- const std::string& value = metadata().value(index_);
- ++index_;
-
- if (key == "literal") {
- ARROW_ASSIGN_OR_RAISE(auto scalar, GetScalar(value));
- return literal(std::move(scalar));
- }
-
- if (key == "field_ref") {
- return field_ref(value);
- }
-
- if (key != "call") {
- return Status::Invalid("Unrecognized serialized Expression key ", key);
- }
-
- std::vector<Expression> arguments;
- while (metadata().key(index_) != "end") {
- if (metadata().key(index_) == "options") {
- ARROW_ASSIGN_OR_RAISE(auto options_scalar, GetScalar(metadata().value(index_)));
- std::shared_ptr<compute::FunctionOptions> options;
- if (options_scalar) {
- ARROW_ASSIGN_OR_RAISE(
- options, internal::FunctionOptionsFromStructScalar(
- checked_cast<const StructScalar&>(*options_scalar)));
- }
- auto expr = call(value, std::move(arguments), std::move(options));
- index_ += 2;
- return expr;
- }
-
- ARROW_ASSIGN_OR_RAISE(auto argument, GetOne());
- arguments.push_back(std::move(argument));
- }
-
- ++index_;
- return call(value, std::move(arguments));
- }
- };
-
- return FromRecordBatch{*batch, 0}.GetOne();
-}
-
-Expression project(std::vector<Expression> values, std::vector<std::string> names) {
- return call("make_struct", std::move(values),
- compute::MakeStructOptions{std::move(names)});
-}
-
-Expression equal(Expression lhs, Expression rhs) {
- return call("equal", {std::move(lhs), std::move(rhs)});
-}
-
-Expression not_equal(Expression lhs, Expression rhs) {
- return call("not_equal", {std::move(lhs), std::move(rhs)});
-}
-
-Expression less(Expression lhs, Expression rhs) {
- return call("less", {std::move(lhs), std::move(rhs)});
-}
-
-Expression less_equal(Expression lhs, Expression rhs) {
- return call("less_equal", {std::move(lhs), std::move(rhs)});
-}
-
-Expression greater(Expression lhs, Expression rhs) {
- return call("greater", {std::move(lhs), std::move(rhs)});
-}
-
-Expression greater_equal(Expression lhs, Expression rhs) {
- return call("greater_equal", {std::move(lhs), std::move(rhs)});
-}
-
-Expression is_null(Expression lhs) { return call("is_null", {std::move(lhs)}); }
-
-Expression is_valid(Expression lhs) { return call("is_valid", {std::move(lhs)}); }
-
-Expression and_(Expression lhs, Expression rhs) {
- return call("and_kleene", {std::move(lhs), std::move(rhs)});
-}
-
-Expression and_(const std::vector<Expression>& operands) {
- auto folded = FoldLeft<Expression(Expression, Expression)>(operands.begin(),
- operands.end(), and_);
- if (folded) {
- return std::move(*folded);
- }
- return literal(true);
-}
-
-Expression or_(Expression lhs, Expression rhs) {
- return call("or_kleene", {std::move(lhs), std::move(rhs)});
-}
-
-Expression or_(const std::vector<Expression>& operands) {
- auto folded =
- FoldLeft<Expression(Expression, Expression)>(operands.begin(), operands.end(), or_);
- if (folded) {
- return std::move(*folded);
- }
- return literal(false);
-}
-
-Expression not_(Expression operand) { return call("invert", {std::move(operand)}); }
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/expression.h"
+
+#include <unordered_map>
+#include <unordered_set>
+
+#include "arrow/chunked_array.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/exec/expression_internal.h"
+#include "arrow/compute/exec_internal.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/util/hash_util.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/string.h"
+#include "arrow/util/value_parsing.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+namespace compute {
+
+void Expression::Call::ComputeHash() {
+ hash = std::hash<std::string>{}(function_name);
+ for (const auto& arg : arguments) {
+ arrow::internal::hash_combine(hash, arg.hash());
+ }
+}
+
+Expression::Expression(Call call) {
+ call.ComputeHash();
+ impl_ = std::make_shared<Impl>(std::move(call));
+}
+
+Expression::Expression(Datum literal)
+ : impl_(std::make_shared<Impl>(std::move(literal))) {}
+
+Expression::Expression(Parameter parameter)
+ : impl_(std::make_shared<Impl>(std::move(parameter))) {}
+
+Expression literal(Datum lit) { return Expression(std::move(lit)); }
+
+Expression field_ref(FieldRef ref) {
+ return Expression(Expression::Parameter{std::move(ref), ValueDescr{}, -1});
+}
+
+Expression call(std::string function, std::vector<Expression> arguments,
+ std::shared_ptr<compute::FunctionOptions> options) {
+ Expression::Call call;
+ call.function_name = std::move(function);
+ call.arguments = std::move(arguments);
+ call.options = std::move(options);
+ return Expression(std::move(call));
+}
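+
+// Usage sketch, for illustration only: the three factories above compose into
+// expression trees (the field name "a" is a placeholder):
+//
+//   Expression expr = call("add", {field_ref("a"), literal(3)});
+//   // expr represents add(a, 3) and stays unbound until Expression::Bind.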
+
+const Datum* Expression::literal() const { return util::get_if<Datum>(impl_.get()); }
+
+const Expression::Parameter* Expression::parameter() const {
+ return util::get_if<Parameter>(impl_.get());
+}
+
+const FieldRef* Expression::field_ref() const {
+ if (auto parameter = this->parameter()) {
+ return &parameter->ref;
+ }
+ return nullptr;
+}
+
+const Expression::Call* Expression::call() const {
+ return util::get_if<Call>(impl_.get());
+}
+
+ValueDescr Expression::descr() const {
+ if (impl_ == nullptr) return {};
+
+ if (auto lit = literal()) {
+ return lit->descr();
+ }
+
+ if (auto parameter = this->parameter()) {
+ return parameter->descr;
+ }
+
+ return CallNotNull(*this)->descr;
+}
+
+namespace {
+
+std::string PrintDatum(const Datum& datum) {
+ if (datum.is_scalar()) {
+ if (!datum.scalar()->is_valid) return "null";
+
+ switch (datum.type()->id()) {
+ case Type::STRING:
+ case Type::LARGE_STRING:
+ return '"' +
+ Escape(util::string_view(*datum.scalar_as<BaseBinaryScalar>().value)) +
+ '"';
+
+ case Type::BINARY:
+ case Type::FIXED_SIZE_BINARY:
+ case Type::LARGE_BINARY:
+ return '"' + datum.scalar_as<BaseBinaryScalar>().value->ToHexString() + '"';
+
+ default:
+ break;
+ }
+
+ return datum.scalar()->ToString();
+ }
+ return datum.ToString();
+}
+
+} // namespace
+
+std::string Expression::ToString() const {
+ if (auto lit = literal()) {
+ return PrintDatum(*lit);
+ }
+
+ if (auto ref = field_ref()) {
+ if (auto name = ref->name()) {
+ return *name;
+ }
+ if (auto path = ref->field_path()) {
+ return path->ToString();
+ }
+ return ref->ToString();
+ }
+
+ auto call = CallNotNull(*this);
+ auto binary = [&](std::string op) {
+ return "(" + call->arguments[0].ToString() + " " + op + " " +
+ call->arguments[1].ToString() + ")";
+ };
+
+ if (auto cmp = Comparison::Get(call->function_name)) {
+ return binary(Comparison::GetOp(*cmp));
+ }
+
+ constexpr util::string_view kleene = "_kleene";
+ if (util::string_view{call->function_name}.ends_with(kleene)) {
+ auto op = call->function_name.substr(0, call->function_name.size() - kleene.size());
+ return binary(std::move(op));
+ }
+
+ if (auto options = GetMakeStructOptions(*call)) {
+ std::string out = "{";
+ auto argument = call->arguments.begin();
+ for (const auto& field_name : options->field_names) {
+ out += field_name + "=" + argument++->ToString() + ", ";
+ }
+ out.resize(out.size() - 1);
+ out.back() = '}';
+ return out;
+ }
+
+ std::string out = call->function_name + "(";
+ for (const auto& arg : call->arguments) {
+ out += arg.ToString() + ", ";
+ }
+
+ if (call->options) {
+ out += call->options->ToString();
+ out.resize(out.size() + 1);
+ } else {
+ out.resize(out.size() - 1);
+ }
+ out.back() = ')';
+ return out;
+}
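+
+// Illustration of the formatting rules above: comparisons and "*_kleene"
+// calls print infix, make_struct prints braces, anything else prints as
+// name(arg, ...). With x and y standing for arbitrary expressions:
+//
+//   call("greater", {field_ref("a"), literal(3)}).ToString();  // "(a > 3)"
+//   call("and_kleene", {x, y}).ToString();                     // "(x and y)"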
+
+void PrintTo(const Expression& expr, std::ostream* os) {
+ *os << expr.ToString();
+ if (expr.IsBound()) {
+ *os << "[bound]";
+ }
+}
+
+bool Expression::Equals(const Expression& other) const {
+ if (Identical(*this, other)) return true;
+
+ if (impl_->index() != other.impl_->index()) {
+ return false;
+ }
+
+ if (auto lit = literal()) {
+ return lit->Equals(*other.literal());
+ }
+
+ if (auto ref = field_ref()) {
+ return ref->Equals(*other.field_ref());
+ }
+
+ auto call = CallNotNull(*this);
+ auto other_call = CallNotNull(other);
+
+ if (call->function_name != other_call->function_name ||
+ call->kernel != other_call->kernel) {
+ return false;
+ }
+
+ for (size_t i = 0; i < call->arguments.size(); ++i) {
+ if (!call->arguments[i].Equals(other_call->arguments[i])) {
+ return false;
+ }
+ }
+
+ if (call->options == other_call->options) return true;
+ if (call->options && other_call->options) {
+ return call->options->Equals(other_call->options);
+ }
+ return false;
+}
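+
+// Note the contrast with Identical below: Equals compares structure while
+// Identical compares the shared impl_ pointer. For illustration:
+//
+//   Expression a = field_ref("x"), b = field_ref("x");
+//   a.Equals(b);      // true  - same structure
+//   Identical(a, b);  // false - separately constructed
+//   Identical(a, a);  // true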
+
+bool Identical(const Expression& l, const Expression& r) { return l.impl_ == r.impl_; }
+
+size_t Expression::hash() const {
+ if (auto lit = literal()) {
+ if (lit->is_scalar()) {
+ return lit->scalar()->hash();
+ }
+ return 0;
+ }
+
+ if (auto ref = field_ref()) {
+ return ref->hash();
+ }
+
+ return CallNotNull(*this)->hash;
+}
+
+bool Expression::IsBound() const {
+ if (type() == nullptr) return false;
+
+ if (auto call = this->call()) {
+ if (call->kernel == nullptr) return false;
+
+ for (const Expression& arg : call->arguments) {
+ if (!arg.IsBound()) return false;
+ }
+ }
+
+ return true;
+}
+
+bool Expression::IsScalarExpression() const {
+ if (auto lit = literal()) {
+ return lit->is_scalar();
+ }
+
+ if (field_ref()) return true;
+
+ auto call = CallNotNull(*this);
+
+ for (const Expression& arg : call->arguments) {
+ if (!arg.IsScalarExpression()) return false;
+ }
+
+ if (call->function) {
+ return call->function->kind() == compute::Function::SCALAR;
+ }
+
+ // this expression is not bound; make a best guess based on
+ // the default function registry
+ if (auto function = compute::GetFunctionRegistry()
+ ->GetFunction(call->function_name)
+ .ValueOr(nullptr)) {
+ return function->kind() == compute::Function::SCALAR;
+ }
+
+ // unknown function or other error; conservatively return false
+ return false;
+}
+
+bool Expression::IsNullLiteral() const {
+ if (auto lit = literal()) {
+ if (lit->null_count() == lit->length()) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool Expression::IsSatisfiable() const {
+ if (type() && type()->id() == Type::NA) {
+ return false;
+ }
+
+ if (auto lit = literal()) {
+ if (lit->null_count() == lit->length()) {
+ return false;
+ }
+
+ if (lit->is_scalar() && lit->type()->id() == Type::BOOL) {
+ return lit->scalar_as<BooleanScalar>().value;
+ }
+ }
+
+ return true;
+}
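+
+// For illustration of the shortcuts above:
+//
+//   literal(false).IsSatisfiable();                      // false
+//   literal(MakeNullScalar(boolean())).IsSatisfiable();  // false (all null)
+//   field_ref("a").IsSatisfiable();                      // true (conservative)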
+
+namespace {
+
+// Produce a bound Expression from unbound Call and bound arguments.
+Result<Expression> BindNonRecursive(Expression::Call call, bool insert_implicit_casts,
+ compute::ExecContext* exec_context) {
+ DCHECK(std::all_of(call.arguments.begin(), call.arguments.end(),
+ [](const Expression& argument) { return argument.IsBound(); }));
+
+ auto descrs = GetDescriptors(call.arguments);
+ ARROW_ASSIGN_OR_RAISE(call.function, GetFunction(call, exec_context));
+
+ if (!insert_implicit_casts) {
+ ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchExact(descrs));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchBest(&descrs));
+
+ for (size_t i = 0; i < descrs.size(); ++i) {
+ if (descrs[i] == call.arguments[i].descr()) continue;
+
+ if (descrs[i].shape != call.arguments[i].descr().shape) {
+ return Status::NotImplemented(
+ "Automatic broadcasting of scalars arguments to arrays in ",
+ Expression(std::move(call)).ToString());
+ }
+
+ if (auto lit = call.arguments[i].literal()) {
+ ARROW_ASSIGN_OR_RAISE(Datum new_lit, compute::Cast(*lit, descrs[i].type));
+ call.arguments[i] = literal(std::move(new_lit));
+ continue;
+ }
+
+ // construct an implicit cast Expression with which to replace this argument
+ Expression::Call implicit_cast;
+ implicit_cast.function_name = "cast";
+ implicit_cast.arguments = {std::move(call.arguments[i])};
+ implicit_cast.options = std::make_shared<compute::CastOptions>(
+ compute::CastOptions::Safe(descrs[i].type));
+
+ ARROW_ASSIGN_OR_RAISE(
+ call.arguments[i],
+ BindNonRecursive(std::move(implicit_cast),
+ /*insert_implicit_casts=*/false, exec_context));
+ }
+ }
+
+ compute::KernelContext kernel_context(exec_context);
+ if (call.kernel->init) {
+ ARROW_ASSIGN_OR_RAISE(
+ call.kernel_state,
+ call.kernel->init(&kernel_context, {call.kernel, descrs, call.options.get()}));
+
+ kernel_context.SetState(call.kernel_state.get());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(
+ call.descr, call.kernel->signature->out_type().Resolve(&kernel_context, descrs));
+
+ return Expression(std::move(call));
+}
+
+template <typename TypeOrSchema>
+Result<Expression> BindImpl(Expression expr, const TypeOrSchema& in,
+ ValueDescr::Shape shape, compute::ExecContext* exec_context) {
+ if (exec_context == nullptr) {
+ compute::ExecContext exec_context;
+ return BindImpl(std::move(expr), in, shape, &exec_context);
+ }
+
+ if (expr.literal()) return expr;
+
+ if (auto ref = expr.field_ref()) {
+ if (ref->IsNested()) {
+ return Status::NotImplemented("nested field references");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto path, ref->FindOne(in));
+
+ auto bound = *expr.parameter();
+ bound.index = path[0];
+ ARROW_ASSIGN_OR_RAISE(auto field, path.Get(in));
+ bound.descr.type = field->type();
+ bound.descr.shape = shape;
+ return Expression{std::move(bound)};
+ }
+
+ auto call = *CallNotNull(expr);
+ for (auto& argument : call.arguments) {
+ ARROW_ASSIGN_OR_RAISE(argument,
+ BindImpl(std::move(argument), in, shape, exec_context));
+ }
+ return BindNonRecursive(std::move(call),
+ /*insert_implicit_casts=*/true, exec_context);
+}
+
+} // namespace
+
+Result<Expression> Expression::Bind(const ValueDescr& in,
+ compute::ExecContext* exec_context) const {
+ return BindImpl(*this, *in.type, in.shape, exec_context);
+}
+
+Result<Expression> Expression::Bind(const Schema& in_schema,
+ compute::ExecContext* exec_context) const {
+ return BindImpl(*this, in_schema, ValueDescr::ARRAY, exec_context);
+}
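+
+// Usage sketch, for illustration (schema and field name are placeholders):
+//
+//   auto s = schema({field("a", int32())});
+//   ARROW_ASSIGN_OR_RAISE(auto bound,
+//                         greater(field_ref("a"), literal(3)).Bind(*s));
+//   // bound.IsBound() is now true; BindImpl resolved "a" to index 0 and
+//   // BindNonRecursive inserted any implicit casts needed for dispatch.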
+
+Result<ExecBatch> MakeExecBatch(const Schema& full_schema, const Datum& partial) {
+ ExecBatch out;
+
+ if (partial.kind() == Datum::RECORD_BATCH) {
+ const auto& partial_batch = *partial.record_batch();
+ out.length = partial_batch.num_rows();
+
+ for (const auto& field : full_schema.fields()) {
+ ARROW_ASSIGN_OR_RAISE(auto column,
+ FieldRef(field->name()).GetOneOrNone(partial_batch));
+
+ if (column) {
+ if (!column->type()->Equals(field->type())) {
+ // Referenced field was present but didn't have the expected type.
+ // This *should* be handled by readers, and will just be an error in the future.
+ ARROW_ASSIGN_OR_RAISE(
+ auto converted,
+ compute::Cast(column, field->type(), compute::CastOptions::Safe()));
+ column = converted.make_array();
+ }
+ out.values.emplace_back(std::move(column));
+ } else {
+ out.values.emplace_back(MakeNullScalar(field->type()));
+ }
+ }
+ return out;
+ }
+
+ // wasteful but useful for testing:
+ if (partial.type()->id() == Type::STRUCT) {
+ if (partial.is_array()) {
+ ARROW_ASSIGN_OR_RAISE(auto partial_batch,
+ RecordBatch::FromStructArray(partial.make_array()));
+
+ return MakeExecBatch(full_schema, partial_batch);
+ }
+
+ if (partial.is_scalar()) {
+ ARROW_ASSIGN_OR_RAISE(auto partial_array,
+ MakeArrayFromScalar(*partial.scalar(), 1));
+ ARROW_ASSIGN_OR_RAISE(auto out, MakeExecBatch(full_schema, partial_array));
+
+ for (Datum& value : out.values) {
+ if (value.is_scalar()) continue;
+ ARROW_ASSIGN_OR_RAISE(value, value.make_array()->GetScalar(0));
+ }
+ return out;
+ }
+ }
+
+ return Status::NotImplemented("MakeExecBatch from ", PrintDatum(partial));
+}
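+
+// For illustration: with full_schema {a: int32, b: utf8} and a partial batch
+// containing only "a", the resulting ExecBatch holds {a's array,
+// MakeNullScalar(utf8())} and out.length == partial.num_rows().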
+
+Result<Datum> ExecuteScalarExpression(const Expression& expr, const Schema& full_schema,
+ const Datum& partial_input,
+ compute::ExecContext* exec_context) {
+ ARROW_ASSIGN_OR_RAISE(auto input, MakeExecBatch(full_schema, partial_input));
+ return ExecuteScalarExpression(expr, input, exec_context);
+}
+
+Result<Datum> ExecuteScalarExpression(const Expression& expr, const ExecBatch& input,
+ compute::ExecContext* exec_context) {
+ if (exec_context == nullptr) {
+ compute::ExecContext exec_context;
+ return ExecuteScalarExpression(expr, input, &exec_context);
+ }
+
+ if (!expr.IsBound()) {
+ return Status::Invalid("Cannot Execute unbound expression.");
+ }
+
+ if (!expr.IsScalarExpression()) {
+ return Status::Invalid(
+ "ExecuteScalarExpression cannot Execute non-scalar expression ", expr.ToString());
+ }
+
+ if (auto lit = expr.literal()) return *lit;
+
+ if (auto param = expr.parameter()) {
+ if (param->descr.type->id() == Type::NA) {
+ return MakeNullScalar(null());
+ }
+
+ const Datum& field = input[param->index];
+ if (!field.type()->Equals(param->descr.type)) {
+ return Status::Invalid("Referenced field ", expr.ToString(), " was ",
+ field.type()->ToString(), " but should have been ",
+ param->descr.type->ToString());
+ }
+
+ return field;
+ }
+
+ auto call = CallNotNull(expr);
+
+ std::vector<Datum> arguments(call->arguments.size());
+ for (size_t i = 0; i < arguments.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(
+ arguments[i], ExecuteScalarExpression(call->arguments[i], input, exec_context));
+ }
+
+ auto executor = compute::detail::KernelExecutor::MakeScalar();
+
+ compute::KernelContext kernel_context(exec_context);
+ kernel_context.SetState(call->kernel_state.get());
+
+ auto kernel = call->kernel;
+ auto descrs = GetDescriptors(arguments);
+ auto options = call->options.get();
+ RETURN_NOT_OK(executor->Init(&kernel_context, {kernel, descrs, options}));
+
+ auto listener = std::make_shared<compute::detail::DatumAccumulator>();
+ RETURN_NOT_OK(executor->Execute(arguments, listener.get()));
+ return executor->WrapResults(arguments, listener->values());
+}
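+
+// End-to-end sketch, for illustration (s and batch are placeholders bound to
+// a schema {a: int32} and a conforming RecordBatch):
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto bound, call("add", {field_ref("a"), literal(1)}).Bind(*s));
+//   ARROW_ASSIGN_OR_RAISE(Datum out,
+//                         ExecuteScalarExpression(bound, *s, Datum(batch)));
+//   // out holds a + 1 evaluated for every row of batch.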
+
+namespace {
+
+std::array<std::pair<const Expression&, const Expression&>, 2>
+ArgumentsAndFlippedArguments(const Expression::Call& call) {
+ DCHECK_EQ(call.arguments.size(), 2);
+ return {std::pair<const Expression&, const Expression&>{call.arguments[0],
+ call.arguments[1]},
+ std::pair<const Expression&, const Expression&>{call.arguments[1],
+ call.arguments[0]}};
+}
+
+template <typename BinOp, typename It,
+ typename Out = typename std::iterator_traits<It>::value_type>
+util::optional<Out> FoldLeft(It begin, It end, const BinOp& bin_op) {
+ if (begin == end) return util::nullopt;
+
+ Out folded = std::move(*begin++);
+ while (begin != end) {
+ folded = bin_op(std::move(folded), std::move(*begin++));
+ }
+ return folded;
+}
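+
+// e.g. FoldLeft over {a, b, c} yields bin_op(bin_op(a, b), c), and
+// util::nullopt for an empty range.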
+
+util::optional<compute::NullHandling::type> GetNullHandling(
+ const Expression::Call& call) {
+ if (call.function && call.function->kind() == compute::Function::SCALAR) {
+ return static_cast<const compute::ScalarKernel*>(call.kernel)->null_handling;
+ }
+ return util::nullopt;
+}
+
+} // namespace
+
+std::vector<FieldRef> FieldsInExpression(const Expression& expr) {
+ if (expr.literal()) return {};
+
+ if (auto ref = expr.field_ref()) {
+ return {*ref};
+ }
+
+ std::vector<FieldRef> fields;
+ for (const Expression& arg : CallNotNull(expr)->arguments) {
+ auto argument_fields = FieldsInExpression(arg);
+ std::move(argument_fields.begin(), argument_fields.end(), std::back_inserter(fields));
+ }
+ return fields;
+}
+
+bool ExpressionHasFieldRefs(const Expression& expr) {
+ if (expr.literal()) return false;
+
+ if (expr.field_ref()) return true;
+
+ for (const Expression& arg : CallNotNull(expr)->arguments) {
+ if (ExpressionHasFieldRefs(arg)) return true;
+ }
+ return false;
+}
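+
+// For illustration of the two helpers above:
+//
+//   auto expr = and_(equal(field_ref("a"), literal(1)),
+//                    is_null(field_ref("b")));
+//   FieldsInExpression(expr);            // {FieldRef("a"), FieldRef("b")}
+//   ExpressionHasFieldRefs(expr);        // true
+//   ExpressionHasFieldRefs(literal(1));  // false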
+
+Result<Expression> FoldConstants(Expression expr) {
+ return Modify(
+ std::move(expr), [](Expression expr) { return expr; },
+ [](Expression expr, ...) -> Result<Expression> {
+ auto call = CallNotNull(expr);
+ if (std::all_of(call->arguments.begin(), call->arguments.end(),
+ [](const Expression& argument) { return argument.literal(); })) {
+ // all arguments are literal; we can evaluate this subexpression *now*
+ static const ExecBatch ignored_input = ExecBatch{};
+ ARROW_ASSIGN_OR_RAISE(Datum constant,
+ ExecuteScalarExpression(expr, ignored_input));
+
+ return literal(std::move(constant));
+ }
+
+ // XXX the following should probably be in a registry of passes instead
+ // of inline
+
+ if (GetNullHandling(*call) == compute::NullHandling::INTERSECTION) {
+ // kernels which always produce intersected validity can be resolved
+ // to null *now* if any of their inputs is a null literal
+ for (const auto& argument : call->arguments) {
+ if (argument.IsNullLiteral()) {
+ return argument;
+ }
+ }
+ }
+
+ if (call->function_name == "and_kleene") {
+ for (auto args : ArgumentsAndFlippedArguments(*call)) {
+ // true and x == x
+ if (args.first == literal(true)) return args.second;
+
+ // false and x == false
+ if (args.first == literal(false)) return args.first;
+
+ // x and x == x
+ if (args.first == args.second) return args.first;
+ }
+ return expr;
+ }
+
+ if (call->function_name == "or_kleene") {
+ for (auto args : ArgumentsAndFlippedArguments(*call)) {
+ // false or x == x
+ if (args.first == literal(false)) return args.second;
+
+ // true or x == true
+ if (args.first == literal(true)) return args.first;
+
+ // x or x == x
+ if (args.first == args.second) return args.first;
+ }
+ return expr;
+ }
+
+ return expr;
+ });
+}
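+
+// For illustration, on already-bound expressions (literal-only subtrees are
+// evaluated through ExecuteScalarExpression; b stands for any expression):
+//
+//   add(1, 2)            -> literal(3)
+//   and_kleene(true, b)  -> b
+//   or_kleene(true, b)   -> literal(true)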
+
+namespace {
+
+std::vector<Expression> GuaranteeConjunctionMembers(
+ const Expression& guaranteed_true_predicate) {
+ auto guarantee = guaranteed_true_predicate.call();
+ if (!guarantee || guarantee->function_name != "and_kleene") {
+ return {guaranteed_true_predicate};
+ }
+ return FlattenedAssociativeChain(guaranteed_true_predicate).fringe;
+}
+
+// Conjunction members which are represented in known_values are erased from
+// conjunction_members
+Status ExtractKnownFieldValuesImpl(
+ std::vector<Expression>* conjunction_members,
+ std::unordered_map<FieldRef, Datum, FieldRef::Hash>* known_values) {
+ auto unconsumed_end =
+ std::partition(conjunction_members->begin(), conjunction_members->end(),
+ [](const Expression& expr) {
+                         // search for equality conditions between a field and a literal
+ auto call = expr.call();
+ if (!call) return true;
+
+ if (call->function_name == "equal") {
+ auto ref = call->arguments[0].field_ref();
+ auto lit = call->arguments[1].literal();
+ return !(ref && lit);
+ }
+
+ if (call->function_name == "is_null") {
+ auto ref = call->arguments[0].field_ref();
+ return !ref;
+ }
+
+ return true;
+ });
+
+ for (auto it = unconsumed_end; it != conjunction_members->end(); ++it) {
+ auto call = CallNotNull(*it);
+
+ if (call->function_name == "equal") {
+ auto ref = call->arguments[0].field_ref();
+ auto lit = call->arguments[1].literal();
+ known_values->emplace(*ref, *lit);
+ } else if (call->function_name == "is_null") {
+ auto ref = call->arguments[0].field_ref();
+ known_values->emplace(*ref, Datum(std::make_shared<NullScalar>()));
+ }
+ }
+
+ conjunction_members->erase(unconsumed_end, conjunction_members->end());
+
+ return Status::OK();
+}
+
+} // namespace
+
+Result<KnownFieldValues> ExtractKnownFieldValues(
+ const Expression& guaranteed_true_predicate) {
+ auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate);
+ KnownFieldValues known_values;
+ RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values.map));
+ return known_values;
+}
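+
+// For illustration: given the guarantee (a == 3 and is_null(b)), the returned
+// map is {FieldRef("a"): Datum(3), FieldRef("b"): null scalar}.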
+
+Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
+ Expression expr) {
+ if (!expr.IsBound()) {
+ return Status::Invalid(
+ "ReplaceFieldsWithKnownValues called on an unbound Expression");
+ }
+
+ return Modify(
+ std::move(expr),
+ [&known_values](Expression expr) -> Result<Expression> {
+ if (auto ref = expr.field_ref()) {
+ auto it = known_values.map.find(*ref);
+ if (it != known_values.map.end()) {
+ Datum lit = it->second;
+ if (lit.descr() == expr.descr()) return literal(std::move(lit));
+ // type mismatch, try casting the known value to the correct type
+
+ if (expr.type()->id() == Type::DICTIONARY &&
+ lit.type()->id() != Type::DICTIONARY) {
+ // the known value must be dictionary encoded
+
+ const auto& dict_type = checked_cast<const DictionaryType&>(*expr.type());
+ if (!lit.type()->Equals(dict_type.value_type())) {
+ ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, dict_type.value_type()));
+ }
+
+ if (lit.is_scalar()) {
+ ARROW_ASSIGN_OR_RAISE(auto dictionary,
+ MakeArrayFromScalar(*lit.scalar(), 1));
+
+ lit = Datum{DictionaryScalar::Make(MakeScalar<int32_t>(0),
+ std::move(dictionary))};
+ }
+ }
+
+ ARROW_ASSIGN_OR_RAISE(lit, compute::Cast(lit, expr.type()));
+ return literal(std::move(lit));
+ }
+ }
+ return expr;
+ },
+ [](Expression expr, ...) { return expr; });
+}
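+
+// For illustration (expr must already be bound): with known values {a: 3},
+// the filter (a < 5) rewrites to (3 < 5), which FoldConstants can then
+// collapse to literal(true).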
+
+namespace {
+
+bool IsBinaryAssociativeCommutative(const Expression::Call& call) {
+ static std::unordered_set<std::string> binary_associative_commutative{
+ "and", "or", "and_kleene", "or_kleene", "xor",
+ "multiply", "add", "multiply_checked", "add_checked"};
+
+ auto it = binary_associative_commutative.find(call.function_name);
+ return it != binary_associative_commutative.end();
+}
+
+} // namespace
+
+Result<Expression> Canonicalize(Expression expr, compute::ExecContext* exec_context) {
+ if (exec_context == nullptr) {
+ compute::ExecContext exec_context;
+ return Canonicalize(std::move(expr), &exec_context);
+ }
+
+ // If potentially reconstructing more deeply than a call's immediate arguments
+ // (for example, when reorganizing an associative chain), add expressions to this set to
+ // avoid unnecessary work
+ struct {
+ std::unordered_set<Expression, Expression::Hash> set_;
+
+ bool operator()(const Expression& expr) const {
+ return set_.find(expr) != set_.end();
+ }
+
+ void Add(std::vector<Expression> exprs) {
+ std::move(exprs.begin(), exprs.end(), std::inserter(set_, set_.end()));
+ }
+ } AlreadyCanonicalized;
+
+ return Modify(
+ std::move(expr),
+ [&AlreadyCanonicalized, exec_context](Expression expr) -> Result<Expression> {
+ auto call = expr.call();
+ if (!call) return expr;
+
+ if (AlreadyCanonicalized(expr)) return expr;
+
+ if (IsBinaryAssociativeCommutative(*call)) {
+ struct {
+ int Priority(const Expression& operand) const {
+ // order literals first, starting with nulls
+ if (operand.IsNullLiteral()) return 0;
+ if (operand.literal()) return 1;
+ return 2;
+ }
+ bool operator()(const Expression& l, const Expression& r) const {
+ return Priority(l) < Priority(r);
+ }
+ } CanonicalOrdering;
+
+ FlattenedAssociativeChain chain(expr);
+ if (chain.was_left_folded &&
+ std::is_sorted(chain.fringe.begin(), chain.fringe.end(),
+ CanonicalOrdering)) {
+ AlreadyCanonicalized.Add(std::move(chain.exprs));
+ return expr;
+ }
+
+ std::stable_sort(chain.fringe.begin(), chain.fringe.end(), CanonicalOrdering);
+
+ // fold the chain back up
+ auto folded =
+ FoldLeft(chain.fringe.begin(), chain.fringe.end(),
+ [call, &AlreadyCanonicalized](Expression l, Expression r) {
+ auto canonicalized_call = *call;
+ canonicalized_call.arguments = {std::move(l), std::move(r)};
+ Expression expr(std::move(canonicalized_call));
+ AlreadyCanonicalized.Add({expr});
+ return expr;
+ });
+ return std::move(*folded);
+ }
+
+ if (auto cmp = Comparison::Get(call->function_name)) {
+ if (call->arguments[0].literal() && !call->arguments[1].literal()) {
+ // ensure that literals are on comparisons' RHS
+ auto flipped_call = *call;
+
+ std::swap(flipped_call.arguments[0], flipped_call.arguments[1]);
+ flipped_call.function_name =
+ Comparison::GetName(Comparison::GetFlipped(*cmp));
+
+ return BindNonRecursive(flipped_call,
+ /*insert_implicit_casts=*/false, exec_context);
+ }
+ }
+
+ return expr;
+ },
+ [](Expression expr, ...) { return expr; });
+}
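+
+// For illustration of the canonical forms produced above:
+//
+//   (3 < a)       -> (a > 3)       // literals move to comparisons' RHS
+//   (a and true)  -> (true and a)  // literals sort to the front of chains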
+
+namespace {
+
+Result<Expression> DirectComparisonSimplification(Expression expr,
+ const Expression::Call& guarantee) {
+ return Modify(
+ std::move(expr), [](Expression expr) { return expr; },
+ [&guarantee](Expression expr, ...) -> Result<Expression> {
+ auto call = expr.call();
+ if (!call) return expr;
+
+ // Ensure both calls are comparisons with equal LHS and scalar RHS
+ auto cmp = Comparison::Get(expr);
+ auto cmp_guarantee = Comparison::Get(guarantee.function_name);
+
+ if (!cmp) return expr;
+ if (!cmp_guarantee) return expr;
+
+ const auto& lhs = Comparison::StripOrderPreservingCasts(call->arguments[0]);
+ const auto& guarantee_lhs = guarantee.arguments[0];
+ if (lhs != guarantee_lhs) return expr;
+
+ auto rhs = call->arguments[1].literal();
+ auto guarantee_rhs = guarantee.arguments[1].literal();
+
+ if (!rhs) return expr;
+ if (!rhs->is_scalar()) return expr;
+
+ if (!guarantee_rhs) return expr;
+ if (!guarantee_rhs->is_scalar()) return expr;
+
+ ARROW_ASSIGN_OR_RAISE(auto cmp_rhs_guarantee_rhs,
+ Comparison::Execute(*rhs, *guarantee_rhs));
+ DCHECK_NE(cmp_rhs_guarantee_rhs, Comparison::NA);
+
+ if (cmp_rhs_guarantee_rhs == Comparison::EQUAL) {
+ // RHS of filter is equal to RHS of guarantee
+
+ if ((*cmp & *cmp_guarantee) == *cmp_guarantee) {
+ // guarantee is a subset of filter, so all data will be included
+ // x > 1, x >= 1, x != 1 guaranteed by x > 1
+ return literal(true);
+ }
+
+ if ((*cmp & *cmp_guarantee) == 0) {
+ // guarantee disjoint with filter, so all data will be excluded
+ // x > 1, x >= 1, x != 1 unsatisfiable if x == 1
+ return literal(false);
+ }
+
+ return expr;
+ }
+
+ if (*cmp_guarantee & cmp_rhs_guarantee_rhs) {
+ // x > 1, x >= 1, x != 1 cannot use guarantee x >= 3
+ return expr;
+ }
+
+ if (*cmp & Comparison::GetFlipped(cmp_rhs_guarantee_rhs)) {
+ // x > 1, x >= 1, x != 1 guaranteed by x >= 3
+ return literal(true);
+ } else {
+ // x < 1, x <= 1, x == 1 unsatisfiable if x >= 3
+ return literal(false);
+ }
+ });
+}
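+
+// Illustrative sketch of the cases above (assumed guarantees and filters):
+// under the guarantee x == 1, the filter x >= 1 simplifies to literal(true)
+// (the guarantee implies the filter) and x > 1 to literal(false) (disjoint);
+// under the guarantee x >= 3, the filter x > 1 simplifies to literal(true)
+// and x < 1 to literal(false).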
+
+} // namespace
+
+Result<Expression> SimplifyWithGuarantee(Expression expr,
+ const Expression& guaranteed_true_predicate) {
+ auto conjunction_members = GuaranteeConjunctionMembers(guaranteed_true_predicate);
+
+ KnownFieldValues known_values;
+ RETURN_NOT_OK(ExtractKnownFieldValuesImpl(&conjunction_members, &known_values.map));
+
+ ARROW_ASSIGN_OR_RAISE(expr,
+ ReplaceFieldsWithKnownValues(known_values, std::move(expr)));
+
+ auto CanonicalizeAndFoldConstants = [&expr] {
+ ARROW_ASSIGN_OR_RAISE(expr, Canonicalize(std::move(expr)));
+ ARROW_ASSIGN_OR_RAISE(expr, FoldConstants(std::move(expr)));
+ return Status::OK();
+ };
+ RETURN_NOT_OK(CanonicalizeAndFoldConstants());
+
+ for (const auto& guarantee : conjunction_members) {
+ if (Comparison::Get(guarantee) && guarantee.call()->arguments[1].literal()) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto simplified, DirectComparisonSimplification(expr, *CallNotNull(guarantee)));
+
+ if (Identical(simplified, expr)) continue;
+
+ expr = std::move(simplified);
+ RETURN_NOT_OK(CanonicalizeAndFoldConstants());
+ }
+ }
+
+ return expr;
+}
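+
+// Illustrative usage sketch (assumed bound filter; error handling elided):
+//
+//   Expression filter = greater(field_ref("x"), literal(1));
+//   Expression guarantee = equal(field_ref("x"), literal(5));
+//   ARROW_ASSIGN_OR_RAISE(filter, SimplifyWithGuarantee(filter, guarantee));
+//   // filter is now literal(true): x == 5 implies x > 1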
+
+// Serialization is accomplished by converting an expression to KeyValueMetadata, which is
+// stored in the schema of a RecordBatch. Embedded arrays and scalars are stored in its
+// columns. Finally, the RecordBatch is written to an IPC file.
+Result<std::shared_ptr<Buffer>> Serialize(const Expression& expr) {
+ struct {
+ std::shared_ptr<KeyValueMetadata> metadata_ = std::make_shared<KeyValueMetadata>();
+ ArrayVector columns_;
+
+ Result<std::string> AddScalar(const Scalar& scalar) {
+ auto ret = columns_.size();
+ ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(scalar, 1));
+ columns_.push_back(std::move(array));
+ return std::to_string(ret);
+ }
+
+ Status Visit(const Expression& expr) {
+ if (auto lit = expr.literal()) {
+ if (!lit->is_scalar()) {
+ return Status::NotImplemented("Serialization of non-scalar literals");
+ }
+ ARROW_ASSIGN_OR_RAISE(auto value, AddScalar(*lit->scalar()));
+ metadata_->Append("literal", std::move(value));
+ return Status::OK();
+ }
+
+ if (auto ref = expr.field_ref()) {
+ if (!ref->name()) {
+ return Status::NotImplemented("Serialization of non-name field_refs");
+ }
+ metadata_->Append("field_ref", *ref->name());
+ return Status::OK();
+ }
+
+ auto call = CallNotNull(expr);
+ metadata_->Append("call", call->function_name);
+
+ for (const auto& argument : call->arguments) {
+ RETURN_NOT_OK(Visit(argument));
+ }
+
+ if (call->options) {
+ ARROW_ASSIGN_OR_RAISE(auto options_scalar,
+ internal::FunctionOptionsToStructScalar(*call->options));
+ ARROW_ASSIGN_OR_RAISE(auto value, AddScalar(*options_scalar));
+ metadata_->Append("options", std::move(value));
+ }
+
+ metadata_->Append("end", call->function_name);
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<RecordBatch>> operator()(const Expression& expr) {
+ RETURN_NOT_OK(Visit(expr));
+ FieldVector fields(columns_.size());
+ for (size_t i = 0; i < fields.size(); ++i) {
+ fields[i] = field("", columns_[i]->type());
+ }
+ return RecordBatch::Make(schema(std::move(fields), std::move(metadata_)), 1,
+ std::move(columns_));
+ }
+ } ToRecordBatch;
+
+ ARROW_ASSIGN_OR_RAISE(auto batch, ToRecordBatch(expr));
+ ARROW_ASSIGN_OR_RAISE(auto stream, io::BufferOutputStream::Create());
+ ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(stream, batch->schema()));
+ RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
+ RETURN_NOT_OK(writer->Close());
+ return stream->Finish();
+}
+
+Result<Expression> Deserialize(std::shared_ptr<Buffer> buffer) {
+ io::BufferReader stream(std::move(buffer));
+ ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(&stream));
+ ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0));
+ if (batch->schema()->metadata() == nullptr) {
+ return Status::Invalid("serialized Expression's batch repr had null metadata");
+ }
+ if (batch->num_rows() != 1) {
+ return Status::Invalid(
+ "serialized Expression's batch repr was not a single row - had ",
+ batch->num_rows());
+ }
+
+ struct FromRecordBatch {
+ const RecordBatch& batch_;
+ int index_;
+
+ const KeyValueMetadata& metadata() { return *batch_.schema()->metadata(); }
+
+ Result<std::shared_ptr<Scalar>> GetScalar(const std::string& i) {
+ int32_t column_index;
+ if (!::arrow::internal::ParseValue<Int32Type>(i.data(), i.length(),
+ &column_index)) {
+ return Status::Invalid("Couldn't parse column_index");
+ }
+ if (column_index >= batch_.num_columns()) {
+ return Status::Invalid("column_index out of bounds");
+ }
+ return batch_.column(column_index)->GetScalar(0);
+ }
+
+ Result<Expression> GetOne() {
+ if (index_ >= metadata().size()) {
+ return Status::Invalid("unterminated serialized Expression");
+ }
+
+ const std::string& key = metadata().key(index_);
+ const std::string& value = metadata().value(index_);
+ ++index_;
+
+ if (key == "literal") {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, GetScalar(value));
+ return literal(std::move(scalar));
+ }
+
+ if (key == "field_ref") {
+ return field_ref(value);
+ }
+
+ if (key != "call") {
+ return Status::Invalid("Unrecognized serialized Expression key ", key);
+ }
+
+ std::vector<Expression> arguments;
+ while (metadata().key(index_) != "end") {
+ if (metadata().key(index_) == "options") {
+ ARROW_ASSIGN_OR_RAISE(auto options_scalar, GetScalar(metadata().value(index_)));
+ std::shared_ptr<compute::FunctionOptions> options;
+ if (options_scalar) {
+ ARROW_ASSIGN_OR_RAISE(
+ options, internal::FunctionOptionsFromStructScalar(
+ checked_cast<const StructScalar&>(*options_scalar)));
+ }
+ auto expr = call(value, std::move(arguments), std::move(options));
+ index_ += 2;
+ return expr;
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto argument, GetOne());
+ arguments.push_back(std::move(argument));
+ }
+
+ ++index_;
+ return call(value, std::move(arguments));
+ }
+ };
+
+ return FromRecordBatch{*batch, 0}.GetOne();
+}
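+
+// Illustrative round-trip sketch (assumed expression; error handling elided):
+//
+//   Expression expr = greater(field_ref("x"), literal(1));
+//   ARROW_ASSIGN_OR_RAISE(auto buffer, Serialize(expr));
+//   ARROW_ASSIGN_OR_RAISE(auto roundtripped, Deserialize(buffer));
+//   // roundtripped.Equals(expr)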
+
+Expression project(std::vector<Expression> values, std::vector<std::string> names) {
+ return call("make_struct", std::move(values),
+ compute::MakeStructOptions{std::move(names)});
+}
+
+Expression equal(Expression lhs, Expression rhs) {
+ return call("equal", {std::move(lhs), std::move(rhs)});
+}
+
+Expression not_equal(Expression lhs, Expression rhs) {
+ return call("not_equal", {std::move(lhs), std::move(rhs)});
+}
+
+Expression less(Expression lhs, Expression rhs) {
+ return call("less", {std::move(lhs), std::move(rhs)});
+}
+
+Expression less_equal(Expression lhs, Expression rhs) {
+ return call("less_equal", {std::move(lhs), std::move(rhs)});
+}
+
+Expression greater(Expression lhs, Expression rhs) {
+ return call("greater", {std::move(lhs), std::move(rhs)});
+}
+
+Expression greater_equal(Expression lhs, Expression rhs) {
+ return call("greater_equal", {std::move(lhs), std::move(rhs)});
+}
+
+Expression is_null(Expression lhs) { return call("is_null", {std::move(lhs)}); }
+
+Expression is_valid(Expression lhs) { return call("is_valid", {std::move(lhs)}); }
+
+Expression and_(Expression lhs, Expression rhs) {
+ return call("and_kleene", {std::move(lhs), std::move(rhs)});
+}
+
+Expression and_(const std::vector<Expression>& operands) {
+ auto folded = FoldLeft<Expression(Expression, Expression)>(operands.begin(),
+ operands.end(), and_);
+ if (folded) {
+ return std::move(*folded);
+ }
+ return literal(true);
+}
+
+Expression or_(Expression lhs, Expression rhs) {
+ return call("or_kleene", {std::move(lhs), std::move(rhs)});
+}
+
+Expression or_(const std::vector<Expression>& operands) {
+ auto folded =
+ FoldLeft<Expression(Expression, Expression)>(operands.begin(), operands.end(), or_);
+ if (folded) {
+ return std::move(*folded);
+ }
+ return literal(false);
+}
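+
+// Note (illustrative): the vector overloads above left-fold their operands, so
+// and_({a, b, c}) builds and_(and_(a, b), c); an empty vector yields the fold
+// identity, literal(true) for and_ and literal(false) for or_.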
+
+Expression not_(Expression operand) { return call("invert", {std::move(operand)}); }
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h
index 5ae95532c2b..3810accf70a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression.h
@@ -1,269 +1,269 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// This API is EXPERIMENTAL.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "arrow/compute/type_fwd.h"
-#include "arrow/datum.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/variant.h"
-
-namespace arrow {
-namespace compute {
-
-/// An unbound expression which maps a single Datum to another Datum.
-/// An expression is one of
-/// - A literal Datum.
-/// - A reference to a single (potentially nested) field of the input Datum.
-/// - A call to a compute function, with arguments specified by other Expressions.
-class ARROW_EXPORT Expression {
- public:
- struct Call {
- std::string function_name;
- std::vector<Expression> arguments;
- std::shared_ptr<FunctionOptions> options;
- // Cached hash value
- size_t hash;
-
- // post-Bind properties:
- std::shared_ptr<Function> function;
- const Kernel* kernel = NULLPTR;
- std::shared_ptr<KernelState> kernel_state;
- ValueDescr descr;
-
- void ComputeHash();
- };
-
- std::string ToString() const;
- bool Equals(const Expression& other) const;
- size_t hash() const;
- struct Hash {
- size_t operator()(const Expression& expr) const { return expr.hash(); }
- };
-
- /// Bind this expression to the given input type, looking up Kernels and field types.
- /// Some expression simplification may be performed and implicit casts will be inserted.
- /// Any state necessary for execution will be initialized and returned.
- Result<Expression> Bind(const ValueDescr& in, ExecContext* = NULLPTR) const;
- Result<Expression> Bind(const Schema& in_schema, ExecContext* = NULLPTR) const;
-
- // XXX someday
- // Clone all KernelState in this bound expression. If any function referenced by this
- // expression has mutable KernelState, it is not safe to execute or apply simplification
- // passes to it (or copies of it!) from multiple threads. Cloning state produces new
- // KernelStates where necessary to ensure that Expressions may be manipulated safely
- // on multiple threads.
- // Result<ExpressionState> CloneState() const;
- // Status SetState(ExpressionState);
-
- /// Return true if all of an expression's field references have an explicit ValueDescr
- /// and all of its functions' kernels have been looked up.
- bool IsBound() const;
-
- /// Return true if this expression is composed only of Scalar literals, field
- /// references, and calls to ScalarFunctions.
- bool IsScalarExpression() const;
-
- /// Return true if this expression is literal and entirely null.
- bool IsNullLiteral() const;
-
- /// Return true if this expression could evaluate to true.
- bool IsSatisfiable() const;
-
- // XXX someday
- // Result<PipelineGraph> GetPipelines();
-
- /// Access a Call or return nullptr if this expression is not a call
- const Call* call() const;
- /// Access a Datum or return nullptr if this expression is not a literal
- const Datum* literal() const;
- /// Access a FieldRef or return nullptr if this expression is not a field_ref
- const FieldRef* field_ref() const;
-
- /// The type and shape to which this expression will evaluate
- ValueDescr descr() const;
- std::shared_ptr<DataType> type() const { return descr().type; }
- // XXX someday
- // NullGeneralization::type nullable() const;
-
- struct Parameter {
- FieldRef ref;
-
- // post-bind properties
- ValueDescr descr;
- int index;
- };
- const Parameter* parameter() const;
-
- Expression() = default;
- explicit Expression(Call call);
- explicit Expression(Datum literal);
- explicit Expression(Parameter parameter);
-
- private:
- using Impl = util::Variant<Datum, Parameter, Call>;
- std::shared_ptr<Impl> impl_;
-
- ARROW_EXPORT friend bool Identical(const Expression& l, const Expression& r);
-
- ARROW_EXPORT friend void PrintTo(const Expression&, std::ostream*);
-};
-
-inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); }
-inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equals(r); }
-
-// Factories
-
-ARROW_EXPORT
-Expression literal(Datum lit);
-
-template <typename Arg>
-Expression literal(Arg&& arg) {
- return literal(Datum(std::forward<Arg>(arg)));
-}
-
-ARROW_EXPORT
-Expression field_ref(FieldRef ref);
-
-ARROW_EXPORT
-Expression call(std::string function, std::vector<Expression> arguments,
- std::shared_ptr<FunctionOptions> options = NULLPTR);
-
-template <typename Options, typename = typename std::enable_if<
- std::is_base_of<FunctionOptions, Options>::value>::type>
-Expression call(std::string function, std::vector<Expression> arguments,
- Options options) {
- return call(std::move(function), std::move(arguments),
- std::make_shared<Options>(std::move(options)));
-}
-
-/// Assemble a list of all fields referenced by an Expression at any depth.
-ARROW_EXPORT
-std::vector<FieldRef> FieldsInExpression(const Expression&);
-
-/// Check if the expression references any fields.
-ARROW_EXPORT
-bool ExpressionHasFieldRefs(const Expression&);
-
-/// Assemble a mapping from field references to known values.
-struct ARROW_EXPORT KnownFieldValues;
-ARROW_EXPORT
-Result<KnownFieldValues> ExtractKnownFieldValues(
- const Expression& guaranteed_true_predicate);
-
-/// \defgroup expression-passes Functions for modification of Expressions
-///
-/// @{
-///
-/// These transform bound expressions. Some transforms utilize a guarantee: an
-/// Expression which is guaranteed to evaluate to true. The guaranteed_true_predicate
-/// need not be bound, but canonicalization is currently deferred to producers of
-/// guarantees. For example, in order to be recognized as a guarantee on a field value,
-/// an Expression must be a call to "equal" with a field_ref LHS and a literal RHS.
-/// Flipping the arguments, using "is_in" with a single-element value_set, or other
-/// semantically identical Expressions will not be recognized.
-
-/// Weak canonicalization which establishes guarantees for subsequent passes. Even
-/// equivalent Expressions may result in different canonicalized expressions.
-/// TODO this could be a strong canonicalization
-ARROW_EXPORT
-Result<Expression> Canonicalize(Expression, ExecContext* = NULLPTR);
-
-/// Simplify Expressions based on literal arguments (for example, add(null, x) is always
-/// null, so the call is replaced with a null literal). Includes early evaluation of all
-/// calls whose arguments are entirely literal.
-ARROW_EXPORT
-Result<Expression> FoldConstants(Expression);
-
-/// Simplify an Expression by replacing references to fields with their known values.
-ARROW_EXPORT
-Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
- Expression);
-
-/// Simplify an expression by replacing subexpressions based on a guarantee:
-/// a boolean expression which is guaranteed to evaluate to `true`. For example, this is
-/// used to remove redundant function calls from a filter expression or to replace a
-/// reference to a constant-value field with a literal.
-ARROW_EXPORT
-Result<Expression> SimplifyWithGuarantee(Expression,
- const Expression& guaranteed_true_predicate);
-
-/// @}
-
-// Execution
-
-/// Create an ExecBatch suitable for passing to ExecuteScalarExpression() from a
-/// RecordBatch which may have missing or incorrectly ordered columns.
-/// Missing fields will be replaced with null scalars.
-ARROW_EXPORT Result<ExecBatch> MakeExecBatch(const Schema& full_schema,
- const Datum& partial);
-
-/// Execute a scalar expression against the provided state and input ExecBatch. This
-/// expression must be bound.
-ARROW_EXPORT
-Result<Datum> ExecuteScalarExpression(const Expression&, const ExecBatch& input,
- ExecContext* = NULLPTR);
-
-/// Convenience function for invoking against a RecordBatch
-ARROW_EXPORT
-Result<Datum> ExecuteScalarExpression(const Expression&, const Schema& full_schema,
- const Datum& partial_input, ExecContext* = NULLPTR);
-
-// Serialization
-
-ARROW_EXPORT
-Result<std::shared_ptr<Buffer>> Serialize(const Expression&);
-
-ARROW_EXPORT
-Result<Expression> Deserialize(std::shared_ptr<Buffer>);
-
-// Convenience aliases for factories
-
-ARROW_EXPORT Expression project(std::vector<Expression> values,
- std::vector<std::string> names);
-
-ARROW_EXPORT Expression equal(Expression lhs, Expression rhs);
-
-ARROW_EXPORT Expression not_equal(Expression lhs, Expression rhs);
-
-ARROW_EXPORT Expression less(Expression lhs, Expression rhs);
-
-ARROW_EXPORT Expression less_equal(Expression lhs, Expression rhs);
-
-ARROW_EXPORT Expression greater(Expression lhs, Expression rhs);
-
-ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs);
-
-ARROW_EXPORT Expression is_null(Expression lhs);
-
-ARROW_EXPORT Expression is_valid(Expression lhs);
-
-ARROW_EXPORT Expression and_(Expression lhs, Expression rhs);
-ARROW_EXPORT Expression and_(const std::vector<Expression>&);
-ARROW_EXPORT Expression or_(Expression lhs, Expression rhs);
-ARROW_EXPORT Expression or_(const std::vector<Expression>&);
-ARROW_EXPORT Expression not_(Expression operand);
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/compute/type_fwd.h"
+#include "arrow/datum.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/variant.h"
+
+namespace arrow {
+namespace compute {
+
+/// An unbound expression which maps a single Datum to another Datum.
+/// An expression is one of
+/// - A literal Datum.
+/// - A reference to a single (potentially nested) field of the input Datum.
+/// - A call to a compute function, with arguments specified by other Expressions.
+class ARROW_EXPORT Expression {
+ public:
+ struct Call {
+ std::string function_name;
+ std::vector<Expression> arguments;
+ std::shared_ptr<FunctionOptions> options;
+ // Cached hash value
+ size_t hash;
+
+ // post-Bind properties:
+ std::shared_ptr<Function> function;
+ const Kernel* kernel = NULLPTR;
+ std::shared_ptr<KernelState> kernel_state;
+ ValueDescr descr;
+
+ void ComputeHash();
+ };
+
+ std::string ToString() const;
+ bool Equals(const Expression& other) const;
+ size_t hash() const;
+ struct Hash {
+ size_t operator()(const Expression& expr) const { return expr.hash(); }
+ };
+
+ /// Bind this expression to the given input type, looking up Kernels and field types.
+ /// Some expression simplification may be performed and implicit casts will be inserted.
+ /// Any state necessary for execution will be initialized and returned.
+ Result<Expression> Bind(const ValueDescr& in, ExecContext* = NULLPTR) const;
+ Result<Expression> Bind(const Schema& in_schema, ExecContext* = NULLPTR) const;
+
+ // XXX someday
+ // Clone all KernelState in this bound expression. If any function referenced by this
+ // expression has mutable KernelState, it is not safe to execute or apply simplification
+ // passes to it (or copies of it!) from multiple threads. Cloning state produces new
+ // KernelStates where necessary to ensure that Expressions may be manipulated safely
+ // on multiple threads.
+ // Result<ExpressionState> CloneState() const;
+ // Status SetState(ExpressionState);
+
+ /// Return true if all of an expression's field references have an explicit ValueDescr
+ /// and all of its functions' kernels have been looked up.
+ bool IsBound() const;
+
+ /// Return true if this expression is composed only of Scalar literals, field
+ /// references, and calls to ScalarFunctions.
+ bool IsScalarExpression() const;
+
+ /// Return true if this expression is literal and entirely null.
+ bool IsNullLiteral() const;
+
+ /// Return true if this expression could evaluate to true.
+ bool IsSatisfiable() const;
+
+ // XXX someday
+ // Result<PipelineGraph> GetPipelines();
+
+ /// Access a Call or return nullptr if this expression is not a call
+ const Call* call() const;
+ /// Access a Datum or return nullptr if this expression is not a literal
+ const Datum* literal() const;
+ /// Access a FieldRef or return nullptr if this expression is not a field_ref
+ const FieldRef* field_ref() const;
+
+ /// The type and shape to which this expression will evaluate
+ ValueDescr descr() const;
+ std::shared_ptr<DataType> type() const { return descr().type; }
+ // XXX someday
+ // NullGeneralization::type nullable() const;
+
+ struct Parameter {
+ FieldRef ref;
+
+ // post-bind properties
+ ValueDescr descr;
+ int index;
+ };
+ const Parameter* parameter() const;
+
+ Expression() = default;
+ explicit Expression(Call call);
+ explicit Expression(Datum literal);
+ explicit Expression(Parameter parameter);
+
+ private:
+ using Impl = util::Variant<Datum, Parameter, Call>;
+ std::shared_ptr<Impl> impl_;
+
+ ARROW_EXPORT friend bool Identical(const Expression& l, const Expression& r);
+
+ ARROW_EXPORT friend void PrintTo(const Expression&, std::ostream*);
+};
+
+inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); }
+inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equals(r); }
+
+// Factories
+
+ARROW_EXPORT
+Expression literal(Datum lit);
+
+template <typename Arg>
+Expression literal(Arg&& arg) {
+ return literal(Datum(std::forward<Arg>(arg)));
+}
+
+ARROW_EXPORT
+Expression field_ref(FieldRef ref);
+
+ARROW_EXPORT
+Expression call(std::string function, std::vector<Expression> arguments,
+ std::shared_ptr<FunctionOptions> options = NULLPTR);
+
+template <typename Options, typename = typename std::enable_if<
+ std::is_base_of<FunctionOptions, Options>::value>::type>
+Expression call(std::string function, std::vector<Expression> arguments,
+ Options options) {
+ return call(std::move(function), std::move(arguments),
+ std::make_shared<Options>(std::move(options)));
+}
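+
+// Illustrative sketch: the overload above wraps a concrete FunctionOptions
+// subclass, e.g. (assuming the "make_struct" function, as used by project()):
+//
+//   auto expr = call("make_struct", {field_ref("a"), field_ref("b")},
+//                    compute::MakeStructOptions{{"a", "b"}});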
+
+/// Assemble a list of all fields referenced by an Expression at any depth.
+ARROW_EXPORT
+std::vector<FieldRef> FieldsInExpression(const Expression&);
+
+/// Check if the expression references any fields.
+ARROW_EXPORT
+bool ExpressionHasFieldRefs(const Expression&);
+
+/// Assemble a mapping from field references to known values.
+struct ARROW_EXPORT KnownFieldValues;
+ARROW_EXPORT
+Result<KnownFieldValues> ExtractKnownFieldValues(
+ const Expression& guaranteed_true_predicate);
+
+/// \defgroup expression-passes Functions for modification of Expressions
+///
+/// @{
+///
+/// These transform bound expressions. Some transforms utilize a guarantee: an
+/// Expression which is guaranteed to evaluate to true. The guaranteed_true_predicate
+/// need not be bound, but canonicalization is currently deferred to producers of
+/// guarantees. For example, in order to be recognized as a guarantee on a field value,
+/// an Expression must be a call to "equal" with a field_ref LHS and a literal RHS.
+/// Flipping the arguments, using "is_in" with a single-element value_set, or other
+/// semantically identical Expressions will not be recognized.
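+///
+/// For example (illustrative):
+///
+///   equal(field_ref("a"), literal(1))   // recognized as a guarantee on "a"
+///   equal(literal(1), field_ref("a"))   // flipped arguments: not recognized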
+
+/// Weak canonicalization which establishes guarantees for subsequent passes. Even
+/// equivalent Expressions may result in different canonicalized expressions.
+/// TODO this could be a strong canonicalization
+ARROW_EXPORT
+Result<Expression> Canonicalize(Expression, ExecContext* = NULLPTR);
+
+/// Simplify Expressions based on literal arguments (for example, add(null, x) is always
+/// null, so the call is replaced with a null literal). Includes early evaluation of all
+/// calls whose arguments are entirely literal.
+ARROW_EXPORT
+Result<Expression> FoldConstants(Expression);
+
+/// Simplify an Expression by replacing references to fields with their known values.
+ARROW_EXPORT
+Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
+ Expression);
+
+/// Simplify an expression by replacing subexpressions based on a guarantee:
+/// a boolean expression which is guaranteed to evaluate to `true`. For example, this is
+/// used to remove redundant function calls from a filter expression or to replace a
+/// reference to a constant-value field with a literal.
+ARROW_EXPORT
+Result<Expression> SimplifyWithGuarantee(Expression,
+ const Expression& guaranteed_true_predicate);
+
+/// @}
+
+// Execution
+
+/// Create an ExecBatch suitable for passing to ExecuteScalarExpression() from a
+/// RecordBatch which may have missing or incorrectly ordered columns.
+/// Missing fields will be replaced with null scalars.
+ARROW_EXPORT Result<ExecBatch> MakeExecBatch(const Schema& full_schema,
+ const Datum& partial);
+
+/// Execute a scalar expression against the provided state and input ExecBatch. This
+/// expression must be bound.
+ARROW_EXPORT
+Result<Datum> ExecuteScalarExpression(const Expression&, const ExecBatch& input,
+ ExecContext* = NULLPTR);
+
+/// Convenience function for invoking against a RecordBatch
+ARROW_EXPORT
+Result<Datum> ExecuteScalarExpression(const Expression&, const Schema& full_schema,
+ const Datum& partial_input, ExecContext* = NULLPTR);
+
+// Serialization
+
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> Serialize(const Expression&);
+
+ARROW_EXPORT
+Result<Expression> Deserialize(std::shared_ptr<Buffer>);
+
+// Convenience aliases for factories
+
+ARROW_EXPORT Expression project(std::vector<Expression> values,
+ std::vector<std::string> names);
+
+ARROW_EXPORT Expression equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression not_equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression less(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression less_equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression greater(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression is_null(Expression lhs);
+
+ARROW_EXPORT Expression is_valid(Expression lhs);
+
+ARROW_EXPORT Expression and_(Expression lhs, Expression rhs);
+ARROW_EXPORT Expression and_(const std::vector<Expression>&);
+ARROW_EXPORT Expression or_(Expression lhs, Expression rhs);
+ARROW_EXPORT Expression or_(const std::vector<Expression>&);
+ARROW_EXPORT Expression not_(Expression operand);
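+
+// Illustrative composition sketch (assumed field names): these factories nest
+// to build predicates, e.g.
+//
+//   auto pred = and_(greater(field_ref("x"), literal(3)),
+//                    not_(is_null(field_ref("y"))));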
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h
index abcb99bc576..dc38924d932 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/expression_internal.h
@@ -1,336 +1,336 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/expression.h"
-
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "arrow/compute/api_scalar.h"
-#include "arrow/compute/cast.h"
-#include "arrow/compute/registry.h"
-#include "arrow/record_batch.h"
-#include "arrow/table.h"
-#include "arrow/util/logging.h"
-
-namespace arrow {
-
-using internal::checked_cast;
-
-namespace compute {
-
-struct KnownFieldValues {
- std::unordered_map<FieldRef, Datum, FieldRef::Hash> map;
-};
-
-inline const Expression::Call* CallNotNull(const Expression& expr) {
- auto call = expr.call();
- DCHECK_NE(call, nullptr);
- return call;
-}
-
-inline std::vector<ValueDescr> GetDescriptors(const std::vector<Expression>& exprs) {
- std::vector<ValueDescr> descrs(exprs.size());
- for (size_t i = 0; i < exprs.size(); ++i) {
- DCHECK(exprs[i].IsBound());
- descrs[i] = exprs[i].descr();
- }
- return descrs;
-}
-
-inline std::vector<ValueDescr> GetDescriptors(const std::vector<Datum>& values) {
- std::vector<ValueDescr> descrs(values.size());
- for (size_t i = 0; i < values.size(); ++i) {
- descrs[i] = values[i].descr();
- }
- return descrs;
-}
-
-struct Comparison {
- enum type {
- NA = 0,
- EQUAL = 1,
- LESS = 2,
- GREATER = 4,
- NOT_EQUAL = LESS | GREATER,
- LESS_EQUAL = LESS | EQUAL,
- GREATER_EQUAL = GREATER | EQUAL,
- };
-
- static const type* Get(const std::string& function) {
- static std::unordered_map<std::string, type> map{
- {"equal", EQUAL}, {"not_equal", NOT_EQUAL},
- {"less", LESS}, {"less_equal", LESS_EQUAL},
- {"greater", GREATER}, {"greater_equal", GREATER_EQUAL},
- };
-
- auto it = map.find(function);
- return it != map.end() ? &it->second : nullptr;
- }
-
- static const type* Get(const Expression& expr) {
- if (auto call = expr.call()) {
- return Comparison::Get(call->function_name);
- }
- return nullptr;
- }
-
- // Execute a simple Comparison between scalars
- static Result<type> Execute(Datum l, Datum r) {
- if (!l.is_scalar() || !r.is_scalar()) {
- return Status::Invalid("Cannot Execute Comparison on non-scalars");
- }
-
- std::vector<Datum> arguments{std::move(l), std::move(r)};
-
- ARROW_ASSIGN_OR_RAISE(auto equal, compute::CallFunction("equal", arguments));
-
- if (!equal.scalar()->is_valid) return NA;
- if (equal.scalar_as<BooleanScalar>().value) return EQUAL;
-
- ARROW_ASSIGN_OR_RAISE(auto less, compute::CallFunction("less", arguments));
-
- if (!less.scalar()->is_valid) return NA;
- return less.scalar_as<BooleanScalar>().value ? LESS : GREATER;
- }
-
- // Given an Expression wrapped in casts which preserve ordering
- // (for example, cast(field_ref("i16"), to_type=int32())), unwrap the inner Expression.
- // This is used to destructure implicitly cast field_refs during Expression
- // simplification.
- static const Expression& StripOrderPreservingCasts(const Expression& expr) {
- auto call = expr.call();
- if (!call) return expr;
- if (call->function_name != "cast") return expr;
-
- const Expression& from = call->arguments[0];
-
- auto from_id = from.type()->id();
- auto to_id = expr.type()->id();
-
- if (is_floating(to_id)) {
- if (is_integer(from_id) || is_floating(from_id)) {
- return StripOrderPreservingCasts(from);
- }
- return expr;
- }
-
- if (is_unsigned_integer(to_id)) {
- if (is_unsigned_integer(from_id) && bit_width(to_id) >= bit_width(from_id)) {
- return StripOrderPreservingCasts(from);
- }
- return expr;
- }
-
- if (is_signed_integer(to_id)) {
- if (is_integer(from_id) && bit_width(to_id) >= bit_width(from_id)) {
- return StripOrderPreservingCasts(from);
- }
- return expr;
- }
-
- return expr;
- }
-
- static type GetFlipped(type op) {
- switch (op) {
- case NA:
- return NA;
- case EQUAL:
- return EQUAL;
- case LESS:
- return GREATER;
- case GREATER:
- return LESS;
- case NOT_EQUAL:
- return NOT_EQUAL;
- case LESS_EQUAL:
- return GREATER_EQUAL;
- case GREATER_EQUAL:
- return LESS_EQUAL;
- }
- DCHECK(false);
- return NA;
- }
-
- static std::string GetName(type op) {
- switch (op) {
- case NA:
- break;
- case EQUAL:
- return "equal";
- case LESS:
- return "less";
- case GREATER:
- return "greater";
- case NOT_EQUAL:
- return "not_equal";
- case LESS_EQUAL:
- return "less_equal";
- case GREATER_EQUAL:
- return "greater_equal";
- }
- return "na";
- }
-
- static std::string GetOp(type op) {
- switch (op) {
- case NA:
- DCHECK(false) << "unreachable";
- break;
- case EQUAL:
- return "==";
- case LESS:
- return "<";
- case GREATER:
- return ">";
- case NOT_EQUAL:
- return "!=";
- case LESS_EQUAL:
- return "<=";
- case GREATER_EQUAL:
- return ">=";
- }
- DCHECK(false);
- return "";
- }
-};
-
-inline const compute::CastOptions* GetCastOptions(const Expression::Call& call) {
- if (call.function_name != "cast") return nullptr;
- return checked_cast<const compute::CastOptions*>(call.options.get());
-}
-
-inline bool IsSetLookup(const std::string& function) {
- return function == "is_in" || function == "index_in";
-}
-
-inline const compute::MakeStructOptions* GetMakeStructOptions(
- const Expression::Call& call) {
- if (call.function_name != "make_struct") return nullptr;
- return checked_cast<const compute::MakeStructOptions*>(call.options.get());
-}
-
-/// A helper for unboxing an Expression composed of associative function calls.
-/// Such expressions can frequently be rearranged to a semantically equivalent
-/// expression for more optimal execution or more straightforward manipulation.
-/// For example, (a + ((b + 3) + 4)) is equivalent to (((4 + 3) + a) + b) and the latter
-/// can be trivially constant-folded to ((7 + a) + b).
-struct FlattenedAssociativeChain {
- /// True if a chain was already a left fold.
- bool was_left_folded = true;
-
- /// All "branch" expressions in a flattened chain. For example given (a + ((b + 3) + 4))
- /// exprs would be [(a + ((b + 3) + 4)), ((b + 3) + 4), (b + 3)]
- std::vector<Expression> exprs;
-
- /// All "leaf" expressions in a flattened chain. For example given (a + ((b + 3) + 4))
- /// the fringe would be [a, b, 3, 4]
- std::vector<Expression> fringe;
-
- explicit FlattenedAssociativeChain(Expression expr) : exprs{std::move(expr)} {
- auto call = CallNotNull(exprs.back());
- fringe = call->arguments;
-
- auto it = fringe.begin();
-
- while (it != fringe.end()) {
- auto sub_call = it->call();
- if (!sub_call || sub_call->function_name != call->function_name) {
- ++it;
- continue;
- }
-
- if (it != fringe.begin()) {
- was_left_folded = false;
- }
-
- exprs.push_back(std::move(*it));
- it = fringe.erase(it);
-
- auto index = it - fringe.begin();
- fringe.insert(it, sub_call->arguments.begin(), sub_call->arguments.end());
- it = fringe.begin() + index;
- // NB: no increment so we hit sub_call's first argument next iteration
- }
-
- DCHECK(std::all_of(exprs.begin(), exprs.end(), [](const Expression& expr) {
- return CallNotNull(expr)->options == nullptr;
- }));
- }
-};
-
-inline Result<std::shared_ptr<compute::Function>> GetFunction(
- const Expression::Call& call, compute::ExecContext* exec_context) {
- if (call.function_name != "cast") {
- return exec_context->func_registry()->GetFunction(call.function_name);
- }
- // XXX this special case is strange; why not make "cast" a ScalarFunction?
- const auto& to_type = checked_cast<const compute::CastOptions&>(*call.options).to_type;
- return compute::GetCastFunction(to_type);
-}
-
-/// Modify an Expression with pre-order and post-order visitation.
-/// `pre` will be invoked on each Expression. `pre` will visit Calls before their
-/// arguments, `post_call` will visit Calls (and no other Expressions) after their
-/// arguments. Visitors should return an expression Identical to their input to indicate
-/// no change; this prevents unnecessary reconstruction in the common case where no
-/// modification is possible or necessary.
-///
-/// If an argument was modified, `post_call` visits a reconstructed Call with the modified
-/// arguments but also receives a pointer to the unmodified Expression as a second
-/// argument. If no arguments were modified the unmodified Expression* will be nullptr.
-template <typename PreVisit, typename PostVisitCall>
-Result<Expression> Modify(Expression expr, const PreVisit& pre,
- const PostVisitCall& post_call) {
- ARROW_ASSIGN_OR_RAISE(expr, Result<Expression>(pre(std::move(expr))));
-
- auto call = expr.call();
- if (!call) return expr;
-
- bool at_least_one_modified = false;
- std::vector<Expression> modified_arguments;
-
- for (size_t i = 0; i < call->arguments.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(auto modified_argument,
- Modify(call->arguments[i], pre, post_call));
-
- if (Identical(modified_argument, call->arguments[i])) {
- continue;
- }
-
- if (!at_least_one_modified) {
- modified_arguments = call->arguments;
- at_least_one_modified = true;
- }
-
- modified_arguments[i] = std::move(modified_argument);
- }
-
- if (at_least_one_modified) {
- // reconstruct the call expression with the modified arguments
- auto modified_call = *call;
- modified_call.arguments = std::move(modified_arguments);
- return post_call(Expression(std::move(modified_call)), &expr);
- }
-
- return post_call(std::move(expr), nullptr);
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/expression.h"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/cast.h"
+#include "arrow/compute/registry.h"
+#include "arrow/record_batch.h"
+#include "arrow/table.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace compute {
+
+struct KnownFieldValues {
+ std::unordered_map<FieldRef, Datum, FieldRef::Hash> map;
+};
+
+inline const Expression::Call* CallNotNull(const Expression& expr) {
+ auto call = expr.call();
+ DCHECK_NE(call, nullptr);
+ return call;
+}
+
+inline std::vector<ValueDescr> GetDescriptors(const std::vector<Expression>& exprs) {
+ std::vector<ValueDescr> descrs(exprs.size());
+ for (size_t i = 0; i < exprs.size(); ++i) {
+ DCHECK(exprs[i].IsBound());
+ descrs[i] = exprs[i].descr();
+ }
+ return descrs;
+}
+
+inline std::vector<ValueDescr> GetDescriptors(const std::vector<Datum>& values) {
+ std::vector<ValueDescr> descrs(values.size());
+ for (size_t i = 0; i < values.size(); ++i) {
+ descrs[i] = values[i].descr();
+ }
+ return descrs;
+}
+
+struct Comparison {
+ enum type {
+ NA = 0,
+ EQUAL = 1,
+ LESS = 2,
+ GREATER = 4,
+ NOT_EQUAL = LESS | GREATER,
+ LESS_EQUAL = LESS | EQUAL,
+ GREATER_EQUAL = GREATER | EQUAL,
+ };
+
+ static const type* Get(const std::string& function) {
+ static std::unordered_map<std::string, type> map{
+ {"equal", EQUAL}, {"not_equal", NOT_EQUAL},
+ {"less", LESS}, {"less_equal", LESS_EQUAL},
+ {"greater", GREATER}, {"greater_equal", GREATER_EQUAL},
+ };
+
+ auto it = map.find(function);
+ return it != map.end() ? &it->second : nullptr;
+ }
+
+ static const type* Get(const Expression& expr) {
+ if (auto call = expr.call()) {
+ return Comparison::Get(call->function_name);
+ }
+ return nullptr;
+ }
+
+ // Execute a simple Comparison between scalars
+ static Result<type> Execute(Datum l, Datum r) {
+ if (!l.is_scalar() || !r.is_scalar()) {
+ return Status::Invalid("Cannot Execute Comparison on non-scalars");
+ }
+
+ std::vector<Datum> arguments{std::move(l), std::move(r)};
+
+ ARROW_ASSIGN_OR_RAISE(auto equal, compute::CallFunction("equal", arguments));
+
+ if (!equal.scalar()->is_valid) return NA;
+ if (equal.scalar_as<BooleanScalar>().value) return EQUAL;
+
+ ARROW_ASSIGN_OR_RAISE(auto less, compute::CallFunction("less", arguments));
+
+ if (!less.scalar()->is_valid) return NA;
+ return less.scalar_as<BooleanScalar>().value ? LESS : GREATER;
+ }
+
+ // Given an Expression wrapped in casts which preserve ordering
+ // (for example, cast(field_ref("i16"), to_type=int32())), unwrap the inner Expression.
+ // This is used to destructure implicitly cast field_refs during Expression
+ // simplification.
+ static const Expression& StripOrderPreservingCasts(const Expression& expr) {
+ auto call = expr.call();
+ if (!call) return expr;
+ if (call->function_name != "cast") return expr;
+
+ const Expression& from = call->arguments[0];
+
+ auto from_id = from.type()->id();
+ auto to_id = expr.type()->id();
+
+ if (is_floating(to_id)) {
+ if (is_integer(from_id) || is_floating(from_id)) {
+ return StripOrderPreservingCasts(from);
+ }
+ return expr;
+ }
+
+ if (is_unsigned_integer(to_id)) {
+ if (is_unsigned_integer(from_id) && bit_width(to_id) >= bit_width(from_id)) {
+ return StripOrderPreservingCasts(from);
+ }
+ return expr;
+ }
+
+ if (is_signed_integer(to_id)) {
+ if (is_integer(from_id) && bit_width(to_id) >= bit_width(from_id)) {
+ return StripOrderPreservingCasts(from);
+ }
+ return expr;
+ }
+
+ return expr;
+ }
+
+ static type GetFlipped(type op) {
+ switch (op) {
+ case NA:
+ return NA;
+ case EQUAL:
+ return EQUAL;
+ case LESS:
+ return GREATER;
+ case GREATER:
+ return LESS;
+ case NOT_EQUAL:
+ return NOT_EQUAL;
+ case LESS_EQUAL:
+ return GREATER_EQUAL;
+ case GREATER_EQUAL:
+ return LESS_EQUAL;
+ }
+ DCHECK(false);
+ return NA;
+ }
+
+ static std::string GetName(type op) {
+ switch (op) {
+ case NA:
+ break;
+ case EQUAL:
+ return "equal";
+ case LESS:
+ return "less";
+ case GREATER:
+ return "greater";
+ case NOT_EQUAL:
+ return "not_equal";
+ case LESS_EQUAL:
+ return "less_equal";
+ case GREATER_EQUAL:
+ return "greater_equal";
+ }
+ return "na";
+ }
+
+ static std::string GetOp(type op) {
+ switch (op) {
+ case NA:
+ DCHECK(false) << "unreachable";
+ break;
+ case EQUAL:
+ return "==";
+ case LESS:
+ return "<";
+ case GREATER:
+ return ">";
+ case NOT_EQUAL:
+ return "!=";
+ case LESS_EQUAL:
+ return "<=";
+ case GREATER_EQUAL:
+ return ">=";
+ }
+ DCHECK(false);
+ return "";
+ }
+};
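+
+// Illustrative note: the enum above encodes compound comparisons as unions of
+// EQUAL/LESS/GREATER, so implication checks reduce to bitwise tests, e.g.
+//
+//   *Comparison::Get("less_equal") == (Comparison::LESS | Comparison::EQUAL)
+//   Comparison::GetFlipped(Comparison::LESS_EQUAL) == Comparison::GREATER_EQUAL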
+
+inline const compute::CastOptions* GetCastOptions(const Expression::Call& call) {
+ if (call.function_name != "cast") return nullptr;
+ return checked_cast<const compute::CastOptions*>(call.options.get());
+}
+
+inline bool IsSetLookup(const std::string& function) {
+ return function == "is_in" || function == "index_in";
+}
+
+inline const compute::MakeStructOptions* GetMakeStructOptions(
+ const Expression::Call& call) {
+ if (call.function_name != "make_struct") return nullptr;
+ return checked_cast<const compute::MakeStructOptions*>(call.options.get());
+}
+
+/// A helper for unboxing an Expression composed of associative function calls.
+/// Such expressions can frequently be rearranged to a semantically equivalent
+/// expression for more optimal execution or more straightforward manipulation.
+/// For example, (a + ((b + 3) + 4)) is equivalent to (((4 + 3) + a) + b) and the latter
+/// can be trivially constant-folded to ((7 + a) + b).
+struct FlattenedAssociativeChain {
+ /// True if a chain was already a left fold.
+ bool was_left_folded = true;
+
+ /// All "branch" expressions in a flattened chain. For example given (a + ((b + 3) + 4))
+ /// exprs would be [(a + ((b + 3) + 4)), ((b + 3) + 4), (b + 3)]
+ std::vector<Expression> exprs;
+
+ /// All "leaf" expressions in a flattened chain. For example given (a + ((b + 3) + 4))
+ /// the fringe would be [a, b, 3, 4]
+ std::vector<Expression> fringe;
+
+ explicit FlattenedAssociativeChain(Expression expr) : exprs{std::move(expr)} {
+ auto call = CallNotNull(exprs.back());
+ fringe = call->arguments;
+
+ auto it = fringe.begin();
+
+ while (it != fringe.end()) {
+ auto sub_call = it->call();
+ if (!sub_call || sub_call->function_name != call->function_name) {
+ ++it;
+ continue;
+ }
+
+ if (it != fringe.begin()) {
+ was_left_folded = false;
+ }
+
+ exprs.push_back(std::move(*it));
+ it = fringe.erase(it);
+
+ auto index = it - fringe.begin();
+ fringe.insert(it, sub_call->arguments.begin(), sub_call->arguments.end());
+ it = fringe.begin() + index;
+ // NB: no increment so we hit sub_call's first argument next iteration
+ }
+
+ DCHECK(std::all_of(exprs.begin(), exprs.end(), [](const Expression& expr) {
+ return CallNotNull(expr)->options == nullptr;
+ }));
+ }
+};
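+
+// Illustrative usage sketch (assumed expression `expr` = a + ((b + 3) + 4)):
+//
+//   FlattenedAssociativeChain chain(expr);
+//   // chain.was_left_folded == false
+//   // chain.fringe == {a, b, 3, 4}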
+
+inline Result<std::shared_ptr<compute::Function>> GetFunction(
+ const Expression::Call& call, compute::ExecContext* exec_context) {
+ if (call.function_name != "cast") {
+ return exec_context->func_registry()->GetFunction(call.function_name);
+ }
+ // XXX this special case is strange; why not make "cast" a ScalarFunction?
+ const auto& to_type = checked_cast<const compute::CastOptions&>(*call.options).to_type;
+ return compute::GetCastFunction(to_type);
+}
+
+/// Modify an Expression with pre-order and post-order visitation.
+/// `pre` will be invoked on each Expression. `pre` will visit Calls before their
+/// arguments, `post_call` will visit Calls (and no other Expressions) after their
+/// arguments. Visitors should return an expression Identical to their input to indicate
+/// no change; this prevents unnecessary reconstruction in the common case where no
+/// modification is possible or necessary.
+///
+/// If an argument was modified, `post_call` visits a reconstructed Call with the modified
+/// arguments but also receives a pointer to the unmodified Expression as a second
+/// argument. If no arguments were modified the unmodified Expression* will be nullptr.
+template <typename PreVisit, typename PostVisitCall>
+Result<Expression> Modify(Expression expr, const PreVisit& pre,
+ const PostVisitCall& post_call) {
+ ARROW_ASSIGN_OR_RAISE(expr, Result<Expression>(pre(std::move(expr))));
+
+ auto call = expr.call();
+ if (!call) return expr;
+
+ bool at_least_one_modified = false;
+ std::vector<Expression> modified_arguments;
+
+ for (size_t i = 0; i < call->arguments.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(auto modified_argument,
+ Modify(call->arguments[i], pre, post_call));
+
+ if (Identical(modified_argument, call->arguments[i])) {
+ continue;
+ }
+
+ if (!at_least_one_modified) {
+ modified_arguments = call->arguments;
+ at_least_one_modified = true;
+ }
+
+ modified_arguments[i] = std::move(modified_argument);
+ }
+
+ if (at_least_one_modified) {
+ // reconstruct the call expression with the modified arguments
+ auto modified_call = *call;
+ modified_call.arguments = std::move(modified_arguments);
+ return post_call(Expression(std::move(modified_call)), &expr);
+ }
+
+ return post_call(std::move(expr), nullptr);
+}
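+
+// Illustrative sketch (assumed caller): a post-order rewrite can pass an
+// identity pre-visitor, as DirectComparisonSimplification does in
+// expression.cc:
+//
+//   Modify(
+//       std::move(expr), [](Expression e) { return e; },
+//       [](Expression e, ...) -> Result<Expression> { return e; });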
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc
index 01de727978f..7a5b0be9990 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.cc
@@ -1,268 +1,268 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/key_compare.h"
-
-#include <algorithm>
-#include <cstdint>
-
-#include "arrow/compute/exec/util.h"
-#include "arrow/util/ubsan.h"
-
-namespace arrow {
-namespace compute {
-
-void KeyCompare::CompareRows(uint32_t num_rows_to_compare,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows,
- uint16_t* out_sel_left_maybe_same,
- const KeyEncoder::KeyRowArray& rows_left,
- const KeyEncoder::KeyRowArray& rows_right) {
- ARROW_DCHECK(rows_left.metadata().is_compatible(rows_right.metadata()));
-
- if (num_rows_to_compare == 0) {
- *out_num_rows = 0;
- return;
- }
-
- // Allocate temporary byte and bit vectors
- auto bytevector_holder =
- util::TempVectorHolder<uint8_t>(ctx->stack, num_rows_to_compare);
- auto bitvector_holder =
- util::TempVectorHolder<uint8_t>(ctx->stack, num_rows_to_compare);
-
- uint8_t* match_bytevector = bytevector_holder.mutable_data();
- uint8_t* match_bitvector = bitvector_holder.mutable_data();
-
- // All comparison functions called here update the match byte vector
- // (ANDing it with their comparison result) instead of overwriting it.
- memset(match_bytevector, 0xff, num_rows_to_compare);
-
- if (rows_left.metadata().is_fixed_length) {
- CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
- match_bytevector, ctx, rows_left.metadata().fixed_length,
- rows_left.data(1), rows_right.data(1));
- } else {
- CompareVaryingLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
- match_bytevector, ctx, rows_left.data(2), rows_right.data(2),
- rows_left.offsets(), rows_right.offsets());
- }
-
- // CompareFixedLength can be used to compare nulls as well
- bool nulls_present = rows_left.has_any_nulls(ctx) || rows_right.has_any_nulls(ctx);
- if (nulls_present) {
- CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
- match_bytevector, ctx,
- rows_left.metadata().null_masks_bytes_per_row,
- rows_left.null_masks(), rows_right.null_masks());
- }
-
- util::BitUtil::bytes_to_bits(ctx->hardware_flags, num_rows_to_compare, match_bytevector,
- match_bitvector);
- if (sel_left_maybe_null) {
- int out_num_rows_int;
- util::BitUtil::bits_filter_indexes(0, ctx->hardware_flags, num_rows_to_compare,
- match_bitvector, sel_left_maybe_null,
- &out_num_rows_int, out_sel_left_maybe_same);
- *out_num_rows = out_num_rows_int;
- } else {
- int out_num_rows_int;
- util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, num_rows_to_compare,
- match_bitvector, &out_num_rows_int,
- out_sel_left_maybe_same);
- *out_num_rows = out_num_rows_int;
- }
-}
-
-void KeyCompare::CompareFixedLength(uint32_t num_rows_to_compare,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector,
- KeyEncoder::KeyEncoderContext* ctx,
- uint32_t fixed_length, const uint8_t* rows_left,
- const uint8_t* rows_right) {
- bool use_selection = (sel_left_maybe_null != nullptr);
-
- uint32_t num_rows_already_processed = 0;
-
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2() && !use_selection) {
- // Choose between up-to-8B length, up-to-16B length and any size versions
- if (fixed_length <= 8) {
- num_rows_already_processed = CompareFixedLength_UpTo8B_avx2(
- num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length,
- rows_left, rows_right);
- } else if (fixed_length <= 16) {
- num_rows_already_processed = CompareFixedLength_UpTo16B_avx2(
- num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length,
- rows_left, rows_right);
- } else {
- num_rows_already_processed =
- CompareFixedLength_avx2(num_rows_to_compare, left_to_right_map,
- match_bytevector, fixed_length, rows_left, rows_right);
- }
- }
-#endif
-
- typedef void (*CompareFixedLengthImp_t)(uint32_t, uint32_t, const uint16_t*,
- const uint32_t*, uint8_t*, uint32_t,
- const uint8_t*, const uint8_t*);
- static const CompareFixedLengthImp_t CompareFixedLengthImp_fn[] = {
- CompareFixedLengthImp<false, 1>, CompareFixedLengthImp<false, 2>,
- CompareFixedLengthImp<false, 0>, CompareFixedLengthImp<true, 1>,
- CompareFixedLengthImp<true, 2>, CompareFixedLengthImp<true, 0>};
- int dispatch_const = (use_selection ? 3 : 0) +
- ((fixed_length <= 8) ? 0 : ((fixed_length <= 16) ? 1 : 2));
- CompareFixedLengthImp_fn[dispatch_const](
- num_rows_already_processed, num_rows_to_compare, sel_left_maybe_null,
- left_to_right_map, match_bytevector, fixed_length, rows_left, rows_right);
-}
-
-template <bool use_selection, int num_64bit_words>
-void KeyCompare::CompareFixedLengthImp(uint32_t num_rows_already_processed,
- uint32_t num_rows,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector, uint32_t length,
- const uint8_t* rows_left,
- const uint8_t* rows_right) {
- // Key length (for encoded key) has to be non-zero
- ARROW_DCHECK(length > 0);
-
- // Non-zero length guarantees no underflow
- int32_t num_loops_less_one = (static_cast<int32_t>(length) + 7) / 8 - 1;
-
- // Length remaining in last loop can only be zero for input length equal to zero
- uint32_t length_remaining_last_loop = length - num_loops_less_one * 8;
- uint64_t tail_mask = (~0ULL) >> (8 * (8 - length_remaining_last_loop));
-
- for (uint32_t id_input = num_rows_already_processed; id_input < num_rows; ++id_input) {
- uint32_t irow_left = use_selection ? sel_left_maybe_null[id_input] : id_input;
- uint32_t irow_right = left_to_right_map[irow_left];
- uint32_t begin_left = length * irow_left;
- uint32_t begin_right = length * irow_right;
- const uint64_t* key_left_ptr =
- reinterpret_cast<const uint64_t*>(rows_left + begin_left);
- const uint64_t* key_right_ptr =
- reinterpret_cast<const uint64_t*>(rows_right + begin_right);
- uint64_t result_or = 0ULL;
- int32_t istripe = 0;
-
- // Specializations for keys of up to 8 bytes and of between 9 and 16 bytes
- // avoid the inner loop over 64-bit words for short keys.
- //
- // Template argument 0 means arbitrarily many 64-bit words,
- // 1 means up to one word and 2 means up to two words.
- //
- if (num_64bit_words == 0) {
- for (; istripe < num_loops_less_one; ++istripe) {
- uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
- uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
- result_or |= (key_left ^ key_right);
- }
- } else if (num_64bit_words == 2) {
- uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
- uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
- result_or |= (key_left ^ key_right);
- ++istripe;
- }
-
- uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
- uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
- result_or |= (tail_mask & (key_left ^ key_right));
-
- int result = (result_or == 0 ? 0xff : 0);
- match_bytevector[id_input] &= result;
- }
-}
-
-void KeyCompare::CompareVaryingLength(uint32_t num_rows_to_compare,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector,
- KeyEncoder::KeyEncoderContext* ctx,
- const uint8_t* rows_left, const uint8_t* rows_right,
- const uint32_t* offsets_left,
- const uint32_t* offsets_right) {
- bool use_selection = (sel_left_maybe_null != nullptr);
-
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2() && !use_selection) {
- CompareVaryingLength_avx2(num_rows_to_compare, left_to_right_map, match_bytevector,
- rows_left, rows_right, offsets_left, offsets_right);
- } else {
-#endif
- if (use_selection) {
- CompareVaryingLengthImp<true>(num_rows_to_compare, sel_left_maybe_null,
- left_to_right_map, match_bytevector, rows_left,
- rows_right, offsets_left, offsets_right);
- } else {
- CompareVaryingLengthImp<false>(num_rows_to_compare, sel_left_maybe_null,
- left_to_right_map, match_bytevector, rows_left,
- rows_right, offsets_left, offsets_right);
- }
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-}
-
-template <bool use_selection>
-void KeyCompare::CompareVaryingLengthImp(
- uint32_t num_rows, const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map, uint8_t* match_bytevector,
- const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left,
- const uint32_t* offsets_right) {
- static const uint64_t tail_masks[] = {
- 0x0000000000000000ULL, 0x00000000000000ffULL, 0x000000000000ffffULL,
- 0x0000000000ffffffULL, 0x00000000ffffffffULL, 0x000000ffffffffffULL,
- 0x0000ffffffffffffULL, 0x00ffffffffffffffULL, 0xffffffffffffffffULL};
- for (uint32_t i = 0; i < num_rows; ++i) {
- uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i;
- uint32_t irow_right = left_to_right_map[irow_left];
- uint32_t begin_left = offsets_left[irow_left];
- uint32_t begin_right = offsets_right[irow_right];
- uint32_t length_left = offsets_left[irow_left + 1] - begin_left;
- uint32_t length_right = offsets_right[irow_right + 1] - begin_right;
- uint32_t length = std::min(length_left, length_right);
- const uint64_t* key_left_ptr =
- reinterpret_cast<const uint64_t*>(rows_left + begin_left);
- const uint64_t* key_right_ptr =
- reinterpret_cast<const uint64_t*>(rows_right + begin_right);
- uint64_t result_or = 0;
- int32_t istripe;
- // length can be zero
- for (istripe = 0; istripe < (static_cast<int32_t>(length) + 7) / 8 - 1; ++istripe) {
- uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
- uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
- result_or |= (key_left ^ key_right);
- }
-
- uint32_t length_remaining = length - static_cast<uint32_t>(istripe) * 8;
- uint64_t tail_mask = tail_masks[length_remaining];
-
- uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
- uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
- result_or |= (tail_mask & (key_left ^ key_right));
-
- int result = (result_or == 0 ? 0xff : 0);
- match_bytevector[i] &= result;
- }
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/key_compare.h"
+
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/compute/exec/util.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace compute {
+
+void KeyCompare::CompareRows(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows,
+ uint16_t* out_sel_left_maybe_same,
+ const KeyEncoder::KeyRowArray& rows_left,
+ const KeyEncoder::KeyRowArray& rows_right) {
+ ARROW_DCHECK(rows_left.metadata().is_compatible(rows_right.metadata()));
+
+ if (num_rows_to_compare == 0) {
+ *out_num_rows = 0;
+ return;
+ }
+
+ // Allocate temporary byte and bit vectors
+ auto bytevector_holder =
+ util::TempVectorHolder<uint8_t>(ctx->stack, num_rows_to_compare);
+ auto bitvector_holder =
+ util::TempVectorHolder<uint8_t>(ctx->stack, num_rows_to_compare);
+
+ uint8_t* match_bytevector = bytevector_holder.mutable_data();
+ uint8_t* match_bitvector = bitvector_holder.mutable_data();
+
+ // All comparison functions called here will update match byte vector
+ // (AND it with comparison result) instead of overwriting it.
+ memset(match_bytevector, 0xff, num_rows_to_compare);
+
+ if (rows_left.metadata().is_fixed_length) {
+ CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
+ match_bytevector, ctx, rows_left.metadata().fixed_length,
+ rows_left.data(1), rows_right.data(1));
+ } else {
+ CompareVaryingLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
+ match_bytevector, ctx, rows_left.data(2), rows_right.data(2),
+ rows_left.offsets(), rows_right.offsets());
+ }
+
+ // CompareFixedLength can be used to compare nulls as well
+ bool nulls_present = rows_left.has_any_nulls(ctx) || rows_right.has_any_nulls(ctx);
+ if (nulls_present) {
+ CompareFixedLength(num_rows_to_compare, sel_left_maybe_null, left_to_right_map,
+ match_bytevector, ctx,
+ rows_left.metadata().null_masks_bytes_per_row,
+ rows_left.null_masks(), rows_right.null_masks());
+ }
+
+ util::BitUtil::bytes_to_bits(ctx->hardware_flags, num_rows_to_compare, match_bytevector,
+ match_bitvector);
+ if (sel_left_maybe_null) {
+ int out_num_rows_int;
+ util::BitUtil::bits_filter_indexes(0, ctx->hardware_flags, num_rows_to_compare,
+ match_bitvector, sel_left_maybe_null,
+ &out_num_rows_int, out_sel_left_maybe_same);
+ *out_num_rows = out_num_rows_int;
+ } else {
+ int out_num_rows_int;
+ util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, num_rows_to_compare,
+ match_bitvector, &out_num_rows_int,
+ out_sel_left_maybe_same);
+ *out_num_rows = out_num_rows_int;
+ }
+}
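In plain scalar terms the pipeline above reduces to: every comparison stage ANDs its verdict into a byte vector (0xff while a row still matches), and the final pass emits the indexes of rows whose byte ended up zero. A minimal self-contained sketch of that contract, leaving out the util::BitUtil bit/byte conversions and the AVX2 paths (the helper name is illustrative, not part of the Arrow API):

```cpp
#include <cstdint>
#include <vector>

// Scalar model of the CompareRows output contract (sketch only).
std::vector<uint16_t> FailedRows(const std::vector<uint8_t>& match_bytevector,
                                 const uint16_t* sel_left_maybe_null) {
  std::vector<uint16_t> failed;
  for (size_t i = 0; i < match_bytevector.size(); ++i) {
    if (match_bytevector[i] == 0) {
      // With an input selection the output is a filtered image of it;
      // otherwise it is the plain row index.
      failed.push_back(sel_left_maybe_null ? sel_left_maybe_null[i]
                                           : static_cast<uint16_t>(i));
    }
  }
  return failed;
}
```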
+
+void KeyCompare::CompareFixedLength(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector,
+ KeyEncoder::KeyEncoderContext* ctx,
+ uint32_t fixed_length, const uint8_t* rows_left,
+ const uint8_t* rows_right) {
+ bool use_selection = (sel_left_maybe_null != nullptr);
+
+ uint32_t num_rows_already_processed = 0;
+
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2() && !use_selection) {
+ // Choose between up-to-8B length, up-to-16B length and any size versions
+ if (fixed_length <= 8) {
+ num_rows_already_processed = CompareFixedLength_UpTo8B_avx2(
+ num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length,
+ rows_left, rows_right);
+ } else if (fixed_length <= 16) {
+ num_rows_already_processed = CompareFixedLength_UpTo16B_avx2(
+ num_rows_to_compare, left_to_right_map, match_bytevector, fixed_length,
+ rows_left, rows_right);
+ } else {
+ num_rows_already_processed =
+ CompareFixedLength_avx2(num_rows_to_compare, left_to_right_map,
+ match_bytevector, fixed_length, rows_left, rows_right);
+ }
+ }
+#endif
+
+ typedef void (*CompareFixedLengthImp_t)(uint32_t, uint32_t, const uint16_t*,
+ const uint32_t*, uint8_t*, uint32_t,
+ const uint8_t*, const uint8_t*);
+ static const CompareFixedLengthImp_t CompareFixedLengthImp_fn[] = {
+ CompareFixedLengthImp<false, 1>, CompareFixedLengthImp<false, 2>,
+ CompareFixedLengthImp<false, 0>, CompareFixedLengthImp<true, 1>,
+ CompareFixedLengthImp<true, 2>, CompareFixedLengthImp<true, 0>};
+ int dispatch_const = (use_selection ? 3 : 0) +
+ ((fixed_length <= 8) ? 0 : ((fixed_length <= 16) ? 1 : 2));
+ CompareFixedLengthImp_fn[dispatch_const](
+ num_rows_already_processed, num_rows_to_compare, sel_left_maybe_null,
+ left_to_right_map, match_bytevector, fixed_length, rows_left, rows_right);
+}
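The function-pointer table above is laid out as two groups of three: indexes 0..2 are the no-selection variants for the width classes {up to 8 bytes, up to 16 bytes, any}, and indexes 3..5 their selection counterparts, so the index is a 3-stride plus a width class. A small standalone check of that arithmetic (the function name is made up for illustration):

```cpp
#include <cassert>
#include <cstdint>

// Recomputes the dispatch index used above: rows 0..2 are the no-selection
// variants for width classes {<=8B, <=16B, any}, rows 3..5 the selection ones.
int DispatchIndex(bool use_selection, uint32_t fixed_length) {
  return (use_selection ? 3 : 0) +
         ((fixed_length <= 8) ? 0 : ((fixed_length <= 16) ? 1 : 2));
}

int main() {
  assert(DispatchIndex(false, 8) == 0);   // CompareFixedLengthImp<false, 1>
  assert(DispatchIndex(false, 16) == 1);  // CompareFixedLengthImp<false, 2>
  assert(DispatchIndex(false, 40) == 2);  // CompareFixedLengthImp<false, 0>
  assert(DispatchIndex(true, 40) == 5);   // CompareFixedLengthImp<true, 0>
  return 0;
}
```

Note that when the AVX2 path has already handled all rows, num_rows_already_processed equals num_rows_to_compare and the dispatched scalar call degenerates to an empty loop.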
+
+template <bool use_selection, int num_64bit_words>
+void KeyCompare::CompareFixedLengthImp(uint32_t num_rows_already_processed,
+ uint32_t num_rows,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector, uint32_t length,
+ const uint8_t* rows_left,
+ const uint8_t* rows_right) {
+ // Key length (for encoded key) has to be non-zero
+ ARROW_DCHECK(length > 0);
+
+ // Non-zero length guarantees no underflow
+ int32_t num_loops_less_one = (static_cast<int32_t>(length) + 7) / 8 - 1;
+
+ // The remaining length for the last loop can be zero only if the input length is zero
+ uint32_t length_remaining_last_loop = length - num_loops_less_one * 8;
+ uint64_t tail_mask = (~0ULL) >> (8 * (8 - length_remaining_last_loop));
+
+ for (uint32_t id_input = num_rows_already_processed; id_input < num_rows; ++id_input) {
+ uint32_t irow_left = use_selection ? sel_left_maybe_null[id_input] : id_input;
+ uint32_t irow_right = left_to_right_map[irow_left];
+ uint32_t begin_left = length * irow_left;
+ uint32_t begin_right = length * irow_right;
+ const uint64_t* key_left_ptr =
+ reinterpret_cast<const uint64_t*>(rows_left + begin_left);
+ const uint64_t* key_right_ptr =
+ reinterpret_cast<const uint64_t*>(rows_right + begin_right);
+ uint64_t result_or = 0ULL;
+ int32_t istripe = 0;
+
+ // Specializations for keys up to 8 bytes and for keys between 9 and 16 bytes
+ // avoid the inner loop over 64-bit words needed for longer values.
+ //
+ // Template argument 0 means arbitrarily many 64-bit words,
+ // 1 means up to 1 and 2 means up to 2.
+ //
+ if (num_64bit_words == 0) {
+ for (; istripe < num_loops_less_one; ++istripe) {
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (key_left ^ key_right);
+ }
+ } else if (num_64bit_words == 2) {
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (key_left ^ key_right);
+ ++istripe;
+ }
+
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (tail_mask & (key_left ^ key_right));
+
+ int result = (result_or == 0 ? 0xff : 0);
+ match_bytevector[id_input] &= result;
+ }
+}
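Worked numbers for the tail-mask setup above, taking an 11-byte key: one full 8-byte stripe is XOR-compared in the loop, and the final stripe keeps only the 3 remaining low-order bytes of the little-endian load:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Worked example of the tail mask above for an 11-byte key.
  uint32_t length = 11;
  int32_t num_loops_less_one = (static_cast<int32_t>(length) + 7) / 8 - 1;  // == 1
  uint32_t remaining = length - num_loops_less_one * 8;                     // == 3
  uint64_t tail_mask = (~0ULL) >> (8 * (8 - remaining));
  assert(tail_mask == 0x0000000000FFFFFFULL);  // keep only the 3 tail bytes
  return 0;
}
```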
+
+void KeyCompare::CompareVaryingLength(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector,
+ KeyEncoder::KeyEncoderContext* ctx,
+ const uint8_t* rows_left, const uint8_t* rows_right,
+ const uint32_t* offsets_left,
+ const uint32_t* offsets_right) {
+ bool use_selection = (sel_left_maybe_null != nullptr);
+
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2() && !use_selection) {
+ CompareVaryingLength_avx2(num_rows_to_compare, left_to_right_map, match_bytevector,
+ rows_left, rows_right, offsets_left, offsets_right);
+ } else {
+#endif
+ if (use_selection) {
+ CompareVaryingLengthImp<true>(num_rows_to_compare, sel_left_maybe_null,
+ left_to_right_map, match_bytevector, rows_left,
+ rows_right, offsets_left, offsets_right);
+ } else {
+ CompareVaryingLengthImp<false>(num_rows_to_compare, sel_left_maybe_null,
+ left_to_right_map, match_bytevector, rows_left,
+ rows_right, offsets_left, offsets_right);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+}
+
+template <bool use_selection>
+void KeyCompare::CompareVaryingLengthImp(
+ uint32_t num_rows, const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map, uint8_t* match_bytevector,
+ const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left,
+ const uint32_t* offsets_right) {
+ static const uint64_t tail_masks[] = {
+ 0x0000000000000000ULL, 0x00000000000000ffULL, 0x000000000000ffffULL,
+ 0x0000000000ffffffULL, 0x00000000ffffffffULL, 0x000000ffffffffffULL,
+ 0x0000ffffffffffffULL, 0x00ffffffffffffffULL, 0xffffffffffffffffULL};
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ uint32_t irow_left = use_selection ? sel_left_maybe_null[i] : i;
+ uint32_t irow_right = left_to_right_map[irow_left];
+ uint32_t begin_left = offsets_left[irow_left];
+ uint32_t begin_right = offsets_right[irow_right];
+ uint32_t length_left = offsets_left[irow_left + 1] - begin_left;
+ uint32_t length_right = offsets_right[irow_right + 1] - begin_right;
+ uint32_t length = std::min(length_left, length_right);
+ const uint64_t* key_left_ptr =
+ reinterpret_cast<const uint64_t*>(rows_left + begin_left);
+ const uint64_t* key_right_ptr =
+ reinterpret_cast<const uint64_t*>(rows_right + begin_right);
+ uint64_t result_or = 0;
+ int32_t istripe;
+ // length can be zero
+ for (istripe = 0; istripe < (static_cast<int32_t>(length) + 7) / 8 - 1; ++istripe) {
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (key_left ^ key_right);
+ }
+
+ uint32_t length_remaining = length - static_cast<uint32_t>(istripe) * 8;
+ uint64_t tail_mask = tail_masks[length_remaining];
+
+ uint64_t key_left = util::SafeLoad(&key_left_ptr[istripe]);
+ uint64_t key_right = util::SafeLoad(&key_right_ptr[istripe]);
+ result_or |= (tail_mask & (key_left ^ key_right));
+
+ int result = (result_or == 0 ? 0xff : 0);
+ match_bytevector[i] &= result;
+ }
+}
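The varying-length path uses a lookup table instead of the shift expression from the fixed-length path for a concrete reason: here the remaining length can legitimately be zero (keys may be empty, as the comment above notes), and the shift form would then shift a 64-bit value by 64 bits, which is undefined behaviour in C++. A quick standalone check that entry n of such a table is exactly the mask keeping the n lowest bytes:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // tail_masks[n] keeps the n lowest bytes; n == 0 masks everything out.
  // The branches below avoid the undefined shifts by 64 bits that a direct
  // shift formula would hit at n == 0 and n == 8.
  for (uint32_t n = 0; n <= 8; ++n) {
    uint64_t via_shift = (n == 0) ? 0 : (~0ULL) >> (8 * (8 - n));
    uint64_t expected = (n == 8) ? ~0ULL : ((1ULL << (8 * n)) - 1);
    assert(via_shift == expected);
  }
  return 0;
}
```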
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h
index 397a729dac6..1dffabb884b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_compare.h
@@ -1,101 +1,101 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-
-#include "arrow/compute/exec/key_encode.h"
-#include "arrow/compute/exec/util.h"
-#include "arrow/memory_pool.h"
-#include "arrow/result.h"
-#include "arrow/status.h"
-
-namespace arrow {
-namespace compute {
-
-class KeyCompare {
- public:
- // Returns a single 16-bit selection vector of rows that failed comparison.
- // If there is an input selection on the left, the resulting selection is a
- // filtered image of the input selection.
- static void CompareRows(uint32_t num_rows_to_compare,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows,
- uint16_t* out_sel_left_maybe_same,
- const KeyEncoder::KeyRowArray& rows_left,
- const KeyEncoder::KeyRowArray& rows_right);
-
- private:
- static void CompareFixedLength(uint32_t num_rows_to_compare,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector,
- KeyEncoder::KeyEncoderContext* ctx,
- uint32_t fixed_length, const uint8_t* rows_left,
- const uint8_t* rows_right);
- static void CompareVaryingLength(uint32_t num_rows_to_compare,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector,
- KeyEncoder::KeyEncoderContext* ctx,
- const uint8_t* rows_left, const uint8_t* rows_right,
- const uint32_t* offsets_left,
- const uint32_t* offsets_right);
-
- // Second template argument is 0, 1 or 2.
- // 0 means arbitrarily many 64-bit words, 1 means up to 1 and 2 means up to 2.
- template <bool use_selection, int num_64bit_words>
- static void CompareFixedLengthImp(uint32_t num_rows_already_processed,
- uint32_t num_rows,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector, uint32_t length,
- const uint8_t* rows_left, const uint8_t* rows_right);
- template <bool use_selection>
- static void CompareVaryingLengthImp(uint32_t num_rows,
- const uint16_t* sel_left_maybe_null,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector, const uint8_t* rows_left,
- const uint8_t* rows_right,
- const uint32_t* offsets_left,
- const uint32_t* offsets_right);
-
-#if defined(ARROW_HAVE_AVX2)
-
- static uint32_t CompareFixedLength_UpTo8B_avx2(
- uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
- uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right);
- static uint32_t CompareFixedLength_UpTo16B_avx2(
- uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
- uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right);
- static uint32_t CompareFixedLength_avx2(uint32_t num_rows,
- const uint32_t* left_to_right_map,
- uint8_t* match_bytevector, uint32_t length,
- const uint8_t* rows_left,
- const uint8_t* rows_right);
- static void CompareVaryingLength_avx2(
- uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
- const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left,
- const uint32_t* offsets_right);
-
-#endif
-};
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/compute/exec/key_encode.h"
+#include "arrow/compute/exec/util.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+namespace arrow {
+namespace compute {
+
+class KeyCompare {
+ public:
+ // Returns a single 16-bit selection vector of rows that failed comparison.
+ // If there is an input selection on the left, the resulting selection is a
+ // filtered image of the input selection.
+ static void CompareRows(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ KeyEncoder::KeyEncoderContext* ctx, uint32_t* out_num_rows,
+ uint16_t* out_sel_left_maybe_same,
+ const KeyEncoder::KeyRowArray& rows_left,
+ const KeyEncoder::KeyRowArray& rows_right);
+
+ private:
+ static void CompareFixedLength(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector,
+ KeyEncoder::KeyEncoderContext* ctx,
+ uint32_t fixed_length, const uint8_t* rows_left,
+ const uint8_t* rows_right);
+ static void CompareVaryingLength(uint32_t num_rows_to_compare,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector,
+ KeyEncoder::KeyEncoderContext* ctx,
+ const uint8_t* rows_left, const uint8_t* rows_right,
+ const uint32_t* offsets_left,
+ const uint32_t* offsets_right);
+
+ // Second template argument is 0, 1 or 2.
+ // 0 means arbitrarily many 64-bit words, 1 means up to 1 and 2 means up to 2.
+ template <bool use_selection, int num_64bit_words>
+ static void CompareFixedLengthImp(uint32_t num_rows_already_processed,
+ uint32_t num_rows,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector, uint32_t length,
+ const uint8_t* rows_left, const uint8_t* rows_right);
+ template <bool use_selection>
+ static void CompareVaryingLengthImp(uint32_t num_rows,
+ const uint16_t* sel_left_maybe_null,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector, const uint8_t* rows_left,
+ const uint8_t* rows_right,
+ const uint32_t* offsets_left,
+ const uint32_t* offsets_right);
+
+#if defined(ARROW_HAVE_AVX2)
+
+ static uint32_t CompareFixedLength_UpTo8B_avx2(
+ uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
+ uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right);
+ static uint32_t CompareFixedLength_UpTo16B_avx2(
+ uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
+ uint32_t length, const uint8_t* rows_left, const uint8_t* rows_right);
+ static uint32_t CompareFixedLength_avx2(uint32_t num_rows,
+ const uint32_t* left_to_right_map,
+ uint8_t* match_bytevector, uint32_t length,
+ const uint8_t* rows_left,
+ const uint8_t* rows_right);
+ static void CompareVaryingLength_avx2(
+ uint32_t num_rows, const uint32_t* left_to_right_map, uint8_t* match_bytevector,
+ const uint8_t* rows_left, const uint8_t* rows_right, const uint32_t* offsets_left,
+ const uint32_t* offsets_right);
+
+#endif
+};
+
+} // namespace compute
+} // namespace arrow
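Stripped of null masks, temp-vector reuse and the SIMD specializations, the public contract declared above amounts to: walk the left rows (or a selection of them), look up the paired right row through left_to_right_map, and report the left rows whose encoded bytes differ. A simplified self-contained model under those assumptions, for fixed-length rows only (this is not the Arrow API, just a sketch of the semantics):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Toy model of the KeyCompare contract for fixed-length rows (sketch; the
// real class works on KeyEncoder::KeyRowArray buffers and adds null masks,
// AVX2 paths and temp-vector reuse).
std::vector<uint16_t> CompareRowsModel(uint32_t fixed_length,
                                       const std::vector<uint8_t>& rows_left,
                                       const std::vector<uint8_t>& rows_right,
                                       const std::vector<uint32_t>& left_to_right_map) {
  std::vector<uint16_t> differing;
  for (size_t i = 0; i < left_to_right_map.size(); ++i) {
    const uint8_t* l = rows_left.data() + i * fixed_length;
    const uint8_t* r = rows_right.data() + left_to_right_map[i] * fixed_length;
    if (std::memcmp(l, r, fixed_length) != 0) {
      differing.push_back(static_cast<uint16_t>(i));  // row i failed comparison
    }
  }
  return differing;
}
```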
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc
index f0498b509a1..de79558f2c2 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.cc
@@ -1,1649 +1,1649 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/key_encode.h"
-
-#include <memory.h>
-
-#include <algorithm>
-
-#include "arrow/compute/exec/util.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/ubsan.h"
-
-namespace arrow {
-namespace compute {
-
-KeyEncoder::KeyRowArray::KeyRowArray()
- : pool_(nullptr), rows_capacity_(0), bytes_capacity_(0) {}
-
-Status KeyEncoder::KeyRowArray::Init(MemoryPool* pool, const KeyRowMetadata& metadata) {
- pool_ = pool;
- metadata_ = metadata;
-
- DCHECK(!null_masks_ && !offsets_ && !rows_);
-
- constexpr int64_t rows_capacity = 8;
- constexpr int64_t bytes_capacity = 1024;
-
- // Null masks
- ARROW_ASSIGN_OR_RAISE(auto null_masks,
- AllocateResizableBuffer(size_null_masks(rows_capacity), pool_));
- null_masks_ = std::move(null_masks);
- memset(null_masks_->mutable_data(), 0, size_null_masks(rows_capacity));
-
- // Offsets and rows
- if (!metadata.is_fixed_length) {
- ARROW_ASSIGN_OR_RAISE(auto offsets,
- AllocateResizableBuffer(size_offsets(rows_capacity), pool_));
- offsets_ = std::move(offsets);
- memset(offsets_->mutable_data(), 0, size_offsets(rows_capacity));
- reinterpret_cast<uint32_t*>(offsets_->mutable_data())[0] = 0;
-
- ARROW_ASSIGN_OR_RAISE(
- auto rows,
- AllocateResizableBuffer(size_rows_varying_length(bytes_capacity), pool_));
- rows_ = std::move(rows);
- memset(rows_->mutable_data(), 0, size_rows_varying_length(bytes_capacity));
- bytes_capacity_ = size_rows_varying_length(bytes_capacity) - padding_for_vectors;
- } else {
- ARROW_ASSIGN_OR_RAISE(
- auto rows, AllocateResizableBuffer(size_rows_fixed_length(rows_capacity), pool_));
- rows_ = std::move(rows);
- memset(rows_->mutable_data(), 0, size_rows_fixed_length(rows_capacity));
- bytes_capacity_ = size_rows_fixed_length(rows_capacity) - padding_for_vectors;
- }
-
- update_buffer_pointers();
-
- rows_capacity_ = rows_capacity;
-
- num_rows_ = 0;
- num_rows_for_has_any_nulls_ = 0;
- has_any_nulls_ = false;
-
- return Status::OK();
-}
-
-void KeyEncoder::KeyRowArray::Clean() {
- num_rows_ = 0;
- num_rows_for_has_any_nulls_ = 0;
- has_any_nulls_ = false;
-
- if (!metadata_.is_fixed_length) {
- reinterpret_cast<uint32_t*>(offsets_->mutable_data())[0] = 0;
- }
-}
-
-int64_t KeyEncoder::KeyRowArray::size_null_masks(int64_t num_rows) {
- return num_rows * metadata_.null_masks_bytes_per_row + padding_for_vectors;
-}
-
-int64_t KeyEncoder::KeyRowArray::size_offsets(int64_t num_rows) {
- return (num_rows + 1) * sizeof(uint32_t) + padding_for_vectors;
-}
-
-int64_t KeyEncoder::KeyRowArray::size_rows_fixed_length(int64_t num_rows) {
- return num_rows * metadata_.fixed_length + padding_for_vectors;
-}
-
-int64_t KeyEncoder::KeyRowArray::size_rows_varying_length(int64_t num_bytes) {
- return num_bytes + padding_for_vectors;
-}
-
-void KeyEncoder::KeyRowArray::update_buffer_pointers() {
- buffers_[0] = mutable_buffers_[0] = null_masks_->mutable_data();
- if (metadata_.is_fixed_length) {
- buffers_[1] = mutable_buffers_[1] = rows_->mutable_data();
- buffers_[2] = mutable_buffers_[2] = nullptr;
- } else {
- buffers_[1] = mutable_buffers_[1] = offsets_->mutable_data();
- buffers_[2] = mutable_buffers_[2] = rows_->mutable_data();
- }
-}
-
-Status KeyEncoder::KeyRowArray::ResizeFixedLengthBuffers(int64_t num_extra_rows) {
- if (rows_capacity_ >= num_rows_ + num_extra_rows) {
- return Status::OK();
- }
-
- int64_t rows_capacity_new = std::max(static_cast<int64_t>(1), 2 * rows_capacity_);
- while (rows_capacity_new < num_rows_ + num_extra_rows) {
- rows_capacity_new *= 2;
- }
-
- // Null masks
- RETURN_NOT_OK(null_masks_->Resize(size_null_masks(rows_capacity_new), false));
- memset(null_masks_->mutable_data() + size_null_masks(rows_capacity_), 0,
- size_null_masks(rows_capacity_new) - size_null_masks(rows_capacity_));
-
- // Either offsets or rows
- if (!metadata_.is_fixed_length) {
- RETURN_NOT_OK(offsets_->Resize(size_offsets(rows_capacity_new), false));
- memset(offsets_->mutable_data() + size_offsets(rows_capacity_), 0,
- size_offsets(rows_capacity_new) - size_offsets(rows_capacity_));
- } else {
- RETURN_NOT_OK(rows_->Resize(size_rows_fixed_length(rows_capacity_new), false));
- memset(rows_->mutable_data() + size_rows_fixed_length(rows_capacity_), 0,
- size_rows_fixed_length(rows_capacity_new) -
- size_rows_fixed_length(rows_capacity_));
- bytes_capacity_ = size_rows_fixed_length(rows_capacity_new) - padding_for_vectors;
- }
-
- update_buffer_pointers();
-
- rows_capacity_ = rows_capacity_new;
-
- return Status::OK();
-}
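The resize policy above is plain geometric growth: double the capacity until the requested row count fits, which keeps the amortized cost of repeated appends constant. A sketch of the policy with worked examples (the helper name is illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Growth policy matching the loop above: double until the requirement fits,
// starting from 1 when the current capacity is zero.
int64_t GrownCapacity(int64_t current, int64_t required) {
  int64_t next = current > 0 ? 2 * current : 1;
  while (next < required) next *= 2;
  return next;
}

int main() {
  assert(GrownCapacity(8, 100) == 128);  // 8 -> 16 -> 32 -> 64 -> 128
  assert(GrownCapacity(0, 3) == 4);      // 0 -> 1 -> 2 -> 4
  return 0;
}
```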
-
-Status KeyEncoder::KeyRowArray::ResizeOptionalVaryingLengthBuffer(
- int64_t num_extra_bytes) {
- int64_t num_bytes = offsets()[num_rows_];
- if (bytes_capacity_ >= num_bytes + num_extra_bytes || metadata_.is_fixed_length) {
- return Status::OK();
- }
-
- int64_t bytes_capacity_new = std::max(static_cast<int64_t>(1), 2 * bytes_capacity_);
- while (bytes_capacity_new < num_bytes + num_extra_bytes) {
- bytes_capacity_new *= 2;
- }
-
- RETURN_NOT_OK(rows_->Resize(size_rows_varying_length(bytes_capacity_new), false));
- memset(rows_->mutable_data() + size_rows_varying_length(bytes_capacity_), 0,
- size_rows_varying_length(bytes_capacity_new) -
- size_rows_varying_length(bytes_capacity_));
-
- update_buffer_pointers();
-
- bytes_capacity_ = bytes_capacity_new;
-
- return Status::OK();
-}
-
-Status KeyEncoder::KeyRowArray::AppendSelectionFrom(const KeyRowArray& from,
- uint32_t num_rows_to_append,
- const uint16_t* source_row_ids) {
- DCHECK(metadata_.is_compatible(from.metadata()));
-
- RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append));
-
- if (!metadata_.is_fixed_length) {
- // Varying-length rows
- auto from_offsets = reinterpret_cast<const uint32_t*>(from.offsets_->data());
- auto to_offsets = reinterpret_cast<uint32_t*>(offsets_->mutable_data());
- uint32_t total_length = to_offsets[num_rows_];
- uint32_t total_length_to_append = 0;
- for (uint32_t i = 0; i < num_rows_to_append; ++i) {
- uint16_t row_id = source_row_ids[i];
- uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id];
- total_length_to_append += length;
- to_offsets[num_rows_ + i + 1] = total_length + total_length_to_append;
- }
-
- RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(total_length_to_append));
-
- const uint8_t* src = from.rows_->data();
- uint8_t* dst = rows_->mutable_data() + total_length;
- for (uint32_t i = 0; i < num_rows_to_append; ++i) {
- uint16_t row_id = source_row_ids[i];
- uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id];
- auto src64 = reinterpret_cast<const uint64_t*>(src + from_offsets[row_id]);
- auto dst64 = reinterpret_cast<uint64_t*>(dst);
- for (uint32_t j = 0; j < (length + 7) / 8; ++j) {
- dst64[j] = src64[j];
- }
- dst += length;
- }
- } else {
- // Fixed-length rows
- const uint8_t* src = from.rows_->data();
- uint8_t* dst = rows_->mutable_data() + num_rows_ * metadata_.fixed_length;
- for (uint32_t i = 0; i < num_rows_to_append; ++i) {
- uint16_t row_id = source_row_ids[i];
- uint32_t length = metadata_.fixed_length;
- auto src64 = reinterpret_cast<const uint64_t*>(src + length * row_id);
- auto dst64 = reinterpret_cast<uint64_t*>(dst);
- for (uint32_t j = 0; j < (length + 7) / 8; ++j) {
- dst64[j] = src64[j];
- }
- dst += length;
- }
- }
-
- // Null masks
- uint32_t byte_length = metadata_.null_masks_bytes_per_row;
- uint64_t dst_byte_offset = num_rows_ * byte_length;
- const uint8_t* src_base = from.null_masks_->data();
- uint8_t* dst_base = null_masks_->mutable_data();
- for (uint32_t i = 0; i < num_rows_to_append; ++i) {
- uint32_t row_id = source_row_ids[i];
- int64_t src_byte_offset = row_id * byte_length;
- const uint8_t* src = src_base + src_byte_offset;
- uint8_t* dst = dst_base + dst_byte_offset;
- for (uint32_t ibyte = 0; ibyte < byte_length; ++ibyte) {
- dst[ibyte] = src[ibyte];
- }
- dst_byte_offset += byte_length;
- }
-
- num_rows_ += num_rows_to_append;
-
- return Status::OK();
-}
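Both copy loops above move whole 64-bit words, (length + 7) / 8 of them per row, so they may read and write up to 7 bytes past the logical end of a row. This appears to be exactly what the padding_for_vectors slack reserved by the size_* helpers is for. A small check of the rounding:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // A row of `length` bytes is copied as (length + 7) / 8 whole words,
  // overshooting the logical end by at most 7 bytes; buffer padding keeps
  // that overshoot in bounds.
  for (uint32_t length = 1; length <= 24; ++length) {
    uint32_t words = (length + 7) / 8;
    uint32_t bytes_touched = words * 8;
    assert(bytes_touched >= length && bytes_touched - length < 8);
  }
  return 0;
}
```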
-
-Status KeyEncoder::KeyRowArray::AppendEmpty(uint32_t num_rows_to_append,
- uint32_t num_extra_bytes_to_append) {
- RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append));
- RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(num_extra_bytes_to_append));
- num_rows_ += num_rows_to_append;
- if (metadata_.row_alignment > 1 || metadata_.string_alignment > 1) {
- memset(rows_->mutable_data(), 0, bytes_capacity_);
- }
- return Status::OK();
-}
-
-bool KeyEncoder::KeyRowArray::has_any_nulls(const KeyEncoderContext* ctx) const {
- if (has_any_nulls_) {
- return true;
- }
- if (num_rows_for_has_any_nulls_ < num_rows_) {
- auto size_per_row = metadata().null_masks_bytes_per_row;
- has_any_nulls_ = !util::BitUtil::are_all_bytes_zero(
- ctx->hardware_flags, null_masks() + size_per_row * num_rows_for_has_any_nulls_,
- static_cast<uint32_t>(size_per_row * (num_rows_ - num_rows_for_has_any_nulls_)));
- num_rows_for_has_any_nulls_ = num_rows_;
- }
- return has_any_nulls_;
-}
-
-KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
- const KeyColumnArray& left,
- const KeyColumnArray& right,
- int buffer_id_to_replace) {
- metadata_ = metadata;
- length_ = left.length();
- for (int i = 0; i < max_buffers_; ++i) {
- buffers_[i] = left.buffers_[i];
- mutable_buffers_[i] = left.mutable_buffers_[i];
- }
- buffers_[buffer_id_to_replace] = right.buffers_[buffer_id_to_replace];
- mutable_buffers_[buffer_id_to_replace] = right.mutable_buffers_[buffer_id_to_replace];
- bit_offset_[0] = left.bit_offset_[0];
- bit_offset_[1] = left.bit_offset_[1];
- if (buffer_id_to_replace < max_buffers_ - 1) {
- bit_offset_[buffer_id_to_replace] = right.bit_offset_[buffer_id_to_replace];
- }
-}
-
-KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
- int64_t length, const uint8_t* buffer0,
- const uint8_t* buffer1, const uint8_t* buffer2,
- int bit_offset0, int bit_offset1) {
- metadata_ = metadata;
- length_ = length;
- buffers_[0] = buffer0;
- buffers_[1] = buffer1;
- buffers_[2] = buffer2;
- mutable_buffers_[0] = mutable_buffers_[1] = mutable_buffers_[2] = nullptr;
- bit_offset_[0] = bit_offset0;
- bit_offset_[1] = bit_offset1;
-}
-
-KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
- int64_t length, uint8_t* buffer0,
- uint8_t* buffer1, uint8_t* buffer2,
- int bit_offset0, int bit_offset1) {
- metadata_ = metadata;
- length_ = length;
- buffers_[0] = mutable_buffers_[0] = buffer0;
- buffers_[1] = mutable_buffers_[1] = buffer1;
- buffers_[2] = mutable_buffers_[2] = buffer2;
- bit_offset_[0] = bit_offset0;
- bit_offset_[1] = bit_offset1;
-}
-
-KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnArray& from, int64_t start,
- int64_t length) {
- metadata_ = from.metadata_;
- length_ = length;
- uint32_t fixed_size =
- !metadata_.is_fixed_length ? sizeof(uint32_t) : metadata_.fixed_length;
-
- buffers_[0] =
- from.buffers_[0] ? from.buffers_[0] + (from.bit_offset_[0] + start) / 8 : nullptr;
- mutable_buffers_[0] = from.mutable_buffers_[0]
- ? from.mutable_buffers_[0] + (from.bit_offset_[0] + start) / 8
- : nullptr;
- bit_offset_[0] = (from.bit_offset_[0] + start) % 8;
-
- if (fixed_size == 0) {
- buffers_[1] =
- from.buffers_[1] ? from.buffers_[1] + (from.bit_offset_[1] + start) / 8 : nullptr;
- mutable_buffers_[1] = from.mutable_buffers_[1] ? from.mutable_buffers_[1] +
- (from.bit_offset_[1] + start) / 8
- : nullptr;
- bit_offset_[1] = (from.bit_offset_[1] + start) % 8;
- } else {
- buffers_[1] = from.buffers_[1] ? from.buffers_[1] + start * fixed_size : nullptr;
- mutable_buffers_[1] = from.mutable_buffers_[1]
- ? from.mutable_buffers_[1] + start * fixed_size
- : nullptr;
- bit_offset_[1] = 0;
- }
-
- buffers_[2] = from.buffers_[2];
- mutable_buffers_[2] = from.mutable_buffers_[2];
-}
-
-KeyEncoder::KeyColumnArray KeyEncoder::TransformBoolean::ArrayReplace(
- const KeyColumnArray& column, const KeyColumnArray& temp) {
- // Make sure that the temp buffer is large enough
- DCHECK(temp.length() >= column.length() && temp.metadata().is_fixed_length &&
- temp.metadata().fixed_length >= sizeof(uint8_t));
- KeyColumnMetadata metadata;
- metadata.is_fixed_length = true;
- metadata.fixed_length = sizeof(uint8_t);
- constexpr int buffer_index = 1;
- KeyColumnArray result = KeyColumnArray(metadata, column, temp, buffer_index);
- return result;
-}
-
-void KeyEncoder::TransformBoolean::PreEncode(const KeyColumnArray& input,
- KeyColumnArray* output,
- KeyEncoderContext* ctx) {
- // Make sure that metadata and lengths are compatible.
- DCHECK(output->metadata().is_fixed_length == input.metadata().is_fixed_length);
- DCHECK(output->metadata().fixed_length == 1 && input.metadata().fixed_length == 0);
- DCHECK(output->length() == input.length());
- constexpr int buffer_index = 1;
- DCHECK(input.data(buffer_index) != nullptr);
- DCHECK(output->mutable_data(buffer_index) != nullptr);
- util::BitUtil::bits_to_bytes(
- ctx->hardware_flags, static_cast<int>(input.length()), input.data(buffer_index),
- output->mutable_data(buffer_index), input.bit_offset(buffer_index));
-}
-
-void KeyEncoder::TransformBoolean::PostDecode(const KeyColumnArray& input,
- KeyColumnArray* output,
- KeyEncoderContext* ctx) {
- // Make sure that metadata and lengths are compatible.
- DCHECK(output->metadata().is_fixed_length == input.metadata().is_fixed_length);
- DCHECK(output->metadata().fixed_length == 0 && input.metadata().fixed_length == 1);
- DCHECK(output->length() == input.length());
- constexpr int buffer_index = 1;
- DCHECK(input.data(buffer_index) != nullptr);
- DCHECK(output->mutable_data(buffer_index) != nullptr);
-
- util::BitUtil::bytes_to_bits(
- ctx->hardware_flags, static_cast<int>(input.length()), input.data(buffer_index),
- output->mutable_data(buffer_index), output->bit_offset(buffer_index));
-}
-
-bool KeyEncoder::EncoderInteger::IsBoolean(const KeyColumnMetadata& metadata) {
- return metadata.is_fixed_length && metadata.fixed_length == 0;
-}
-
-bool KeyEncoder::EncoderInteger::UsesTransform(const KeyColumnArray& column) {
- return IsBoolean(column.metadata());
-}
-
-KeyEncoder::KeyColumnArray KeyEncoder::EncoderInteger::ArrayReplace(
- const KeyColumnArray& column, const KeyColumnArray& temp) {
- if (IsBoolean(column.metadata())) {
- return TransformBoolean::ArrayReplace(column, temp);
- }
- return column;
-}
-
-void KeyEncoder::EncoderInteger::PreEncode(const KeyColumnArray& input,
- KeyColumnArray* output,
- KeyEncoderContext* ctx) {
- if (IsBoolean(input.metadata())) {
- TransformBoolean::PreEncode(input, output, ctx);
- }
-}
-
-void KeyEncoder::EncoderInteger::PostDecode(const KeyColumnArray& input,
- KeyColumnArray* output,
- KeyEncoderContext* ctx) {
- if (IsBoolean(output->metadata())) {
- TransformBoolean::PostDecode(input, output, ctx);
- }
-}
-
-void KeyEncoder::EncoderInteger::Encode(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx,
- KeyColumnArray* temp) {
- KeyColumnArray col_prep;
- if (UsesTransform(col)) {
- col_prep = ArrayReplace(col, *temp);
- PreEncode(col, &col_prep, ctx);
- } else {
- col_prep = col;
- }
-
- const auto num_rows = static_cast<uint32_t>(col.length());
-
- // When we have a single fixed-length column we can just do one memcpy
- if (rows->metadata().is_fixed_length &&
- rows->metadata().fixed_length == col.metadata().fixed_length) {
- DCHECK_EQ(offset_within_row, 0);
- uint32_t row_size = col.metadata().fixed_length;
- memcpy(rows->mutable_data(1), col.data(1), num_rows * row_size);
- } else if (rows->metadata().is_fixed_length) {
- uint32_t row_size = rows->metadata().fixed_length;
- uint8_t* row_base = rows->mutable_data(1) + offset_within_row;
- const uint8_t* col_base = col_prep.data(1);
- switch (col_prep.metadata().fixed_length) {
- case 1:
- for (uint32_t i = 0; i < num_rows; ++i) {
- row_base[i * row_size] = col_base[i];
- }
- break;
- case 2:
- for (uint32_t i = 0; i < num_rows; ++i) {
- *reinterpret_cast<uint16_t*>(row_base + i * row_size) =
- reinterpret_cast<const uint16_t*>(col_base)[i];
- }
- break;
- case 4:
- for (uint32_t i = 0; i < num_rows; ++i) {
- *reinterpret_cast<uint32_t*>(row_base + i * row_size) =
- reinterpret_cast<const uint32_t*>(col_base)[i];
- }
- break;
- case 8:
- for (uint32_t i = 0; i < num_rows; ++i) {
- *reinterpret_cast<uint64_t*>(row_base + i * row_size) =
- reinterpret_cast<const uint64_t*>(col_base)[i];
- }
- break;
- default:
- DCHECK(false);
- }
- } else {
- const uint32_t* row_offsets = rows->offsets();
- uint8_t* row_base = rows->mutable_data(2) + offset_within_row;
- const uint8_t* col_base = col_prep.data(1);
- switch (col_prep.metadata().fixed_length) {
- case 1:
- for (uint32_t i = 0; i < num_rows; ++i) {
- row_base[row_offsets[i]] = col_base[i];
- }
- break;
- case 2:
- for (uint32_t i = 0; i < num_rows; ++i) {
- *reinterpret_cast<uint16_t*>(row_base + row_offsets[i]) =
- reinterpret_cast<const uint16_t*>(col_base)[i];
- }
- break;
- case 4:
- for (uint32_t i = 0; i < num_rows; ++i) {
- *reinterpret_cast<uint32_t*>(row_base + row_offsets[i]) =
- reinterpret_cast<const uint32_t*>(col_base)[i];
- }
- break;
- case 8:
- for (uint32_t i = 0; i < num_rows; ++i) {
- *reinterpret_cast<uint64_t*>(row_base + row_offsets[i]) =
- reinterpret_cast<const uint64_t*>(col_base)[i];
- }
- break;
- default:
- DCHECK(false);
- }
- }
-}
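Each case of the switch above scatters one column value per row at a fixed byte offset; only the store width changes between cases. A condensed sketch of the 4-byte branch for fixed-length rows (names are illustrative; memcpy replaces the reinterpret_cast store so the sketch carries no alignment assumptions):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Sketch of the 4-byte branch above: scatter a uint32_t column into
// fixed-length rows at a given byte offset within each row.
void ScatterU32(const std::vector<uint32_t>& column, uint32_t row_size,
                uint32_t offset_within_row, std::vector<uint8_t>* rows) {
  rows->resize(column.size() * row_size);
  for (size_t i = 0; i < column.size(); ++i) {
    std::memcpy(rows->data() + i * row_size + offset_within_row, &column[i],
                sizeof(uint32_t));
  }
}
```

For varying-length rows the only difference is that the per-row base address comes from the offsets array rather than from i * row_size.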
-
-void KeyEncoder::EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col,
- KeyEncoderContext* ctx, KeyColumnArray* temp) {
- KeyColumnArray col_prep;
- if (UsesTransform(*col)) {
- col_prep = ArrayReplace(*col, *temp);
- } else {
- col_prep = *col;
- }
-
- // When we have a single fixed-length column we can just do one memcpy
- if (rows.metadata().is_fixed_length &&
- col_prep.metadata().fixed_length == rows.metadata().fixed_length) {
- DCHECK_EQ(offset_within_row, 0);
- uint32_t row_size = rows.metadata().fixed_length;
- memcpy(col_prep.mutable_data(1), rows.data(1) + start_row * row_size,
- num_rows * row_size);
- } else if (rows.metadata().is_fixed_length) {
- uint32_t row_size = rows.metadata().fixed_length;
- const uint8_t* row_base = rows.data(1) + start_row * row_size;
- row_base += offset_within_row;
- uint8_t* col_base = col_prep.mutable_data(1);
- switch (col_prep.metadata().fixed_length) {
- case 1:
- for (uint32_t i = 0; i < num_rows; ++i) {
- col_base[i] = row_base[i * row_size];
- }
- break;
- case 2:
- for (uint32_t i = 0; i < num_rows; ++i) {
- reinterpret_cast<uint16_t*>(col_base)[i] =
- *reinterpret_cast<const uint16_t*>(row_base + i * row_size);
- }
- break;
- case 4:
- for (uint32_t i = 0; i < num_rows; ++i) {
- reinterpret_cast<uint32_t*>(col_base)[i] =
- *reinterpret_cast<const uint32_t*>(row_base + i * row_size);
- }
- break;
- case 8:
- for (uint32_t i = 0; i < num_rows; ++i) {
- reinterpret_cast<uint64_t*>(col_base)[i] =
- *reinterpret_cast<const uint64_t*>(row_base + i * row_size);
- }
- break;
- default:
- DCHECK(false);
- }
- } else {
- const uint32_t* row_offsets = rows.offsets() + start_row;
- const uint8_t* row_base = rows.data(2);
- row_base += offset_within_row;
- uint8_t* col_base = col_prep.mutable_data(1);
- switch (col_prep.metadata().fixed_length) {
- case 1:
- for (uint32_t i = 0; i < num_rows; ++i) {
- col_base[i] = row_base[row_offsets[i]];
- }
- break;
- case 2:
- for (uint32_t i = 0; i < num_rows; ++i) {
- reinterpret_cast<uint16_t*>(col_base)[i] =
- *reinterpret_cast<const uint16_t*>(row_base + row_offsets[i]);
- }
- break;
- case 4:
- for (uint32_t i = 0; i < num_rows; ++i) {
- reinterpret_cast<uint32_t*>(col_base)[i] =
- *reinterpret_cast<const uint32_t*>(row_base + row_offsets[i]);
- }
- break;
- case 8:
- for (uint32_t i = 0; i < num_rows; ++i) {
- reinterpret_cast<uint64_t*>(col_base)[i] =
- *reinterpret_cast<const uint64_t*>(row_base + row_offsets[i]);
- }
- break;
- default:
- DCHECK(false);
- }
- }
-
- if (UsesTransform(*col)) {
- PostDecode(col_prep, col, ctx);
- }
-}
-
-bool KeyEncoder::EncoderBinary::IsInteger(const KeyColumnMetadata& metadata) {
- bool is_fixed_length = metadata.is_fixed_length;
- auto size = metadata.fixed_length;
- return is_fixed_length &&
- (size == 0 || size == 1 || size == 2 || size == 4 || size == 8);
-}
-
-void KeyEncoder::EncoderBinary::Encode(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx,
- KeyColumnArray* temp) {
- if (IsInteger(col.metadata())) {
- EncoderInteger::Encode(offset_within_row, rows, col, ctx, temp);
- } else {
- KeyColumnArray col_prep;
- if (EncoderInteger::UsesTransform(col)) {
- col_prep = EncoderInteger::ArrayReplace(col, *temp);
- EncoderInteger::PreEncode(col, &col_prep, ctx);
- } else {
- col_prep = col;
- }
-
- bool is_row_fixed_length = rows->metadata().is_fixed_length;
-
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2()) {
- EncodeHelper_avx2(is_row_fixed_length, offset_within_row, rows, col);
- } else {
-#endif
- if (is_row_fixed_length) {
- EncodeImp<true>(offset_within_row, rows, col);
- } else {
- EncodeImp<false>(offset_within_row, rows, col);
- }
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
- }
-
- DCHECK(temp->metadata().is_fixed_length);
- DCHECK(temp->length() * temp->metadata().fixed_length >=
- col.length() * static_cast<int64_t>(sizeof(uint16_t)));
-
- KeyColumnArray temp16bit(KeyColumnMetadata(true, sizeof(uint16_t)), col.length(),
- nullptr, temp->mutable_data(1), nullptr);
- ColumnMemsetNulls(offset_within_row, rows, col, ctx, &temp16bit, 0xae);
-}
-
-void KeyEncoder::EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col,
- KeyEncoderContext* ctx, KeyColumnArray* temp) {
- if (IsInteger(col->metadata())) {
- EncoderInteger::Decode(start_row, num_rows, offset_within_row, rows, col, ctx, temp);
- } else {
- KeyColumnArray col_prep;
- if (EncoderInteger::UsesTransform(*col)) {
- col_prep = EncoderInteger::ArrayReplace(*col, *temp);
- } else {
- col_prep = *col;
- }
-
- bool is_row_fixed_length = rows.metadata().is_fixed_length;
-
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2()) {
- DecodeHelper_avx2(is_row_fixed_length, start_row, num_rows, offset_within_row, rows,
- col);
- } else {
-#endif
- if (is_row_fixed_length) {
- DecodeImp<true>(start_row, num_rows, offset_within_row, rows, col);
- } else {
- DecodeImp<false>(start_row, num_rows, offset_within_row, rows, col);
- }
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-
- if (EncoderInteger::UsesTransform(*col)) {
- EncoderInteger::PostDecode(col_prep, col, ctx);
- }
- }
-}
-
-template <bool is_row_fixed_length>
-void KeyEncoder::EncoderBinary::EncodeImp(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col) {
- EncodeDecodeHelper<is_row_fixed_length, true>(
- 0, static_cast<uint32_t>(col.length()), offset_within_row, rows, rows, &col,
- nullptr, [](uint8_t* dst, const uint8_t* src, int64_t length) {
- auto dst64 = reinterpret_cast<uint64_t*>(dst);
- auto src64 = reinterpret_cast<const uint64_t*>(src);
- uint32_t istripe;
- for (istripe = 0; istripe < length / 8; ++istripe) {
- dst64[istripe] = util::SafeLoad(src64 + istripe);
- }
- if ((length % 8) > 0) {
- uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length));
- dst64[istripe] = (dst64[istripe] & ~mask_last) |
- (util::SafeLoad(src64 + istripe) & mask_last);
- }
- });
-}
-
-template <bool is_row_fixed_length>
-void KeyEncoder::EncoderBinary::DecodeImp(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col) {
- EncodeDecodeHelper<is_row_fixed_length, false>(
- start_row, num_rows, offset_within_row, &rows, nullptr, col, col,
- [](uint8_t* dst, const uint8_t* src, int64_t length) {
- for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) {
- auto dst64 = reinterpret_cast<uint64_t*>(dst);
- auto src64 = reinterpret_cast<const uint64_t*>(src);
- util::SafeStore(dst64 + istripe, src64[istripe]);
- }
- });
-}
-
-void KeyEncoder::EncoderBinary::ColumnMemsetNulls(
- uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col,
- KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) {
- using ColumnMemsetNullsImp_t = void (*)(uint32_t, KeyRowArray*, const KeyColumnArray&,
- KeyEncoderContext*, KeyColumnArray*, uint8_t);
- static const ColumnMemsetNullsImp_t ColumnMemsetNullsImp_fn[] = {
- ColumnMemsetNullsImp<false, 1>, ColumnMemsetNullsImp<false, 2>,
- ColumnMemsetNullsImp<false, 4>, ColumnMemsetNullsImp<false, 8>,
- ColumnMemsetNullsImp<false, 16>, ColumnMemsetNullsImp<true, 1>,
- ColumnMemsetNullsImp<true, 2>, ColumnMemsetNullsImp<true, 4>,
- ColumnMemsetNullsImp<true, 8>, ColumnMemsetNullsImp<true, 16>};
- uint32_t col_width = col.metadata().fixed_length;
- int dispatch_const =
- (rows->metadata().is_fixed_length ? 5 : 0) +
- (col_width == 1 ? 0
- : col_width == 2 ? 1 : col_width == 4 ? 2 : col_width == 8 ? 3 : 4);
- ColumnMemsetNullsImp_fn[dispatch_const](offset_within_row, rows, col, ctx,
- temp_vector_16bit, byte_value);
-}
-
-template <bool is_row_fixed_length, uint32_t col_width>
-void KeyEncoder::EncoderBinary::ColumnMemsetNullsImp(
- uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col,
- KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) {
- // Nothing to do when there are no nulls
- if (!col.data(0)) {
- return;
- }
-
- const auto num_rows = static_cast<uint32_t>(col.length());
-
- // Temp vector needs space for the required number of rows
- DCHECK(temp_vector_16bit->length() >= num_rows);
- DCHECK(temp_vector_16bit->metadata().is_fixed_length &&
- temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t));
- auto temp_vector = reinterpret_cast<uint16_t*>(temp_vector_16bit->mutable_data(1));
-
- // Convert the null bit vector into an index vector of null positions
- int num_selected;
- util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, static_cast<int>(col.length()),
- col.data(0), &num_selected, temp_vector,
- col.bit_offset(0));
-
- for (int i = 0; i < num_selected; ++i) {
- uint32_t row_id = temp_vector[i];
-
- // Target binary field pointer
- uint8_t* dst;
- if (is_row_fixed_length) {
- dst = rows->mutable_data(1) + rows->metadata().fixed_length * row_id;
- } else {
- dst = rows->mutable_data(2) + rows->offsets()[row_id];
- }
- dst += offset_within_row;
-
- if (col_width == 1) {
- *dst = byte_value;
- } else if (col_width == 2) {
- *reinterpret_cast<uint16_t*>(dst) =
- (static_cast<uint16_t>(byte_value) * static_cast<uint16_t>(0x0101));
- } else if (col_width == 4) {
- *reinterpret_cast<uint32_t*>(dst) =
- (static_cast<uint32_t>(byte_value) * static_cast<uint32_t>(0x01010101));
- } else if (col_width == 8) {
- *reinterpret_cast<uint64_t*>(dst) =
- (static_cast<uint64_t>(byte_value) * 0x0101010101010101ULL);
- } else {
- uint64_t value = (static_cast<uint64_t>(byte_value) * 0x0101010101010101ULL);
- uint32_t col_width_actual = col.metadata().fixed_length;
- uint32_t j;
- for (j = 0; j < col_width_actual / 8; ++j) {
- reinterpret_cast<uint64_t*>(dst)[j] = value;
- }
- int tail = col_width_actual % 8;
- if (tail) {
- uint64_t mask = ~0ULL >> (8 * (8 - tail));
- reinterpret_cast<uint64_t*>(dst)[j] =
- (reinterpret_cast<const uint64_t*>(dst)[j] & ~mask) | (value & mask);
- }
- }
- }
-}
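The 2-, 4- and 8-byte branches above fill the field by multiplying the byte with a 0x01...01 constant, which replicates it into every byte lane of the word; this is how the null-fill byte 0xae passed in by EncoderBinary::Encode gets splatted. A one-line check:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Multiplying a byte by 0x0101010101010101 replicates it into all 8 bytes.
  uint8_t byte_value = 0xae;
  uint64_t splat = static_cast<uint64_t>(byte_value) * 0x0101010101010101ULL;
  assert(splat == 0xaeaeaeaeaeaeaeaeULL);
  return 0;
}
```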
-
-void KeyEncoder::EncoderBinaryPair::Encode(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col1,
- const KeyColumnArray& col2,
- KeyEncoderContext* ctx, KeyColumnArray* temp1,
- KeyColumnArray* temp2) {
- DCHECK(CanProcessPair(col1.metadata(), col2.metadata()));
-
- KeyColumnArray col_prep[2];
- if (EncoderInteger::UsesTransform(col1)) {
- col_prep[0] = EncoderInteger::ArrayReplace(col1, *temp1);
- EncoderInteger::PreEncode(col1, &(col_prep[0]), ctx);
- } else {
- col_prep[0] = col1;
- }
- if (EncoderInteger::UsesTransform(col2)) {
- col_prep[1] = EncoderInteger::ArrayReplace(col2, *temp2);
- EncoderInteger::PreEncode(col2, &(col_prep[1]), ctx);
- } else {
- col_prep[1] = col2;
- }
-
- uint32_t col_width1 = col_prep[0].metadata().fixed_length;
- uint32_t col_width2 = col_prep[1].metadata().fixed_length;
- int log_col_width1 =
- col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0;
- int log_col_width2 =
- col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0;
-
- bool is_row_fixed_length = rows->metadata().is_fixed_length;
-
- const auto num_rows = static_cast<uint32_t>(col1.length());
- uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2() && col_width1 == col_width2) {
- num_processed = EncodeHelper_avx2(is_row_fixed_length, col_width1, offset_within_row,
- rows, col_prep[0], col_prep[1]);
- }
-#endif
- if (num_processed < num_rows) {
- using EncodeImp_t = void (*)(uint32_t, uint32_t, KeyRowArray*, const KeyColumnArray&,
- const KeyColumnArray&);
- static const EncodeImp_t EncodeImp_fn[] = {
- EncodeImp<false, uint8_t, uint8_t>, EncodeImp<false, uint16_t, uint8_t>,
- EncodeImp<false, uint32_t, uint8_t>, EncodeImp<false, uint64_t, uint8_t>,
- EncodeImp<false, uint8_t, uint16_t>, EncodeImp<false, uint16_t, uint16_t>,
- EncodeImp<false, uint32_t, uint16_t>, EncodeImp<false, uint64_t, uint16_t>,
- EncodeImp<false, uint8_t, uint32_t>, EncodeImp<false, uint16_t, uint32_t>,
- EncodeImp<false, uint32_t, uint32_t>, EncodeImp<false, uint64_t, uint32_t>,
- EncodeImp<false, uint8_t, uint64_t>, EncodeImp<false, uint16_t, uint64_t>,
- EncodeImp<false, uint32_t, uint64_t>, EncodeImp<false, uint64_t, uint64_t>,
- EncodeImp<true, uint8_t, uint8_t>, EncodeImp<true, uint16_t, uint8_t>,
- EncodeImp<true, uint32_t, uint8_t>, EncodeImp<true, uint64_t, uint8_t>,
- EncodeImp<true, uint8_t, uint16_t>, EncodeImp<true, uint16_t, uint16_t>,
- EncodeImp<true, uint32_t, uint16_t>, EncodeImp<true, uint64_t, uint16_t>,
- EncodeImp<true, uint8_t, uint32_t>, EncodeImp<true, uint16_t, uint32_t>,
- EncodeImp<true, uint32_t, uint32_t>, EncodeImp<true, uint64_t, uint32_t>,
- EncodeImp<true, uint8_t, uint64_t>, EncodeImp<true, uint16_t, uint64_t>,
- EncodeImp<true, uint32_t, uint64_t>, EncodeImp<true, uint64_t, uint64_t>};
- int dispatch_const = (log_col_width2 << 2) | log_col_width1;
- dispatch_const += (is_row_fixed_length ? 16 : 0);
- EncodeImp_fn[dispatch_const](num_processed, offset_within_row, rows, col_prep[0],
- col_prep[1]);
- }
-}
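The 32-entry EncodeImp table is addressed by packing the log2 of each column width into a 2-bit field, (log_col_width2 << 2) | log_col_width1, and adding 16 when the rows are fixed-length. A standalone recomputation of two of those indexes (the function name is made up for illustration):

```cpp
#include <cassert>
#include <cstdint>

// Recomputes the EncodeImp dispatch index: two 2-bit width codes packed as
// (log_w2 << 2) | log_w1, plus 16 when rows are fixed-length.
int PairDispatchIndex(uint32_t w1, uint32_t w2, bool rows_fixed) {
  auto log2w = [](uint32_t w) { return w == 8 ? 3 : w == 4 ? 2 : w == 2 ? 1 : 0; };
  return (log2w(w2) << 2) | log2w(w1) | (rows_fixed ? 16 : 0);
}

int main() {
  assert(PairDispatchIndex(1, 1, false) == 0);  // EncodeImp<false, uint8_t, uint8_t>
  assert(PairDispatchIndex(8, 4, true) == 27);  // EncodeImp<true, uint64_t, uint32_t>
  return 0;
}
```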
-
-template <bool is_row_fixed_length, typename col1_type, typename col2_type>
-void KeyEncoder::EncoderBinaryPair::EncodeImp(uint32_t num_rows_to_skip,
- uint32_t offset_within_row,
- KeyRowArray* rows,
- const KeyColumnArray& col1,
- const KeyColumnArray& col2) {
- const uint8_t* src_A = col1.data(1);
- const uint8_t* src_B = col2.data(1);
-
- const auto num_rows = static_cast<uint32_t>(col1.length());
-
- uint32_t fixed_length = rows->metadata().fixed_length;
- const uint32_t* offsets;
- uint8_t* dst_base;
- if (is_row_fixed_length) {
- dst_base = rows->mutable_data(1) + offset_within_row;
- offsets = nullptr;
- } else {
- dst_base = rows->mutable_data(2) + offset_within_row;
- offsets = rows->offsets();
- }
-
- using col1_type_const = typename std::add_const<col1_type>::type;
- using col2_type_const = typename std::add_const<col2_type>::type;
-
- if (is_row_fixed_length) {
- uint8_t* dst = dst_base + num_rows_to_skip * fixed_length;
- for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
- *reinterpret_cast<col1_type*>(dst) = reinterpret_cast<col1_type_const*>(src_A)[i];
- *reinterpret_cast<col2_type*>(dst + sizeof(col1_type)) =
- reinterpret_cast<col2_type_const*>(src_B)[i];
- dst += fixed_length;
- }
- } else {
- for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
- uint8_t* dst = dst_base + offsets[i];
- *reinterpret_cast<col1_type*>(dst) = reinterpret_cast<col1_type_const*>(src_A)[i];
- *reinterpret_cast<col2_type*>(dst + sizeof(col1_type)) =
- reinterpret_cast<col2_type_const*>(src_B)[i];
- }
- }
-}
-
-void KeyEncoder::EncoderBinaryPair::Decode(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col1,
- KeyColumnArray* col2, KeyEncoderContext* ctx,
- KeyColumnArray* temp1, KeyColumnArray* temp2) {
- DCHECK(CanProcessPair(col1->metadata(), col2->metadata()));
-
- KeyColumnArray col_prep[2];
- if (EncoderInteger::UsesTransform(*col1)) {
- col_prep[0] = EncoderInteger::ArrayReplace(*col1, *temp1);
- } else {
- col_prep[0] = *col1;
- }
- if (EncoderInteger::UsesTransform(*col2)) {
- col_prep[1] = EncoderInteger::ArrayReplace(*col2, *temp2);
- } else {
- col_prep[1] = *col2;
- }
-
- uint32_t col_width1 = col_prep[0].metadata().fixed_length;
- uint32_t col_width2 = col_prep[1].metadata().fixed_length;
- int log_col_width1 =
- col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0;
- int log_col_width2 =
- col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0;
-
- bool is_row_fixed_length = rows.metadata().is_fixed_length;
-
- uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2() && col_width1 == col_width2) {
- num_processed =
- DecodeHelper_avx2(is_row_fixed_length, col_width1, start_row, num_rows,
- offset_within_row, rows, &col_prep[0], &col_prep[1]);
- }
-#endif
- if (num_processed < num_rows) {
- using DecodeImp_t = void (*)(uint32_t, uint32_t, uint32_t, uint32_t,
- const KeyRowArray&, KeyColumnArray*, KeyColumnArray*);
- static const DecodeImp_t DecodeImp_fn[] = {
- DecodeImp<false, uint8_t, uint8_t>, DecodeImp<false, uint16_t, uint8_t>,
- DecodeImp<false, uint32_t, uint8_t>, DecodeImp<false, uint64_t, uint8_t>,
- DecodeImp<false, uint8_t, uint16_t>, DecodeImp<false, uint16_t, uint16_t>,
- DecodeImp<false, uint32_t, uint16_t>, DecodeImp<false, uint64_t, uint16_t>,
- DecodeImp<false, uint8_t, uint32_t>, DecodeImp<false, uint16_t, uint32_t>,
- DecodeImp<false, uint32_t, uint32_t>, DecodeImp<false, uint64_t, uint32_t>,
- DecodeImp<false, uint8_t, uint64_t>, DecodeImp<false, uint16_t, uint64_t>,
- DecodeImp<false, uint32_t, uint64_t>, DecodeImp<false, uint64_t, uint64_t>,
- DecodeImp<true, uint8_t, uint8_t>, DecodeImp<true, uint16_t, uint8_t>,
- DecodeImp<true, uint32_t, uint8_t>, DecodeImp<true, uint64_t, uint8_t>,
- DecodeImp<true, uint8_t, uint16_t>, DecodeImp<true, uint16_t, uint16_t>,
- DecodeImp<true, uint32_t, uint16_t>, DecodeImp<true, uint64_t, uint16_t>,
- DecodeImp<true, uint8_t, uint32_t>, DecodeImp<true, uint16_t, uint32_t>,
- DecodeImp<true, uint32_t, uint32_t>, DecodeImp<true, uint64_t, uint32_t>,
- DecodeImp<true, uint8_t, uint64_t>, DecodeImp<true, uint16_t, uint64_t>,
- DecodeImp<true, uint32_t, uint64_t>, DecodeImp<true, uint64_t, uint64_t>};
- int dispatch_const =
- (log_col_width2 << 2) | log_col_width1 | (is_row_fixed_length ? 16 : 0);
- DecodeImp_fn[dispatch_const](num_processed, start_row, num_rows, offset_within_row,
- rows, &(col_prep[0]), &(col_prep[1]));
- }
-
- if (EncoderInteger::UsesTransform(*col1)) {
- EncoderInteger::PostDecode(col_prep[0], col1, ctx);
- }
- if (EncoderInteger::UsesTransform(*col2)) {
- EncoderInteger::PostDecode(col_prep[1], col2, ctx);
- }
-}
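-
-// The DecodeImp_fn table above is indexed with
-// (is_row_fixed_length ? 16 : 0) | (log_col_width2 << 2) | log_col_width1.
-// For example, a fixed-length row whose pair is a 4-byte column followed by an
-// 8-byte column selects entry 16 + (3 << 2) + 2 = 30, which is
-// DecodeImp<true, uint32_t, uint64_t>.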
-
-template <bool is_row_fixed_length, typename col1_type, typename col2_type>
-void KeyEncoder::EncoderBinaryPair::DecodeImp(uint32_t num_rows_to_skip,
- uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row,
- const KeyRowArray& rows,
- KeyColumnArray* col1,
- KeyColumnArray* col2) {
- DCHECK(rows.length() >= start_row + num_rows);
- DCHECK(col1->length() == num_rows && col2->length() == num_rows);
-
- uint8_t* dst_A = col1->mutable_data(1);
- uint8_t* dst_B = col2->mutable_data(1);
-
- uint32_t fixed_length = rows.metadata().fixed_length;
- const uint32_t* offsets;
- const uint8_t* src_base;
- if (is_row_fixed_length) {
- src_base = rows.data(1) + fixed_length * start_row + offset_within_row;
- offsets = nullptr;
- } else {
- src_base = rows.data(2) + offset_within_row;
- offsets = rows.offsets() + start_row;
- }
-
- using col1_type_const = typename std::add_const<col1_type>::type;
- using col2_type_const = typename std::add_const<col2_type>::type;
-
- if (is_row_fixed_length) {
- const uint8_t* src = src_base + num_rows_to_skip * fixed_length;
- for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
- reinterpret_cast<col1_type*>(dst_A)[i] = *reinterpret_cast<col1_type_const*>(src);
- reinterpret_cast<col2_type*>(dst_B)[i] =
- *reinterpret_cast<col2_type_const*>(src + sizeof(col1_type));
- src += fixed_length;
- }
- } else {
- for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
- const uint8_t* src = src_base + offsets[i];
- reinterpret_cast<col1_type*>(dst_A)[i] = *reinterpret_cast<col1_type_const*>(src);
- reinterpret_cast<col2_type*>(dst_B)[i] =
- *reinterpret_cast<col2_type_const*>(src + sizeof(col1_type));
- }
- }
-}
-
-void KeyEncoder::EncoderOffsets::Encode(KeyRowArray* rows,
- const std::vector<KeyColumnArray>& varbinary_cols,
- KeyEncoderContext* ctx) {
- DCHECK(!varbinary_cols.empty());
-
- // Rows and columns must all be varying-length
- DCHECK(!rows->metadata().is_fixed_length);
- for (const auto& col : varbinary_cols) {
- DCHECK(!col.metadata().is_fixed_length);
- }
-
- const auto num_rows = static_cast<uint32_t>(varbinary_cols[0].length());
-
- uint32_t num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- // Whether any column's non-nulls bit vector starts at a non-zero bit offset
- bool has_bit_offset = false;
-
- // The number of values in each column must exactly match the number of rows
- DCHECK(rows->length() == num_rows);
- for (const auto& col : varbinary_cols) {
- DCHECK(col.length() == num_rows);
- if (col.bit_offset(0) != 0) {
- has_bit_offset = true;
- }
- }
-
- if (ctx->has_avx2() && !has_bit_offset) {
- // Create a temp vector sized based on the number of columns
- auto temp_buffer_holder = util::TempVectorHolder<uint32_t>(
- ctx->stack, static_cast<uint32_t>(varbinary_cols.size()) * 8);
- auto temp_buffer_32B_per_col = KeyColumnArray(
- KeyColumnMetadata(true, sizeof(uint32_t)), varbinary_cols.size() * 8, nullptr,
- reinterpret_cast<uint8_t*>(temp_buffer_holder.mutable_data()), nullptr);
-
- num_processed = EncodeImp_avx2(rows, varbinary_cols, &temp_buffer_32B_per_col);
- }
-#endif
- if (num_processed < num_rows) {
- EncodeImp(num_processed, rows, varbinary_cols);
- }
-}
-
-void KeyEncoder::EncoderOffsets::EncodeImp(
- uint32_t num_rows_already_processed, KeyRowArray* rows,
- const std::vector<KeyColumnArray>& varbinary_cols) {
- DCHECK_GT(varbinary_cols.size(), 0);
-
- int row_alignment = rows->metadata().row_alignment;
- int string_alignment = rows->metadata().string_alignment;
-
- uint32_t* row_offsets = rows->mutable_offsets();
- uint8_t* row_values = rows->mutable_data(2);
- const auto num_rows = static_cast<uint32_t>(varbinary_cols[0].length());
-
- if (num_rows_already_processed == 0) {
- row_offsets[0] = 0;
- }
-
- uint32_t row_offset = row_offsets[num_rows_already_processed];
- for (uint32_t i = num_rows_already_processed; i < num_rows; ++i) {
- uint32_t* varbinary_end =
- rows->metadata().varbinary_end_array(row_values + row_offset);
-
- // Zero out lengths for nulls.
- // Add lengths of all columns to get row size.
- // Store varbinary field ends while summing their lengths.
-
- uint32_t offset_within_row = rows->metadata().fixed_length;
-
- for (size_t col = 0; col < varbinary_cols.size(); ++col) {
- const uint32_t* col_offsets = varbinary_cols[col].offsets();
- uint32_t col_length = col_offsets[i + 1] - col_offsets[i];
-
- const int bit_offset = varbinary_cols[col].bit_offset(0);
-
- const uint8_t* non_nulls = varbinary_cols[col].data(0);
- if (non_nulls && BitUtil::GetBit(non_nulls, bit_offset + i) == 0) {
- col_length = 0;
- }
-
- offset_within_row +=
- KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment);
- offset_within_row += col_length;
-
- varbinary_end[col] = offset_within_row;
- }
-
- offset_within_row +=
- KeyRowMetadata::padding_for_alignment(offset_within_row, row_alignment);
- row_offset += offset_within_row;
- row_offsets[i + 1] = row_offset;
- }
-}
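-
-// Typical layout produced for a varying-length row:
-//
-// [ fixed-length fields | varbinary_end[] | pad | varbinary 0 | pad | varbinary 1 | ... | row pad ]
-//
-// The varbinary_end[] array is counted as part of metadata().fixed_length, and
-// varbinary_end[k] holds the offset within the row just past the k-th varbinary
-// value, so EncodeImp above and Decode below recover lengths as differences.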
-
-void KeyEncoder::EncoderOffsets::Decode(
- uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
- std::vector<KeyColumnArray>* varbinary_cols,
- const std::vector<uint32_t>& varbinary_cols_base_offset, KeyEncoderContext* ctx) {
- DCHECK(!varbinary_cols->empty());
- DCHECK(varbinary_cols->size() == varbinary_cols_base_offset.size());
-
- DCHECK(!rows.metadata().is_fixed_length);
- DCHECK(rows.length() >= start_row + num_rows);
- for (const auto& col : *varbinary_cols) {
- // Rows and columns must all be varying-length
- DCHECK(!col.metadata().is_fixed_length);
- // Each column must have space for exactly the selected subset of rows
- DCHECK(col.length() == num_rows);
- }
-
- // Offsets of varbinary column values within each encoded row are stored
- // inside that row as an array of 32-bit integers.
- // The array immediately follows the fixed-length column data.
- // There is one element per varying-length column.
- // The Nth element is the offset within the row just past the end of the
- // Nth varbinary value, so individual lengths are recovered as differences.
-
- const uint32_t* row_offsets = rows.offsets() + start_row;
-
- // Set the base offset for each column
- for (size_t col = 0; col < varbinary_cols->size(); ++col) {
- uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets();
- col_offsets[0] = varbinary_cols_base_offset[col];
- }
-
- int string_alignment = rows.metadata().string_alignment;
-
- for (uint32_t i = 0; i < num_rows; ++i) {
- // Find the beginning of the cumulative lengths array for the next row
- const uint8_t* row = rows.data(2) + row_offsets[i];
- const uint32_t* varbinary_ends = rows.metadata().varbinary_end_array(row);
-
- // Update the offset of each column
- uint32_t offset_within_row = rows.metadata().fixed_length;
- for (size_t col = 0; col < varbinary_cols->size(); ++col) {
- offset_within_row +=
- KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment);
- uint32_t length = varbinary_ends[col] - offset_within_row;
- offset_within_row = varbinary_ends[col];
- uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets();
- col_offsets[i + 1] = col_offsets[i] + length;
- }
- }
-}
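-
-// Worked example for the offsets: let metadata().fixed_length = 8 (just the
-// two-element varbinary_end array), string_alignment = 4, and one row holding
-// varbinary values of 5 and 3 bytes. Encode stores varbinary_end = {13, 19}:
-// 8 + 5 = 13, padded to 16, then 16 + 3 = 19. Decode recovers the lengths as
-// 13 - 8 = 5 and 19 - 16 = 3.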
-
-void KeyEncoder::EncoderVarBinary::Encode(uint32_t varbinary_col_id, KeyRowArray* rows,
- const KeyColumnArray& col,
- KeyEncoderContext* ctx) {
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2()) {
- EncodeHelper_avx2(varbinary_col_id, rows, col);
- } else {
-#endif
- if (varbinary_col_id == 0) {
- EncodeImp<true>(varbinary_col_id, rows, col);
- } else {
- EncodeImp<false>(varbinary_col_id, rows, col);
- }
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-}
-
-void KeyEncoder::EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows,
- uint32_t varbinary_col_id,
- const KeyRowArray& rows, KeyColumnArray* col,
- KeyEncoderContext* ctx) {
- // The output column's varbinary buffer needs an extra 32B at the end for the
- // AVX2 version and an extra 8B otherwise.
-#if defined(ARROW_HAVE_AVX2)
- if (ctx->has_avx2()) {
- DecodeHelper_avx2(start_row, num_rows, varbinary_col_id, rows, col);
- } else {
-#endif
- if (varbinary_col_id == 0) {
- DecodeImp<true>(start_row, num_rows, varbinary_col_id, rows, col);
- } else {
- DecodeImp<false>(start_row, num_rows, varbinary_col_id, rows, col);
- }
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-}
-
-template <bool first_varbinary_col>
-void KeyEncoder::EncoderVarBinary::EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows,
- const KeyColumnArray& col) {
- EncodeDecodeHelper<first_varbinary_col, true>(
- 0, static_cast<uint32_t>(col.length()), varbinary_col_id, rows, rows, &col, nullptr,
- [](uint8_t* dst, const uint8_t* src, int64_t length) {
- auto dst64 = reinterpret_cast<uint64_t*>(dst);
- auto src64 = reinterpret_cast<const uint64_t*>(src);
- uint32_t istripe;
- for (istripe = 0; istripe < length / 8; ++istripe) {
- dst64[istripe] = util::SafeLoad(src64 + istripe);
- }
- if ((length % 8) > 0) {
- uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length));
- dst64[istripe] = (dst64[istripe] & ~mask_last) |
- (util::SafeLoad(src64 + istripe) & mask_last);
- }
- });
-}
-
-template <bool first_varbinary_col>
-void KeyEncoder::EncoderVarBinary::DecodeImp(uint32_t start_row, uint32_t num_rows,
- uint32_t varbinary_col_id,
- const KeyRowArray& rows,
- KeyColumnArray* col) {
- EncodeDecodeHelper<first_varbinary_col, false>(
- start_row, num_rows, varbinary_col_id, &rows, nullptr, col, col,
- [](uint8_t* dst, const uint8_t* src, int64_t length) {
- for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) {
- auto dst64 = reinterpret_cast<uint64_t*>(dst);
- auto src64 = reinterpret_cast<const uint64_t*>(src);
- util::SafeStore(dst64 + istripe, src64[istripe]);
- }
- });
-}
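-
-// Both helpers above copy in 8-byte stripes. EncodeImp masks the final stripe so
-// that destination bytes past the value keep their old contents: for length = 11
-// the tail mask is ~0ULL >> (8 * (8 * 2 - 11)), i.e. the low 3 bytes. DecodeImp
-// instead may overwrite up to 7 bytes past the value, which is why the output
-// buffer needs the extra slack described in Decode above.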
-
-void KeyEncoder::EncoderNulls::Encode(KeyRowArray* rows,
- const std::vector<KeyColumnArray>& cols,
- KeyEncoderContext* ctx,
- KeyColumnArray* temp_vector_16bit) {
- DCHECK_GT(cols.size(), 0);
- const auto num_rows = static_cast<uint32_t>(rows->length());
-
- // All input columns should have the same number of rows.
- // They may or may not have non-nulls bit-vectors allocated.
- for (const auto& col : cols) {
- DCHECK(col.length() == num_rows);
- }
-
- // Temp vector needs space for the required number of rows
- DCHECK(temp_vector_16bit->length() >= num_rows);
- DCHECK(temp_vector_16bit->metadata().is_fixed_length &&
- temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t));
-
- uint8_t* null_masks = rows->null_masks();
- uint32_t null_masks_bytes_per_row = rows->metadata().null_masks_bytes_per_row;
- memset(null_masks, 0, null_masks_bytes_per_row * num_rows);
- for (size_t col = 0; col < cols.size(); ++col) {
- const uint8_t* non_nulls = cols[col].data(0);
- if (!non_nulls) {
- continue;
- }
- int bit_offset = cols[col].bit_offset(0);
- DCHECK_LT(bit_offset, 8);
- int num_selected;
- util::BitUtil::bits_to_indexes(
- 0, ctx->hardware_flags, num_rows, non_nulls, &num_selected,
- reinterpret_cast<uint16_t*>(temp_vector_16bit->mutable_data(1)), bit_offset);
- for (int i = 0; i < num_selected; ++i) {
- uint16_t row_id = reinterpret_cast<const uint16_t*>(temp_vector_16bit->data(1))[i];
- int64_t null_masks_bit_id = row_id * null_masks_bytes_per_row * 8 + col;
- BitUtil::SetBit(null_masks, null_masks_bit_id);
- }
- }
-}
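-
-// Null-mask layout: each row owns null_masks_bytes_per_row bytes, and the bit
-// for column c of row r is bit (r * null_masks_bytes_per_row * 8 + c). A set
-// bit means null, the opposite convention of the columns' non-nulls bit vectors.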
-
-void KeyEncoder::EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows,
- const KeyRowArray& rows,
- std::vector<KeyColumnArray>* cols) {
- // Every output column needs space for exactly the required number of rows.
- // It also needs its non-nulls bit-vector allocated and mutable.
- DCHECK_GT(cols->size(), 0);
- for (auto& col : *cols) {
- DCHECK(col.length() == num_rows);
- DCHECK(col.mutable_data(0));
- }
-
- const uint8_t* null_masks = rows.null_masks();
- uint32_t null_masks_bytes_per_row = rows.metadata().null_masks_bytes_per_row;
- for (size_t col = 0; col < cols->size(); ++col) {
- uint8_t* non_nulls = (*cols)[col].mutable_data(0);
- const int bit_offset = (*cols)[col].bit_offset(0);
- DCHECK_LT(bit_offset, 8);
- non_nulls[0] |= 0xff << (bit_offset);
- if (bit_offset + num_rows > 8) {
- int bits_in_first_byte = 8 - bit_offset;
- memset(non_nulls + 1, 0xff, BitUtil::BytesForBits(num_rows - bits_in_first_byte));
- }
- for (uint32_t row = 0; row < num_rows; ++row) {
- uint32_t null_masks_bit_id =
- (start_row + row) * null_masks_bytes_per_row * 8 + static_cast<uint32_t>(col);
- bool is_set = BitUtil::GetBit(null_masks, null_masks_bit_id);
- if (is_set) {
- BitUtil::ClearBit(non_nulls, bit_offset + row);
- }
- }
- }
-}
-
-uint32_t KeyEncoder::KeyRowMetadata::num_varbinary_cols() const {
- uint32_t result = 0;
- for (auto column_metadata : column_metadatas) {
- if (!column_metadata.is_fixed_length) {
- ++result;
- }
- }
- return result;
-}
-
-bool KeyEncoder::KeyRowMetadata::is_compatible(const KeyRowMetadata& other) const {
- if (other.num_cols() != num_cols()) {
- return false;
- }
- if (row_alignment != other.row_alignment ||
- string_alignment != other.string_alignment) {
- return false;
- }
- for (size_t i = 0; i < column_metadatas.size(); ++i) {
- if (column_metadatas[i].is_fixed_length !=
- other.column_metadatas[i].is_fixed_length) {
- return false;
- }
- if (column_metadatas[i].fixed_length != other.column_metadatas[i].fixed_length) {
- return false;
- }
- }
- return true;
-}
-
-void KeyEncoder::KeyRowMetadata::FromColumnMetadataVector(
- const std::vector<KeyColumnMetadata>& cols, int in_row_alignment,
- int in_string_alignment) {
- column_metadatas.resize(cols.size());
- for (size_t i = 0; i < cols.size(); ++i) {
- column_metadatas[i] = cols[i];
- }
-
- const auto num_cols = static_cast<uint32_t>(cols.size());
-
- // Sort columns.
- // Columns are sorted based on the size in bytes of their fixed-length part.
- // For the varying-length column, the fixed-length part is the 32-bit field storing
- // cumulative length of varying-length fields.
- // The rules are:
- // a) A boolean column, marked with fixed-length 0, is considered to have a
- // fixed-length part of 1 byte.
- // b) Columns whose fixed-length part is a power of 2 or a multiple of the row
- // alignment precede other columns; among themselves they are ordered by the
- // size of the fixed-length part, widest first.
- // c) Fixed-length columns precede varying-length columns when both have a
- // fixed-length part of the same size.
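- // For example, fixed-length columns of widths (1, 8, 3, 4) bytes are ordered
- // as (8, 4, 1, 3): power-of-two widths come first, widest to narrowest, and
- // the 3-byte column keeps its original relative position at the end.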
- column_order.resize(num_cols);
- for (uint32_t i = 0; i < num_cols; ++i) {
- column_order[i] = i;
- }
- std::sort(
- column_order.begin(), column_order.end(), [&cols](uint32_t left, uint32_t right) {
- bool is_left_pow2 =
- !cols[left].is_fixed_length || ARROW_POPCOUNT64(cols[left].fixed_length) <= 1;
- bool is_right_pow2 = !cols[right].is_fixed_length ||
- ARROW_POPCOUNT64(cols[right].fixed_length) <= 1;
- bool is_left_fixedlen = cols[left].is_fixed_length;
- bool is_right_fixedlen = cols[right].is_fixed_length;
- uint32_t width_left =
- cols[left].is_fixed_length ? cols[left].fixed_length : sizeof(uint32_t);
- uint32_t width_right =
- cols[right].is_fixed_length ? cols[right].fixed_length : sizeof(uint32_t);
- if (is_left_pow2 != is_right_pow2) {
- return is_left_pow2;
- }
- if (!is_left_pow2) {
- return left < right;
- }
- if (width_left != width_right) {
- return width_left > width_right;
- }
- if (is_left_fixedlen != is_right_fixedlen) {
- return is_left_fixedlen;
- }
- return left < right;
- });
-
- row_alignment = in_row_alignment;
- string_alignment = in_string_alignment;
- varbinary_end_array_offset = 0;
-
- column_offsets.resize(num_cols);
- uint32_t num_varbinary_cols = 0;
- uint32_t offset_within_row = 0;
- for (uint32_t i = 0; i < num_cols; ++i) {
- const KeyColumnMetadata& col = cols[column_order[i]];
- offset_within_row +=
- KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment, col);
- column_offsets[i] = offset_within_row;
- if (!col.is_fixed_length) {
- if (num_varbinary_cols == 0) {
- varbinary_end_array_offset = offset_within_row;
- }
- DCHECK(column_offsets[i] - varbinary_end_array_offset ==
- num_varbinary_cols * sizeof(uint32_t));
- ++num_varbinary_cols;
- offset_within_row += sizeof(uint32_t);
- } else {
- // A boolean column is a bit-vector, which is indicated by a fixed length
- // of zero in the column metadata.
- // It is stored as one byte per value in the output row.
- if (col.fixed_length == 0) {
- offset_within_row += 1;
- } else {
- offset_within_row += col.fixed_length;
- }
- }
- }
-
- is_fixed_length = (num_varbinary_cols == 0);
- fixed_length =
- offset_within_row +
- KeyRowMetadata::padding_for_alignment(
- offset_within_row, num_varbinary_cols == 0 ? row_alignment : string_alignment);
-
- // We set the number of bytes per row used to store the null masks of the
- // individual key columns to a power of two. This is not required; it could
- // also be the minimal number of bytes that holds one bit per column (a worked
- // example follows this function).
- null_masks_bytes_per_row = 1;
- while (static_cast<uint32_t>(null_masks_bytes_per_row * 8) < num_cols) {
- null_masks_bytes_per_row *= 2;
- }
-}
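-
-// Example for the null-mask sizing above: 20 key columns need
-// ceil(20 / 8) = 3 bytes of null-mask bits, which the loop rounds up to the
-// next power of two, giving null_masks_bytes_per_row = 4.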
-
-void KeyEncoder::Init(const std::vector<KeyColumnMetadata>& cols, KeyEncoderContext* ctx,
- int row_alignment, int string_alignment) {
- ctx_ = ctx;
- row_metadata_.FromColumnMetadataVector(cols, row_alignment, string_alignment);
- uint32_t num_cols = row_metadata_.num_cols();
- uint32_t num_varbinary_cols = row_metadata_.num_varbinary_cols();
- batch_all_cols_.resize(num_cols);
- batch_varbinary_cols_.resize(num_varbinary_cols);
- batch_varbinary_cols_base_offsets_.resize(num_varbinary_cols);
-}
-
-void KeyEncoder::PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows,
- const std::vector<KeyColumnArray>& cols_in) {
- const auto num_cols = static_cast<uint32_t>(cols_in.size());
- DCHECK(batch_all_cols_.size() == num_cols);
-
- uint32_t num_varbinary_visited = 0;
- for (uint32_t i = 0; i < num_cols; ++i) {
- const KeyColumnArray& col = cols_in[row_metadata_.column_order[i]];
- KeyColumnArray col_window(col, start_row, num_rows);
- batch_all_cols_[i] = col_window;
- if (!col.metadata().is_fixed_length) {
- DCHECK(num_varbinary_visited < batch_varbinary_cols_.size());
- // If the start row is zero, then the base offset of the varbinary column is also zero.
- if (start_row == 0) {
- batch_varbinary_cols_base_offsets_[num_varbinary_visited] = 0;
- } else {
- batch_varbinary_cols_base_offsets_[num_varbinary_visited] =
- col.offsets()[start_row];
- }
- batch_varbinary_cols_[num_varbinary_visited++] = col_window;
- }
- }
-}
-
-Status KeyEncoder::PrepareOutputForEncode(int64_t start_row, int64_t num_rows,
- KeyRowArray* rows,
- const std::vector<KeyColumnArray>& all_cols) {
- int64_t num_bytes_required = 0;
-
- int64_t fixed_part = row_metadata_.fixed_length * num_rows;
- int64_t var_part = 0;
- for (const auto& col : all_cols) {
- if (!col.metadata().is_fixed_length) {
- DCHECK(col.length() >= start_row + num_rows);
- const uint32_t* offsets = col.offsets();
- var_part += offsets[start_row + num_rows] - offsets[start_row];
- // Include maximum padding that can be added to align the start of varbinary fields.
- var_part += num_rows * row_metadata_.string_alignment;
- }
- }
- // Include maximum padding that can be added to align the start of the rows.
- if (!row_metadata_.is_fixed_length) {
- fixed_part += row_metadata_.row_alignment * num_rows;
- }
- num_bytes_required = fixed_part + var_part;
-
- rows->Clean();
- RETURN_NOT_OK(rows->AppendEmpty(static_cast<uint32_t>(num_rows),
- static_cast<uint32_t>(num_bytes_required)));
-
- return Status::OK();
-}
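-
-// The byte count computed above is a deliberate upper bound: every row is
-// charged the maximum possible row-alignment padding and every varbinary value
-// the maximum possible string-alignment padding, so the single AppendEmpty
-// reservation is always sufficient.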
-
-void KeyEncoder::Encode(int64_t start_row, int64_t num_rows, KeyRowArray* rows,
- const std::vector<KeyColumnArray>& cols) {
- // Prepare column array vectors
- PrepareKeyColumnArrays(start_row, num_rows, cols);
-
- // Create two temp vectors with 16-bit elements
- auto temp_buffer_holder_A =
- util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
- auto temp_buffer_A = KeyColumnArray(
- KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
- reinterpret_cast<uint8_t*>(temp_buffer_holder_A.mutable_data()), nullptr);
- auto temp_buffer_holder_B =
- util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
- auto temp_buffer_B = KeyColumnArray(
- KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
- reinterpret_cast<uint8_t*>(temp_buffer_holder_B.mutable_data()), nullptr);
-
- bool is_row_fixed_length = row_metadata_.is_fixed_length;
- if (!is_row_fixed_length) {
- // This call will generate and fill in data for both:
- // - offsets to the entire encoded arrays
- // - offsets for individual varbinary fields within each row
- EncoderOffsets::Encode(rows, batch_varbinary_cols_, ctx_);
-
- for (size_t i = 0; i < batch_varbinary_cols_.size(); ++i) {
- // Memcpy each varbinary field into the position in the output row buffer
- // precomputed in the previous step.
- EncoderVarBinary::Encode(static_cast<uint32_t>(i), rows, batch_varbinary_cols_[i],
- ctx_);
- }
- }
-
- // Process fixed length columns
- const auto num_cols = static_cast<uint32_t>(batch_all_cols_.size());
- for (uint32_t i = 0; i < num_cols;) {
- if (!batch_all_cols_[i].metadata().is_fixed_length) {
- i += 1;
- continue;
- }
- bool can_process_pair =
- (i + 1 < num_cols) && batch_all_cols_[i + 1].metadata().is_fixed_length &&
- EncoderBinaryPair::CanProcessPair(batch_all_cols_[i].metadata(),
- batch_all_cols_[i + 1].metadata());
- if (!can_process_pair) {
- EncoderBinary::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i],
- ctx_, &temp_buffer_A);
- i += 1;
- } else {
- EncoderBinaryPair::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i],
- batch_all_cols_[i + 1], ctx_, &temp_buffer_A,
- &temp_buffer_B);
- i += 2;
- }
- }
-
- // Process nulls
- EncoderNulls::Encode(rows, batch_all_cols_, ctx_, &temp_buffer_A);
-}
-
-void KeyEncoder::DecodeFixedLengthBuffers(int64_t start_row_input,
- int64_t start_row_output, int64_t num_rows,
- const KeyRowArray& rows,
- std::vector<KeyColumnArray>* cols) {
- // Prepare column array vectors
- PrepareKeyColumnArrays(start_row_output, num_rows, *cols);
-
- // Create two temp vectors with 16-bit elements
- auto temp_buffer_holder_A =
- util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
- auto temp_buffer_A = KeyColumnArray(
- KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
- reinterpret_cast<uint8_t*>(temp_buffer_holder_A.mutable_data()), nullptr);
- auto temp_buffer_holder_B =
- util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
- auto temp_buffer_B = KeyColumnArray(
- KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
- reinterpret_cast<uint8_t*>(temp_buffer_holder_B.mutable_data()), nullptr);
-
- bool is_row_fixed_length = row_metadata_.is_fixed_length;
- if (!is_row_fixed_length) {
- EncoderOffsets::Decode(static_cast<uint32_t>(start_row_input),
- static_cast<uint32_t>(num_rows), rows, &batch_varbinary_cols_,
- batch_varbinary_cols_base_offsets_, ctx_);
- }
-
- // Process fixed length columns
- const auto num_cols = static_cast<uint32_t>(batch_all_cols_.size());
- for (uint32_t i = 0; i < num_cols;) {
- if (!batch_all_cols_[i].metadata().is_fixed_length) {
- i += 1;
- continue;
- }
- bool can_process_pair =
- (i + 1 < num_cols) && batch_all_cols_[i + 1].metadata().is_fixed_length &&
- EncoderBinaryPair::CanProcessPair(batch_all_cols_[i].metadata(),
- batch_all_cols_[i + 1].metadata());
- if (!can_process_pair) {
- EncoderBinary::Decode(static_cast<uint32_t>(start_row_input),
- static_cast<uint32_t>(num_rows),
- row_metadata_.column_offsets[i], rows, &batch_all_cols_[i],
- ctx_, &temp_buffer_A);
- i += 1;
- } else {
- EncoderBinaryPair::Decode(
- static_cast<uint32_t>(start_row_input), static_cast<uint32_t>(num_rows),
- row_metadata_.column_offsets[i], rows, &batch_all_cols_[i],
- &batch_all_cols_[i + 1], ctx_, &temp_buffer_A, &temp_buffer_B);
- i += 2;
- }
- }
-
- // Process nulls
- EncoderNulls::Decode(static_cast<uint32_t>(start_row_input),
- static_cast<uint32_t>(num_rows), rows, &batch_all_cols_);
-}
-
-void KeyEncoder::DecodeVaryingLengthBuffers(int64_t start_row_input,
- int64_t start_row_output, int64_t num_rows,
- const KeyRowArray& rows,
- std::vector<KeyColumnArray>* cols) {
- // Prepare column array vectors
- PrepareKeyColumnArrays(start_row_output, num_rows, *cols);
-
- bool is_row_fixed_length = row_metadata_.is_fixed_length;
- if (!is_row_fixed_length) {
- for (size_t i = 0; i < batch_varbinary_cols_.size(); ++i) {
- // Memcpy each varbinary field out of the row buffer into the output column,
- // using the column offsets precomputed during fixed-length decoding.
- EncoderVarBinary::Decode(static_cast<uint32_t>(start_row_input),
- static_cast<uint32_t>(num_rows), static_cast<uint32_t>(i),
- rows, &batch_varbinary_cols_[i], ctx_);
- }
- }
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/key_encode.h"
+
+#include <cstring>
+
+#include <algorithm>
+
+#include "arrow/compute/exec/util.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace compute {
+
+KeyEncoder::KeyRowArray::KeyRowArray()
+ : pool_(nullptr), rows_capacity_(0), bytes_capacity_(0) {}
+
+Status KeyEncoder::KeyRowArray::Init(MemoryPool* pool, const KeyRowMetadata& metadata) {
+ pool_ = pool;
+ metadata_ = metadata;
+
+ DCHECK(!null_masks_ && !offsets_ && !rows_);
+
+ constexpr int64_t rows_capacity = 8;
+ constexpr int64_t bytes_capacity = 1024;
+
+ // Null masks
+ ARROW_ASSIGN_OR_RAISE(auto null_masks,
+ AllocateResizableBuffer(size_null_masks(rows_capacity), pool_));
+ null_masks_ = std::move(null_masks);
+ memset(null_masks_->mutable_data(), 0, size_null_masks(rows_capacity));
+
+ // Offsets and rows
+ if (!metadata.is_fixed_length) {
+ ARROW_ASSIGN_OR_RAISE(auto offsets,
+ AllocateResizableBuffer(size_offsets(rows_capacity), pool_));
+ offsets_ = std::move(offsets);
+ memset(offsets_->mutable_data(), 0, size_offsets(rows_capacity));
+ reinterpret_cast<uint32_t*>(offsets_->mutable_data())[0] = 0;
+
+ ARROW_ASSIGN_OR_RAISE(
+ auto rows,
+ AllocateResizableBuffer(size_rows_varying_length(bytes_capacity), pool_));
+ rows_ = std::move(rows);
+ memset(rows_->mutable_data(), 0, size_rows_varying_length(bytes_capacity));
+ bytes_capacity_ = size_rows_varying_length(bytes_capacity) - padding_for_vectors;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ auto rows, AllocateResizableBuffer(size_rows_fixed_length(rows_capacity), pool_));
+ rows_ = std::move(rows);
+ memset(rows_->mutable_data(), 0, size_rows_fixed_length(rows_capacity));
+ bytes_capacity_ = size_rows_fixed_length(rows_capacity) - padding_for_vectors;
+ }
+
+ update_buffer_pointers();
+
+ rows_capacity_ = rows_capacity;
+
+ num_rows_ = 0;
+ num_rows_for_has_any_nulls_ = 0;
+ has_any_nulls_ = false;
+
+ return Status::OK();
+}
+
+void KeyEncoder::KeyRowArray::Clean() {
+ num_rows_ = 0;
+ num_rows_for_has_any_nulls_ = 0;
+ has_any_nulls_ = false;
+
+ if (!metadata_.is_fixed_length) {
+ reinterpret_cast<uint32_t*>(offsets_->mutable_data())[0] = 0;
+ }
+}
+
+int64_t KeyEncoder::KeyRowArray::size_null_masks(int64_t num_rows) {
+ return num_rows * metadata_.null_masks_bytes_per_row + padding_for_vectors;
+}
+
+int64_t KeyEncoder::KeyRowArray::size_offsets(int64_t num_rows) {
+ return (num_rows + 1) * sizeof(uint32_t) + padding_for_vectors;
+}
+
+int64_t KeyEncoder::KeyRowArray::size_rows_fixed_length(int64_t num_rows) {
+ return num_rows * metadata_.fixed_length + padding_for_vectors;
+}
+
+int64_t KeyEncoder::KeyRowArray::size_rows_varying_length(int64_t num_bytes) {
+ return num_bytes + padding_for_vectors;
+}
+
+void KeyEncoder::KeyRowArray::update_buffer_pointers() {
+ buffers_[0] = mutable_buffers_[0] = null_masks_->mutable_data();
+ if (metadata_.is_fixed_length) {
+ buffers_[1] = mutable_buffers_[1] = rows_->mutable_data();
+ buffers_[2] = mutable_buffers_[2] = nullptr;
+ } else {
+ buffers_[1] = mutable_buffers_[1] = offsets_->mutable_data();
+ buffers_[2] = mutable_buffers_[2] = rows_->mutable_data();
+ }
+}
+
+Status KeyEncoder::KeyRowArray::ResizeFixedLengthBuffers(int64_t num_extra_rows) {
+ if (rows_capacity_ >= num_rows_ + num_extra_rows) {
+ return Status::OK();
+ }
+
+ int64_t rows_capacity_new = std::max(static_cast<int64_t>(1), 2 * rows_capacity_);
+ while (rows_capacity_new < num_rows_ + num_extra_rows) {
+ rows_capacity_new *= 2;
+ }
+
+ // Null masks
+ RETURN_NOT_OK(null_masks_->Resize(size_null_masks(rows_capacity_new), false));
+ memset(null_masks_->mutable_data() + size_null_masks(rows_capacity_), 0,
+ size_null_masks(rows_capacity_new) - size_null_masks(rows_capacity_));
+
+ // Either offsets or rows
+ if (!metadata_.is_fixed_length) {
+ RETURN_NOT_OK(offsets_->Resize(size_offsets(rows_capacity_new), false));
+ memset(offsets_->mutable_data() + size_offsets(rows_capacity_), 0,
+ size_offsets(rows_capacity_new) - size_offsets(rows_capacity_));
+ } else {
+ RETURN_NOT_OK(rows_->Resize(size_rows_fixed_length(rows_capacity_new), false));
+ memset(rows_->mutable_data() + size_rows_fixed_length(rows_capacity_), 0,
+ size_rows_fixed_length(rows_capacity_new) -
+ size_rows_fixed_length(rows_capacity_));
+ bytes_capacity_ = size_rows_fixed_length(rows_capacity_new) - padding_for_vectors;
+ }
+
+ update_buffer_pointers();
+
+ rows_capacity_ = rows_capacity_new;
+
+ return Status::OK();
+}
+
+Status KeyEncoder::KeyRowArray::ResizeOptionalVaryingLengthBuffer(
+ int64_t num_extra_bytes) {
+ int64_t num_bytes = offsets()[num_rows_];
+ if (bytes_capacity_ >= num_bytes + num_extra_bytes || metadata_.is_fixed_length) {
+ return Status::OK();
+ }
+
+ int64_t bytes_capacity_new = std::max(static_cast<int64_t>(1), 2 * bytes_capacity_);
+ while (bytes_capacity_new < num_bytes + num_extra_bytes) {
+ bytes_capacity_new *= 2;
+ }
+
+ RETURN_NOT_OK(rows_->Resize(size_rows_varying_length(bytes_capacity_new), false));
+ memset(rows_->mutable_data() + size_rows_varying_length(bytes_capacity_), 0,
+ size_rows_varying_length(bytes_capacity_new) -
+ size_rows_varying_length(bytes_capacity_));
+
+ update_buffer_pointers();
+
+ bytes_capacity_ = bytes_capacity_new;
+
+ return Status::OK();
+}
+
+Status KeyEncoder::KeyRowArray::AppendSelectionFrom(const KeyRowArray& from,
+ uint32_t num_rows_to_append,
+ const uint16_t* source_row_ids) {
+ DCHECK(metadata_.is_compatible(from.metadata()));
+
+ RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append));
+
+ if (!metadata_.is_fixed_length) {
+ // Varying-length rows
+ auto from_offsets = reinterpret_cast<const uint32_t*>(from.offsets_->data());
+ auto to_offsets = reinterpret_cast<uint32_t*>(offsets_->mutable_data());
+ uint32_t total_length = to_offsets[num_rows_];
+ uint32_t total_length_to_append = 0;
+ for (uint32_t i = 0; i < num_rows_to_append; ++i) {
+ uint16_t row_id = source_row_ids[i];
+ uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id];
+ total_length_to_append += length;
+ to_offsets[num_rows_ + i + 1] = total_length + total_length_to_append;
+ }
+
+ RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(total_length_to_append));
+
+ const uint8_t* src = from.rows_->data();
+ uint8_t* dst = rows_->mutable_data() + total_length;
+ for (uint32_t i = 0; i < num_rows_to_append; ++i) {
+ uint16_t row_id = source_row_ids[i];
+ uint32_t length = from_offsets[row_id + 1] - from_offsets[row_id];
+ auto src64 = reinterpret_cast<const uint64_t*>(src + from_offsets[row_id]);
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ for (uint32_t j = 0; j < (length + 7) / 8; ++j) {
+ dst64[j] = src64[j];
+ }
+ dst += length;
+ }
+ } else {
+ // Fixed-length rows
+ const uint8_t* src = from.rows_->data();
+ uint8_t* dst = rows_->mutable_data() + num_rows_ * metadata_.fixed_length;
+ for (uint32_t i = 0; i < num_rows_to_append; ++i) {
+ uint16_t row_id = source_row_ids[i];
+ uint32_t length = metadata_.fixed_length;
+ auto src64 = reinterpret_cast<const uint64_t*>(src + length * row_id);
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ for (uint32_t j = 0; j < (length + 7) / 8; ++j) {
+ dst64[j] = src64[j];
+ }
+ dst += length;
+ }
+ }
+
+ // Null masks
+ uint32_t byte_length = metadata_.null_masks_bytes_per_row;
+ uint64_t dst_byte_offset = num_rows_ * byte_length;
+ const uint8_t* src_base = from.null_masks_->data();
+ uint8_t* dst_base = null_masks_->mutable_data();
+ for (uint32_t i = 0; i < num_rows_to_append; ++i) {
+ uint32_t row_id = source_row_ids[i];
+ int64_t src_byte_offset = row_id * byte_length;
+ const uint8_t* src = src_base + src_byte_offset;
+ uint8_t* dst = dst_base + dst_byte_offset;
+ for (uint32_t ibyte = 0; ibyte < byte_length; ++ibyte) {
+ dst[ibyte] = src[ibyte];
+ }
+ dst_byte_offset += byte_length;
+ }
+
+ num_rows_ += num_rows_to_append;
+
+ return Status::OK();
+}
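+
+// Note that the 8-byte striped copies above may write up to 7 bytes past the
+// last appended row. This is safe because every buffer is allocated with
+// padding_for_vectors extra bytes at the end (see the size_* helpers above).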
+
+Status KeyEncoder::KeyRowArray::AppendEmpty(uint32_t num_rows_to_append,
+ uint32_t num_extra_bytes_to_append) {
+ RETURN_NOT_OK(ResizeFixedLengthBuffers(num_rows_to_append));
+ RETURN_NOT_OK(ResizeOptionalVaryingLengthBuffer(num_extra_bytes_to_append));
+ num_rows_ += num_rows_to_append;
+ if (metadata_.row_alignment > 1 || metadata_.string_alignment > 1) {
+ memset(rows_->mutable_data(), 0, bytes_capacity_);
+ }
+ return Status::OK();
+}
+
+bool KeyEncoder::KeyRowArray::has_any_nulls(const KeyEncoderContext* ctx) const {
+ if (has_any_nulls_) {
+ return true;
+ }
+ if (num_rows_for_has_any_nulls_ < num_rows_) {
+ auto size_per_row = metadata().null_masks_bytes_per_row;
+ has_any_nulls_ = !util::BitUtil::are_all_bytes_zero(
+ ctx->hardware_flags, null_masks() + size_per_row * num_rows_for_has_any_nulls_,
+ static_cast<uint32_t>(size_per_row * (num_rows_ - num_rows_for_has_any_nulls_)));
+ num_rows_for_has_any_nulls_ = num_rows_;
+ }
+ return has_any_nulls_;
+}
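+
+// has_any_nulls() is incremental: it scans only the null-mask bytes of rows
+// appended since the previous call and caches both the scan position and the
+// answer, so repeated calls on an unchanged array are O(1).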
+
+KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
+ const KeyColumnArray& left,
+ const KeyColumnArray& right,
+ int buffer_id_to_replace) {
+ metadata_ = metadata;
+ length_ = left.length();
+ for (int i = 0; i < max_buffers_; ++i) {
+ buffers_[i] = left.buffers_[i];
+ mutable_buffers_[i] = left.mutable_buffers_[i];
+ }
+ buffers_[buffer_id_to_replace] = right.buffers_[buffer_id_to_replace];
+ mutable_buffers_[buffer_id_to_replace] = right.mutable_buffers_[buffer_id_to_replace];
+ bit_offset_[0] = left.bit_offset_[0];
+ bit_offset_[1] = left.bit_offset_[1];
+ if (buffer_id_to_replace < max_buffers_ - 1) {
+ bit_offset_[buffer_id_to_replace] = right.bit_offset_[buffer_id_to_replace];
+ }
+}
+
+KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
+ int64_t length, const uint8_t* buffer0,
+ const uint8_t* buffer1, const uint8_t* buffer2,
+ int bit_offset0, int bit_offset1) {
+ metadata_ = metadata;
+ length_ = length;
+ buffers_[0] = buffer0;
+ buffers_[1] = buffer1;
+ buffers_[2] = buffer2;
+ mutable_buffers_[0] = mutable_buffers_[1] = mutable_buffers_[2] = nullptr;
+ bit_offset_[0] = bit_offset0;
+ bit_offset_[1] = bit_offset1;
+}
+
+KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnMetadata& metadata,
+ int64_t length, uint8_t* buffer0,
+ uint8_t* buffer1, uint8_t* buffer2,
+ int bit_offset0, int bit_offset1) {
+ metadata_ = metadata;
+ length_ = length;
+ buffers_[0] = mutable_buffers_[0] = buffer0;
+ buffers_[1] = mutable_buffers_[1] = buffer1;
+ buffers_[2] = mutable_buffers_[2] = buffer2;
+ bit_offset_[0] = bit_offset0;
+ bit_offset_[1] = bit_offset1;
+}
+
+KeyEncoder::KeyColumnArray::KeyColumnArray(const KeyColumnArray& from, int64_t start,
+ int64_t length) {
+ metadata_ = from.metadata_;
+ length_ = length;
+ uint32_t fixed_size =
+ !metadata_.is_fixed_length ? sizeof(uint32_t) : metadata_.fixed_length;
+
+ buffers_[0] =
+ from.buffers_[0] ? from.buffers_[0] + (from.bit_offset_[0] + start) / 8 : nullptr;
+ mutable_buffers_[0] = from.mutable_buffers_[0]
+ ? from.mutable_buffers_[0] + (from.bit_offset_[0] + start) / 8
+ : nullptr;
+ bit_offset_[0] = (from.bit_offset_[0] + start) % 8;
+
+ if (fixed_size == 0) {
+ buffers_[1] =
+ from.buffers_[1] ? from.buffers_[1] + (from.bit_offset_[1] + start) / 8 : nullptr;
+ mutable_buffers_[1] = from.mutable_buffers_[1] ? from.mutable_buffers_[1] +
+ (from.bit_offset_[1] + start) / 8
+ : nullptr;
+ bit_offset_[1] = (from.bit_offset_[1] + start) % 8;
+ } else {
+ buffers_[1] = from.buffers_[1] ? from.buffers_[1] + start * fixed_size : nullptr;
+ mutable_buffers_[1] = from.mutable_buffers_[1]
+ ? from.mutable_buffers_[1] + start * fixed_size
+ : nullptr;
+ bit_offset_[1] = 0;
+ }
+
+ buffers_[2] = from.buffers_[2];
+ mutable_buffers_[2] = from.mutable_buffers_[2];
+}
+
+KeyEncoder::KeyColumnArray KeyEncoder::TransformBoolean::ArrayReplace(
+ const KeyColumnArray& column, const KeyColumnArray& temp) {
+ // Make sure that the temp buffer is large enough
+ DCHECK(temp.length() >= column.length() && temp.metadata().is_fixed_length &&
+ temp.metadata().fixed_length >= sizeof(uint8_t));
+ KeyColumnMetadata metadata;
+ metadata.is_fixed_length = true;
+ metadata.fixed_length = sizeof(uint8_t);
+ constexpr int buffer_index = 1;
+ KeyColumnArray result = KeyColumnArray(metadata, column, temp, buffer_index);
+ return result;
+}
+
+void KeyEncoder::TransformBoolean::PreEncode(const KeyColumnArray& input,
+ KeyColumnArray* output,
+ KeyEncoderContext* ctx) {
+ // Make sure that metadata and lengths are compatible.
+ DCHECK(output->metadata().is_fixed_length == input.metadata().is_fixed_length);
+ DCHECK(output->metadata().fixed_length == 1 && input.metadata().fixed_length == 0);
+ DCHECK(output->length() == input.length());
+ constexpr int buffer_index = 1;
+ DCHECK(input.data(buffer_index) != nullptr);
+ DCHECK(output->mutable_data(buffer_index) != nullptr);
+ util::BitUtil::bits_to_bytes(
+ ctx->hardware_flags, static_cast<int>(input.length()), input.data(buffer_index),
+ output->mutable_data(buffer_index), input.bit_offset(buffer_index));
+}
+
+void KeyEncoder::TransformBoolean::PostDecode(const KeyColumnArray& input,
+ KeyColumnArray* output,
+ KeyEncoderContext* ctx) {
+ // Make sure that metadata and lengths are compatible.
+ DCHECK(output->metadata().is_fixed_length == input.metadata().is_fixed_length);
+ DCHECK(output->metadata().fixed_length == 0 && input.metadata().fixed_length == 1);
+ DCHECK(output->length() == input.length());
+ constexpr int buffer_index = 1;
+ DCHECK(input.data(buffer_index) != nullptr);
+ DCHECK(output->mutable_data(buffer_index) != nullptr);
+
+ util::BitUtil::bytes_to_bits(
+ ctx->hardware_flags, static_cast<int>(input.length()), input.data(buffer_index),
+ output->mutable_data(buffer_index), output->bit_offset(buffer_index));
+}
+
+bool KeyEncoder::EncoderInteger::IsBoolean(const KeyColumnMetadata& metadata) {
+ return metadata.is_fixed_length && metadata.fixed_length == 0;
+}
+
+bool KeyEncoder::EncoderInteger::UsesTransform(const KeyColumnArray& column) {
+ return IsBoolean(column.metadata());
+}
+
+KeyEncoder::KeyColumnArray KeyEncoder::EncoderInteger::ArrayReplace(
+ const KeyColumnArray& column, const KeyColumnArray& temp) {
+ if (IsBoolean(column.metadata())) {
+ return TransformBoolean::ArrayReplace(column, temp);
+ }
+ return column;
+}
+
+void KeyEncoder::EncoderInteger::PreEncode(const KeyColumnArray& input,
+ KeyColumnArray* output,
+ KeyEncoderContext* ctx) {
+ if (IsBoolean(input.metadata())) {
+ TransformBoolean::PreEncode(input, output, ctx);
+ }
+}
+
+void KeyEncoder::EncoderInteger::PostDecode(const KeyColumnArray& input,
+ KeyColumnArray* output,
+ KeyEncoderContext* ctx) {
+ if (IsBoolean(output->metadata())) {
+ TransformBoolean::PostDecode(input, output, ctx);
+ }
+}
+
+void KeyEncoder::EncoderInteger::Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp) {
+ KeyColumnArray col_prep;
+ if (UsesTransform(col)) {
+ col_prep = ArrayReplace(col, *temp);
+ PreEncode(col, &col_prep, ctx);
+ } else {
+ col_prep = col;
+ }
+
+ const auto num_rows = static_cast<uint32_t>(col.length());
+
+ // When the row consists of a single fixed-length column we can simply memcpy
+ if (rows->metadata().is_fixed_length &&
+ rows->metadata().fixed_length == col.metadata().fixed_length) {
+ DCHECK_EQ(offset_within_row, 0);
+ uint32_t row_size = col.metadata().fixed_length;
+ memcpy(rows->mutable_data(1), col.data(1), num_rows * row_size);
+ } else if (rows->metadata().is_fixed_length) {
+ uint32_t row_size = rows->metadata().fixed_length;
+ uint8_t* row_base = rows->mutable_data(1) + offset_within_row;
+ const uint8_t* col_base = col_prep.data(1);
+ switch (col_prep.metadata().fixed_length) {
+ case 1:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ row_base[i * row_size] = col_base[i];
+ }
+ break;
+ case 2:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint16_t*>(row_base + i * row_size) =
+ reinterpret_cast<const uint16_t*>(col_base)[i];
+ }
+ break;
+ case 4:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint32_t*>(row_base + i * row_size) =
+ reinterpret_cast<const uint32_t*>(col_base)[i];
+ }
+ break;
+ case 8:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint64_t*>(row_base + i * row_size) =
+ reinterpret_cast<const uint64_t*>(col_base)[i];
+ }
+ break;
+ default:
+ DCHECK(false);
+ }
+ } else {
+ const uint32_t* row_offsets = rows->offsets();
+ uint8_t* row_base = rows->mutable_data(2) + offset_within_row;
+ const uint8_t* col_base = col_prep.data(1);
+ switch (col_prep.metadata().fixed_length) {
+ case 1:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ row_base[row_offsets[i]] = col_base[i];
+ }
+ break;
+ case 2:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint16_t*>(row_base + row_offsets[i]) =
+ reinterpret_cast<const uint16_t*>(col_base)[i];
+ }
+ break;
+ case 4:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint32_t*>(row_base + row_offsets[i]) =
+ reinterpret_cast<const uint32_t*>(col_base)[i];
+ }
+ break;
+ case 8:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ *reinterpret_cast<uint64_t*>(row_base + row_offsets[i]) =
+ reinterpret_cast<const uint64_t*>(col_base)[i];
+ }
+ break;
+ default:
+ DCHECK(false);
+ }
+ }
+}
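+
+// The switch statements above only need cases 1, 2, 4 and 8: boolean columns
+// (fixed_length 0) are widened to one byte per value by the TransformBoolean
+// pre-encoding step, and EncoderBinary routes all other widths away from this
+// encoder (see IsInteger), so the DCHECK(false) defaults are unreachable.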
+
+void KeyEncoder::EncoderInteger::Decode(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp) {
+ KeyColumnArray col_prep;
+ if (UsesTransform(*col)) {
+ col_prep = ArrayReplace(*col, *temp);
+ } else {
+ col_prep = *col;
+ }
+
+ // When the row consists of a single fixed-length column we can simply memcpy
+ if (rows.metadata().is_fixed_length &&
+ col_prep.metadata().fixed_length == rows.metadata().fixed_length) {
+ DCHECK_EQ(offset_within_row, 0);
+ uint32_t row_size = rows.metadata().fixed_length;
+ memcpy(col_prep.mutable_data(1), rows.data(1) + start_row * row_size,
+ num_rows * row_size);
+ } else if (rows.metadata().is_fixed_length) {
+ uint32_t row_size = rows.metadata().fixed_length;
+ const uint8_t* row_base = rows.data(1) + start_row * row_size;
+ row_base += offset_within_row;
+ uint8_t* col_base = col_prep.mutable_data(1);
+ switch (col_prep.metadata().fixed_length) {
+ case 1:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ col_base[i] = row_base[i * row_size];
+ }
+ break;
+ case 2:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint16_t*>(col_base)[i] =
+ *reinterpret_cast<const uint16_t*>(row_base + i * row_size);
+ }
+ break;
+ case 4:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint32_t*>(col_base)[i] =
+ *reinterpret_cast<const uint32_t*>(row_base + i * row_size);
+ }
+ break;
+ case 8:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint64_t*>(col_base)[i] =
+ *reinterpret_cast<const uint64_t*>(row_base + i * row_size);
+ }
+ break;
+ default:
+ DCHECK(false);
+ }
+ } else {
+ const uint32_t* row_offsets = rows.offsets() + start_row;
+ const uint8_t* row_base = rows.data(2);
+ row_base += offset_within_row;
+ uint8_t* col_base = col_prep.mutable_data(1);
+ switch (col_prep.metadata().fixed_length) {
+ case 1:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ col_base[i] = row_base[row_offsets[i]];
+ }
+ break;
+ case 2:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint16_t*>(col_base)[i] =
+ *reinterpret_cast<const uint16_t*>(row_base + row_offsets[i]);
+ }
+ break;
+ case 4:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint32_t*>(col_base)[i] =
+ *reinterpret_cast<const uint32_t*>(row_base + row_offsets[i]);
+ }
+ break;
+ case 8:
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ reinterpret_cast<uint64_t*>(col_base)[i] =
+ *reinterpret_cast<const uint64_t*>(row_base + row_offsets[i]);
+ }
+ break;
+ default:
+ DCHECK(false);
+ }
+ }
+
+ if (UsesTransform(*col)) {
+ PostDecode(col_prep, col, ctx);
+ }
+}
+
+bool KeyEncoder::EncoderBinary::IsInteger(const KeyColumnMetadata& metadata) {
+ bool is_fixed_length = metadata.is_fixed_length;
+ auto size = metadata.fixed_length;
+ return is_fixed_length &&
+ (size == 0 || size == 1 || size == 2 || size == 4 || size == 8);
+}
+
+void KeyEncoder::EncoderBinary::Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp) {
+ if (IsInteger(col.metadata())) {
+ EncoderInteger::Encode(offset_within_row, rows, col, ctx, temp);
+ } else {
+ KeyColumnArray col_prep;
+ if (EncoderInteger::UsesTransform(col)) {
+ col_prep = EncoderInteger::ArrayReplace(col, *temp);
+ EncoderInteger::PreEncode(col, &col_prep, ctx);
+ } else {
+ col_prep = col;
+ }
+
+ bool is_row_fixed_length = rows->metadata().is_fixed_length;
+
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2()) {
+ EncodeHelper_avx2(is_row_fixed_length, offset_within_row, rows, col);
+ } else {
+#endif
+ if (is_row_fixed_length) {
+ EncodeImp<true>(offset_within_row, rows, col);
+ } else {
+ EncodeImp<false>(offset_within_row, rows, col);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+ }
+
+ DCHECK(temp->metadata().is_fixed_length);
+ DCHECK(temp->length() * temp->metadata().fixed_length >=
+ col.length() * static_cast<int64_t>(sizeof(uint16_t)));
+
+ KeyColumnArray temp16bit(KeyColumnMetadata(true, sizeof(uint16_t)), col.length(),
+ nullptr, temp->mutable_data(1), nullptr);
+ ColumnMemsetNulls(offset_within_row, rows, col, ctx, &temp16bit, 0xae);
+}
+
+void KeyEncoder::EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp) {
+ if (IsInteger(col->metadata())) {
+ EncoderInteger::Decode(start_row, num_rows, offset_within_row, rows, col, ctx, temp);
+ } else {
+ KeyColumnArray col_prep;
+ if (EncoderInteger::UsesTransform(*col)) {
+ col_prep = EncoderInteger::ArrayReplace(*col, *temp);
+ } else {
+ col_prep = *col;
+ }
+
+ bool is_row_fixed_length = rows.metadata().is_fixed_length;
+
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2()) {
+ DecodeHelper_avx2(is_row_fixed_length, start_row, num_rows, offset_within_row, rows,
+ col);
+ } else {
+#endif
+ if (is_row_fixed_length) {
+ DecodeImp<true>(start_row, num_rows, offset_within_row, rows, col);
+ } else {
+ DecodeImp<false>(start_row, num_rows, offset_within_row, rows, col);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+
+ if (EncoderInteger::UsesTransform(*col)) {
+ EncoderInteger::PostDecode(col_prep, col, ctx);
+ }
+ }
+}
+
+template <bool is_row_fixed_length>
+void KeyEncoder::EncoderBinary::EncodeImp(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col) {
+ EncodeDecodeHelper<is_row_fixed_length, true>(
+ 0, static_cast<uint32_t>(col.length()), offset_within_row, rows, rows, &col,
+ nullptr, [](uint8_t* dst, const uint8_t* src, int64_t length) {
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ auto src64 = reinterpret_cast<const uint64_t*>(src);
+ uint32_t istripe;
+ for (istripe = 0; istripe < length / 8; ++istripe) {
+ dst64[istripe] = util::SafeLoad(src64 + istripe);
+ }
+ if ((length % 8) > 0) {
+ uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length));
+ dst64[istripe] = (dst64[istripe] & ~mask_last) |
+ (util::SafeLoad(src64 + istripe) & mask_last);
+ }
+ });
+}
+
+template <bool is_row_fixed_length>
+void KeyEncoder::EncoderBinary::DecodeImp(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col) {
+ EncodeDecodeHelper<is_row_fixed_length, false>(
+ start_row, num_rows, offset_within_row, &rows, nullptr, col, col,
+ [](uint8_t* dst, const uint8_t* src, int64_t length) {
+ for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) {
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ auto src64 = reinterpret_cast<const uint64_t*>(src);
+ util::SafeStore(dst64 + istripe, src64[istripe]);
+ }
+ });
+}
+
+void KeyEncoder::EncoderBinary::ColumnMemsetNulls(
+ uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) {
+ using ColumnMemsetNullsImp_t = void (*)(uint32_t, KeyRowArray*, const KeyColumnArray&,
+ KeyEncoderContext*, KeyColumnArray*, uint8_t);
+ static const ColumnMemsetNullsImp_t ColumnMemsetNullsImp_fn[] = {
+ ColumnMemsetNullsImp<false, 1>, ColumnMemsetNullsImp<false, 2>,
+ ColumnMemsetNullsImp<false, 4>, ColumnMemsetNullsImp<false, 8>,
+ ColumnMemsetNullsImp<false, 16>, ColumnMemsetNullsImp<true, 1>,
+ ColumnMemsetNullsImp<true, 2>, ColumnMemsetNullsImp<true, 4>,
+ ColumnMemsetNullsImp<true, 8>, ColumnMemsetNullsImp<true, 16>};
+ uint32_t col_width = col.metadata().fixed_length;
+ int dispatch_const =
+ (rows->metadata().is_fixed_length ? 5 : 0) +
+ (col_width == 1 ? 0
+ : col_width == 2 ? 1 : col_width == 4 ? 2 : col_width == 8 ? 3 : 4);
+ ColumnMemsetNullsImp_fn[dispatch_const](offset_within_row, rows, col, ctx,
+ temp_vector_16bit, byte_value);
+}
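+
+// The dispatch index above combines row layout and column width: fixed-length
+// rows add 5, and widths {1, 2, 4, 8, other} map to {0, 1, 2, 3, 4}. For
+// example, a 4-byte column in a varying-length row selects index 2, i.e.
+// ColumnMemsetNullsImp<false, 4>.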
+
+template <bool is_row_fixed_length, uint32_t col_width>
+void KeyEncoder::EncoderBinary::ColumnMemsetNullsImp(
+ uint32_t offset_within_row, KeyRowArray* rows, const KeyColumnArray& col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit, uint8_t byte_value) {
+ // Nothing to do when there are no nulls
+ if (!col.data(0)) {
+ return;
+ }
+
+ const auto num_rows = static_cast<uint32_t>(col.length());
+
+ // Temp vector needs space for the required number of rows
+ DCHECK(temp_vector_16bit->length() >= num_rows);
+ DCHECK(temp_vector_16bit->metadata().is_fixed_length &&
+ temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t));
+ auto temp_vector = reinterpret_cast<uint16_t*>(temp_vector_16bit->mutable_data(1));
+
+ // Convert the non-nulls bit vector into a vector of indexes of null positions
+ int num_selected;
+ util::BitUtil::bits_to_indexes(0, ctx->hardware_flags, static_cast<int>(col.length()),
+ col.data(0), &num_selected, temp_vector,
+ col.bit_offset(0));
+
+ for (int i = 0; i < num_selected; ++i) {
+ uint32_t row_id = temp_vector[i];
+
+ // Target binary field pointer
+ uint8_t* dst;
+ if (is_row_fixed_length) {
+ dst = rows->mutable_data(1) + rows->metadata().fixed_length * row_id;
+ } else {
+ dst = rows->mutable_data(2) + rows->offsets()[row_id];
+ }
+ dst += offset_within_row;
+
+ if (col_width == 1) {
+ *dst = byte_value;
+ } else if (col_width == 2) {
+ *reinterpret_cast<uint16_t*>(dst) =
+ (static_cast<uint16_t>(byte_value) * static_cast<uint16_t>(0x0101));
+ } else if (col_width == 4) {
+ *reinterpret_cast<uint32_t*>(dst) =
+ (static_cast<uint32_t>(byte_value) * static_cast<uint32_t>(0x01010101));
+ } else if (col_width == 8) {
+ *reinterpret_cast<uint64_t*>(dst) =
+ (static_cast<uint64_t>(byte_value) * 0x0101010101010101ULL);
+ } else {
+ uint64_t value = (static_cast<uint64_t>(byte_value) * 0x0101010101010101ULL);
+ uint32_t col_width_actual = col.metadata().fixed_length;
+ uint32_t j;
+ for (j = 0; j < col_width_actual / 8; ++j) {
+ reinterpret_cast<uint64_t*>(dst)[j] = value;
+ }
+ int tail = col_width_actual % 8;
+ if (tail) {
+ uint64_t mask = ~0ULL >> (8 * (8 - tail));
+ reinterpret_cast<uint64_t*>(dst)[j] =
+ (reinterpret_cast<const uint64_t*>(dst)[j] & ~mask) | (value & mask);
+ }
+ }
+ }
+}
+
+void KeyEncoder::EncoderBinaryPair::Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col1,
+ const KeyColumnArray& col2,
+ KeyEncoderContext* ctx, KeyColumnArray* temp1,
+ KeyColumnArray* temp2) {
+ DCHECK(CanProcessPair(col1.metadata(), col2.metadata()));
+
+ KeyColumnArray col_prep[2];
+ if (EncoderInteger::UsesTransform(col1)) {
+ col_prep[0] = EncoderInteger::ArrayReplace(col1, *temp1);
+ EncoderInteger::PreEncode(col1, &(col_prep[0]), ctx);
+ } else {
+ col_prep[0] = col1;
+ }
+ if (EncoderInteger::UsesTransform(col2)) {
+ col_prep[1] = EncoderInteger::ArrayReplace(col2, *temp2);
+ EncoderInteger::PreEncode(col2, &(col_prep[1]), ctx);
+ } else {
+ col_prep[1] = col2;
+ }
+
+ uint32_t col_width1 = col_prep[0].metadata().fixed_length;
+ uint32_t col_width2 = col_prep[1].metadata().fixed_length;
+ int log_col_width1 =
+ col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0;
+ int log_col_width2 =
+ col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0;
+
+ bool is_row_fixed_length = rows->metadata().is_fixed_length;
+
+ const auto num_rows = static_cast<uint32_t>(col1.length());
+ uint32_t num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2() && col_width1 == col_width2) {
+ num_processed = EncodeHelper_avx2(is_row_fixed_length, col_width1, offset_within_row,
+ rows, col_prep[0], col_prep[1]);
+ }
+#endif
+ if (num_processed < num_rows) {
+ using EncodeImp_t = void (*)(uint32_t, uint32_t, KeyRowArray*, const KeyColumnArray&,
+ const KeyColumnArray&);
+ static const EncodeImp_t EncodeImp_fn[] = {
+ EncodeImp<false, uint8_t, uint8_t>, EncodeImp<false, uint16_t, uint8_t>,
+ EncodeImp<false, uint32_t, uint8_t>, EncodeImp<false, uint64_t, uint8_t>,
+ EncodeImp<false, uint8_t, uint16_t>, EncodeImp<false, uint16_t, uint16_t>,
+ EncodeImp<false, uint32_t, uint16_t>, EncodeImp<false, uint64_t, uint16_t>,
+ EncodeImp<false, uint8_t, uint32_t>, EncodeImp<false, uint16_t, uint32_t>,
+ EncodeImp<false, uint32_t, uint32_t>, EncodeImp<false, uint64_t, uint32_t>,
+ EncodeImp<false, uint8_t, uint64_t>, EncodeImp<false, uint16_t, uint64_t>,
+ EncodeImp<false, uint32_t, uint64_t>, EncodeImp<false, uint64_t, uint64_t>,
+ EncodeImp<true, uint8_t, uint8_t>, EncodeImp<true, uint16_t, uint8_t>,
+ EncodeImp<true, uint32_t, uint8_t>, EncodeImp<true, uint64_t, uint8_t>,
+ EncodeImp<true, uint8_t, uint16_t>, EncodeImp<true, uint16_t, uint16_t>,
+ EncodeImp<true, uint32_t, uint16_t>, EncodeImp<true, uint64_t, uint16_t>,
+ EncodeImp<true, uint8_t, uint32_t>, EncodeImp<true, uint16_t, uint32_t>,
+ EncodeImp<true, uint32_t, uint32_t>, EncodeImp<true, uint64_t, uint32_t>,
+ EncodeImp<true, uint8_t, uint64_t>, EncodeImp<true, uint16_t, uint64_t>,
+ EncodeImp<true, uint32_t, uint64_t>, EncodeImp<true, uint64_t, uint64_t>};
+ int dispatch_const = (log_col_width2 << 2) | log_col_width1;
+ dispatch_const += (is_row_fixed_length ? 16 : 0);
+ EncodeImp_fn[dispatch_const](num_processed, offset_within_row, rows, col_prep[0],
+ col_prep[1]);
+ }
+}
+
+template <bool is_row_fixed_length, typename col1_type, typename col2_type>
+void KeyEncoder::EncoderBinaryPair::EncodeImp(uint32_t num_rows_to_skip,
+ uint32_t offset_within_row,
+ KeyRowArray* rows,
+ const KeyColumnArray& col1,
+ const KeyColumnArray& col2) {
+ const uint8_t* src_A = col1.data(1);
+ const uint8_t* src_B = col2.data(1);
+
+ const auto num_rows = static_cast<uint32_t>(col1.length());
+
+ uint32_t fixed_length = rows->metadata().fixed_length;
+ const uint32_t* offsets;
+ uint8_t* dst_base;
+ if (is_row_fixed_length) {
+ dst_base = rows->mutable_data(1) + offset_within_row;
+ offsets = nullptr;
+ } else {
+ dst_base = rows->mutable_data(2) + offset_within_row;
+ offsets = rows->offsets();
+ }
+
+ using col1_type_const = typename std::add_const<col1_type>::type;
+ using col2_type_const = typename std::add_const<col2_type>::type;
+
+ if (is_row_fixed_length) {
+ uint8_t* dst = dst_base + num_rows_to_skip * fixed_length;
+ for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
+ *reinterpret_cast<col1_type*>(dst) = reinterpret_cast<col1_type_const*>(src_A)[i];
+ *reinterpret_cast<col2_type*>(dst + sizeof(col1_type)) =
+ reinterpret_cast<col2_type_const*>(src_B)[i];
+ dst += fixed_length;
+ }
+ } else {
+ for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
+ uint8_t* dst = dst_base + offsets[i];
+ *reinterpret_cast<col1_type*>(dst) = reinterpret_cast<col1_type_const*>(src_A)[i];
+ *reinterpret_cast<col2_type*>(dst + sizeof(col1_type)) =
+ reinterpret_cast<col2_type_const*>(src_B)[i];
+ }
+ }
+}
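+
+// Editorial note (not part of the original sources): EncodeImp stores the two
+// values of a pair back to back in every row, col1 at offset_within_row and
+// col2 immediately after it at offset_within_row + sizeof(col1_type).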
+
+void KeyEncoder::EncoderBinaryPair::Decode(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col1,
+ KeyColumnArray* col2, KeyEncoderContext* ctx,
+ KeyColumnArray* temp1, KeyColumnArray* temp2) {
+ DCHECK(CanProcessPair(col1->metadata(), col2->metadata()));
+
+ KeyColumnArray col_prep[2];
+ if (EncoderInteger::UsesTransform(*col1)) {
+ col_prep[0] = EncoderInteger::ArrayReplace(*col1, *temp1);
+ } else {
+ col_prep[0] = *col1;
+ }
+ if (EncoderInteger::UsesTransform(*col2)) {
+ col_prep[1] = EncoderInteger::ArrayReplace(*col2, *temp2);
+ } else {
+ col_prep[1] = *col2;
+ }
+
+ uint32_t col_width1 = col_prep[0].metadata().fixed_length;
+ uint32_t col_width2 = col_prep[1].metadata().fixed_length;
+ int log_col_width1 =
+ col_width1 == 8 ? 3 : col_width1 == 4 ? 2 : col_width1 == 2 ? 1 : 0;
+ int log_col_width2 =
+ col_width2 == 8 ? 3 : col_width2 == 4 ? 2 : col_width2 == 2 ? 1 : 0;
+
+ bool is_row_fixed_length = rows.metadata().is_fixed_length;
+
+ uint32_t num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2() && col_width1 == col_width2) {
+ num_processed =
+ DecodeHelper_avx2(is_row_fixed_length, col_width1, start_row, num_rows,
+ offset_within_row, rows, &col_prep[0], &col_prep[1]);
+ }
+#endif
+ if (num_processed < num_rows) {
+ using DecodeImp_t = void (*)(uint32_t, uint32_t, uint32_t, uint32_t,
+ const KeyRowArray&, KeyColumnArray*, KeyColumnArray*);
+ static const DecodeImp_t DecodeImp_fn[] = {
+ DecodeImp<false, uint8_t, uint8_t>, DecodeImp<false, uint16_t, uint8_t>,
+ DecodeImp<false, uint32_t, uint8_t>, DecodeImp<false, uint64_t, uint8_t>,
+ DecodeImp<false, uint8_t, uint16_t>, DecodeImp<false, uint16_t, uint16_t>,
+ DecodeImp<false, uint32_t, uint16_t>, DecodeImp<false, uint64_t, uint16_t>,
+ DecodeImp<false, uint8_t, uint32_t>, DecodeImp<false, uint16_t, uint32_t>,
+ DecodeImp<false, uint32_t, uint32_t>, DecodeImp<false, uint64_t, uint32_t>,
+ DecodeImp<false, uint8_t, uint64_t>, DecodeImp<false, uint16_t, uint64_t>,
+ DecodeImp<false, uint32_t, uint64_t>, DecodeImp<false, uint64_t, uint64_t>,
+ DecodeImp<true, uint8_t, uint8_t>, DecodeImp<true, uint16_t, uint8_t>,
+ DecodeImp<true, uint32_t, uint8_t>, DecodeImp<true, uint64_t, uint8_t>,
+ DecodeImp<true, uint8_t, uint16_t>, DecodeImp<true, uint16_t, uint16_t>,
+ DecodeImp<true, uint32_t, uint16_t>, DecodeImp<true, uint64_t, uint16_t>,
+ DecodeImp<true, uint8_t, uint32_t>, DecodeImp<true, uint16_t, uint32_t>,
+ DecodeImp<true, uint32_t, uint32_t>, DecodeImp<true, uint64_t, uint32_t>,
+ DecodeImp<true, uint8_t, uint64_t>, DecodeImp<true, uint16_t, uint64_t>,
+ DecodeImp<true, uint32_t, uint64_t>, DecodeImp<true, uint64_t, uint64_t>};
+ int dispatch_const =
+ (log_col_width2 << 2) | log_col_width1 | (is_row_fixed_length ? 16 : 0);
+ DecodeImp_fn[dispatch_const](num_processed, start_row, num_rows, offset_within_row,
+ rows, &(col_prep[0]), &(col_prep[1]));
+ }
+
+ if (EncoderInteger::UsesTransform(*col1)) {
+ EncoderInteger::PostDecode(col_prep[0], col1, ctx);
+ }
+ if (EncoderInteger::UsesTransform(*col2)) {
+ EncoderInteger::PostDecode(col_prep[1], col2, ctx);
+ }
+}
+
+template <bool is_row_fixed_length, typename col1_type, typename col2_type>
+void KeyEncoder::EncoderBinaryPair::DecodeImp(uint32_t num_rows_to_skip,
+ uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray& rows,
+ KeyColumnArray* col1,
+ KeyColumnArray* col2) {
+ DCHECK(rows.length() >= start_row + num_rows);
+ DCHECK(col1->length() == num_rows && col2->length() == num_rows);
+
+ uint8_t* dst_A = col1->mutable_data(1);
+ uint8_t* dst_B = col2->mutable_data(1);
+
+ uint32_t fixed_length = rows.metadata().fixed_length;
+ const uint32_t* offsets;
+ const uint8_t* src_base;
+ if (is_row_fixed_length) {
+ src_base = rows.data(1) + fixed_length * start_row + offset_within_row;
+ offsets = nullptr;
+ } else {
+ src_base = rows.data(2) + offset_within_row;
+ offsets = rows.offsets() + start_row;
+ }
+
+ using col1_type_const = typename std::add_const<col1_type>::type;
+ using col2_type_const = typename std::add_const<col2_type>::type;
+
+ if (is_row_fixed_length) {
+ const uint8_t* src = src_base + num_rows_to_skip * fixed_length;
+ for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
+ reinterpret_cast<col1_type*>(dst_A)[i] = *reinterpret_cast<col1_type_const*>(src);
+ reinterpret_cast<col2_type*>(dst_B)[i] =
+ *reinterpret_cast<col2_type_const*>(src + sizeof(col1_type));
+ src += fixed_length;
+ }
+ } else {
+ for (uint32_t i = num_rows_to_skip; i < num_rows; ++i) {
+ const uint8_t* src = src_base + offsets[i];
+ reinterpret_cast<col1_type*>(dst_A)[i] = *reinterpret_cast<col1_type_const*>(src);
+ reinterpret_cast<col2_type*>(dst_B)[i] =
+ *reinterpret_cast<col2_type_const*>(src + sizeof(col1_type));
+ }
+ }
+}
+
+void KeyEncoder::EncoderOffsets::Encode(KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols,
+ KeyEncoderContext* ctx) {
+ DCHECK(!varbinary_cols.empty());
+
+ // Rows and columns must all be varying-length
+ DCHECK(!rows->metadata().is_fixed_length);
+ for (const auto& col : varbinary_cols) {
+ DCHECK(!col.metadata().is_fixed_length);
+ }
+
+ const auto num_rows = static_cast<uint32_t>(varbinary_cols[0].length());
+
+ uint32_t num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+  // Whether any of the columns has a non-zero starting bit offset for its
+  // non-nulls bit vector
+ bool has_bit_offset = false;
+
+  // The rows and every column must describe exactly the same number of entries
+ DCHECK(rows->length() == num_rows);
+ for (const auto& col : varbinary_cols) {
+ DCHECK(col.length() == num_rows);
+ if (col.bit_offset(0) != 0) {
+ has_bit_offset = true;
+ }
+ }
+
+ if (ctx->has_avx2() && !has_bit_offset) {
+ // Create a temp vector sized based on the number of columns
+ auto temp_buffer_holder = util::TempVectorHolder<uint32_t>(
+ ctx->stack, static_cast<uint32_t>(varbinary_cols.size()) * 8);
+ auto temp_buffer_32B_per_col = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint32_t)), varbinary_cols.size() * 8, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder.mutable_data()), nullptr);
+
+ num_processed = EncodeImp_avx2(rows, varbinary_cols, &temp_buffer_32B_per_col);
+ }
+#endif
+ if (num_processed < num_rows) {
+ EncodeImp(num_processed, rows, varbinary_cols);
+ }
+}
+
+void KeyEncoder::EncoderOffsets::EncodeImp(
+ uint32_t num_rows_already_processed, KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols) {
+ DCHECK_GT(varbinary_cols.size(), 0);
+
+ int row_alignment = rows->metadata().row_alignment;
+ int string_alignment = rows->metadata().string_alignment;
+
+ uint32_t* row_offsets = rows->mutable_offsets();
+ uint8_t* row_values = rows->mutable_data(2);
+ const auto num_rows = static_cast<uint32_t>(varbinary_cols[0].length());
+
+ if (num_rows_already_processed == 0) {
+ row_offsets[0] = 0;
+ }
+
+ uint32_t row_offset = row_offsets[num_rows_already_processed];
+ for (uint32_t i = num_rows_already_processed; i < num_rows; ++i) {
+ uint32_t* varbinary_end =
+ rows->metadata().varbinary_end_array(row_values + row_offset);
+
+ // Zero out lengths for nulls.
+ // Add lengths of all columns to get row size.
+ // Store varbinary field ends while summing their lengths.
+
+ uint32_t offset_within_row = rows->metadata().fixed_length;
+
+ for (size_t col = 0; col < varbinary_cols.size(); ++col) {
+ const uint32_t* col_offsets = varbinary_cols[col].offsets();
+ uint32_t col_length = col_offsets[i + 1] - col_offsets[i];
+
+ const int bit_offset = varbinary_cols[col].bit_offset(0);
+
+ const uint8_t* non_nulls = varbinary_cols[col].data(0);
+ if (non_nulls && BitUtil::GetBit(non_nulls, bit_offset + i) == 0) {
+ col_length = 0;
+ }
+
+ offset_within_row +=
+ KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment);
+ offset_within_row += col_length;
+
+ varbinary_end[col] = offset_within_row;
+ }
+
+ offset_within_row +=
+ KeyRowMetadata::padding_for_alignment(offset_within_row, row_alignment);
+ row_offset += offset_within_row;
+ row_offsets[i + 1] = row_offset;
+ }
+}
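+
+namespace {
+// Editorial sketch (not part of the original sources): the alignment
+// arithmetic behind padding_for_alignment() as used above. For a power-of-two
+// alignment A, (-offset) & (A - 1) is the distance from offset to the next
+// multiple of A.
+constexpr uint32_t PaddingFor(uint32_t offset, uint32_t alignment) {
+  return static_cast<uint32_t>(-static_cast<int32_t>(offset)) & (alignment - 1);
+}
+static_assert(PaddingFor(13, 8) == 3, "13 is padded up to 16");
+static_assert(PaddingFor(16, 8) == 0, "an aligned offset needs no padding");
+}  // namespace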
+
+void KeyEncoder::EncoderOffsets::Decode(
+ uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* varbinary_cols,
+ const std::vector<uint32_t>& varbinary_cols_base_offset, KeyEncoderContext* ctx) {
+ DCHECK(!varbinary_cols->empty());
+ DCHECK(varbinary_cols->size() == varbinary_cols_base_offset.size());
+
+ DCHECK(!rows.metadata().is_fixed_length);
+ DCHECK(rows.length() >= start_row + num_rows);
+ for (const auto& col : *varbinary_cols) {
+ // Rows and columns must all be varying-length
+ DCHECK(!col.metadata().is_fixed_length);
+    // Each column must have space for exactly the selected subset of rows
+ DCHECK(col.length() == num_rows);
+ }
+
+  // Offsets of varbinary column data within each encoded row are stored
+  // in that same row as an array of 32-bit integers.
+  // The array immediately follows the data of the fixed-length columns.
+  // There is one element for each varying-length column.
+  // The Nth element is the end position of the Nth varbinary field within
+  // the row, i.e. the offset of the first byte past its last byte.
+
+ const uint32_t* row_offsets = rows.offsets() + start_row;
+
+ // Set the base offset for each column
+ for (size_t col = 0; col < varbinary_cols->size(); ++col) {
+ uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets();
+ col_offsets[0] = varbinary_cols_base_offset[col];
+ }
+
+ int string_alignment = rows.metadata().string_alignment;
+
+ for (uint32_t i = 0; i < num_rows; ++i) {
+    // Find the beginning of the cumulative lengths array for the next row
+ const uint8_t* row = rows.data(2) + row_offsets[i];
+ const uint32_t* varbinary_ends = rows.metadata().varbinary_end_array(row);
+
+ // Update the offset of each column
+ uint32_t offset_within_row = rows.metadata().fixed_length;
+ for (size_t col = 0; col < varbinary_cols->size(); ++col) {
+ offset_within_row +=
+ KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment);
+ uint32_t length = varbinary_ends[col] - offset_within_row;
+ offset_within_row = varbinary_ends[col];
+ uint32_t* col_offsets = (*varbinary_cols)[col].mutable_offsets();
+ col_offsets[i + 1] = col_offsets[i] + length;
+ }
+ }
+}
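+
+// Editorial example (not part of the original sources): with fixed_length ==
+// 16, string_alignment == 4 and varbinary_ends == {21, 28} for some row, the
+// loop above recovers field lengths 21 - 16 = 5 and 28 - 24 = 4 (the second
+// field starts at 21 rounded up to 24) and accumulates them into the
+// per-column offsets arrays.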
+
+void KeyEncoder::EncoderVarBinary::Encode(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col,
+ KeyEncoderContext* ctx) {
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2()) {
+ EncodeHelper_avx2(varbinary_col_id, rows, col);
+ } else {
+#endif
+ if (varbinary_col_id == 0) {
+ EncodeImp<true>(varbinary_col_id, rows, col);
+ } else {
+ EncodeImp<false>(varbinary_col_id, rows, col);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+}
+
+void KeyEncoder::EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx) {
+  // The output column's varbinary buffer needs an extra 32 bytes at the end
+  // in the AVX2 version and 8 bytes otherwise.
+#if defined(ARROW_HAVE_AVX2)
+ if (ctx->has_avx2()) {
+ DecodeHelper_avx2(start_row, num_rows, varbinary_col_id, rows, col);
+ } else {
+#endif
+ if (varbinary_col_id == 0) {
+ DecodeImp<true>(start_row, num_rows, varbinary_col_id, rows, col);
+ } else {
+ DecodeImp<false>(start_row, num_rows, varbinary_col_id, rows, col);
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+}
+
+template <bool first_varbinary_col>
+void KeyEncoder::EncoderVarBinary::EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col) {
+ EncodeDecodeHelper<first_varbinary_col, true>(
+ 0, static_cast<uint32_t>(col.length()), varbinary_col_id, rows, rows, &col, nullptr,
+ [](uint8_t* dst, const uint8_t* src, int64_t length) {
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ auto src64 = reinterpret_cast<const uint64_t*>(src);
+ uint32_t istripe;
+ for (istripe = 0; istripe < length / 8; ++istripe) {
+ dst64[istripe] = util::SafeLoad(src64 + istripe);
+ }
+ if ((length % 8) > 0) {
+ uint64_t mask_last = ~0ULL >> (8 * (8 * (istripe + 1) - length));
+ dst64[istripe] = (dst64[istripe] & ~mask_last) |
+ (util::SafeLoad(src64 + istripe) & mask_last);
+ }
+ });
+}
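+
+// Editorial example (not part of the original sources): for length == 11 the
+// loop above copies one full 8-byte stripe; then istripe == 1 and
+// mask_last == ~0ULL >> (8 * (8 * 2 - 11)) keeps only the low 3 bytes of the
+// second stripe, so bytes 8..10 of dst are overwritten while its trailing
+// 5 bytes are preserved.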
+
+template <bool first_varbinary_col>
+void KeyEncoder::EncoderVarBinary::DecodeImp(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id,
+ const KeyRowArray& rows,
+ KeyColumnArray* col) {
+ EncodeDecodeHelper<first_varbinary_col, false>(
+ start_row, num_rows, varbinary_col_id, &rows, nullptr, col, col,
+ [](uint8_t* dst, const uint8_t* src, int64_t length) {
+ for (uint32_t istripe = 0; istripe < (length + 7) / 8; ++istripe) {
+ auto dst64 = reinterpret_cast<uint64_t*>(dst);
+ auto src64 = reinterpret_cast<const uint64_t*>(src);
+ util::SafeStore(dst64 + istripe, src64[istripe]);
+ }
+ });
+}
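+
+// Editorial note (not part of the original sources): unlike the encoding
+// path, this copy rounds the length up to whole 8-byte stripes and may write
+// past the logical end of the field, which is why Decode() above requires the
+// output varbinary buffer to carry 8 extra bytes (32 in the AVX2 path).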
+
+void KeyEncoder::EncoderNulls::Encode(KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& cols,
+ KeyEncoderContext* ctx,
+ KeyColumnArray* temp_vector_16bit) {
+ DCHECK_GT(cols.size(), 0);
+ const auto num_rows = static_cast<uint32_t>(rows->length());
+
+ // All input columns should have the same number of rows.
+ // They may or may not have non-nulls bit-vectors allocated.
+ for (const auto& col : cols) {
+ DCHECK(col.length() == num_rows);
+ }
+
+ // Temp vector needs space for the required number of rows
+ DCHECK(temp_vector_16bit->length() >= num_rows);
+ DCHECK(temp_vector_16bit->metadata().is_fixed_length &&
+ temp_vector_16bit->metadata().fixed_length == sizeof(uint16_t));
+
+ uint8_t* null_masks = rows->null_masks();
+ uint32_t null_masks_bytes_per_row = rows->metadata().null_masks_bytes_per_row;
+ memset(null_masks, 0, null_masks_bytes_per_row * num_rows);
+ for (size_t col = 0; col < cols.size(); ++col) {
+ const uint8_t* non_nulls = cols[col].data(0);
+ if (!non_nulls) {
+ continue;
+ }
+ int bit_offset = cols[col].bit_offset(0);
+ DCHECK_LT(bit_offset, 8);
+ int num_selected;
+ util::BitUtil::bits_to_indexes(
+ 0, ctx->hardware_flags, num_rows, non_nulls, &num_selected,
+ reinterpret_cast<uint16_t*>(temp_vector_16bit->mutable_data(1)), bit_offset);
+ for (int i = 0; i < num_selected; ++i) {
+ uint16_t row_id = reinterpret_cast<const uint16_t*>(temp_vector_16bit->data(1))[i];
+ int64_t null_masks_bit_id = row_id * null_masks_bytes_per_row * 8 + col;
+ BitUtil::SetBit(null_masks, null_masks_bit_id);
+ }
+ }
+}
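+
+// Editorial note (not part of the original sources): the row-wise null masks
+// pack one bit per key column, so for row r and column c the bit position is
+//   r * null_masks_bytes_per_row * 8 + c
+// and a set bit means "column c is null in row r". bits_to_indexes() with a
+// first argument of 0 collects the row ids whose validity bit is zero, i.e.
+// the null entries of the column.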
+
+void KeyEncoder::EncoderNulls::Decode(uint32_t start_row, uint32_t num_rows,
+ const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols) {
+  // Every output column needs to have space for exactly the required number
+  // of rows. It also needs its non-nulls bit-vector allocated and mutable.
+ DCHECK_GT(cols->size(), 0);
+ for (auto& col : *cols) {
+ DCHECK(col.length() == num_rows);
+ DCHECK(col.mutable_data(0));
+ }
+
+ const uint8_t* null_masks = rows.null_masks();
+ uint32_t null_masks_bytes_per_row = rows.metadata().null_masks_bytes_per_row;
+ for (size_t col = 0; col < cols->size(); ++col) {
+ uint8_t* non_nulls = (*cols)[col].mutable_data(0);
+ const int bit_offset = (*cols)[col].bit_offset(0);
+ DCHECK_LT(bit_offset, 8);
+ non_nulls[0] |= 0xff << (bit_offset);
+ if (bit_offset + num_rows > 8) {
+ int bits_in_first_byte = 8 - bit_offset;
+ memset(non_nulls + 1, 0xff, BitUtil::BytesForBits(num_rows - bits_in_first_byte));
+ }
+ for (uint32_t row = 0; row < num_rows; ++row) {
+ uint32_t null_masks_bit_id =
+ (start_row + row) * null_masks_bytes_per_row * 8 + static_cast<uint32_t>(col);
+ bool is_set = BitUtil::GetBit(null_masks, null_masks_bit_id);
+ if (is_set) {
+ BitUtil::ClearBit(non_nulls, bit_offset + row);
+ }
+ }
+ }
+}
+
+uint32_t KeyEncoder::KeyRowMetadata::num_varbinary_cols() const {
+ uint32_t result = 0;
+ for (auto column_metadata : column_metadatas) {
+ if (!column_metadata.is_fixed_length) {
+ ++result;
+ }
+ }
+ return result;
+}
+
+bool KeyEncoder::KeyRowMetadata::is_compatible(const KeyRowMetadata& other) const {
+ if (other.num_cols() != num_cols()) {
+ return false;
+ }
+ if (row_alignment != other.row_alignment ||
+ string_alignment != other.string_alignment) {
+ return false;
+ }
+ for (size_t i = 0; i < column_metadatas.size(); ++i) {
+ if (column_metadatas[i].is_fixed_length !=
+ other.column_metadatas[i].is_fixed_length) {
+ return false;
+ }
+ if (column_metadatas[i].fixed_length != other.column_metadatas[i].fixed_length) {
+ return false;
+ }
+ }
+ return true;
+}
+
+void KeyEncoder::KeyRowMetadata::FromColumnMetadataVector(
+ const std::vector<KeyColumnMetadata>& cols, int in_row_alignment,
+ int in_string_alignment) {
+ column_metadatas.resize(cols.size());
+ for (size_t i = 0; i < cols.size(); ++i) {
+ column_metadatas[i] = cols[i];
+ }
+
+ const auto num_cols = static_cast<uint32_t>(cols.size());
+
+  // Sort columns.
+  // Columns are sorted based on the size in bytes of their fixed-length part.
+  // For a varying-length column, the fixed-length part is the 32-bit field
+  // storing the cumulative length of varying-length fields.
+  // The rules are:
+  // a) A boolean column, marked with fixed-length 0, is considered to have a
+  //    fixed-length part of 1 byte.
+  // b) Columns with a fixed-length part that is a power of 2 or a multiple of
+  //    row alignment precede other columns. They are sorted among themselves
+  //    based on the size of their fixed-length part.
+  // c) Fixed-length columns precede varying-length columns when both have
+  //    fixed-length parts of the same size.
+ column_order.resize(num_cols);
+ for (uint32_t i = 0; i < num_cols; ++i) {
+ column_order[i] = i;
+ }
+ std::sort(
+ column_order.begin(), column_order.end(), [&cols](uint32_t left, uint32_t right) {
+ bool is_left_pow2 =
+ !cols[left].is_fixed_length || ARROW_POPCOUNT64(cols[left].fixed_length) <= 1;
+ bool is_right_pow2 = !cols[right].is_fixed_length ||
+ ARROW_POPCOUNT64(cols[right].fixed_length) <= 1;
+ bool is_left_fixedlen = cols[left].is_fixed_length;
+ bool is_right_fixedlen = cols[right].is_fixed_length;
+ uint32_t width_left =
+ cols[left].is_fixed_length ? cols[left].fixed_length : sizeof(uint32_t);
+ uint32_t width_right =
+ cols[right].is_fixed_length ? cols[right].fixed_length : sizeof(uint32_t);
+ if (is_left_pow2 != is_right_pow2) {
+ return is_left_pow2;
+ }
+ if (!is_left_pow2) {
+ return left < right;
+ }
+ if (width_left != width_right) {
+ return width_left > width_right;
+ }
+ if (is_left_fixedlen != is_right_fixedlen) {
+ return is_left_fixedlen;
+ }
+ return left < right;
+ });
+
+ row_alignment = in_row_alignment;
+ string_alignment = in_string_alignment;
+ varbinary_end_array_offset = 0;
+
+ column_offsets.resize(num_cols);
+ uint32_t num_varbinary_cols = 0;
+ uint32_t offset_within_row = 0;
+ for (uint32_t i = 0; i < num_cols; ++i) {
+ const KeyColumnMetadata& col = cols[column_order[i]];
+ offset_within_row +=
+ KeyRowMetadata::padding_for_alignment(offset_within_row, string_alignment, col);
+ column_offsets[i] = offset_within_row;
+ if (!col.is_fixed_length) {
+ if (num_varbinary_cols == 0) {
+ varbinary_end_array_offset = offset_within_row;
+ }
+ DCHECK(column_offsets[i] - varbinary_end_array_offset ==
+ num_varbinary_cols * sizeof(uint32_t));
+ ++num_varbinary_cols;
+ offset_within_row += sizeof(uint32_t);
+ } else {
+      // A boolean column is a bit-vector, which is indicated by setting
+      // the fixed length in its column metadata to zero.
+      // It is stored as one byte per value in the output row.
+ if (col.fixed_length == 0) {
+ offset_within_row += 1;
+ } else {
+ offset_within_row += col.fixed_length;
+ }
+ }
+ }
+
+ is_fixed_length = (num_varbinary_cols == 0);
+ fixed_length =
+ offset_within_row +
+ KeyRowMetadata::padding_for_alignment(
+ offset_within_row, num_varbinary_cols == 0 ? row_alignment : string_alignment);
+
+ // We set the number of bytes per row storing null masks of individual key columns
+  // to be a power of two. This is not required; it could also be set to the minimal
+ // number of bytes required for a given number of bits (one bit per column).
+ null_masks_bytes_per_row = 1;
+ while (static_cast<uint32_t>(null_masks_bytes_per_row * 8) < num_cols) {
+ null_masks_bytes_per_row *= 2;
+ }
+}
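+
+// Editorial example (not part of the original sources): for four key columns
+// with fixed widths {1, 8, varbinary, 4}, the comparator above yields
+// column_order == {1, 3, 2, 0}: the 8-byte column first, then the 4-byte
+// fixed-length column, then the varbinary column (whose fixed-length part is
+// its 4-byte end-offset field), then the 1-byte column. Wider fields come
+// first, fixed-length beats varying-length at equal width, and the original
+// index is the final tie-breaker.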
+
+void KeyEncoder::Init(const std::vector<KeyColumnMetadata>& cols, KeyEncoderContext* ctx,
+ int row_alignment, int string_alignment) {
+ ctx_ = ctx;
+ row_metadata_.FromColumnMetadataVector(cols, row_alignment, string_alignment);
+ uint32_t num_cols = row_metadata_.num_cols();
+ uint32_t num_varbinary_cols = row_metadata_.num_varbinary_cols();
+ batch_all_cols_.resize(num_cols);
+ batch_varbinary_cols_.resize(num_varbinary_cols);
+ batch_varbinary_cols_base_offsets_.resize(num_varbinary_cols);
+}
+
+void KeyEncoder::PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows,
+ const std::vector<KeyColumnArray>& cols_in) {
+ const auto num_cols = static_cast<uint32_t>(cols_in.size());
+ DCHECK(batch_all_cols_.size() == num_cols);
+
+ uint32_t num_varbinary_visited = 0;
+ for (uint32_t i = 0; i < num_cols; ++i) {
+ const KeyColumnArray& col = cols_in[row_metadata_.column_order[i]];
+ KeyColumnArray col_window(col, start_row, num_rows);
+ batch_all_cols_[i] = col_window;
+ if (!col.metadata().is_fixed_length) {
+ DCHECK(num_varbinary_visited < batch_varbinary_cols_.size());
+      // If the start row is zero, the base offset of the varbinary column is also zero.
+ if (start_row == 0) {
+ batch_varbinary_cols_base_offsets_[num_varbinary_visited] = 0;
+ } else {
+ batch_varbinary_cols_base_offsets_[num_varbinary_visited] =
+ col.offsets()[start_row];
+ }
+ batch_varbinary_cols_[num_varbinary_visited++] = col_window;
+ }
+ }
+}
+
+Status KeyEncoder::PrepareOutputForEncode(int64_t start_row, int64_t num_rows,
+ KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& all_cols) {
+ int64_t num_bytes_required = 0;
+
+ int64_t fixed_part = row_metadata_.fixed_length * num_rows;
+ int64_t var_part = 0;
+ for (const auto& col : all_cols) {
+ if (!col.metadata().is_fixed_length) {
+ DCHECK(col.length() >= start_row + num_rows);
+ const uint32_t* offsets = col.offsets();
+ var_part += offsets[start_row + num_rows] - offsets[start_row];
+ // Include maximum padding that can be added to align the start of varbinary fields.
+ var_part += num_rows * row_metadata_.string_alignment;
+ }
+ }
+ // Include maximum padding that can be added to align the start of the rows.
+ if (!row_metadata_.is_fixed_length) {
+ fixed_part += row_metadata_.row_alignment * num_rows;
+ }
+ num_bytes_required = fixed_part + var_part;
+
+ rows->Clean();
+ RETURN_NOT_OK(rows->AppendEmpty(static_cast<uint32_t>(num_rows),
+ static_cast<uint32_t>(num_bytes_required)));
+
+ return Status::OK();
+}
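+
+// Editorial note (not part of the original sources): the size computed above
+// is an upper bound rather than an exact measurement. Every row may need up
+// to row_alignment - 1 padding bytes and every varbinary field up to
+// string_alignment - 1, so the full alignment values are reserved per row
+// unconditionally and AppendEmpty() can never come up short.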
+
+void KeyEncoder::Encode(int64_t start_row, int64_t num_rows, KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& cols) {
+ // Prepare column array vectors
+ PrepareKeyColumnArrays(start_row, num_rows, cols);
+
+ // Create two temp vectors with 16-bit elements
+ auto temp_buffer_holder_A =
+ util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
+ auto temp_buffer_A = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder_A.mutable_data()), nullptr);
+ auto temp_buffer_holder_B =
+ util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
+ auto temp_buffer_B = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder_B.mutable_data()), nullptr);
+
+ bool is_row_fixed_length = row_metadata_.is_fixed_length;
+ if (!is_row_fixed_length) {
+    // This call generates and fills in data for both:
+    // - offsets of the encoded rows within the output array
+    // - end offsets of the individual varbinary fields within each row
+ EncoderOffsets::Encode(rows, batch_varbinary_cols_, ctx_);
+
+ for (size_t i = 0; i < batch_varbinary_cols_.size(); ++i) {
+      // Memcpy varbinary fields into the positions in the output row
+      // buffer that were precomputed in the previous step.
+ EncoderVarBinary::Encode(static_cast<uint32_t>(i), rows, batch_varbinary_cols_[i],
+ ctx_);
+ }
+ }
+
+ // Process fixed length columns
+ const auto num_cols = static_cast<uint32_t>(batch_all_cols_.size());
+ for (uint32_t i = 0; i < num_cols;) {
+ if (!batch_all_cols_[i].metadata().is_fixed_length) {
+ i += 1;
+ continue;
+ }
+ bool can_process_pair =
+ (i + 1 < num_cols) && batch_all_cols_[i + 1].metadata().is_fixed_length &&
+ EncoderBinaryPair::CanProcessPair(batch_all_cols_[i].metadata(),
+ batch_all_cols_[i + 1].metadata());
+ if (!can_process_pair) {
+ EncoderBinary::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i],
+ ctx_, &temp_buffer_A);
+ i += 1;
+ } else {
+ EncoderBinaryPair::Encode(row_metadata_.column_offsets[i], rows, batch_all_cols_[i],
+ batch_all_cols_[i + 1], ctx_, &temp_buffer_A,
+ &temp_buffer_B);
+ i += 2;
+ }
+ }
+
+ // Process nulls
+ EncoderNulls::Encode(rows, batch_all_cols_, ctx_, &temp_buffer_A);
+}
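+
+// Editorial sketch (not part of the original sources): a typical caller
+// sequence, assuming `encoder`, `ctx`, `rows` and `cols` have been set up
+// elsewhere:
+//
+//   encoder.Init(column_metadatas, &ctx, /*row_alignment=*/8,
+//                /*string_alignment=*/8);
+//   RETURN_NOT_OK(encoder.PrepareOutputForEncode(0, num_rows, &rows, cols));
+//   encoder.Encode(0, num_rows, &rows, cols);
+//
+// Decoding mirrors this with DecodeFixedLengthBuffers() first (which also
+// recovers the varbinary lengths) followed by DecodeVaryingLengthBuffers().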
+
+void KeyEncoder::DecodeFixedLengthBuffers(int64_t start_row_input,
+ int64_t start_row_output, int64_t num_rows,
+ const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols) {
+ // Prepare column array vectors
+ PrepareKeyColumnArrays(start_row_output, num_rows, *cols);
+
+ // Create two temp vectors with 16-bit elements
+ auto temp_buffer_holder_A =
+ util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
+ auto temp_buffer_A = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder_A.mutable_data()), nullptr);
+ auto temp_buffer_holder_B =
+ util::TempVectorHolder<uint16_t>(ctx_->stack, static_cast<uint32_t>(num_rows));
+ auto temp_buffer_B = KeyColumnArray(
+ KeyColumnMetadata(true, sizeof(uint16_t)), num_rows, nullptr,
+ reinterpret_cast<uint8_t*>(temp_buffer_holder_B.mutable_data()), nullptr);
+
+ bool is_row_fixed_length = row_metadata_.is_fixed_length;
+ if (!is_row_fixed_length) {
+ EncoderOffsets::Decode(static_cast<uint32_t>(start_row_input),
+ static_cast<uint32_t>(num_rows), rows, &batch_varbinary_cols_,
+ batch_varbinary_cols_base_offsets_, ctx_);
+ }
+
+ // Process fixed length columns
+ const auto num_cols = static_cast<uint32_t>(batch_all_cols_.size());
+ for (uint32_t i = 0; i < num_cols;) {
+ if (!batch_all_cols_[i].metadata().is_fixed_length) {
+ i += 1;
+ continue;
+ }
+ bool can_process_pair =
+ (i + 1 < num_cols) && batch_all_cols_[i + 1].metadata().is_fixed_length &&
+ EncoderBinaryPair::CanProcessPair(batch_all_cols_[i].metadata(),
+ batch_all_cols_[i + 1].metadata());
+ if (!can_process_pair) {
+ EncoderBinary::Decode(static_cast<uint32_t>(start_row_input),
+ static_cast<uint32_t>(num_rows),
+ row_metadata_.column_offsets[i], rows, &batch_all_cols_[i],
+ ctx_, &temp_buffer_A);
+ i += 1;
+ } else {
+ EncoderBinaryPair::Decode(
+ static_cast<uint32_t>(start_row_input), static_cast<uint32_t>(num_rows),
+ row_metadata_.column_offsets[i], rows, &batch_all_cols_[i],
+ &batch_all_cols_[i + 1], ctx_, &temp_buffer_A, &temp_buffer_B);
+ i += 2;
+ }
+ }
+
+ // Process nulls
+ EncoderNulls::Decode(static_cast<uint32_t>(start_row_input),
+ static_cast<uint32_t>(num_rows), rows, &batch_all_cols_);
+}
+
+void KeyEncoder::DecodeVaryingLengthBuffers(int64_t start_row_input,
+ int64_t start_row_output, int64_t num_rows,
+ const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols) {
+ // Prepare column array vectors
+ PrepareKeyColumnArrays(start_row_output, num_rows, *cols);
+
+ bool is_row_fixed_length = row_metadata_.is_fixed_length;
+ if (!is_row_fixed_length) {
+ for (size_t i = 0; i < batch_varbinary_cols_.size(); ++i) {
+      // Memcpy varbinary fields out of the row buffer into the column
+      // positions precomputed in the previous step.
+ EncoderVarBinary::Decode(static_cast<uint32_t>(start_row_input),
+ static_cast<uint32_t>(num_rows), static_cast<uint32_t>(i),
+ rows, &batch_varbinary_cols_[i], ctx_);
+ }
+ }
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h
index f59690e0e6c..e5397b9dfd4 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_encode.h
@@ -1,635 +1,635 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <vector>
-
-#include "arrow/compute/exec/util.h"
-#include "arrow/memory_pool.h"
-#include "arrow/result.h"
-#include "arrow/status.h"
-#include "arrow/util/bit_util.h"
-
-namespace arrow {
-namespace compute {
-
-class KeyColumnMetadata;
-
-/// Converts between key representation as a collection of arrays for
-/// individual columns and another representation as a single array of rows
-/// combining data from all columns into one value.
-/// This conversion is reversible.
-/// Row-oriented storage is beneficial when there is a need for random access
-/// of individual rows and at the same time all included columns are likely to
-/// be accessed together, as in the case of a hash table key.
-class KeyEncoder {
- public:
- struct KeyEncoderContext {
- bool has_avx2() const {
- return (hardware_flags & arrow::internal::CpuInfo::AVX2) > 0;
- }
- int64_t hardware_flags;
- util::TempVectorStack* stack;
- };
-
- /// Description of a storage format of a single key column as needed
- /// for the purpose of row encoding.
- struct KeyColumnMetadata {
- KeyColumnMetadata() = default;
- KeyColumnMetadata(bool is_fixed_length_in, uint32_t fixed_length_in)
- : is_fixed_length(is_fixed_length_in), fixed_length(fixed_length_in) {}
- /// Is column storing a varying-length binary, using offsets array
- /// to find a beginning of a value, or is it a fixed-length binary.
- bool is_fixed_length;
- /// For a fixed-length binary column: number of bytes per value.
- /// Zero has a special meaning, indicating a bit vector with one bit per value.
- /// For a varying-length binary column: number of bytes per offset.
- uint32_t fixed_length;
- };
-
- /// Description of a storage format for rows produced by encoder.
- struct KeyRowMetadata {
- /// Is row a varying-length binary, using offsets array to find a beginning of a row,
- /// or is it a fixed-length binary.
- bool is_fixed_length;
-
- /// For a fixed-length binary row, common size of rows in bytes,
- /// rounded up to the multiple of alignment.
- ///
- /// For a varying-length binary, size of all encoded fixed-length key columns,
- /// including lengths of varying-length columns, rounded up to the multiple of string
- /// alignment.
- uint32_t fixed_length;
-
- /// Offset within a row to the array of 32-bit offsets within a row of
- /// ends of varbinary fields.
- /// Used only when the row is not fixed-length, zero for fixed-length row.
- /// There are N elements for N varbinary fields.
- /// Each element is the offset within a row of the first byte after
- /// the corresponding varbinary field bytes in that row.
- /// If varbinary fields begin at aligned addresses, then the end of the previous
- /// varbinary field needs to be rounded up according to the specified alignment
- /// to obtain the beginning of the next varbinary field.
- /// The first varbinary field starts at offset specified by fixed_length,
- /// which should already be aligned.
- uint32_t varbinary_end_array_offset;
-
- /// Fixed number of bytes per row that are used to encode null masks.
- /// Null masks indicate for a single row which of its key columns are null.
- /// Nth bit in the sequence of bytes assigned to a row represents null
- /// information for Nth field according to the order in which they are encoded.
- int null_masks_bytes_per_row;
-
- /// Power of 2. Every row will start at the offset aligned to that number of bytes.
- int row_alignment;
-
- /// Power of 2. Must be no greater than row alignment.
- /// Every non-power-of-2 binary field and every varbinary field bytes
- /// will start aligned to that number of bytes.
- int string_alignment;
-
- /// Metadata of encoded columns in their original order.
- std::vector<KeyColumnMetadata> column_metadatas;
-
- /// Order in which fields are encoded.
- std::vector<uint32_t> column_order;
-
- /// Offsets within a row to fields in their encoding order.
- std::vector<uint32_t> column_offsets;
-
- /// Rounding up offset to the nearest multiple of alignment value.
- /// Alignment must be a power of 2.
- static inline uint32_t padding_for_alignment(uint32_t offset,
- int required_alignment) {
- ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1);
- return static_cast<uint32_t>((-static_cast<int32_t>(offset)) &
- (required_alignment - 1));
- }
-
- /// Rounding up offset to the beginning of the next column,
- /// choosing the required alignment based on the data type of that column.
- static inline uint32_t padding_for_alignment(uint32_t offset, int string_alignment,
- const KeyColumnMetadata& col_metadata) {
- if (!col_metadata.is_fixed_length ||
- ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) {
- return 0;
- } else {
- return padding_for_alignment(offset, string_alignment);
- }
- }
-
- /// Returns an array of offsets within a row of ends of varbinary fields.
- inline const uint32_t* varbinary_end_array(const uint8_t* row) const {
- ARROW_DCHECK(!is_fixed_length);
- return reinterpret_cast<const uint32_t*>(row + varbinary_end_array_offset);
- }
- inline uint32_t* varbinary_end_array(uint8_t* row) const {
- ARROW_DCHECK(!is_fixed_length);
- return reinterpret_cast<uint32_t*>(row + varbinary_end_array_offset);
- }
-
- /// Returns the offset within the row and length of the first varbinary field.
- inline void first_varbinary_offset_and_length(const uint8_t* row, uint32_t* offset,
- uint32_t* length) const {
- ARROW_DCHECK(!is_fixed_length);
- *offset = fixed_length;
- *length = varbinary_end_array(row)[0] - fixed_length;
- }
-
- /// Returns the offset within the row and length of the second and further varbinary
- /// fields.
- inline void nth_varbinary_offset_and_length(const uint8_t* row, int varbinary_id,
- uint32_t* out_offset,
- uint32_t* out_length) const {
- ARROW_DCHECK(!is_fixed_length);
- ARROW_DCHECK(varbinary_id > 0);
- const uint32_t* varbinary_end = varbinary_end_array(row);
- uint32_t offset = varbinary_end[varbinary_id - 1];
- offset += padding_for_alignment(offset, string_alignment);
- *out_offset = offset;
- *out_length = varbinary_end[varbinary_id] - offset;
- }
-
- uint32_t encoded_field_order(uint32_t icol) const { return column_order[icol]; }
-
- uint32_t encoded_field_offset(uint32_t icol) const { return column_offsets[icol]; }
-
- uint32_t num_cols() const { return static_cast<uint32_t>(column_metadatas.size()); }
-
- uint32_t num_varbinary_cols() const;
-
- void FromColumnMetadataVector(const std::vector<KeyColumnMetadata>& cols,
- int in_row_alignment, int in_string_alignment);
-
- bool is_compatible(const KeyRowMetadata& other) const;
- };
-
- class KeyRowArray {
- public:
- KeyRowArray();
- Status Init(MemoryPool* pool, const KeyRowMetadata& metadata);
- void Clean();
- Status AppendEmpty(uint32_t num_rows_to_append, uint32_t num_extra_bytes_to_append);
- Status AppendSelectionFrom(const KeyRowArray& from, uint32_t num_rows_to_append,
- const uint16_t* source_row_ids);
- const KeyRowMetadata& metadata() const { return metadata_; }
- int64_t length() const { return num_rows_; }
- const uint8_t* data(int i) const {
- ARROW_DCHECK(i >= 0 && i <= max_buffers_);
- return buffers_[i];
- }
- uint8_t* mutable_data(int i) {
- ARROW_DCHECK(i >= 0 && i <= max_buffers_);
- return mutable_buffers_[i];
- }
- const uint32_t* offsets() const { return reinterpret_cast<const uint32_t*>(data(1)); }
- uint32_t* mutable_offsets() { return reinterpret_cast<uint32_t*>(mutable_data(1)); }
- const uint8_t* null_masks() const { return null_masks_->data(); }
- uint8_t* null_masks() { return null_masks_->mutable_data(); }
-
- bool has_any_nulls(const KeyEncoderContext* ctx) const;
-
- private:
- Status ResizeFixedLengthBuffers(int64_t num_extra_rows);
- Status ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes);
-
- int64_t size_null_masks(int64_t num_rows);
- int64_t size_offsets(int64_t num_rows);
- int64_t size_rows_fixed_length(int64_t num_rows);
- int64_t size_rows_varying_length(int64_t num_bytes);
- void update_buffer_pointers();
-
- static constexpr int64_t padding_for_vectors = 64;
- MemoryPool* pool_;
- KeyRowMetadata metadata_;
- /// Buffers can only expand during lifetime and never shrink.
- std::unique_ptr<ResizableBuffer> null_masks_;
- std::unique_ptr<ResizableBuffer> offsets_;
- std::unique_ptr<ResizableBuffer> rows_;
- static constexpr int max_buffers_ = 3;
- const uint8_t* buffers_[max_buffers_];
- uint8_t* mutable_buffers_[max_buffers_];
- int64_t num_rows_;
- int64_t rows_capacity_;
- int64_t bytes_capacity_;
-
- // Mutable to allow lazy evaluation
- mutable int64_t num_rows_for_has_any_nulls_;
- mutable bool has_any_nulls_;
- };
-
- /// A lightweight description of an array representing one of key columns.
- class KeyColumnArray {
- public:
- KeyColumnArray() = default;
- /// Create as a mix of buffers according to the mask from two descriptions
- /// (Nth bit is set to 0 if Nth buffer from the first input
- /// should be used and is set to 1 otherwise).
- /// Metadata is inherited from the first input.
- KeyColumnArray(const KeyColumnMetadata& metadata, const KeyColumnArray& left,
- const KeyColumnArray& right, int buffer_id_to_replace);
- /// Create for reading
- KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length,
- const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* buffer2,
- int bit_offset0 = 0, int bit_offset1 = 0);
- /// Create for writing
- KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length, uint8_t* buffer0,
- uint8_t* buffer1, uint8_t* buffer2, int bit_offset0 = 0,
- int bit_offset1 = 0);
- /// Create as a window view of original description that is offset
- /// by a given number of rows.
- /// The number of rows used in offset must be divisible by 8
- /// in order to not split bit vectors within a single byte.
- KeyColumnArray(const KeyColumnArray& from, int64_t start, int64_t length);
- uint8_t* mutable_data(int i) {
- ARROW_DCHECK(i >= 0 && i <= max_buffers_);
- return mutable_buffers_[i];
- }
- const uint8_t* data(int i) const {
- ARROW_DCHECK(i >= 0 && i <= max_buffers_);
- return buffers_[i];
- }
- uint32_t* mutable_offsets() { return reinterpret_cast<uint32_t*>(mutable_data(1)); }
- const uint32_t* offsets() const { return reinterpret_cast<const uint32_t*>(data(1)); }
- const KeyColumnMetadata& metadata() const { return metadata_; }
- int64_t length() const { return length_; }
- int bit_offset(int i) const {
- ARROW_DCHECK(i >= 0 && i < max_buffers_);
- return bit_offset_[i];
- }
-
- private:
- static constexpr int max_buffers_ = 3;
- const uint8_t* buffers_[max_buffers_];
- uint8_t* mutable_buffers_[max_buffers_];
- KeyColumnMetadata metadata_;
- int64_t length_;
- // Starting bit offset within the first byte (between 0 and 7)
- // to be used when accessing buffers that store bit vectors.
- int bit_offset_[max_buffers_ - 1];
- };
-
- void Init(const std::vector<KeyColumnMetadata>& cols, KeyEncoderContext* ctx,
- int row_alignment, int string_alignment);
-
- const KeyRowMetadata& row_metadata() { return row_metadata_; }
-
- /// Find out the required sizes of all output buffers for encoding
- /// (including varying-length buffers).
- /// Use that information to resize the provided row array so that it can fit
- /// the encoded data.
- Status PrepareOutputForEncode(int64_t start_input_row, int64_t num_input_rows,
- KeyRowArray* rows,
- const std::vector<KeyColumnArray>& all_cols);
-
- /// Encode a window of column oriented data into the entire output
- /// row oriented storage.
- /// The output buffers for encoding need to be correctly sized before
- /// starting encoding.
- void Encode(int64_t start_input_row, int64_t num_input_rows, KeyRowArray* rows,
- const std::vector<KeyColumnArray>& cols);
-
- /// Decode a window of row oriented data into a corresponding
- /// window of column oriented storage.
- /// The output buffers need to be correctly allocated and sized before
- /// calling each method.
- /// For that reason decoding is split into two functions.
- /// The output of the first one, that processes everything except for
- /// varying length buffers, can be used to find out required varying
- /// length buffers sizes.
- void DecodeFixedLengthBuffers(int64_t start_row_input, int64_t start_row_output,
- int64_t num_rows, const KeyRowArray& rows,
- std::vector<KeyColumnArray>* cols);
-
- void DecodeVaryingLengthBuffers(int64_t start_row_input, int64_t start_row_output,
- int64_t num_rows, const KeyRowArray& rows,
- std::vector<KeyColumnArray>* cols);
-
- private:
- /// Prepare column array vectors.
- /// Output column arrays represent a range of input column arrays
- /// specified by starting row and number of rows.
- /// Three vectors are generated:
- /// - all columns
- /// - fixed-length columns only
- /// - varying-length columns only
- void PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows,
- const std::vector<KeyColumnArray>& cols_in);
-
- class TransformBoolean {
- public:
- static KeyColumnArray ArrayReplace(const KeyColumnArray& column,
- const KeyColumnArray& temp);
- static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output,
- KeyEncoderContext* ctx);
- static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output,
- KeyEncoderContext* ctx);
- };
-
- class EncoderInteger {
- public:
- static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx,
- KeyColumnArray* temp);
- static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col,
- KeyEncoderContext* ctx, KeyColumnArray* temp);
- static bool UsesTransform(const KeyColumnArray& column);
- static KeyColumnArray ArrayReplace(const KeyColumnArray& column,
- const KeyColumnArray& temp);
- static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output,
- KeyEncoderContext* ctx);
- static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output,
- KeyEncoderContext* ctx);
-
- private:
- static bool IsBoolean(const KeyColumnMetadata& metadata);
- };
-
- class EncoderBinary {
- public:
- static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx,
- KeyColumnArray* temp);
- static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col,
- KeyEncoderContext* ctx, KeyColumnArray* temp);
- static bool IsInteger(const KeyColumnMetadata& metadata);
-
- private:
- template <bool is_row_fixed_length, bool is_encoding, class COPY_FN>
- static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row,
- const KeyRowArray* rows_const,
- KeyRowArray* rows_mutable_maybe_null,
- const KeyColumnArray* col_const,
- KeyColumnArray* col_mutable_maybe_null,
- COPY_FN copy_fn);
- template <bool is_row_fixed_length>
- static void EncodeImp(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col);
- template <bool is_row_fixed_length>
- static void DecodeImp(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row, const KeyRowArray& rows,
- KeyColumnArray* col);
-#if defined(ARROW_HAVE_AVX2)
- static void EncodeHelper_avx2(bool is_row_fixed_length, uint32_t offset_within_row,
- KeyRowArray* rows, const KeyColumnArray& col);
- static void DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row,
- uint32_t num_rows, uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col);
- template <bool is_row_fixed_length>
- static void EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col);
- template <bool is_row_fixed_length>
- static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row, const KeyRowArray& rows,
- KeyColumnArray* col);
-#endif
- static void ColumnMemsetNulls(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx,
- KeyColumnArray* temp_vector_16bit, uint8_t byte_value);
- template <bool is_row_fixed_length, uint32_t col_width>
- static void ColumnMemsetNullsImp(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx,
- KeyColumnArray* temp_vector_16bit,
- uint8_t byte_value);
- };
-
- class EncoderBinaryPair {
- public:
- static bool CanProcessPair(const KeyColumnMetadata& col1,
- const KeyColumnMetadata& col2) {
- return EncoderBinary::IsInteger(col1) && EncoderBinary::IsInteger(col2);
- }
- static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col1, const KeyColumnArray& col2,
- KeyEncoderContext* ctx, KeyColumnArray* temp1,
- KeyColumnArray* temp2);
- static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col1,
- KeyColumnArray* col2, KeyEncoderContext* ctx,
- KeyColumnArray* temp1, KeyColumnArray* temp2);
-
- private:
- template <bool is_row_fixed_length, typename col1_type, typename col2_type>
- static void EncodeImp(uint32_t num_rows_to_skip, uint32_t offset_within_row,
- KeyRowArray* rows, const KeyColumnArray& col1,
- const KeyColumnArray& col2);
- template <bool is_row_fixed_length, typename col1_type, typename col2_type>
- static void DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row,
- uint32_t num_rows, uint32_t offset_within_row,
- const KeyRowArray& rows, KeyColumnArray* col1,
- KeyColumnArray* col2);
-#if defined(ARROW_HAVE_AVX2)
- static uint32_t EncodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width,
- uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col1,
- const KeyColumnArray& col2);
- static uint32_t DecodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width,
- uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row, const KeyRowArray& rows,
- KeyColumnArray* col1, KeyColumnArray* col2);
- template <bool is_row_fixed_length, uint32_t col_width>
- static uint32_t EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows,
- const KeyColumnArray& col1,
- const KeyColumnArray& col2);
- template <bool is_row_fixed_length, uint32_t col_width>
- static uint32_t DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
- uint32_t offset_within_row, const KeyRowArray& rows,
- KeyColumnArray* col1, KeyColumnArray* col2);
-#endif
- };
-
- class EncoderOffsets {
- public:
- // In order not to do the same work twice,
- // encoding computes in a single pass both:
- // a) row offsets for varying-length rows
- // b) within each new row, the cumulative length array
- //    of varying-length values within that row.
- static void Encode(KeyRowArray* rows,
- const std::vector<KeyColumnArray>& varbinary_cols,
- KeyEncoderContext* ctx);
- static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
- std::vector<KeyColumnArray>* varbinary_cols,
- const std::vector<uint32_t>& varbinary_cols_base_offset,
- KeyEncoderContext* ctx);
-
- private:
- static void EncodeImp(uint32_t num_rows_already_processed, KeyRowArray* rows,
- const std::vector<KeyColumnArray>& varbinary_cols);
-#if defined(ARROW_HAVE_AVX2)
- static uint32_t EncodeImp_avx2(KeyRowArray* rows,
- const std::vector<KeyColumnArray>& varbinary_cols,
- KeyColumnArray* temp_buffer_32B_per_col);
-#endif
- };
-
- class EncoderVarBinary {
- public:
- static void Encode(uint32_t varbinary_col_id, KeyRowArray* rows,
- const KeyColumnArray& col, KeyEncoderContext* ctx);
- static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id,
- const KeyRowArray& rows, KeyColumnArray* col,
- KeyEncoderContext* ctx);
-
- private:
- template <bool first_varbinary_col, bool is_encoding, class COPY_FN>
- static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows,
- uint32_t varbinary_col_id,
- const KeyRowArray* rows_const,
- KeyRowArray* rows_mutable_maybe_null,
- const KeyColumnArray* col_const,
- KeyColumnArray* col_mutable_maybe_null,
- COPY_FN copy_fn);
- template <bool first_varbinary_col>
- static void EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows,
- const KeyColumnArray& col);
- template <bool first_varbinary_col>
- static void DecodeImp(uint32_t start_row, uint32_t num_rows,
- uint32_t varbinary_col_id, const KeyRowArray& rows,
- KeyColumnArray* col);
-#if defined(ARROW_HAVE_AVX2)
- static void EncodeHelper_avx2(uint32_t varbinary_col_id, KeyRowArray* rows,
- const KeyColumnArray& col);
- static void DecodeHelper_avx2(uint32_t start_row, uint32_t num_rows,
- uint32_t varbinary_col_id, const KeyRowArray& rows,
- KeyColumnArray* col);
- template <bool first_varbinary_col>
- static void EncodeImp_avx2(uint32_t varbinary_col_id, KeyRowArray* rows,
- const KeyColumnArray& col);
- template <bool first_varbinary_col>
- static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
- uint32_t varbinary_col_id, const KeyRowArray& rows,
- KeyColumnArray* col);
-#endif
- };
-
- class EncoderNulls {
- public:
- static void Encode(KeyRowArray* rows, const std::vector<KeyColumnArray>& cols,
- KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit);
- static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
- std::vector<KeyColumnArray>* cols);
- };
-
- KeyEncoderContext* ctx_;
-
- // Data initialized once, based on data types of key columns
- KeyRowMetadata row_metadata_;
-
- // Data initialized for each input batch.
- // All elements are ordered according to the order of encoded fields in a row.
- std::vector<KeyColumnArray> batch_all_cols_;
- std::vector<KeyColumnArray> batch_varbinary_cols_;
- std::vector<uint32_t> batch_varbinary_cols_base_offsets_;
-};
-
-template <bool is_row_fixed_length, bool is_encoding, class COPY_FN>
-inline void KeyEncoder::EncoderBinary::EncodeDecodeHelper(
- uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
- const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null,
- const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null,
- COPY_FN copy_fn) {
- ARROW_DCHECK(col_const && col_const->metadata().is_fixed_length);
- uint32_t col_width = col_const->metadata().fixed_length;
-
- if (is_row_fixed_length) {
- uint32_t row_width = rows_const->metadata().fixed_length;
- for (uint32_t i = 0; i < num_rows; ++i) {
- const uint8_t* src;
- uint8_t* dst;
- if (is_encoding) {
- src = col_const->data(1) + col_width * i;
- dst = rows_mutable_maybe_null->mutable_data(1) + row_width * (start_row + i) +
- offset_within_row;
- } else {
- src = rows_const->data(1) + row_width * (start_row + i) + offset_within_row;
- dst = col_mutable_maybe_null->mutable_data(1) + col_width * i;
- }
- copy_fn(dst, src, col_width);
- }
- } else {
- const uint32_t* row_offsets = rows_const->offsets();
- for (uint32_t i = 0; i < num_rows; ++i) {
- const uint8_t* src;
- uint8_t* dst;
- if (is_encoding) {
- src = col_const->data(1) + col_width * i;
- dst = rows_mutable_maybe_null->mutable_data(2) + row_offsets[start_row + i] +
- offset_within_row;
- } else {
- src = rows_const->data(2) + row_offsets[start_row + i] + offset_within_row;
- dst = col_mutable_maybe_null->mutable_data(1) + col_width * i;
- }
- copy_fn(dst, src, col_width);
- }
- }
-}
-
-template <bool first_varbinary_col, bool is_encoding, class COPY_FN>
-inline void KeyEncoder::EncoderVarBinary::EncodeDecodeHelper(
- uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id,
- const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null,
- const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null,
- COPY_FN copy_fn) {
- // Column and rows need to be varying length
- ARROW_DCHECK(!rows_const->metadata().is_fixed_length &&
- !col_const->metadata().is_fixed_length);
-
- const uint32_t* row_offsets_for_batch = rows_const->offsets() + start_row;
- const uint32_t* col_offsets = col_const->offsets();
-
- uint32_t col_offset_next = col_offsets[0];
- for (uint32_t i = 0; i < num_rows; ++i) {
- uint32_t col_offset = col_offset_next;
- col_offset_next = col_offsets[i + 1];
-
- uint32_t row_offset = row_offsets_for_batch[i];
- const uint8_t* row = rows_const->data(2) + row_offset;
-
- uint32_t offset_within_row;
- uint32_t length;
- if (first_varbinary_col) {
- rows_const->metadata().first_varbinary_offset_and_length(row, &offset_within_row,
- &length);
- } else {
- rows_const->metadata().nth_varbinary_offset_and_length(row, varbinary_col_id,
- &offset_within_row, &length);
- }
-
- row_offset += offset_within_row;
-
- const uint8_t* src;
- uint8_t* dst;
- if (is_encoding) {
- src = col_const->data(2) + col_offset;
- dst = rows_mutable_maybe_null->mutable_data(2) + row_offset;
- } else {
- src = rows_const->data(2) + row_offset;
- dst = col_mutable_maybe_null->mutable_data(2) + col_offset;
- }
- copy_fn(dst, src, length);
- }
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/compute/exec/util.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+
+namespace arrow {
+namespace compute {
+
+class KeyColumnMetadata;
+
+/// Converts between key representation as a collection of arrays for
+/// individual columns and another representation as a single array of rows
+/// combining data from all columns into one value.
+/// This conversion is reversible.
+/// Row-oriented storage is beneficial when there is a need for random access
+/// of individual rows and at the same time all included columns are likely to
+/// be accessed together, as in the case of a hash table key.
+class KeyEncoder {
+ public:
+ struct KeyEncoderContext {
+ bool has_avx2() const {
+ return (hardware_flags & arrow::internal::CpuInfo::AVX2) > 0;
+ }
+ int64_t hardware_flags;
+ util::TempVectorStack* stack;
+ };
+
+ /// Description of a storage format of a single key column as needed
+ /// for the purpose of row encoding.
+ struct KeyColumnMetadata {
+ KeyColumnMetadata() = default;
+ KeyColumnMetadata(bool is_fixed_length_in, uint32_t fixed_length_in)
+ : is_fixed_length(is_fixed_length_in), fixed_length(fixed_length_in) {}
+    /// Whether the column stores varying-length binary, using an offsets
+    /// array to find the beginning of each value, or fixed-length binary.
+ bool is_fixed_length;
+ /// For a fixed-length binary column: number of bytes per value.
+ /// Zero has a special meaning, indicating a bit vector with one bit per value.
+ /// For a varying-length binary column: number of bytes per offset.
+ uint32_t fixed_length;
+ };
+
+ /// Description of a storage format for rows produced by encoder.
+ struct KeyRowMetadata {
+    /// Whether each row is varying-length binary, using an offsets array to find
+    /// the beginning of a row, or fixed-length binary.
+ bool is_fixed_length;
+
+    /// For a fixed-length binary row, the common size of rows in bytes,
+    /// rounded up to a multiple of the alignment.
+    ///
+    /// For a varying-length binary row, the size of all encoded fixed-length key
+    /// columns, including the lengths of varying-length columns, rounded up to a
+    /// multiple of the string alignment.
+ uint32_t fixed_length;
+
+    /// Offset within a row to the array of 32-bit end offsets of the
+    /// varbinary fields in that row.
+ /// Used only when the row is not fixed-length, zero for fixed-length row.
+ /// There are N elements for N varbinary fields.
+ /// Each element is the offset within a row of the first byte after
+ /// the corresponding varbinary field bytes in that row.
+    /// If varbinary fields begin at aligned addresses, then the end of the previous
+ /// varbinary field needs to be rounded up according to the specified alignment
+ /// to obtain the beginning of the next varbinary field.
+ /// The first varbinary field starts at offset specified by fixed_length,
+ /// which should already be aligned.
+ uint32_t varbinary_end_array_offset;
+
+ /// Fixed number of bytes per row that are used to encode null masks.
+ /// Null masks indicate for a single row which of its key columns are null.
+    /// The Nth bit in the sequence of bytes assigned to a row represents the null
+    /// information for the Nth field, according to the order in which they are encoded.
+ int null_masks_bytes_per_row;
+
+ /// Power of 2. Every row will start at the offset aligned to that number of bytes.
+ int row_alignment;
+
+ /// Power of 2. Must be no greater than row alignment.
+    /// The bytes of every non-power-of-2 binary field and of every varbinary field
+    /// will start aligned to that number of bytes.
+ int string_alignment;
+
+ /// Metadata of encoded columns in their original order.
+ std::vector<KeyColumnMetadata> column_metadatas;
+
+ /// Order in which fields are encoded.
+ std::vector<uint32_t> column_order;
+
+ /// Offsets within a row to fields in their encoding order.
+ std::vector<uint32_t> column_offsets;
+
+    /// Returns the number of padding bytes needed to round the offset up
+    /// to the nearest multiple of the alignment value.
+ /// Alignment must be a power of 2.
+ static inline uint32_t padding_for_alignment(uint32_t offset,
+ int required_alignment) {
+ ARROW_DCHECK(ARROW_POPCOUNT64(required_alignment) == 1);
+ return static_cast<uint32_t>((-static_cast<int32_t>(offset)) &
+ (required_alignment - 1));
+ }
+
+    /// Returns the padding needed to round the offset up to the beginning of the
+    /// next column, choosing the required alignment based on its data type.
+ static inline uint32_t padding_for_alignment(uint32_t offset, int string_alignment,
+ const KeyColumnMetadata& col_metadata) {
+ if (!col_metadata.is_fixed_length ||
+ ARROW_POPCOUNT64(col_metadata.fixed_length) <= 1) {
+ return 0;
+ } else {
+ return padding_for_alignment(offset, string_alignment);
+ }
+ }
+
+    /// Returns the array of end offsets of the varbinary fields within a row.
+ inline const uint32_t* varbinary_end_array(const uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<const uint32_t*>(row + varbinary_end_array_offset);
+ }
+ inline uint32_t* varbinary_end_array(uint8_t* row) const {
+ ARROW_DCHECK(!is_fixed_length);
+ return reinterpret_cast<uint32_t*>(row + varbinary_end_array_offset);
+ }
+
+ /// Returns the offset within the row and length of the first varbinary field.
+ inline void first_varbinary_offset_and_length(const uint8_t* row, uint32_t* offset,
+ uint32_t* length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ *offset = fixed_length;
+ *length = varbinary_end_array(row)[0] - fixed_length;
+ }
+
+    /// Returns the offset within the row and length of the second and subsequent
+    /// varbinary fields.
+ inline void nth_varbinary_offset_and_length(const uint8_t* row, int varbinary_id,
+ uint32_t* out_offset,
+ uint32_t* out_length) const {
+ ARROW_DCHECK(!is_fixed_length);
+ ARROW_DCHECK(varbinary_id > 0);
+ const uint32_t* varbinary_end = varbinary_end_array(row);
+ uint32_t offset = varbinary_end[varbinary_id - 1];
+ offset += padding_for_alignment(offset, string_alignment);
+ *out_offset = offset;
+ *out_length = varbinary_end[varbinary_id] - offset;
+ }
+
+ uint32_t encoded_field_order(uint32_t icol) const { return column_order[icol]; }
+
+ uint32_t encoded_field_offset(uint32_t icol) const { return column_offsets[icol]; }
+
+ uint32_t num_cols() const { return static_cast<uint32_t>(column_metadatas.size()); }
+
+ uint32_t num_varbinary_cols() const;
+
+ void FromColumnMetadataVector(const std::vector<KeyColumnMetadata>& cols,
+ int in_row_alignment, int in_string_alignment);
+
+ bool is_compatible(const KeyRowMetadata& other) const;
+ };
+
+ class KeyRowArray {
+ public:
+ KeyRowArray();
+ Status Init(MemoryPool* pool, const KeyRowMetadata& metadata);
+ void Clean();
+ Status AppendEmpty(uint32_t num_rows_to_append, uint32_t num_extra_bytes_to_append);
+ Status AppendSelectionFrom(const KeyRowArray& from, uint32_t num_rows_to_append,
+ const uint16_t* source_row_ids);
+ const KeyRowMetadata& metadata() const { return metadata_; }
+ int64_t length() const { return num_rows_; }
+ const uint8_t* data(int i) const {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return buffers_[i];
+ }
+ uint8_t* mutable_data(int i) {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return mutable_buffers_[i];
+ }
+ const uint32_t* offsets() const { return reinterpret_cast<const uint32_t*>(data(1)); }
+ uint32_t* mutable_offsets() { return reinterpret_cast<uint32_t*>(mutable_data(1)); }
+ const uint8_t* null_masks() const { return null_masks_->data(); }
+ uint8_t* null_masks() { return null_masks_->mutable_data(); }
+
+ bool has_any_nulls(const KeyEncoderContext* ctx) const;
+
+ private:
+ Status ResizeFixedLengthBuffers(int64_t num_extra_rows);
+ Status ResizeOptionalVaryingLengthBuffer(int64_t num_extra_bytes);
+
+ int64_t size_null_masks(int64_t num_rows);
+ int64_t size_offsets(int64_t num_rows);
+ int64_t size_rows_fixed_length(int64_t num_rows);
+ int64_t size_rows_varying_length(int64_t num_bytes);
+ void update_buffer_pointers();
+
+ static constexpr int64_t padding_for_vectors = 64;
+ MemoryPool* pool_;
+ KeyRowMetadata metadata_;
+ /// Buffers can only expand during lifetime and never shrink.
+ std::unique_ptr<ResizableBuffer> null_masks_;
+ std::unique_ptr<ResizableBuffer> offsets_;
+ std::unique_ptr<ResizableBuffer> rows_;
+ static constexpr int max_buffers_ = 3;
+ const uint8_t* buffers_[max_buffers_];
+ uint8_t* mutable_buffers_[max_buffers_];
+ int64_t num_rows_;
+ int64_t rows_capacity_;
+ int64_t bytes_capacity_;
+
+ // Mutable to allow lazy evaluation
+ mutable int64_t num_rows_for_has_any_nulls_;
+ mutable bool has_any_nulls_;
+ };
+
+  /// A lightweight description of an array representing one of the key columns.
+ class KeyColumnArray {
+ public:
+ KeyColumnArray() = default;
+ /// Create as a mix of buffers according to the mask from two descriptions
+ /// (Nth bit is set to 0 if Nth buffer from the first input
+ /// should be used and is set to 1 otherwise).
+ /// Metadata is inherited from the first input.
+ KeyColumnArray(const KeyColumnMetadata& metadata, const KeyColumnArray& left,
+ const KeyColumnArray& right, int buffer_id_to_replace);
+ /// Create for reading
+ KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length,
+ const uint8_t* buffer0, const uint8_t* buffer1, const uint8_t* buffer2,
+ int bit_offset0 = 0, int bit_offset1 = 0);
+ /// Create for writing
+ KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length, uint8_t* buffer0,
+ uint8_t* buffer1, uint8_t* buffer2, int bit_offset0 = 0,
+ int bit_offset1 = 0);
+    /// Create as a window view of the original array that is offset
+    /// by a given number of rows.
+    /// The row offset must be divisible by 8 so that bit vectors
+    /// are not split within a single byte.
+ KeyColumnArray(const KeyColumnArray& from, int64_t start, int64_t length);
+ uint8_t* mutable_data(int i) {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return mutable_buffers_[i];
+ }
+ const uint8_t* data(int i) const {
+ ARROW_DCHECK(i >= 0 && i <= max_buffers_);
+ return buffers_[i];
+ }
+ uint32_t* mutable_offsets() { return reinterpret_cast<uint32_t*>(mutable_data(1)); }
+ const uint32_t* offsets() const { return reinterpret_cast<const uint32_t*>(data(1)); }
+ const KeyColumnMetadata& metadata() const { return metadata_; }
+ int64_t length() const { return length_; }
+ int bit_offset(int i) const {
+ ARROW_DCHECK(i >= 0 && i < max_buffers_);
+ return bit_offset_[i];
+ }
+
+ private:
+ static constexpr int max_buffers_ = 3;
+ const uint8_t* buffers_[max_buffers_];
+ uint8_t* mutable_buffers_[max_buffers_];
+ KeyColumnMetadata metadata_;
+ int64_t length_;
+ // Starting bit offset within the first byte (between 0 and 7)
+ // to be used when accessing buffers that store bit vectors.
+ int bit_offset_[max_buffers_ - 1];
+ };
+
+ void Init(const std::vector<KeyColumnMetadata>& cols, KeyEncoderContext* ctx,
+ int row_alignment, int string_alignment);
+
+ const KeyRowMetadata& row_metadata() { return row_metadata_; }
+
+  /// Find out the required sizes of all output buffers for encoding
+  /// (including varying-length buffers).
+  /// Use that information to resize the provided row array so that it can fit
+  /// the encoded data.
+ Status PrepareOutputForEncode(int64_t start_input_row, int64_t num_input_rows,
+ KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& all_cols);
+
+  /// Encode a window of column-oriented data into the entire output
+  /// row-oriented storage.
+ /// The output buffers for encoding need to be correctly sized before
+ /// starting encoding.
+ void Encode(int64_t start_input_row, int64_t num_input_rows, KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& cols);
+
+  /// Decode a window of row-oriented data into a corresponding
+  /// window of column-oriented storage.
+  /// The output buffers need to be correctly allocated and sized before
+  /// calling each method.
+  /// For that reason decoding is split into two functions.
+  /// The output of the first one, which processes everything except
+  /// varying-length buffers, can be used to find out the required
+  /// varying-length buffer sizes.
+ void DecodeFixedLengthBuffers(int64_t start_row_input, int64_t start_row_output,
+ int64_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols);
+
+ void DecodeVaryingLengthBuffers(int64_t start_row_input, int64_t start_row_output,
+ int64_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols);
+
+ private:
+ /// Prepare column array vectors.
+ /// Output column arrays represent a range of input column arrays
+ /// specified by starting row and number of rows.
+ /// Three vectors are generated:
+ /// - all columns
+ /// - fixed-length columns only
+ /// - varying-length columns only
+ void PrepareKeyColumnArrays(int64_t start_row, int64_t num_rows,
+ const std::vector<KeyColumnArray>& cols_in);
+
+ class TransformBoolean {
+ public:
+ static KeyColumnArray ArrayReplace(const KeyColumnArray& column,
+ const KeyColumnArray& temp);
+ static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output,
+ KeyEncoderContext* ctx);
+ static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output,
+ KeyEncoderContext* ctx);
+ };
+
+ class EncoderInteger {
+ public:
+ static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp);
+ static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp);
+ static bool UsesTransform(const KeyColumnArray& column);
+ static KeyColumnArray ArrayReplace(const KeyColumnArray& column,
+ const KeyColumnArray& temp);
+ static void PreEncode(const KeyColumnArray& input, KeyColumnArray* output,
+ KeyEncoderContext* ctx);
+ static void PostDecode(const KeyColumnArray& input, KeyColumnArray* output,
+ KeyEncoderContext* ctx);
+
+ private:
+ static bool IsBoolean(const KeyColumnMetadata& metadata);
+ };
+
+ class EncoderBinary {
+ public:
+ static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp);
+ static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx, KeyColumnArray* temp);
+ static bool IsInteger(const KeyColumnMetadata& metadata);
+
+ private:
+ template <bool is_row_fixed_length, bool is_encoding, class COPY_FN>
+ static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row,
+ const KeyRowArray* rows_const,
+ KeyRowArray* rows_mutable_maybe_null,
+ const KeyColumnArray* col_const,
+ KeyColumnArray* col_mutable_maybe_null,
+ COPY_FN copy_fn);
+ template <bool is_row_fixed_length>
+ static void EncodeImp(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ template <bool is_row_fixed_length>
+ static void DecodeImp(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row, const KeyRowArray& rows,
+ KeyColumnArray* col);
+#if defined(ARROW_HAVE_AVX2)
+ static void EncodeHelper_avx2(bool is_row_fixed_length, uint32_t offset_within_row,
+ KeyRowArray* rows, const KeyColumnArray& col);
+ static void DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row,
+ uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col);
+ template <bool is_row_fixed_length>
+ static void EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ template <bool is_row_fixed_length>
+ static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row, const KeyRowArray& rows,
+ KeyColumnArray* col);
+#endif
+ static void ColumnMemsetNulls(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp_vector_16bit, uint8_t byte_value);
+ template <bool is_row_fixed_length, uint32_t col_width>
+ static void ColumnMemsetNullsImp(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx,
+ KeyColumnArray* temp_vector_16bit,
+ uint8_t byte_value);
+ };
+
+ class EncoderBinaryPair {
+ public:
+ static bool CanProcessPair(const KeyColumnMetadata& col1,
+ const KeyColumnMetadata& col2) {
+ return EncoderBinary::IsInteger(col1) && EncoderBinary::IsInteger(col2);
+ }
+ static void Encode(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col1, const KeyColumnArray& col2,
+ KeyEncoderContext* ctx, KeyColumnArray* temp1,
+ KeyColumnArray* temp2);
+ static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col1,
+ KeyColumnArray* col2, KeyEncoderContext* ctx,
+ KeyColumnArray* temp1, KeyColumnArray* temp2);
+
+ private:
+ template <bool is_row_fixed_length, typename col1_type, typename col2_type>
+ static void EncodeImp(uint32_t num_rows_to_skip, uint32_t offset_within_row,
+ KeyRowArray* rows, const KeyColumnArray& col1,
+ const KeyColumnArray& col2);
+ template <bool is_row_fixed_length, typename col1_type, typename col2_type>
+ static void DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row,
+ uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray& rows, KeyColumnArray* col1,
+ KeyColumnArray* col2);
+#if defined(ARROW_HAVE_AVX2)
+ static uint32_t EncodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width,
+ uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col1,
+ const KeyColumnArray& col2);
+ static uint32_t DecodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width,
+ uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row, const KeyRowArray& rows,
+ KeyColumnArray* col1, KeyColumnArray* col2);
+ template <bool is_row_fixed_length, uint32_t col_width>
+ static uint32_t EncodeImp_avx2(uint32_t offset_within_row, KeyRowArray* rows,
+ const KeyColumnArray& col1,
+ const KeyColumnArray& col2);
+ template <bool is_row_fixed_length, uint32_t col_width>
+ static uint32_t DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
+ uint32_t offset_within_row, const KeyRowArray& rows,
+ KeyColumnArray* col1, KeyColumnArray* col2);
+#endif
+ };
+
+ class EncoderOffsets {
+ public:
+    // To avoid repeating work, encoding computes both of these
+    // in a single pass:
+    // a) row offsets for varying-length rows,
+    // b) within each new row, the cumulative length array
+    //    of varying-length values within that row.
+ static void Encode(KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols,
+ KeyEncoderContext* ctx);
+ static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* varbinary_cols,
+ const std::vector<uint32_t>& varbinary_cols_base_offset,
+ KeyEncoderContext* ctx);
+
+ private:
+ static void EncodeImp(uint32_t num_rows_already_processed, KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols);
+#if defined(ARROW_HAVE_AVX2)
+ static uint32_t EncodeImp_avx2(KeyRowArray* rows,
+ const std::vector<KeyColumnArray>& varbinary_cols,
+ KeyColumnArray* temp_buffer_32B_per_col);
+#endif
+ };
+
+ class EncoderVarBinary {
+ public:
+ static void Encode(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col, KeyEncoderContext* ctx);
+ static void Decode(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id,
+ const KeyRowArray& rows, KeyColumnArray* col,
+ KeyEncoderContext* ctx);
+
+ private:
+ template <bool first_varbinary_col, bool is_encoding, class COPY_FN>
+ static inline void EncodeDecodeHelper(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id,
+ const KeyRowArray* rows_const,
+ KeyRowArray* rows_mutable_maybe_null,
+ const KeyColumnArray* col_const,
+ KeyColumnArray* col_mutable_maybe_null,
+ COPY_FN copy_fn);
+ template <bool first_varbinary_col>
+ static void EncodeImp(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ template <bool first_varbinary_col>
+ static void DecodeImp(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id, const KeyRowArray& rows,
+ KeyColumnArray* col);
+#if defined(ARROW_HAVE_AVX2)
+ static void EncodeHelper_avx2(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ static void DecodeHelper_avx2(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id, const KeyRowArray& rows,
+ KeyColumnArray* col);
+ template <bool first_varbinary_col>
+ static void EncodeImp_avx2(uint32_t varbinary_col_id, KeyRowArray* rows,
+ const KeyColumnArray& col);
+ template <bool first_varbinary_col>
+ static void DecodeImp_avx2(uint32_t start_row, uint32_t num_rows,
+ uint32_t varbinary_col_id, const KeyRowArray& rows,
+ KeyColumnArray* col);
+#endif
+ };
+
+ class EncoderNulls {
+ public:
+ static void Encode(KeyRowArray* rows, const std::vector<KeyColumnArray>& cols,
+ KeyEncoderContext* ctx, KeyColumnArray* temp_vector_16bit);
+ static void Decode(uint32_t start_row, uint32_t num_rows, const KeyRowArray& rows,
+ std::vector<KeyColumnArray>* cols);
+ };
+
+ KeyEncoderContext* ctx_;
+
+ // Data initialized once, based on data types of key columns
+ KeyRowMetadata row_metadata_;
+
+ // Data initialized for each input batch.
+ // All elements are ordered according to the order of encoded fields in a row.
+ std::vector<KeyColumnArray> batch_all_cols_;
+ std::vector<KeyColumnArray> batch_varbinary_cols_;
+ std::vector<uint32_t> batch_varbinary_cols_base_offsets_;
+};
+
+template <bool is_row_fixed_length, bool is_encoding, class COPY_FN>
+inline void KeyEncoder::EncoderBinary::EncodeDecodeHelper(
+ uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row,
+ const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null,
+ const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null,
+ COPY_FN copy_fn) {
+ ARROW_DCHECK(col_const && col_const->metadata().is_fixed_length);
+ uint32_t col_width = col_const->metadata().fixed_length;
+
+ if (is_row_fixed_length) {
+ uint32_t row_width = rows_const->metadata().fixed_length;
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ const uint8_t* src;
+ uint8_t* dst;
+ if (is_encoding) {
+ src = col_const->data(1) + col_width * i;
+ dst = rows_mutable_maybe_null->mutable_data(1) + row_width * (start_row + i) +
+ offset_within_row;
+ } else {
+ src = rows_const->data(1) + row_width * (start_row + i) + offset_within_row;
+ dst = col_mutable_maybe_null->mutable_data(1) + col_width * i;
+ }
+ copy_fn(dst, src, col_width);
+ }
+ } else {
+ const uint32_t* row_offsets = rows_const->offsets();
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ const uint8_t* src;
+ uint8_t* dst;
+ if (is_encoding) {
+ src = col_const->data(1) + col_width * i;
+ dst = rows_mutable_maybe_null->mutable_data(2) + row_offsets[start_row + i] +
+ offset_within_row;
+ } else {
+ src = rows_const->data(2) + row_offsets[start_row + i] + offset_within_row;
+ dst = col_mutable_maybe_null->mutable_data(1) + col_width * i;
+ }
+ copy_fn(dst, src, col_width);
+ }
+ }
+}
+
+template <bool first_varbinary_col, bool is_encoding, class COPY_FN>
+inline void KeyEncoder::EncoderVarBinary::EncodeDecodeHelper(
+ uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id,
+ const KeyRowArray* rows_const, KeyRowArray* rows_mutable_maybe_null,
+ const KeyColumnArray* col_const, KeyColumnArray* col_mutable_maybe_null,
+ COPY_FN copy_fn) {
+  // Columns and rows need to be varying-length.
+ ARROW_DCHECK(!rows_const->metadata().is_fixed_length &&
+ !col_const->metadata().is_fixed_length);
+
+ const uint32_t* row_offsets_for_batch = rows_const->offsets() + start_row;
+ const uint32_t* col_offsets = col_const->offsets();
+
+ uint32_t col_offset_next = col_offsets[0];
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ uint32_t col_offset = col_offset_next;
+ col_offset_next = col_offsets[i + 1];
+
+ uint32_t row_offset = row_offsets_for_batch[i];
+ const uint8_t* row = rows_const->data(2) + row_offset;
+
+ uint32_t offset_within_row;
+ uint32_t length;
+ if (first_varbinary_col) {
+ rows_const->metadata().first_varbinary_offset_and_length(row, &offset_within_row,
+ &length);
+ } else {
+ rows_const->metadata().nth_varbinary_offset_and_length(row, varbinary_col_id,
+ &offset_within_row, &length);
+ }
+
+ row_offset += offset_within_row;
+
+ const uint8_t* src;
+ uint8_t* dst;
+ if (is_encoding) {
+ src = col_const->data(2) + col_offset;
+ dst = rows_mutable_maybe_null->mutable_data(2) + row_offset;
+ } else {
+ src = rows_const->data(2) + row_offset;
+ dst = col_mutable_maybe_null->mutable_data(2) + col_offset;
+ }
+ copy_fn(dst, src, length);
+ }
+}
+
+} // namespace compute
+} // namespace arrow
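
The padding_for_alignment helper declared above relies on a two's-complement
identity: for a power-of-two alignment, (-offset) & (alignment - 1) is exactly
the number of bytes from offset up to the next aligned boundary (zero when the
offset is already aligned). A minimal standalone sketch of that formula follows;
it is independent of the Arrow headers, and the main function with its sample
values is illustrative only:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Same formula as KeyEncoder::KeyRowMetadata::padding_for_alignment:
// negating the offset in two's complement and masking with (alignment - 1)
// yields the distance to the next aligned boundary.
static uint32_t padding_for_alignment(uint32_t offset, int required_alignment) {
  return static_cast<uint32_t>((-static_cast<int32_t>(offset)) &
                               (required_alignment - 1));
}

int main() {
  assert(padding_for_alignment(0, 8) == 0);   // already aligned
  assert(padding_for_alignment(1, 8) == 7);   // 1 rounds up to 8
  assert(padding_for_alignment(13, 4) == 3);  // 13 rounds up to 16
  std::printf("padding checks passed\n");
  return 0;
}
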
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc
index db69ac37d1d..081411e708e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.cc
@@ -1,238 +1,238 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/key_hash.h"
-
-#include <memory.h>
-
-#include <algorithm>
-#include <cstdint>
-
-#include "arrow/compute/exec/util.h"
-
-namespace arrow {
-namespace compute {
-
-inline uint32_t Hashing::avalanche_helper(uint32_t acc) {
- acc ^= (acc >> 15);
- acc *= PRIME32_2;
- acc ^= (acc >> 13);
- acc *= PRIME32_3;
- acc ^= (acc >> 16);
- return acc;
-}
-
-void Hashing::avalanche(int64_t hardware_flags, uint32_t num_keys, uint32_t* hashes) {
- uint32_t processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- int tail = num_keys % 8;
- avalanche_avx2(num_keys - tail, hashes);
- processed = num_keys - tail;
- }
-#endif
- for (uint32_t i = processed; i < num_keys; ++i) {
- hashes[i] = avalanche_helper(hashes[i]);
- }
-}
-
-inline uint32_t Hashing::combine_accumulators(const uint32_t acc1, const uint32_t acc2,
- const uint32_t acc3, const uint32_t acc4) {
- return ROTL(acc1, 1) + ROTL(acc2, 7) + ROTL(acc3, 12) + ROTL(acc4, 18);
-}
-
-inline void Hashing::helper_8B(uint32_t key_length, uint32_t num_keys,
- const uint8_t* keys, uint32_t* hashes) {
- ARROW_DCHECK(key_length <= 8);
- uint64_t mask = ~0ULL >> (8 * (8 - key_length));
- constexpr uint64_t multiplier = 14029467366897019727ULL;
- uint32_t offset = 0;
- for (uint32_t ikey = 0; ikey < num_keys; ++ikey) {
- uint64_t x = *reinterpret_cast<const uint64_t*>(keys + offset);
- x &= mask;
- hashes[ikey] = static_cast<uint32_t>(BYTESWAP(x * multiplier));
- offset += key_length;
- }
-}
-
-inline void Hashing::helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys,
- uint32_t& acc1, uint32_t& acc2, uint32_t& acc3,
- uint32_t& acc4) {
- uint64_t v1 = reinterpret_cast<const uint64_t*>(keys + offset)[0];
- // We do not need to mask v1, because we will not process a stripe
- // unless at least 9 bytes of it are part of the key.
- uint64_t v2 = reinterpret_cast<const uint64_t*>(keys + offset)[1];
- v2 &= mask_hi;
- uint32_t x1 = static_cast<uint32_t>(v1);
- uint32_t x2 = static_cast<uint32_t>(v1 >> 32);
- uint32_t x3 = static_cast<uint32_t>(v2);
- uint32_t x4 = static_cast<uint32_t>(v2 >> 32);
- acc1 += x1 * PRIME32_2;
- acc1 = ROTL(acc1, 13) * PRIME32_1;
- acc2 += x2 * PRIME32_2;
- acc2 = ROTL(acc2, 13) * PRIME32_1;
- acc3 += x3 * PRIME32_2;
- acc3 = ROTL(acc3, 13) * PRIME32_1;
- acc4 += x4 * PRIME32_2;
- acc4 = ROTL(acc4, 13) * PRIME32_1;
-}
-
-void Hashing::helper_stripes(int64_t hardware_flags, uint32_t num_keys,
- uint32_t key_length, const uint8_t* keys, uint32_t* hash) {
- uint32_t processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- int tail = num_keys % 2;
- helper_stripes_avx2(num_keys - tail, key_length, keys, hash);
- processed = num_keys - tail;
- }
-#endif
-
-  // If the length modulo the stripe length is less than or equal to 8, round down to
-  // the nearest 16B boundary (the 8B ending is processed separately), otherwise round up.
- const uint32_t num_stripes = (key_length + 7) / 16;
- uint64_t mask_hi =
- ~0ULL >>
- (8 * ((num_stripes * 16 > key_length) ? num_stripes * 16 - key_length : 0));
-
- for (uint32_t i = processed; i < num_keys; ++i) {
- uint32_t acc1, acc2, acc3, acc4;
- acc1 = static_cast<uint32_t>(
- (static_cast<uint64_t>(PRIME32_1) + static_cast<uint64_t>(PRIME32_2)) &
- 0xffffffff);
- acc2 = PRIME32_2;
- acc3 = 0;
- acc4 = static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1));
- uint32_t offset = i * key_length;
- for (uint32_t stripe = 0; stripe < num_stripes - 1; ++stripe) {
- helper_stripe(offset, ~0ULL, keys, acc1, acc2, acc3, acc4);
- offset += 16;
- }
- helper_stripe(offset, mask_hi, keys, acc1, acc2, acc3, acc4);
- hash[i] = combine_accumulators(acc1, acc2, acc3, acc4);
- }
-}
-
-inline uint32_t Hashing::helper_tail(uint32_t offset, uint64_t mask, const uint8_t* keys,
- uint32_t acc) {
- uint64_t v = reinterpret_cast<const uint64_t*>(keys + offset)[0];
- v &= mask;
- uint32_t x1 = static_cast<uint32_t>(v);
- uint32_t x2 = static_cast<uint32_t>(v >> 32);
- acc += x1 * PRIME32_3;
- acc = ROTL(acc, 17) * PRIME32_4;
- acc += x2 * PRIME32_3;
- acc = ROTL(acc, 17) * PRIME32_4;
- return acc;
-}
-
-void Hashing::helper_tails(int64_t hardware_flags, uint32_t num_keys, uint32_t key_length,
- const uint8_t* keys, uint32_t* hash) {
- uint32_t processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- int tail = num_keys % 8;
- helper_tails_avx2(num_keys - tail, key_length, keys, hash);
- processed = num_keys - tail;
- }
-#endif
- uint64_t mask = ~0ULL >> (8 * (((key_length % 8) == 0) ? 0 : 8 - (key_length % 8)));
- uint32_t offset = key_length / 16 * 16;
- offset += processed * key_length;
- for (uint32_t i = processed; i < num_keys; ++i) {
- hash[i] = helper_tail(offset, mask, keys, hash[i]);
- offset += key_length;
- }
-}
-
-void Hashing::hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t length_key,
- const uint8_t* keys, uint32_t* hashes) {
- ARROW_DCHECK(length_key > 0);
-
- if (length_key <= 8) {
- helper_8B(length_key, num_keys, keys, hashes);
- return;
- }
- helper_stripes(hardware_flags, num_keys, length_key, keys, hashes);
- if ((length_key % 16) > 0 && (length_key % 16) <= 8) {
- helper_tails(hardware_flags, num_keys, length_key, keys, hashes);
- }
- avalanche(hardware_flags, num_keys, hashes);
-}
-
-void Hashing::hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc) {
- for (uint32_t i = 0; i < length / 16; ++i) {
- for (int j = 0; j < 4; ++j) {
- uint32_t lane = reinterpret_cast<const uint32_t*>(key)[i * 4 + j];
- acc[j] += (lane * PRIME32_2);
- acc[j] = ROTL(acc[j], 13);
- acc[j] *= PRIME32_1;
- }
- }
-
- int tail = length % 16;
- if (tail) {
- uint64_t last_stripe[2];
- const uint64_t* last_stripe_base =
- reinterpret_cast<const uint64_t*>(key + length - (length % 16));
- last_stripe[0] = last_stripe_base[0];
- uint64_t mask = ~0ULL >> (8 * ((length + 7) / 8 * 8 - length));
- if (tail <= 8) {
- last_stripe[1] = 0;
- last_stripe[0] &= mask;
- } else {
- last_stripe[1] = last_stripe_base[1];
- last_stripe[1] &= mask;
- }
- for (int j = 0; j < 4; ++j) {
- uint32_t lane = reinterpret_cast<const uint32_t*>(last_stripe)[j];
- acc[j] += (lane * PRIME32_2);
- acc[j] = ROTL(acc[j], 13);
- acc[j] *= PRIME32_1;
- }
- }
-}
-
-void Hashing::hash_varlen(int64_t hardware_flags, uint32_t num_rows,
- const uint32_t* offsets, const uint8_t* concatenated_keys,
- uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
- uint32_t* hashes) {
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- hash_varlen_avx2(num_rows, offsets, concatenated_keys, temp_buffer, hashes);
- } else {
-#endif
- for (uint32_t i = 0; i < num_rows; ++i) {
- uint32_t acc[4];
- acc[0] = static_cast<uint32_t>(
- (static_cast<uint64_t>(PRIME32_1) + static_cast<uint64_t>(PRIME32_2)) &
- 0xffffffff);
- acc[1] = PRIME32_2;
- acc[2] = 0;
- acc[3] = static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1));
- uint32_t length = offsets[i + 1] - offsets[i];
- hash_varlen_helper(length, concatenated_keys + offsets[i], acc);
- hashes[i] = combine_accumulators(acc[0], acc[1], acc[2], acc[3]);
- }
- avalanche(hardware_flags, num_rows, hashes);
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/key_hash.h"
+
+#include <memory.h>
+
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/compute/exec/util.h"
+
+namespace arrow {
+namespace compute {
+
+inline uint32_t Hashing::avalanche_helper(uint32_t acc) {
+ acc ^= (acc >> 15);
+ acc *= PRIME32_2;
+ acc ^= (acc >> 13);
+ acc *= PRIME32_3;
+ acc ^= (acc >> 16);
+ return acc;
+}
+
+void Hashing::avalanche(int64_t hardware_flags, uint32_t num_keys, uint32_t* hashes) {
+ uint32_t processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ int tail = num_keys % 8;
+ avalanche_avx2(num_keys - tail, hashes);
+ processed = num_keys - tail;
+ }
+#endif
+ for (uint32_t i = processed; i < num_keys; ++i) {
+ hashes[i] = avalanche_helper(hashes[i]);
+ }
+}
+
+inline uint32_t Hashing::combine_accumulators(const uint32_t acc1, const uint32_t acc2,
+ const uint32_t acc3, const uint32_t acc4) {
+ return ROTL(acc1, 1) + ROTL(acc2, 7) + ROTL(acc3, 12) + ROTL(acc4, 18);
+}
+
+inline void Hashing::helper_8B(uint32_t key_length, uint32_t num_keys,
+ const uint8_t* keys, uint32_t* hashes) {
+ ARROW_DCHECK(key_length <= 8);
+ uint64_t mask = ~0ULL >> (8 * (8 - key_length));
+ constexpr uint64_t multiplier = 14029467366897019727ULL;
+ uint32_t offset = 0;
+ for (uint32_t ikey = 0; ikey < num_keys; ++ikey) {
+ uint64_t x = *reinterpret_cast<const uint64_t*>(keys + offset);
+ x &= mask;
+ hashes[ikey] = static_cast<uint32_t>(BYTESWAP(x * multiplier));
+ offset += key_length;
+ }
+}
+
+inline void Hashing::helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys,
+ uint32_t& acc1, uint32_t& acc2, uint32_t& acc3,
+ uint32_t& acc4) {
+ uint64_t v1 = reinterpret_cast<const uint64_t*>(keys + offset)[0];
+ // We do not need to mask v1, because we will not process a stripe
+ // unless at least 9 bytes of it are part of the key.
+ uint64_t v2 = reinterpret_cast<const uint64_t*>(keys + offset)[1];
+ v2 &= mask_hi;
+ uint32_t x1 = static_cast<uint32_t>(v1);
+ uint32_t x2 = static_cast<uint32_t>(v1 >> 32);
+ uint32_t x3 = static_cast<uint32_t>(v2);
+ uint32_t x4 = static_cast<uint32_t>(v2 >> 32);
+ acc1 += x1 * PRIME32_2;
+ acc1 = ROTL(acc1, 13) * PRIME32_1;
+ acc2 += x2 * PRIME32_2;
+ acc2 = ROTL(acc2, 13) * PRIME32_1;
+ acc3 += x3 * PRIME32_2;
+ acc3 = ROTL(acc3, 13) * PRIME32_1;
+ acc4 += x4 * PRIME32_2;
+ acc4 = ROTL(acc4, 13) * PRIME32_1;
+}
+
+void Hashing::helper_stripes(int64_t hardware_flags, uint32_t num_keys,
+ uint32_t key_length, const uint8_t* keys, uint32_t* hash) {
+ uint32_t processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ int tail = num_keys % 2;
+ helper_stripes_avx2(num_keys - tail, key_length, keys, hash);
+ processed = num_keys - tail;
+ }
+#endif
+
+  // If the length modulo the stripe length is less than or equal to 8, round down to
+  // the nearest 16B boundary (the 8B ending is processed separately), otherwise round up.
+ const uint32_t num_stripes = (key_length + 7) / 16;
+ uint64_t mask_hi =
+ ~0ULL >>
+ (8 * ((num_stripes * 16 > key_length) ? num_stripes * 16 - key_length : 0));
+
+ for (uint32_t i = processed; i < num_keys; ++i) {
+ uint32_t acc1, acc2, acc3, acc4;
+ acc1 = static_cast<uint32_t>(
+ (static_cast<uint64_t>(PRIME32_1) + static_cast<uint64_t>(PRIME32_2)) &
+ 0xffffffff);
+ acc2 = PRIME32_2;
+ acc3 = 0;
+ acc4 = static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1));
+ uint32_t offset = i * key_length;
+ for (uint32_t stripe = 0; stripe < num_stripes - 1; ++stripe) {
+ helper_stripe(offset, ~0ULL, keys, acc1, acc2, acc3, acc4);
+ offset += 16;
+ }
+ helper_stripe(offset, mask_hi, keys, acc1, acc2, acc3, acc4);
+ hash[i] = combine_accumulators(acc1, acc2, acc3, acc4);
+ }
+}
+
+inline uint32_t Hashing::helper_tail(uint32_t offset, uint64_t mask, const uint8_t* keys,
+ uint32_t acc) {
+ uint64_t v = reinterpret_cast<const uint64_t*>(keys + offset)[0];
+ v &= mask;
+ uint32_t x1 = static_cast<uint32_t>(v);
+ uint32_t x2 = static_cast<uint32_t>(v >> 32);
+ acc += x1 * PRIME32_3;
+ acc = ROTL(acc, 17) * PRIME32_4;
+ acc += x2 * PRIME32_3;
+ acc = ROTL(acc, 17) * PRIME32_4;
+ return acc;
+}
+
+void Hashing::helper_tails(int64_t hardware_flags, uint32_t num_keys, uint32_t key_length,
+ const uint8_t* keys, uint32_t* hash) {
+ uint32_t processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ int tail = num_keys % 8;
+ helper_tails_avx2(num_keys - tail, key_length, keys, hash);
+ processed = num_keys - tail;
+ }
+#endif
+ uint64_t mask = ~0ULL >> (8 * (((key_length % 8) == 0) ? 0 : 8 - (key_length % 8)));
+ uint32_t offset = key_length / 16 * 16;
+ offset += processed * key_length;
+ for (uint32_t i = processed; i < num_keys; ++i) {
+ hash[i] = helper_tail(offset, mask, keys, hash[i]);
+ offset += key_length;
+ }
+}
+
+void Hashing::hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t length_key,
+ const uint8_t* keys, uint32_t* hashes) {
+ ARROW_DCHECK(length_key > 0);
+
+ if (length_key <= 8) {
+ helper_8B(length_key, num_keys, keys, hashes);
+ return;
+ }
+ helper_stripes(hardware_flags, num_keys, length_key, keys, hashes);
+ if ((length_key % 16) > 0 && (length_key % 16) <= 8) {
+ helper_tails(hardware_flags, num_keys, length_key, keys, hashes);
+ }
+ avalanche(hardware_flags, num_keys, hashes);
+}
+
+void Hashing::hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc) {
+ for (uint32_t i = 0; i < length / 16; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ uint32_t lane = reinterpret_cast<const uint32_t*>(key)[i * 4 + j];
+ acc[j] += (lane * PRIME32_2);
+ acc[j] = ROTL(acc[j], 13);
+ acc[j] *= PRIME32_1;
+ }
+ }
+
+ int tail = length % 16;
+ if (tail) {
+ uint64_t last_stripe[2];
+ const uint64_t* last_stripe_base =
+ reinterpret_cast<const uint64_t*>(key + length - (length % 16));
+ last_stripe[0] = last_stripe_base[0];
+ uint64_t mask = ~0ULL >> (8 * ((length + 7) / 8 * 8 - length));
+ if (tail <= 8) {
+ last_stripe[1] = 0;
+ last_stripe[0] &= mask;
+ } else {
+ last_stripe[1] = last_stripe_base[1];
+ last_stripe[1] &= mask;
+ }
+ for (int j = 0; j < 4; ++j) {
+ uint32_t lane = reinterpret_cast<const uint32_t*>(last_stripe)[j];
+ acc[j] += (lane * PRIME32_2);
+ acc[j] = ROTL(acc[j], 13);
+ acc[j] *= PRIME32_1;
+ }
+ }
+}
+
+void Hashing::hash_varlen(int64_t hardware_flags, uint32_t num_rows,
+ const uint32_t* offsets, const uint8_t* concatenated_keys,
+ uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
+ uint32_t* hashes) {
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ hash_varlen_avx2(num_rows, offsets, concatenated_keys, temp_buffer, hashes);
+ } else {
+#endif
+ for (uint32_t i = 0; i < num_rows; ++i) {
+ uint32_t acc[4];
+ acc[0] = static_cast<uint32_t>(
+ (static_cast<uint64_t>(PRIME32_1) + static_cast<uint64_t>(PRIME32_2)) &
+ 0xffffffff);
+ acc[1] = PRIME32_2;
+ acc[2] = 0;
+ acc[3] = static_cast<uint32_t>(-static_cast<int32_t>(PRIME32_1));
+ uint32_t length = offsets[i + 1] - offsets[i];
+ hash_varlen_helper(length, concatenated_keys + offsets[i], acc);
+ hashes[i] = combine_accumulators(acc[0], acc[1], acc[2], acc[3]);
+ }
+ avalanche(hardware_flags, num_rows, hashes);
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+}
+
+} // namespace compute
+} // namespace arrow
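
Hashing::hash_fixed above dispatches to the AVX2 helpers only when the AVX2 bit
is set in hardware_flags; passing 0 keeps execution on the scalar stripe and
tail code shown in this file. A hedged usage sketch follows; the key width,
sample bytes, and main function are illustrative assumptions, not part of the
library:

#include <cstdint>
#include <cstdio>
#include <vector>

#include "arrow/compute/exec/key_hash.h"

int main() {
  constexpr uint32_t kNumKeys = 4;
  constexpr uint32_t kKeyLength = 16;  // > 8 bytes, so the stripe path runs
  std::vector<uint8_t> keys(kNumKeys * kKeyLength);
  for (size_t i = 0; i < keys.size(); ++i) keys[i] = static_cast<uint8_t>(i);

  std::vector<uint32_t> hashes(kNumKeys);
  // hardware_flags = 0 skips the ARROW_HAVE_AVX2 branches.
  arrow::compute::Hashing::hash_fixed(/*hardware_flags=*/0, kNumKeys, kKeyLength,
                                      keys.data(), hashes.data());
  for (uint32_t i = 0; i < kNumKeys; ++i) {
    std::printf("hash[%u] = 0x%08x\n", i, hashes[i]);
  }
  return 0;
}
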
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h
index 4d36c9aa585..7f8ab5185cc 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_hash.h
@@ -1,94 +1,94 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#if defined(ARROW_HAVE_AVX2)
-#include <immintrin.h>
-#endif
-
-#include <cstdint>
-
-#include "arrow/compute/exec/util.h"
-
-namespace arrow {
-namespace compute {
-
-// Implementations are based on the xxh3 32-bit algorithm description from:
-// https://github.com/Cyan4973/xxHash/blob/dev/doc/xxhash_spec.md
-//
-class Hashing {
- public:
- static void hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t length_key,
- const uint8_t* keys, uint32_t* hashes);
-
- static void hash_varlen(int64_t hardware_flags, uint32_t num_rows,
- const uint32_t* offsets, const uint8_t* concatenated_keys,
- uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
- uint32_t* hashes);
-
- private:
- static const uint32_t PRIME32_1 = 0x9E3779B1; // 0b10011110001101110111100110110001
- static const uint32_t PRIME32_2 = 0x85EBCA77; // 0b10000101111010111100101001110111
- static const uint32_t PRIME32_3 = 0xC2B2AE3D; // 0b11000010101100101010111000111101
- static const uint32_t PRIME32_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111
- static const uint32_t PRIME32_5 = 0x165667B1; // 0b00010110010101100110011110110001
-
- // Avalanche
- static inline uint32_t avalanche_helper(uint32_t acc);
-#if defined(ARROW_HAVE_AVX2)
- static void avalanche_avx2(uint32_t num_keys, uint32_t* hashes);
-#endif
- static void avalanche(int64_t hardware_flags, uint32_t num_keys, uint32_t* hashes);
-
- // Accumulator combine
- static inline uint32_t combine_accumulators(const uint32_t acc1, const uint32_t acc2,
- const uint32_t acc3, const uint32_t acc4);
-#if defined(ARROW_HAVE_AVX2)
- static inline uint64_t combine_accumulators_avx2(__m256i acc);
-#endif
-
- // Helpers
- static inline void helper_8B(uint32_t key_length, uint32_t num_keys,
- const uint8_t* keys, uint32_t* hashes);
- static inline void helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys,
- uint32_t& acc1, uint32_t& acc2, uint32_t& acc3,
- uint32_t& acc4);
- static inline uint32_t helper_tail(uint32_t offset, uint64_t mask, const uint8_t* keys,
- uint32_t acc);
-#if defined(ARROW_HAVE_AVX2)
- static void helper_stripes_avx2(uint32_t num_keys, uint32_t key_length,
- const uint8_t* keys, uint32_t* hash);
- static void helper_tails_avx2(uint32_t num_keys, uint32_t key_length,
- const uint8_t* keys, uint32_t* hash);
-#endif
- static void helper_stripes(int64_t hardware_flags, uint32_t num_keys,
- uint32_t key_length, const uint8_t* keys, uint32_t* hash);
- static void helper_tails(int64_t hardware_flags, uint32_t num_keys, uint32_t key_length,
- const uint8_t* keys, uint32_t* hash);
-
- static void hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc);
-#if defined(ARROW_HAVE_AVX2)
- static void hash_varlen_avx2(uint32_t num_rows, const uint32_t* offsets,
- const uint8_t* concatenated_keys,
- uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
- uint32_t* hashes);
-#endif
-};
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(ARROW_HAVE_AVX2)
+#include <immintrin.h>
+#endif
+
+#include <cstdint>
+
+#include "arrow/compute/exec/util.h"
+
+namespace arrow {
+namespace compute {
+
+// Implementations are based on the xxh3 32-bit algorithm description from:
+// https://github.com/Cyan4973/xxHash/blob/dev/doc/xxhash_spec.md
+//
+class Hashing {
+ public:
+ static void hash_fixed(int64_t hardware_flags, uint32_t num_keys, uint32_t length_key,
+ const uint8_t* keys, uint32_t* hashes);
+
+ static void hash_varlen(int64_t hardware_flags, uint32_t num_rows,
+ const uint32_t* offsets, const uint8_t* concatenated_keys,
+ uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
+ uint32_t* hashes);
+
+ private:
+ static const uint32_t PRIME32_1 = 0x9E3779B1; // 0b10011110001101110111100110110001
+ static const uint32_t PRIME32_2 = 0x85EBCA77; // 0b10000101111010111100101001110111
+ static const uint32_t PRIME32_3 = 0xC2B2AE3D; // 0b11000010101100101010111000111101
+ static const uint32_t PRIME32_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111
+ static const uint32_t PRIME32_5 = 0x165667B1; // 0b00010110010101100110011110110001
+
+ // Avalanche
+ static inline uint32_t avalanche_helper(uint32_t acc);
+#if defined(ARROW_HAVE_AVX2)
+ static void avalanche_avx2(uint32_t num_keys, uint32_t* hashes);
+#endif
+ static void avalanche(int64_t hardware_flags, uint32_t num_keys, uint32_t* hashes);
+
+ // Accumulator combine
+ static inline uint32_t combine_accumulators(const uint32_t acc1, const uint32_t acc2,
+ const uint32_t acc3, const uint32_t acc4);
+#if defined(ARROW_HAVE_AVX2)
+ static inline uint64_t combine_accumulators_avx2(__m256i acc);
+#endif
+
+ // Helpers
+ static inline void helper_8B(uint32_t key_length, uint32_t num_keys,
+ const uint8_t* keys, uint32_t* hashes);
+ static inline void helper_stripe(uint32_t offset, uint64_t mask_hi, const uint8_t* keys,
+ uint32_t& acc1, uint32_t& acc2, uint32_t& acc3,
+ uint32_t& acc4);
+ static inline uint32_t helper_tail(uint32_t offset, uint64_t mask, const uint8_t* keys,
+ uint32_t acc);
+#if defined(ARROW_HAVE_AVX2)
+ static void helper_stripes_avx2(uint32_t num_keys, uint32_t key_length,
+ const uint8_t* keys, uint32_t* hash);
+ static void helper_tails_avx2(uint32_t num_keys, uint32_t key_length,
+ const uint8_t* keys, uint32_t* hash);
+#endif
+ static void helper_stripes(int64_t hardware_flags, uint32_t num_keys,
+ uint32_t key_length, const uint8_t* keys, uint32_t* hash);
+ static void helper_tails(int64_t hardware_flags, uint32_t num_keys, uint32_t key_length,
+ const uint8_t* keys, uint32_t* hash);
+
+ static void hash_varlen_helper(uint32_t length, const uint8_t* key, uint32_t* acc);
+#if defined(ARROW_HAVE_AVX2)
+ static void hash_varlen_avx2(uint32_t num_rows, const uint32_t* offsets,
+ const uint8_t* concatenated_keys,
+ uint32_t* temp_buffer, // Needs to hold 4 x 32-bit per row
+ uint32_t* hashes);
+#endif
+};
+
+} // namespace compute
+} // namespace arrow
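
The avalanche step declared above (and defined in key_hash.cc) is an
xxHash-style finalizer: alternating xor-shifts and prime multiplications spread
small input differences across all 32 output bits. A standalone restatement
using the PRIME32_2 and PRIME32_3 constants from this header; the main function
is illustrative only:

#include <cstdint>
#include <cstdio>

// Mirrors Hashing::avalanche_helper from key_hash.cc.
static uint32_t avalanche(uint32_t acc) {
  acc ^= (acc >> 15);
  acc *= 0x85EBCA77u;  // PRIME32_2
  acc ^= (acc >> 13);
  acc *= 0xC2B2AE3Du;  // PRIME32_3
  acc ^= (acc >> 16);
  return acc;
}

int main() {
  // Adjacent inputs yield widely differing outputs after the mixing rounds.
  std::printf("avalanche(1) = 0x%08x\n", avalanche(1));
  std::printf("avalanche(2) = 0x%08x\n", avalanche(2));
  return 0;
}
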
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc
index 5cc4105f45c..ac47c04403c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.cc
@@ -1,610 +1,610 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/key_map.h"
-
-#include <memory.h>
-
-#include <algorithm>
-#include <cstdint>
-
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/ubsan.h"
-
-namespace arrow {
-
-using BitUtil::CountLeadingZeros;
-
-namespace compute {
-
-constexpr uint64_t kHighBitOfEachByte = 0x8080808080808080ULL;
-
-// Search status bytes inside a block of 8 slots (64-bit word).
-// Try to find a slot that contains a 7-bit stamp matching the one provided.
-// There are three possible outcomes:
-// 1. A matching slot is found.
-// -> Return its index between 0 and 7 and set match found flag.
-// 2. A matching slot is not found and there is an empty slot in the block.
-// -> Return the index of the first empty slot and clear match found flag.
-// 3. A matching slot is not found and there are no empty slots in the block.
-// -> Return 8 as the output slot index and clear match found flag.
-//
-// Optionally an index of the first slot to start the search from can be specified.
-// In this case slots before it will be ignored.
-//
-template <bool use_start_slot>
-inline void SwissTable::search_block(uint64_t block, int stamp, int start_slot,
- int* out_slot, int* out_match_found) {
- // Filled slot bytes have the highest bit set to 0 and empty slots are equal to 0x80.
- uint64_t block_high_bits = block & kHighBitOfEachByte;
-
- // Replicate 7-bit stamp to all non-empty slots, leaving zeroes for empty slots.
- uint64_t stamp_pattern = stamp * ((block_high_bits ^ kHighBitOfEachByte) >> 7);
-
-  // If we XOR this pattern with the block status bytes, we get in each byte:
- // a) 0x00, for filled slots matching the stamp,
- // b) 0x00 < x < 0x80, for filled slots not matching the stamp,
- // c) 0x80, for empty slots.
- uint64_t block_xor_pattern = block ^ stamp_pattern;
-
- // If we then add 0x7f to every byte, we get:
- // a) 0x7F
- // b) 0x80 <= x < 0xFF
- // c) 0xFF
- uint64_t match_base = block_xor_pattern + ~kHighBitOfEachByte;
-
- // The highest bit now tells us if we have a match (0) or not (1).
- // We will negate the bits so that match is represented by a set bit.
- uint64_t matches = ~match_base;
-
- // Clear 7 non-relevant bits in each byte.
- // Also clear bytes that correspond to slots that we were supposed to
- // skip due to provided start slot index.
- // Note: the highest byte corresponds to the first slot.
- if (use_start_slot) {
- matches &= kHighBitOfEachByte >> (8 * start_slot);
- } else {
- matches &= kHighBitOfEachByte;
- }
-
- // We get 0 if there are no matches
- *out_match_found = (matches == 0 ? 0 : 1);
-
- // Now if we or with the highest bits of the block and scan zero bits in reverse,
- // we get 8x slot index that we were looking for.
- // This formula works in all three cases a), b) and c).
- *out_slot = static_cast<int>(CountLeadingZeros(matches | block_high_bits) >> 3);
-}
-
-// This call follows the call to search_block.
-// The input slot index is the output returned by it, which is a value from 0 to 8,
-// with 8 indicating that both: no match was found and there were no empty slots.
-//
-// If the slot index refers to a non-empty slot, return the group id associated with it.
-// Otherwise return any group id from any of the slots or
-// zero, which is the default value stored in empty slots.
-//
-inline uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot,
- uint64_t group_id_mask) {
- // Input slot can be equal to 8, in which case we need to output any valid group id
- // value, so we take the one from slot 0 in the block.
- int clamped_slot = slot & 7;
-
- // Group id values for all 8 slots in the block are bit-packed and follow the status
- // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In
- // that case we can extract group id using aligned 64-bit word access.
- int num_groupid_bits = static_cast<int>(ARROW_POPCOUNT64(group_id_mask));
- ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
- num_groupid_bits == 32 || num_groupid_bits == 64);
-
- int bit_offset = clamped_slot * num_groupid_bits;
- const uint64_t* group_id_bytes =
- reinterpret_cast<const uint64_t*>(block_ptr) + 1 + (bit_offset >> 6);
- uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask;
-
- return group_id;
-}
-
-// Return global slot id (the index including the information about the block)
-// where the search should continue if the first comparison fails.
-// This function always follows search_block and receives the slot id returned by it.
-//
-inline uint64_t SwissTable::next_slot_to_visit(uint64_t block_index, int slot,
- int match_found) {
- // The result should be taken modulo the number of all slots in all blocks,
- // but here we allow it to take a value one above the last slot index.
- // Modulo operation is postponed to later.
- return block_index * 8 + slot + match_found;
-}
-
-// Implements first (fast-path, optimistic) lookup.
-// Searches for a match only within the start block,
-// trying only the first slot with a matching stamp.
-//
-// The comparison callback needed for match verification is done outside of this function.
-// The match bit vector filled here only indicates that a matching stamp was found in a slot.
-//
-template <bool use_selection>
-void SwissTable::lookup_1(const uint16_t* selection, const int num_keys,
- const uint32_t* hashes, uint8_t* out_match_bitvector,
- uint32_t* out_groupids, uint32_t* out_slot_ids) {
- // Clear the output bit vector
- memset(out_match_bitvector, 0, (num_keys + 7) / 8);
-
- // Based on the size of the table, prepare bit number constants.
- uint32_t stamp_mask = (1 << bits_stamp_) - 1;
- int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
- uint32_t groupid_mask = (1 << num_groupid_bits) - 1;
-
- for (int i = 0; i < num_keys; ++i) {
- int id;
- if (use_selection) {
- id = util::SafeLoad(&selection[i]);
- } else {
- id = i;
- }
-
- // Extract from hash: block index and stamp
- //
- uint32_t hash = hashes[id];
- uint32_t iblock = hash >> (bits_hash_ - bits_stamp_ - log_blocks_);
- uint32_t stamp = iblock & stamp_mask;
- iblock >>= bits_stamp_;
-
- uint32_t num_block_bytes = num_groupid_bits + 8;
- const uint8_t* blockbase = reinterpret_cast<const uint8_t*>(blocks_) +
- static_cast<uint64_t>(iblock) * num_block_bytes;
- uint64_t block = util::SafeLoadAs<uint64_t>(blockbase);
-
- // Call helper functions to obtain the output triplet:
- // - match (of a stamp) found flag
- // - group id for key comparison
- // - slot to resume search from in case of no match or false positive
- int match_found;
- int islot_in_block;
- search_block<false>(block, stamp, 0, &islot_in_block, &match_found);
- uint64_t groupid = extract_group_id(blockbase, islot_in_block, groupid_mask);
- ARROW_DCHECK(groupid < num_inserted_ || num_inserted_ == 0);
- uint64_t islot = next_slot_to_visit(iblock, islot_in_block, match_found);
-
- out_match_bitvector[id / 8] |= match_found << (id & 7);
- util::SafeStore(&out_groupids[id], static_cast<uint32_t>(groupid));
- util::SafeStore(&out_slot_ids[id], static_cast<uint32_t>(islot));
- }
-}
-
-// How many groups we can keep in the hash table without the need for resizing.
-// When we reach this limit, we need to break processing of any further rows and resize.
-//
-uint64_t SwissTable::num_groups_for_resize() const {
- // Resize small hash tables when 50% full (up to 12KB).
- // Resize large hash tables when 75% full.
- constexpr int log_blocks_small_ = 9;
- uint64_t num_slots = 1ULL << (log_blocks_ + 3);
- if (log_blocks_ <= log_blocks_small_) {
- return num_slots / 2;
- } else {
- return num_slots * 3 / 4;
- }
-}
-
-uint64_t SwissTable::wrap_global_slot_id(uint64_t global_slot_id) {
- uint64_t global_slot_id_mask = (1 << (log_blocks_ + 3)) - 1;
- return global_slot_id & global_slot_id_mask;
-}
-
-// Run a single round of slot search - comparison / insert - filter unprocessed.
-// Update selection vector to reflect which items have been processed.
-// Ids in selection vector do not have to be sorted.
-//
-Status SwissTable::lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected,
- uint16_t* inout_selection, bool* out_need_resize,
- uint32_t* out_group_ids, uint32_t* inout_next_slot_ids) {
- auto num_groups_limit = num_groups_for_resize();
- ARROW_DCHECK(num_inserted_ < num_groups_limit);
-
- // Temporary arrays are of limited size.
- // The input needs to be split into smaller portions if it exceeds that limit.
- //
- ARROW_DCHECK(*inout_num_selected <= static_cast<uint32_t>(1 << log_minibatch_));
-
- // We will split input row ids into three categories:
- // - needing to visit next block [0]
- // - needing comparison [1]
- // - inserted [2]
- //
- auto ids_inserted_buf =
- util::TempVectorHolder<uint16_t>(temp_stack_, *inout_num_selected);
- auto ids_for_comparison_buf =
- util::TempVectorHolder<uint16_t>(temp_stack_, *inout_num_selected);
- constexpr int category_nomatch = 0;
- constexpr int category_cmp = 1;
- constexpr int category_inserted = 2;
- int num_ids[3];
- num_ids[0] = num_ids[1] = num_ids[2] = 0;
- uint16_t* ids[3]{inout_selection, ids_for_comparison_buf.mutable_data(),
- ids_inserted_buf.mutable_data()};
- auto push_id = [&num_ids, &ids](int category, int id) {
- util::SafeStore(&ids[category][num_ids[category]++], static_cast<uint16_t>(id));
- };
-
- uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
- uint64_t groupid_mask = (1ULL << num_groupid_bits) - 1;
- constexpr uint64_t stamp_mask = 0x7f;
- uint64_t num_block_bytes = (8 + num_groupid_bits);
-
- uint32_t num_processed;
- for (num_processed = 0;
- // Second condition in for loop:
- // We need to break processing and have the caller of this function
- // resize the hash table if we reach the limit on the number of groups present.
- num_processed < *inout_num_selected &&
- num_inserted_ + num_ids[category_inserted] < num_groups_limit;
- ++num_processed) {
- // row id in original batch
- int id = util::SafeLoad(&inout_selection[num_processed]);
-
- uint64_t slot_id = wrap_global_slot_id(util::SafeLoad(&inout_next_slot_ids[id]));
- uint64_t block_id = slot_id >> 3;
- uint32_t hash = hashes[id];
- uint8_t* blockbase = blocks_ + num_block_bytes * block_id;
- uint64_t block = *reinterpret_cast<uint64_t*>(blockbase);
- uint64_t stamp = (hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask;
- int start_slot = (slot_id & 7);
-
- bool isempty = (blockbase[7 - start_slot] == 0x80);
- if (isempty) {
- // If we reach an empty slot, we insert the key for a new group.
-
- blockbase[7 - start_slot] = static_cast<uint8_t>(stamp);
- uint32_t group_id = num_inserted_ + num_ids[category_inserted];
- int groupid_bit_offset = static_cast<int>(start_slot * num_groupid_bits);
-
- // We assume here that the number of bits is rounded up to 8, 16, 32 or 64.
- // In that case we can insert group id value using aligned 64-bit word access.
- ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
- num_groupid_bits == 32 || num_groupid_bits == 64);
- uint64_t* ptr =
- &reinterpret_cast<uint64_t*>(blockbase + 8)[groupid_bit_offset >> 6];
- util::SafeStore(ptr, util::SafeLoad(ptr) | (static_cast<uint64_t>(group_id)
- << (groupid_bit_offset & 63)));
-
- hashes_[slot_id] = hash;
- util::SafeStore(&out_group_ids[id], group_id);
- push_id(category_inserted, id);
- } else {
- // We search for a slot with a matching stamp within a single block.
- // We append row id to the appropriate sequence of ids based on
- // whether the match has been found or not.
-
- int new_match_found;
- int new_slot;
- search_block<true>(block, static_cast<int>(stamp), start_slot, &new_slot,
- &new_match_found);
- auto new_groupid =
- static_cast<uint32_t>(extract_group_id(blockbase, new_slot, groupid_mask));
- ARROW_DCHECK(new_groupid < num_inserted_ + num_ids[category_inserted]);
- new_slot =
- static_cast<int>(next_slot_to_visit(block_id, new_slot, new_match_found));
- util::SafeStore(&inout_next_slot_ids[id], new_slot);
- util::SafeStore(&out_group_ids[id], new_groupid);
- push_id(new_match_found, id);
- }
- }
-
- // Copy keys for newly inserted rows using callback
- RETURN_NOT_OK(append_impl_(num_ids[category_inserted], ids[category_inserted]));
- num_inserted_ += num_ids[category_inserted];
-
- // Evaluate comparisons and append the ids of rows that failed them to the non-match set.
- uint32_t num_not_equal;
- equal_impl_(num_ids[category_cmp], ids[category_cmp], out_group_ids, &num_not_equal,
- ids[category_nomatch] + num_ids[category_nomatch]);
- num_ids[category_nomatch] += num_not_equal;
-
- // Append ids of any unprocessed entries if we aborted processing due to the need
- // to resize.
- if (num_processed < *inout_num_selected) {
- memmove(ids[category_nomatch] + num_ids[category_nomatch],
- inout_selection + num_processed,
- sizeof(uint16_t) * (*inout_num_selected - num_processed));
- num_ids[category_nomatch] += (*inout_num_selected - num_processed);
- }
-
- *out_need_resize = (num_inserted_ == num_groups_limit);
- *inout_num_selected = num_ids[category_nomatch];
- return Status::OK();
-}
-
-// Use hashes and callbacks to find group ids for already existing keys and
-// to insert and report newly assigned group ids for new keys.
-//
-Status SwissTable::map(const int num_keys, const uint32_t* hashes,
- uint32_t* out_groupids) {
- // Temporary buffers have limited size.
- // Caller is responsible for splitting larger input arrays into smaller chunks.
- ARROW_DCHECK(num_keys <= (1 << log_minibatch_));
-
- // Allocate temporary buffers with a lifetime of this function
- auto match_bitvector_buf = util::TempVectorHolder<uint8_t>(temp_stack_, num_keys);
- uint8_t* match_bitvector = match_bitvector_buf.mutable_data();
- auto slot_ids_buf = util::TempVectorHolder<uint32_t>(temp_stack_, num_keys);
- uint32_t* slot_ids = slot_ids_buf.mutable_data();
- auto ids_buf = util::TempVectorHolder<uint16_t>(temp_stack_, num_keys);
- uint16_t* ids = ids_buf.mutable_data();
- uint32_t num_ids;
-
- // First-pass processing.
- // Optimistically use simplified lookup involving only a start block to find
- // a single group id candidate for every input.
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags_ & arrow::internal::CpuInfo::AVX2) {
- if (log_blocks_ <= 4) {
- int tail = num_keys % 32;
- int delta = num_keys - tail;
- lookup_1_avx2_x32(num_keys - tail, hashes, match_bitvector, out_groupids, slot_ids);
- lookup_1_avx2_x8(tail, hashes + delta, match_bitvector + delta / 8,
- out_groupids + delta, slot_ids + delta);
- } else {
- lookup_1_avx2_x8(num_keys, hashes, match_bitvector, out_groupids, slot_ids);
- }
- } else {
-#endif
- lookup_1<false>(nullptr, num_keys, hashes, match_bitvector, out_groupids, slot_ids);
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-
- int64_t num_matches =
- arrow::internal::CountSetBits(match_bitvector, /*offset=*/0, num_keys);
-
- // After the first-pass processing count rows with matches (based on stamp comparison)
- // and decide based on their percentage whether to call dense or sparse comparison
- // function. Dense comparison means evaluating it for all inputs, even if the matching
- // stamp was not found. It may be cheaper to evaluate comparison for all inputs if the
- // extra cost of filtering is higher than the wasted processing of rows with no match.
- //
- // Dense comparison can only be used if there is at least one inserted key,
- // because otherwise there is no key to compare to.
- //
- if (num_inserted_ > 0 && num_matches > 0 && num_matches > 3 * num_keys / 4) {
- // Dense comparisons
- equal_impl_(num_keys, nullptr, out_groupids, &num_ids, ids);
- } else {
- // Sparse comparisons that involve filtering the input set of keys
- auto ids_cmp_buf = util::TempVectorHolder<uint16_t>(temp_stack_, num_keys);
- uint16_t* ids_cmp = ids_cmp_buf.mutable_data();
- int num_ids_result;
- util::BitUtil::bits_split_indexes(hardware_flags_, num_keys, match_bitvector,
- &num_ids_result, ids, ids_cmp);
- num_ids = num_ids_result;
- uint32_t num_not_equal;
- equal_impl_(num_keys - num_ids, ids_cmp, out_groupids, &num_not_equal, ids + num_ids);
- num_ids += num_not_equal;
- }
-
- do {
- // A single round of slow-pass (robust) lookup or insert.
- // A single round ends with either a single comparison verifying the match candidate
- // or inserting a new key. A single round of slow-pass may return early if we reach
- // the limit of the number of groups due to inserts of new keys. In that case we
- // need to resize and recalculate the starting global slot ids for the new, bigger
- // hash table.
- bool out_of_capacity;
- RETURN_NOT_OK(
- lookup_2(hashes, &num_ids, ids, &out_of_capacity, out_groupids, slot_ids));
- if (out_of_capacity) {
- RETURN_NOT_OK(grow_double());
- // Reset start slot ids for still unprocessed input keys.
- //
- for (uint32_t i = 0; i < num_ids; ++i) {
- // First slot in the new starting block
- const uint16_t id = util::SafeLoad(&ids[i]);
- util::SafeStore(&slot_ids[id], (hashes[id] >> (bits_hash_ - log_blocks_)) * 8);
- }
- }
- } while (num_ids > 0);
-
- return Status::OK();
-}
-
-Status SwissTable::grow_double() {
- // Before and after metadata
- int num_group_id_bits_before = num_groupid_bits_from_log_blocks(log_blocks_);
- int num_group_id_bits_after = num_groupid_bits_from_log_blocks(log_blocks_ + 1);
- uint64_t group_id_mask_before = ~0ULL >> (64 - num_group_id_bits_before);
- int log_blocks_before = log_blocks_;
- int log_blocks_after = log_blocks_ + 1;
- uint64_t block_size_before = (8 + num_group_id_bits_before);
- uint64_t block_size_after = (8 + num_group_id_bits_after);
- uint64_t block_size_total_before = (block_size_before << log_blocks_before) + padding_;
- uint64_t block_size_total_after = (block_size_after << log_blocks_after) + padding_;
- uint64_t hashes_size_total_before =
- (bits_hash_ / 8 * (1 << (log_blocks_before + 3))) + padding_;
- uint64_t hashes_size_total_after =
- (bits_hash_ / 8 * (1 << (log_blocks_after + 3))) + padding_;
- constexpr uint32_t stamp_mask = (1 << bits_stamp_) - 1;
-
- // Allocate new buffers
- uint8_t* blocks_new;
- RETURN_NOT_OK(pool_->Allocate(block_size_total_after, &blocks_new));
- memset(blocks_new, 0, block_size_total_after);
- uint8_t* hashes_new_8B;
- uint32_t* hashes_new;
- RETURN_NOT_OK(pool_->Allocate(hashes_size_total_after, &hashes_new_8B));
- hashes_new = reinterpret_cast<uint32_t*>(hashes_new_8B);
-
- // First pass over all old blocks.
- // Reinsert entries that were not in an overflow block
- // (i.e. a block other than the one selected by the hash bits of the entry).
- for (int i = 0; i < (1 << log_blocks_); ++i) {
- // How many full slots in this block
- uint8_t* block_base = blocks_ + i * block_size_before;
- uint8_t* double_block_base_new = blocks_new + 2 * i * block_size_after;
- uint64_t block = *reinterpret_cast<const uint64_t*>(block_base);
-
- auto full_slots =
- static_cast<int>(CountLeadingZeros(block & kHighBitOfEachByte) >> 3);
- int full_slots_new[2];
- full_slots_new[0] = full_slots_new[1] = 0;
- util::SafeStore(double_block_base_new, kHighBitOfEachByte);
- util::SafeStore(double_block_base_new + block_size_after, kHighBitOfEachByte);
-
- for (int j = 0; j < full_slots; ++j) {
- uint64_t slot_id = i * 8 + j;
- uint32_t hash = hashes_[slot_id];
- uint64_t block_id_new = hash >> (bits_hash_ - log_blocks_after);
- bool is_overflow_entry = ((block_id_new >> 1) != static_cast<uint64_t>(i));
- if (is_overflow_entry) {
- continue;
- }
-
- int ihalf = block_id_new & 1;
- uint8_t stamp_new =
- hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask;
- uint64_t group_id_bit_offs = j * num_group_id_bits_before;
- uint64_t group_id =
- (util::SafeLoadAs<uint64_t>(block_base + 8 + (group_id_bit_offs >> 3)) >>
- (group_id_bit_offs & 7)) &
- group_id_mask_before;
-
- uint64_t slot_id_new = i * 16 + ihalf * 8 + full_slots_new[ihalf];
- hashes_new[slot_id_new] = hash;
- uint8_t* block_base_new = double_block_base_new + ihalf * block_size_after;
- block_base_new[7 - full_slots_new[ihalf]] = stamp_new;
- int group_id_bit_offs_new = full_slots_new[ihalf] * num_group_id_bits_after;
- uint64_t* ptr =
- reinterpret_cast<uint64_t*>(block_base_new + 8 + (group_id_bit_offs_new >> 3));
- util::SafeStore(ptr,
- util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7)));
- full_slots_new[ihalf]++;
- }
- }
-
- // Second pass over all old blocks.
- // Reinsert entries that were in an overflow block.
- for (int i = 0; i < (1 << log_blocks_); ++i) {
- // How many full slots in this block
- uint8_t* block_base = blocks_ + i * block_size_before;
- uint64_t block = util::SafeLoadAs<uint64_t>(block_base);
- int full_slots = static_cast<int>(CountLeadingZeros(block & kHighBitOfEachByte) >> 3);
-
- for (int j = 0; j < full_slots; ++j) {
- uint64_t slot_id = i * 8 + j;
- uint32_t hash = hashes_[slot_id];
- uint64_t block_id_new = hash >> (bits_hash_ - log_blocks_after);
- bool is_overflow_entry = ((block_id_new >> 1) != static_cast<uint64_t>(i));
- if (!is_overflow_entry) {
- continue;
- }
-
- uint64_t group_id_bit_offs = j * num_group_id_bits_before;
- uint64_t group_id =
- (util::SafeLoadAs<uint64_t>(block_base + 8 + (group_id_bit_offs >> 3)) >>
- (group_id_bit_offs & 7)) &
- group_id_mask_before;
- uint8_t stamp_new =
- hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask;
-
- uint8_t* block_base_new = blocks_new + block_id_new * block_size_after;
- uint64_t block_new = util::SafeLoadAs<uint64_t>(block_base_new);
- int full_slots_new =
- static_cast<int>(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3);
- while (full_slots_new == 8) {
- block_id_new = (block_id_new + 1) & ((1 << log_blocks_after) - 1);
- block_base_new = blocks_new + block_id_new * block_size_after;
- block_new = util::SafeLoadAs<uint64_t>(block_base_new);
- full_slots_new =
- static_cast<int>(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3);
- }
-
- hashes_new[block_id_new * 8 + full_slots_new] = hash;
- block_base_new[7 - full_slots_new] = stamp_new;
- int group_id_bit_offs_new = full_slots_new * num_group_id_bits_after;
- uint64_t* ptr =
- reinterpret_cast<uint64_t*>(block_base_new + 8 + (group_id_bit_offs_new >> 3));
- util::SafeStore(ptr,
- util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7)));
- }
- }
-
- pool_->Free(blocks_, block_size_total_before);
- pool_->Free(reinterpret_cast<uint8_t*>(hashes_), hashes_size_total_before);
- log_blocks_ = log_blocks_after;
- blocks_ = blocks_new;
- hashes_ = hashes_new;
-
- return Status::OK();
-}
-
-Status SwissTable::init(int64_t hardware_flags, MemoryPool* pool,
- util::TempVectorStack* temp_stack, int log_minibatch,
- EqualImpl equal_impl, AppendImpl append_impl) {
- hardware_flags_ = hardware_flags;
- pool_ = pool;
- temp_stack_ = temp_stack;
- log_minibatch_ = log_minibatch;
- equal_impl_ = equal_impl;
- append_impl_ = append_impl;
-
- log_blocks_ = 0;
- int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
- num_inserted_ = 0;
-
- const uint64_t block_bytes = 8 + num_groupid_bits;
- const uint64_t slot_bytes = (block_bytes << log_blocks_) + padding_;
- RETURN_NOT_OK(pool_->Allocate(slot_bytes, &blocks_));
-
- // Make sure group ids are initially set to zero for all slots.
- memset(blocks_, 0, slot_bytes);
-
- // Initialize all status bytes to represent an empty slot.
- for (uint64_t i = 0; i < (static_cast<uint64_t>(1) << log_blocks_); ++i) {
- util::SafeStore(blocks_ + i * block_bytes, kHighBitOfEachByte);
- }
-
- uint64_t num_slots = 1ULL << (log_blocks_ + 3);
- const uint64_t hash_size = sizeof(uint32_t);
- const uint64_t hash_bytes = hash_size * num_slots + padding_;
- uint8_t* hashes8;
- RETURN_NOT_OK(pool_->Allocate(hash_bytes, &hashes8));
- hashes_ = reinterpret_cast<uint32_t*>(hashes8);
-
- return Status::OK();
-}
-
-void SwissTable::cleanup() {
- if (blocks_) {
- int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
- const uint64_t block_bytes = 8 + num_groupid_bits;
- const uint64_t slot_bytes = (block_bytes << log_blocks_) + padding_;
- pool_->Free(blocks_, slot_bytes);
- blocks_ = nullptr;
- }
- if (hashes_) {
- uint64_t num_slots = 1ULL << (log_blocks_ + 3);
- const uint64_t hash_size = sizeof(uint32_t);
- const uint64_t hash_bytes = hash_size * num_slots + padding_;
- pool_->Free(reinterpret_cast<uint8_t*>(hashes_), hash_bytes);
- hashes_ = nullptr;
- }
- log_blocks_ = 0;
- num_inserted_ = 0;
-}
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/key_map.h"
+
+#include <memory.h>
+
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+
+using BitUtil::CountLeadingZeros;
+
+namespace compute {
+
+constexpr uint64_t kHighBitOfEachByte = 0x8080808080808080ULL;
+
+// Search status bytes inside a block of 8 slots (64-bit word).
+// Try to find a slot that contains a 7-bit stamp matching the one provided.
+// There are three possible outcomes:
+// 1. A matching slot is found.
+// -> Return its index between 0 and 7 and set match found flag.
+// 2. A matching slot is not found and there is an empty slot in the block.
+// -> Return the index of the first empty slot and clear match found flag.
+// 3. A matching slot is not found and there are no empty slots in the block.
+// -> Return 8 as the output slot index and clear match found flag.
+//
+// Optionally an index of the first slot to start the search from can be specified.
+// In this case slots before it will be ignored.
+//
+template <bool use_start_slot>
+inline void SwissTable::search_block(uint64_t block, int stamp, int start_slot,
+ int* out_slot, int* out_match_found) {
+ // Filled slot bytes have the highest bit set to 0 and empty slots are equal to 0x80.
+ uint64_t block_high_bits = block & kHighBitOfEachByte;
+
+ // Replicate 7-bit stamp to all non-empty slots, leaving zeroes for empty slots.
+ uint64_t stamp_pattern = stamp * ((block_high_bits ^ kHighBitOfEachByte) >> 7);
+
+ // XOR-ing this pattern with the block status bytes gives, in individual bytes:
+ // a) 0x00, for filled slots matching the stamp,
+ // b) 0x00 < x < 0x80, for filled slots not matching the stamp,
+ // c) 0x80, for empty slots.
+ uint64_t block_xor_pattern = block ^ stamp_pattern;
+
+ // If we then add 0x7f to every byte, we get:
+ // a) 0x7F
+ // b) 0x80 <= x < 0xFF
+ // c) 0xFF
+ uint64_t match_base = block_xor_pattern + ~kHighBitOfEachByte;
+
+ // The highest bit now tells us if we have a match (0) or not (1).
+ // We will negate the bits so that match is represented by a set bit.
+ uint64_t matches = ~match_base;
+
+ // Clear 7 non-relevant bits in each byte.
+ // Also clear bytes that correspond to slots that we were supposed to
+ // skip due to provided start slot index.
+ // Note: the highest byte corresponds to the first slot.
+ if (use_start_slot) {
+ matches &= kHighBitOfEachByte >> (8 * start_slot);
+ } else {
+ matches &= kHighBitOfEachByte;
+ }
+
+ // We get 0 if there are no matches
+ *out_match_found = (matches == 0 ? 0 : 1);
+
+ // Now if we OR the matches with the high bits of the block and count leading
+ // zero bits, we get 8 times the slot index that we were looking for (hence >> 3).
+ // This formula works in all three cases a), b) and c).
+ *out_slot = static_cast<int>(CountLeadingZeros(matches | block_high_bits) >> 3);
+}
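+
+// Worked example (illustrative; the block contents below are assumed):
+// status bytes from slot 0 (highest byte) to slot 7 (lowest byte):
+//
+// 0x11 0x5C 0x11 0x80 0x80 0x80 0x80 0x80 (stamps 0x11, 0x5C, 0x11; 5 empty)
+//
+// Searching for stamp 0x11 with start_slot = 0: the xor pattern zeroes the
+// bytes of slots 0 and 2, adding 0x7F keeps their high bit clear, and after
+// negation and masking the highest set bit lies in the byte of slot 0, so
+// CountLeadingZeros(...) >> 3 == 0 and *out_match_found == 1.
+// Searching for stamp 0x22 sets no match bits; matches | block_high_bits then
+// has 24 leading zero bits, so *out_slot == 3 (the first empty slot) and
+// *out_match_found == 0.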
+
+// This call follows the call to search_block.
+// The input slot index is the output returned by it: a value from 0 to 8,
+// with 8 indicating that no match was found and that there were no empty slots.
+//
+// If the index refers to a non-empty slot, return the group id associated with it.
+// Otherwise return an arbitrary group id, possibly
+// zero, which is the default value stored in empty slots.
+//
+inline uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot,
+ uint64_t group_id_mask) {
+ // Input slot can be equal to 8, in which case we need to output any valid group id
+ // value, so we take the one from slot 0 in the block.
+ int clamped_slot = slot & 7;
+
+ // Group id values for all 8 slots in the block are bit-packed and follow the status
+ // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In
+ // that case we can extract group id using aligned 64-bit word access.
+ int num_groupid_bits = static_cast<int>(ARROW_POPCOUNT64(group_id_mask));
+ ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
+ num_groupid_bits == 32 || num_groupid_bits == 64);
+
+ int bit_offset = clamped_slot * num_groupid_bits;
+ const uint64_t* group_id_bytes =
+ reinterpret_cast<const uint64_t*>(block_ptr) + 1 + (bit_offset >> 6);
+ uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask;
+
+ return group_id;
+}
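+
+// Worked example (illustrative; log_blocks_ = 6 is assumed): group ids then
+// use num_groupid_bits_from_log_blocks(6) = 16 bits, so a block is 8 status
+// bytes followed by 8 x 16-bit group ids (24 bytes in total). For slot 5 the
+// bit offset is 5 * 16 = 80, so the id is read from the second 64-bit word
+// after the status bytes (80 >> 6 == 1), starting at bit 80 & 63 == 16.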
+
+// Return global slot id (the index including the information about the block)
+// where the search should continue if the first comparison fails.
+// This function always follows search_block and receives the slot id returned by it.
+//
+inline uint64_t SwissTable::next_slot_to_visit(uint64_t block_index, int slot,
+ int match_found) {
+ // The result should be taken modulo the number of all slots in all blocks,
+ // but here we allow it to take a value one above the last slot index.
+ // The modulo operation is postponed until later.
+ return block_index * 8 + slot + match_found;
+}
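+
+// For example (illustrative numbers): a false-positive match in block 3,
+// slot 2 yields 3 * 8 + 2 + 1 = 27, so the retry resumes from the slot right
+// after the one whose full key comparison failed; with no match and no empty
+// slot (slot == 8) the result 3 * 8 + 8 points at the start of the next block.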
+
+// Implements first (fast-path, optimistic) lookup.
+// Searches for a match only within the start block,
+// trying only the first slot with a matching stamp.
+//
+// The comparison callback needed for match verification is invoked outside of this
+// function. The match bit vector filled here only indicates that a matching stamp
+// was found in a slot.
+//
+template <bool use_selection>
+void SwissTable::lookup_1(const uint16_t* selection, const int num_keys,
+ const uint32_t* hashes, uint8_t* out_match_bitvector,
+ uint32_t* out_groupids, uint32_t* out_slot_ids) {
+ // Clear the output bit vector
+ memset(out_match_bitvector, 0, (num_keys + 7) / 8);
+
+ // Based on the size of the table, prepare bit number constants.
+ uint32_t stamp_mask = (1 << bits_stamp_) - 1;
+ int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+ uint32_t groupid_mask = (1 << num_groupid_bits) - 1;
+
+ for (int i = 0; i < num_keys; ++i) {
+ int id;
+ if (use_selection) {
+ id = util::SafeLoad(&selection[i]);
+ } else {
+ id = i;
+ }
+
+ // Extract from hash: block index and stamp
+ //
+ uint32_t hash = hashes[id];
+ uint32_t iblock = hash >> (bits_hash_ - bits_stamp_ - log_blocks_);
+ uint32_t stamp = iblock & stamp_mask;
+ iblock >>= bits_stamp_;
+
+ uint32_t num_block_bytes = num_groupid_bits + 8;
+ const uint8_t* blockbase = reinterpret_cast<const uint8_t*>(blocks_) +
+ static_cast<uint64_t>(iblock) * num_block_bytes;
+ uint64_t block = util::SafeLoadAs<uint64_t>(blockbase);
+
+ // Call helper functions to obtain the output triplet:
+ // - match (of a stamp) found flag
+ // - group id for key comparison
+ // - slot to resume search from in case of no match or false positive
+ int match_found;
+ int islot_in_block;
+ search_block<false>(block, stamp, 0, &islot_in_block, &match_found);
+ uint64_t groupid = extract_group_id(blockbase, islot_in_block, groupid_mask);
+ ARROW_DCHECK(groupid < num_inserted_ || num_inserted_ == 0);
+ uint64_t islot = next_slot_to_visit(iblock, islot_in_block, match_found);
+
+ out_match_bitvector[id / 8] |= match_found << (id & 7);
+ util::SafeStore(&out_groupids[id], static_cast<uint32_t>(groupid));
+ util::SafeStore(&out_slot_ids[id], static_cast<uint32_t>(islot));
+ }
+}
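+
+// Worked example (illustrative; the sizes are assumed): with bits_hash_ = 32,
+// bits_stamp_ = 7 and log_blocks_ = 10 a hash is consumed from the top down:
+//
+// [ 10 bits: block index ][ 7 bits: stamp ][ 15 low bits: unused here ]
+//
+// iblock = hash >> 15 keeps the block index together with the stamp, the low
+// 7 bits of that value are the stamp, and iblock >>= 7 leaves the block index.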
+
+// How many groups we can keep in the hash table without the need for resizing.
+// When we reach this limit, we need to break processing of any further rows and resize.
+//
+uint64_t SwissTable::num_groups_for_resize() const {
+ // Resize small hash tables when 50% full (up to 12KB).
+ // Resize large hash tables when 75% full.
+ constexpr int log_blocks_small_ = 9;
+ uint64_t num_slots = 1ULL << (log_blocks_ + 3);
+ if (log_blocks_ <= log_blocks_small_) {
+ return num_slots / 2;
+ } else {
+ return num_slots * 3 / 4;
+ }
+}
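+
+// For example (illustrative numbers): with log_blocks_ = 4 there are
+// 16 * 8 = 128 slots and resizing is triggered after 64 inserts (50%); with
+// log_blocks_ = 12 there are 32768 slots and the limit is 24576 (75%).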
+
+uint64_t SwissTable::wrap_global_slot_id(uint64_t global_slot_id) {
+ uint64_t global_slot_id_mask = (1 << (log_blocks_ + 3)) - 1;
+ return global_slot_id & global_slot_id_mask;
+}
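+
+// For example (illustrative): with log_blocks_ = 2 there are 32 slots in
+// total, the mask is 31, and the out-of-range id 32 that next_slot_to_visit
+// may produce for the last slot wraps back to slot 0.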
+
+// Run a single round of slot search - comparison / insert - filter unprocessed.
+// Update selection vector to reflect which items have been processed.
+// Ids in selection vector do not have to be sorted.
+//
+Status SwissTable::lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected,
+ uint16_t* inout_selection, bool* out_need_resize,
+ uint32_t* out_group_ids, uint32_t* inout_next_slot_ids) {
+ auto num_groups_limit = num_groups_for_resize();
+ ARROW_DCHECK(num_inserted_ < num_groups_limit);
+
+ // Temporary arrays are of limited size.
+ // The input needs to be split into smaller portions if it exceeds that limit.
+ //
+ ARROW_DCHECK(*inout_num_selected <= static_cast<uint32_t>(1 << log_minibatch_));
+
+ // We will split input row ids into three categories:
+ // - needing to visit next block [0]
+ // - needing comparison [1]
+ // - inserted [2]
+ //
+ auto ids_inserted_buf =
+ util::TempVectorHolder<uint16_t>(temp_stack_, *inout_num_selected);
+ auto ids_for_comparison_buf =
+ util::TempVectorHolder<uint16_t>(temp_stack_, *inout_num_selected);
+ constexpr int category_nomatch = 0;
+ constexpr int category_cmp = 1;
+ constexpr int category_inserted = 2;
+ int num_ids[3];
+ num_ids[0] = num_ids[1] = num_ids[2] = 0;
+ uint16_t* ids[3]{inout_selection, ids_for_comparison_buf.mutable_data(),
+ ids_inserted_buf.mutable_data()};
+ auto push_id = [&num_ids, &ids](int category, int id) {
+ util::SafeStore(&ids[category][num_ids[category]++], static_cast<uint16_t>(id));
+ };
+
+ uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+ uint64_t groupid_mask = (1ULL << num_groupid_bits) - 1;
+ constexpr uint64_t stamp_mask = 0x7f;
+ uint64_t num_block_bytes = (8 + num_groupid_bits);
+
+ uint32_t num_processed;
+ for (num_processed = 0;
+ // Second condition in for loop:
+ // We need to break processing and have the caller of this function
+ // resize the hash table if we reach the limit on the number of groups present.
+ num_processed < *inout_num_selected &&
+ num_inserted_ + num_ids[category_inserted] < num_groups_limit;
+ ++num_processed) {
+ // row id in original batch
+ int id = util::SafeLoad(&inout_selection[num_processed]);
+
+ uint64_t slot_id = wrap_global_slot_id(util::SafeLoad(&inout_next_slot_ids[id]));
+ uint64_t block_id = slot_id >> 3;
+ uint32_t hash = hashes[id];
+ uint8_t* blockbase = blocks_ + num_block_bytes * block_id;
+ uint64_t block = *reinterpret_cast<uint64_t*>(blockbase);
+ uint64_t stamp = (hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask;
+ int start_slot = (slot_id & 7);
+
+ bool isempty = (blockbase[7 - start_slot] == 0x80);
+ if (isempty) {
+ // If we reach an empty slot, we insert the key for a new group.
+
+ blockbase[7 - start_slot] = static_cast<uint8_t>(stamp);
+ uint32_t group_id = num_inserted_ + num_ids[category_inserted];
+ int groupid_bit_offset = static_cast<int>(start_slot * num_groupid_bits);
+
+ // We assume here that the number of bits is rounded up to 8, 16, 32 or 64.
+ // In that case we can insert group id value using aligned 64-bit word access.
+ ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
+ num_groupid_bits == 32 || num_groupid_bits == 64);
+ uint64_t* ptr =
+ &reinterpret_cast<uint64_t*>(blockbase + 8)[groupid_bit_offset >> 6];
+ util::SafeStore(ptr, util::SafeLoad(ptr) | (static_cast<uint64_t>(group_id)
+ << (groupid_bit_offset & 63)));
+
+ hashes_[slot_id] = hash;
+ util::SafeStore(&out_group_ids[id], group_id);
+ push_id(category_inserted, id);
+ } else {
+ // We search for a slot with a matching stamp within a single block.
+ // We append row id to the appropriate sequence of ids based on
+ // whether the match has been found or not.
+
+ int new_match_found;
+ int new_slot;
+ search_block<true>(block, static_cast<int>(stamp), start_slot, &new_slot,
+ &new_match_found);
+ auto new_groupid =
+ static_cast<uint32_t>(extract_group_id(blockbase, new_slot, groupid_mask));
+ ARROW_DCHECK(new_groupid < num_inserted_ + num_ids[category_inserted]);
+ new_slot =
+ static_cast<int>(next_slot_to_visit(block_id, new_slot, new_match_found));
+ util::SafeStore(&inout_next_slot_ids[id], new_slot);
+ util::SafeStore(&out_group_ids[id], new_groupid);
+ push_id(new_match_found, id);
+ }
+ }
+
+ // Copy keys for newly inserted rows using callback
+ RETURN_NOT_OK(append_impl_(num_ids[category_inserted], ids[category_inserted]));
+ num_inserted_ += num_ids[category_inserted];
+
+ // Evaluate comparisons and append the ids of rows that failed them to the non-match set.
+ uint32_t num_not_equal;
+ equal_impl_(num_ids[category_cmp], ids[category_cmp], out_group_ids, &num_not_equal,
+ ids[category_nomatch] + num_ids[category_nomatch]);
+ num_ids[category_nomatch] += num_not_equal;
+
+ // Append ids of any unprocessed entries if we aborted processing due to the need
+ // to resize.
+ if (num_processed < *inout_num_selected) {
+ memmove(ids[category_nomatch] + num_ids[category_nomatch],
+ inout_selection + num_processed,
+ sizeof(uint16_t) * (*inout_num_selected - num_processed));
+ num_ids[category_nomatch] += (*inout_num_selected - num_processed);
+ }
+
+ *out_need_resize = (num_inserted_ == num_groups_limit);
+ *inout_num_selected = num_ids[category_nomatch];
+ return Status::OK();
+}
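+
+// Illustrative walk-through (the inputs are assumed): for a selection
+// {5, 9, 12, 40}, suppose row 5 hits an empty slot (inserted), rows 9 and 12
+// find a matching stamp (sent to comparison, where row 12 fails) and row 40
+// finds neither. After the round the selection shrinks to {40, 12}, to be
+// retried from their updated next-slot ids, and num_inserted_ has grown by one.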
+
+// Use hashes and callbacks to find group ids for already existing keys and
+// to insert and report newly assigned group ids for new keys.
+//
+Status SwissTable::map(const int num_keys, const uint32_t* hashes,
+ uint32_t* out_groupids) {
+ // Temporary buffers have limited size.
+ // Caller is responsible for splitting larger input arrays into smaller chunks.
+ ARROW_DCHECK(num_keys <= (1 << log_minibatch_));
+
+ // Allocate temporary buffers with a lifetime of this function
+ auto match_bitvector_buf = util::TempVectorHolder<uint8_t>(temp_stack_, num_keys);
+ uint8_t* match_bitvector = match_bitvector_buf.mutable_data();
+ auto slot_ids_buf = util::TempVectorHolder<uint32_t>(temp_stack_, num_keys);
+ uint32_t* slot_ids = slot_ids_buf.mutable_data();
+ auto ids_buf = util::TempVectorHolder<uint16_t>(temp_stack_, num_keys);
+ uint16_t* ids = ids_buf.mutable_data();
+ uint32_t num_ids;
+
+ // First-pass processing.
+ // Optimistically use simplified lookup involving only a start block to find
+ // a single group id candidate for every input.
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags_ & arrow::internal::CpuInfo::AVX2) {
+ if (log_blocks_ <= 4) {
+ int tail = num_keys % 32;
+ int delta = num_keys - tail;
+ lookup_1_avx2_x32(num_keys - tail, hashes, match_bitvector, out_groupids, slot_ids);
+ lookup_1_avx2_x8(tail, hashes + delta, match_bitvector + delta / 8,
+ out_groupids + delta, slot_ids + delta);
+ } else {
+ lookup_1_avx2_x8(num_keys, hashes, match_bitvector, out_groupids, slot_ids);
+ }
+ } else {
+#endif
+ lookup_1<false>(nullptr, num_keys, hashes, match_bitvector, out_groupids, slot_ids);
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+
+ int64_t num_matches =
+ arrow::internal::CountSetBits(match_bitvector, /*offset=*/0, num_keys);
+
+ // After the first-pass processing count rows with matches (based on stamp comparison)
+ // and decide based on their percentage whether to call dense or sparse comparison
+ // function. Dense comparison means evaluating it for all inputs, even if the matching
+ // stamp was not found. It may be cheaper to evaluate comparison for all inputs if the
+ // extra cost of filtering is higher than the wasted processing of rows with no match.
+ //
+ // Dense comparison can only be used if there is at least one inserted key,
+ // because otherwise there is no key to compare to.
+ //
+ if (num_inserted_ > 0 && num_matches > 0 && num_matches > 3 * num_keys / 4) {
+ // Dense comparisons
+ equal_impl_(num_keys, nullptr, out_groupids, &num_ids, ids);
+ } else {
+ // Sparse comparisons that involve filtering the input set of keys
+ auto ids_cmp_buf = util::TempVectorHolder<uint16_t>(temp_stack_, num_keys);
+ uint16_t* ids_cmp = ids_cmp_buf.mutable_data();
+ int num_ids_result;
+ util::BitUtil::bits_split_indexes(hardware_flags_, num_keys, match_bitvector,
+ &num_ids_result, ids, ids_cmp);
+ num_ids = num_ids_result;
+ uint32_t num_not_equal;
+ equal_impl_(num_keys - num_ids, ids_cmp, out_groupids, &num_not_equal, ids + num_ids);
+ num_ids += num_not_equal;
+ }
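+
+ // For example (illustrative numbers): with num_keys = 1024 and 900 first-pass
+ // stamp matches, 900 > 3 * 1024 / 4 = 768, so the dense path compares all
+ // keys directly instead of first filtering them through the match bit vector.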
+
+ do {
+ // A single round of slow-pass (robust) lookup or insert.
+ // A single round ends with either a single comparison verifying the match candidate
+ // or inserting a new key. A single round of slow-pass may return early if we reach
+ // the limit of the number of groups due to inserts of new keys. In that case we
+ // need to resize and recalculate the starting global slot ids for the new, bigger
+ // hash table.
+ bool out_of_capacity;
+ RETURN_NOT_OK(
+ lookup_2(hashes, &num_ids, ids, &out_of_capacity, out_groupids, slot_ids));
+ if (out_of_capacity) {
+ RETURN_NOT_OK(grow_double());
+ // Reset start slot ids for still unprocessed input keys.
+ //
+ for (uint32_t i = 0; i < num_ids; ++i) {
+ // First slot in the new starting block
+ const uint16_t id = util::SafeLoad(&ids[i]);
+ util::SafeStore(&slot_ids[id], (hashes[id] >> (bits_hash_ - log_blocks_)) * 8);
+ }
+ }
+ } while (num_ids > 0);
+
+ return Status::OK();
+}
+
+Status SwissTable::grow_double() {
+ // Before and after metadata
+ int num_group_id_bits_before = num_groupid_bits_from_log_blocks(log_blocks_);
+ int num_group_id_bits_after = num_groupid_bits_from_log_blocks(log_blocks_ + 1);
+ uint64_t group_id_mask_before = ~0ULL >> (64 - num_group_id_bits_before);
+ int log_blocks_before = log_blocks_;
+ int log_blocks_after = log_blocks_ + 1;
+ uint64_t block_size_before = (8 + num_group_id_bits_before);
+ uint64_t block_size_after = (8 + num_group_id_bits_after);
+ uint64_t block_size_total_before = (block_size_before << log_blocks_before) + padding_;
+ uint64_t block_size_total_after = (block_size_after << log_blocks_after) + padding_;
+ uint64_t hashes_size_total_before =
+ (bits_hash_ / 8 * (1 << (log_blocks_before + 3))) + padding_;
+ uint64_t hashes_size_total_after =
+ (bits_hash_ / 8 * (1 << (log_blocks_after + 3))) + padding_;
+ constexpr uint32_t stamp_mask = (1 << bits_stamp_) - 1;
+
+ // Allocate new buffers
+ uint8_t* blocks_new;
+ RETURN_NOT_OK(pool_->Allocate(block_size_total_after, &blocks_new));
+ memset(blocks_new, 0, block_size_total_after);
+ uint8_t* hashes_new_8B;
+ uint32_t* hashes_new;
+ RETURN_NOT_OK(pool_->Allocate(hashes_size_total_after, &hashes_new_8B));
+ hashes_new = reinterpret_cast<uint32_t*>(hashes_new_8B);
+
+ // First pass over all old blocks.
+ // Reinsert entries that were not in an overflow block
+ // (i.e. a block other than the one selected by the hash bits of the entry).
+ for (int i = 0; i < (1 << log_blocks_); ++i) {
+ // How many full slots in this block
+ uint8_t* block_base = blocks_ + i * block_size_before;
+ uint8_t* double_block_base_new = blocks_new + 2 * i * block_size_after;
+ uint64_t block = *reinterpret_cast<const uint64_t*>(block_base);
+
+ auto full_slots =
+ static_cast<int>(CountLeadingZeros(block & kHighBitOfEachByte) >> 3);
+ int full_slots_new[2];
+ full_slots_new[0] = full_slots_new[1] = 0;
+ util::SafeStore(double_block_base_new, kHighBitOfEachByte);
+ util::SafeStore(double_block_base_new + block_size_after, kHighBitOfEachByte);
+
+ for (int j = 0; j < full_slots; ++j) {
+ uint64_t slot_id = i * 8 + j;
+ uint32_t hash = hashes_[slot_id];
+ uint64_t block_id_new = hash >> (bits_hash_ - log_blocks_after);
+ bool is_overflow_entry = ((block_id_new >> 1) != static_cast<uint64_t>(i));
+ if (is_overflow_entry) {
+ continue;
+ }
+
+ int ihalf = block_id_new & 1;
+ uint8_t stamp_new =
+ hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask;
+ uint64_t group_id_bit_offs = j * num_group_id_bits_before;
+ uint64_t group_id =
+ (util::SafeLoadAs<uint64_t>(block_base + 8 + (group_id_bit_offs >> 3)) >>
+ (group_id_bit_offs & 7)) &
+ group_id_mask_before;
+
+ uint64_t slot_id_new = i * 16 + ihalf * 8 + full_slots_new[ihalf];
+ hashes_new[slot_id_new] = hash;
+ uint8_t* block_base_new = double_block_base_new + ihalf * block_size_after;
+ block_base_new[7 - full_slots_new[ihalf]] = stamp_new;
+ int group_id_bit_offs_new = full_slots_new[ihalf] * num_group_id_bits_after;
+ uint64_t* ptr =
+ reinterpret_cast<uint64_t*>(block_base_new + 8 + (group_id_bit_offs_new >> 3));
+ util::SafeStore(ptr,
+ util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7)));
+ full_slots_new[ihalf]++;
+ }
+ }
+
+ // Second pass over all old blocks.
+ // Reinsert entries that were in an overflow block.
+ for (int i = 0; i < (1 << log_blocks_); ++i) {
+ // How many full slots in this block
+ uint8_t* block_base = blocks_ + i * block_size_before;
+ uint64_t block = util::SafeLoadAs<uint64_t>(block_base);
+ int full_slots = static_cast<int>(CountLeadingZeros(block & kHighBitOfEachByte) >> 3);
+
+ for (int j = 0; j < full_slots; ++j) {
+ uint64_t slot_id = i * 8 + j;
+ uint32_t hash = hashes_[slot_id];
+ uint64_t block_id_new = hash >> (bits_hash_ - log_blocks_after);
+ bool is_overflow_entry = ((block_id_new >> 1) != static_cast<uint64_t>(i));
+ if (!is_overflow_entry) {
+ continue;
+ }
+
+ uint64_t group_id_bit_offs = j * num_group_id_bits_before;
+ uint64_t group_id =
+ (util::SafeLoadAs<uint64_t>(block_base + 8 + (group_id_bit_offs >> 3)) >>
+ (group_id_bit_offs & 7)) &
+ group_id_mask_before;
+ uint8_t stamp_new =
+ hash >> ((bits_hash_ - log_blocks_after - bits_stamp_)) & stamp_mask;
+
+ uint8_t* block_base_new = blocks_new + block_id_new * block_size_after;
+ uint64_t block_new = util::SafeLoadAs<uint64_t>(block_base_new);
+ int full_slots_new =
+ static_cast<int>(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3);
+ while (full_slots_new == 8) {
+ block_id_new = (block_id_new + 1) & ((1 << log_blocks_after) - 1);
+ block_base_new = blocks_new + block_id_new * block_size_after;
+ block_new = util::SafeLoadAs<uint64_t>(block_base_new);
+ full_slots_new =
+ static_cast<int>(CountLeadingZeros(block_new & kHighBitOfEachByte) >> 3);
+ }
+
+ hashes_new[block_id_new * 8 + full_slots_new] = hash;
+ block_base_new[7 - full_slots_new] = stamp_new;
+ int group_id_bit_offs_new = full_slots_new * num_group_id_bits_after;
+ uint64_t* ptr =
+ reinterpret_cast<uint64_t*>(block_base_new + 8 + (group_id_bit_offs_new >> 3));
+ util::SafeStore(ptr,
+ util::SafeLoad(ptr) | (group_id << (group_id_bit_offs_new & 7)));
+ }
+ }
+
+ pool_->Free(blocks_, block_size_total_before);
+ pool_->Free(reinterpret_cast<uint8_t*>(hashes_), hashes_size_total_before);
+ log_blocks_ = log_blocks_after;
+ blocks_ = blocks_new;
+ hashes_ = hashes_new;
+
+ return Status::OK();
+}
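+
+// Illustrative sketch of the two passes (sizes assumed): when log_blocks_
+// grows from 2 to 3, old block i splits into new blocks 2*i and 2*i + 1. An
+// entry in old block 1 whose top 3 hash bits are 0b011 goes to new block 3
+// (ihalf = 1) in the first pass; an entry stored in block 1 only because its
+// home block was full is an overflow entry and is reinserted in the second
+// pass, which probes blocks linearly until it finds a free slot.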
+
+Status SwissTable::init(int64_t hardware_flags, MemoryPool* pool,
+ util::TempVectorStack* temp_stack, int log_minibatch,
+ EqualImpl equal_impl, AppendImpl append_impl) {
+ hardware_flags_ = hardware_flags;
+ pool_ = pool;
+ temp_stack_ = temp_stack;
+ log_minibatch_ = log_minibatch;
+ equal_impl_ = equal_impl;
+ append_impl_ = append_impl;
+
+ log_blocks_ = 0;
+ int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+ num_inserted_ = 0;
+
+ const uint64_t block_bytes = 8 + num_groupid_bits;
+ const uint64_t slot_bytes = (block_bytes << log_blocks_) + padding_;
+ RETURN_NOT_OK(pool_->Allocate(slot_bytes, &blocks_));
+
+ // Make sure group ids are initially set to zero for all slots.
+ memset(blocks_, 0, slot_bytes);
+
+ // Initialize all status bytes to represent an empty slot.
+ for (uint64_t i = 0; i < (static_cast<uint64_t>(1) << log_blocks_); ++i) {
+ util::SafeStore(blocks_ + i * block_bytes, kHighBitOfEachByte);
+ }
+
+ uint64_t num_slots = 1ULL << (log_blocks_ + 3);
+ const uint64_t hash_size = sizeof(uint32_t);
+ const uint64_t hash_bytes = hash_size * num_slots + padding_;
+ uint8_t* hashes8;
+ RETURN_NOT_OK(pool_->Allocate(hash_bytes, &hashes8));
+ hashes_ = reinterpret_cast<uint32_t*>(hashes8);
+
+ return Status::OK();
+}
+
+void SwissTable::cleanup() {
+ if (blocks_) {
+ int num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+ const uint64_t block_bytes = 8 + num_groupid_bits;
+ const uint64_t slot_bytes = (block_bytes << log_blocks_) + padding_;
+ pool_->Free(blocks_, slot_bytes);
+ blocks_ = nullptr;
+ }
+ if (hashes_) {
+ uint64_t num_slots = 1ULL << (log_blocks_ + 3);
+ const uint64_t hash_size = sizeof(uint32_t);
+ const uint64_t hash_bytes = hash_size * num_slots + padding_;
+ pool_->Free(reinterpret_cast<uint8_t*>(hashes_), hash_bytes);
+ hashes_ = nullptr;
+ }
+ log_blocks_ = 0;
+ num_inserted_ = 0;
+}
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h
index da50db91040..8c472736ec4 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/key_map.h
@@ -1,172 +1,172 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <functional>
-
-#include "arrow/compute/exec/util.h"
-#include "arrow/memory_pool.h"
-#include "arrow/result.h"
-#include "arrow/status.h"
-
-namespace arrow {
-namespace compute {
-
-class SwissTable {
- public:
- SwissTable() = default;
- ~SwissTable() { cleanup(); }
-
- using EqualImpl =
- std::function<void(int num_keys, const uint16_t* selection /* may be null */,
- const uint32_t* group_ids, uint32_t* out_num_keys_mismatch,
- uint16_t* out_selection_mismatch)>;
- using AppendImpl = std::function<Status(int num_keys, const uint16_t* selection)>;
-
- Status init(int64_t hardware_flags, MemoryPool* pool, util::TempVectorStack* temp_stack,
- int log_minibatch, EqualImpl equal_impl, AppendImpl append_impl);
- void cleanup();
-
- Status map(const int ckeys, const uint32_t* hashes, uint32_t* outgroupids);
-
- private:
- // Lookup helpers
-
- /// \brief Scan bytes in block in reverse and stop as soon
- /// as a position of interest is found.
- ///
- /// Positions of interest:
- /// a) slot with a matching stamp is encountered,
- /// b) first empty slot is encountered,
- /// c) we reach the end of the block.
- ///
- /// \param[in] block 8 byte block of hash table
- /// \param[in] stamp 7 bits of hash used as a stamp
- /// \param[in] start_slot Index of the first slot in the block to start search from. We
- /// assume that this index always points to a non-empty slot, equivalently
- /// that it comes before any empty slots. (Used only by one template
- /// variant.)
- /// \param[out] out_slot index corresponding to the discovered position of interest (8
- /// represents end of block).
- /// \param[out] out_match_found an integer flag (0 or 1) indicating if we found a
- /// matching stamp.
- template <bool use_start_slot>
- inline void search_block(uint64_t block, int stamp, int start_slot, int* out_slot,
- int* out_match_found);
-
- /// \brief Extract group id for a given slot in a given block.
- ///
- /// Group ids follow in memory after 64-bit block data.
- /// Maximum number of groups inserted is equal to the number
- /// of all slots in all blocks, which is 8 * the number of blocks.
- /// Group ids are bit packed using that maximum to determine the necessary number of
- /// bits.
- inline uint64_t extract_group_id(const uint8_t* block_ptr, int slot,
- uint64_t group_id_mask);
-
- inline uint64_t next_slot_to_visit(uint64_t block_index, int slot, int match_found);
-
- inline void insert(uint8_t* block_base, uint64_t slot_id, uint32_t hash, uint8_t stamp,
- uint32_t group_id);
-
- inline uint64_t num_groups_for_resize() const;
-
- inline uint64_t wrap_global_slot_id(uint64_t global_slot_id);
-
- // First hash table access
- // Find the first match in the start block if one exists.
- // Possible cases:
- // 1. Stamp match in a block
- // 2. No stamp match in a block, no empty buckets in a block
- // 3. No stamp match in a block, empty buckets in a block
- //
- template <bool use_selection>
- void lookup_1(const uint16_t* selection, const int num_keys, const uint32_t* hashes,
- uint8_t* out_match_bitvector, uint32_t* out_group_ids,
- uint32_t* out_slot_ids);
-#if defined(ARROW_HAVE_AVX2)
- void lookup_1_avx2_x8(const int num_hashes, const uint32_t* hashes,
- uint8_t* out_match_bitvector, uint32_t* out_group_ids,
- uint32_t* out_next_slot_ids);
- void lookup_1_avx2_x32(const int num_hashes, const uint32_t* hashes,
- uint8_t* out_match_bitvector, uint32_t* out_group_ids,
- uint32_t* out_next_slot_ids);
-#endif
-
- // Completing hash table lookup post first access
- Status lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected,
- uint16_t* inout_selection, bool* out_need_resize,
- uint32_t* out_group_ids, uint32_t* out_next_slot_ids);
-
- // Resize small hash tables when 50% full (up to 12KB).
- // Resize large hash tables when 75% full.
- Status grow_double();
-
- static int num_groupid_bits_from_log_blocks(int log_blocks) {
- int required_bits = log_blocks + 3;
- return required_bits <= 8 ? 8
- : required_bits <= 16 ? 16 : required_bits <= 32 ? 32 : 64;
- }
-
- // Use 32-bit hash for now
- static constexpr int bits_hash_ = 32;
-
- // Number of hash bits stored in slots in a block.
- // The highest bits of hash determine block id.
- // The next set of highest bits is a "stamp" stored in a slot in a block.
- static constexpr int bits_stamp_ = 7;
-
- // Padding bytes added at the end of buffers for ease of SIMD access
- static constexpr int padding_ = 64;
-
- int log_minibatch_;
- // Base 2 log of the number of blocks
- int log_blocks_ = 0;
- // Number of keys inserted into hash table
- uint32_t num_inserted_ = 0;
-
- // Data for blocks.
- // Each block has 8 status bytes for 8 slots, followed by 8 bit packed group ids for
- // these slots. In 8B status word, the order of bytes is reversed. Group ids are in
- // normal order. There is 64B padding at the end.
- //
- // byte 0 - bucket 7 | byte 1 - bucket 6 | ...
- // ---------------------------------------------------
- // | Empty bit* | Empty bit |
- // ---------------------------------------------------
- // | 7-bit hash | 7-bit hash |
- // ---------------------------------------------------
- // * Empty bucket has value 0x80. Non-empty bucket has highest bit set to 0.
- //
- uint8_t* blocks_;
-
- // Array of hashes of values inserted into slots.
- // Undefined if the corresponding slot is empty.
- // There is 64B padding at the end.
- uint32_t* hashes_;
-
- int64_t hardware_flags_;
- MemoryPool* pool_;
- util::TempVectorStack* temp_stack_;
-
- EqualImpl equal_impl_;
- AppendImpl append_impl_;
-};
-
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+
+#include "arrow/compute/exec/util.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+namespace arrow {
+namespace compute {
+
+class SwissTable {
+ public:
+ SwissTable() = default;
+ ~SwissTable() { cleanup(); }
+
+ using EqualImpl =
+ std::function<void(int num_keys, const uint16_t* selection /* may be null */,
+ const uint32_t* group_ids, uint32_t* out_num_keys_mismatch,
+ uint16_t* out_selection_mismatch)>;
+ using AppendImpl = std::function<Status(int num_keys, const uint16_t* selection)>;
+
+ Status init(int64_t hardware_flags, MemoryPool* pool, util::TempVectorStack* temp_stack,
+ int log_minibatch, EqualImpl equal_impl, AppendImpl append_impl);
+ void cleanup();
+
+ Status map(const int ckeys, const uint32_t* hashes, uint32_t* outgroupids);
+
+ private:
+ // Lookup helpers
+
+ /// \brief Scan bytes in block in reverse and stop as soon
+ /// as a position of interest is found.
+ ///
+ /// Positions of interest:
+ /// a) slot with a matching stamp is encountered,
+ /// b) first empty slot is encountered,
+ /// c) we reach the end of the block.
+ ///
+ /// \param[in] block 8 byte block of hash table
+ /// \param[in] stamp 7 bits of hash used as a stamp
+ /// \param[in] start_slot Index of the first slot in the block to start search from. We
+ /// assume that this index always points to a non-empty slot, equivalently
+ /// that it comes before any empty slots. (Used only by one template
+ /// variant.)
+ /// \param[out] out_slot index corresponding to the discovered position of interest (8
+ /// represents end of block).
+ /// \param[out] out_match_found an integer flag (0 or 1) indicating if we found a
+ /// matching stamp.
+ template <bool use_start_slot>
+ inline void search_block(uint64_t block, int stamp, int start_slot, int* out_slot,
+ int* out_match_found);
+
+ /// \brief Extract group id for a given slot in a given block.
+ ///
+ /// Group ids follow in memory after 64-bit block data.
+ /// Maximum number of groups inserted is equal to the number
+ /// of all slots in all blocks, which is 8 * the number of blocks.
+ /// Group ids are bit packed using that maximum to determine the necessary number of
+ /// bits.
+ inline uint64_t extract_group_id(const uint8_t* block_ptr, int slot,
+ uint64_t group_id_mask);
+
+ inline uint64_t next_slot_to_visit(uint64_t block_index, int slot, int match_found);
+
+ inline void insert(uint8_t* block_base, uint64_t slot_id, uint32_t hash, uint8_t stamp,
+ uint32_t group_id);
+
+ inline uint64_t num_groups_for_resize() const;
+
+ inline uint64_t wrap_global_slot_id(uint64_t global_slot_id);
+
+ // First hash table access
+ // Find the first match in the start block if one exists.
+ // Possible cases:
+ // 1. Stamp match in a block
+ // 2. No stamp match in a block, no empty buckets in a block
+ // 3. No stamp match in a block, empty buckets in a block
+ //
+ template <bool use_selection>
+ void lookup_1(const uint16_t* selection, const int num_keys, const uint32_t* hashes,
+ uint8_t* out_match_bitvector, uint32_t* out_group_ids,
+ uint32_t* out_slot_ids);
+#if defined(ARROW_HAVE_AVX2)
+ void lookup_1_avx2_x8(const int num_hashes, const uint32_t* hashes,
+ uint8_t* out_match_bitvector, uint32_t* out_group_ids,
+ uint32_t* out_next_slot_ids);
+ void lookup_1_avx2_x32(const int num_hashes, const uint32_t* hashes,
+ uint8_t* out_match_bitvector, uint32_t* out_group_ids,
+ uint32_t* out_next_slot_ids);
+#endif
+
+ // Completing hash table lookup post first access
+ Status lookup_2(const uint32_t* hashes, uint32_t* inout_num_selected,
+ uint16_t* inout_selection, bool* out_need_resize,
+ uint32_t* out_group_ids, uint32_t* out_next_slot_ids);
+
+ // Resize small hash tables when 50% full (up to 12KB).
+ // Resize large hash tables when 75% full.
+ Status grow_double();
+
+ static int num_groupid_bits_from_log_blocks(int log_blocks) {
+ int required_bits = log_blocks + 3;
+ return required_bits <= 8 ? 8
+ : required_bits <= 16 ? 16 : required_bits <= 32 ? 32 : 64;
+ }
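+
+ // For example (illustrative): log_blocks = 5 gives required_bits = 8 and
+ // 8-bit group ids, while log_blocks = 9 gives required_bits = 12, rounded up
+ // to 16 bits so that packed group ids stay byte-aligned.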
+
+ // Use 32-bit hash for now
+ static constexpr int bits_hash_ = 32;
+
+ // Number of hash bits stored in slots in a block.
+ // The highest bits of hash determine block id.
+ // The next set of highest bits is a "stamp" stored in a slot in a block.
+ static constexpr int bits_stamp_ = 7;
+
+ // Padding bytes added at the end of buffers for ease of SIMD access
+ static constexpr int padding_ = 64;
+
+ int log_minibatch_;
+ // Base 2 log of the number of blocks
+ int log_blocks_ = 0;
+ // Number of keys inserted into hash table
+ uint32_t num_inserted_ = 0;
+
+ // Data for blocks.
+ // Each block has 8 status bytes for 8 slots, followed by 8 bit packed group ids for
+ // these slots. In 8B status word, the order of bytes is reversed. Group ids are in
+ // normal order. There is 64B padding at the end.
+ //
+ // byte 0 - bucket 7 | byte 1 - bucket 6 | ...
+ // ---------------------------------------------------
+ // | Empty bit* | Empty bit |
+ // ---------------------------------------------------
+ // | 7-bit hash | 7-bit hash |
+ // ---------------------------------------------------
+ // * Empty bucket has value 0x80. Non-empty bucket has highest bit set to 0.
+ //
+ uint8_t* blocks_;
+
+ // Array of hashes of values inserted into slots.
+ // Undefined if the corresponding slot is empty.
+ // There is 64B padding at the end.
+ uint32_t* hashes_;
+
+ int64_t hardware_flags_;
+ MemoryPool* pool_;
+ util::TempVectorStack* temp_stack_;
+
+ EqualImpl equal_impl_;
+ AppendImpl append_impl_;
+};
+
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc
index b667afc65bb..a44676c2f0d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.cc
@@ -1,278 +1,278 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/exec/util.h"
-
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/ubsan.h"
-
-namespace arrow {
-
-using BitUtil::CountTrailingZeros;
-
-namespace util {
-
-inline void BitUtil::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
- int* num_indexes, uint16_t* indexes) {
- int n = *num_indexes;
- while (word) {
- indexes[n++] = base_index + static_cast<uint16_t>(CountTrailingZeros(word));
- word &= word - 1;
- }
- *num_indexes = n;
-}
-
-inline void BitUtil::bits_filter_indexes_helper(uint64_t word,
- const uint16_t* input_indexes,
- int* num_indexes, uint16_t* indexes) {
- int n = *num_indexes;
- while (word) {
- indexes[n++] = input_indexes[CountTrailingZeros(word)];
- word &= word - 1;
- }
- *num_indexes = n;
-}
-
-template <int bit_to_search, bool filter_input_indexes>
-void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
- const uint8_t* bits, const uint16_t* input_indexes,
- int* num_indexes, uint16_t* indexes) {
- // 64 bits at a time
- constexpr int unroll = 64;
- int tail = num_bits % unroll;
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- if (filter_input_indexes) {
- bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes,
- num_indexes, indexes);
- } else {
- bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes);
- }
- } else {
-#endif
- *num_indexes = 0;
- for (int i = 0; i < num_bits / unroll; ++i) {
- uint64_t word = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[i]);
- if (bit_to_search == 0) {
- word = ~word;
- }
- if (filter_input_indexes) {
- bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes);
- } else {
- bits_to_indexes_helper(word, i * 64, num_indexes, indexes);
- }
- }
-#if defined(ARROW_HAVE_AVX2)
- }
-#endif
-  // Process the last partial word, if any, masking out the bits outside the range
- if (tail) {
- uint64_t word =
- util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[num_bits / unroll]);
- if (bit_to_search == 0) {
- word = ~word;
- }
- word &= ~0ULL >> (64 - tail);
- if (filter_input_indexes) {
- bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes,
- indexes);
- } else {
- bits_to_indexes_helper(word, num_bits - tail, num_indexes, indexes);
- }
- }
-}
-
-void BitUtil::bits_to_indexes(int bit_to_search, int64_t hardware_flags,
- const int num_bits, const uint8_t* bits, int* num_indexes,
- uint16_t* indexes, int bit_offset) {
- bits += bit_offset / 8;
- bit_offset %= 8;
- if (bit_offset != 0) {
- int num_indexes_head = 0;
- uint64_t bits_head =
- util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
- int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
- bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
- reinterpret_cast<const uint8_t*>(&bits_head), &num_indexes_head,
- indexes);
- int num_indexes_tail = 0;
- if (num_bits > bits_in_first_byte) {
- bits_to_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
- bits + 1, &num_indexes_tail, indexes + num_indexes_head);
- }
- *num_indexes = num_indexes_head + num_indexes_tail;
- return;
- }
-
- if (bit_to_search == 0) {
- bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr,
- num_indexes, indexes);
- } else {
- ARROW_DCHECK(bit_to_search == 1);
- bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr,
- num_indexes, indexes);
- }
-}
-
-void BitUtil::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
- const int num_bits, const uint8_t* bits,
- const uint16_t* input_indexes, int* num_indexes,
- uint16_t* indexes, int bit_offset) {
- bits += bit_offset / 8;
- bit_offset %= 8;
- if (bit_offset != 0) {
- int num_indexes_head = 0;
- uint64_t bits_head =
- util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
- int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
- bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
- reinterpret_cast<const uint8_t*>(&bits_head), input_indexes,
- &num_indexes_head, indexes);
- int num_indexes_tail = 0;
- if (num_bits > bits_in_first_byte) {
- bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
- bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail,
- indexes + num_indexes_head);
- }
- *num_indexes = num_indexes_head + num_indexes_tail;
- return;
- }
-
- if (bit_to_search == 0) {
- bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes,
- num_indexes, indexes);
- } else {
- ARROW_DCHECK(bit_to_search == 1);
- bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes,
- num_indexes, indexes);
- }
-}
-
-void BitUtil::bits_split_indexes(int64_t hardware_flags, const int num_bits,
- const uint8_t* bits, int* num_indexes_bit0,
- uint16_t* indexes_bit0, uint16_t* indexes_bit1,
- int bit_offset) {
- bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0,
- bit_offset);
- int num_indexes_bit1;
- bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1,
- bit_offset);
-}
-
-void BitUtil::bits_to_bytes(int64_t hardware_flags, const int num_bits,
- const uint8_t* bits, uint8_t* bytes, int bit_offset) {
- bits += bit_offset / 8;
- bit_offset %= 8;
- if (bit_offset != 0) {
- uint64_t bits_head =
- util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
- int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
- bits_to_bytes(hardware_flags, bits_in_first_byte,
- reinterpret_cast<const uint8_t*>(&bits_head), bytes);
- if (num_bits > bits_in_first_byte) {
- bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1,
- bytes + bits_in_first_byte);
- }
- return;
- }
-
- int num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- // The function call below processes whole 32 bit chunks together.
- num_processed = num_bits - (num_bits % 32);
- bits_to_bytes_avx2(num_processed, bits, bytes);
- }
-#endif
-  // Process 8 bits at a time
- constexpr int unroll = 8;
- for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) {
- uint8_t bits_next = bits[i];
-    // Clear the lowest bit, then make 7 copies of the remaining 7 bits, each shifted
-    // 7 bits further left, so that input bit k ends up at bit position 8 * k.
- uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
- ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
- (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
- unpacked |= (bits_next & 1);
- unpacked &= 0x0101010101010101ULL;
- unpacked *= 255;
- util::SafeStore(&reinterpret_cast<uint64_t*>(bytes)[i], unpacked);
- }
-}
-
-void BitUtil::bytes_to_bits(int64_t hardware_flags, const int num_bits,
- const uint8_t* bytes, uint8_t* bits, int bit_offset) {
- bits += bit_offset / 8;
- bit_offset %= 8;
- if (bit_offset != 0) {
- uint64_t bits_head;
- int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
- bytes_to_bits(hardware_flags, bits_in_first_byte, bytes,
- reinterpret_cast<uint8_t*>(&bits_head));
- uint8_t mask = (1 << bit_offset) - 1;
- *bits = static_cast<uint8_t>((*bits & mask) | (bits_head << bit_offset));
-
- if (num_bits > bits_in_first_byte) {
- bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte,
- bytes + bits_in_first_byte, bits + 1);
- }
- return;
- }
-
- int num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- // The function call below processes whole 32 bit chunks together.
- num_processed = num_bits - (num_bits % 32);
- bytes_to_bits_avx2(num_processed, bytes, bits);
- }
-#endif
- // Process 8 bits at a time
- constexpr int unroll = 8;
- for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) {
- uint64_t bytes_next = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
- bytes_next &= 0x0101010101010101ULL;
- bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes
- bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes
- bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte
- bits[i] = static_cast<uint8_t>(bytes_next & 0xff);
- }
-}
-
-bool BitUtil::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
- uint32_t num_bytes) {
-#if defined(ARROW_HAVE_AVX2)
- if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
- return are_all_bytes_zero_avx2(bytes, num_bytes);
- }
-#endif
- uint64_t result_or = 0;
- uint32_t i;
- for (i = 0; i < num_bytes / 8; ++i) {
- uint64_t x = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
- result_or |= x;
- }
- if (num_bytes % 8 > 0) {
- uint64_t tail = 0;
- result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8);
- }
- return result_or == 0;
-}
-
-} // namespace util
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/exec/util.h"
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+
+using BitUtil::CountTrailingZeros;
+
+namespace util {
+
+inline void BitUtil::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
+ int* num_indexes, uint16_t* indexes) {
+ int n = *num_indexes;
+ while (word) {
+ indexes[n++] = base_index + static_cast<uint16_t>(CountTrailingZeros(word));
+ word &= word - 1;
+ }
+ *num_indexes = n;
+}
+
+inline void BitUtil::bits_filter_indexes_helper(uint64_t word,
+ const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes) {
+ int n = *num_indexes;
+ while (word) {
+ indexes[n++] = input_indexes[CountTrailingZeros(word)];
+ word &= word - 1;
+ }
+ *num_indexes = n;
+}
+
+template <int bit_to_search, bool filter_input_indexes>
+void BitUtil::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes) {
+ // 64 bits at a time
+ constexpr int unroll = 64;
+ int tail = num_bits % unroll;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ if (filter_input_indexes) {
+ bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes,
+ num_indexes, indexes);
+ } else {
+ bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes);
+ }
+ } else {
+#endif
+ *num_indexes = 0;
+ for (int i = 0; i < num_bits / unroll; ++i) {
+ uint64_t word = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[i]);
+ if (bit_to_search == 0) {
+ word = ~word;
+ }
+ if (filter_input_indexes) {
+ bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes);
+ } else {
+ bits_to_indexes_helper(word, i * 64, num_indexes, indexes);
+ }
+ }
+#if defined(ARROW_HAVE_AVX2)
+ }
+#endif
+  // Process the last partial word, if any, masking out the bits outside the range
+ if (tail) {
+ uint64_t word =
+ util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[num_bits / unroll]);
+ if (bit_to_search == 0) {
+ word = ~word;
+ }
+ word &= ~0ULL >> (64 - tail);
+ if (filter_input_indexes) {
+ bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes,
+ indexes);
+ } else {
+ bits_to_indexes_helper(word, num_bits - tail, num_indexes, indexes);
+ }
+ }
+}
+
+void BitUtil::bits_to_indexes(int bit_to_search, int64_t hardware_flags,
+ const int num_bits, const uint8_t* bits, int* num_indexes,
+ uint16_t* indexes, int bit_offset) {
+ bits += bit_offset / 8;
+ bit_offset %= 8;
+ if (bit_offset != 0) {
+ int num_indexes_head = 0;
+ uint64_t bits_head =
+ util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
+ int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+ bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
+ reinterpret_cast<const uint8_t*>(&bits_head), &num_indexes_head,
+ indexes);
+ int num_indexes_tail = 0;
+ if (num_bits > bits_in_first_byte) {
+ bits_to_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
+ bits + 1, &num_indexes_tail, indexes + num_indexes_head);
+ }
+ *num_indexes = num_indexes_head + num_indexes_tail;
+ return;
+ }
+
+ if (bit_to_search == 0) {
+ bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr,
+ num_indexes, indexes);
+ } else {
+ ARROW_DCHECK(bit_to_search == 1);
+ bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr,
+ num_indexes, indexes);
+ }
+}
+
+void BitUtil::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+ const int num_bits, const uint8_t* bits,
+ const uint16_t* input_indexes, int* num_indexes,
+ uint16_t* indexes, int bit_offset) {
+ bits += bit_offset / 8;
+ bit_offset %= 8;
+ if (bit_offset != 0) {
+ int num_indexes_head = 0;
+ uint64_t bits_head =
+ util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
+ int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+ bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
+ reinterpret_cast<const uint8_t*>(&bits_head), input_indexes,
+ &num_indexes_head, indexes);
+ int num_indexes_tail = 0;
+ if (num_bits > bits_in_first_byte) {
+ bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
+ bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail,
+ indexes + num_indexes_head);
+ }
+ *num_indexes = num_indexes_head + num_indexes_tail;
+ return;
+ }
+
+ if (bit_to_search == 0) {
+ bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes,
+ num_indexes, indexes);
+ } else {
+ ARROW_DCHECK(bit_to_search == 1);
+ bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes,
+ num_indexes, indexes);
+ }
+}
+
+void BitUtil::bits_split_indexes(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, int* num_indexes_bit0,
+ uint16_t* indexes_bit0, uint16_t* indexes_bit1,
+ int bit_offset) {
+ bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0,
+ bit_offset);
+ int num_indexes_bit1;
+ bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1,
+ bit_offset);
+}
+
+void BitUtil::bits_to_bytes(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, uint8_t* bytes, int bit_offset) {
+ bits += bit_offset / 8;
+ bit_offset %= 8;
+ if (bit_offset != 0) {
+ uint64_t bits_head =
+ util::SafeLoad(reinterpret_cast<const uint64_t*>(bits)) >> bit_offset;
+ int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+ bits_to_bytes(hardware_flags, bits_in_first_byte,
+ reinterpret_cast<const uint8_t*>(&bits_head), bytes);
+ if (num_bits > bits_in_first_byte) {
+ bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1,
+ bytes + bits_in_first_byte);
+ }
+ return;
+ }
+
+ int num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ // The function call below processes whole 32 bit chunks together.
+ num_processed = num_bits - (num_bits % 32);
+ bits_to_bytes_avx2(num_processed, bits, bytes);
+ }
+#endif
+  // Process 8 bits at a time
+ constexpr int unroll = 8;
+ for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) {
+ uint8_t bits_next = bits[i];
+    // Clear the lowest bit, then make 7 copies of the remaining 7 bits, each shifted
+    // 7 bits further left, so that input bit k ends up at bit position 8 * k.
+ uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
+ ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
+ (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
+ unpacked |= (bits_next & 1);
+ unpacked &= 0x0101010101010101ULL;
+ unpacked *= 255;
+ util::SafeStore(&reinterpret_cast<uint64_t*>(bytes)[i], unpacked);
+ }
+}
+
+void BitUtil::bytes_to_bits(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bytes, uint8_t* bits, int bit_offset) {
+ bits += bit_offset / 8;
+ bit_offset %= 8;
+ if (bit_offset != 0) {
+ uint64_t bits_head;
+ int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+ bytes_to_bits(hardware_flags, bits_in_first_byte, bytes,
+ reinterpret_cast<uint8_t*>(&bits_head));
+ uint8_t mask = (1 << bit_offset) - 1;
+ *bits = static_cast<uint8_t>((*bits & mask) | (bits_head << bit_offset));
+
+ if (num_bits > bits_in_first_byte) {
+ bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte,
+ bytes + bits_in_first_byte, bits + 1);
+ }
+ return;
+ }
+
+ int num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ // The function call below processes whole 32 bit chunks together.
+ num_processed = num_bits - (num_bits % 32);
+ bytes_to_bits_avx2(num_processed, bytes, bits);
+ }
+#endif
+ // Process 8 bits at a time
+ constexpr int unroll = 8;
+ for (int i = num_processed / unroll; i < (num_bits + unroll - 1) / unroll; ++i) {
+ uint64_t bytes_next = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
+ bytes_next &= 0x0101010101010101ULL;
+ bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes
+ bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes
+ bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte
+ bits[i] = static_cast<uint8_t>(bytes_next & 0xff);
+ }
+}
+
+bool BitUtil::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
+ uint32_t num_bytes) {
+#if defined(ARROW_HAVE_AVX2)
+ if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+ return are_all_bytes_zero_avx2(bytes, num_bytes);
+ }
+#endif
+ uint64_t result_or = 0;
+ uint32_t i;
+ for (i = 0; i < num_bytes / 8; ++i) {
+ uint64_t x = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
+ result_or |= x;
+ }
+ if (num_bytes % 8 > 0) {
+ uint64_t tail = 0;
+ result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8);
+ }
+ return result_or == 0;
+}
+
+} // namespace util
+} // namespace arrow
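
[Editorial note] The scalar fallback in bits_to_bytes above hinges on a multiply-and-mask trick; the standalone sketch below shows one input byte expanding into eight 0x00/0xFF bytes. A little-endian target is assumed (as it is by the SafeStore of a uint64_t in the vendored code); everything else is taken directly from the loop body above.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint8_t bits_next = 0b10110010;  // bits set at positions 1, 4, 5, 7
  // The multiply makes 7 copies of the 7 high bits, each shifted 7 bits
  // further, so input bit k lands at bit position 8 * k.
  uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
                      ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
                       (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
  unpacked |= (bits_next & 1);        // bit 0 goes to byte 0 directly
  unpacked &= 0x0101010101010101ULL;  // keep only bit k of byte k
  unpacked *= 255;                    // turn each 0x01 byte into 0xFF

  uint8_t bytes[8];
  std::memcpy(bytes, &unpacked, sizeof(bytes));  // little-endian assumed
  for (int i = 0; i < 8; ++i) std::printf("%02X ", bytes[i]);
  std::printf("\n");  // prints: 00 FF 00 00 FF FF 00 FF
  return 0;
}
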
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h
index 1025476ac63..471cc332220 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec/util.h
@@ -1,171 +1,171 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <vector>
-
-#include "arrow/buffer.h"
-#include "arrow/memory_pool.h"
-#include "arrow/result.h"
-#include "arrow/status.h"
-#include "arrow/util/cpu_info.h"
-#include "arrow/util/logging.h"
-
-#if defined(__clang__) || defined(__GNUC__)
-#define BYTESWAP(x) __builtin_bswap64(x)
-#define ROTL(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
-#elif defined(_MSC_VER)
-#include <intrin.h>
-#define BYTESWAP(x) _byteswap_uint64(x)
-#define ROTL(x, n) _rotl((x), (n))
-#endif
-
-namespace arrow {
-namespace util {
-
-// Some platforms typedef int64_t as long int instead of long long int,
-// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64
-// intrinsics, since they expect long long arguments.
-// We cast to the type below in these intrinsics to make the code
-// compile in all cases.
-//
-using int64_for_gather_t = const long long int; // NOLINT runtime-int
-
-/// Storage used to allocate temporary vectors of a batch size.
-/// Temporary vectors resemble temporary variables allocated on the stack,
-/// except that in vectorized processing we need a vector of temporaries
-/// instead of a single value.
-class TempVectorStack {
- template <typename>
- friend class TempVectorHolder;
-
- public:
- Status Init(MemoryPool* pool, int64_t size) {
- num_vectors_ = 0;
- top_ = 0;
- buffer_size_ = size;
- ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool));
- buffer_ = std::move(buffer);
- return Status::OK();
- }
-
- private:
- void alloc(uint32_t num_bytes, uint8_t** data, int* id) {
- int64_t old_top = top_;
- top_ += num_bytes + padding;
- // Stack overflow check
- ARROW_DCHECK(top_ <= buffer_size_);
- *data = buffer_->mutable_data() + old_top;
- *id = num_vectors_++;
- }
- void release(int id, uint32_t num_bytes) {
- ARROW_DCHECK(num_vectors_ == id + 1);
- int64_t size = num_bytes + padding;
- ARROW_DCHECK(top_ >= size);
- top_ -= size;
- --num_vectors_;
- }
- static constexpr int64_t padding = 64;
- int num_vectors_;
- int64_t top_;
- std::unique_ptr<Buffer> buffer_;
- int64_t buffer_size_;
-};
-
-template <typename T>
-class TempVectorHolder {
- friend class TempVectorStack;
-
- public:
- ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); }
- T* mutable_data() { return reinterpret_cast<T*>(data_); }
- TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) {
- stack_ = stack;
- num_elements_ = num_elements;
- stack_->alloc(num_elements * sizeof(T), &data_, &id_);
- }
-
- private:
- TempVectorStack* stack_;
- uint8_t* data_;
- int id_;
- uint32_t num_elements_;
-};
-
-class BitUtil {
- public:
- static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
- const int num_bits, const uint8_t* bits, int* num_indexes,
- uint16_t* indexes, int bit_offset = 0);
-
- static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
- const int num_bits, const uint8_t* bits,
- const uint16_t* input_indexes, int* num_indexes,
- uint16_t* indexes, int bit_offset = 0);
-
- // Input and output indexes may be pointing to the same data (in-place filtering).
- static void bits_split_indexes(int64_t hardware_flags, const int num_bits,
- const uint8_t* bits, int* num_indexes_bit0,
- uint16_t* indexes_bit0, uint16_t* indexes_bit1,
- int bit_offset = 0);
-
-  // Expands each bit into a byte: a set bit becomes 0xFF, a clear bit becomes 0x00.
- static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
- const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);
-
- // Return highest bit of each byte.
- static void bytes_to_bits(int64_t hardware_flags, const int num_bits,
- const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);
-
- static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
- uint32_t num_bytes);
-
- private:
- inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index,
- int* num_indexes, uint16_t* indexes);
- inline static void bits_filter_indexes_helper(uint64_t word,
- const uint16_t* input_indexes,
- int* num_indexes, uint16_t* indexes);
- template <int bit_to_search, bool filter_input_indexes>
- static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
- const uint8_t* bits, const uint16_t* input_indexes,
- int* num_indexes, uint16_t* indexes);
-
-#if defined(ARROW_HAVE_AVX2)
- static void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
- const uint8_t* bits, int* num_indexes,
- uint16_t* indexes);
- static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
- const uint8_t* bits, const uint16_t* input_indexes,
- int* num_indexes, uint16_t* indexes);
- template <int bit_to_search>
- static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
- int* num_indexes, uint16_t* indexes);
- template <int bit_to_search>
- static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
- const uint16_t* input_indexes,
- int* num_indexes, uint16_t* indexes);
- static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes);
- static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits);
- static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
-#endif
-};
-
-} // namespace util
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/logging.h"
+
+#if defined(__clang__) || defined(__GNUC__)
+#define BYTESWAP(x) __builtin_bswap64(x)
+#define ROTL(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#define BYTESWAP(x) _byteswap_uint64(x)
+#define ROTL(x, n) _rotl((x), (n))
+#endif
+
+namespace arrow {
+namespace util {
+
+// Some platforms typedef int64_t as long int instead of long long int,
+// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64
+// intrinsics, since they expect long long arguments.
+// We cast to the type below in these intrinsics to make the code
+// compile in all cases.
+//
+using int64_for_gather_t = const long long int; // NOLINT runtime-int
+
+/// Storage used to allocate temporary vectors of a batch size.
+/// Temporary vectors resemble temporary variables allocated on the stack,
+/// except that in vectorized processing we need a vector of temporaries
+/// instead of a single value.
+class TempVectorStack {
+ template <typename>
+ friend class TempVectorHolder;
+
+ public:
+ Status Init(MemoryPool* pool, int64_t size) {
+ num_vectors_ = 0;
+ top_ = 0;
+ buffer_size_ = size;
+ ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool));
+ buffer_ = std::move(buffer);
+ return Status::OK();
+ }
+
+ private:
+ void alloc(uint32_t num_bytes, uint8_t** data, int* id) {
+ int64_t old_top = top_;
+ top_ += num_bytes + padding;
+ // Stack overflow check
+ ARROW_DCHECK(top_ <= buffer_size_);
+ *data = buffer_->mutable_data() + old_top;
+ *id = num_vectors_++;
+ }
+ void release(int id, uint32_t num_bytes) {
+ ARROW_DCHECK(num_vectors_ == id + 1);
+ int64_t size = num_bytes + padding;
+ ARROW_DCHECK(top_ >= size);
+ top_ -= size;
+ --num_vectors_;
+ }
+ static constexpr int64_t padding = 64;
+ int num_vectors_;
+ int64_t top_;
+ std::unique_ptr<Buffer> buffer_;
+ int64_t buffer_size_;
+};
+
+template <typename T>
+class TempVectorHolder {
+ friend class TempVectorStack;
+
+ public:
+ ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); }
+ T* mutable_data() { return reinterpret_cast<T*>(data_); }
+ TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) {
+ stack_ = stack;
+ num_elements_ = num_elements;
+ stack_->alloc(num_elements * sizeof(T), &data_, &id_);
+ }
+
+ private:
+ TempVectorStack* stack_;
+ uint8_t* data_;
+ int id_;
+ uint32_t num_elements_;
+};
+
+class BitUtil {
+ public:
+ static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
+ const int num_bits, const uint8_t* bits, int* num_indexes,
+ uint16_t* indexes, int bit_offset = 0);
+
+ static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+ const int num_bits, const uint8_t* bits,
+ const uint16_t* input_indexes, int* num_indexes,
+ uint16_t* indexes, int bit_offset = 0);
+
+ // Input and output indexes may be pointing to the same data (in-place filtering).
+ static void bits_split_indexes(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, int* num_indexes_bit0,
+ uint16_t* indexes_bit0, uint16_t* indexes_bit1,
+ int bit_offset = 0);
+
+  // Expands each bit into a byte: a set bit becomes 0xFF, a clear bit becomes 0x00.
+ static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);
+
+ // Return highest bit of each byte.
+ static void bytes_to_bits(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);
+
+ static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
+ uint32_t num_bytes);
+
+ private:
+ inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index,
+ int* num_indexes, uint16_t* indexes);
+ inline static void bits_filter_indexes_helper(uint64_t word,
+ const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes);
+ template <int bit_to_search, bool filter_input_indexes>
+ static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
+ const uint8_t* bits, const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes);
+
+#if defined(ARROW_HAVE_AVX2)
+ static void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
+ const uint8_t* bits, int* num_indexes,
+ uint16_t* indexes);
+ static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
+ const uint8_t* bits, const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes);
+ template <int bit_to_search>
+ static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
+ int* num_indexes, uint16_t* indexes);
+ template <int bit_to_search>
+ static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
+ const uint16_t* input_indexes,
+ int* num_indexes, uint16_t* indexes);
+ static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes);
+ static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits);
+ static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
+#endif
+};
+
+} // namespace util
+} // namespace arrow
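
[Editorial note] A minimal usage sketch for the two helpers declared above: a TempVectorStack provides scratch space, a TempVectorHolder scopes one temporary vector, and BitUtil::bits_to_indexes collects positions of set bits into it. Passing 0 as hardware_flags forces the portable (non-AVX2) path; the function name and buffer sizes are illustrative, and the bitmap is padded to a whole 64-bit word because the implementation loads whole words.

#include <cstdint>

#include "arrow/compute/exec/util.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"

arrow::Status IndexesOfSetBits(arrow::MemoryPool* pool) {
  arrow::util::TempVectorStack stack;
  ARROW_RETURN_NOT_OK(stack.Init(pool, /*size=*/16 * 1024));

  // 16 valid bits, padded to 8 bytes since whole 64-bit words are read.
  const uint8_t bits[8] = {0b10110010, 0b00000001, 0, 0, 0, 0, 0, 0};
  arrow::util::TempVectorHolder<uint16_t> indexes(&stack, /*num_elements=*/16);
  int num_indexes = 0;
  arrow::util::BitUtil::bits_to_indexes(/*bit_to_search=*/1, /*hardware_flags=*/0,
                                        /*num_bits=*/16, bits, &num_indexes,
                                        indexes.mutable_data());
  // num_indexes == 5; the collected indexes are {1, 4, 5, 7, 8}.
  return arrow::Status::OK();
}  // the holder releases its scratch space here, LIFO order
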
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h
index abc9861537f..55daa243cd3 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/exec_internal.h
@@ -84,14 +84,14 @@ class ARROW_EXPORT ExecListener {
class DatumAccumulator : public ExecListener {
public:
- DatumAccumulator() = default;
+ DatumAccumulator() = default;
Status OnResult(Datum value) override {
values_.emplace_back(value);
return Status::OK();
}
- std::vector<Datum> values() { return std::move(values_); }
+ std::vector<Datum> values() { return std::move(values_); }
private:
std::vector<Datum> values_;
@@ -102,17 +102,17 @@ class DatumAccumulator : public ExecListener {
/// inputs will be split into non-chunked ExecBatch values for execution
Status CheckAllValues(const std::vector<Datum>& values);
-class ARROW_EXPORT KernelExecutor {
+class ARROW_EXPORT KernelExecutor {
public:
- virtual ~KernelExecutor() = default;
-
- /// The Kernel's `init` method must be called and any KernelState set in the
- /// KernelContext *before* KernelExecutor::Init is called. This is to facilitate
- /// the case where init may be expensive and does not need to be called again for
- /// each execution of the kernel, for example the same lookup table can be re-used
- /// for all scanned batches in a dataset filter.
- virtual Status Init(KernelContext*, KernelInitArgs) = 0;
-
+ virtual ~KernelExecutor() = default;
+
+ /// The Kernel's `init` method must be called and any KernelState set in the
+ /// KernelContext *before* KernelExecutor::Init is called. This is to facilitate
+ /// the case where init may be expensive and does not need to be called again for
+ /// each execution of the kernel, for example the same lookup table can be re-used
+ /// for all scanned batches in a dataset filter.
+ virtual Status Init(KernelContext*, KernelInitArgs) = 0;
+
/// XXX: Better configurability for listener
/// Not thread-safe
virtual Status Execute(const std::vector<Datum>& args, ExecListener* listener) = 0;
@@ -120,9 +120,9 @@ class ARROW_EXPORT KernelExecutor {
virtual Datum WrapResults(const std::vector<Datum>& args,
const std::vector<Datum>& outputs) = 0;
- static std::unique_ptr<KernelExecutor> MakeScalar();
- static std::unique_ptr<KernelExecutor> MakeVector();
- static std::unique_ptr<KernelExecutor> MakeScalarAggregate();
+ static std::unique_ptr<KernelExecutor> MakeScalar();
+ static std::unique_ptr<KernelExecutor> MakeVector();
+ static std::unique_ptr<KernelExecutor> MakeScalarAggregate();
};
/// \brief Populate validity bitmap with the intersection of the nullity of the
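
[Editorial note] The init-before-Init contract documented on KernelExecutor above is easiest to see as code. The sketch below mirrors the sequence Function::Execute uses in the next file's diff; the helper name is illustrative, and the kernel, inputs and options are assumed to come from dispatch as shown there.

#include <memory>
#include <vector>

#include "arrow/compute/exec_internal.h"

arrow::Status PrepareScalarExecutor(
    arrow::compute::ExecContext* ctx, const arrow::compute::Kernel* kernel,
    const std::vector<arrow::compute::ValueDescr>& inputs,
    const arrow::compute::FunctionOptions* options) {
  std::unique_ptr<arrow::compute::KernelState> state;
  arrow::compute::KernelContext kernel_ctx{ctx};
  // Run the kernel's own init first and publish its state...
  if (kernel->init) {
    ARROW_ASSIGN_OR_RAISE(state, kernel->init(&kernel_ctx, {kernel, inputs, options}));
    kernel_ctx.SetState(state.get());
  }
  // ...and only then initialize the executor.
  // NOTE: in real use, `state` must outlive all Execute() calls.
  auto executor = arrow::compute::detail::KernelExecutor::MakeScalar();
  return executor->Init(&kernel_ctx, {kernel, inputs, options});
}
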
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc
index 1958f442849..05d14d03b16 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.cc
@@ -21,108 +21,108 @@
#include <memory>
#include <sstream>
-#include "arrow/compute/api_scalar.h"
-#include "arrow/compute/cast.h"
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/cast.h"
#include "arrow/compute/exec.h"
#include "arrow/compute/exec_internal.h"
-#include "arrow/compute/function_internal.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/registry.h"
+#include "arrow/compute/function_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/registry.h"
#include "arrow/datum.h"
#include "arrow/util/cpu_info.h"
namespace arrow {
-
-using internal::checked_cast;
-
+
+using internal::checked_cast;
+
namespace compute {
-Result<std::shared_ptr<Buffer>> FunctionOptionsType::Serialize(
- const FunctionOptions&) const {
- return Status::NotImplemented("Serialize for ", type_name());
-}
-
-Result<std::unique_ptr<FunctionOptions>> FunctionOptionsType::Deserialize(
- const Buffer& buffer) const {
- return Status::NotImplemented("Deserialize for ", type_name());
-}
-
-std::string FunctionOptions::ToString() const { return options_type()->Stringify(*this); }
-
-bool FunctionOptions::Equals(const FunctionOptions& other) const {
- if (this == &other) return true;
- if (options_type() != other.options_type()) return false;
- return options_type()->Compare(*this, other);
-}
-
-Result<std::shared_ptr<Buffer>> FunctionOptions::Serialize() const {
- return options_type()->Serialize(*this);
-}
-
-Result<std::unique_ptr<FunctionOptions>> FunctionOptions::Deserialize(
- const std::string& type_name, const Buffer& buffer) {
- ARROW_ASSIGN_OR_RAISE(auto options,
- GetFunctionRegistry()->GetFunctionOptionsType(type_name));
- return options->Deserialize(buffer);
-}
-
-void PrintTo(const FunctionOptions& options, std::ostream* os) {
- *os << options.ToString();
-}
-
-static const FunctionDoc kEmptyFunctionDoc{};
-
-const FunctionDoc& FunctionDoc::Empty() { return kEmptyFunctionDoc; }
-
-static Status CheckArityImpl(const Function* function, int passed_num_args,
- const char* passed_num_args_label) {
- if (function->arity().is_varargs && passed_num_args < function->arity().num_args) {
- return Status::Invalid("VarArgs function ", function->name(), " needs at least ",
- function->arity().num_args, " arguments but ",
- passed_num_args_label, " only ", passed_num_args);
+Result<std::shared_ptr<Buffer>> FunctionOptionsType::Serialize(
+ const FunctionOptions&) const {
+ return Status::NotImplemented("Serialize for ", type_name());
+}
+
+Result<std::unique_ptr<FunctionOptions>> FunctionOptionsType::Deserialize(
+ const Buffer& buffer) const {
+ return Status::NotImplemented("Deserialize for ", type_name());
+}
+
+std::string FunctionOptions::ToString() const { return options_type()->Stringify(*this); }
+
+bool FunctionOptions::Equals(const FunctionOptions& other) const {
+ if (this == &other) return true;
+ if (options_type() != other.options_type()) return false;
+ return options_type()->Compare(*this, other);
+}
+
+Result<std::shared_ptr<Buffer>> FunctionOptions::Serialize() const {
+ return options_type()->Serialize(*this);
+}
+
+Result<std::unique_ptr<FunctionOptions>> FunctionOptions::Deserialize(
+ const std::string& type_name, const Buffer& buffer) {
+ ARROW_ASSIGN_OR_RAISE(auto options,
+ GetFunctionRegistry()->GetFunctionOptionsType(type_name));
+ return options->Deserialize(buffer);
+}
+
+void PrintTo(const FunctionOptions& options, std::ostream* os) {
+ *os << options.ToString();
+}
+
+static const FunctionDoc kEmptyFunctionDoc{};
+
+const FunctionDoc& FunctionDoc::Empty() { return kEmptyFunctionDoc; }
+
+static Status CheckArityImpl(const Function* function, int passed_num_args,
+ const char* passed_num_args_label) {
+ if (function->arity().is_varargs && passed_num_args < function->arity().num_args) {
+ return Status::Invalid("VarArgs function ", function->name(), " needs at least ",
+ function->arity().num_args, " arguments but ",
+ passed_num_args_label, " only ", passed_num_args);
}
-
- if (!function->arity().is_varargs && passed_num_args != function->arity().num_args) {
- return Status::Invalid("Function ", function->name(), " accepts ",
- function->arity().num_args, " arguments but ",
- passed_num_args_label, " ", passed_num_args);
- }
-
+
+ if (!function->arity().is_varargs && passed_num_args != function->arity().num_args) {
+ return Status::Invalid("Function ", function->name(), " accepts ",
+ function->arity().num_args, " arguments but ",
+ passed_num_args_label, " ", passed_num_args);
+ }
+
return Status::OK();
}
-Status Function::CheckArity(const std::vector<InputType>& in_types) const {
- return CheckArityImpl(this, static_cast<int>(in_types.size()), "kernel accepts");
+Status Function::CheckArity(const std::vector<InputType>& in_types) const {
+ return CheckArityImpl(this, static_cast<int>(in_types.size()), "kernel accepts");
+}
+
+Status Function::CheckArity(const std::vector<ValueDescr>& descrs) const {
+ return CheckArityImpl(this, static_cast<int>(descrs.size()),
+ "attempted to look up kernel(s) with");
+}
+
+namespace detail {
+
+Status NoMatchingKernel(const Function* func, const std::vector<ValueDescr>& descrs) {
+ return Status::NotImplemented("Function ", func->name(),
+ " has no kernel matching input types ",
+ ValueDescr::ToString(descrs));
}
-Status Function::CheckArity(const std::vector<ValueDescr>& descrs) const {
- return CheckArityImpl(this, static_cast<int>(descrs.size()),
- "attempted to look up kernel(s) with");
-}
-
-namespace detail {
-
-Status NoMatchingKernel(const Function* func, const std::vector<ValueDescr>& descrs) {
- return Status::NotImplemented("Function ", func->name(),
- " has no kernel matching input types ",
- ValueDescr::ToString(descrs));
-}
-
-template <typename KernelType>
-const KernelType* DispatchExactImpl(const std::vector<KernelType*>& kernels,
- const std::vector<ValueDescr>& values) {
- const KernelType* kernel_matches[SimdLevel::MAX] = {nullptr};
-
+template <typename KernelType>
+const KernelType* DispatchExactImpl(const std::vector<KernelType*>& kernels,
+ const std::vector<ValueDescr>& values) {
+ const KernelType* kernel_matches[SimdLevel::MAX] = {nullptr};
+
// Validate arity
for (const auto& kernel : kernels) {
- if (kernel->signature->MatchesInputs(values)) {
- kernel_matches[kernel->simd_level] = kernel;
+ if (kernel->signature->MatchesInputs(values)) {
+ kernel_matches[kernel->simd_level] = kernel;
}
}
// Dispatch as the CPU feature
-#if defined(ARROW_HAVE_RUNTIME_AVX512) || defined(ARROW_HAVE_RUNTIME_AVX2)
+#if defined(ARROW_HAVE_RUNTIME_AVX512) || defined(ARROW_HAVE_RUNTIME_AVX2)
auto cpu_info = arrow::internal::CpuInfo::GetInstance();
-#endif
+#endif
#if defined(ARROW_HAVE_RUNTIME_AVX512)
if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX512)) {
if (kernel_matches[SimdLevel::AVX512]) {
@@ -141,54 +141,54 @@ const KernelType* DispatchExactImpl(const std::vector<KernelType*>& kernels,
return kernel_matches[SimdLevel::NONE];
}
- return nullptr;
+ return nullptr;
+}
+
+const Kernel* DispatchExactImpl(const Function* func,
+ const std::vector<ValueDescr>& values) {
+ if (func->kind() == Function::SCALAR) {
+ return DispatchExactImpl(checked_cast<const ScalarFunction*>(func)->kernels(),
+ values);
+ }
+
+ if (func->kind() == Function::VECTOR) {
+ return DispatchExactImpl(checked_cast<const VectorFunction*>(func)->kernels(),
+ values);
+ }
+
+ if (func->kind() == Function::SCALAR_AGGREGATE) {
+ return DispatchExactImpl(
+ checked_cast<const ScalarAggregateFunction*>(func)->kernels(), values);
+ }
+
+ if (func->kind() == Function::HASH_AGGREGATE) {
+ return DispatchExactImpl(checked_cast<const HashAggregateFunction*>(func)->kernels(),
+ values);
+ }
+
+ return nullptr;
+}
+
+} // namespace detail
+
+Result<const Kernel*> Function::DispatchExact(
+ const std::vector<ValueDescr>& values) const {
+ if (kind_ == Function::META) {
+ return Status::NotImplemented("Dispatch for a MetaFunction's Kernels");
+ }
+ RETURN_NOT_OK(CheckArity(values));
+
+ if (auto kernel = detail::DispatchExactImpl(this, values)) {
+ return kernel;
+ }
+ return detail::NoMatchingKernel(this, values);
+}
+
+Result<const Kernel*> Function::DispatchBest(std::vector<ValueDescr>* values) const {
+ // TODO(ARROW-11508) permit generic conversions here
+ return DispatchExact(*values);
}
-const Kernel* DispatchExactImpl(const Function* func,
- const std::vector<ValueDescr>& values) {
- if (func->kind() == Function::SCALAR) {
- return DispatchExactImpl(checked_cast<const ScalarFunction*>(func)->kernels(),
- values);
- }
-
- if (func->kind() == Function::VECTOR) {
- return DispatchExactImpl(checked_cast<const VectorFunction*>(func)->kernels(),
- values);
- }
-
- if (func->kind() == Function::SCALAR_AGGREGATE) {
- return DispatchExactImpl(
- checked_cast<const ScalarAggregateFunction*>(func)->kernels(), values);
- }
-
- if (func->kind() == Function::HASH_AGGREGATE) {
- return DispatchExactImpl(checked_cast<const HashAggregateFunction*>(func)->kernels(),
- values);
- }
-
- return nullptr;
-}
-
-} // namespace detail
-
-Result<const Kernel*> Function::DispatchExact(
- const std::vector<ValueDescr>& values) const {
- if (kind_ == Function::META) {
- return Status::NotImplemented("Dispatch for a MetaFunction's Kernels");
- }
- RETURN_NOT_OK(CheckArity(values));
-
- if (auto kernel = detail::DispatchExactImpl(this, values)) {
- return kernel;
- }
- return detail::NoMatchingKernel(this, values);
-}
-
-Result<const Kernel*> Function::DispatchBest(std::vector<ValueDescr>* values) const {
- // TODO(ARROW-11508) permit generic conversions here
- return DispatchExact(*values);
-}
-
Result<Datum> Function::Execute(const std::vector<Datum>& args,
const FunctionOptions* options, ExecContext* ctx) const {
if (options == nullptr) {
@@ -198,63 +198,63 @@ Result<Datum> Function::Execute(const std::vector<Datum>& args,
ExecContext default_ctx;
return Execute(args, options, &default_ctx);
}
-
+
// type-check Datum arguments here. Really we'd like to avoid this as much as
// possible
RETURN_NOT_OK(detail::CheckAllValues(args));
- std::vector<ValueDescr> inputs(args.size());
- for (size_t i = 0; i != args.size(); ++i) {
- inputs[i] = args[i].descr();
- }
-
- ARROW_ASSIGN_OR_RAISE(auto kernel, DispatchBest(&inputs));
- ARROW_ASSIGN_OR_RAISE(auto implicitly_cast_args, Cast(args, inputs, ctx));
-
- std::unique_ptr<KernelState> state;
-
- KernelContext kernel_ctx{ctx};
- if (kernel->init) {
- ARROW_ASSIGN_OR_RAISE(state, kernel->init(&kernel_ctx, {kernel, inputs, options}));
- kernel_ctx.SetState(state.get());
- }
-
- std::unique_ptr<detail::KernelExecutor> executor;
- if (kind() == Function::SCALAR) {
- executor = detail::KernelExecutor::MakeScalar();
- } else if (kind() == Function::VECTOR) {
- executor = detail::KernelExecutor::MakeVector();
- } else if (kind() == Function::SCALAR_AGGREGATE) {
- executor = detail::KernelExecutor::MakeScalarAggregate();
- } else {
- return Status::NotImplemented("Direct execution of HASH_AGGREGATE functions");
- }
- RETURN_NOT_OK(executor->Init(&kernel_ctx, {kernel, inputs, options}));
-
+ std::vector<ValueDescr> inputs(args.size());
+ for (size_t i = 0; i != args.size(); ++i) {
+ inputs[i] = args[i].descr();
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto kernel, DispatchBest(&inputs));
+ ARROW_ASSIGN_OR_RAISE(auto implicitly_cast_args, Cast(args, inputs, ctx));
+
+ std::unique_ptr<KernelState> state;
+
+ KernelContext kernel_ctx{ctx};
+ if (kernel->init) {
+ ARROW_ASSIGN_OR_RAISE(state, kernel->init(&kernel_ctx, {kernel, inputs, options}));
+ kernel_ctx.SetState(state.get());
+ }
+
+ std::unique_ptr<detail::KernelExecutor> executor;
+ if (kind() == Function::SCALAR) {
+ executor = detail::KernelExecutor::MakeScalar();
+ } else if (kind() == Function::VECTOR) {
+ executor = detail::KernelExecutor::MakeVector();
+ } else if (kind() == Function::SCALAR_AGGREGATE) {
+ executor = detail::KernelExecutor::MakeScalarAggregate();
+ } else {
+ return Status::NotImplemented("Direct execution of HASH_AGGREGATE functions");
+ }
+ RETURN_NOT_OK(executor->Init(&kernel_ctx, {kernel, inputs, options}));
+
auto listener = std::make_shared<detail::DatumAccumulator>();
- RETURN_NOT_OK(executor->Execute(implicitly_cast_args, listener.get()));
- return executor->WrapResults(implicitly_cast_args, listener->values());
+ RETURN_NOT_OK(executor->Execute(implicitly_cast_args, listener.get()));
+ return executor->WrapResults(implicitly_cast_args, listener->values());
+}
+
+Status Function::Validate() const {
+ if (!doc_->summary.empty()) {
+ // Documentation given, check its contents
+ int arg_count = static_cast<int>(doc_->arg_names.size());
+ if (arg_count == arity_.num_args) {
+ return Status::OK();
+ }
+ if (arity_.is_varargs && arg_count == arity_.num_args + 1) {
+ return Status::OK();
+ }
+ return Status::Invalid(
+ "In function '", name_,
+ "': ", "number of argument names for function documentation != function arity");
+ }
+ return Status::OK();
}
-Status Function::Validate() const {
- if (!doc_->summary.empty()) {
- // Documentation given, check its contents
- int arg_count = static_cast<int>(doc_->arg_names.size());
- if (arg_count == arity_.num_args) {
- return Status::OK();
- }
- if (arity_.is_varargs && arg_count == arity_.num_args + 1) {
- return Status::OK();
- }
- return Status::Invalid(
- "In function '", name_,
- "': ", "number of argument names for function documentation != function arity");
- }
- return Status::OK();
-}
-
Status ScalarFunction::AddKernel(std::vector<InputType> in_types, OutputType out_type,
ArrayKernelExec exec, KernelInit init) {
- RETURN_NOT_OK(CheckArity(in_types));
+ RETURN_NOT_OK(CheckArity(in_types));
if (arity_.is_varargs && in_types.size() != 1) {
return Status::Invalid("VarArgs signatures must have exactly one input type");
@@ -266,7 +266,7 @@ Status ScalarFunction::AddKernel(std::vector<InputType> in_types, OutputType out
}
Status ScalarFunction::AddKernel(ScalarKernel kernel) {
- RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
+ RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
if (arity_.is_varargs && !kernel.signature->is_varargs()) {
return Status::Invalid("Function accepts varargs but kernel signature does not");
}
@@ -276,7 +276,7 @@ Status ScalarFunction::AddKernel(ScalarKernel kernel) {
Status VectorFunction::AddKernel(std::vector<InputType> in_types, OutputType out_type,
ArrayKernelExec exec, KernelInit init) {
- RETURN_NOT_OK(CheckArity(in_types));
+ RETURN_NOT_OK(CheckArity(in_types));
if (arity_.is_varargs && in_types.size() != 1) {
return Status::Invalid("VarArgs signatures must have exactly one input type");
@@ -288,7 +288,7 @@ Status VectorFunction::AddKernel(std::vector<InputType> in_types, OutputType out
}
Status VectorFunction::AddKernel(VectorKernel kernel) {
- RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
+ RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
if (arity_.is_varargs && !kernel.signature->is_varargs()) {
return Status::Invalid("Function accepts varargs but kernel signature does not");
}
@@ -297,7 +297,7 @@ Status VectorFunction::AddKernel(VectorKernel kernel) {
}
Status ScalarAggregateFunction::AddKernel(ScalarAggregateKernel kernel) {
- RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
+ RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
if (arity_.is_varargs && !kernel.signature->is_varargs()) {
return Status::Invalid("Function accepts varargs but kernel signature does not");
}
@@ -305,21 +305,21 @@ Status ScalarAggregateFunction::AddKernel(ScalarAggregateKernel kernel) {
return Status::OK();
}
-Status HashAggregateFunction::AddKernel(HashAggregateKernel kernel) {
- RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
- if (arity_.is_varargs && !kernel.signature->is_varargs()) {
- return Status::Invalid("Function accepts varargs but kernel signature does not");
- }
- kernels_.emplace_back(std::move(kernel));
- return Status::OK();
+Status HashAggregateFunction::AddKernel(HashAggregateKernel kernel) {
+ RETURN_NOT_OK(CheckArity(kernel.signature->in_types()));
+ if (arity_.is_varargs && !kernel.signature->is_varargs()) {
+ return Status::Invalid("Function accepts varargs but kernel signature does not");
+ }
+ kernels_.emplace_back(std::move(kernel));
+ return Status::OK();
}
Result<Datum> MetaFunction::Execute(const std::vector<Datum>& args,
const FunctionOptions* options,
ExecContext* ctx) const {
- RETURN_NOT_OK(
- CheckArityImpl(this, static_cast<int>(args.size()), "attempted to Execute with"));
-
+ RETURN_NOT_OK(
+ CheckArityImpl(this, static_cast<int>(args.size()), "attempted to Execute with"));
+
if (options == nullptr) {
options = default_options();
}
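
[Editorial note] As a usage note for the execution path in this hunk: the public entry point arrow::compute::CallFunction (declared in arrow/compute/exec.h, pulled in via arrow/compute/api.h) resolves a function by name in the default registry and funnels into the Function::Execute sequence shown above. A minimal sketch, with an illustrative helper name:

#include <cstdint>

#include "arrow/compute/api.h"
#include "arrow/scalar.h"

arrow::Result<arrow::Datum> AddTwoScalars() {
  auto a = arrow::MakeScalar(int64_t{40});
  auto b = arrow::MakeScalar(int64_t{2});
  // Dispatches to the int64 "add" kernel and returns a scalar Datum of 42.
  return arrow::compute::CallFunction("add", {arrow::Datum(a), arrow::Datum(b)});
}
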
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h
index e50ba155244..bd854bbb28e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function.h
@@ -29,7 +29,7 @@
#include "arrow/datum.h"
#include "arrow/result.h"
#include "arrow/status.h"
-#include "arrow/util/compare.h"
+#include "arrow/util/compare.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
@@ -40,50 +40,50 @@ namespace compute {
///
/// @{
-/// \brief Extension point for defining options outside libarrow (but
-/// still within this project).
-class ARROW_EXPORT FunctionOptionsType {
- public:
- virtual ~FunctionOptionsType() = default;
-
- virtual const char* type_name() const = 0;
- virtual std::string Stringify(const FunctionOptions&) const = 0;
- virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0;
- virtual Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const;
- virtual Result<std::unique_ptr<FunctionOptions>> Deserialize(
- const Buffer& buffer) const;
-};
-
+/// \brief Extension point for defining options outside libarrow (but
+/// still within this project).
+class ARROW_EXPORT FunctionOptionsType {
+ public:
+ virtual ~FunctionOptionsType() = default;
+
+ virtual const char* type_name() const = 0;
+ virtual std::string Stringify(const FunctionOptions&) const = 0;
+ virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0;
+ virtual Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const;
+ virtual Result<std::unique_ptr<FunctionOptions>> Deserialize(
+ const Buffer& buffer) const;
+};
+
/// \brief Base class for specifying options configuring a function's behavior,
/// such as error handling.
-class ARROW_EXPORT FunctionOptions : public util::EqualityComparable<FunctionOptions> {
- public:
- virtual ~FunctionOptions() = default;
-
- const FunctionOptionsType* options_type() const { return options_type_; }
- const char* type_name() const { return options_type()->type_name(); }
-
- bool Equals(const FunctionOptions& other) const;
- using util::EqualityComparable<FunctionOptions>::Equals;
- using util::EqualityComparable<FunctionOptions>::operator==;
- using util::EqualityComparable<FunctionOptions>::operator!=;
- std::string ToString() const;
- /// \brief Serialize an options struct to a buffer.
- Result<std::shared_ptr<Buffer>> Serialize() const;
- /// \brief Deserialize an options struct from a buffer.
- /// Note: this will only look for `type_name` in the default FunctionRegistry;
- /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then
- /// call FunctionOptionsType::Deserialize().
- static Result<std::unique_ptr<FunctionOptions>> Deserialize(
- const std::string& type_name, const Buffer& buffer);
-
- protected:
- explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {}
- const FunctionOptionsType* options_type_;
-};
-
-ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*);
-
+class ARROW_EXPORT FunctionOptions : public util::EqualityComparable<FunctionOptions> {
+ public:
+ virtual ~FunctionOptions() = default;
+
+ const FunctionOptionsType* options_type() const { return options_type_; }
+ const char* type_name() const { return options_type()->type_name(); }
+
+ bool Equals(const FunctionOptions& other) const;
+ using util::EqualityComparable<FunctionOptions>::Equals;
+ using util::EqualityComparable<FunctionOptions>::operator==;
+ using util::EqualityComparable<FunctionOptions>::operator!=;
+ std::string ToString() const;
+ /// \brief Serialize an options struct to a buffer.
+ Result<std::shared_ptr<Buffer>> Serialize() const;
+ /// \brief Deserialize an options struct from a buffer.
+ /// Note: this will only look for `type_name` in the default FunctionRegistry;
+ /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then
+ /// call FunctionOptionsType::Deserialize().
+ static Result<std::unique_ptr<FunctionOptions>> Deserialize(
+ const std::string& type_name, const Buffer& buffer);
+
+ protected:
+ explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {}
+ const FunctionOptionsType* options_type_;
+};
+
+ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*);
+
/// \brief Contains the number of required arguments for the function.
///
/// Naming conventions taken from https://en.wikipedia.org/wiki/Arity.
@@ -118,37 +118,37 @@ struct ARROW_EXPORT Arity {
bool is_varargs = false;
};
-struct ARROW_EXPORT FunctionDoc {
- /// \brief A one-line summary of the function, using a verb.
- ///
- /// For example, "Add two numeric arrays or scalars".
- std::string summary;
-
- /// \brief A detailed description of the function, meant to follow the summary.
- std::string description;
-
- /// \brief Symbolic names (identifiers) for the function arguments.
- ///
- /// Some bindings may use this to generate nicer function signatures.
- std::vector<std::string> arg_names;
-
- // TODO add argument descriptions?
-
- /// \brief Name of the options class, if any.
- std::string options_class;
-
- FunctionDoc() = default;
-
- FunctionDoc(std::string summary, std::string description,
- std::vector<std::string> arg_names, std::string options_class = "")
- : summary(std::move(summary)),
- description(std::move(description)),
- arg_names(std::move(arg_names)),
- options_class(std::move(options_class)) {}
-
- static const FunctionDoc& Empty();
-};
-
+struct ARROW_EXPORT FunctionDoc {
+ /// \brief A one-line summary of the function, using a verb.
+ ///
+ /// For example, "Add two numeric arrays or scalars".
+ std::string summary;
+
+ /// \brief A detailed description of the function, meant to follow the summary.
+ std::string description;
+
+ /// \brief Symbolic names (identifiers) for the function arguments.
+ ///
+ /// Some bindings may use this to generate nicer function signatures.
+ std::vector<std::string> arg_names;
+
+ // TODO add argument descriptions?
+
+ /// \brief Name of the options class, if any.
+ std::string options_class;
+
+ FunctionDoc() = default;
+
+ FunctionDoc(std::string summary, std::string description,
+ std::vector<std::string> arg_names, std::string options_class = "")
+ : summary(std::move(summary)),
+ description(std::move(description)),
+ arg_names(std::move(arg_names)),
+ options_class(std::move(options_class)) {}
+
+ static const FunctionDoc& Empty();
+};
+
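+// --- Editor's sketch (not part of the upstream commit): kernel registration
+// code typically declares a FunctionDoc as a static constant; the names and
+// wording below are illustrative only:
+//
+//   const arrow::compute::FunctionDoc add_doc{
+//       "Add two numeric arrays or scalars",
+//       "Null values in either input produce null output.",
+//       {"x", "y"}};
+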
/// \brief Base class for compute functions. Function implementations contain a
/// collection of "kernels" which are implementations of the function for
/// specific argument types. Selecting a viable kernel for executing a function
@@ -172,10 +172,10 @@ class ARROW_EXPORT Function {
/// A function that computes scalar summary statistics from array input.
SCALAR_AGGREGATE,
- /// A function that computes grouped summary statistics from array input
- /// and an array of group identifiers.
- HASH_AGGREGATE,
-
+ /// A function that computes grouped summary statistics from array input
+ /// and an array of group identifiers.
+ HASH_AGGREGATE,
+
/// A function that dispatches to other functions and does not contain its
/// own kernels.
META
@@ -194,27 +194,27 @@ class ARROW_EXPORT Function {
/// function accepts variable numbers of arguments.
const Arity& arity() const { return arity_; }
- /// \brief Return the function documentation
- const FunctionDoc& doc() const { return *doc_; }
-
+ /// \brief Return the function documentation
+ const FunctionDoc& doc() const { return *doc_; }
+
/// \brief Returns the number of registered kernels for this function.
virtual int num_kernels() const = 0;
- /// \brief Return a kernel that can execute the function given the exact
- /// argument types (without implicit type casts or scalar->array promotions).
- ///
- /// NB: This function is overridden in CastFunction.
- virtual Result<const Kernel*> DispatchExact(
- const std::vector<ValueDescr>& values) const;
-
- /// \brief Return a best-match kernel that can execute the function given the argument
- /// types, after implicit casts are applied.
- ///
- /// \param[in,out] values Argument types. An element may be modified to indicate that
- /// the returned kernel only approximately matches the input value descriptors; callers
- /// are responsible for casting inputs to the type and shape required by the kernel.
- virtual Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const;
-
+ /// \brief Return a kernel that can execute the function given the exact
+ /// argument types (without implicit type casts or scalar->array promotions).
+ ///
+ /// NB: This function is overridden in CastFunction.
+ virtual Result<const Kernel*> DispatchExact(
+ const std::vector<ValueDescr>& values) const;
+
+ /// \brief Return a best-match kernel that can execute the function given the argument
+ /// types, after implicit casts are applied.
+ ///
+ /// \param[in,out] values Argument types. An element may be modified to indicate that
+ /// the returned kernel only approximately matches the input value descriptors; callers
+ /// are responsible for casting inputs to the type and shape required by the kernel.
+ virtual Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const;
+
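+  // --- Editor's sketch (not part of the upstream commit): dispatching against
+  // exact argument descriptors; "add" is a built-in scalar function:
+  //
+  //   auto func =
+  //       arrow::compute::GetFunctionRegistry()->GetFunction("add").ValueOrDie();
+  //   std::vector<arrow::compute::ValueDescr> args = {
+  //       arrow::compute::ValueDescr::Array(arrow::int64()),
+  //       arrow::compute::ValueDescr::Array(arrow::int64())};
+  //   const arrow::compute::Kernel* kernel =
+  //       func->DispatchExact(args).ValueOrDie();
+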
/// \brief Execute the function eagerly with the passed input arguments with
/// kernel dispatch, batch iteration, and memory allocation details taken
/// care of.
@@ -231,24 +231,24 @@ class ARROW_EXPORT Function {
/// that default_options() is valid to pass to Execute as options.
const FunctionOptions* default_options() const { return default_options_; }
- virtual Status Validate() const;
-
+ virtual Status Validate() const;
+
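+  // --- Editor's sketch (not part of the upstream commit): most callers reach
+  // Execute() through the CallFunction convenience wrapper, e.g.
+  //
+  //   arrow::Datum sum =
+  //       arrow::compute::CallFunction("add", {a, b}).ValueOrDie();
+  //
+  // where `a` and `b` are Datum-convertible arrays or scalars (illustrative).
+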
protected:
Function(std::string name, Function::Kind kind, const Arity& arity,
- const FunctionDoc* doc, const FunctionOptions* default_options)
+ const FunctionDoc* doc, const FunctionOptions* default_options)
: name_(std::move(name)),
kind_(kind),
arity_(arity),
- doc_(doc ? doc : &FunctionDoc::Empty()),
+ doc_(doc ? doc : &FunctionDoc::Empty()),
default_options_(default_options) {}
- Status CheckArity(const std::vector<InputType>&) const;
- Status CheckArity(const std::vector<ValueDescr>&) const;
+ Status CheckArity(const std::vector<InputType>&) const;
+ Status CheckArity(const std::vector<ValueDescr>&) const;
std::string name_;
Function::Kind kind_;
Arity arity_;
- const FunctionDoc* doc_;
+ const FunctionDoc* doc_;
const FunctionOptions* default_options_ = NULLPTR;
};
@@ -270,20 +270,20 @@ class FunctionImpl : public Function {
protected:
FunctionImpl(std::string name, Function::Kind kind, const Arity& arity,
- const FunctionDoc* doc, const FunctionOptions* default_options)
- : Function(std::move(name), kind, arity, doc, default_options) {}
+ const FunctionDoc* doc, const FunctionOptions* default_options)
+ : Function(std::move(name), kind, arity, doc, default_options) {}
std::vector<KernelType> kernels_;
};
-/// \brief Look up a kernel in a function. If no Kernel is found, nullptr is returned.
-ARROW_EXPORT
-const Kernel* DispatchExactImpl(const Function* func, const std::vector<ValueDescr>&);
-
-/// \brief Return an error Status if no Kernel is found.
-ARROW_EXPORT
-Status NoMatchingKernel(const Function* func, const std::vector<ValueDescr>&);
-
+/// \brief Look up a kernel in a function. If no Kernel is found, nullptr is returned.
+ARROW_EXPORT
+const Kernel* DispatchExactImpl(const Function* func, const std::vector<ValueDescr>&);
+
+/// \brief Return an error Status if no Kernel is found.
+ARROW_EXPORT
+Status NoMatchingKernel(const Function* func, const std::vector<ValueDescr>&);
+
} // namespace detail
/// \brief A function that executes elementwise operations on arrays or
@@ -295,9 +295,9 @@ class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl<ScalarKernel> {
public:
using KernelType = ScalarKernel;
- ScalarFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ ScalarFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
const FunctionOptions* default_options = NULLPTR)
- : detail::FunctionImpl<ScalarKernel>(std::move(name), Function::SCALAR, arity, doc,
+ : detail::FunctionImpl<ScalarKernel>(std::move(name), Function::SCALAR, arity, doc,
default_options) {}
/// \brief Add a kernel with given input/output types, no required state
@@ -319,9 +319,9 @@ class ARROW_EXPORT VectorFunction : public detail::FunctionImpl<VectorKernel> {
public:
using KernelType = VectorKernel;
- VectorFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ VectorFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
const FunctionOptions* default_options = NULLPTR)
- : detail::FunctionImpl<VectorKernel>(std::move(name), Function::VECTOR, arity, doc,
+ : detail::FunctionImpl<VectorKernel>(std::move(name), Function::VECTOR, arity, doc,
default_options) {}
/// \brief Add a simple kernel with given input/output types, no required
@@ -340,29 +340,29 @@ class ARROW_EXPORT ScalarAggregateFunction
public:
using KernelType = ScalarAggregateKernel;
- ScalarAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ ScalarAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
const FunctionOptions* default_options = NULLPTR)
: detail::FunctionImpl<ScalarAggregateKernel>(
- std::move(name), Function::SCALAR_AGGREGATE, arity, doc, default_options) {}
+ std::move(name), Function::SCALAR_AGGREGATE, arity, doc, default_options) {}
/// \brief Add a kernel (function implementation). Returns error if the
/// kernel's signature does not match the function's arity.
Status AddKernel(ScalarAggregateKernel kernel);
-};
-
-class ARROW_EXPORT HashAggregateFunction
- : public detail::FunctionImpl<HashAggregateKernel> {
- public:
- using KernelType = HashAggregateKernel;
-
- HashAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
- const FunctionOptions* default_options = NULLPTR)
- : detail::FunctionImpl<HashAggregateKernel>(
- std::move(name), Function::HASH_AGGREGATE, arity, doc, default_options) {}
-
- /// \brief Add a kernel (function implementation). Returns error if the
- /// kernel's signature does not match the function's arity.
- Status AddKernel(HashAggregateKernel kernel);
+};
+
+class ARROW_EXPORT HashAggregateFunction
+ : public detail::FunctionImpl<HashAggregateKernel> {
+ public:
+ using KernelType = HashAggregateKernel;
+
+ HashAggregateFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ const FunctionOptions* default_options = NULLPTR)
+ : detail::FunctionImpl<HashAggregateKernel>(
+ std::move(name), Function::HASH_AGGREGATE, arity, doc, default_options) {}
+
+ /// \brief Add a kernel (function implementation). Returns error if the
+ /// kernel's signature does not match the function's arity.
+ Status AddKernel(HashAggregateKernel kernel);
};
/// \brief A function that dispatches to other functions. Must implement
@@ -382,9 +382,9 @@ class ARROW_EXPORT MetaFunction : public Function {
const FunctionOptions* options,
ExecContext* ctx) const = 0;
- MetaFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
+ MetaFunction(std::string name, const Arity& arity, const FunctionDoc* doc,
const FunctionOptions* default_options = NULLPTR)
- : Function(std::move(name), Function::META, arity, doc, default_options) {}
+ : Function(std::move(name), Function::META, arity, doc, default_options) {}
};
/// @}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc
index 8515d957cbd..0a926e0a39c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.cc
@@ -1,113 +1,113 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/function_internal.h"
-
-#include "arrow/array/util.h"
-#include "arrow/compute/function.h"
-#include "arrow/compute/registry.h"
-#include "arrow/io/memory.h"
-#include "arrow/ipc/reader.h"
-#include "arrow/ipc/writer.h"
-#include "arrow/record_batch.h"
-#include "arrow/scalar.h"
-#include "arrow/util/checked_cast.h"
-
-namespace arrow {
-namespace compute {
-namespace internal {
-using ::arrow::internal::checked_cast;
-
-constexpr char kTypeNameField[] = "_type_name";
-
-Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar(
- const FunctionOptions& options) {
- std::vector<std::string> field_names;
- std::vector<std::shared_ptr<Scalar>> values;
- const auto* options_type =
- dynamic_cast<const GenericOptionsType*>(options.options_type());
- if (!options_type) {
- return Status::NotImplemented("serializing ", options.type_name(),
- " to StructScalar");
- }
- RETURN_NOT_OK(options_type->ToStructScalar(options, &field_names, &values));
- field_names.push_back(kTypeNameField);
- const char* options_name = options.type_name();
- values.emplace_back(
- new BinaryScalar(Buffer::Wrap(options_name, std::strlen(options_name))));
- return StructScalar::Make(std::move(values), std::move(field_names));
-}
-
-Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar(
- const StructScalar& scalar) {
- ARROW_ASSIGN_OR_RAISE(auto type_name_holder, scalar.field(kTypeNameField));
- const std::string type_name =
- checked_cast<const BinaryScalar&>(*type_name_holder).value->ToString();
- ARROW_ASSIGN_OR_RAISE(auto raw_options_type,
- GetFunctionRegistry()->GetFunctionOptionsType(type_name));
- const auto* options_type = checked_cast<const GenericOptionsType*>(raw_options_type);
- return options_type->FromStructScalar(scalar);
-}
-
-Result<std::shared_ptr<Buffer>> GenericOptionsType::Serialize(
- const FunctionOptions& options) const {
- ARROW_ASSIGN_OR_RAISE(auto scalar, FunctionOptionsToStructScalar(options));
- ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(*scalar, 1));
- auto batch =
- RecordBatch::Make(schema({field("", array->type())}), /*num_rows=*/1, {array});
- ARROW_ASSIGN_OR_RAISE(auto stream, io::BufferOutputStream::Create());
- ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(stream, batch->schema()));
- RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
- RETURN_NOT_OK(writer->Close());
- return stream->Finish();
-}
-
-Result<std::unique_ptr<FunctionOptions>> GenericOptionsType::Deserialize(
- const Buffer& buffer) const {
- return DeserializeFunctionOptions(buffer);
-}
-
-Result<std::unique_ptr<FunctionOptions>> DeserializeFunctionOptions(
- const Buffer& buffer) {
- io::BufferReader stream(buffer);
- ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(&stream));
- ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0));
- if (batch->num_rows() != 1) {
- return Status::Invalid(
- "serialized FunctionOptions's batch repr was not a single row - had ",
- batch->num_rows());
- }
- if (batch->num_columns() != 1) {
- return Status::Invalid(
- "serialized FunctionOptions's batch repr was not a single column - had ",
- batch->num_columns());
- }
- auto column = batch->column(0);
- if (column->type()->id() != Type::STRUCT) {
- return Status::Invalid(
- "serialized FunctionOptions's batch repr was not a struct column - was ",
- column->type()->ToString());
- }
- ARROW_ASSIGN_OR_RAISE(auto raw_scalar,
- checked_cast<const StructArray&>(*column).GetScalar(0));
- auto scalar = checked_cast<const StructScalar&>(*raw_scalar);
- return FunctionOptionsFromStructScalar(scalar);
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/function_internal.h"
+
+#include "arrow/array/util.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/registry.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/record_batch.h"
+#include "arrow/scalar.h"
+#include "arrow/util/checked_cast.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+using ::arrow::internal::checked_cast;
+
+constexpr char kTypeNameField[] = "_type_name";
+
+Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar(
+ const FunctionOptions& options) {
+ std::vector<std::string> field_names;
+ std::vector<std::shared_ptr<Scalar>> values;
+ const auto* options_type =
+ dynamic_cast<const GenericOptionsType*>(options.options_type());
+ if (!options_type) {
+ return Status::NotImplemented("serializing ", options.type_name(),
+ " to StructScalar");
+ }
+ RETURN_NOT_OK(options_type->ToStructScalar(options, &field_names, &values));
+ field_names.push_back(kTypeNameField);
+ const char* options_name = options.type_name();
+ values.emplace_back(
+ new BinaryScalar(Buffer::Wrap(options_name, std::strlen(options_name))));
+ return StructScalar::Make(std::move(values), std::move(field_names));
+}
+
+Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar(
+ const StructScalar& scalar) {
+ ARROW_ASSIGN_OR_RAISE(auto type_name_holder, scalar.field(kTypeNameField));
+ const std::string type_name =
+ checked_cast<const BinaryScalar&>(*type_name_holder).value->ToString();
+ ARROW_ASSIGN_OR_RAISE(auto raw_options_type,
+ GetFunctionRegistry()->GetFunctionOptionsType(type_name));
+ const auto* options_type = checked_cast<const GenericOptionsType*>(raw_options_type);
+ return options_type->FromStructScalar(scalar);
+}
+
+Result<std::shared_ptr<Buffer>> GenericOptionsType::Serialize(
+ const FunctionOptions& options) const {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, FunctionOptionsToStructScalar(options));
+ ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(*scalar, 1));
+ auto batch =
+ RecordBatch::Make(schema({field("", array->type())}), /*num_rows=*/1, {array});
+ ARROW_ASSIGN_OR_RAISE(auto stream, io::BufferOutputStream::Create());
+ ARROW_ASSIGN_OR_RAISE(auto writer, ipc::MakeFileWriter(stream, batch->schema()));
+ RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
+ RETURN_NOT_OK(writer->Close());
+ return stream->Finish();
+}
+
+Result<std::unique_ptr<FunctionOptions>> GenericOptionsType::Deserialize(
+ const Buffer& buffer) const {
+ return DeserializeFunctionOptions(buffer);
+}
+
+Result<std::unique_ptr<FunctionOptions>> DeserializeFunctionOptions(
+ const Buffer& buffer) {
+ io::BufferReader stream(buffer);
+ ARROW_ASSIGN_OR_RAISE(auto reader, ipc::RecordBatchFileReader::Open(&stream));
+ ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(0));
+ if (batch->num_rows() != 1) {
+ return Status::Invalid(
+ "serialized FunctionOptions's batch repr was not a single row - had ",
+ batch->num_rows());
+ }
+ if (batch->num_columns() != 1) {
+ return Status::Invalid(
+ "serialized FunctionOptions's batch repr was not a single column - had ",
+ batch->num_columns());
+ }
+ auto column = batch->column(0);
+ if (column->type()->id() != Type::STRUCT) {
+ return Status::Invalid(
+ "serialized FunctionOptions's batch repr was not a struct column - was ",
+ column->type()->ToString());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto raw_scalar,
+ checked_cast<const StructArray&>(*column).GetScalar(0));
+ auto scalar = checked_cast<const StructScalar&>(*raw_scalar);
+ return FunctionOptionsFromStructScalar(scalar);
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
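+// --- Editor's note (not part of the upstream commit): the wire format above
+// is a one-row, one-column record batch whose single column is a struct array,
+// written as an IPC file; the "_type_name" struct field selects the options
+// type in the default registry on the way back:
+//
+//   ARROW_ASSIGN_OR_RAISE(auto buf, options_type->Serialize(options));
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto opts, arrow::compute::internal::DeserializeFunctionOptions(*buf));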
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h
index 9ce0c3cc84e..fdd7f09ba1f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/function_internal.h
@@ -1,626 +1,626 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <sstream>
-#include <string>
-#include <vector>
-
-#include "arrow/array/builder_base.h"
-#include "arrow/array/builder_binary.h"
-#include "arrow/array/builder_nested.h"
-#include "arrow/compute/api_vector.h"
-#include "arrow/compute/function.h"
-#include "arrow/compute/type_fwd.h"
-#include "arrow/result.h"
-#include "arrow/status.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/reflection_internal.h"
-#include "arrow/util/string.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-struct Scalar;
-struct StructScalar;
-using ::arrow::internal::checked_cast;
-
-namespace internal {
-template <>
-struct EnumTraits<compute::SortOrder>
- : BasicEnumTraits<compute::SortOrder, compute::SortOrder::Ascending,
- compute::SortOrder::Descending> {
- static std::string name() { return "SortOrder"; }
- static std::string value_name(compute::SortOrder value) {
- switch (value) {
- case compute::SortOrder::Ascending:
- return "Ascending";
- case compute::SortOrder::Descending:
- return "Descending";
- }
- return "<INVALID>";
- }
-};
-} // namespace internal
-
-namespace compute {
-namespace internal {
-
-using arrow::internal::EnumTraits;
-using arrow::internal::has_enum_traits;
-
-template <typename Enum, typename CType = typename std::underlying_type<Enum>::type>
-Result<Enum> ValidateEnumValue(CType raw) {
- for (auto valid : EnumTraits<Enum>::values()) {
- if (raw == static_cast<CType>(valid)) {
- return static_cast<Enum>(raw);
- }
- }
- return Status::Invalid("Invalid value for ", EnumTraits<Enum>::name(), ": ", raw);
-}
-
-class GenericOptionsType : public FunctionOptionsType {
- public:
- Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const override;
- Result<std::unique_ptr<FunctionOptions>> Deserialize(
- const Buffer& buffer) const override;
- virtual Status ToStructScalar(const FunctionOptions& options,
- std::vector<std::string>* field_names,
- std::vector<std::shared_ptr<Scalar>>* values) const = 0;
- virtual Result<std::unique_ptr<FunctionOptions>> FromStructScalar(
- const StructScalar& scalar) const = 0;
-};
-
-ARROW_EXPORT
-Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar(
- const FunctionOptions&);
-ARROW_EXPORT
-Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar(
- const StructScalar&);
-ARROW_EXPORT
-Result<std::unique_ptr<FunctionOptions>> DeserializeFunctionOptions(const Buffer& buffer);
-
-template <typename T>
-static inline enable_if_t<!has_enum_traits<T>::value, std::string> GenericToString(
- const T& value) {
- std::stringstream ss;
- ss << value;
- return ss.str();
-}
-
-static inline std::string GenericToString(bool value) { return value ? "true" : "false"; }
-
-static inline std::string GenericToString(const std::string& value) {
- std::stringstream ss;
- ss << '"' << value << '"';
- return ss.str();
-}
-
-template <typename T>
-static inline enable_if_t<has_enum_traits<T>::value, std::string> GenericToString(
- const T value) {
- return EnumTraits<T>::value_name(value);
-}
-
-template <typename T>
-static inline std::string GenericToString(const std::shared_ptr<T>& value) {
- std::stringstream ss;
-}
-
-static inline std::string GenericToString(const std::shared_ptr<Scalar>& value) {
- std::stringstream ss;
- ss << value->type->ToString() << ":" << value->ToString();
- return ss.str();
-}
-
-static inline std::string GenericToString(
- const std::shared_ptr<const KeyValueMetadata>& value) {
- std::stringstream ss;
- ss << "KeyValueMetadata{";
- if (value) {
- bool first = true;
- for (const auto& pair : value->sorted_pairs()) {
- if (!first) ss << ", ";
- first = false;
- ss << pair.first << ':' << pair.second;
- }
- }
- ss << '}';
- return ss.str();
-}
-
-static inline std::string GenericToString(const Datum& value) {
- switch (value.kind()) {
- case Datum::NONE:
- return "<NULL DATUM>";
- case Datum::SCALAR:
- return GenericToString(value.scalar());
- case Datum::ARRAY: {
- std::stringstream ss;
- ss << value.type()->ToString() << ':' << value.make_array()->ToString();
- return ss.str();
- }
- case Datum::CHUNKED_ARRAY:
- case Datum::RECORD_BATCH:
- case Datum::TABLE:
- case Datum::COLLECTION:
- return value.ToString();
- }
- return value.ToString();
-}
-
-template <typename T>
-static inline std::string GenericToString(const std::vector<T>& value) {
- std::stringstream ss;
- ss << "[";
- bool first = true;
- // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis
- for (auto it = value.begin(); it != value.end(); it++) {
- if (!first) ss << ", ";
- first = false;
- ss << GenericToString(*it);
- }
- ss << ']';
- return ss.str();
-}
-
-static inline std::string GenericToString(SortOrder value) {
- switch (value) {
- case SortOrder::Ascending:
- return "Ascending";
- case SortOrder::Descending:
- return "Descending";
- }
- return "<INVALID SORT ORDER>";
-}
-
-static inline std::string GenericToString(const std::vector<SortKey>& value) {
- std::stringstream ss;
- ss << '[';
- bool first = true;
- for (const auto& key : value) {
- if (!first) {
- ss << ", ";
- }
- first = false;
- ss << key.ToString();
- }
- ss << ']';
- return ss.str();
-}
-
-template <typename T>
-static inline bool GenericEquals(const T& left, const T& right) {
- return left == right;
-}
-
-template <typename T>
-static inline bool GenericEquals(const std::shared_ptr<T>& left,
- const std::shared_ptr<T>& right) {
- if (left && right) {
- return left->Equals(*right);
- }
- return left == right;
-}
-
-static inline bool IsEmpty(const std::shared_ptr<const KeyValueMetadata>& meta) {
- return !meta || meta->size() == 0;
-}
-
-static inline bool GenericEquals(const std::shared_ptr<const KeyValueMetadata>& left,
- const std::shared_ptr<const KeyValueMetadata>& right) {
- // Special case since null metadata is considered equivalent to empty
- if (IsEmpty(left) || IsEmpty(right)) {
- return IsEmpty(left) && IsEmpty(right);
- }
- return left->Equals(*right);
-}
-
-template <typename T>
-static inline bool GenericEquals(const std::vector<T>& left,
- const std::vector<T>& right) {
- if (left.size() != right.size()) return false;
- for (size_t i = 0; i < left.size(); i++) {
- if (!GenericEquals(left[i], right[i])) return false;
- }
- return true;
-}
-
-template <typename T>
-static inline decltype(TypeTraits<typename CTypeTraits<T>::ArrowType>::type_singleton())
-GenericTypeSingleton() {
- return TypeTraits<typename CTypeTraits<T>::ArrowType>::type_singleton();
-}
-
-template <typename T>
-static inline enable_if_same<T, std::shared_ptr<const KeyValueMetadata>,
- std::shared_ptr<DataType>>
-GenericTypeSingleton() {
- return map(binary(), binary());
-}
-
-template <typename T>
-static inline enable_if_t<has_enum_traits<T>::value, std::shared_ptr<DataType>>
-GenericTypeSingleton() {
- return TypeTraits<typename EnumTraits<T>::Type>::type_singleton();
-}
-
-template <typename T>
-static inline enable_if_same<T, SortKey, std::shared_ptr<DataType>>
-GenericTypeSingleton() {
- std::vector<std::shared_ptr<Field>> fields;
- fields.emplace_back(new Field("name", GenericTypeSingleton<std::string>()));
- fields.emplace_back(new Field("order", GenericTypeSingleton<SortOrder>()));
- return std::make_shared<StructType>(std::move(fields));
-}
-
-// N.B. ordering of overloads is relatively fragile
-template <typename T>
-static inline Result<decltype(MakeScalar(std::declval<T>()))> GenericToScalar(
- const T& value) {
- return MakeScalar(value);
-}
-
-// For Clang/libc++: when iterating through vector<bool>, we can't
-// pass it by reference so the overload above doesn't apply
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(bool value) {
- return MakeScalar(value);
-}
-
-template <typename T, typename Enable = enable_if_t<has_enum_traits<T>::value>>
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const T value) {
- using CType = typename EnumTraits<T>::CType;
- return GenericToScalar(static_cast<CType>(value));
-}
-
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const SortKey& value) {
- ARROW_ASSIGN_OR_RAISE(auto name, GenericToScalar(value.name));
- ARROW_ASSIGN_OR_RAISE(auto order, GenericToScalar(value.order));
- return StructScalar::Make({name, order}, {"name", "order"});
-}
-
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
- const std::shared_ptr<const KeyValueMetadata>& value) {
- auto ty = GenericTypeSingleton<std::shared_ptr<const KeyValueMetadata>>();
- std::unique_ptr<ArrayBuilder> builder;
- RETURN_NOT_OK(MakeBuilder(default_memory_pool(), ty, &builder));
- auto* map_builder = checked_cast<MapBuilder*>(builder.get());
- auto* key_builder = checked_cast<BinaryBuilder*>(map_builder->key_builder());
- auto* item_builder = checked_cast<BinaryBuilder*>(map_builder->item_builder());
- RETURN_NOT_OK(map_builder->Append());
- if (value) {
- RETURN_NOT_OK(key_builder->AppendValues(value->keys()));
- RETURN_NOT_OK(item_builder->AppendValues(value->values()));
- }
- std::shared_ptr<Array> arr;
- RETURN_NOT_OK(map_builder->Finish(&arr));
- return arr->GetScalar(0);
-}
-
-template <typename T>
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
- const std::vector<T>& value) {
- std::shared_ptr<DataType> type = GenericTypeSingleton<T>();
- std::vector<std::shared_ptr<Scalar>> scalars;
- scalars.reserve(value.size());
- // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis
- for (auto it = value.begin(); it != value.end(); it++) {
- ARROW_ASSIGN_OR_RAISE(auto scalar, GenericToScalar(*it));
- scalars.push_back(std::move(scalar));
- }
- std::unique_ptr<ArrayBuilder> builder;
- RETURN_NOT_OK(
- MakeBuilder(default_memory_pool(), type ? type : scalars[0]->type, &builder));
- RETURN_NOT_OK(builder->AppendScalars(scalars));
- std::shared_ptr<Array> out;
- RETURN_NOT_OK(builder->Finish(&out));
- return std::make_shared<ListScalar>(std::move(out));
-}
-
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
- const std::shared_ptr<DataType>& value) {
- if (!value) {
- return Status::Invalid("shared_ptr<DataType> is nullptr");
- }
- return MakeNullScalar(value);
-}
-
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
- const std::shared_ptr<Scalar>& value) {
- return value;
-}
-
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
- const std::shared_ptr<Array>& value) {
- return std::make_shared<ListScalar>(value);
-}
-
-static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const Datum& value) {
- // TODO(ARROW-9434): store in a union instead.
- switch (value.kind()) {
- case Datum::ARRAY:
- return GenericToScalar(value.make_array());
- default:
- return Status::NotImplemented("Cannot serialize Datum kind ", value.kind());
- }
-}
-
-template <typename T>
-static inline enable_if_primitive_ctype<typename CTypeTraits<T>::ArrowType, Result<T>>
-GenericFromScalar(const std::shared_ptr<Scalar>& value) {
- using ArrowType = typename CTypeTraits<T>::ArrowType;
- using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
- if (value->type->id() != ArrowType::type_id) {
- return Status::Invalid("Expected type ", ArrowType::type_id, " but got ",
- value->type->ToString());
- }
- const auto& holder = checked_cast<const ScalarType&>(*value);
- if (!holder.is_valid) return Status::Invalid("Got null scalar");
- return holder.value;
-}
-
-template <typename T>
-static inline enable_if_primitive_ctype<typename EnumTraits<T>::Type, Result<T>>
-GenericFromScalar(const std::shared_ptr<Scalar>& value) {
- ARROW_ASSIGN_OR_RAISE(auto raw_val,
- GenericFromScalar<typename EnumTraits<T>::CType>(value));
- return ValidateEnumValue<T>(raw_val);
-}
-
-template <typename T, typename U>
-using enable_if_same_result = enable_if_same<T, U, Result<T>>;
-
-template <typename T>
-static inline enable_if_same_result<T, std::string> GenericFromScalar(
- const std::shared_ptr<Scalar>& value) {
- if (!is_base_binary_like(value->type->id())) {
- return Status::Invalid("Expected binary-like type but got ", value->type->ToString());
- }
- const auto& holder = checked_cast<const BaseBinaryScalar&>(*value);
- if (!holder.is_valid) return Status::Invalid("Got null scalar");
- return holder.value->ToString();
-}
-
-template <typename T>
-static inline enable_if_same_result<T, SortKey> GenericFromScalar(
- const std::shared_ptr<Scalar>& value) {
- if (value->type->id() != Type::STRUCT) {
- return Status::Invalid("Expected type STRUCT but got ", value->type->id());
- }
- if (!value->is_valid) return Status::Invalid("Got null scalar");
- const auto& holder = checked_cast<const StructScalar&>(*value);
- ARROW_ASSIGN_OR_RAISE(auto name_holder, holder.field("name"));
- ARROW_ASSIGN_OR_RAISE(auto order_holder, holder.field("order"));
- ARROW_ASSIGN_OR_RAISE(auto name, GenericFromScalar<std::string>(name_holder));
- ARROW_ASSIGN_OR_RAISE(auto order, GenericFromScalar<SortOrder>(order_holder));
- return SortKey{std::move(name), order};
-}
-
-template <typename T>
-static inline enable_if_same_result<T, std::shared_ptr<DataType>> GenericFromScalar(
- const std::shared_ptr<Scalar>& value) {
- return value->type;
-}
-
-template <typename T>
-static inline enable_if_same_result<T, std::shared_ptr<Scalar>> GenericFromScalar(
- const std::shared_ptr<Scalar>& value) {
- return value;
-}
-
-template <typename T>
-static inline enable_if_same_result<T, std::shared_ptr<const KeyValueMetadata>>
-GenericFromScalar(const std::shared_ptr<Scalar>& value) {
- auto ty = GenericTypeSingleton<std::shared_ptr<const KeyValueMetadata>>();
- if (!value->type->Equals(ty)) {
- return Status::Invalid("Expected ", ty->ToString(), " but got ",
- value->type->ToString());
- }
- const auto& holder = checked_cast<const MapScalar&>(*value);
- std::vector<std::string> keys;
- std::vector<std::string> values;
- const auto& list = checked_cast<const StructArray&>(*holder.value);
- const auto& key_arr = checked_cast<const BinaryArray&>(*list.field(0));
- const auto& value_arr = checked_cast<const BinaryArray&>(*list.field(1));
- for (int64_t i = 0; i < list.length(); i++) {
- keys.push_back(key_arr.GetString(i));
- values.push_back(value_arr.GetString(i));
- }
- return key_value_metadata(std::move(keys), std::move(values));
-}
-
-template <typename T>
-static inline enable_if_same_result<T, Datum> GenericFromScalar(
- const std::shared_ptr<Scalar>& value) {
- if (value->type->id() == Type::LIST) {
- const auto& holder = checked_cast<const BaseListScalar&>(*value);
- return holder.value;
- }
- // TODO(ARROW-9434): handle other possible datum kinds by looking for a union
- return Status::Invalid("Cannot deserialize Datum from ", value->ToString());
-}
-
-template <typename T>
-static enable_if_same<typename CTypeTraits<T>::ArrowType, ListType, Result<T>>
-GenericFromScalar(const std::shared_ptr<Scalar>& value) {
- using ValueType = typename T::value_type;
- if (value->type->id() != Type::LIST) {
- return Status::Invalid("Expected type LIST but got ", value->type->ToString());
- }
- const auto& holder = checked_cast<const BaseListScalar&>(*value);
- if (!holder.is_valid) return Status::Invalid("Got null scalar");
- std::vector<ValueType> result;
-  for (int64_t i = 0; i < holder.value->length(); i++) {
- ARROW_ASSIGN_OR_RAISE(auto scalar, holder.value->GetScalar(i));
- ARROW_ASSIGN_OR_RAISE(auto v, GenericFromScalar<ValueType>(scalar));
- result.push_back(std::move(v));
- }
- return result;
-}
-
-template <typename Options>
-struct StringifyImpl {
- template <typename Tuple>
- StringifyImpl(const Options& obj, const Tuple& props)
- : obj_(obj), members_(props.size()) {
- props.ForEach(*this);
- }
-
- template <typename Property>
- void operator()(const Property& prop, size_t i) {
- std::stringstream ss;
- ss << prop.name() << '=' << GenericToString(prop.get(obj_));
- members_[i] = ss.str();
- }
-
- std::string Finish() {
- return "{" + arrow::internal::JoinStrings(members_, ", ") + "}";
- }
-
- const Options& obj_;
- std::vector<std::string> members_;
-};
-
-template <typename Options>
-struct CompareImpl {
- template <typename Tuple>
- CompareImpl(const Options& l, const Options& r, const Tuple& props)
- : left_(l), right_(r) {
- props.ForEach(*this);
- }
-
- template <typename Property>
- void operator()(const Property& prop, size_t) {
- equal_ &= GenericEquals(prop.get(left_), prop.get(right_));
- }
-
- const Options& left_;
- const Options& right_;
- bool equal_ = true;
-};
-
-template <typename Options>
-struct ToStructScalarImpl {
- template <typename Tuple>
- ToStructScalarImpl(const Options& obj, const Tuple& props,
- std::vector<std::string>* field_names,
- std::vector<std::shared_ptr<Scalar>>* values)
- : obj_(obj), field_names_(field_names), values_(values) {
- props.ForEach(*this);
- }
-
- template <typename Property>
- void operator()(const Property& prop, size_t) {
- if (!status_.ok()) return;
- auto result = GenericToScalar(prop.get(obj_));
- if (!result.ok()) {
- status_ = result.status().WithMessage("Could not serialize field ", prop.name(),
- " of options type ", Options::kTypeName, ": ",
- result.status().message());
- return;
- }
- field_names_->emplace_back(prop.name());
- values_->push_back(result.MoveValueUnsafe());
- }
-
- const Options& obj_;
- Status status_;
- std::vector<std::string>* field_names_;
- std::vector<std::shared_ptr<Scalar>>* values_;
-};
-
-template <typename Options>
-struct FromStructScalarImpl {
- template <typename Tuple>
- FromStructScalarImpl(Options* obj, const StructScalar& scalar, const Tuple& props)
- : obj_(obj), scalar_(scalar) {
- props.ForEach(*this);
- }
-
- template <typename Property>
- void operator()(const Property& prop, size_t) {
- if (!status_.ok()) return;
- auto maybe_holder = scalar_.field(std::string(prop.name()));
- if (!maybe_holder.ok()) {
- status_ = maybe_holder.status().WithMessage(
- "Cannot deserialize field ", prop.name(), " of options type ",
- Options::kTypeName, ": ", maybe_holder.status().message());
- return;
- }
- auto holder = maybe_holder.MoveValueUnsafe();
- auto result = GenericFromScalar<typename Property::Type>(holder);
- if (!result.ok()) {
- status_ = result.status().WithMessage("Cannot deserialize field ", prop.name(),
- " of options type ", Options::kTypeName, ": ",
- result.status().message());
- return;
- }
- prop.set(obj_, result.MoveValueUnsafe());
- }
-
- Options* obj_;
- Status status_;
- const StructScalar& scalar_;
-};
-
-template <typename Options, typename... Properties>
-const FunctionOptionsType* GetFunctionOptionsType(const Properties&... properties) {
- static const class OptionsType : public GenericOptionsType {
- public:
- explicit OptionsType(const arrow::internal::PropertyTuple<Properties...> properties)
- : properties_(properties) {}
-
- const char* type_name() const override { return Options::kTypeName; }
-
- std::string Stringify(const FunctionOptions& options) const override {
- const auto& self = checked_cast<const Options&>(options);
- return StringifyImpl<Options>(self, properties_).Finish();
- }
- bool Compare(const FunctionOptions& options,
- const FunctionOptions& other) const override {
- const auto& lhs = checked_cast<const Options&>(options);
- const auto& rhs = checked_cast<const Options&>(other);
- return CompareImpl<Options>(lhs, rhs, properties_).equal_;
- }
- Status ToStructScalar(const FunctionOptions& options,
- std::vector<std::string>* field_names,
- std::vector<std::shared_ptr<Scalar>>* values) const override {
- const auto& self = checked_cast<const Options&>(options);
- RETURN_NOT_OK(
- ToStructScalarImpl<Options>(self, properties_, field_names, values).status_);
- return Status::OK();
- }
- Result<std::unique_ptr<FunctionOptions>> FromStructScalar(
- const StructScalar& scalar) const override {
- auto options = std::unique_ptr<Options>(new Options());
- RETURN_NOT_OK(
- FromStructScalarImpl<Options>(options.get(), scalar, properties_).status_);
- return std::move(options);
- }
-
- private:
- const arrow::internal::PropertyTuple<Properties...> properties_;
- } instance(arrow::internal::MakeProperties(properties...));
- return &instance;
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/reflection_internal.h"
+#include "arrow/util/string.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+struct Scalar;
+struct StructScalar;
+using ::arrow::internal::checked_cast;
+
+namespace internal {
+template <>
+struct EnumTraits<compute::SortOrder>
+ : BasicEnumTraits<compute::SortOrder, compute::SortOrder::Ascending,
+ compute::SortOrder::Descending> {
+ static std::string name() { return "SortOrder"; }
+ static std::string value_name(compute::SortOrder value) {
+ switch (value) {
+ case compute::SortOrder::Ascending:
+ return "Ascending";
+ case compute::SortOrder::Descending:
+ return "Descending";
+ }
+ return "<INVALID>";
+ }
+};
+} // namespace internal
+
+namespace compute {
+namespace internal {
+
+using arrow::internal::EnumTraits;
+using arrow::internal::has_enum_traits;
+
+template <typename Enum, typename CType = typename std::underlying_type<Enum>::type>
+Result<Enum> ValidateEnumValue(CType raw) {
+ for (auto valid : EnumTraits<Enum>::values()) {
+ if (raw == static_cast<CType>(valid)) {
+ return static_cast<Enum>(raw);
+ }
+ }
+ return Status::Invalid("Invalid value for ", EnumTraits<Enum>::name(), ": ", raw);
+}
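+// --- Editor's sketch (not part of the upstream commit): ValidateEnumValue
+// rejects raw integers that match no declared enumerator, e.g.
+//
+//   ValidateEnumValue<SortOrder>(0);   // -> SortOrder::Ascending
+//   ValidateEnumValue<SortOrder>(42);  // -> Status::Invalid
+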
+
+class GenericOptionsType : public FunctionOptionsType {
+ public:
+ Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const override;
+ Result<std::unique_ptr<FunctionOptions>> Deserialize(
+ const Buffer& buffer) const override;
+ virtual Status ToStructScalar(const FunctionOptions& options,
+ std::vector<std::string>* field_names,
+ std::vector<std::shared_ptr<Scalar>>* values) const = 0;
+ virtual Result<std::unique_ptr<FunctionOptions>> FromStructScalar(
+ const StructScalar& scalar) const = 0;
+};
+
+ARROW_EXPORT
+Result<std::shared_ptr<StructScalar>> FunctionOptionsToStructScalar(
+ const FunctionOptions&);
+ARROW_EXPORT
+Result<std::unique_ptr<FunctionOptions>> FunctionOptionsFromStructScalar(
+ const StructScalar&);
+ARROW_EXPORT
+Result<std::unique_ptr<FunctionOptions>> DeserializeFunctionOptions(const Buffer& buffer);
+
+template <typename T>
+static inline enable_if_t<!has_enum_traits<T>::value, std::string> GenericToString(
+ const T& value) {
+ std::stringstream ss;
+ ss << value;
+ return ss.str();
+}
+
+static inline std::string GenericToString(bool value) { return value ? "true" : "false"; }
+
+static inline std::string GenericToString(const std::string& value) {
+ std::stringstream ss;
+ ss << '"' << value << '"';
+ return ss.str();
+}
+
+template <typename T>
+static inline enable_if_t<has_enum_traits<T>::value, std::string> GenericToString(
+ const T value) {
+ return EnumTraits<T>::value_name(value);
+}
+
+template <typename T>
+static inline std::string GenericToString(const std::shared_ptr<T>& value) {
+ return value ? value->ToString() : "<NULLPTR>";
+}
+
+static inline std::string GenericToString(const std::shared_ptr<Scalar>& value) {
+ std::stringstream ss;
+ ss << value->type->ToString() << ":" << value->ToString();
+ return ss.str();
+}
+
+static inline std::string GenericToString(
+ const std::shared_ptr<const KeyValueMetadata>& value) {
+ std::stringstream ss;
+ ss << "KeyValueMetadata{";
+ if (value) {
+ bool first = true;
+ for (const auto& pair : value->sorted_pairs()) {
+ if (!first) ss << ", ";
+ first = false;
+ ss << pair.first << ':' << pair.second;
+ }
+ }
+ ss << '}';
+ return ss.str();
+}
+
+static inline std::string GenericToString(const Datum& value) {
+ switch (value.kind()) {
+ case Datum::NONE:
+ return "<NULL DATUM>";
+ case Datum::SCALAR:
+ return GenericToString(value.scalar());
+ case Datum::ARRAY: {
+ std::stringstream ss;
+ ss << value.type()->ToString() << ':' << value.make_array()->ToString();
+ return ss.str();
+ }
+ case Datum::CHUNKED_ARRAY:
+ case Datum::RECORD_BATCH:
+ case Datum::TABLE:
+ case Datum::COLLECTION:
+ return value.ToString();
+ }
+ return value.ToString();
+}
+
+template <typename T>
+static inline std::string GenericToString(const std::vector<T>& value) {
+ std::stringstream ss;
+ ss << "[";
+ bool first = true;
+ // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis
+ for (auto it = value.begin(); it != value.end(); it++) {
+ if (!first) ss << ", ";
+ first = false;
+ ss << GenericToString(*it);
+ }
+ ss << ']';
+ return ss.str();
+}
+
+static inline std::string GenericToString(SortOrder value) {
+ switch (value) {
+ case SortOrder::Ascending:
+ return "Ascending";
+ case SortOrder::Descending:
+ return "Descending";
+ }
+ return "<INVALID SORT ORDER>";
+}
+
+static inline std::string GenericToString(const std::vector<SortKey>& value) {
+ std::stringstream ss;
+ ss << '[';
+ bool first = true;
+ for (const auto& key : value) {
+ if (!first) {
+ ss << ", ";
+ }
+ first = false;
+ ss << key.ToString();
+ }
+ ss << ']';
+ return ss.str();
+}
+
+template <typename T>
+static inline bool GenericEquals(const T& left, const T& right) {
+ return left == right;
+}
+
+template <typename T>
+static inline bool GenericEquals(const std::shared_ptr<T>& left,
+ const std::shared_ptr<T>& right) {
+ if (left && right) {
+ return left->Equals(*right);
+ }
+ return left == right;
+}
+
+static inline bool IsEmpty(const std::shared_ptr<const KeyValueMetadata>& meta) {
+ return !meta || meta->size() == 0;
+}
+
+static inline bool GenericEquals(const std::shared_ptr<const KeyValueMetadata>& left,
+ const std::shared_ptr<const KeyValueMetadata>& right) {
+ // Special case since null metadata is considered equivalent to empty
+ if (IsEmpty(left) || IsEmpty(right)) {
+ return IsEmpty(left) && IsEmpty(right);
+ }
+ return left->Equals(*right);
+}
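+// Editor's note (not part of the upstream commit): as a consequence, a null
+// metadata pointer compares equal to empty metadata, e.g.
+//
+//   GenericEquals(std::shared_ptr<const KeyValueMetadata>{},
+//                 arrow::key_value_metadata({}, {}));  // -> true
+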
+
+template <typename T>
+static inline bool GenericEquals(const std::vector<T>& left,
+ const std::vector<T>& right) {
+ if (left.size() != right.size()) return false;
+ for (size_t i = 0; i < left.size(); i++) {
+ if (!GenericEquals(left[i], right[i])) return false;
+ }
+ return true;
+}
+
+template <typename T>
+static inline decltype(TypeTraits<typename CTypeTraits<T>::ArrowType>::type_singleton())
+GenericTypeSingleton() {
+ return TypeTraits<typename CTypeTraits<T>::ArrowType>::type_singleton();
+}
+
+template <typename T>
+static inline enable_if_same<T, std::shared_ptr<const KeyValueMetadata>,
+ std::shared_ptr<DataType>>
+GenericTypeSingleton() {
+ return map(binary(), binary());
+}
+
+template <typename T>
+static inline enable_if_t<has_enum_traits<T>::value, std::shared_ptr<DataType>>
+GenericTypeSingleton() {
+ return TypeTraits<typename EnumTraits<T>::Type>::type_singleton();
+}
+
+template <typename T>
+static inline enable_if_same<T, SortKey, std::shared_ptr<DataType>>
+GenericTypeSingleton() {
+ std::vector<std::shared_ptr<Field>> fields;
+ fields.emplace_back(new Field("name", GenericTypeSingleton<std::string>()));
+ fields.emplace_back(new Field("order", GenericTypeSingleton<SortOrder>()));
+ return std::make_shared<StructType>(std::move(fields));
+}
+
+// N.B. ordering of overloads is relatively fragile
+template <typename T>
+static inline Result<decltype(MakeScalar(std::declval<T>()))> GenericToScalar(
+ const T& value) {
+ return MakeScalar(value);
+}
+
+// For Clang/libc++: when iterating through vector<bool>, we can't
+// pass it by reference so the overload above doesn't apply
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(bool value) {
+ return MakeScalar(value);
+}
+
+template <typename T, typename Enable = enable_if_t<has_enum_traits<T>::value>>
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const T value) {
+ using CType = typename EnumTraits<T>::CType;
+ return GenericToScalar(static_cast<CType>(value));
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const SortKey& value) {
+ ARROW_ASSIGN_OR_RAISE(auto name, GenericToScalar(value.name));
+ ARROW_ASSIGN_OR_RAISE(auto order, GenericToScalar(value.order));
+ return StructScalar::Make({name, order}, {"name", "order"});
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::shared_ptr<const KeyValueMetadata>& value) {
+ auto ty = GenericTypeSingleton<std::shared_ptr<const KeyValueMetadata>>();
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(default_memory_pool(), ty, &builder));
+ auto* map_builder = checked_cast<MapBuilder*>(builder.get());
+ auto* key_builder = checked_cast<BinaryBuilder*>(map_builder->key_builder());
+ auto* item_builder = checked_cast<BinaryBuilder*>(map_builder->item_builder());
+ RETURN_NOT_OK(map_builder->Append());
+ if (value) {
+ RETURN_NOT_OK(key_builder->AppendValues(value->keys()));
+ RETURN_NOT_OK(item_builder->AppendValues(value->values()));
+ }
+ std::shared_ptr<Array> arr;
+ RETURN_NOT_OK(map_builder->Finish(&arr));
+ return arr->GetScalar(0);
+}
+
+template <typename T>
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::vector<T>& value) {
+ std::shared_ptr<DataType> type = GenericTypeSingleton<T>();
+ std::vector<std::shared_ptr<Scalar>> scalars;
+ scalars.reserve(value.size());
+ // Don't use range-for with auto& to avoid Clang -Wrange-loop-analysis
+ for (auto it = value.begin(); it != value.end(); it++) {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, GenericToScalar(*it));
+ scalars.push_back(std::move(scalar));
+ }
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(
+ MakeBuilder(default_memory_pool(), type ? type : scalars[0]->type, &builder));
+ RETURN_NOT_OK(builder->AppendScalars(scalars));
+ std::shared_ptr<Array> out;
+ RETURN_NOT_OK(builder->Finish(&out));
+ return std::make_shared<ListScalar>(std::move(out));
+}
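+// --- Editor's sketch (not part of the upstream commit): vectors serialize to
+// ListScalars, e.g.
+//
+//   std::vector<int64_t> values{1, 2, 3};
+//   ARROW_ASSIGN_OR_RAISE(auto scalar, GenericToScalar(values));
+//   // `scalar` is a ListScalar wrapping an int64 array of length 3
+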
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::shared_ptr<DataType>& value) {
+ if (!value) {
+ return Status::Invalid("shared_ptr<DataType> is nullptr");
+ }
+ return MakeNullScalar(value);
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::shared_ptr<Scalar>& value) {
+ return value;
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(
+ const std::shared_ptr<Array>& value) {
+ return std::make_shared<ListScalar>(value);
+}
+
+static inline Result<std::shared_ptr<Scalar>> GenericToScalar(const Datum& value) {
+ // TODO(ARROW-9434): store in a union instead.
+ switch (value.kind()) {
+ case Datum::ARRAY:
+ return GenericToScalar(value.make_array());
+ default:
+ return Status::NotImplemented("Cannot serialize Datum kind ", value.kind());
+ }
+}
+
+template <typename T>
+static inline enable_if_primitive_ctype<typename CTypeTraits<T>::ArrowType, Result<T>>
+GenericFromScalar(const std::shared_ptr<Scalar>& value) {
+ using ArrowType = typename CTypeTraits<T>::ArrowType;
+ using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
+ if (value->type->id() != ArrowType::type_id) {
+ return Status::Invalid("Expected type ", ArrowType::type_id, " but got ",
+ value->type->ToString());
+ }
+ const auto& holder = checked_cast<const ScalarType&>(*value);
+ if (!holder.is_valid) return Status::Invalid("Got null scalar");
+ return holder.value;
+}
+
+template <typename T>
+static inline enable_if_primitive_ctype<typename EnumTraits<T>::Type, Result<T>>
+GenericFromScalar(const std::shared_ptr<Scalar>& value) {
+ ARROW_ASSIGN_OR_RAISE(auto raw_val,
+ GenericFromScalar<typename EnumTraits<T>::CType>(value));
+ return ValidateEnumValue<T>(raw_val);
+}
+
+template <typename T, typename U>
+using enable_if_same_result = enable_if_same<T, U, Result<T>>;
+
+template <typename T>
+static inline enable_if_same_result<T, std::string> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ if (!is_base_binary_like(value->type->id())) {
+ return Status::Invalid("Expected binary-like type but got ", value->type->ToString());
+ }
+ const auto& holder = checked_cast<const BaseBinaryScalar&>(*value);
+ if (!holder.is_valid) return Status::Invalid("Got null scalar");
+ return holder.value->ToString();
+}
+
+template <typename T>
+static inline enable_if_same_result<T, SortKey> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ if (value->type->id() != Type::STRUCT) {
+ return Status::Invalid("Expected type STRUCT but got ", value->type->id());
+ }
+ if (!value->is_valid) return Status::Invalid("Got null scalar");
+ const auto& holder = checked_cast<const StructScalar&>(*value);
+ ARROW_ASSIGN_OR_RAISE(auto name_holder, holder.field("name"));
+ ARROW_ASSIGN_OR_RAISE(auto order_holder, holder.field("order"));
+ ARROW_ASSIGN_OR_RAISE(auto name, GenericFromScalar<std::string>(name_holder));
+ ARROW_ASSIGN_OR_RAISE(auto order, GenericFromScalar<SortOrder>(order_holder));
+ return SortKey{std::move(name), order};
+}
+
+template <typename T>
+static inline enable_if_same_result<T, std::shared_ptr<DataType>> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ return value->type;
+}
+
+template <typename T>
+static inline enable_if_same_result<T, std::shared_ptr<Scalar>> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ return value;
+}
+
+template <typename T>
+static inline enable_if_same_result<T, std::shared_ptr<const KeyValueMetadata>>
+GenericFromScalar(const std::shared_ptr<Scalar>& value) {
+ auto ty = GenericTypeSingleton<std::shared_ptr<const KeyValueMetadata>>();
+ if (!value->type->Equals(ty)) {
+ return Status::Invalid("Expected ", ty->ToString(), " but got ",
+ value->type->ToString());
+ }
+ const auto& holder = checked_cast<const MapScalar&>(*value);
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+ const auto& list = checked_cast<const StructArray&>(*holder.value);
+ const auto& key_arr = checked_cast<const BinaryArray&>(*list.field(0));
+ const auto& value_arr = checked_cast<const BinaryArray&>(*list.field(1));
+ for (int64_t i = 0; i < list.length(); i++) {
+ keys.push_back(key_arr.GetString(i));
+ values.push_back(value_arr.GetString(i));
+ }
+ return key_value_metadata(std::move(keys), std::move(values));
+}
+
+template <typename T>
+static inline enable_if_same_result<T, Datum> GenericFromScalar(
+ const std::shared_ptr<Scalar>& value) {
+ if (value->type->id() == Type::LIST) {
+ const auto& holder = checked_cast<const BaseListScalar&>(*value);
+ return holder.value;
+ }
+ // TODO(ARROW-9434): handle other possible datum kinds by looking for a union
+ return Status::Invalid("Cannot deserialize Datum from ", value->ToString());
+}
+
+template <typename T>
+static enable_if_same<typename CTypeTraits<T>::ArrowType, ListType, Result<T>>
+GenericFromScalar(const std::shared_ptr<Scalar>& value) {
+ using ValueType = typename T::value_type;
+ if (value->type->id() != Type::LIST) {
+ return Status::Invalid("Expected type LIST but got ", value->type->ToString());
+ }
+ const auto& holder = checked_cast<const BaseListScalar&>(*value);
+ if (!holder.is_valid) return Status::Invalid("Got null scalar");
+ std::vector<ValueType> result;
+  for (int64_t i = 0; i < holder.value->length(); i++) {
+ ARROW_ASSIGN_OR_RAISE(auto scalar, holder.value->GetScalar(i));
+ ARROW_ASSIGN_OR_RAISE(auto v, GenericFromScalar<ValueType>(scalar));
+ result.push_back(std::move(v));
+ }
+ return result;
+}
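+// --- Editor's sketch (not part of the upstream commit): GenericToScalar and
+// GenericFromScalar compose into a round trip for supported types, e.g.
+//
+//   std::vector<std::string> names{"a", "b"};
+//   ARROW_ASSIGN_OR_RAISE(auto scalar, GenericToScalar(names));
+//   ARROW_ASSIGN_OR_RAISE(auto back,
+//                         GenericFromScalar<std::vector<std::string>>(scalar));
+//   // back == names
+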
+
+template <typename Options>
+struct StringifyImpl {
+ template <typename Tuple>
+ StringifyImpl(const Options& obj, const Tuple& props)
+ : obj_(obj), members_(props.size()) {
+ props.ForEach(*this);
+ }
+
+ template <typename Property>
+ void operator()(const Property& prop, size_t i) {
+ std::stringstream ss;
+ ss << prop.name() << '=' << GenericToString(prop.get(obj_));
+ members_[i] = ss.str();
+ }
+
+ std::string Finish() {
+ return "{" + arrow::internal::JoinStrings(members_, ", ") + "}";
+ }
+
+ const Options& obj_;
+ std::vector<std::string> members_;
+};
+
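To make the output shape concrete (the property names here are hypothetical, chosen for illustration):

// Illustration only: for an options type with reflected members named
// "skip_nulls" and "min_count", StringifyImpl::Finish() yields a string of
// the form:
//   {skip_nulls=true, min_count=1}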
+template <typename Options>
+struct CompareImpl {
+ template <typename Tuple>
+ CompareImpl(const Options& l, const Options& r, const Tuple& props)
+ : left_(l), right_(r) {
+ props.ForEach(*this);
+ }
+
+ template <typename Property>
+ void operator()(const Property& prop, size_t) {
+ equal_ &= GenericEquals(prop.get(left_), prop.get(right_));
+ }
+
+ const Options& left_;
+ const Options& right_;
+ bool equal_ = true;
+};
+
+template <typename Options>
+struct ToStructScalarImpl {
+ template <typename Tuple>
+ ToStructScalarImpl(const Options& obj, const Tuple& props,
+ std::vector<std::string>* field_names,
+ std::vector<std::shared_ptr<Scalar>>* values)
+ : obj_(obj), field_names_(field_names), values_(values) {
+ props.ForEach(*this);
+ }
+
+ template <typename Property>
+ void operator()(const Property& prop, size_t) {
+ if (!status_.ok()) return;
+ auto result = GenericToScalar(prop.get(obj_));
+ if (!result.ok()) {
+ status_ = result.status().WithMessage("Could not serialize field ", prop.name(),
+ " of options type ", Options::kTypeName, ": ",
+ result.status().message());
+ return;
+ }
+ field_names_->emplace_back(prop.name());
+ values_->push_back(result.MoveValueUnsafe());
+ }
+
+ const Options& obj_;
+ Status status_;
+ std::vector<std::string>* field_names_;
+ std::vector<std::shared_ptr<Scalar>>* values_;
+};
+
+template <typename Options>
+struct FromStructScalarImpl {
+ template <typename Tuple>
+ FromStructScalarImpl(Options* obj, const StructScalar& scalar, const Tuple& props)
+ : obj_(obj), scalar_(scalar) {
+ props.ForEach(*this);
+ }
+
+ template <typename Property>
+ void operator()(const Property& prop, size_t) {
+ if (!status_.ok()) return;
+ auto maybe_holder = scalar_.field(std::string(prop.name()));
+ if (!maybe_holder.ok()) {
+ status_ = maybe_holder.status().WithMessage(
+ "Cannot deserialize field ", prop.name(), " of options type ",
+ Options::kTypeName, ": ", maybe_holder.status().message());
+ return;
+ }
+ auto holder = maybe_holder.MoveValueUnsafe();
+ auto result = GenericFromScalar<typename Property::Type>(holder);
+ if (!result.ok()) {
+ status_ = result.status().WithMessage("Cannot deserialize field ", prop.name(),
+ " of options type ", Options::kTypeName, ": ",
+ result.status().message());
+ return;
+ }
+ prop.set(obj_, result.MoveValueUnsafe());
+ }
+
+ Options* obj_;
+ Status status_;
+ const StructScalar& scalar_;
+};
+
+template <typename Options, typename... Properties>
+const FunctionOptionsType* GetFunctionOptionsType(const Properties&... properties) {
+ static const class OptionsType : public GenericOptionsType {
+ public:
+ explicit OptionsType(const arrow::internal::PropertyTuple<Properties...> properties)
+ : properties_(properties) {}
+
+ const char* type_name() const override { return Options::kTypeName; }
+
+ std::string Stringify(const FunctionOptions& options) const override {
+ const auto& self = checked_cast<const Options&>(options);
+ return StringifyImpl<Options>(self, properties_).Finish();
+ }
+ bool Compare(const FunctionOptions& options,
+ const FunctionOptions& other) const override {
+ const auto& lhs = checked_cast<const Options&>(options);
+ const auto& rhs = checked_cast<const Options&>(other);
+ return CompareImpl<Options>(lhs, rhs, properties_).equal_;
+ }
+ Status ToStructScalar(const FunctionOptions& options,
+ std::vector<std::string>* field_names,
+ std::vector<std::shared_ptr<Scalar>>* values) const override {
+ const auto& self = checked_cast<const Options&>(options);
+ RETURN_NOT_OK(
+ ToStructScalarImpl<Options>(self, properties_, field_names, values).status_);
+ return Status::OK();
+ }
+ Result<std::unique_ptr<FunctionOptions>> FromStructScalar(
+ const StructScalar& scalar) const override {
+ auto options = std::unique_ptr<Options>(new Options());
+ RETURN_NOT_OK(
+ FromStructScalarImpl<Options>(options.get(), scalar, properties_).status_);
+ return std::move(options);
+ }
+
+ private:
+ const arrow::internal::PropertyTuple<Properties...> properties_;
+ } instance(arrow::internal::MakeProperties(properties...));
+ return &instance;
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
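Putting the pieces together, a hypothetical options type could hook into this machinery roughly as follows; MyOptions, its members, and the DataMember spelling are assumptions inferred from the prop.name()/get()/set() contract used above, not a verbatim Arrow example:

// Hedged sketch: reflecting a hypothetical options type.
struct MyOptions : public FunctionOptions {
  MyOptions() : FunctionOptions(GetMyOptionsType()) {}
  static constexpr char const kTypeName[] = "MyOptions";

  bool skip_nulls = true;
  int64_t min_count = 1;

  static const FunctionOptionsType* GetMyOptionsType() {
    // DataMember is assumed to build a Property binding a name to a member,
    // satisfying the interface the Stringify/Compare/To-/FromStructScalar
    // impls above rely on.
    return GetFunctionOptionsType<MyOptions>(
        arrow::internal::DataMember("skip_nulls", &MyOptions::skip_nulls),
        arrow::internal::DataMember("min_count", &MyOptions::min_count));
  }
};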
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc
index c730cbd131a..f131f524d2e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.cc
@@ -59,25 +59,25 @@ Result<std::shared_ptr<ResizableBuffer>> KernelContext::AllocateBitmap(int64_t n
return result;
}
-Status Kernel::InitAll(KernelContext* ctx, const KernelInitArgs& args,
- std::vector<std::unique_ptr<KernelState>>* states) {
- for (auto& state : *states) {
- ARROW_ASSIGN_OR_RAISE(state, args.kernel->init(ctx, args));
+Status Kernel::InitAll(KernelContext* ctx, const KernelInitArgs& args,
+ std::vector<std::unique_ptr<KernelState>>* states) {
+ for (auto& state : *states) {
+ ARROW_ASSIGN_OR_RAISE(state, args.kernel->init(ctx, args));
}
- return Status::OK();
+ return Status::OK();
}
-Result<std::unique_ptr<KernelState>> ScalarAggregateKernel::MergeAll(
- const ScalarAggregateKernel* kernel, KernelContext* ctx,
- std::vector<std::unique_ptr<KernelState>> states) {
- auto out = std::move(states.back());
- states.pop_back();
- ctx->SetState(out.get());
- for (auto& state : states) {
- RETURN_NOT_OK(kernel->merge(ctx, std::move(*state), out.get()));
- }
- return std::move(out);
-}
+Result<std::unique_ptr<KernelState>> ScalarAggregateKernel::MergeAll(
+ const ScalarAggregateKernel* kernel, KernelContext* ctx,
+ std::vector<std::unique_ptr<KernelState>> states) {
+ auto out = std::move(states.back());
+ states.pop_back();
+ ctx->SetState(out.get());
+ for (auto& state : states) {
+ RETURN_NOT_OK(kernel->merge(ctx, std::move(*state), out.get()));
+ }
+ return std::move(out);
+}
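A hedged sketch of the call pattern these two helpers support (thread dispatch elided):

// Sketch only: one KernelState per thread, merged into a single result.
Status RunScalarAggregate(const ScalarAggregateKernel* kernel, KernelContext* ctx,
                          const KernelInitArgs& init_args, size_t num_threads,
                          Datum* out) {
  std::vector<std::unique_ptr<KernelState>> states(num_threads);
  RETURN_NOT_OK(Kernel::InitAll(ctx, init_args, &states));
  // ... each thread would call kernel->consume(...) against its own state ...
  ARROW_ASSIGN_OR_RAISE(auto merged,
                        ScalarAggregateKernel::MergeAll(kernel, ctx, std::move(states)));
  // MergeAll has already installed the merged state on ctx; `merged` keeps it
  // alive while finalize reads it back out.
  return kernel->finalize(ctx, out);
}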
// ----------------------------------------------------------------------
// Some basic TypeMatcher implementations
@@ -402,7 +402,7 @@ KernelSignature::KernelSignature(std::vector<InputType> in_types, OutputType out
out_type_(std::move(out_type)),
is_varargs_(is_varargs),
hash_code_(0) {
- DCHECK(!is_varargs || (is_varargs && (in_types_.size() >= 1)));
+ DCHECK(!is_varargs || (is_varargs && (in_types_.size() >= 1)));
}
std::shared_ptr<KernelSignature> KernelSignature::Make(std::vector<InputType> in_types,
@@ -429,8 +429,8 @@ bool KernelSignature::Equals(const KernelSignature& other) const {
bool KernelSignature::MatchesInputs(const std::vector<ValueDescr>& args) const {
if (is_varargs_) {
- for (size_t i = 0; i < args.size(); ++i) {
- if (!in_types_[std::min(i, in_types_.size() - 1)].Matches(args[i])) {
+ for (size_t i = 0; i < args.size(); ++i) {
+ if (!in_types_[std::min(i, in_types_.size() - 1)].Matches(args[i])) {
return false;
}
}
@@ -463,19 +463,19 @@ std::string KernelSignature::ToString() const {
std::stringstream ss;
if (is_varargs_) {
- ss << "varargs[";
+ ss << "varargs[";
} else {
ss << "(";
- }
- for (size_t i = 0; i < in_types_.size(); ++i) {
- if (i > 0) {
- ss << ", ";
+ }
+ for (size_t i = 0; i < in_types_.size(); ++i) {
+ if (i > 0) {
+ ss << ", ";
}
- ss << in_types_[i].ToString();
- }
- if (is_varargs_) {
- ss << "]";
- } else {
+ ss << in_types_[i].ToString();
+ }
+ if (is_varargs_) {
+ ss << "]";
+ } else {
ss << ")";
}
ss << " -> " << out_type_.ToString();
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h
index c90c764f5ec..36d20c7289e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernel.h
@@ -41,7 +41,7 @@
namespace arrow {
namespace compute {
-class FunctionOptions;
+class FunctionOptions;
/// \brief Base class for opaque kernel-specific state. For example, if there
/// is some kind of initialization required.
@@ -52,7 +52,7 @@ struct ARROW_EXPORT KernelState {
/// \brief Context/state for the execution of a particular kernel.
class ARROW_EXPORT KernelContext {
public:
- explicit KernelContext(ExecContext* exec_ctx) : exec_ctx_(exec_ctx), state_() {}
+ explicit KernelContext(ExecContext* exec_ctx) : exec_ctx_(exec_ctx), state_() {}
/// \brief Allocate buffer from the context's memory pool. The contents are
/// not initialized.
@@ -91,7 +91,7 @@ class ARROW_EXPORT KernelContext {
/// into pre-allocated memory if they are able, though for some kernels
/// (e.g. when a builder like StringBuilder must be employed) this may
/// not be possible.
-using ArrayKernelExec = std::function<Status(KernelContext*, const ExecBatch&, Datum*)>;
+using ArrayKernelExec = std::function<Status(KernelContext*, const ExecBatch&, Datum*)>;
/// \brief A type-checking interface to permit customizable validation rules
/// for use with InputType and KernelSignature. This is for scenarios where the
@@ -321,9 +321,9 @@ class ARROW_EXPORT OutputType {
this->resolver_ = other.resolver_;
}
- OutputType& operator=(const OutputType&) = default;
- OutputType& operator=(OutputType&&) = default;
-
+ OutputType& operator=(const OutputType&) = default;
+ OutputType& operator=(OutputType&&) = default;
+
/// \brief Return the shape and type of the expected output value of the
/// kernel given the value descriptors (shapes and types) of the input
/// arguments. The resolver may make use of state information kept in the
@@ -366,10 +366,10 @@ class ARROW_EXPORT OutputType {
/// \brief Holds the input types and output type of the kernel.
///
-/// VarArgs functions with a minimum of N arguments should pass up to N input types to be
-/// used to validate the input types of a function invocation. The first N-1 types
-/// will be matched against the first N-1 arguments, and the last type will be
-/// matched against the remaining arguments.
+/// VarArgs functions with a minimum of N arguments should pass up to N input types to be
+/// used to validate the input types of a function invocation. The first N-1 types
+/// will be matched against the first N-1 arguments, and the last type will be
+/// matched against the remaining arguments.
class ARROW_EXPORT KernelSignature {
public:
KernelSignature(std::vector<InputType> in_types, OutputType out_type,
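A short sketch of the varargs matching rule described in the comment above:

// Sketch: with is_varargs=true the last input type is recycled for every
// trailing argument (see KernelSignature::MatchesInputs in kernel.cc).
auto varargs_sig = KernelSignature::Make({InputType(utf8()), InputType(int64())},
                                         OutputType(int64()), /*is_varargs=*/true);
// Matches (utf8, int64), (utf8, int64, int64), and so on; per ToString() it
// renders as "varargs[utf8, int64] -> int64".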
@@ -500,8 +500,8 @@ struct KernelInitArgs {
};
/// \brief Common initializer function for all kernel types.
-using KernelInit = std::function<Result<std::unique_ptr<KernelState>>(
- KernelContext*, const KernelInitArgs&)>;
+using KernelInit = std::function<Result<std::unique_ptr<KernelState>>(
+ KernelContext*, const KernelInitArgs&)>;
/// \brief Base type for kernels. Contains the function signature and
/// optionally the state initialization function, along with some common
@@ -513,8 +513,8 @@ struct Kernel {
: signature(std::move(sig)), init(std::move(init)) {}
Kernel(std::vector<InputType> in_types, OutputType out_type, KernelInit init)
- : Kernel(KernelSignature::Make(std::move(in_types), std::move(out_type)),
- std::move(init)) {}
+ : Kernel(KernelSignature::Make(std::move(in_types), std::move(out_type)),
+ std::move(init)) {}
/// \brief The "signature" of the kernel containing the InputType input
/// argument validators and OutputType output type and shape resolver.
@@ -524,10 +524,10 @@ struct Kernel {
/// set up any options or state relevant for execution.
KernelInit init;
- /// \brief Create a vector of new KernelState for invocations of this kernel.
- static Status InitAll(KernelContext*, const KernelInitArgs&,
- std::vector<std::unique_ptr<KernelState>>*);
-
+ /// \brief Create a vector of new KernelState for invocations of this kernel.
+ static Status InitAll(KernelContext*, const KernelInitArgs&,
+ std::vector<std::unique_ptr<KernelState>>*);
+
/// \brief Indicates whether execution can benefit from parallelization
/// (splitting large chunks into smaller chunks and using multiple
/// threads). Some kernels may not support parallel execution at
@@ -547,7 +547,7 @@ struct Kernel {
/// output array values (as opposed to scalar values in the case of aggregate
/// functions).
struct ArrayKernel : public Kernel {
- ArrayKernel() = default;
+ ArrayKernel() = default;
ArrayKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec,
KernelInit init = NULLPTR)
@@ -555,8 +555,8 @@ struct ArrayKernel : public Kernel {
ArrayKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
KernelInit init = NULLPTR)
- : Kernel(std::move(in_types), std::move(out_type), std::move(init)),
- exec(std::move(exec)) {}
+ : Kernel(std::move(in_types), std::move(out_type), std::move(init)),
+ exec(std::move(exec)) {}
/// \brief Perform a single invocation of this kernel. Depending on the
/// implementation, it may only write into preallocated memory, while in some
@@ -588,7 +588,7 @@ struct ScalarKernel : public ArrayKernel {
// VectorKernel (for VectorFunction)
/// \brief See VectorKernel::finalize member for usage
-using VectorFinalize = std::function<Status(KernelContext*, std::vector<Datum>*)>;
+using VectorFinalize = std::function<Status(KernelContext*, std::vector<Datum>*)>;
/// \brief Kernel data structure for implementations of VectorFunction. In
/// addition to the members found in ArrayKernel, contains an optional
@@ -596,10 +596,10 @@ using VectorFinalize = std::function<Status(KernelContext*, std::vector<Datum>*)
/// (which have different defaults from ScalarKernel), and some other
/// execution-related options.
struct VectorKernel : public ArrayKernel {
- VectorKernel() = default;
+ VectorKernel() = default;
VectorKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec)
- : ArrayKernel(std::move(sig), std::move(exec)) {}
+ : ArrayKernel(std::move(sig), std::move(exec)) {}
VectorKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR)
@@ -643,13 +643,13 @@ struct VectorKernel : public ArrayKernel {
// ----------------------------------------------------------------------
// ScalarAggregateKernel (for ScalarAggregateFunction)
-using ScalarAggregateConsume = std::function<Status(KernelContext*, const ExecBatch&)>;
+using ScalarAggregateConsume = std::function<Status(KernelContext*, const ExecBatch&)>;
using ScalarAggregateMerge =
- std::function<Status(KernelContext*, KernelState&&, KernelState*)>;
+ std::function<Status(KernelContext*, KernelState&&, KernelState*)>;
// Finalize returns Datum to permit multiple return values
-using ScalarAggregateFinalize = std::function<Status(KernelContext*, Datum*)>;
+using ScalarAggregateFinalize = std::function<Status(KernelContext*, Datum*)>;
/// \brief Kernel data structure for implementations of
/// ScalarAggregateFunction. The four necessary components of an aggregation
@@ -662,12 +662,12 @@ using ScalarAggregateFinalize = std::function<Status(KernelContext*, Datum*)>;
/// * finalize: produces the end result of the aggregation using the
/// KernelState in the KernelContext.
struct ScalarAggregateKernel : public Kernel {
- ScalarAggregateKernel() = default;
+ ScalarAggregateKernel() = default;
ScalarAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
ScalarAggregateConsume consume, ScalarAggregateMerge merge,
ScalarAggregateFinalize finalize)
- : Kernel(std::move(sig), std::move(init)),
+ : Kernel(std::move(sig), std::move(init)),
consume(std::move(consume)),
merge(std::move(merge)),
finalize(std::move(finalize)) {}
@@ -675,65 +675,65 @@ struct ScalarAggregateKernel : public Kernel {
ScalarAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
KernelInit init, ScalarAggregateConsume consume,
ScalarAggregateMerge merge, ScalarAggregateFinalize finalize)
- : ScalarAggregateKernel(
- KernelSignature::Make(std::move(in_types), std::move(out_type)),
- std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {}
-
- /// \brief Merge a vector of KernelStates into a single KernelState.
- /// The merged state will be returned and will be set on the KernelContext.
- static Result<std::unique_ptr<KernelState>> MergeAll(
- const ScalarAggregateKernel* kernel, KernelContext* ctx,
- std::vector<std::unique_ptr<KernelState>> states);
-
+ : ScalarAggregateKernel(
+ KernelSignature::Make(std::move(in_types), std::move(out_type)),
+ std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {}
+
+ /// \brief Merge a vector of KernelStates into a single KernelState.
+ /// The merged state will be returned and will be set on the KernelContext.
+ static Result<std::unique_ptr<KernelState>> MergeAll(
+ const ScalarAggregateKernel* kernel, KernelContext* ctx,
+ std::vector<std::unique_ptr<KernelState>> states);
+
ScalarAggregateConsume consume;
ScalarAggregateMerge merge;
ScalarAggregateFinalize finalize;
};
-// ----------------------------------------------------------------------
-// HashAggregateKernel (for HashAggregateFunction)
-
-using HashAggregateConsume = std::function<Status(KernelContext*, const ExecBatch&)>;
-
-using HashAggregateMerge =
- std::function<Status(KernelContext*, KernelState&&, KernelState*)>;
-
-// Finalize returns Datum to permit multiple return values
-using HashAggregateFinalize = std::function<Status(KernelContext*, Datum*)>;
-
-/// \brief Kernel data structure for implementations of
-/// HashAggregateFunction. The four necessary components of an aggregation
-/// kernel are the init, consume, merge, and finalize functions.
-///
-/// * init: creates a new KernelState for a kernel.
-/// * consume: processes an ExecBatch (which includes the argument as well
-/// as an array of group identifiers) and updates the KernelState found in the
-/// KernelContext.
-/// * merge: combines one KernelState with another.
-/// * finalize: produces the end result of the aggregation using the
-/// KernelState in the KernelContext.
-struct HashAggregateKernel : public Kernel {
- HashAggregateKernel() = default;
-
- HashAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
- HashAggregateConsume consume, HashAggregateMerge merge,
- HashAggregateFinalize finalize)
- : Kernel(std::move(sig), std::move(init)),
- consume(std::move(consume)),
- merge(std::move(merge)),
- finalize(std::move(finalize)) {}
-
- HashAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
- KernelInit init, HashAggregateMerge merge,
- HashAggregateConsume consume, HashAggregateFinalize finalize)
- : HashAggregateKernel(
- KernelSignature::Make(std::move(in_types), std::move(out_type)),
- std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {}
-
- HashAggregateConsume consume;
- HashAggregateMerge merge;
- HashAggregateFinalize finalize;
-};
-
+// ----------------------------------------------------------------------
+// HashAggregateKernel (for HashAggregateFunction)
+
+using HashAggregateConsume = std::function<Status(KernelContext*, const ExecBatch&)>;
+
+using HashAggregateMerge =
+ std::function<Status(KernelContext*, KernelState&&, KernelState*)>;
+
+// Finalize returns Datum to permit multiple return values
+using HashAggregateFinalize = std::function<Status(KernelContext*, Datum*)>;
+
+/// \brief Kernel data structure for implementations of
+/// HashAggregateFunction. The four necessary components of an aggregation
+/// kernel are the init, consume, merge, and finalize functions.
+///
+/// * init: creates a new KernelState for a kernel.
+/// * consume: processes an ExecBatch (which includes the argument as well
+/// as an array of group identifiers) and updates the KernelState found in the
+/// KernelContext.
+/// * merge: combines one KernelState with another.
+/// * finalize: produces the end result of the aggregation using the
+/// KernelState in the KernelContext.
+struct HashAggregateKernel : public Kernel {
+ HashAggregateKernel() = default;
+
+ HashAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
+ HashAggregateConsume consume, HashAggregateMerge merge,
+ HashAggregateFinalize finalize)
+ : Kernel(std::move(sig), std::move(init)),
+ consume(std::move(consume)),
+ merge(std::move(merge)),
+ finalize(std::move(finalize)) {}
+
+ HashAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
+ KernelInit init, HashAggregateMerge merge,
+ HashAggregateConsume consume, HashAggregateFinalize finalize)
+ : HashAggregateKernel(
+ KernelSignature::Make(std::move(in_types), std::move(out_type)),
+ std::move(init), std::move(consume), std::move(merge), std::move(finalize)) {}
+
+ HashAggregateConsume consume;
+ HashAggregateMerge merge;
+ HashAggregateFinalize finalize;
+};
+
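For orientation, a stub wiring of the four callbacks; the lambdas do no real grouping, and the uint32 group-id column is an assumption about the ExecBatch layout:

// Hedged sketch: assembling a HashAggregateKernel from stub callbacks.
HashAggregateKernel MakeStubHashAggregateKernel() {
  auto sig = KernelSignature::Make({InputType(int64()), InputType(uint32())},
                                   OutputType(int64()));
  KernelInit init = [](KernelContext*, const KernelInitArgs&)
      -> Result<std::unique_ptr<KernelState>> {
    return ::arrow::internal::make_unique<KernelState>();
  };
  HashAggregateConsume consume = [](KernelContext*, const ExecBatch&) {
    // A real kernel would update per-group accumulators keyed by group id.
    return Status::OK();
  };
  HashAggregateMerge merge = [](KernelContext*, KernelState&&, KernelState*) {
    return Status::OK();
  };
  HashAggregateFinalize finalize = [](KernelContext*, Datum*) { return Status::OK(); };
  return HashAggregateKernel(std::move(sig), std::move(init), std::move(consume),
                             std::move(merge), std::move(finalize));
}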
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc
index 0c9636eae09..a7df66695b2 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic.cc
@@ -25,375 +25,375 @@
namespace arrow {
namespace compute {
-namespace {
-
-Status AggregateConsume(KernelContext* ctx, const ExecBatch& batch) {
- return checked_cast<ScalarAggregator*>(ctx->state())->Consume(ctx, batch);
+namespace {
+
+Status AggregateConsume(KernelContext* ctx, const ExecBatch& batch) {
+ return checked_cast<ScalarAggregator*>(ctx->state())->Consume(ctx, batch);
+}
+
+Status AggregateMerge(KernelContext* ctx, KernelState&& src, KernelState* dst) {
+ return checked_cast<ScalarAggregator*>(dst)->MergeFrom(ctx, std::move(src));
}
-Status AggregateMerge(KernelContext* ctx, KernelState&& src, KernelState* dst) {
- return checked_cast<ScalarAggregator*>(dst)->MergeFrom(ctx, std::move(src));
+Status AggregateFinalize(KernelContext* ctx, Datum* out) {
+ return checked_cast<ScalarAggregator*>(ctx->state())->Finalize(ctx, out);
}
-Status AggregateFinalize(KernelContext* ctx, Datum* out) {
- return checked_cast<ScalarAggregator*>(ctx->state())->Finalize(ctx, out);
+} // namespace
+
+void AddAggKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
+ ScalarAggregateFunction* func, SimdLevel::type simd_level) {
+ ScalarAggregateKernel kernel(std::move(sig), init, AggregateConsume, AggregateMerge,
+ AggregateFinalize);
+ // Set the simd level
+ kernel.simd_level = simd_level;
+ DCHECK_OK(func->AddKernel(kernel));
}
-} // namespace
-
-void AddAggKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
- ScalarAggregateFunction* func, SimdLevel::type simd_level) {
- ScalarAggregateKernel kernel(std::move(sig), init, AggregateConsume, AggregateMerge,
- AggregateFinalize);
- // Set the simd level
- kernel.simd_level = simd_level;
- DCHECK_OK(func->AddKernel(kernel));
-}
-
-namespace aggregate {
-
+namespace aggregate {
+
// ----------------------------------------------------------------------
// Count implementation
struct CountImpl : public ScalarAggregator {
- explicit CountImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
-
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- if (batch[0].is_array()) {
- const ArrayData& input = *batch[0].array();
- const int64_t nulls = input.GetNullCount();
- this->nulls += nulls;
- this->non_nulls += input.length - nulls;
- } else {
- const Scalar& input = *batch[0].scalar();
- this->nulls += !input.is_valid * batch.length;
- this->non_nulls += input.is_valid * batch.length;
- }
- return Status::OK();
- }
-
- Status MergeFrom(KernelContext*, KernelState&& src) override {
+ explicit CountImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (batch[0].is_array()) {
+ const ArrayData& input = *batch[0].array();
+ const int64_t nulls = input.GetNullCount();
+ this->nulls += nulls;
+ this->non_nulls += input.length - nulls;
+ } else {
+ const Scalar& input = *batch[0].scalar();
+ this->nulls += !input.is_valid * batch.length;
+ this->non_nulls += input.is_valid * batch.length;
+ }
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
const auto& other_state = checked_cast<const CountImpl&>(src);
this->non_nulls += other_state.non_nulls;
this->nulls += other_state.nulls;
- return Status::OK();
+ return Status::OK();
}
- Status Finalize(KernelContext* ctx, Datum* out) override {
+ Status Finalize(KernelContext* ctx, Datum* out) override {
const auto& state = checked_cast<const CountImpl&>(*ctx->state());
- if (state.options.skip_nulls) {
- *out = Datum(state.non_nulls);
- } else {
- *out = Datum(state.nulls);
+ if (state.options.skip_nulls) {
+ *out = Datum(state.non_nulls);
+ } else {
+ *out = Datum(state.nulls);
}
- return Status::OK();
+ return Status::OK();
}
- ScalarAggregateOptions options;
+ ScalarAggregateOptions options;
int64_t non_nulls = 0;
int64_t nulls = 0;
};
-Result<std::unique_ptr<KernelState>> CountInit(KernelContext*,
- const KernelInitArgs& args) {
+Result<std::unique_ptr<KernelState>> CountInit(KernelContext*,
+ const KernelInitArgs& args) {
return ::arrow::internal::make_unique<CountImpl>(
- static_cast<const ScalarAggregateOptions&>(*args.options));
+ static_cast<const ScalarAggregateOptions&>(*args.options));
}
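From the public API this init path is reached through the registered "count" function; a hedged usage sketch:

// Usage sketch (assumes <arrow/compute/api.h>):
arrow::Result<arrow::Datum> CountNonNulls(const std::shared_ptr<arrow::Array>& arr) {
  arrow::compute::ScalarAggregateOptions options(/*skip_nulls=*/true);
  // Dispatches to CountImpl above via CountInit; yields an Int64Scalar Datum.
  return arrow::compute::CallFunction("count", {arrow::Datum(arr)}, &options);
}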
// ----------------------------------------------------------------------
// Sum implementation
-template <typename ArrowType>
-struct SumImplDefault : public SumImpl<ArrowType, SimdLevel::NONE> {
- explicit SumImplDefault(const ScalarAggregateOptions& options_) {
- this->options = options_;
- }
+template <typename ArrowType>
+struct SumImplDefault : public SumImpl<ArrowType, SimdLevel::NONE> {
+ explicit SumImplDefault(const ScalarAggregateOptions& options_) {
+ this->options = options_;
+ }
};
-template <typename ArrowType>
-struct MeanImplDefault : public MeanImpl<ArrowType, SimdLevel::NONE> {
- explicit MeanImplDefault(const ScalarAggregateOptions& options_) {
- this->options = options_;
- }
+template <typename ArrowType>
+struct MeanImplDefault : public MeanImpl<ArrowType, SimdLevel::NONE> {
+ explicit MeanImplDefault(const ScalarAggregateOptions& options_) {
+ this->options = options_;
+ }
};
-Result<std::unique_ptr<KernelState>> SumInit(KernelContext* ctx,
- const KernelInitArgs& args) {
- SumLikeInit<SumImplDefault> visitor(
- ctx, *args.inputs[0].type,
- static_cast<const ScalarAggregateOptions&>(*args.options));
+Result<std::unique_ptr<KernelState>> SumInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ SumLikeInit<SumImplDefault> visitor(
+ ctx, *args.inputs[0].type,
+ static_cast<const ScalarAggregateOptions&>(*args.options));
return visitor.Create();
}
-Result<std::unique_ptr<KernelState>> MeanInit(KernelContext* ctx,
- const KernelInitArgs& args) {
- SumLikeInit<MeanImplDefault> visitor(
- ctx, *args.inputs[0].type,
- static_cast<const ScalarAggregateOptions&>(*args.options));
+Result<std::unique_ptr<KernelState>> MeanInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ SumLikeInit<MeanImplDefault> visitor(
+ ctx, *args.inputs[0].type,
+ static_cast<const ScalarAggregateOptions&>(*args.options));
return visitor.Create();
}
// ----------------------------------------------------------------------
// MinMax implementation
-Result<std::unique_ptr<KernelState>> MinMaxInit(KernelContext* ctx,
- const KernelInitArgs& args) {
+Result<std::unique_ptr<KernelState>> MinMaxInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
MinMaxInitState<SimdLevel::NONE> visitor(
ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(),
- static_cast<const ScalarAggregateOptions&>(*args.options));
+ static_cast<const ScalarAggregateOptions&>(*args.options));
return visitor.Create();
}
-// ----------------------------------------------------------------------
-// Any implementation
-
-struct BooleanAnyImpl : public ScalarAggregator {
- explicit BooleanAnyImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
-
- Status Consume(KernelContext*, const ExecBatch& batch) override {
-    // short-circuit if a true value has already been seen
- if (this->any == true) {
- return Status::OK();
- }
- if (batch[0].is_scalar()) {
- const auto& scalar = *batch[0].scalar();
- this->has_nulls = !scalar.is_valid;
- this->any = scalar.is_valid && checked_cast<const BooleanScalar&>(scalar).value;
- return Status::OK();
- }
- const auto& data = *batch[0].array();
- this->has_nulls = data.GetNullCount() > 0;
- arrow::internal::OptionalBinaryBitBlockCounter counter(
- data.buffers[0], data.offset, data.buffers[1], data.offset, data.length);
- int64_t position = 0;
- while (position < data.length) {
- const auto block = counter.NextAndBlock();
- if (block.popcount > 0) {
- this->any = true;
- break;
- }
- position += block.length;
- }
- return Status::OK();
- }
-
- Status MergeFrom(KernelContext*, KernelState&& src) override {
- const auto& other = checked_cast<const BooleanAnyImpl&>(src);
- this->any |= other.any;
- this->has_nulls |= other.has_nulls;
- return Status::OK();
- }
-
- Status Finalize(KernelContext* ctx, Datum* out) override {
- if (!options.skip_nulls && !this->any && this->has_nulls) {
- out->value = std::make_shared<BooleanScalar>();
- } else {
- out->value = std::make_shared<BooleanScalar>(this->any);
- }
- return Status::OK();
- }
-
- bool any = false;
- bool has_nulls = false;
- ScalarAggregateOptions options;
-};
-
-Result<std::unique_ptr<KernelState>> AnyInit(KernelContext*, const KernelInitArgs& args) {
- const ScalarAggregateOptions options =
- static_cast<const ScalarAggregateOptions&>(*args.options);
- return ::arrow::internal::make_unique<BooleanAnyImpl>(
- static_cast<const ScalarAggregateOptions&>(*args.options));
+// ----------------------------------------------------------------------
+// Any implementation
+
+struct BooleanAnyImpl : public ScalarAggregator {
+ explicit BooleanAnyImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+    // short-circuit if a true value has already been seen
+ if (this->any == true) {
+ return Status::OK();
+ }
+ if (batch[0].is_scalar()) {
+ const auto& scalar = *batch[0].scalar();
+ this->has_nulls = !scalar.is_valid;
+ this->any = scalar.is_valid && checked_cast<const BooleanScalar&>(scalar).value;
+ return Status::OK();
+ }
+ const auto& data = *batch[0].array();
+ this->has_nulls = data.GetNullCount() > 0;
+ arrow::internal::OptionalBinaryBitBlockCounter counter(
+ data.buffers[0], data.offset, data.buffers[1], data.offset, data.length);
+ int64_t position = 0;
+ while (position < data.length) {
+ const auto block = counter.NextAndBlock();
+ if (block.popcount > 0) {
+ this->any = true;
+ break;
+ }
+ position += block.length;
+ }
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const BooleanAnyImpl&>(src);
+ this->any |= other.any;
+ this->has_nulls |= other.has_nulls;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext* ctx, Datum* out) override {
+ if (!options.skip_nulls && !this->any && this->has_nulls) {
+ out->value = std::make_shared<BooleanScalar>();
+ } else {
+ out->value = std::make_shared<BooleanScalar>(this->any);
+ }
+ return Status::OK();
+ }
+
+ bool any = false;
+ bool has_nulls = false;
+ ScalarAggregateOptions options;
+};
+
+Result<std::unique_ptr<KernelState>> AnyInit(KernelContext*, const KernelInitArgs& args) {
+ const ScalarAggregateOptions options =
+ static_cast<const ScalarAggregateOptions&>(*args.options);
+ return ::arrow::internal::make_unique<BooleanAnyImpl>(
+ static_cast<const ScalarAggregateOptions&>(*args.options));
+}
+
+// ----------------------------------------------------------------------
+// All implementation
+
+struct BooleanAllImpl : public ScalarAggregator {
+ explicit BooleanAllImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+    // short-circuit if a false value has already been seen
+ if (this->all == false) {
+ return Status::OK();
+ }
+    // short-circuit if a null has already been seen
+ if (!options.skip_nulls && this->has_nulls) {
+ return Status::OK();
+ }
+ if (batch[0].is_scalar()) {
+ const auto& scalar = *batch[0].scalar();
+ this->has_nulls = !scalar.is_valid;
+ this->all = !scalar.is_valid || checked_cast<const BooleanScalar&>(scalar).value;
+ return Status::OK();
+ }
+ const auto& data = *batch[0].array();
+ this->has_nulls = data.GetNullCount() > 0;
+ arrow::internal::OptionalBinaryBitBlockCounter counter(
+ data.buffers[1], data.offset, data.buffers[0], data.offset, data.length);
+ int64_t position = 0;
+ while (position < data.length) {
+ const auto block = counter.NextOrNotBlock();
+ if (!block.AllSet()) {
+ this->all = false;
+ break;
+ }
+ position += block.length;
+ }
+
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const BooleanAllImpl&>(src);
+ this->all &= other.all;
+ this->has_nulls |= other.has_nulls;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ if (!options.skip_nulls && this->all && this->has_nulls) {
+ out->value = std::make_shared<BooleanScalar>();
+ } else {
+ out->value = std::make_shared<BooleanScalar>(this->all);
+ }
+ return Status::OK();
+ }
+
+ bool all = true;
+ bool has_nulls = false;
+ ScalarAggregateOptions options;
+};
+
+Result<std::unique_ptr<KernelState>> AllInit(KernelContext*, const KernelInitArgs& args) {
+ return ::arrow::internal::make_unique<BooleanAllImpl>(
+ static_cast<const ScalarAggregateOptions&>(*args.options));
}
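A hedged sketch of the Kleene behaviour both kernels implement when skip_nulls is false:

// Usage sketch (assumes <arrow/compute/api.h>):
arrow::Result<arrow::Datum> KleeneAny(const std::shared_ptr<arrow::Array>& bools) {
  arrow::compute::ScalarAggregateOptions opts(/*skip_nulls=*/false);
  // Under these options: any([false, null]) -> null and any([true, null]) -> true,
  // mirroring BooleanAnyImpl::Finalize above; "all" behaves dually.
  return arrow::compute::CallFunction("any", {arrow::Datum(bools)}, &opts);
}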
-// ----------------------------------------------------------------------
-// All implementation
-
-struct BooleanAllImpl : public ScalarAggregator {
- explicit BooleanAllImpl(ScalarAggregateOptions options) : options(std::move(options)) {}
-
- Status Consume(KernelContext*, const ExecBatch& batch) override {
-    // short-circuit if a false value has already been seen
- if (this->all == false) {
- return Status::OK();
- }
-    // short-circuit if a null has already been seen
- if (!options.skip_nulls && this->has_nulls) {
- return Status::OK();
- }
- if (batch[0].is_scalar()) {
- const auto& scalar = *batch[0].scalar();
- this->has_nulls = !scalar.is_valid;
- this->all = !scalar.is_valid || checked_cast<const BooleanScalar&>(scalar).value;
- return Status::OK();
- }
- const auto& data = *batch[0].array();
- this->has_nulls = data.GetNullCount() > 0;
- arrow::internal::OptionalBinaryBitBlockCounter counter(
- data.buffers[1], data.offset, data.buffers[0], data.offset, data.length);
- int64_t position = 0;
- while (position < data.length) {
- const auto block = counter.NextOrNotBlock();
- if (!block.AllSet()) {
- this->all = false;
- break;
- }
- position += block.length;
- }
-
- return Status::OK();
- }
-
- Status MergeFrom(KernelContext*, KernelState&& src) override {
- const auto& other = checked_cast<const BooleanAllImpl&>(src);
- this->all &= other.all;
- this->has_nulls |= other.has_nulls;
- return Status::OK();
- }
-
- Status Finalize(KernelContext*, Datum* out) override {
- if (!options.skip_nulls && this->all && this->has_nulls) {
- out->value = std::make_shared<BooleanScalar>();
- } else {
- out->value = std::make_shared<BooleanScalar>(this->all);
- }
- return Status::OK();
- }
-
- bool all = true;
- bool has_nulls = false;
- ScalarAggregateOptions options;
-};
-
-Result<std::unique_ptr<KernelState>> AllInit(KernelContext*, const KernelInitArgs& args) {
- return ::arrow::internal::make_unique<BooleanAllImpl>(
- static_cast<const ScalarAggregateOptions&>(*args.options));
-}
-
-// ----------------------------------------------------------------------
-// Index implementation
-
-template <typename ArgType>
-struct IndexImpl : public ScalarAggregator {
- using ArgValue = typename internal::GetViewType<ArgType>::T;
-
- explicit IndexImpl(IndexOptions options, KernelState* raw_state)
- : options(std::move(options)), seen(0), index(-1) {
- if (auto state = static_cast<IndexImpl<ArgType>*>(raw_state)) {
- seen = state->seen;
- index = state->index;
- }
- }
-
- Status Consume(KernelContext* ctx, const ExecBatch& batch) override {
- // short-circuit
- if (index >= 0 || !options.value->is_valid) {
- return Status::OK();
- }
-
- auto input = batch[0].array();
- seen = input->length;
- const ArgValue desired = internal::UnboxScalar<ArgType>::Unbox(*options.value);
- int64_t i = 0;
-
- ARROW_UNUSED(internal::VisitArrayValuesInline<ArgType>(
- *input,
- [&](ArgValue v) -> Status {
- if (v == desired) {
- index = i;
- return Status::Cancelled("Found");
- } else {
- ++i;
- return Status::OK();
- }
- },
- [&]() -> Status {
- ++i;
- return Status::OK();
- }));
-
- return Status::OK();
- }
-
- Status MergeFrom(KernelContext*, KernelState&& src) override {
- const auto& other = checked_cast<const IndexImpl&>(src);
- if (index < 0 && other.index >= 0) {
- index = seen + other.index;
- }
- seen += other.seen;
- return Status::OK();
- }
-
- Status Finalize(KernelContext*, Datum* out) override {
- out->value = std::make_shared<Int64Scalar>(index >= 0 ? index : -1);
- return Status::OK();
- }
-
- const IndexOptions options;
- int64_t seen = 0;
- int64_t index = -1;
-};
-
-struct IndexInit {
- std::unique_ptr<KernelState> state;
- KernelContext* ctx;
- const IndexOptions& options;
- const DataType& type;
-
- IndexInit(KernelContext* ctx, const IndexOptions& options, const DataType& type)
- : ctx(ctx), options(options), type(type) {}
-
- Status Visit(const DataType& type) {
- return Status::NotImplemented("Index kernel not implemented for ", type.ToString());
- }
-
- Status Visit(const BooleanType&) {
- state.reset(new IndexImpl<BooleanType>(options, ctx->state()));
- return Status::OK();
- }
-
- template <typename Type>
- enable_if_number<Type, Status> Visit(const Type&) {
- state.reset(new IndexImpl<Type>(options, ctx->state()));
- return Status::OK();
- }
-
- template <typename Type>
- enable_if_base_binary<Type, Status> Visit(const Type&) {
- state.reset(new IndexImpl<Type>(options, ctx->state()));
- return Status::OK();
- }
-
- template <typename Type>
- enable_if_date<Type, Status> Visit(const Type&) {
- state.reset(new IndexImpl<Type>(options, ctx->state()));
- return Status::OK();
- }
-
- template <typename Type>
- enable_if_time<Type, Status> Visit(const Type&) {
- state.reset(new IndexImpl<Type>(options, ctx->state()));
- return Status::OK();
- }
-
- template <typename Type>
- enable_if_timestamp<Type, Status> Visit(const Type&) {
- state.reset(new IndexImpl<Type>(options, ctx->state()));
- return Status::OK();
- }
-
- Result<std::unique_ptr<KernelState>> Create() {
- RETURN_NOT_OK(VisitTypeInline(type, this));
- return std::move(state);
- }
-
- static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
- const KernelInitArgs& args) {
- IndexInit visitor(ctx, static_cast<const IndexOptions&>(*args.options),
- *args.inputs[0].type);
- return visitor.Create();
- }
-};
-
+// ----------------------------------------------------------------------
+// Index implementation
+
+template <typename ArgType>
+struct IndexImpl : public ScalarAggregator {
+ using ArgValue = typename internal::GetViewType<ArgType>::T;
+
+ explicit IndexImpl(IndexOptions options, KernelState* raw_state)
+ : options(std::move(options)), seen(0), index(-1) {
+ if (auto state = static_cast<IndexImpl<ArgType>*>(raw_state)) {
+ seen = state->seen;
+ index = state->index;
+ }
+ }
+
+ Status Consume(KernelContext* ctx, const ExecBatch& batch) override {
+ // short-circuit
+ if (index >= 0 || !options.value->is_valid) {
+ return Status::OK();
+ }
+
+ auto input = batch[0].array();
+ seen = input->length;
+ const ArgValue desired = internal::UnboxScalar<ArgType>::Unbox(*options.value);
+ int64_t i = 0;
+
+ ARROW_UNUSED(internal::VisitArrayValuesInline<ArgType>(
+ *input,
+ [&](ArgValue v) -> Status {
+ if (v == desired) {
+ index = i;
+ return Status::Cancelled("Found");
+ } else {
+ ++i;
+ return Status::OK();
+ }
+ },
+ [&]() -> Status {
+ ++i;
+ return Status::OK();
+ }));
+
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const IndexImpl&>(src);
+ if (index < 0 && other.index >= 0) {
+ index = seen + other.index;
+ }
+ seen += other.seen;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ out->value = std::make_shared<Int64Scalar>(index >= 0 ? index : -1);
+ return Status::OK();
+ }
+
+ const IndexOptions options;
+ int64_t seen = 0;
+ int64_t index = -1;
+};
+
+struct IndexInit {
+ std::unique_ptr<KernelState> state;
+ KernelContext* ctx;
+ const IndexOptions& options;
+ const DataType& type;
+
+ IndexInit(KernelContext* ctx, const IndexOptions& options, const DataType& type)
+ : ctx(ctx), options(options), type(type) {}
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("Index kernel not implemented for ", type.ToString());
+ }
+
+ Status Visit(const BooleanType&) {
+ state.reset(new IndexImpl<BooleanType>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_number<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_base_binary<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_date<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_time<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ template <typename Type>
+ enable_if_timestamp<Type, Status> Visit(const Type&) {
+ state.reset(new IndexImpl<Type>(options, ctx->state()));
+ return Status::OK();
+ }
+
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(type, this));
+ return std::move(state);
+ }
+
+ static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ IndexInit visitor(ctx, static_cast<const IndexOptions&>(*args.options),
+ *args.inputs[0].type);
+ return visitor.Create();
+ }
+};
+
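A hedged usage sketch for the "index" kernel that IndexInit wires up:

// Usage sketch (assumes <arrow/compute/api.h>): first occurrence, or -1.
arrow::Result<arrow::Datum> FirstIndexOf(const std::shared_ptr<arrow::Array>& arr,
                                         const std::shared_ptr<arrow::Scalar>& value) {
  arrow::compute::IndexOptions options(value);
  // Returns an Int64Scalar, per IndexImpl::Finalize above.
  return arrow::compute::CallFunction("index", {arrow::Datum(arr)}, &options);
}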
void AddBasicAggKernels(KernelInit init,
const std::vector<std::shared_ptr<DataType>>& types,
std::shared_ptr<DataType> out_ty, ScalarAggregateFunction* func,
@@ -405,33 +405,33 @@ void AddBasicAggKernels(KernelInit init,
}
}
-void AddScalarAggKernels(KernelInit init,
- const std::vector<std::shared_ptr<DataType>>& types,
- std::shared_ptr<DataType> out_ty,
- ScalarAggregateFunction* func) {
- for (const auto& ty : types) {
- // scalar[InT] -> scalar[OutT]
- auto sig = KernelSignature::Make({InputType::Scalar(ty)}, ValueDescr::Scalar(out_ty));
- AddAggKernel(std::move(sig), init, func, SimdLevel::NONE);
- }
-}
-
-void AddArrayScalarAggKernels(KernelInit init,
- const std::vector<std::shared_ptr<DataType>>& types,
- std::shared_ptr<DataType> out_ty,
- ScalarAggregateFunction* func,
- SimdLevel::type simd_level = SimdLevel::NONE) {
- AddBasicAggKernels(init, types, out_ty, func, simd_level);
- AddScalarAggKernels(init, types, out_ty, func);
-}
-
+void AddScalarAggKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ std::shared_ptr<DataType> out_ty,
+ ScalarAggregateFunction* func) {
+ for (const auto& ty : types) {
+ // scalar[InT] -> scalar[OutT]
+ auto sig = KernelSignature::Make({InputType::Scalar(ty)}, ValueDescr::Scalar(out_ty));
+ AddAggKernel(std::move(sig), init, func, SimdLevel::NONE);
+ }
+}
+
+void AddArrayScalarAggKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ std::shared_ptr<DataType> out_ty,
+ ScalarAggregateFunction* func,
+ SimdLevel::type simd_level = SimdLevel::NONE) {
+ AddBasicAggKernels(init, types, out_ty, func, simd_level);
+ AddScalarAggKernels(init, types, out_ty, func);
+}
+
void AddMinMaxKernels(KernelInit init,
const std::vector<std::shared_ptr<DataType>>& types,
ScalarAggregateFunction* func, SimdLevel::type simd_level) {
for (const auto& ty : types) {
- // any[T] -> scalar[struct<min: T, max: T>]
+ // any[T] -> scalar[struct<min: T, max: T>]
auto out_ty = struct_({field("min", ty), field("max", ty)});
- auto sig = KernelSignature::Make({InputType(ty)}, ValueDescr::Scalar(out_ty));
+ auto sig = KernelSignature::Make({InputType(ty)}, ValueDescr::Scalar(out_ty));
AddAggKernel(std::move(sig), init, func, simd_level);
}
}
@@ -439,92 +439,92 @@ void AddMinMaxKernels(KernelInit init,
} // namespace aggregate
namespace internal {
-namespace {
-
-const FunctionDoc count_doc{"Count the number of null / non-null values",
- ("By default, only non-null values are counted.\n"
- "This can be changed through ScalarAggregateOptions."),
- {"array"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc sum_doc{
- "Compute the sum of a numeric array",
- ("Null values are ignored by default. Minimum count of non-null\n"
- "values can be set and null is returned if too few are present.\n"
- "This can be changed through ScalarAggregateOptions."),
- {"array"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc mean_doc{
- "Compute the mean of a numeric array",
- ("Null values are ignored by default. Minimum count of non-null\n"
- "values can be set and null is returned if too few are "
- "present.\nThis can be changed through ScalarAggregateOptions.\n"
- "The result is always computed as a double, regardless of the input types."),
- {"array"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numeric array",
- ("Null values are ignored by default.\n"
- "This can be changed through ScalarAggregateOptions."),
- {"array"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc any_doc{"Test whether any element in a boolean array evaluates to true",
- ("Null values are ignored by default.\n"
- "If null values are taken into account by setting "
- "ScalarAggregateOptions parameter skip_nulls = false then "
- "Kleene logic is used.\n"
- "See KleeneOr for more details on Kleene logic."),
- {"array"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc all_doc{"Test whether all elements in a boolean array evaluate to true",
- ("Null values are ignored by default.\n"
- "If null values are taken into account by setting "
- "ScalarAggregateOptions parameter skip_nulls = false then "
- "Kleene logic is used.\n"
- "See KleeneAnd for more details on Kleene logic."),
- {"array"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc index_doc{"Find the index of the first occurrence of a given value",
- ("The result is always computed as an int64_t, regardless\n"
- "of the offset type of the input array."),
- {"array"},
- "IndexOptions"};
-
-} // namespace
-
+namespace {
+
+const FunctionDoc count_doc{"Count the number of null / non-null values",
+ ("By default, only non-null values are counted.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc sum_doc{
+ "Compute the sum of a numeric array",
+ ("Null values are ignored by default. Minimum count of non-null\n"
+ "values can be set and null is returned if too few are present.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc mean_doc{
+ "Compute the mean of a numeric array",
+ ("Null values are ignored by default. Minimum count of non-null\n"
+ "values can be set and null is returned if too few are "
+ "present.\nThis can be changed through ScalarAggregateOptions.\n"
+ "The result is always computed as a double, regardless of the input types."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc min_max_doc{"Compute the minimum and maximum values of a numeric array",
+ ("Null values are ignored by default.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc any_doc{"Test whether any element in a boolean array evaluates to true",
+ ("Null values are ignored by default.\n"
+ "If null values are taken into account by setting "
+ "ScalarAggregateOptions parameter skip_nulls = false then "
+ "Kleene logic is used.\n"
+ "See KleeneOr for more details on Kleene logic."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc all_doc{"Test whether all elements in a boolean array evaluate to true",
+ ("Null values are ignored by default.\n"
+ "If null values are taken into account by setting "
+ "ScalarAggregateOptions parameter skip_nulls = false then "
+ "Kleene logic is used.\n"
+ "See KleeneAnd for more details on Kleene logic."),
+ {"array"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc index_doc{"Find the index of the first occurrence of a given value",
+ ("The result is always computed as an int64_t, regardless\n"
+ "of the offset type of the input array."),
+ {"array"},
+ "IndexOptions"};
+
+} // namespace
+
void RegisterScalarAggregateBasic(FunctionRegistry* registry) {
- static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
+ static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
+
+ auto func = std::make_shared<ScalarAggregateFunction>(
+ "count", Arity::Unary(), &count_doc, &default_scalar_aggregate_options);
- auto func = std::make_shared<ScalarAggregateFunction>(
- "count", Arity::Unary(), &count_doc, &default_scalar_aggregate_options);
-
// Takes any array input, outputs int64 scalar
InputType any_array(ValueDescr::ARRAY);
- AddAggKernel(KernelSignature::Make({any_array}, ValueDescr::Scalar(int64())),
- aggregate::CountInit, func.get());
- AddAggKernel(
- KernelSignature::Make({InputType(ValueDescr::SCALAR)}, ValueDescr::Scalar(int64())),
- aggregate::CountInit, func.get());
+ AddAggKernel(KernelSignature::Make({any_array}, ValueDescr::Scalar(int64())),
+ aggregate::CountInit, func.get());
+ AddAggKernel(
+ KernelSignature::Make({InputType(ValueDescr::SCALAR)}, ValueDescr::Scalar(int64())),
+ aggregate::CountInit, func.get());
DCHECK_OK(registry->AddFunction(std::move(func)));
- func = std::make_shared<ScalarAggregateFunction>("sum", Arity::Unary(), &sum_doc,
- &default_scalar_aggregate_options);
- aggregate::AddArrayScalarAggKernels(aggregate::SumInit, {boolean()}, int64(),
- func.get());
- aggregate::AddArrayScalarAggKernels(aggregate::SumInit, SignedIntTypes(), int64(),
- func.get());
- aggregate::AddArrayScalarAggKernels(aggregate::SumInit, UnsignedIntTypes(), uint64(),
- func.get());
- aggregate::AddArrayScalarAggKernels(aggregate::SumInit, FloatingPointTypes(), float64(),
- func.get());
+ func = std::make_shared<ScalarAggregateFunction>("sum", Arity::Unary(), &sum_doc,
+ &default_scalar_aggregate_options);
+ aggregate::AddArrayScalarAggKernels(aggregate::SumInit, {boolean()}, int64(),
+ func.get());
+ aggregate::AddArrayScalarAggKernels(aggregate::SumInit, SignedIntTypes(), int64(),
+ func.get());
+ aggregate::AddArrayScalarAggKernels(aggregate::SumInit, UnsignedIntTypes(), uint64(),
+ func.get());
+ aggregate::AddArrayScalarAggKernels(aggregate::SumInit, FloatingPointTypes(), float64(),
+ func.get());
// Add the SIMD variants for sum
-#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX512)
+#if defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX512)
auto cpu_info = arrow::internal::CpuInfo::GetInstance();
-#endif
+#endif
#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX2)) {
aggregate::AddSumAvx2AggKernels(func.get());
@@ -537,12 +537,12 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) {
#endif
DCHECK_OK(registry->AddFunction(std::move(func)));
- func = std::make_shared<ScalarAggregateFunction>("mean", Arity::Unary(), &mean_doc,
- &default_scalar_aggregate_options);
- aggregate::AddArrayScalarAggKernels(aggregate::MeanInit, {boolean()}, float64(),
- func.get());
- aggregate::AddArrayScalarAggKernels(aggregate::MeanInit, NumericTypes(), float64(),
- func.get());
+ func = std::make_shared<ScalarAggregateFunction>("mean", Arity::Unary(), &mean_doc,
+ &default_scalar_aggregate_options);
+ aggregate::AddArrayScalarAggKernels(aggregate::MeanInit, {boolean()}, float64(),
+ func.get());
+ aggregate::AddArrayScalarAggKernels(aggregate::MeanInit, NumericTypes(), float64(),
+ func.get());
// Add the SIMD variants for mean
#if defined(ARROW_HAVE_RUNTIME_AVX2)
if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX2)) {
@@ -556,8 +556,8 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) {
#endif
DCHECK_OK(registry->AddFunction(std::move(func)));
- func = std::make_shared<ScalarAggregateFunction>(
- "min_max", Arity::Unary(), &min_max_doc, &default_scalar_aggregate_options);
+ func = std::make_shared<ScalarAggregateFunction>(
+ "min_max", Arity::Unary(), &min_max_doc, &default_scalar_aggregate_options);
aggregate::AddMinMaxKernels(aggregate::MinMaxInit, {boolean()}, func.get());
aggregate::AddMinMaxKernels(aggregate::MinMaxInit, NumericTypes(), func.get());
// Add the SIMD variants for min max
@@ -574,29 +574,29 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::move(func)));
- // any
- func = std::make_shared<ScalarAggregateFunction>("any", Arity::Unary(), &any_doc,
- &default_scalar_aggregate_options);
- aggregate::AddArrayScalarAggKernels(aggregate::AnyInit, {boolean()}, boolean(),
- func.get());
- DCHECK_OK(registry->AddFunction(std::move(func)));
-
- // all
- func = std::make_shared<ScalarAggregateFunction>("all", Arity::Unary(), &all_doc,
- &default_scalar_aggregate_options);
- aggregate::AddArrayScalarAggKernels(aggregate::AllInit, {boolean()}, boolean(),
- func.get());
- DCHECK_OK(registry->AddFunction(std::move(func)));
-
- // index
- func = std::make_shared<ScalarAggregateFunction>("index", Arity::Unary(), &index_doc);
- aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, BaseBinaryTypes(), int64(),
- func.get());
- aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, PrimitiveTypes(), int64(),
- func.get());
- aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, TemporalTypes(), int64(),
- func.get());
- DCHECK_OK(registry->AddFunction(std::move(func)));
+ // any
+ func = std::make_shared<ScalarAggregateFunction>("any", Arity::Unary(), &any_doc,
+ &default_scalar_aggregate_options);
+ aggregate::AddArrayScalarAggKernels(aggregate::AnyInit, {boolean()}, boolean(),
+ func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+
+ // all
+ func = std::make_shared<ScalarAggregateFunction>("all", Arity::Unary(), &all_doc,
+ &default_scalar_aggregate_options);
+ aggregate::AddArrayScalarAggKernels(aggregate::AllInit, {boolean()}, boolean(),
+ func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+
+ // index
+ func = std::make_shared<ScalarAggregateFunction>("index", Arity::Unary(), &index_doc);
+ aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, BaseBinaryTypes(), int64(),
+ func.get());
+ aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, PrimitiveTypes(), int64(),
+ func.get());
+ aggregate::AddBasicAggKernels(aggregate::IndexInit::Init, TemporalTypes(), int64(),
+ func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
}
} // namespace internal
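A hedged usage sketch against the registry populated above; "min_max" yields a StructScalar with "min" and "max" fields, matching the signature registered in AddMinMaxKernels:

// Usage sketch (assumes <arrow/compute/api.h>):
arrow::Result<arrow::Datum> MinMaxOf(const std::shared_ptr<arrow::Array>& arr) {
  arrow::compute::ScalarAggregateOptions opts(/*skip_nulls=*/true);
  return arrow::compute::CallFunction("min_max", {arrow::Datum(arr)}, &opts);
}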
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
index 4b1ae8d3d6c..5163d3fd03d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_basic_internal.h
@@ -51,68 +51,68 @@ void AddMinMaxAvx512AggKernels(ScalarAggregateFunction* func);
// ----------------------------------------------------------------------
// Sum implementation
-template <typename ArrowType, SimdLevel::type SimdLevel>
-struct SumImpl : public ScalarAggregator {
- using ThisType = SumImpl<ArrowType, SimdLevel>;
- using CType = typename ArrowType::c_type;
+template <typename ArrowType, SimdLevel::type SimdLevel>
+struct SumImpl : public ScalarAggregator {
+ using ThisType = SumImpl<ArrowType, SimdLevel>;
+ using CType = typename ArrowType::c_type;
using SumType = typename FindAccumulatorType<ArrowType>::Type;
- using OutputType = typename TypeTraits<SumType>::ScalarType;
-
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- if (batch[0].is_array()) {
- const auto& data = batch[0].array();
- this->count += data->length - data->GetNullCount();
- if (is_boolean_type<ArrowType>::value) {
- this->sum +=
- static_cast<typename SumType::c_type>(BooleanArray(data).true_count());
- } else {
- this->sum +=
- arrow::compute::detail::SumArray<CType, typename SumType::c_type, SimdLevel>(
- *data);
- }
+ using OutputType = typename TypeTraits<SumType>::ScalarType;
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (batch[0].is_array()) {
+ const auto& data = batch[0].array();
+ this->count += data->length - data->GetNullCount();
+ if (is_boolean_type<ArrowType>::value) {
+ this->sum +=
+ static_cast<typename SumType::c_type>(BooleanArray(data).true_count());
+ } else {
+ this->sum +=
+ arrow::compute::detail::SumArray<CType, typename SumType::c_type, SimdLevel>(
+ *data);
+ }
} else {
- const auto& data = *batch[0].scalar();
- this->count += data.is_valid * batch.length;
- if (data.is_valid) {
- this->sum += internal::UnboxScalar<ArrowType>::Unbox(data) * batch.length;
+ const auto& data = *batch[0].scalar();
+ this->count += data.is_valid * batch.length;
+ if (data.is_valid) {
+ this->sum += internal::UnboxScalar<ArrowType>::Unbox(data) * batch.length;
}
}
- return Status::OK();
+ return Status::OK();
}
- Status MergeFrom(KernelContext*, KernelState&& src) override {
- const auto& other = checked_cast<const ThisType&>(src);
- this->count += other.count;
- this->sum += other.sum;
- return Status::OK();
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const ThisType&>(src);
+ this->count += other.count;
+ this->sum += other.sum;
+ return Status::OK();
}
- Status Finalize(KernelContext*, Datum* out) override {
- if (this->count < options.min_count) {
- out->value = std::make_shared<OutputType>();
+ Status Finalize(KernelContext*, Datum* out) override {
+ if (this->count < options.min_count) {
+ out->value = std::make_shared<OutputType>();
} else {
- out->value = MakeScalar(this->sum);
+ out->value = MakeScalar(this->sum);
}
- return Status::OK();
+ return Status::OK();
}
size_t count = 0;
typename SumType::c_type sum = 0;
- ScalarAggregateOptions options;
+ ScalarAggregateOptions options;
};
-template <typename ArrowType, SimdLevel::type SimdLevel>
-struct MeanImpl : public SumImpl<ArrowType, SimdLevel> {
- Status Finalize(KernelContext*, Datum* out) override {
- if (this->count < options.min_count) {
- out->value = std::make_shared<DoubleScalar>();
+template <typename ArrowType, SimdLevel::type SimdLevel>
+struct MeanImpl : public SumImpl<ArrowType, SimdLevel> {
+ Status Finalize(KernelContext*, Datum* out) override {
+ if (this->count < options.min_count) {
+ out->value = std::make_shared<DoubleScalar>();
} else {
- const double mean = static_cast<double>(this->sum) / this->count;
- out->value = std::make_shared<DoubleScalar>(mean);
+ const double mean = static_cast<double>(this->sum) / this->count;
+ out->value = std::make_shared<DoubleScalar>(mean);
}
- return Status::OK();
+ return Status::OK();
}
- ScalarAggregateOptions options;
+ ScalarAggregateOptions options;
};
template <template <typename> class KernelClass>
@@ -120,11 +120,11 @@ struct SumLikeInit {
std::unique_ptr<KernelState> state;
KernelContext* ctx;
const DataType& type;
- const ScalarAggregateOptions& options;
+ const ScalarAggregateOptions& options;
- SumLikeInit(KernelContext* ctx, const DataType& type,
- const ScalarAggregateOptions& options)
- : ctx(ctx), type(type), options(options) {}
+ SumLikeInit(KernelContext* ctx, const DataType& type,
+ const ScalarAggregateOptions& options)
+ : ctx(ctx), type(type), options(options) {}
Status Visit(const DataType&) { return Status::NotImplemented("No sum implemented"); }
@@ -133,18 +133,18 @@ struct SumLikeInit {
}
Status Visit(const BooleanType&) {
- state.reset(new KernelClass<BooleanType>(options));
+ state.reset(new KernelClass<BooleanType>(options));
return Status::OK();
}
template <typename Type>
enable_if_number<Type, Status> Visit(const Type&) {
- state.reset(new KernelClass<Type>(options));
+ state.reset(new KernelClass<Type>(options));
return Status::OK();
}
- Result<std::unique_ptr<KernelState>> Create() {
- RETURN_NOT_OK(VisitTypeInline(type, this));
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(type, this));
return std::move(state);
}
};
@@ -233,42 +233,42 @@ struct MinMaxImpl : public ScalarAggregator {
using ThisType = MinMaxImpl<ArrowType, SimdLevel>;
using StateType = MinMaxState<ArrowType, SimdLevel>;
- MinMaxImpl(const std::shared_ptr<DataType>& out_type,
- const ScalarAggregateOptions& options)
+ MinMaxImpl(const std::shared_ptr<DataType>& out_type,
+ const ScalarAggregateOptions& options)
: out_type(out_type), options(options) {}
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- if (batch[0].is_array()) {
- return ConsumeArray(ArrayType(batch[0].array()));
- }
- return ConsumeScalar(*batch[0].scalar());
- }
-
- Status ConsumeScalar(const Scalar& scalar) {
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (batch[0].is_array()) {
+ return ConsumeArray(ArrayType(batch[0].array()));
+ }
+ return ConsumeScalar(*batch[0].scalar());
+ }
+
+ Status ConsumeScalar(const Scalar& scalar) {
+ StateType local;
+ local.has_nulls = !scalar.is_valid;
+ local.has_values = scalar.is_valid;
+
+ if (local.has_nulls && !options.skip_nulls) {
+ this->state = local;
+ return Status::OK();
+ }
+
+ local.MergeOne(internal::UnboxScalar<ArrowType>::Unbox(scalar));
+ this->state = local;
+ return Status::OK();
+ }
+
+ Status ConsumeArray(const ArrayType& arr) {
StateType local;
- local.has_nulls = !scalar.is_valid;
- local.has_values = scalar.is_valid;
-
- if (local.has_nulls && !options.skip_nulls) {
- this->state = local;
- return Status::OK();
- }
-
- local.MergeOne(internal::UnboxScalar<ArrowType>::Unbox(scalar));
- this->state = local;
- return Status::OK();
- }
-
- Status ConsumeArray(const ArrayType& arr) {
- StateType local;
-
+
const auto null_count = arr.null_count();
local.has_nulls = null_count > 0;
local.has_values = (arr.length() - null_count) > 0;
- if (local.has_nulls && !options.skip_nulls) {
+ if (local.has_nulls && !options.skip_nulls) {
this->state = local;
- return Status::OK();
+ return Status::OK();
}
if (local.has_nulls) {
@@ -279,32 +279,32 @@ struct MinMaxImpl : public ScalarAggregator {
}
}
this->state = local;
- return Status::OK();
+ return Status::OK();
}
- Status MergeFrom(KernelContext*, KernelState&& src) override {
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
const auto& other = checked_cast<const ThisType&>(src);
this->state += other.state;
- return Status::OK();
+ return Status::OK();
}
- Status Finalize(KernelContext*, Datum* out) override {
+ Status Finalize(KernelContext*, Datum* out) override {
using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
std::vector<std::shared_ptr<Scalar>> values;
- if (!state.has_values || (state.has_nulls && !options.skip_nulls)) {
+ if (!state.has_values || (state.has_nulls && !options.skip_nulls)) {
// (null, null)
values = {std::make_shared<ScalarType>(), std::make_shared<ScalarType>()};
} else {
values = {std::make_shared<ScalarType>(state.min),
std::make_shared<ScalarType>(state.max)};
}
- out->value = std::make_shared<StructScalar>(std::move(values), this->out_type);
- return Status::OK();
+ out->value = std::make_shared<StructScalar>(std::move(values), this->out_type);
+ return Status::OK();
}
std::shared_ptr<DataType> out_type;
- ScalarAggregateOptions options;
+ ScalarAggregateOptions options;
MinMaxState<ArrowType, SimdLevel> state;
private:
@@ -373,10 +373,10 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
using MinMaxImpl<BooleanType, SimdLevel>::MinMaxImpl;
using MinMaxImpl<BooleanType, SimdLevel>::options;
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- if (ARROW_PREDICT_FALSE(batch[0].is_scalar())) {
- return ConsumeScalar(checked_cast<const BooleanScalar&>(*batch[0].scalar()));
- }
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (ARROW_PREDICT_FALSE(batch[0].is_scalar())) {
+ return ConsumeScalar(checked_cast<const BooleanScalar&>(*batch[0].scalar()));
+ }
StateType local;
ArrayType arr(batch[0].array());
@@ -386,9 +386,9 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
local.has_nulls = null_count > 0;
local.has_values = valid_count > 0;
- if (local.has_nulls && !options.skip_nulls) {
+ if (local.has_nulls && !options.skip_nulls) {
this->state = local;
- return Status::OK();
+ return Status::OK();
}
const auto true_count = arr.true_count();
@@ -397,27 +397,27 @@ struct BooleanMinMaxImpl : public MinMaxImpl<BooleanType, SimdLevel> {
local.min = false_count == 0;
this->state = local;
- return Status::OK();
+ return Status::OK();
+ }
+
+ Status ConsumeScalar(const BooleanScalar& scalar) {
+ StateType local;
+
+ local.has_nulls = !scalar.is_valid;
+ local.has_values = scalar.is_valid;
+ if (local.has_nulls && !options.skip_nulls) {
+ this->state = local;
+ return Status::OK();
+ }
+
+ const int true_count = scalar.is_valid && scalar.value;
+ const int false_count = scalar.is_valid && !scalar.value;
+ local.max = true_count > 0;
+ local.min = false_count == 0;
+
+ this->state = local;
+ return Status::OK();
}
-
- Status ConsumeScalar(const BooleanScalar& scalar) {
- StateType local;
-
- local.has_nulls = !scalar.is_valid;
- local.has_values = scalar.is_valid;
- if (local.has_nulls && !options.skip_nulls) {
- this->state = local;
- return Status::OK();
- }
-
- const int true_count = scalar.is_valid && scalar.value;
- const int false_count = scalar.is_valid && !scalar.value;
- local.max = true_count > 0;
- local.min = false_count == 0;
-
- this->state = local;
- return Status::OK();
- }
};
template <SimdLevel::type SimdLevel>
@@ -426,11 +426,11 @@ struct MinMaxInitState {
KernelContext* ctx;
const DataType& in_type;
const std::shared_ptr<DataType>& out_type;
- const ScalarAggregateOptions& options;
+ const ScalarAggregateOptions& options;
MinMaxInitState(KernelContext* ctx, const DataType& in_type,
- const std::shared_ptr<DataType>& out_type,
- const ScalarAggregateOptions& options)
+ const std::shared_ptr<DataType>& out_type,
+ const ScalarAggregateOptions& options)
: ctx(ctx), in_type(in_type), out_type(out_type), options(options) {}
Status Visit(const DataType&) {
@@ -452,8 +452,8 @@ struct MinMaxInitState {
return Status::OK();
}
- Result<std::unique_ptr<KernelState>> Create() {
- RETURN_NOT_OK(VisitTypeInline(in_type, this));
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(in_type, this));
return std::move(state);
}
};
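
SumImpl and MinMaxImpl above both follow the three-phase ScalarAggregator contract: Consume folds one batch into local state, MergeFrom combines states built independently (for example on different threads), and Finalize emits a single result. A self-contained toy illustration of that lifecycle, with plain types standing in for Arrow's internal ones:

#include <cstdint>
#include <vector>

// Toy model (not Arrow's internal API) of the Consume/MergeFrom/Finalize
// lifecycle used by the aggregators in this header.
struct ToyCountAggregator {
  int64_t count = 0;

  // Consume: fold one batch into local state.
  void Consume(const std::vector<int64_t>& batch) {
    count += static_cast<int64_t>(batch.size());
  }
  // MergeFrom: absorb a state produced elsewhere (e.g. another thread).
  void MergeFrom(const ToyCountAggregator& other) { count += other.count; }
  // Finalize: emit the result exactly once.
  int64_t Finalize() const { return count; }
};
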
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h
index d72cdb14941..ed29f26f2c3 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_internal.h
@@ -19,8 +19,8 @@
#include "arrow/type.h"
#include "arrow/type_traits.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/logging.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/logging.h"
namespace arrow {
namespace compute {
@@ -49,124 +49,124 @@ struct FindAccumulatorType<I, enable_if_floating_point<I>> {
using Type = DoubleType;
};
-struct ScalarAggregator : public KernelState {
- virtual Status Consume(KernelContext* ctx, const ExecBatch& batch) = 0;
- virtual Status MergeFrom(KernelContext* ctx, KernelState&& src) = 0;
- virtual Status Finalize(KernelContext* ctx, Datum* out) = 0;
-};
-
-void AddAggKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
- ScalarAggregateFunction* func,
- SimdLevel::type simd_level = SimdLevel::NONE);
-
-namespace detail {
-
-using arrow::internal::VisitSetBitRunsVoid;
-
-// SumArray must be parameterized with the SIMD level since it's called both from
-// translation units with and without vectorization. Normally it gets inlined but
-// if not, without the parameter, we'll have multiple definitions of the same
-// symbol and we'll get unexpected results.
-
-// non-recursive pairwise summation for floating points
-// https://en.wikipedia.org/wiki/Pairwise_summation
-template <typename ValueType, typename SumType, SimdLevel::type SimdLevel,
- typename ValueFunc>
-enable_if_t<std::is_floating_point<SumType>::value, SumType> SumArray(
- const ArrayData& data, ValueFunc&& func) {
- const int64_t data_size = data.length - data.GetNullCount();
- if (data_size == 0) {
- return 0;
- }
-
- // number of inputs to accumulate before merging with another block
- constexpr int kBlockSize = 16; // same as numpy
- // levels (tree depth) = ceil(log2(len)) + 1, a bit larger than necessary
- const int levels = BitUtil::Log2(static_cast<uint64_t>(data_size)) + 1;
- // temporary summation per level
- std::vector<SumType> sum(levels);
- // whether two summations are ready and should be reduced to upper level
- // one bit for each level, bit0 -> level0, ...
- uint64_t mask = 0;
- // level of root node holding the final summation
- int root_level = 0;
-
- // reduce summation of one block (may be smaller than kBlockSize) from leaf node
- // continue reducing to upper level if two summations are ready for non-leaf node
- auto reduce = [&](SumType block_sum) {
- int cur_level = 0;
- uint64_t cur_level_mask = 1ULL;
- sum[cur_level] += block_sum;
- mask ^= cur_level_mask;
- while ((mask & cur_level_mask) == 0) {
- block_sum = sum[cur_level];
- sum[cur_level] = 0;
- ++cur_level;
- DCHECK_LT(cur_level, levels);
- cur_level_mask <<= 1;
- sum[cur_level] += block_sum;
- mask ^= cur_level_mask;
- }
- root_level = std::max(root_level, cur_level);
- };
-
- const ValueType* values = data.GetValues<ValueType>(1);
- VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
- [&](int64_t pos, int64_t len) {
- const ValueType* v = &values[pos];
- // unsigned division by constant is cheaper than signed one
- const uint64_t blocks = static_cast<uint64_t>(len) / kBlockSize;
- const uint64_t remains = static_cast<uint64_t>(len) % kBlockSize;
-
- for (uint64_t i = 0; i < blocks; ++i) {
- SumType block_sum = 0;
- for (int j = 0; j < kBlockSize; ++j) {
- block_sum += func(v[j]);
- }
- reduce(block_sum);
- v += kBlockSize;
- }
-
- if (remains > 0) {
- SumType block_sum = 0;
- for (uint64_t i = 0; i < remains; ++i) {
- block_sum += func(v[i]);
- }
- reduce(block_sum);
- }
- });
-
- // reduce intermediate summations from all non-leaf nodes
- for (int i = 1; i <= root_level; ++i) {
- sum[i] += sum[i - 1];
- }
-
- return sum[root_level];
-}
-
-// naive summation for integers
-template <typename ValueType, typename SumType, SimdLevel::type SimdLevel,
- typename ValueFunc>
-enable_if_t<!std::is_floating_point<SumType>::value, SumType> SumArray(
- const ArrayData& data, ValueFunc&& func) {
- SumType sum = 0;
- const ValueType* values = data.GetValues<ValueType>(1);
- VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
- [&](int64_t pos, int64_t len) {
- for (int64_t i = 0; i < len; ++i) {
- sum += func(values[pos + i]);
- }
- });
- return sum;
-}
-
-template <typename ValueType, typename SumType, SimdLevel::type SimdLevel>
-SumType SumArray(const ArrayData& data) {
- return SumArray<ValueType, SumType, SimdLevel>(
- data, [](ValueType v) { return static_cast<SumType>(v); });
-}
-
-} // namespace detail
-
+struct ScalarAggregator : public KernelState {
+ virtual Status Consume(KernelContext* ctx, const ExecBatch& batch) = 0;
+ virtual Status MergeFrom(KernelContext* ctx, KernelState&& src) = 0;
+ virtual Status Finalize(KernelContext* ctx, Datum* out) = 0;
+};
+
+void AddAggKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
+ ScalarAggregateFunction* func,
+ SimdLevel::type simd_level = SimdLevel::NONE);
+
+namespace detail {
+
+using arrow::internal::VisitSetBitRunsVoid;
+
+// SumArray must be parameterized with the SIMD level since it's called both from
+// translation units with and without vectorization. Normally it gets inlined but
+// if not, without the parameter, we'll have multiple definitions of the same
+// symbol and we'll get unexpected results.
+
+// non-recursive pairwise summation for floating points
+// https://en.wikipedia.org/wiki/Pairwise_summation
+template <typename ValueType, typename SumType, SimdLevel::type SimdLevel,
+ typename ValueFunc>
+enable_if_t<std::is_floating_point<SumType>::value, SumType> SumArray(
+ const ArrayData& data, ValueFunc&& func) {
+ const int64_t data_size = data.length - data.GetNullCount();
+ if (data_size == 0) {
+ return 0;
+ }
+
+ // number of inputs to accumulate before merging with another block
+ constexpr int kBlockSize = 16; // same as numpy
+ // levels (tree depth) = ceil(log2(len)) + 1, a bit larger than necessary
+ const int levels = BitUtil::Log2(static_cast<uint64_t>(data_size)) + 1;
+ // temporary summation per level
+ std::vector<SumType> sum(levels);
+ // whether two summations are ready and should be reduced to upper level
+ // one bit for each level, bit0 -> level0, ...
+ uint64_t mask = 0;
+ // level of root node holding the final summation
+ int root_level = 0;
+
+ // reduce summation of one block (may be smaller than kBlockSize) from leaf node
+ // continue reducing to upper level if two summations are ready for non-leaf node
+ auto reduce = [&](SumType block_sum) {
+ int cur_level = 0;
+ uint64_t cur_level_mask = 1ULL;
+ sum[cur_level] += block_sum;
+ mask ^= cur_level_mask;
+ while ((mask & cur_level_mask) == 0) {
+ block_sum = sum[cur_level];
+ sum[cur_level] = 0;
+ ++cur_level;
+ DCHECK_LT(cur_level, levels);
+ cur_level_mask <<= 1;
+ sum[cur_level] += block_sum;
+ mask ^= cur_level_mask;
+ }
+ root_level = std::max(root_level, cur_level);
+ };
+
+ const ValueType* values = data.GetValues<ValueType>(1);
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ const ValueType* v = &values[pos];
+ // unsigned division by constant is cheaper than signed one
+ const uint64_t blocks = static_cast<uint64_t>(len) / kBlockSize;
+ const uint64_t remains = static_cast<uint64_t>(len) % kBlockSize;
+
+ for (uint64_t i = 0; i < blocks; ++i) {
+ SumType block_sum = 0;
+ for (int j = 0; j < kBlockSize; ++j) {
+ block_sum += func(v[j]);
+ }
+ reduce(block_sum);
+ v += kBlockSize;
+ }
+
+ if (remains > 0) {
+ SumType block_sum = 0;
+ for (uint64_t i = 0; i < remains; ++i) {
+ block_sum += func(v[i]);
+ }
+ reduce(block_sum);
+ }
+ });
+
+ // reduce intermediate summations from all non-leaf nodes
+ for (int i = 1; i <= root_level; ++i) {
+ sum[i] += sum[i - 1];
+ }
+
+ return sum[root_level];
+}
+
+// naive summation for integers
+template <typename ValueType, typename SumType, SimdLevel::type SimdLevel,
+ typename ValueFunc>
+enable_if_t<!std::is_floating_point<SumType>::value, SumType> SumArray(
+ const ArrayData& data, ValueFunc&& func) {
+ SumType sum = 0;
+ const ValueType* values = data.GetValues<ValueType>(1);
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ sum += func(values[pos + i]);
+ }
+ });
+ return sum;
+}
+
+template <typename ValueType, typename SumType, SimdLevel::type SimdLevel>
+SumType SumArray(const ArrayData& data) {
+ return SumArray<ValueType, SumType, SimdLevel>(
+ data, [](ValueType v) { return static_cast<SumType>(v); });
+}
+
+} // namespace detail
+
} // namespace compute
} // namespace arrow
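
The floating-point SumArray above implements pairwise summation iteratively, keeping one partial sum per tree level plus a readiness bit mask. The recursive form of the same algorithm is easier to follow; a minimal sketch (the function name is ours, kBlockSize matches the kernel's leaf size):

#include <cstddef>

// Recursive sketch of pairwise summation: sum small blocks naively, then
// combine halves in a balanced tree, so floating-point rounding error
// grows as O(log n) instead of O(n). The kernel above is the iterative,
// mask-driven equivalent.
double PairwiseSum(const double* v, std::size_t n) {
  constexpr std::size_t kBlockSize = 16;  // same leaf size as the kernel
  if (n <= kBlockSize) {
    double s = 0.0;
    for (std::size_t i = 0; i < n; ++i) s += v[i];
    return s;
  }
  const std::size_t half = n / 2;
  return PairwiseSum(v, half) + PairwiseSum(v + half, n - half);
}
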
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc
index b2659355ba9..6ad0eeb6456 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_mode.cc
@@ -16,377 +16,377 @@
// under the License.
#include <cmath>
-#include <queue>
-#include <utility>
+#include <queue>
+#include <utility>
-#include "arrow/compute/api_aggregate.h"
-#include "arrow/compute/kernels/aggregate_internal.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/kernels/util_internal.h"
-#include "arrow/result.h"
-#include "arrow/stl_allocator.h"
-#include "arrow/type_traits.h"
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/result.h"
+#include "arrow/stl_allocator.h"
+#include "arrow/type_traits.h"
namespace arrow {
namespace compute {
-namespace internal {
+namespace internal {
namespace {
-using ModeState = OptionsWrapper<ModeOptions>;
+using ModeState = OptionsWrapper<ModeOptions>;
-constexpr char kModeFieldName[] = "mode";
-constexpr char kCountFieldName[] = "count";
+constexpr char kModeFieldName[] = "mode";
+constexpr char kCountFieldName[] = "count";
-constexpr uint64_t kCountEOF = ~0ULL;
+constexpr uint64_t kCountEOF = ~0ULL;
-template <typename InType, typename CType = typename InType::c_type>
-Result<std::pair<CType*, int64_t*>> PrepareOutput(int64_t n, KernelContext* ctx,
- Datum* out) {
- const auto& mode_type = TypeTraits<InType>::type_singleton();
- const auto& count_type = int64();
+template <typename InType, typename CType = typename InType::c_type>
+Result<std::pair<CType*, int64_t*>> PrepareOutput(int64_t n, KernelContext* ctx,
+ Datum* out) {
+ const auto& mode_type = TypeTraits<InType>::type_singleton();
+ const auto& count_type = int64();
- auto mode_data = ArrayData::Make(mode_type, /*length=*/n, /*null_count=*/0);
- mode_data->buffers.resize(2, nullptr);
- auto count_data = ArrayData::Make(count_type, n, 0);
- count_data->buffers.resize(2, nullptr);
+ auto mode_data = ArrayData::Make(mode_type, /*length=*/n, /*null_count=*/0);
+ mode_data->buffers.resize(2, nullptr);
+ auto count_data = ArrayData::Make(count_type, n, 0);
+ count_data->buffers.resize(2, nullptr);
- CType* mode_buffer = nullptr;
- int64_t* count_buffer = nullptr;
-
- if (n > 0) {
- ARROW_ASSIGN_OR_RAISE(mode_data->buffers[1], ctx->Allocate(n * sizeof(CType)));
- ARROW_ASSIGN_OR_RAISE(count_data->buffers[1], ctx->Allocate(n * sizeof(int64_t)));
- mode_buffer = mode_data->template GetMutableValues<CType>(1);
- count_buffer = count_data->template GetMutableValues<int64_t>(1);
+ CType* mode_buffer = nullptr;
+ int64_t* count_buffer = nullptr;
+
+ if (n > 0) {
+ ARROW_ASSIGN_OR_RAISE(mode_data->buffers[1], ctx->Allocate(n * sizeof(CType)));
+ ARROW_ASSIGN_OR_RAISE(count_data->buffers[1], ctx->Allocate(n * sizeof(int64_t)));
+ mode_buffer = mode_data->template GetMutableValues<CType>(1);
+ count_buffer = count_data->template GetMutableValues<int64_t>(1);
}
- const auto& out_type =
- struct_({field(kModeFieldName, mode_type), field(kCountFieldName, count_type)});
- *out = Datum(ArrayData::Make(out_type, n, {nullptr}, {mode_data, count_data}, 0));
-
- return std::make_pair(mode_buffer, count_buffer);
+ const auto& out_type =
+ struct_({field(kModeFieldName, mode_type), field(kCountFieldName, count_type)});
+ *out = Datum(ArrayData::Make(out_type, n, {nullptr}, {mode_data, count_data}, 0));
+
+ return std::make_pair(mode_buffer, count_buffer);
}
-// find top-n value:count pairs with a min-heap
-// suboptimal for tiny or large n, possibly okay as we're not on the hot path
-template <typename InType, typename Generator>
-Status Finalize(KernelContext* ctx, Datum* out, Generator&& gen) {
- using CType = typename InType::c_type;
-
- using ValueCountPair = std::pair<CType, uint64_t>;
- auto gt = [](const ValueCountPair& lhs, const ValueCountPair& rhs) {
- const bool rhs_is_nan = rhs.first != rhs.first; // nan as largest value
- return lhs.second > rhs.second ||
- (lhs.second == rhs.second && (lhs.first < rhs.first || rhs_is_nan));
- };
-
- std::priority_queue<ValueCountPair, std::vector<ValueCountPair>, decltype(gt)> min_heap(
- std::move(gt));
-
- const ModeOptions& options = ModeState::Get(ctx);
- while (true) {
- const ValueCountPair& value_count = gen();
- DCHECK_NE(value_count.second, 0);
- if (value_count.second == kCountEOF) break;
- if (static_cast<int64_t>(min_heap.size()) < options.n) {
- min_heap.push(value_count);
- } else if (gt(value_count, min_heap.top())) {
- min_heap.pop();
- min_heap.push(value_count);
+// find top-n value:count pairs with a min-heap
+// suboptimal for tiny or large n, possibly okay as we're not on the hot path
+template <typename InType, typename Generator>
+Status Finalize(KernelContext* ctx, Datum* out, Generator&& gen) {
+ using CType = typename InType::c_type;
+
+ using ValueCountPair = std::pair<CType, uint64_t>;
+ auto gt = [](const ValueCountPair& lhs, const ValueCountPair& rhs) {
+ const bool rhs_is_nan = rhs.first != rhs.first; // nan as largest value
+ return lhs.second > rhs.second ||
+ (lhs.second == rhs.second && (lhs.first < rhs.first || rhs_is_nan));
+ };
+
+ std::priority_queue<ValueCountPair, std::vector<ValueCountPair>, decltype(gt)> min_heap(
+ std::move(gt));
+
+ const ModeOptions& options = ModeState::Get(ctx);
+ while (true) {
+ const ValueCountPair& value_count = gen();
+ DCHECK_NE(value_count.second, 0);
+ if (value_count.second == kCountEOF) break;
+ if (static_cast<int64_t>(min_heap.size()) < options.n) {
+ min_heap.push(value_count);
+ } else if (gt(value_count, min_heap.top())) {
+ min_heap.pop();
+ min_heap.push(value_count);
}
}
- const int64_t n = min_heap.size();
-
- CType* mode_buffer;
- int64_t* count_buffer;
- ARROW_ASSIGN_OR_RAISE(std::tie(mode_buffer, count_buffer),
- PrepareOutput<InType>(n, ctx, out));
-
- for (int64_t i = n - 1; i >= 0; --i) {
- std::tie(mode_buffer[i], count_buffer[i]) = min_heap.top();
- min_heap.pop();
- }
-
- return Status::OK();
+ const int64_t n = min_heap.size();
+
+ CType* mode_buffer;
+ int64_t* count_buffer;
+ ARROW_ASSIGN_OR_RAISE(std::tie(mode_buffer, count_buffer),
+ PrepareOutput<InType>(n, ctx, out));
+
+ for (int64_t i = n - 1; i >= 0; --i) {
+ std::tie(mode_buffer[i], count_buffer[i]) = min_heap.top();
+ min_heap.pop();
+ }
+
+ return Status::OK();
}
-// count value occurrences for integers with a narrow value range
-// O(1) space, O(n) time
-template <typename T>
-struct CountModer {
- using CType = typename T::c_type;
+// count value occurrences for integers with a narrow value range
+// O(1) space, O(n) time
+template <typename T>
+struct CountModer {
+ using CType = typename T::c_type;
- CType min;
- std::vector<uint64_t> counts;
+ CType min;
+ std::vector<uint64_t> counts;
- CountModer(CType min, CType max) {
- uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
- DCHECK_LT(value_range, 1 << 20);
- this->min = min;
- this->counts.resize(value_range, 0);
+ CountModer(CType min, CType max) {
+ uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+ DCHECK_LT(value_range, 1 << 20);
+ this->min = min;
+ this->counts.resize(value_range, 0);
}
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // count values in all chunks, ignore nulls
- const Datum& datum = batch[0];
- CountValues<CType>(this->counts.data(), datum, this->min);
-
- // generator to emit next value:count pair
- int index = 0;
- auto gen = [&]() {
- for (; index < static_cast<int>(counts.size()); ++index) {
- if (counts[index] != 0) {
- auto value_count =
- std::make_pair(static_cast<CType>(index + this->min), counts[index]);
- ++index;
- return value_count;
- }
- }
- return std::pair<CType, uint64_t>(0, kCountEOF);
- };
-
- return Finalize<T>(ctx, out, std::move(gen));
- }
-};
-
-// booleans can be handled more directly
-template <>
-struct CountModer<BooleanType> {
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- int64_t counts[2]{};
-
- const Datum& datum = batch[0];
- for (const auto& array : datum.chunks()) {
- if (array->length() > array->null_count()) {
- const int64_t true_count =
- arrow::internal::checked_pointer_cast<BooleanArray>(array)->true_count();
- const int64_t false_count = array->length() - array->null_count() - true_count;
- counts[true] += true_count;
- counts[false] += false_count;
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // count values in all chunks, ignore nulls
+ const Datum& datum = batch[0];
+ CountValues<CType>(this->counts.data(), datum, this->min);
+
+ // generator to emit next value:count pair
+ int index = 0;
+ auto gen = [&]() {
+ for (; index < static_cast<int>(counts.size()); ++index) {
+ if (counts[index] != 0) {
+ auto value_count =
+ std::make_pair(static_cast<CType>(index + this->min), counts[index]);
+ ++index;
+ return value_count;
+ }
+ }
+ return std::pair<CType, uint64_t>(0, kCountEOF);
+ };
+
+ return Finalize<T>(ctx, out, std::move(gen));
+ }
+};
+
+// booleans can be handled more directly
+template <>
+struct CountModer<BooleanType> {
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ int64_t counts[2]{};
+
+ const Datum& datum = batch[0];
+ for (const auto& array : datum.chunks()) {
+ if (array->length() > array->null_count()) {
+ const int64_t true_count =
+ arrow::internal::checked_pointer_cast<BooleanArray>(array)->true_count();
+ const int64_t false_count = array->length() - array->null_count() - true_count;
+ counts[true] += true_count;
+ counts[false] += false_count;
}
}
- const ModeOptions& options = ModeState::Get(ctx);
- const int64_t distinct_values = (counts[0] != 0) + (counts[1] != 0);
- const int64_t n = std::min(options.n, distinct_values);
-
- bool* mode_buffer;
- int64_t* count_buffer;
- ARROW_ASSIGN_OR_RAISE(std::tie(mode_buffer, count_buffer),
- PrepareOutput<BooleanType>(n, ctx, out));
-
- if (n >= 1) {
- const bool index = counts[1] > counts[0];
- mode_buffer[0] = index;
- count_buffer[0] = counts[index];
- if (n == 2) {
- mode_buffer[1] = !index;
- count_buffer[1] = counts[!index];
+ const ModeOptions& options = ModeState::Get(ctx);
+ const int64_t distinct_values = (counts[0] != 0) + (counts[1] != 0);
+ const int64_t n = std::min(options.n, distinct_values);
+
+ bool* mode_buffer;
+ int64_t* count_buffer;
+ ARROW_ASSIGN_OR_RAISE(std::tie(mode_buffer, count_buffer),
+ PrepareOutput<BooleanType>(n, ctx, out));
+
+ if (n >= 1) {
+ const bool index = counts[1] > counts[0];
+ mode_buffer[0] = index;
+ count_buffer[0] = counts[index];
+ if (n == 2) {
+ mode_buffer[1] = !index;
+ count_buffer[1] = counts[!index];
}
}
-
- return Status::OK();
+
+ return Status::OK();
}
};
-// copy-and-sort approach for floating-point values or integers with a wide value range
-// O(n) space, O(n log n) time
-template <typename T>
-struct SortModer {
- using CType = typename T::c_type;
- using Allocator = arrow::stl::allocator<CType>;
-
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // copy all chunks to a buffer, ignore nulls and nans
- std::vector<CType, Allocator> in_buffer(Allocator(ctx->memory_pool()));
-
- uint64_t nan_count = 0;
- const Datum& datum = batch[0];
- const int64_t in_length = datum.length() - datum.null_count();
- if (in_length > 0) {
- in_buffer.resize(in_length);
- CopyNonNullValues(datum, in_buffer.data());
-
- // drop nan
- if (is_floating_type<T>::value) {
- const auto& it = std::remove_if(in_buffer.begin(), in_buffer.end(),
- [](CType v) { return v != v; });
- nan_count = in_buffer.end() - it;
- in_buffer.resize(it - in_buffer.begin());
- }
- }
-
- // sort the input data to count same values
- std::sort(in_buffer.begin(), in_buffer.end());
-
- // generator to emit next value:count pair
- auto it = in_buffer.cbegin();
- auto gen = [&]() {
- if (ARROW_PREDICT_FALSE(it == in_buffer.cend())) {
-        // handle NaN last
- if (nan_count > 0) {
- auto value_count = std::make_pair(static_cast<CType>(NAN), nan_count);
- nan_count = 0;
- return value_count;
- }
- return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
- }
- // count same values
- const CType value = *it;
- uint64_t count = 0;
- do {
- ++it;
- ++count;
- } while (it != in_buffer.cend() && *it == value);
- return std::make_pair(value, count);
- };
-
- return Finalize<T>(ctx, out, std::move(gen));
+// copy-and-sort approach for floating-point values or integers with a wide value range
+// O(n) space, O(n log n) time
+template <typename T>
+struct SortModer {
+ using CType = typename T::c_type;
+ using Allocator = arrow::stl::allocator<CType>;
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // copy all chunks to a buffer, ignore nulls and nans
+ std::vector<CType, Allocator> in_buffer(Allocator(ctx->memory_pool()));
+
+ uint64_t nan_count = 0;
+ const Datum& datum = batch[0];
+ const int64_t in_length = datum.length() - datum.null_count();
+ if (in_length > 0) {
+ in_buffer.resize(in_length);
+ CopyNonNullValues(datum, in_buffer.data());
+
+ // drop nan
+ if (is_floating_type<T>::value) {
+ const auto& it = std::remove_if(in_buffer.begin(), in_buffer.end(),
+ [](CType v) { return v != v; });
+ nan_count = in_buffer.end() - it;
+ in_buffer.resize(it - in_buffer.begin());
+ }
+ }
+
+ // sort the input data to count same values
+ std::sort(in_buffer.begin(), in_buffer.end());
+
+ // generator to emit next value:count pair
+ auto it = in_buffer.cbegin();
+ auto gen = [&]() {
+ if (ARROW_PREDICT_FALSE(it == in_buffer.cend())) {
+        // handle NaN last
+ if (nan_count > 0) {
+ auto value_count = std::make_pair(static_cast<CType>(NAN), nan_count);
+ nan_count = 0;
+ return value_count;
+ }
+ return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
+ }
+ // count same values
+ const CType value = *it;
+ uint64_t count = 0;
+ do {
+ ++it;
+ ++count;
+ } while (it != in_buffer.cend() && *it == value);
+ return std::make_pair(value, count);
+ };
+
+ return Finalize<T>(ctx, out, std::move(gen));
}
-};
-
-// pick counting or sorting approach based on the integer value range
-template <typename T>
-struct CountOrSortModer {
- using CType = typename T::c_type;
-
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // cross point to benefit from counting approach
- // about 2x improvement for int32/64 from micro-benchmarking
- static constexpr int kMinArraySize = 8192;
- static constexpr int kMaxValueRange = 32768;
-
- const Datum& datum = batch[0];
- if (datum.length() - datum.null_count() >= kMinArraySize) {
- CType min, max;
- std::tie(min, max) = GetMinMax<CType>(datum);
-
- if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <= kMaxValueRange) {
- return CountModer<T>(min, max).Exec(ctx, batch, out);
- }
+};
+
+// pick counting or sorting approach based on the integer value range
+template <typename T>
+struct CountOrSortModer {
+ using CType = typename T::c_type;
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // cross point to benefit from counting approach
+ // about 2x improvement for int32/64 from micro-benchmarking
+ static constexpr int kMinArraySize = 8192;
+ static constexpr int kMaxValueRange = 32768;
+
+ const Datum& datum = batch[0];
+ if (datum.length() - datum.null_count() >= kMinArraySize) {
+ CType min, max;
+ std::tie(min, max) = GetMinMax<CType>(datum);
+
+ if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <= kMaxValueRange) {
+ return CountModer<T>(min, max).Exec(ctx, batch, out);
+ }
}
-
- return SortModer<T>().Exec(ctx, batch, out);
+
+ return SortModer<T>().Exec(ctx, batch, out);
}
-};
-
-template <typename InType, typename Enable = void>
-struct Moder;
-
-template <>
-struct Moder<Int8Type> {
- CountModer<Int8Type> impl;
- Moder() : impl(-128, 127) {}
};
-template <>
-struct Moder<UInt8Type> {
- CountModer<UInt8Type> impl;
- Moder() : impl(0, 255) {}
-};
-
-template <>
-struct Moder<BooleanType> {
- CountModer<BooleanType> impl;
-};
-
-template <typename InType>
-struct Moder<InType, enable_if_t<(is_integer_type<InType>::value &&
- (sizeof(typename InType::c_type) > 1))>> {
- CountOrSortModer<InType> impl;
-};
-
-template <typename InType>
-struct Moder<InType, enable_if_t<is_floating_type<InType>::value>> {
- SortModer<InType> impl;
-};
-
-template <typename T>
-Status ScalarMode(KernelContext* ctx, const Scalar& scalar, Datum* out) {
- using CType = typename T::c_type;
- if (scalar.is_valid) {
- bool called = false;
- return Finalize<T>(ctx, out, [&]() {
- if (!called) {
- called = true;
- return std::pair<CType, uint64_t>(UnboxScalar<T>::Unbox(scalar), 1);
- }
- return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
- });
+template <typename InType, typename Enable = void>
+struct Moder;
+
+template <>
+struct Moder<Int8Type> {
+ CountModer<Int8Type> impl;
+ Moder() : impl(-128, 127) {}
+};
+
+template <>
+struct Moder<UInt8Type> {
+ CountModer<UInt8Type> impl;
+ Moder() : impl(0, 255) {}
+};
+
+template <>
+struct Moder<BooleanType> {
+ CountModer<BooleanType> impl;
+};
+
+template <typename InType>
+struct Moder<InType, enable_if_t<(is_integer_type<InType>::value &&
+ (sizeof(typename InType::c_type) > 1))>> {
+ CountOrSortModer<InType> impl;
+};
+
+template <typename InType>
+struct Moder<InType, enable_if_t<is_floating_type<InType>::value>> {
+ SortModer<InType> impl;
+};
+
+template <typename T>
+Status ScalarMode(KernelContext* ctx, const Scalar& scalar, Datum* out) {
+ using CType = typename T::c_type;
+ if (scalar.is_valid) {
+ bool called = false;
+ return Finalize<T>(ctx, out, [&]() {
+ if (!called) {
+ called = true;
+ return std::pair<CType, uint64_t>(UnboxScalar<T>::Unbox(scalar), 1);
+ }
+ return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
+ });
}
- return Finalize<T>(ctx, out, []() {
- return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
- });
-}
-
-template <typename _, typename InType>
-struct ModeExecutor {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (ctx->state() == nullptr) {
- return Status::Invalid("Mode requires ModeOptions");
- }
- const ModeOptions& options = ModeState::Get(ctx);
- if (options.n <= 0) {
-      return Status::Invalid("ModeOptions::n must be strictly positive");
- }
-
- if (batch[0].is_scalar()) {
- return ScalarMode<InType>(ctx, *batch[0].scalar(), out);
- }
-
- return Moder<InType>().impl.Exec(ctx, batch, out);
+ return Finalize<T>(ctx, out, []() {
+ return std::pair<CType, uint64_t>(static_cast<CType>(0), kCountEOF);
+ });
+}
+
+template <typename _, typename InType>
+struct ModeExecutor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (ctx->state() == nullptr) {
+ return Status::Invalid("Mode requires ModeOptions");
+ }
+ const ModeOptions& options = ModeState::Get(ctx);
+ if (options.n <= 0) {
+      return Status::Invalid("ModeOptions::n must be strictly positive");
+ }
+
+ if (batch[0].is_scalar()) {
+ return ScalarMode<InType>(ctx, *batch[0].scalar(), out);
+ }
+
+ return Moder<InType>().impl.Exec(ctx, batch, out);
}
};
-VectorKernel NewModeKernel(const std::shared_ptr<DataType>& in_type) {
- VectorKernel kernel;
- kernel.init = ModeState::Init;
- kernel.can_execute_chunkwise = false;
- kernel.output_chunked = false;
- auto out_type =
- struct_({field(kModeFieldName, in_type), field(kCountFieldName, int64())});
- kernel.signature =
- KernelSignature::Make({InputType(in_type)}, ValueDescr::Array(out_type));
- return kernel;
+VectorKernel NewModeKernel(const std::shared_ptr<DataType>& in_type) {
+ VectorKernel kernel;
+ kernel.init = ModeState::Init;
+ kernel.can_execute_chunkwise = false;
+ kernel.output_chunked = false;
+ auto out_type =
+ struct_({field(kModeFieldName, in_type), field(kCountFieldName, int64())});
+ kernel.signature =
+ KernelSignature::Make({InputType(in_type)}, ValueDescr::Array(out_type));
+ return kernel;
}
-void AddBooleanModeKernel(VectorFunction* func) {
- VectorKernel kernel = NewModeKernel(boolean());
- kernel.exec = ModeExecutor<StructType, BooleanType>::Exec;
- DCHECK_OK(func->AddKernel(kernel));
-}
-
-void AddNumericModeKernels(VectorFunction* func) {
- for (const auto& type : NumericTypes()) {
- VectorKernel kernel = NewModeKernel(type);
- kernel.exec = GenerateNumeric<ModeExecutor, StructType>(*type);
- DCHECK_OK(func->AddKernel(kernel));
+void AddBooleanModeKernel(VectorFunction* func) {
+ VectorKernel kernel = NewModeKernel(boolean());
+ kernel.exec = ModeExecutor<StructType, BooleanType>::Exec;
+ DCHECK_OK(func->AddKernel(kernel));
+}
+
+void AddNumericModeKernels(VectorFunction* func) {
+ for (const auto& type : NumericTypes()) {
+ VectorKernel kernel = NewModeKernel(type);
+ kernel.exec = GenerateNumeric<ModeExecutor, StructType>(*type);
+ DCHECK_OK(func->AddKernel(kernel));
}
}
-const FunctionDoc mode_doc{
- "Calculate the modal (most common) values of a numeric array",
- ("Returns top-n most common values and number of times they occur in an array.\n"
- "Result is an array of `struct<mode: T, count: int64>`, where T is the input type.\n"
- "Values with larger counts are returned before smaller counts.\n"
-     "If more than one value has the same count, the smaller one is returned first.\n"
-     "Nulls are ignored. If there are no non-null values in the array,\n"
-     "an empty array is returned."),
- {"array"},
- "ModeOptions"};
-
+const FunctionDoc mode_doc{
+ "Calculate the modal (most common) values of a numeric array",
+ ("Returns top-n most common values and number of times they occur in an array.\n"
+ "Result is an array of `struct<mode: T, count: int64>`, where T is the input type.\n"
+ "Values with larger counts are returned before smaller counts.\n"
+     "If more than one value has the same count, the smaller one is returned first.\n"
+     "Nulls are ignored. If there are no non-null values in the array,\n"
+     "an empty array is returned."),
+ {"array"},
+ "ModeOptions"};
+
} // namespace
-void RegisterScalarAggregateMode(FunctionRegistry* registry) {
- static auto default_options = ModeOptions::Defaults();
- auto func = std::make_shared<VectorFunction>("mode", Arity::Unary(), &mode_doc,
- &default_options);
- AddBooleanModeKernel(func.get());
- AddNumericModeKernels(func.get());
- DCHECK_OK(registry->AddFunction(std::move(func)));
+void RegisterScalarAggregateMode(FunctionRegistry* registry) {
+ static auto default_options = ModeOptions::Defaults();
+ auto func = std::make_shared<VectorFunction>("mode", Arity::Unary(), &mode_doc,
+ &default_options);
+ AddBooleanModeKernel(func.get());
+ AddNumericModeKernels(func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
}
-} // namespace internal
+} // namespace internal
} // namespace compute
} // namespace arrow
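
"mode" as registered above is a vector function whose output is a struct array with fields "mode" and "count", ordered from most to least frequent. A usage sketch, assuming this vintage of the compute API; PrintTopModes is a hypothetical helper:

#include <iostream>
#include <memory>

#include "arrow/api.h"
#include "arrow/compute/api.h"

// Sketch: call the "mode" function registered above and print the
// resulting struct<mode: T, count: int64> array.
arrow::Status PrintTopModes(const std::shared_ptr<arrow::Array>& values) {
  auto options = arrow::compute::ModeOptions::Defaults();
  options.n = 2;  // top-2 most common values
  ARROW_ASSIGN_OR_RAISE(
      arrow::Datum out,
      arrow::compute::CallFunction("mode", {values}, &options));
  std::cout << out.make_array()->ToString() << std::endl;
  return arrow::Status::OK();
}
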
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc
index feacedbb96e..7d2ffe0770c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_quantile.cc
@@ -1,493 +1,493 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <cmath>
-#include <vector>
-
-#include "arrow/compute/api_aggregate.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/kernels/util_internal.h"
-#include "arrow/stl_allocator.h"
-
-namespace arrow {
-namespace compute {
-namespace internal {
-
-namespace {
-
-using QuantileState = internal::OptionsWrapper<QuantileOptions>;
-
-// output is at some input data point, not interpolated
-bool IsDataPoint(const QuantileOptions& options) {
- // some interpolation methods return exact data point
- return options.interpolation == QuantileOptions::LOWER ||
- options.interpolation == QuantileOptions::HIGHER ||
- options.interpolation == QuantileOptions::NEAREST;
-}
-
-// quantile to exact datapoint index (IsDataPoint == true)
-uint64_t QuantileToDataPoint(size_t length, double q,
- enum QuantileOptions::Interpolation interpolation) {
- const double index = (length - 1) * q;
- uint64_t datapoint_index = static_cast<uint64_t>(index);
- const double fraction = index - datapoint_index;
-
- if (interpolation == QuantileOptions::LINEAR ||
- interpolation == QuantileOptions::MIDPOINT) {
- DCHECK_EQ(fraction, 0);
- }
-
- // convert NEAREST interpolation method to LOWER or HIGHER
- if (interpolation == QuantileOptions::NEAREST) {
- if (fraction < 0.5) {
- interpolation = QuantileOptions::LOWER;
- } else if (fraction > 0.5) {
- interpolation = QuantileOptions::HIGHER;
- } else {
- // round 0.5 to nearest even number, similar to numpy.around
- interpolation =
- (datapoint_index & 1) ? QuantileOptions::HIGHER : QuantileOptions::LOWER;
- }
- }
-
- if (interpolation == QuantileOptions::HIGHER && fraction != 0) {
- ++datapoint_index;
- }
-
- return datapoint_index;
-}
-
-// copy and nth_element approach, large memory footprint
-template <typename InType>
-struct SortQuantiler {
- using CType = typename InType::c_type;
- using Allocator = arrow::stl::allocator<CType>;
-
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const QuantileOptions& options = QuantileState::Get(ctx);
-
- // copy all chunks to a buffer, ignore nulls and nans
- std::vector<CType, Allocator> in_buffer(Allocator(ctx->memory_pool()));
-
- const Datum& datum = batch[0];
- const int64_t in_length = datum.length() - datum.null_count();
- if (in_length > 0) {
- in_buffer.resize(in_length);
- CopyNonNullValues(datum, in_buffer.data());
-
- // drop nan
- if (is_floating_type<InType>::value) {
- const auto& it = std::remove_if(in_buffer.begin(), in_buffer.end(),
- [](CType v) { return v != v; });
- in_buffer.resize(it - in_buffer.begin());
- }
- }
-
- // prepare out array
- int64_t out_length = options.q.size();
- if (in_buffer.empty()) {
- out_length = 0; // input is empty or only contains null and nan, return empty array
- }
- // out type depends on options
- const bool is_datapoint = IsDataPoint(options);
- const std::shared_ptr<DataType> out_type =
- is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
- auto out_data = ArrayData::Make(out_type, out_length, 0);
- out_data->buffers.resize(2, nullptr);
-
- // calculate quantiles
- if (out_length > 0) {
- ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
- ctx->Allocate(out_length * GetBitWidth(*out_type) / 8));
-
- // find quantiles in descending order
- std::vector<int64_t> q_indices(out_length);
- std::iota(q_indices.begin(), q_indices.end(), 0);
- std::sort(q_indices.begin(), q_indices.end(),
- [&options](int64_t left_index, int64_t right_index) {
- return options.q[right_index] < options.q[left_index];
- });
-
- // input array is partitioned around data point at `last_index` (pivot)
-      // for the next quantile, which is smaller, we only consider inputs left of the pivot
- uint64_t last_index = in_buffer.size();
- if (is_datapoint) {
- CType* out_buffer = out_data->template GetMutableValues<CType>(1);
- for (int64_t i = 0; i < out_length; ++i) {
- const int64_t q_index = q_indices[i];
- out_buffer[q_index] = GetQuantileAtDataPoint(
- in_buffer, &last_index, options.q[q_index], options.interpolation);
- }
- } else {
- double* out_buffer = out_data->template GetMutableValues<double>(1);
- for (int64_t i = 0; i < out_length; ++i) {
- const int64_t q_index = q_indices[i];
- out_buffer[q_index] = GetQuantileByInterp(
- in_buffer, &last_index, options.q[q_index], options.interpolation);
- }
- }
- }
-
- *out = Datum(std::move(out_data));
- return Status::OK();
- }
-
- // return quantile located exactly at some input data point
- CType GetQuantileAtDataPoint(std::vector<CType, Allocator>& in, uint64_t* last_index,
- double q,
- enum QuantileOptions::Interpolation interpolation) {
- const uint64_t datapoint_index = QuantileToDataPoint(in.size(), q, interpolation);
-
- if (datapoint_index != *last_index) {
- DCHECK_LT(datapoint_index, *last_index);
- std::nth_element(in.begin(), in.begin() + datapoint_index,
- in.begin() + *last_index);
- *last_index = datapoint_index;
- }
-
- return in[datapoint_index];
- }
-
- // return quantile interpolated from adjacent input data points
- double GetQuantileByInterp(std::vector<CType, Allocator>& in, uint64_t* last_index,
- double q,
- enum QuantileOptions::Interpolation interpolation) {
- const double index = (in.size() - 1) * q;
- const uint64_t lower_index = static_cast<uint64_t>(index);
- const double fraction = index - lower_index;
-
- if (lower_index != *last_index) {
- DCHECK_LT(lower_index, *last_index);
- std::nth_element(in.begin(), in.begin() + lower_index, in.begin() + *last_index);
- }
-
- const double lower_value = static_cast<double>(in[lower_index]);
- if (fraction == 0) {
- *last_index = lower_index;
- return lower_value;
- }
-
- const uint64_t higher_index = lower_index + 1;
- DCHECK_LT(higher_index, in.size());
- if (lower_index != *last_index && higher_index != *last_index) {
- DCHECK_LT(higher_index, *last_index);
- // higher value must be the minimal value after lower_index
- auto min = std::min_element(in.begin() + higher_index, in.begin() + *last_index);
- std::iter_swap(in.begin() + higher_index, min);
- }
- *last_index = lower_index;
-
- const double higher_value = static_cast<double>(in[higher_index]);
-
- if (interpolation == QuantileOptions::LINEAR) {
- // more stable than naive linear interpolation
- return fraction * higher_value + (1 - fraction) * lower_value;
- } else if (interpolation == QuantileOptions::MIDPOINT) {
- return lower_value / 2 + higher_value / 2;
- } else {
- DCHECK(false);
- return NAN;
- }
- }
-};
-
-// histogram approach with constant memory, only for integers within a limited value range
-template <typename InType>
-struct CountQuantiler {
- using CType = typename InType::c_type;
-
- CType min;
- std::vector<uint64_t> counts; // counts[i]: # of values equals i + min
-
- // indices to adjacent non-empty bins covering current quantile
- struct AdjacentBins {
- int left_index;
- int right_index;
- uint64_t total_count; // accumulated counts till left_index (inclusive)
- };
-
- CountQuantiler(CType min, CType max) {
- uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
- DCHECK_LT(value_range, 1 << 30);
- this->min = min;
- this->counts.resize(value_range, 0);
- }
-
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const QuantileOptions& options = QuantileState::Get(ctx);
-
- // count values in all chunks, ignore nulls
- const Datum& datum = batch[0];
- int64_t in_length = CountValues<CType>(this->counts.data(), datum, this->min);
-
- // prepare out array
- int64_t out_length = options.q.size();
- if (in_length == 0) {
- out_length = 0; // input is empty or only contains null, return empty array
- }
- // out type depends on options
- const bool is_datapoint = IsDataPoint(options);
- const std::shared_ptr<DataType> out_type =
- is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
- auto out_data = ArrayData::Make(out_type, out_length, 0);
- out_data->buffers.resize(2, nullptr);
-
- // calculate quantiles
- if (out_length > 0) {
- ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
- ctx->Allocate(out_length * GetBitWidth(*out_type) / 8));
-
- // find quantiles in ascending order
- std::vector<int64_t> q_indices(out_length);
- std::iota(q_indices.begin(), q_indices.end(), 0);
- std::sort(q_indices.begin(), q_indices.end(),
- [&options](int64_t left_index, int64_t right_index) {
- return options.q[left_index] < options.q[right_index];
- });
-
- AdjacentBins bins{0, 0, this->counts[0]};
- if (is_datapoint) {
- CType* out_buffer = out_data->template GetMutableValues<CType>(1);
- for (int64_t i = 0; i < out_length; ++i) {
- const int64_t q_index = q_indices[i];
- out_buffer[q_index] = GetQuantileAtDataPoint(
- in_length, &bins, options.q[q_index], options.interpolation);
- }
- } else {
- double* out_buffer = out_data->template GetMutableValues<double>(1);
- for (int64_t i = 0; i < out_length; ++i) {
- const int64_t q_index = q_indices[i];
- out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
- options.interpolation);
- }
- }
- }
-
- *out = Datum(std::move(out_data));
- return Status::OK();
- }
-
- // return quantile located exactly at some input data point
- CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
- enum QuantileOptions::Interpolation interpolation) {
- const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
- while (datapoint_index >= bins->total_count &&
- static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
- ++bins->left_index;
- bins->total_count += this->counts[bins->left_index];
- }
- DCHECK_LT(datapoint_index, bins->total_count);
- return static_cast<CType>(bins->left_index + this->min);
- }
-
- // return quantile interpolated from adjacent input data points
- double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
- enum QuantileOptions::Interpolation interpolation) {
- const double index = (in_length - 1) * q;
- const uint64_t index_floor = static_cast<uint64_t>(index);
- const double fraction = index - index_floor;
-
- while (index_floor >= bins->total_count &&
- static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
- ++bins->left_index;
- bins->total_count += this->counts[bins->left_index];
- }
- DCHECK_LT(index_floor, bins->total_count);
- const double lower_value = static_cast<double>(bins->left_index + this->min);
-
- // quantile lies in this bin, no interpolation needed
- if (index <= bins->total_count - 1) {
- return lower_value;
- }
-
- // quantile lies across two bins, locate next bin if not already done
- DCHECK_EQ(index_floor, bins->total_count - 1);
- if (bins->right_index <= bins->left_index) {
- bins->right_index = bins->left_index + 1;
- while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
- this->counts[bins->right_index] == 0) {
- ++bins->right_index;
- }
- }
- DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
- DCHECK_GT(this->counts[bins->right_index], 0);
- const double higher_value = static_cast<double>(bins->right_index + this->min);
-
- if (interpolation == QuantileOptions::LINEAR) {
- return fraction * higher_value + (1 - fraction) * lower_value;
- } else if (interpolation == QuantileOptions::MIDPOINT) {
- return lower_value / 2 + higher_value / 2;
- } else {
- DCHECK(false);
- return NAN;
- }
- }
-};
-
-// histogram or 'copy & nth_element' approach chosen by value range and size, only for integers
-template <typename InType>
-struct CountOrSortQuantiler {
- using CType = typename InType::c_type;
-
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // cross point to benefit from histogram approach
- // parameters estimated from ad-hoc benchmarks manually
- static constexpr int kMinArraySize = 65536;
- static constexpr int kMaxValueRange = 65536;
-
- const Datum& datum = batch[0];
- if (datum.length() - datum.null_count() >= kMinArraySize) {
- CType min, max;
- std::tie(min, max) = GetMinMax<CType>(datum);
-
- if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <= kMaxValueRange) {
- return CountQuantiler<InType>(min, max).Exec(ctx, batch, out);
- }
- }
-
- return SortQuantiler<InType>().Exec(ctx, batch, out);
- }
-};
-
-template <typename InType, typename Enable = void>
-struct ExactQuantiler;
-
-template <>
-struct ExactQuantiler<UInt8Type> {
- CountQuantiler<UInt8Type> impl;
- ExactQuantiler() : impl(0, 255) {}
-};
-
-template <>
-struct ExactQuantiler<Int8Type> {
- CountQuantiler<Int8Type> impl;
- ExactQuantiler() : impl(-128, 127) {}
-};
-
-template <typename InType>
-struct ExactQuantiler<InType, enable_if_t<(is_integer_type<InType>::value &&
- (sizeof(typename InType::c_type) > 1))>> {
- CountOrSortQuantiler<InType> impl;
-};
-
-template <typename InType>
-struct ExactQuantiler<InType, enable_if_t<is_floating_type<InType>::value>> {
- SortQuantiler<InType> impl;
-};
-
-template <typename T>
-Status ScalarQuantile(KernelContext* ctx, const QuantileOptions& options,
- const Scalar& scalar, Datum* out) {
- using CType = typename T::c_type;
- ArrayData* output = out->mutable_array();
- if (!scalar.is_valid) {
- output->length = 0;
- output->null_count = 0;
- return Status::OK();
- }
- auto out_type = IsDataPoint(options) ? scalar.type : float64();
- output->length = options.q.size();
- output->null_count = 0;
- ARROW_ASSIGN_OR_RAISE(
- output->buffers[1],
- ctx->Allocate(output->length * BitUtil::BytesForBits(GetBitWidth(*out_type))));
- if (IsDataPoint(options)) {
- CType* out_buffer = output->template GetMutableValues<CType>(1);
- for (int64_t i = 0; i < output->length; i++) {
- out_buffer[i] = UnboxScalar<T>::Unbox(scalar);
- }
- } else {
- double* out_buffer = output->template GetMutableValues<double>(1);
- for (int64_t i = 0; i < output->length; i++) {
- out_buffer[i] = static_cast<double>(UnboxScalar<T>::Unbox(scalar));
- }
- }
- return Status::OK();
-}
-
-template <typename _, typename InType>
-struct QuantileExecutor {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (ctx->state() == nullptr) {
- return Status::Invalid("Quantile requires QuantileOptions");
- }
-
- const QuantileOptions& options = QuantileState::Get(ctx);
- if (options.q.empty()) {
- return Status::Invalid("Requires quantile argument");
- }
- for (double q : options.q) {
- if (q < 0 || q > 1) {
- return Status::Invalid("Quantile must be between 0 and 1");
- }
- }
-
- if (batch[0].is_scalar()) {
- return ScalarQuantile<InType>(ctx, options, *batch[0].scalar(), out);
- }
-
- return ExactQuantiler<InType>().impl.Exec(ctx, batch, out);
- }
-};
-
-Result<ValueDescr> ResolveOutput(KernelContext* ctx,
- const std::vector<ValueDescr>& args) {
- const QuantileOptions& options = QuantileState::Get(ctx);
- if (IsDataPoint(options)) {
- return ValueDescr::Array(args[0].type);
- } else {
- return ValueDescr::Array(float64());
- }
-}
-
-void AddQuantileKernels(VectorFunction* func) {
- VectorKernel base;
- base.init = QuantileState::Init;
- base.can_execute_chunkwise = false;
- base.output_chunked = false;
-
- for (const auto& ty : NumericTypes()) {
- base.signature = KernelSignature::Make({InputType(ty)}, OutputType(ResolveOutput));
-    // the output type is determined at runtime, so set the template argument to NullType
- base.exec = GenerateNumeric<QuantileExecutor, NullType>(*ty);
- DCHECK_OK(func->AddKernel(base));
- }
-}
-
-const FunctionDoc quantile_doc{
- "Compute an array of quantiles of a numeric array or chunked array",
- ("By default, 0.5 quantile (median) is returned.\n"
- "If quantile lies between two data points, an interpolated value is\n"
- "returned based on selected interpolation method.\n"
- "Nulls and NaNs are ignored.\n"
- "An empty array is returned if there is no valid data point."),
- {"array"},
- "QuantileOptions"};
-
-} // namespace
-
-void RegisterScalarAggregateQuantile(FunctionRegistry* registry) {
- static QuantileOptions default_options;
- auto func = std::make_shared<VectorFunction>("quantile", Arity::Unary(), &quantile_doc,
- &default_options);
- AddQuantileKernels(func.get());
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cmath>
+#include <vector>
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/stl_allocator.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+using QuantileState = internal::OptionsWrapper<QuantileOptions>;
+
+// output is at some input data point, not interpolated
+bool IsDataPoint(const QuantileOptions& options) {
+  // some interpolation methods return an exact data point
+ return options.interpolation == QuantileOptions::LOWER ||
+ options.interpolation == QuantileOptions::HIGHER ||
+ options.interpolation == QuantileOptions::NEAREST;
+}
+
+// map a quantile to the exact datapoint index (used when IsDataPoint is true)
+uint64_t QuantileToDataPoint(size_t length, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const double index = (length - 1) * q;
+ uint64_t datapoint_index = static_cast<uint64_t>(index);
+ const double fraction = index - datapoint_index;
+
+ if (interpolation == QuantileOptions::LINEAR ||
+ interpolation == QuantileOptions::MIDPOINT) {
+ DCHECK_EQ(fraction, 0);
+ }
+
+ // convert NEAREST interpolation method to LOWER or HIGHER
+ if (interpolation == QuantileOptions::NEAREST) {
+ if (fraction < 0.5) {
+ interpolation = QuantileOptions::LOWER;
+ } else if (fraction > 0.5) {
+ interpolation = QuantileOptions::HIGHER;
+ } else {
+ // round 0.5 to nearest even number, similar to numpy.around
+ interpolation =
+ (datapoint_index & 1) ? QuantileOptions::HIGHER : QuantileOptions::LOWER;
+ }
+ }
+
+ if (interpolation == QuantileOptions::HIGHER && fraction != 0) {
+ ++datapoint_index;
+ }
+
+ return datapoint_index;
+}
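+
+// Worked example of the mapping above (illustrative values): for length = 5
+// and q = 0.25, index = (5 - 1) * 0.25 = 1.0, so the data point at index 1 is
+// returned exactly. For q = 0.375, index = 1.5 and NEAREST rounds the tie to
+// the even index 2 (HIGHER here), matching numpy.around semantics.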
+
+// copy and nth_element approach, large memory footprint
+template <typename InType>
+struct SortQuantiler {
+ using CType = typename InType::c_type;
+ using Allocator = arrow::stl::allocator<CType>;
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const QuantileOptions& options = QuantileState::Get(ctx);
+
+    // copy all chunks to a buffer, ignoring nulls and NaNs
+ std::vector<CType, Allocator> in_buffer(Allocator(ctx->memory_pool()));
+
+ const Datum& datum = batch[0];
+ const int64_t in_length = datum.length() - datum.null_count();
+ if (in_length > 0) {
+ in_buffer.resize(in_length);
+ CopyNonNullValues(datum, in_buffer.data());
+
+      // drop NaNs (a NaN compares unequal to itself)
+ if (is_floating_type<InType>::value) {
+ const auto& it = std::remove_if(in_buffer.begin(), in_buffer.end(),
+ [](CType v) { return v != v; });
+ in_buffer.resize(it - in_buffer.begin());
+ }
+ }
+
+ // prepare out array
+ int64_t out_length = options.q.size();
+ if (in_buffer.empty()) {
+      out_length = 0;  // input is empty or contains only nulls and NaNs; return an empty array
+ }
+ // out type depends on options
+ const bool is_datapoint = IsDataPoint(options);
+ const std::shared_ptr<DataType> out_type =
+ is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+ auto out_data = ArrayData::Make(out_type, out_length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ // calculate quantiles
+ if (out_length > 0) {
+ ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
+ ctx->Allocate(out_length * GetBitWidth(*out_type) / 8));
+
+ // find quantiles in descending order
+ std::vector<int64_t> q_indices(out_length);
+ std::iota(q_indices.begin(), q_indices.end(), 0);
+ std::sort(q_indices.begin(), q_indices.end(),
+ [&options](int64_t left_index, int64_t right_index) {
+ return options.q[right_index] < options.q[left_index];
+ });
+
+    // the input array is partitioned around the data point at `last_index` (pivot);
+    // for the next, smaller quantile we only consider inputs left of the pivot
+ uint64_t last_index = in_buffer.size();
+ if (is_datapoint) {
+ CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileAtDataPoint(
+ in_buffer, &last_index, options.q[q_index], options.interpolation);
+ }
+ } else {
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileByInterp(
+ in_buffer, &last_index, options.q[q_index], options.interpolation);
+ }
+ }
+ }
+
+ *out = Datum(std::move(out_data));
+ return Status::OK();
+ }
+
+ // return quantile located exactly at some input data point
+ CType GetQuantileAtDataPoint(std::vector<CType, Allocator>& in, uint64_t* last_index,
+ double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const uint64_t datapoint_index = QuantileToDataPoint(in.size(), q, interpolation);
+
+ if (datapoint_index != *last_index) {
+ DCHECK_LT(datapoint_index, *last_index);
+ std::nth_element(in.begin(), in.begin() + datapoint_index,
+ in.begin() + *last_index);
+ *last_index = datapoint_index;
+ }
+
+ return in[datapoint_index];
+ }
+
+ // return quantile interpolated from adjacent input data points
+ double GetQuantileByInterp(std::vector<CType, Allocator>& in, uint64_t* last_index,
+ double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const double index = (in.size() - 1) * q;
+ const uint64_t lower_index = static_cast<uint64_t>(index);
+ const double fraction = index - lower_index;
+
+ if (lower_index != *last_index) {
+ DCHECK_LT(lower_index, *last_index);
+ std::nth_element(in.begin(), in.begin() + lower_index, in.begin() + *last_index);
+ }
+
+ const double lower_value = static_cast<double>(in[lower_index]);
+ if (fraction == 0) {
+ *last_index = lower_index;
+ return lower_value;
+ }
+
+ const uint64_t higher_index = lower_index + 1;
+ DCHECK_LT(higher_index, in.size());
+ if (lower_index != *last_index && higher_index != *last_index) {
+ DCHECK_LT(higher_index, *last_index);
+      // the higher value must be the minimum of the values after lower_index
+ auto min = std::min_element(in.begin() + higher_index, in.begin() + *last_index);
+ std::iter_swap(in.begin() + higher_index, min);
+ }
+ *last_index = lower_index;
+
+ const double higher_value = static_cast<double>(in[higher_index]);
+
+ if (interpolation == QuantileOptions::LINEAR) {
+ // more stable than naive linear interpolation
+ return fraction * higher_value + (1 - fraction) * lower_value;
+ } else if (interpolation == QuantileOptions::MIDPOINT) {
+ return lower_value / 2 + higher_value / 2;
+ } else {
+ DCHECK(false);
+ return NAN;
+ }
+ }
+};
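+
+// A minimal sketch of the partition-reuse trick above (values illustrative):
+// quantiles are visited in descending order, so each std::nth_element call
+// only re-partitions the prefix left of the previous pivot, e.g.
+//
+//   std::vector<double> v = {9, 1, 5, 3, 7};
+//   std::nth_element(v.begin(), v.begin() + 3, v.end());        // q=0.75 -> 7
+//   std::nth_element(v.begin(), v.begin() + 1, v.begin() + 3);  // q=0.25 -> 3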
+
+// histogram approach with constant memory, only for integers within limited value range
+template <typename InType>
+struct CountQuantiler {
+ using CType = typename InType::c_type;
+
+ CType min;
+  std::vector<uint64_t> counts;  // counts[i]: # of values equal to i + min
+
+ // indices to adjacent non-empty bins covering current quantile
+ struct AdjacentBins {
+ int left_index;
+ int right_index;
+    uint64_t total_count;  // accumulated counts up to left_index (inclusive)
+ };
+
+ CountQuantiler(CType min, CType max) {
+ uint32_t value_range = static_cast<uint32_t>(max - min) + 1;
+ DCHECK_LT(value_range, 1 << 30);
+ this->min = min;
+ this->counts.resize(value_range, 0);
+ }
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const QuantileOptions& options = QuantileState::Get(ctx);
+
+ // count values in all chunks, ignore nulls
+ const Datum& datum = batch[0];
+ int64_t in_length = CountValues<CType>(this->counts.data(), datum, this->min);
+
+ // prepare out array
+ int64_t out_length = options.q.size();
+ if (in_length == 0) {
+      out_length = 0;  // input is empty or contains only nulls; return an empty array
+ }
+ // out type depends on options
+ const bool is_datapoint = IsDataPoint(options);
+ const std::shared_ptr<DataType> out_type =
+ is_datapoint ? TypeTraits<InType>::type_singleton() : float64();
+ auto out_data = ArrayData::Make(out_type, out_length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ // calculate quantiles
+ if (out_length > 0) {
+ ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
+ ctx->Allocate(out_length * GetBitWidth(*out_type) / 8));
+
+ // find quantiles in ascending order
+ std::vector<int64_t> q_indices(out_length);
+ std::iota(q_indices.begin(), q_indices.end(), 0);
+ std::sort(q_indices.begin(), q_indices.end(),
+ [&options](int64_t left_index, int64_t right_index) {
+ return options.q[left_index] < options.q[right_index];
+ });
+
+ AdjacentBins bins{0, 0, this->counts[0]};
+ if (is_datapoint) {
+ CType* out_buffer = out_data->template GetMutableValues<CType>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileAtDataPoint(
+ in_length, &bins, options.q[q_index], options.interpolation);
+ }
+ } else {
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ const int64_t q_index = q_indices[i];
+ out_buffer[q_index] = GetQuantileByInterp(in_length, &bins, options.q[q_index],
+ options.interpolation);
+ }
+ }
+ }
+
+ *out = Datum(std::move(out_data));
+ return Status::OK();
+ }
+
+ // return quantile located exactly at some input data point
+ CType GetQuantileAtDataPoint(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const uint64_t datapoint_index = QuantileToDataPoint(in_length, q, interpolation);
+ while (datapoint_index >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(datapoint_index, bins->total_count);
+ return static_cast<CType>(bins->left_index + this->min);
+ }
+
+ // return quantile interpolated from adjacent input data points
+ double GetQuantileByInterp(int64_t in_length, AdjacentBins* bins, double q,
+ enum QuantileOptions::Interpolation interpolation) {
+ const double index = (in_length - 1) * q;
+ const uint64_t index_floor = static_cast<uint64_t>(index);
+ const double fraction = index - index_floor;
+
+ while (index_floor >= bins->total_count &&
+ static_cast<size_t>(bins->left_index) < this->counts.size() - 1) {
+ ++bins->left_index;
+ bins->total_count += this->counts[bins->left_index];
+ }
+ DCHECK_LT(index_floor, bins->total_count);
+ const double lower_value = static_cast<double>(bins->left_index + this->min);
+
+ // quantile lies in this bin, no interpolation needed
+ if (index <= bins->total_count - 1) {
+ return lower_value;
+ }
+
+ // quantile lies across two bins, locate next bin if not already done
+ DCHECK_EQ(index_floor, bins->total_count - 1);
+ if (bins->right_index <= bins->left_index) {
+ bins->right_index = bins->left_index + 1;
+ while (static_cast<size_t>(bins->right_index) < this->counts.size() - 1 &&
+ this->counts[bins->right_index] == 0) {
+ ++bins->right_index;
+ }
+ }
+ DCHECK_LT(static_cast<size_t>(bins->right_index), this->counts.size());
+ DCHECK_GT(this->counts[bins->right_index], 0);
+ const double higher_value = static_cast<double>(bins->right_index + this->min);
+
+ if (interpolation == QuantileOptions::LINEAR) {
+ return fraction * higher_value + (1 - fraction) * lower_value;
+ } else if (interpolation == QuantileOptions::MIDPOINT) {
+ return lower_value / 2 + higher_value / 2;
+ } else {
+ DCHECK(false);
+ return NAN;
+ }
+ }
+};
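+
+// Worked example (illustrative): input {3, 3, 4, 6} gives min = 3 and
+// counts = {2, 1, 0, 1}. With q = 1/3, index = 1.0 lies inside the first bin
+// (total_count = 2), so 3 is returned exactly; with q = 0.5, index = 1.5
+// straddles the bins for 3 and 4, and LINEAR yields 0.5*4 + 0.5*3 = 3.5.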
+
+// histogram or 'copy & nth_element' approach, chosen by value range and array size; integers only
+template <typename InType>
+struct CountOrSortQuantiler {
+ using CType = typename InType::c_type;
+
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // crossover point at which the histogram approach starts to pay off;
+    // parameters estimated manually from ad-hoc benchmarks
+ static constexpr int kMinArraySize = 65536;
+ static constexpr int kMaxValueRange = 65536;
+
+ const Datum& datum = batch[0];
+ if (datum.length() - datum.null_count() >= kMinArraySize) {
+ CType min, max;
+ std::tie(min, max) = GetMinMax<CType>(datum);
+
+ if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <= kMaxValueRange) {
+ return CountQuantiler<InType>(min, max).Exec(ctx, batch, out);
+ }
+ }
+
+ return SortQuantiler<InType>().Exec(ctx, batch, out);
+ }
+};
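+
+// For instance, 1M int32 values confined to [0, 1000] take the histogram path
+// (a counts vector of ~1001 entries), while the same array spread over
+// [0, 1e9] exceeds kMaxValueRange and falls back to copy & nth_element.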
+
+template <typename InType, typename Enable = void>
+struct ExactQuantiler;
+
+template <>
+struct ExactQuantiler<UInt8Type> {
+ CountQuantiler<UInt8Type> impl;
+ ExactQuantiler() : impl(0, 255) {}
+};
+
+template <>
+struct ExactQuantiler<Int8Type> {
+ CountQuantiler<Int8Type> impl;
+ ExactQuantiler() : impl(-128, 127) {}
+};
+
+template <typename InType>
+struct ExactQuantiler<InType, enable_if_t<(is_integer_type<InType>::value &&
+ (sizeof(typename InType::c_type) > 1))>> {
+ CountOrSortQuantiler<InType> impl;
+};
+
+template <typename InType>
+struct ExactQuantiler<InType, enable_if_t<is_floating_type<InType>::value>> {
+ SortQuantiler<InType> impl;
+};
+
+template <typename T>
+Status ScalarQuantile(KernelContext* ctx, const QuantileOptions& options,
+ const Scalar& scalar, Datum* out) {
+ using CType = typename T::c_type;
+ ArrayData* output = out->mutable_array();
+ if (!scalar.is_valid) {
+ output->length = 0;
+ output->null_count = 0;
+ return Status::OK();
+ }
+ auto out_type = IsDataPoint(options) ? scalar.type : float64();
+ output->length = options.q.size();
+ output->null_count = 0;
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[1],
+ ctx->Allocate(output->length * BitUtil::BytesForBits(GetBitWidth(*out_type))));
+ if (IsDataPoint(options)) {
+ CType* out_buffer = output->template GetMutableValues<CType>(1);
+ for (int64_t i = 0; i < output->length; i++) {
+ out_buffer[i] = UnboxScalar<T>::Unbox(scalar);
+ }
+ } else {
+ double* out_buffer = output->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < output->length; i++) {
+ out_buffer[i] = static_cast<double>(UnboxScalar<T>::Unbox(scalar));
+ }
+ }
+ return Status::OK();
+}
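+
+// E.g., a valid Int64Scalar(7) with q = {0.25, 0.5, 0.75} yields a
+// three-element output: 7 repeated (input type retained) for data-point
+// interpolations, or 7.0 repeated (as float64) for LINEAR/MIDPOINT.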
+
+template <typename _, typename InType>
+struct QuantileExecutor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (ctx->state() == nullptr) {
+ return Status::Invalid("Quantile requires QuantileOptions");
+ }
+
+ const QuantileOptions& options = QuantileState::Get(ctx);
+ if (options.q.empty()) {
+ return Status::Invalid("Requires quantile argument");
+ }
+ for (double q : options.q) {
+ if (q < 0 || q > 1) {
+ return Status::Invalid("Quantile must be between 0 and 1");
+ }
+ }
+
+ if (batch[0].is_scalar()) {
+ return ScalarQuantile<InType>(ctx, options, *batch[0].scalar(), out);
+ }
+
+ return ExactQuantiler<InType>().impl.Exec(ctx, batch, out);
+ }
+};
+
+Result<ValueDescr> ResolveOutput(KernelContext* ctx,
+ const std::vector<ValueDescr>& args) {
+ const QuantileOptions& options = QuantileState::Get(ctx);
+ if (IsDataPoint(options)) {
+ return ValueDescr::Array(args[0].type);
+ } else {
+ return ValueDescr::Array(float64());
+ }
+}
+
+void AddQuantileKernels(VectorFunction* func) {
+ VectorKernel base;
+ base.init = QuantileState::Init;
+ base.can_execute_chunkwise = false;
+ base.output_chunked = false;
+
+ for (const auto& ty : NumericTypes()) {
+ base.signature = KernelSignature::Make({InputType(ty)}, OutputType(ResolveOutput));
+    // the output type is determined at runtime, so set the template argument to NullType
+ base.exec = GenerateNumeric<QuantileExecutor, NullType>(*ty);
+ DCHECK_OK(func->AddKernel(base));
+ }
+}
+
+const FunctionDoc quantile_doc{
+ "Compute an array of quantiles of a numeric array or chunked array",
+ ("By default, 0.5 quantile (median) is returned.\n"
+ "If quantile lies between two data points, an interpolated value is\n"
+ "returned based on selected interpolation method.\n"
+ "Nulls and NaNs are ignored.\n"
+ "An empty array is returned if there is no valid data point."),
+ {"array"},
+ "QuantileOptions"};
+
+} // namespace
+
+void RegisterScalarAggregateQuantile(FunctionRegistry* registry) {
+ static QuantileOptions default_options;
+ auto func = std::make_shared<VectorFunction>("quantile", Arity::Unary(), &quantile_doc,
+ &default_options);
+ AddQuantileKernels(func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
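+
+// A minimal usage sketch, assuming the public arrow::compute entry points
+// (QuantileOptions from api_aggregate.h, CallFunction from the function
+// registry API); array construction is omitted:
+//
+//   QuantileOptions options(/*q=*/{0.25, 0.5, 0.75}, QuantileOptions::LINEAR);
+//   ARROW_ASSIGN_OR_RAISE(Datum result,
+//                         CallFunction("quantile", {array}, &options));
+//   // LINEAR may interpolate, so `result` holds a float64 array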
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc
index 54f36ab9159..4c261604c85 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc
@@ -1,164 +1,164 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/api_aggregate.h"
-#include "arrow/compute/kernels/aggregate_internal.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/tdigest.h"
-
-namespace arrow {
-namespace compute {
-namespace internal {
-
-namespace {
-
-using arrow::internal::TDigest;
-using arrow::internal::VisitSetBitRunsVoid;
-
-template <typename ArrowType>
-struct TDigestImpl : public ScalarAggregator {
- using ThisType = TDigestImpl<ArrowType>;
- using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
- using CType = typename ArrowType::c_type;
-
- explicit TDigestImpl(const TDigestOptions& options)
- : q{options.q}, tdigest{options.delta, options.buffer_size} {}
-
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- if (batch[0].is_array()) {
- const ArrayData& data = *batch[0].array();
- const CType* values = data.GetValues<CType>(1);
-
- if (data.length > data.GetNullCount()) {
- VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
- [&](int64_t pos, int64_t len) {
- for (int64_t i = 0; i < len; ++i) {
- this->tdigest.NanAdd(values[pos + i]);
- }
- });
- }
- } else {
- const CType value = UnboxScalar<ArrowType>::Unbox(*batch[0].scalar());
- if (batch[0].scalar()->is_valid) {
- this->tdigest.NanAdd(value);
- }
- }
- return Status::OK();
- }
-
- Status MergeFrom(KernelContext*, KernelState&& src) override {
- auto& other = checked_cast<ThisType&>(src);
- std::vector<TDigest> other_tdigest;
- other_tdigest.push_back(std::move(other.tdigest));
- this->tdigest.Merge(&other_tdigest);
- return Status::OK();
- }
-
- Status Finalize(KernelContext* ctx, Datum* out) override {
- const int64_t out_length = this->tdigest.is_empty() ? 0 : this->q.size();
- auto out_data = ArrayData::Make(float64(), out_length, 0);
- out_data->buffers.resize(2, nullptr);
-
- if (out_length > 0) {
- ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
- ctx->Allocate(out_length * sizeof(double)));
- double* out_buffer = out_data->template GetMutableValues<double>(1);
- for (int64_t i = 0; i < out_length; ++i) {
- out_buffer[i] = this->tdigest.Quantile(this->q[i]);
- }
- }
-
- *out = Datum(std::move(out_data));
- return Status::OK();
- }
-
- const std::vector<double>& q;
- TDigest tdigest;
-};
-
-struct TDigestInitState {
- std::unique_ptr<KernelState> state;
- KernelContext* ctx;
- const DataType& in_type;
- const TDigestOptions& options;
-
- TDigestInitState(KernelContext* ctx, const DataType& in_type,
- const TDigestOptions& options)
- : ctx(ctx), in_type(in_type), options(options) {}
-
- Status Visit(const DataType&) {
- return Status::NotImplemented("No tdigest implemented");
- }
-
- Status Visit(const HalfFloatType&) {
- return Status::NotImplemented("No tdigest implemented");
- }
-
- template <typename Type>
- enable_if_t<is_number_type<Type>::value, Status> Visit(const Type&) {
- state.reset(new TDigestImpl<Type>(options));
- return Status::OK();
- }
-
- Result<std::unique_ptr<KernelState>> Create() {
- RETURN_NOT_OK(VisitTypeInline(in_type, this));
- return std::move(state);
- }
-};
-
-Result<std::unique_ptr<KernelState>> TDigestInit(KernelContext* ctx,
- const KernelInitArgs& args) {
- TDigestInitState visitor(ctx, *args.inputs[0].type,
- static_cast<const TDigestOptions&>(*args.options));
- return visitor.Create();
-}
-
-void AddTDigestKernels(KernelInit init,
- const std::vector<std::shared_ptr<DataType>>& types,
- ScalarAggregateFunction* func) {
- for (const auto& ty : types) {
- auto sig = KernelSignature::Make({InputType(ty)}, float64());
- AddAggKernel(std::move(sig), init, func);
- }
-}
-
-const FunctionDoc tdigest_doc{
- "Approximate quantiles of a numeric array with T-Digest algorithm",
- ("By default, 0.5 quantile (median) is returned.\n"
- "Nulls and NaNs are ignored.\n"
- "An empty array is returned if there is no valid data point."),
- {"array"},
- "TDigestOptions"};
-
-std::shared_ptr<ScalarAggregateFunction> AddTDigestAggKernels() {
- static auto default_tdigest_options = TDigestOptions::Defaults();
- auto func = std::make_shared<ScalarAggregateFunction>(
- "tdigest", Arity::Unary(), &tdigest_doc, &default_tdigest_options);
- AddTDigestKernels(TDigestInit, NumericTypes(), func.get());
- return func;
-}
-
-} // namespace
-
-void RegisterScalarAggregateTDigest(FunctionRegistry* registry) {
- DCHECK_OK(registry->AddFunction(AddTDigestAggKernels()));
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/tdigest.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+using arrow::internal::TDigest;
+using arrow::internal::VisitSetBitRunsVoid;
+
+template <typename ArrowType>
+struct TDigestImpl : public ScalarAggregator {
+ using ThisType = TDigestImpl<ArrowType>;
+ using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
+ using CType = typename ArrowType::c_type;
+
+ explicit TDigestImpl(const TDigestOptions& options)
+ : q{options.q}, tdigest{options.delta, options.buffer_size} {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ if (batch[0].is_array()) {
+ const ArrayData& data = *batch[0].array();
+ const CType* values = data.GetValues<CType>(1);
+
+ if (data.length > data.GetNullCount()) {
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ this->tdigest.NanAdd(values[pos + i]);
+ }
+ });
+ }
+ } else {
+ const CType value = UnboxScalar<ArrowType>::Unbox(*batch[0].scalar());
+ if (batch[0].scalar()->is_valid) {
+ this->tdigest.NanAdd(value);
+ }
+ }
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ auto& other = checked_cast<ThisType&>(src);
+ std::vector<TDigest> other_tdigest;
+ other_tdigest.push_back(std::move(other.tdigest));
+ this->tdigest.Merge(&other_tdigest);
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext* ctx, Datum* out) override {
+ const int64_t out_length = this->tdigest.is_empty() ? 0 : this->q.size();
+ auto out_data = ArrayData::Make(float64(), out_length, 0);
+ out_data->buffers.resize(2, nullptr);
+
+ if (out_length > 0) {
+ ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
+ ctx->Allocate(out_length * sizeof(double)));
+ double* out_buffer = out_data->template GetMutableValues<double>(1);
+ for (int64_t i = 0; i < out_length; ++i) {
+ out_buffer[i] = this->tdigest.Quantile(this->q[i]);
+ }
+ }
+
+ *out = Datum(std::move(out_data));
+ return Status::OK();
+ }
+
+ const std::vector<double>& q;
+ TDigest tdigest;
+};
+
+struct TDigestInitState {
+ std::unique_ptr<KernelState> state;
+ KernelContext* ctx;
+ const DataType& in_type;
+ const TDigestOptions& options;
+
+ TDigestInitState(KernelContext* ctx, const DataType& in_type,
+ const TDigestOptions& options)
+ : ctx(ctx), in_type(in_type), options(options) {}
+
+ Status Visit(const DataType&) {
+ return Status::NotImplemented("No tdigest implemented");
+ }
+
+ Status Visit(const HalfFloatType&) {
+ return Status::NotImplemented("No tdigest implemented");
+ }
+
+ template <typename Type>
+ enable_if_t<is_number_type<Type>::value, Status> Visit(const Type&) {
+ state.reset(new TDigestImpl<Type>(options));
+ return Status::OK();
+ }
+
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(in_type, this));
+ return std::move(state);
+ }
+};
+
+Result<std::unique_ptr<KernelState>> TDigestInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ TDigestInitState visitor(ctx, *args.inputs[0].type,
+ static_cast<const TDigestOptions&>(*args.options));
+ return visitor.Create();
+}
+
+void AddTDigestKernels(KernelInit init,
+ const std::vector<std::shared_ptr<DataType>>& types,
+ ScalarAggregateFunction* func) {
+ for (const auto& ty : types) {
+ auto sig = KernelSignature::Make({InputType(ty)}, float64());
+ AddAggKernel(std::move(sig), init, func);
+ }
+}
+
+const FunctionDoc tdigest_doc{
+ "Approximate quantiles of a numeric array with T-Digest algorithm",
+ ("By default, 0.5 quantile (median) is returned.\n"
+ "Nulls and NaNs are ignored.\n"
+ "An empty array is returned if there is no valid data point."),
+ {"array"},
+ "TDigestOptions"};
+
+std::shared_ptr<ScalarAggregateFunction> AddTDigestAggKernels() {
+ static auto default_tdigest_options = TDigestOptions::Defaults();
+ auto func = std::make_shared<ScalarAggregateFunction>(
+ "tdigest", Arity::Unary(), &tdigest_doc, &default_tdigest_options);
+ AddTDigestKernels(TDigestInit, NumericTypes(), func.get());
+ return func;
+}
+
+} // namespace
+
+void RegisterScalarAggregateTDigest(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunction(AddTDigestAggKernels()));
+}
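+
+// Usage mirrors the exact "quantile" kernel (a sketch; delta and buffer_size
+// trade accuracy for memory):
+//
+//   TDigestOptions options(/*q=*/{0.99}, /*delta=*/100, /*buffer_size=*/500);
+//   ARROW_ASSIGN_OR_RAISE(Datum approx,
+//                         CallFunction("tdigest", {array}, &options));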
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc
index d879630e697..d6965fed4a3 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/aggregate_var_std.cc
@@ -15,130 +15,130 @@
// specific language governing permissions and limitations
// under the License.
-#include <cmath>
-
-#include "arrow/compute/api_aggregate.h"
-#include "arrow/compute/kernels/aggregate_internal.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/int128_internal.h"
-
+#include <cmath>
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/int128_internal.h"
+
namespace arrow {
namespace compute {
-namespace internal {
+namespace internal {
namespace {
-using arrow::internal::int128_t;
-using arrow::internal::VisitSetBitRunsVoid;
-
+using arrow::internal::int128_t;
+using arrow::internal::VisitSetBitRunsVoid;
+
template <typename ArrowType>
struct VarStdState {
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
- using CType = typename ArrowType::c_type;
+ using CType = typename ArrowType::c_type;
using ThisType = VarStdState<ArrowType>;
-  // float/double/int64: calculate `m2` (sum((X-mean)^2)) with the `two-pass algorithm`
+  // float/double/int64: calculate `m2` (sum((X-mean)^2)) with the `two-pass algorithm`
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
- template <typename T = ArrowType>
- enable_if_t<is_floating_type<T>::value || (sizeof(CType) > 4)> Consume(
- const ArrayType& array) {
+ template <typename T = ArrowType>
+ enable_if_t<is_floating_type<T>::value || (sizeof(CType) > 4)> Consume(
+ const ArrayType& array) {
int64_t count = array.length() - array.null_count();
if (count == 0) {
return;
}
- using SumType =
- typename std::conditional<is_floating_type<T>::value, double, int128_t>::type;
- SumType sum =
- arrow::compute::detail::SumArray<CType, SumType, SimdLevel::NONE>(*array.data());
+ using SumType =
+ typename std::conditional<is_floating_type<T>::value, double, int128_t>::type;
+ SumType sum =
+ arrow::compute::detail::SumArray<CType, SumType, SimdLevel::NONE>(*array.data());
- const double mean = static_cast<double>(sum) / count;
- const double m2 = arrow::compute::detail::SumArray<CType, double, SimdLevel::NONE>(
- *array.data(), [mean](CType value) {
- const double v = static_cast<double>(value);
- return (v - mean) * (v - mean);
- });
+ const double mean = static_cast<double>(sum) / count;
+ const double m2 = arrow::compute::detail::SumArray<CType, double, SimdLevel::NONE>(
+ *array.data(), [mean](CType value) {
+ const double v = static_cast<double>(value);
+ return (v - mean) * (v - mean);
+ });
this->count = count;
- this->mean = mean;
+ this->mean = mean;
this->m2 = m2;
}
-  // int32/16/8: textbook one-pass algorithm with integer arithmetic
- template <typename T = ArrowType>
- enable_if_t<is_integer_type<T>::value && (sizeof(CType) <= 4)> Consume(
- const ArrayType& array) {
-    // max number of elements such that the sum will not overflow int64 (2Gi int32 elements)
- // for uint32: 0 <= sum < 2^63 (int64 >= 0)
- // for int32: -2^62 <= sum < 2^62
- constexpr int64_t max_length = 1ULL << (63 - sizeof(CType) * 8);
-
- int64_t start_index = 0;
- int64_t valid_count = array.length() - array.null_count();
-
- while (valid_count > 0) {
-      // process in chunks so that overflow will never happen
- const auto slice = array.Slice(start_index, max_length);
- const int64_t count = slice->length() - slice->null_count();
- start_index += max_length;
- valid_count -= count;
-
- if (count > 0) {
- int64_t sum = 0;
- int128_t square_sum = 0;
- const ArrayData& data = *slice->data();
- const CType* values = data.GetValues<CType>(1);
- VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
- [&](int64_t pos, int64_t len) {
- for (int64_t i = 0; i < len; ++i) {
- const auto value = values[pos + i];
- sum += value;
- square_sum += static_cast<uint64_t>(value) * value;
- }
- });
-
- const double mean = static_cast<double>(sum) / count;
- // calculate m2 = square_sum - sum * sum / count
- // decompose `sum * sum / count` into integers and fractions
- const int128_t sum_square = static_cast<int128_t>(sum) * sum;
- const int128_t integers = sum_square / count;
- const double fractions = static_cast<double>(sum_square % count) / count;
- const double m2 = static_cast<double>(square_sum - integers) - fractions;
-
- // merge variance
- ThisType state;
- state.count = count;
- state.mean = mean;
- state.m2 = m2;
- this->MergeFrom(state);
- }
- }
- }
-
- // Combine `m2` from two chunks (m2 = n*s2)
- // https://www.emathzone.com/tutorials/basic-statistics/combined-variance.html
+  // int32/16/8: textbook one-pass algorithm with integer arithmetic
+ template <typename T = ArrowType>
+ enable_if_t<is_integer_type<T>::value && (sizeof(CType) <= 4)> Consume(
+ const ArrayType& array) {
+    // max number of elements such that the sum will not overflow int64 (2Gi int32 elements)
+ // for uint32: 0 <= sum < 2^63 (int64 >= 0)
+ // for int32: -2^62 <= sum < 2^62
+ constexpr int64_t max_length = 1ULL << (63 - sizeof(CType) * 8);
+
+ int64_t start_index = 0;
+ int64_t valid_count = array.length() - array.null_count();
+
+ while (valid_count > 0) {
+      // process in chunks so that overflow will never happen
+ const auto slice = array.Slice(start_index, max_length);
+ const int64_t count = slice->length() - slice->null_count();
+ start_index += max_length;
+ valid_count -= count;
+
+ if (count > 0) {
+ int64_t sum = 0;
+ int128_t square_sum = 0;
+ const ArrayData& data = *slice->data();
+ const CType* values = data.GetValues<CType>(1);
+ VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ const auto value = values[pos + i];
+ sum += value;
+ square_sum += static_cast<uint64_t>(value) * value;
+ }
+ });
+
+ const double mean = static_cast<double>(sum) / count;
+ // calculate m2 = square_sum - sum * sum / count
+ // decompose `sum * sum / count` into integers and fractions
+ const int128_t sum_square = static_cast<int128_t>(sum) * sum;
+ const int128_t integers = sum_square / count;
+ const double fractions = static_cast<double>(sum_square % count) / count;
+ const double m2 = static_cast<double>(square_sum - integers) - fractions;
+
+ // merge variance
+ ThisType state;
+ state.count = count;
+ state.mean = mean;
+ state.m2 = m2;
+ this->MergeFrom(state);
+ }
+ }
+ }
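+
+  // Example of the bound above: for int32 (4-byte CType), max_length = 1 << 31;
+  // each |value| <= 2^31, so |sum| <= 2^31 * 2^31 = 2^62 < 2^63 and the int64
+  // accumulator cannot overflow, while square_sum stays within int128_t.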
+
+ // Combine `m2` from two chunks (m2 = n*s2)
+ // https://www.emathzone.com/tutorials/basic-statistics/combined-variance.html
void MergeFrom(const ThisType& state) {
if (state.count == 0) {
return;
}
if (this->count == 0) {
this->count = state.count;
- this->mean = state.mean;
+ this->mean = state.mean;
this->m2 = state.m2;
return;
}
- double mean = (this->mean * this->count + state.mean * state.count) /
- (this->count + state.count);
- this->m2 += state.m2 + this->count * (this->mean - mean) * (this->mean - mean) +
- state.count * (state.mean - mean) * (state.mean - mean);
+ double mean = (this->mean * this->count + state.mean * state.count) /
+ (this->count + state.count);
+ this->m2 += state.m2 + this->count * (this->mean - mean) * (this->mean - mean) +
+ state.count * (state.mean - mean) * (state.mean - mean);
this->count += state.count;
- this->mean = mean;
+ this->mean = mean;
}
int64_t count = 0;
- double mean = 0;
- double m2 = 0; // m2 = count*s2 = sum((X-mean)^2)
+ double mean = 0;
+ double m2 = 0; // m2 = count*s2 = sum((X-mean)^2)
};
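
// Worked example of the merge above: chunk A = {1, 2, 3} (count 3, mean 2,
// m2 = 2) and chunk B = {5, 7} (count 2, mean 6, m2 = 2) combine to
// mean = (2*3 + 6*2) / 5 = 3.6 and
// m2 = 2 + 2 + 3*(2 - 3.6)^2 + 2*(6 - 3.6)^2 = 23.2,
// which equals sum((x - 3.6)^2) over the concatenation {1, 2, 3, 5, 7}.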
enum class VarOrStd : bool { Var, Std };
@@ -152,27 +152,27 @@ struct VarStdImpl : public ScalarAggregator {
const VarianceOptions& options, VarOrStd return_type)
: out_type(out_type), options(options), return_type(return_type) {}
- Status Consume(KernelContext*, const ExecBatch& batch) override {
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
ArrayType array(batch[0].array());
this->state.Consume(array);
- return Status::OK();
+ return Status::OK();
}
- Status MergeFrom(KernelContext*, KernelState&& src) override {
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
const auto& other = checked_cast<const ThisType&>(src);
this->state.MergeFrom(other.state);
- return Status::OK();
+ return Status::OK();
}
- Status Finalize(KernelContext*, Datum* out) override {
+ Status Finalize(KernelContext*, Datum* out) override {
if (this->state.count <= options.ddof) {
- out->value = std::make_shared<DoubleScalar>();
+ out->value = std::make_shared<DoubleScalar>();
} else {
double var = this->state.m2 / (this->state.count - options.ddof);
out->value =
- std::make_shared<DoubleScalar>(return_type == VarOrStd::Var ? var : sqrt(var));
+ std::make_shared<DoubleScalar>(return_type == VarOrStd::Var ? var : sqrt(var));
}
- return Status::OK();
+ return Status::OK();
}
std::shared_ptr<DataType> out_type;
@@ -181,34 +181,34 @@ struct VarStdImpl : public ScalarAggregator {
VarOrStd return_type;
};
-struct ScalarVarStdImpl : public ScalarAggregator {
- explicit ScalarVarStdImpl(const VarianceOptions& options)
- : options(options), seen(false) {}
-
- Status Consume(KernelContext*, const ExecBatch& batch) override {
- seen = batch[0].scalar()->is_valid;
- return Status::OK();
- }
-
- Status MergeFrom(KernelContext*, KernelState&& src) override {
- const auto& other = checked_cast<const ScalarVarStdImpl&>(src);
- seen = seen || other.seen;
- return Status::OK();
- }
-
- Status Finalize(KernelContext*, Datum* out) override {
- if (!seen || options.ddof > 0) {
- out->value = std::make_shared<DoubleScalar>();
- } else {
- out->value = std::make_shared<DoubleScalar>(0.0);
- }
- return Status::OK();
- }
-
- const VarianceOptions options;
- bool seen;
-};
-
+struct ScalarVarStdImpl : public ScalarAggregator {
+ explicit ScalarVarStdImpl(const VarianceOptions& options)
+ : options(options), seen(false) {}
+
+ Status Consume(KernelContext*, const ExecBatch& batch) override {
+ seen = batch[0].scalar()->is_valid;
+ return Status::OK();
+ }
+
+ Status MergeFrom(KernelContext*, KernelState&& src) override {
+ const auto& other = checked_cast<const ScalarVarStdImpl&>(src);
+ seen = seen || other.seen;
+ return Status::OK();
+ }
+
+ Status Finalize(KernelContext*, Datum* out) override {
+ if (!seen || options.ddof > 0) {
+ out->value = std::make_shared<DoubleScalar>();
+ } else {
+ out->value = std::make_shared<DoubleScalar>(0.0);
+ }
+ return Status::OK();
+ }
+
+ const VarianceOptions options;
+ bool seen;
+};
+
struct VarStdInitState {
std::unique_ptr<KernelState> state;
KernelContext* ctx;
@@ -240,87 +240,87 @@ struct VarStdInitState {
return Status::OK();
}
- Result<std::unique_ptr<KernelState>> Create() {
- RETURN_NOT_OK(VisitTypeInline(in_type, this));
+ Result<std::unique_ptr<KernelState>> Create() {
+ RETURN_NOT_OK(VisitTypeInline(in_type, this));
return std::move(state);
}
};
-Result<std::unique_ptr<KernelState>> StddevInit(KernelContext* ctx,
- const KernelInitArgs& args) {
+Result<std::unique_ptr<KernelState>> StddevInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
VarStdInitState visitor(
ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(),
static_cast<const VarianceOptions&>(*args.options), VarOrStd::Std);
return visitor.Create();
}
-Result<std::unique_ptr<KernelState>> VarianceInit(KernelContext* ctx,
- const KernelInitArgs& args) {
+Result<std::unique_ptr<KernelState>> VarianceInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
VarStdInitState visitor(
ctx, *args.inputs[0].type, args.kernel->signature->out_type().type(),
static_cast<const VarianceOptions&>(*args.options), VarOrStd::Var);
return visitor.Create();
}
-Result<std::unique_ptr<KernelState>> ScalarVarStdInit(KernelContext* ctx,
- const KernelInitArgs& args) {
- return arrow::internal::make_unique<ScalarVarStdImpl>(
- static_cast<const VarianceOptions&>(*args.options));
-}
-
+Result<std::unique_ptr<KernelState>> ScalarVarStdInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ return arrow::internal::make_unique<ScalarVarStdImpl>(
+ static_cast<const VarianceOptions&>(*args.options));
+}
+
void AddVarStdKernels(KernelInit init,
const std::vector<std::shared_ptr<DataType>>& types,
ScalarAggregateFunction* func) {
for (const auto& ty : types) {
auto sig = KernelSignature::Make({InputType::Array(ty)}, float64());
AddAggKernel(std::move(sig), init, func);
-
- sig = KernelSignature::Make({InputType::Scalar(ty)}, float64());
- AddAggKernel(std::move(sig), ScalarVarStdInit, func);
+
+ sig = KernelSignature::Make({InputType::Scalar(ty)}, float64());
+ AddAggKernel(std::move(sig), ScalarVarStdInit, func);
}
}
-const FunctionDoc stddev_doc{
- "Calculate the standard deviation of a numeric array",
- ("The number of degrees of freedom can be controlled using VarianceOptions.\n"
- "By default (`ddof` = 0), the population standard deviation is calculated.\n"
- "Nulls are ignored. If there are not enough non-null values in the array\n"
- "to satisfy `ddof`, null is returned."),
- {"array"},
- "VarianceOptions"};
-
-const FunctionDoc variance_doc{
- "Calculate the variance of a numeric array",
- ("The number of degrees of freedom can be controlled using VarianceOptions.\n"
- "By default (`ddof` = 0), the population variance is calculated.\n"
- "Nulls are ignored. If there are not enough non-null values in the array\n"
- "to satisfy `ddof`, null is returned."),
- {"array"},
- "VarianceOptions"};
-
+const FunctionDoc stddev_doc{
+ "Calculate the standard deviation of a numeric array",
+ ("The number of degrees of freedom can be controlled using VarianceOptions.\n"
+ "By default (`ddof` = 0), the population standard deviation is calculated.\n"
+ "Nulls are ignored. If there are not enough non-null values in the array\n"
+ "to satisfy `ddof`, null is returned."),
+ {"array"},
+ "VarianceOptions"};
+
+const FunctionDoc variance_doc{
+ "Calculate the variance of a numeric array",
+ ("The number of degrees of freedom can be controlled using VarianceOptions.\n"
+ "By default (`ddof` = 0), the population variance is calculated.\n"
+ "Nulls are ignored. If there are not enough non-null values in the array\n"
+ "to satisfy `ddof`, null is returned."),
+ {"array"},
+ "VarianceOptions"};
+
std::shared_ptr<ScalarAggregateFunction> AddStddevAggKernels() {
static auto default_std_options = VarianceOptions::Defaults();
- auto func = std::make_shared<ScalarAggregateFunction>(
- "stddev", Arity::Unary(), &stddev_doc, &default_std_options);
- AddVarStdKernels(StddevInit, NumericTypes(), func.get());
+ auto func = std::make_shared<ScalarAggregateFunction>(
+ "stddev", Arity::Unary(), &stddev_doc, &default_std_options);
+ AddVarStdKernels(StddevInit, NumericTypes(), func.get());
return func;
}
std::shared_ptr<ScalarAggregateFunction> AddVarianceAggKernels() {
static auto default_var_options = VarianceOptions::Defaults();
- auto func = std::make_shared<ScalarAggregateFunction>(
- "variance", Arity::Unary(), &variance_doc, &default_var_options);
- AddVarStdKernels(VarianceInit, NumericTypes(), func.get());
+ auto func = std::make_shared<ScalarAggregateFunction>(
+ "variance", Arity::Unary(), &variance_doc, &default_var_options);
+ AddVarStdKernels(VarianceInit, NumericTypes(), func.get());
return func;
}
-} // namespace
-
-void RegisterScalarAggregateVariance(FunctionRegistry* registry) {
- DCHECK_OK(registry->AddFunction(AddVarianceAggKernels()));
- DCHECK_OK(registry->AddFunction(AddStddevAggKernels()));
-}
-
-} // namespace internal
+} // namespace
+
+void RegisterScalarAggregateVariance(FunctionRegistry* registry) {
+ DCHECK_OK(registry->AddFunction(AddVarianceAggKernels()));
+ DCHECK_OK(registry->AddFunction(AddStddevAggKernels()));
+}
+
+} // namespace internal
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc
index 7133b175472..bab8e7000cd 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.cc
@@ -28,15 +28,15 @@ namespace arrow {
namespace compute {
namespace internal {
-Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return Status::NotImplemented("This kernel is malformed");
+Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return Status::NotImplemented("This kernel is malformed");
}
ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec) {
return [exec](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
ExecBatch flipped_batch = batch;
std::swap(flipped_batch.values[0], flipped_batch.values[1]);
- return exec(ctx, flipped_batch, out);
+ return exec(ctx, flipped_batch, out);
};
}
@@ -48,7 +48,7 @@ std::vector<std::shared_ptr<DataType>> g_numeric_types;
std::vector<std::shared_ptr<DataType>> g_base_binary_types;
std::vector<std::shared_ptr<DataType>> g_temporal_types;
std::vector<std::shared_ptr<DataType>> g_primitive_types;
-std::vector<Type::type> g_decimal_type_ids;
+std::vector<Type::type> g_decimal_type_ids;
static std::once_flag codegen_static_initialized;
template <typename T>
@@ -72,9 +72,9 @@ static void InitStaticData() {
// Floating point types
g_floating_types = {float32(), float64()};
- // Decimal types
- g_decimal_type_ids = {Type::DECIMAL128, Type::DECIMAL256};
-
+ // Decimal types
+ g_decimal_type_ids = {Type::DECIMAL128, Type::DECIMAL256};
+
// Numeric types
Extend(g_int_types, &g_numeric_types);
Extend(g_floating_types, &g_numeric_types);
@@ -136,11 +136,11 @@ const std::vector<std::shared_ptr<DataType>>& FloatingPointTypes() {
return g_floating_types;
}
-const std::vector<Type::type>& DecimalTypeIds() {
- std::call_once(codegen_static_initialized, InitStaticData);
- return g_decimal_type_ids;
-}
-
+const std::vector<Type::type>& DecimalTypeIds() {
+ std::call_once(codegen_static_initialized, InitStaticData);
+ return g_decimal_type_ids;
+}
+
const std::vector<TimeUnit::type>& AllTimeUnits() {
static std::vector<TimeUnit::type> units = {TimeUnit::SECOND, TimeUnit::MILLI,
TimeUnit::MICRO, TimeUnit::NANO};
@@ -164,7 +164,7 @@ const std::vector<std::shared_ptr<DataType>>& PrimitiveTypes() {
const std::vector<std::shared_ptr<DataType>>& ExampleParametricTypes() {
static DataTypeVector example_parametric_types = {
- decimal128(12, 2),
+ decimal128(12, 2),
duration(TimeUnit::SECOND),
timestamp(TimeUnit::SECOND),
time32(TimeUnit::SECOND),
@@ -185,153 +185,153 @@ const std::vector<std::shared_ptr<DataType>>& ExampleParametricTypes() {
// work above
Result<ValueDescr> FirstType(KernelContext*, const std::vector<ValueDescr>& descrs) {
- ValueDescr result = descrs.front();
- result.shape = GetBroadcastShape(descrs);
- return result;
+ ValueDescr result = descrs.front();
+ result.shape = GetBroadcastShape(descrs);
+ return result;
+}
+
+void EnsureDictionaryDecoded(std::vector<ValueDescr>* descrs) {
+ for (ValueDescr& descr : *descrs) {
+ if (descr.type->id() == Type::DICTIONARY) {
+ descr.type = checked_cast<const DictionaryType&>(*descr.type).value_type();
+ }
+ }
+}
+
+void ReplaceNullWithOtherType(std::vector<ValueDescr>* descrs) {
+ DCHECK_EQ(descrs->size(), 2);
+
+ if (descrs->at(0).type->id() == Type::NA) {
+ descrs->at(0).type = descrs->at(1).type;
+ return;
+ }
+
+ if (descrs->at(1).type->id() == Type::NA) {
+ descrs->at(1).type = descrs->at(0).type;
+ return;
+ }
+}
+
+void ReplaceTypes(const std::shared_ptr<DataType>& type,
+ std::vector<ValueDescr>* descrs) {
+ for (auto& descr : *descrs) {
+ descr.type = type;
+ }
+}
+
+std::shared_ptr<DataType> CommonNumeric(const std::vector<ValueDescr>& descrs) {
+ return CommonNumeric(descrs.data(), descrs.size());
+}
+
+std::shared_ptr<DataType> CommonNumeric(const ValueDescr* begin, size_t count) {
+ DCHECK_GT(count, 0) << "tried to find CommonNumeric type of an empty set";
+
+ for (size_t i = 0; i < count; i++) {
+ const auto& descr = *(begin + i);
+ auto id = descr.type->id();
+ if (!is_floating(id) && !is_integer(id)) {
+ // a common numeric type is only possible if all types are numeric
+ return nullptr;
+ }
+ if (id == Type::HALF_FLOAT) {
+ // float16 arithmetic is not currently supported
+ return nullptr;
+ }
+ }
+
+ for (size_t i = 0; i < count; i++) {
+ const auto& descr = *(begin + i);
+ if (descr.type->id() == Type::DOUBLE) return float64();
+ }
+
+ for (size_t i = 0; i < count; i++) {
+ const auto& descr = *(begin + i);
+ if (descr.type->id() == Type::FLOAT) return float32();
+ }
+
+ int max_width_signed = 0, max_width_unsigned = 0;
+
+ for (size_t i = 0; i < count; i++) {
+ const auto& descr = *(begin + i);
+ auto id = descr.type->id();
+ auto max_width = &(is_signed_integer(id) ? max_width_signed : max_width_unsigned);
+ *max_width = std::max(bit_width(id), *max_width);
+ }
+
+ if (max_width_signed == 0) {
+ if (max_width_unsigned >= 64) return uint64();
+ if (max_width_unsigned == 32) return uint32();
+ if (max_width_unsigned == 16) return uint16();
+ DCHECK_EQ(max_width_unsigned, 8);
+ return uint8();
+ }
+
+ if (max_width_signed <= max_width_unsigned) {
+ max_width_signed = static_cast<int>(BitUtil::NextPower2(max_width_unsigned + 1));
+ }
+
+ if (max_width_signed >= 64) return int64();
+ if (max_width_signed == 32) return int32();
+ if (max_width_signed == 16) return int16();
+ DCHECK_EQ(max_width_signed, 8);
+ return int8();
+}
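+
+// Examples of the promotion above: (int8, uint8) -> int16 and
+// (int32, uint32) -> int64 (the signed width is bumped past the unsigned
+// one), (uint8, uint16) -> uint16, and (int32, float32) -> float32.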
+
+std::shared_ptr<DataType> CommonTimestamp(const std::vector<ValueDescr>& descrs) {
+ TimeUnit::type finest_unit = TimeUnit::SECOND;
+
+ for (const auto& descr : descrs) {
+ auto id = descr.type->id();
+ // a common timestamp is only possible if all types are timestamp like
+ switch (id) {
+ case Type::DATE32:
+ case Type::DATE64:
+ continue;
+ case Type::TIMESTAMP:
+ finest_unit =
+ std::max(finest_unit, checked_cast<const TimestampType&>(*descr.type).unit());
+ continue;
+ default:
+ return nullptr;
+ }
+ }
+
+ return timestamp(finest_unit);
+}
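+
+// E.g., (date32, timestamp[ms]) -> timestamp[ms], and (timestamp[s],
+// timestamp[us]) -> timestamp[us]: the finest unit wins; any non-temporal
+// input yields nullptr (no common timestamp type).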
+
+std::shared_ptr<DataType> CommonBinary(const std::vector<ValueDescr>& descrs) {
+ bool all_utf8 = true, all_offset32 = true;
+
+ for (const auto& descr : descrs) {
+ auto id = descr.type->id();
+ // a common varbinary type is only possible if all types are binary like
+ switch (id) {
+ case Type::STRING:
+ continue;
+ case Type::BINARY:
+ all_utf8 = false;
+ continue;
+ case Type::LARGE_STRING:
+ all_offset32 = false;
+ continue;
+ case Type::LARGE_BINARY:
+ all_offset32 = false;
+ all_utf8 = false;
+ continue;
+ default:
+ return nullptr;
+ }
+ }
+
+ if (all_utf8) {
+ if (all_offset32) return utf8();
+ return large_utf8();
+ }
+
+ if (all_offset32) return binary();
+ return large_binary();
}
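
// Examples: (utf8, utf8) -> utf8; (utf8, binary) -> binary (utf8 demoted);
// (utf8, large_utf8) -> large_utf8 (32-bit offsets widened); any non-binary
// input yields nullptr.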
-void EnsureDictionaryDecoded(std::vector<ValueDescr>* descrs) {
- for (ValueDescr& descr : *descrs) {
- if (descr.type->id() == Type::DICTIONARY) {
- descr.type = checked_cast<const DictionaryType&>(*descr.type).value_type();
- }
- }
-}
-
-void ReplaceNullWithOtherType(std::vector<ValueDescr>* descrs) {
- DCHECK_EQ(descrs->size(), 2);
-
- if (descrs->at(0).type->id() == Type::NA) {
- descrs->at(0).type = descrs->at(1).type;
- return;
- }
-
- if (descrs->at(1).type->id() == Type::NA) {
- descrs->at(1).type = descrs->at(0).type;
- return;
- }
-}
-
-void ReplaceTypes(const std::shared_ptr<DataType>& type,
- std::vector<ValueDescr>* descrs) {
- for (auto& descr : *descrs) {
- descr.type = type;
- }
-}
-
-std::shared_ptr<DataType> CommonNumeric(const std::vector<ValueDescr>& descrs) {
- return CommonNumeric(descrs.data(), descrs.size());
-}
-
-std::shared_ptr<DataType> CommonNumeric(const ValueDescr* begin, size_t count) {
- DCHECK_GT(count, 0) << "tried to find CommonNumeric type of an empty set";
-
- for (size_t i = 0; i < count; i++) {
- const auto& descr = *(begin + i);
- auto id = descr.type->id();
- if (!is_floating(id) && !is_integer(id)) {
- // a common numeric type is only possible if all types are numeric
- return nullptr;
- }
- if (id == Type::HALF_FLOAT) {
- // float16 arithmetic is not currently supported
- return nullptr;
- }
- }
-
- for (size_t i = 0; i < count; i++) {
- const auto& descr = *(begin + i);
- if (descr.type->id() == Type::DOUBLE) return float64();
- }
-
- for (size_t i = 0; i < count; i++) {
- const auto& descr = *(begin + i);
- if (descr.type->id() == Type::FLOAT) return float32();
- }
-
- int max_width_signed = 0, max_width_unsigned = 0;
-
- for (size_t i = 0; i < count; i++) {
- const auto& descr = *(begin + i);
- auto id = descr.type->id();
- auto max_width = &(is_signed_integer(id) ? max_width_signed : max_width_unsigned);
- *max_width = std::max(bit_width(id), *max_width);
- }
-
- if (max_width_signed == 0) {
- if (max_width_unsigned >= 64) return uint64();
- if (max_width_unsigned == 32) return uint32();
- if (max_width_unsigned == 16) return uint16();
- DCHECK_EQ(max_width_unsigned, 8);
- return uint8();
- }
-
- if (max_width_signed <= max_width_unsigned) {
- max_width_signed = static_cast<int>(BitUtil::NextPower2(max_width_unsigned + 1));
- }
-
- if (max_width_signed >= 64) return int64();
- if (max_width_signed == 32) return int32();
- if (max_width_signed == 16) return int16();
- DCHECK_EQ(max_width_signed, 8);
- return int8();
-}
-
-std::shared_ptr<DataType> CommonTimestamp(const std::vector<ValueDescr>& descrs) {
- TimeUnit::type finest_unit = TimeUnit::SECOND;
-
- for (const auto& descr : descrs) {
- auto id = descr.type->id();
- // a common timestamp is only possible if all types are timestamp like
- switch (id) {
- case Type::DATE32:
- case Type::DATE64:
- continue;
- case Type::TIMESTAMP:
- finest_unit =
- std::max(finest_unit, checked_cast<const TimestampType&>(*descr.type).unit());
- continue;
- default:
- return nullptr;
- }
- }
-
- return timestamp(finest_unit);
-}
-
-std::shared_ptr<DataType> CommonBinary(const std::vector<ValueDescr>& descrs) {
- bool all_utf8 = true, all_offset32 = true;
-
- for (const auto& descr : descrs) {
- auto id = descr.type->id();
- // a common varbinary type is only possible if all types are binary like
- switch (id) {
- case Type::STRING:
- continue;
- case Type::BINARY:
- all_utf8 = false;
- continue;
- case Type::LARGE_STRING:
- all_offset32 = false;
- continue;
- case Type::LARGE_BINARY:
- all_offset32 = false;
- all_utf8 = false;
- continue;
- default:
- return nullptr;
- }
- }
-
- if (all_utf8) {
- if (all_offset32) return utf8();
- return large_utf8();
- }
-
- if (all_offset32) return binary();
- return large_binary();
-}
-
} // namespace internal
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h
index c1950a2b11a..cb9b13bb3d7 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/codegen_internal.h
@@ -18,7 +18,7 @@
#pragma once
#include <cstdint>
-#include <cstring>
+#include <cstring>
#include <memory>
#include <string>
#include <utility>
@@ -71,14 +71,14 @@ template <typename OptionsType>
struct OptionsWrapper : public KernelState {
explicit OptionsWrapper(OptionsType options) : options(std::move(options)) {}
- static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
- const KernelInitArgs& args) {
+ static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
+ const KernelInitArgs& args) {
if (auto options = static_cast<const OptionsType*>(args.options)) {
return ::arrow::internal::make_unique<OptionsWrapper>(*options);
}
- return Status::Invalid(
- "Attempted to initialize KernelState from null FunctionOptions");
+ return Status::Invalid(
+ "Attempted to initialize KernelState from null FunctionOptions");
}
static const OptionsType& Get(const KernelState& state) {
@@ -90,34 +90,34 @@ struct OptionsWrapper : public KernelState {
OptionsType options;
};
-/// KernelState adapter for when the state is an instance constructed with the
-/// KernelContext and the FunctionOptions as argument
-template <typename StateType, typename OptionsType>
-struct KernelStateFromFunctionOptions : public KernelState {
- explicit KernelStateFromFunctionOptions(KernelContext* ctx, OptionsType state)
- : state(StateType(ctx, std::move(state))) {}
-
- static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
- const KernelInitArgs& args) {
- if (auto options = static_cast<const OptionsType*>(args.options)) {
- return ::arrow::internal::make_unique<KernelStateFromFunctionOptions>(ctx,
- *options);
- }
-
- return Status::Invalid(
- "Attempted to initialize KernelState from null FunctionOptions");
- }
-
- static const StateType& Get(const KernelState& state) {
- return ::arrow::internal::checked_cast<const KernelStateFromFunctionOptions&>(state)
- .state;
- }
-
- static const StateType& Get(KernelContext* ctx) { return Get(*ctx->state()); }
-
- StateType state;
-};
-
+/// KernelState adapter for when the state is an instance constructed with the
+/// KernelContext and the FunctionOptions as argument
+template <typename StateType, typename OptionsType>
+struct KernelStateFromFunctionOptions : public KernelState {
+ explicit KernelStateFromFunctionOptions(KernelContext* ctx, OptionsType state)
+ : state(StateType(ctx, std::move(state))) {}
+
+ static Result<std::unique_ptr<KernelState>> Init(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ if (auto options = static_cast<const OptionsType*>(args.options)) {
+ return ::arrow::internal::make_unique<KernelStateFromFunctionOptions>(ctx,
+ *options);
+ }
+
+ return Status::Invalid(
+ "Attempted to initialize KernelState from null FunctionOptions");
+ }
+
+ static const StateType& Get(const KernelState& state) {
+ return ::arrow::internal::checked_cast<const KernelStateFromFunctionOptions&>(state)
+ .state;
+ }
+
+ static const StateType& Get(KernelContext* ctx) { return Get(*ctx->state()); }
+
+ StateType state;
+};
+
// ----------------------------------------------------------------------
// Input and output value type definitions
@@ -149,22 +149,22 @@ struct GetViewType<Decimal128Type> {
static T LogicalValue(PhysicalType value) {
return Decimal128(reinterpret_cast<const uint8_t*>(value.data()));
}
-
- static T LogicalValue(T value) { return value; }
+
+ static T LogicalValue(T value) { return value; }
+};
+
+template <>
+struct GetViewType<Decimal256Type> {
+ using T = Decimal256;
+ using PhysicalType = util::string_view;
+
+ static T LogicalValue(PhysicalType value) {
+ return Decimal256(reinterpret_cast<const uint8_t*>(value.data()));
+ }
+
+ static T LogicalValue(T value) { return value; }
};
-template <>
-struct GetViewType<Decimal256Type> {
- using T = Decimal256;
- using PhysicalType = util::string_view;
-
- static T LogicalValue(PhysicalType value) {
- return Decimal256(reinterpret_cast<const uint8_t*>(value.data()));
- }
-
- static T LogicalValue(T value) { return value; }
-};
-
template <typename Type, typename Enable = void>
struct GetOutputType;
@@ -183,11 +183,11 @@ struct GetOutputType<Decimal128Type> {
using T = Decimal128;
};
-template <>
-struct GetOutputType<Decimal256Type> {
- using T = Decimal256;
-};
-
+template <>
+struct GetOutputType<Decimal256Type> {
+ using T = Decimal256;
+};
+
// ----------------------------------------------------------------------
// Iteration / value access utilities
@@ -247,18 +247,18 @@ struct ArrayIterator<Type, enable_if_base_binary<Type>> {
}
};
-template <typename Type>
-struct ArrayIterator<Type, enable_if_decimal<Type>> {
- using T = typename TypeTraits<Type>::ScalarType::ValueType;
- using endian_agnostic = std::array<uint8_t, sizeof(T)>;
- const endian_agnostic* values;
-
- explicit ArrayIterator(const ArrayData& data)
- : values(data.GetValues<endian_agnostic>(1)) {}
-
- T operator()() { return T{values++->data()}; }
-};
-
+template <typename Type>
+struct ArrayIterator<Type, enable_if_decimal<Type>> {
+ using T = typename TypeTraits<Type>::ScalarType::ValueType;
+ using endian_agnostic = std::array<uint8_t, sizeof(T)>;
+ const endian_agnostic* values;
+
+ explicit ArrayIterator(const ArrayData& data)
+ : values(data.GetValues<endian_agnostic>(1)) {}
+
+ T operator()() { return T{values++->data()}; }
+};
+
// Iterator over various output array types, taking a GetOutputType<Type>
template <typename Type, typename Enable = void>
@@ -276,26 +276,26 @@ struct OutputArrayWriter<Type, enable_if_has_c_type_not_boolean<Type>> {
// Note that this doesn't write the null bitmap, which should be consistent
// with Write / WriteNull calls
void WriteNull() { *values++ = T{}; }
-
- void WriteAllNull(int64_t length) { std::memset(values, 0, sizeof(T) * length); }
+
+ void WriteAllNull(int64_t length) { std::memset(values, 0, sizeof(T) * length); }
+};
+
+template <typename Type>
+struct OutputArrayWriter<Type, enable_if_decimal<Type>> {
+ using T = typename TypeTraits<Type>::ScalarType::ValueType;
+ using endian_agnostic = std::array<uint8_t, sizeof(T)>;
+ endian_agnostic* values;
+
+ explicit OutputArrayWriter(ArrayData* data)
+ : values(data->GetMutableValues<endian_agnostic>(1)) {}
+
+ void Write(T value) { value.ToBytes(values++->data()); }
+
+ void WriteNull() { T{}.ToBytes(values++->data()); }
+
+ void WriteAllNull(int64_t length) { std::memset(values, 0, sizeof(T) * length); }
};
-template <typename Type>
-struct OutputArrayWriter<Type, enable_if_decimal<Type>> {
- using T = typename TypeTraits<Type>::ScalarType::ValueType;
- using endian_agnostic = std::array<uint8_t, sizeof(T)>;
- endian_agnostic* values;
-
- explicit OutputArrayWriter(ArrayData* data)
- : values(data->GetMutableValues<endian_agnostic>(1)) {}
-
- void Write(T value) { value.ToBytes(values++->data()); }
-
- void WriteNull() { T{}.ToBytes(values++->data()); }
-
- void WriteAllNull(int64_t length) { std::memset(values, 0, sizeof(T) * length); }
-};
-
// (Un)box Scalar to / from C++ value
template <typename Type, typename Enable = void>
@@ -311,9 +311,9 @@ struct UnboxScalar<Type, enable_if_has_c_type<Type>> {
};
template <typename Type>
-struct UnboxScalar<Type, enable_if_has_string_view<Type>> {
+struct UnboxScalar<Type, enable_if_has_string_view<Type>> {
static util::string_view Unbox(const Scalar& val) {
- if (!val.is_valid) return util::string_view();
+ if (!val.is_valid) return util::string_view();
return util::string_view(*checked_cast<const BaseBinaryScalar&>(val).value);
}
};
@@ -325,25 +325,25 @@ struct UnboxScalar<Decimal128Type> {
}
};
-template <>
-struct UnboxScalar<Decimal256Type> {
- static Decimal256 Unbox(const Scalar& val) {
- return checked_cast<const Decimal256Scalar&>(val).value;
- }
-};
-
+template <>
+struct UnboxScalar<Decimal256Type> {
+ static Decimal256 Unbox(const Scalar& val) {
+ return checked_cast<const Decimal256Scalar&>(val).value;
+ }
+};
+
template <typename Type, typename Enable = void>
struct BoxScalar;
template <typename Type>
struct BoxScalar<Type, enable_if_has_c_type<Type>> {
using T = typename GetOutputType<Type>::T;
- static void Box(T val, Scalar* out) {
- // Enables BoxScalar<Int64Type> to work on a (for example) Time64Scalar
- T* mutable_data = reinterpret_cast<T*>(
- checked_cast<::arrow::internal::PrimitiveScalarBase*>(out)->mutable_data());
- *mutable_data = val;
- }
+ static void Box(T val, Scalar* out) {
+ // Enables BoxScalar<Int64Type> to work on a (for example) Time64Scalar
+ T* mutable_data = reinterpret_cast<T*>(
+ checked_cast<::arrow::internal::PrimitiveScalarBase*>(out)->mutable_data());
+ *mutable_data = val;
+ }
};
template <typename Type>
@@ -362,20 +362,20 @@ struct BoxScalar<Decimal128Type> {
static void Box(T val, Scalar* out) { checked_cast<ScalarType*>(out)->value = val; }
};
-template <>
-struct BoxScalar<Decimal256Type> {
- using T = Decimal256;
- using ScalarType = Decimal256Scalar;
- static void Box(T val, Scalar* out) { checked_cast<ScalarType*>(out)->value = val; }
-};
-
+template <>
+struct BoxScalar<Decimal256Type> {
+ using T = Decimal256;
+ using ScalarType = Decimal256Scalar;
+ static void Box(T val, Scalar* out) { checked_cast<ScalarType*>(out)->value = val; }
+};
+
// A VisitArrayDataInline variant that calls its visitor function with logical
// values, such as Decimal128 rather than util::string_view.
template <typename T, typename VisitFunc, typename NullFunc>
-static typename arrow::internal::call_traits::enable_if_return<VisitFunc, void>::type
-VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func,
- NullFunc&& null_func) {
+static typename arrow::internal::call_traits::enable_if_return<VisitFunc, void>::type
+VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func,
+ NullFunc&& null_func) {
VisitArrayDataInline<T>(
arr,
[&](typename GetViewType<T>::PhysicalType v) {
@@ -384,18 +384,18 @@ VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func,
std::forward<NullFunc>(null_func));
}
-template <typename T, typename VisitFunc, typename NullFunc>
-static typename arrow::internal::call_traits::enable_if_return<VisitFunc, Status>::type
-VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func,
- NullFunc&& null_func) {
- return VisitArrayDataInline<T>(
- arr,
- [&](typename GetViewType<T>::PhysicalType v) {
- return valid_func(GetViewType<T>::LogicalValue(std::move(v)));
- },
- std::forward<NullFunc>(null_func));
-}
-
+template <typename T, typename VisitFunc, typename NullFunc>
+static typename arrow::internal::call_traits::enable_if_return<VisitFunc, Status>::type
+VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func,
+ NullFunc&& null_func) {
+ return VisitArrayDataInline<T>(
+ arr,
+ [&](typename GetViewType<T>::PhysicalType v) {
+ return valid_func(GetViewType<T>::LogicalValue(std::move(v)));
+ },
+ std::forward<NullFunc>(null_func));
+}
+
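// Editorial usage sketch for the Status-returning overload above: the visitor
// sees logical values (e.g. Decimal128) rather than the physical
// util::string_view representation. `arr` is assumed to hold Decimal128 data.
//
//   Decimal128 sum(0);
//   RETURN_NOT_OK(VisitArrayValuesInline<Decimal128Type>(
//       arr, [&](Decimal128 v) { sum += v; return Status::OK(); },
//       [] { return Status::OK(); }));  // nulls contribute nothing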
// Like VisitArrayValuesInline, but for binary functions.
template <typename Arg0Type, typename Arg1Type, typename VisitFunc, typename NullFunc>
@@ -425,7 +425,7 @@ Result<ValueDescr> FirstType(KernelContext*, const std::vector<ValueDescr>& desc
// ----------------------------------------------------------------------
// Generate an array kernel given template classes
-Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out);
ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec);
@@ -439,7 +439,7 @@ const std::vector<std::shared_ptr<DataType>>& SignedIntTypes();
const std::vector<std::shared_ptr<DataType>>& UnsignedIntTypes();
const std::vector<std::shared_ptr<DataType>>& IntTypes();
const std::vector<std::shared_ptr<DataType>>& FloatingPointTypes();
-const std::vector<Type::type>& DecimalTypeIds();
+const std::vector<Type::type>& DecimalTypeIds();
ARROW_EXPORT
const std::vector<TimeUnit::type>& AllTimeUnits();
@@ -483,16 +483,16 @@ namespace applicator {
//
// Operator must implement
//
-// static Status Call(KernelContext*, const ArrayData& in, ArrayData* out)
-// static Status Call(KernelContext*, const Scalar& in, Scalar* out)
+// static Status Call(KernelContext*, const ArrayData& in, ArrayData* out)
+// static Status Call(KernelContext*, const Scalar& in, Scalar* out)
template <typename Operator>
-static Status SimpleUnary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+static Status SimpleUnary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].kind() == Datum::SCALAR) {
- return Operator::Call(ctx, *batch[0].scalar(), out->scalar().get());
+ return Operator::Call(ctx, *batch[0].scalar(), out->scalar().get());
} else if (batch.length > 0) {
- return Operator::Call(ctx, *batch[0].array(), out->mutable_array());
+ return Operator::Call(ctx, *batch[0].array(), out->mutable_array());
}
- return Status::OK();
+ return Status::OK();
}
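// Editorial sketch of an Operator satisfying the SimpleUnary contract above;
// the name IdentityOp is hypothetical. Both overloads are required because the
// batch may carry either an array or a scalar.
//
//   struct IdentityOp {
//     static Status Call(KernelContext*, const Scalar& in, Scalar* out) {
//       /* fill *out from in */ return Status::OK();
//     }
//     static Status Call(KernelContext*, const ArrayData& in, ArrayData* out) {
//       /* fill *out from in */ return Status::OK();
//     }
//   };
//   // used as: SimpleUnary<IdentityOp>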
// Generate an ArrayKernelExec given a functor that handles all of its own
@@ -500,34 +500,34 @@ static Status SimpleUnary(KernelContext* ctx, const ExecBatch& batch, Datum* out
//
// Operator must implement
//
-// static Status Call(KernelContext*, const ArrayData& arg0, const ArrayData& arg1,
-// ArrayData* out)
-// static Status Call(KernelContext*, const ArrayData& arg0, const Scalar& arg1,
-// ArrayData* out)
-// static Status Call(KernelContext*, const Scalar& arg0, const ArrayData& arg1,
-// ArrayData* out)
-// static Status Call(KernelContext*, const Scalar& arg0, const Scalar& arg1,
-// Scalar* out)
+// static Status Call(KernelContext*, const ArrayData& arg0, const ArrayData& arg1,
+// ArrayData* out)
+// static Status Call(KernelContext*, const ArrayData& arg0, const Scalar& arg1,
+// ArrayData* out)
+// static Status Call(KernelContext*, const Scalar& arg0, const ArrayData& arg1,
+// ArrayData* out)
+// static Status Call(KernelContext*, const Scalar& arg0, const Scalar& arg1,
+// Scalar* out)
template <typename Operator>
-static Status SimpleBinary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (batch.length == 0) return Status::OK();
-
- if (batch[0].kind() == Datum::ARRAY) {
- if (batch[1].kind() == Datum::ARRAY) {
- return Operator::Call(ctx, *batch[0].array(), *batch[1].array(),
- out->mutable_array());
- } else {
- return Operator::Call(ctx, *batch[0].array(), *batch[1].scalar(),
- out->mutable_array());
- }
- } else {
- if (batch[1].kind() == Datum::ARRAY) {
- return Operator::Call(ctx, *batch[0].scalar(), *batch[1].array(),
- out->mutable_array());
- } else {
- return Operator::Call(ctx, *batch[0].scalar(), *batch[1].scalar(),
- out->scalar().get());
- }
+static Status SimpleBinary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch.length == 0) return Status::OK();
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ if (batch[1].kind() == Datum::ARRAY) {
+ return Operator::Call(ctx, *batch[0].array(), *batch[1].array(),
+ out->mutable_array());
+ } else {
+ return Operator::Call(ctx, *batch[0].array(), *batch[1].scalar(),
+ out->mutable_array());
+ }
+ } else {
+ if (batch[1].kind() == Datum::ARRAY) {
+ return Operator::Call(ctx, *batch[0].scalar(), *batch[1].array(),
+ out->mutable_array());
+ } else {
+ return Operator::Call(ctx, *batch[0].scalar(), *batch[1].scalar(),
+ out->scalar().get());
+ }
}
}
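// Editorial note: the dispatch above maps the Datum kinds of the two
// arguments onto the four Operator::Call overloads listed in the contract:
//
//   (ARRAY,  ARRAY)  -> Call(ctx, array0,  array1,  out->mutable_array())
//   (ARRAY,  SCALAR) -> Call(ctx, array0,  scalar1, out->mutable_array())
//   (SCALAR, ARRAY)  -> Call(ctx, scalar0, array1,  out->mutable_array())
//   (SCALAR, SCALAR) -> Call(ctx, scalar0, scalar1, out->scalar().get())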
@@ -541,53 +541,53 @@ struct OutputAdapter;
template <typename Type>
struct OutputAdapter<Type, enable_if_boolean<Type>> {
template <typename Generator>
- static Status Write(KernelContext*, Datum* out, Generator&& generator) {
+ static Status Write(KernelContext*, Datum* out, Generator&& generator) {
ArrayData* out_arr = out->mutable_array();
auto out_bitmap = out_arr->buffers[1]->mutable_data();
GenerateBitsUnrolled(out_bitmap, out_arr->offset, out_arr->length,
std::forward<Generator>(generator));
- return Status::OK();
+ return Status::OK();
}
};
template <typename Type>
struct OutputAdapter<Type, enable_if_has_c_type_not_boolean<Type>> {
template <typename Generator>
- static Status Write(KernelContext*, Datum* out, Generator&& generator) {
+ static Status Write(KernelContext*, Datum* out, Generator&& generator) {
ArrayData* out_arr = out->mutable_array();
auto out_data = out_arr->GetMutableValues<typename Type::c_type>(1);
// TODO: Is this as fast as a more explicitly inlined function?
for (int64_t i = 0; i < out_arr->length; ++i) {
*out_data++ = generator();
}
- return Status::OK();
+ return Status::OK();
}
};
template <typename Type>
struct OutputAdapter<Type, enable_if_base_binary<Type>> {
template <typename Generator>
- static Status Write(KernelContext* ctx, Datum* out, Generator&& generator) {
- return Status::NotImplemented("NYI");
+ static Status Write(KernelContext* ctx, Datum* out, Generator&& generator) {
+ return Status::NotImplemented("NYI");
+ }
+};
+
+template <typename Type>
+struct OutputAdapter<Type, enable_if_decimal<Type>> {
+ using T = typename TypeTraits<Type>::ScalarType::ValueType;
+ using endian_agnostic = std::array<uint8_t, sizeof(T)>;
+
+ template <typename Generator>
+ static Status Write(KernelContext*, Datum* out, Generator&& generator) {
+ ArrayData* out_arr = out->mutable_array();
+ auto out_data = out_arr->GetMutableValues<endian_agnostic>(1);
+ for (int64_t i = 0; i < out_arr->length; ++i) {
+ generator().ToBytes(out_data++->data());
+ }
+ return Status::OK();
}
};
-template <typename Type>
-struct OutputAdapter<Type, enable_if_decimal<Type>> {
- using T = typename TypeTraits<Type>::ScalarType::ValueType;
- using endian_agnostic = std::array<uint8_t, sizeof(T)>;
-
- template <typename Generator>
- static Status Write(KernelContext*, Datum* out, Generator&& generator) {
- ArrayData* out_arr = out->mutable_array();
- auto out_data = out_arr->GetMutableValues<endian_agnostic>(1);
- for (int64_t i = 0; i < out_arr->length; ++i) {
- generator().ToBytes(out_data++->data());
- }
- return Status::OK();
- }
-};
-
// A kernel exec generator for unary functions that addresses both array and
// scalar inputs and dispatches input iteration and output writing to other
// templates
@@ -600,10 +600,10 @@ struct OutputAdapter<Type, enable_if_decimal<Type>> {
//
// struct Op {
// template <typename OutValue, typename Arg0Value>
-// static OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) {
+// static OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) {
// // implementation
-// // NOTE: "status" should only be populated with errors,
-// // leave it unmodified to indicate Status::OK()
+// // NOTE: "status" should only be populated with errors,
+// // leave it unmodified to indicate Status::OK()
// }
// };
template <typename OutType, typename Arg0Type, typename Op>
@@ -611,34 +611,34 @@ struct ScalarUnary {
using OutValue = typename GetOutputType<OutType>::T;
using Arg0Value = typename GetViewType<Arg0Type>::T;
- static Status ExecArray(KernelContext* ctx, const ArrayData& arg0, Datum* out) {
- Status st = Status::OK();
+ static Status ExecArray(KernelContext* ctx, const ArrayData& arg0, Datum* out) {
+ Status st = Status::OK();
ArrayIterator<Arg0Type> arg0_it(arg0);
- RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
- return Op::template Call<OutValue, Arg0Value>(ctx, arg0_it(), &st);
- }));
- return st;
+ RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
+ return Op::template Call<OutValue, Arg0Value>(ctx, arg0_it(), &st);
+ }));
+ return st;
}
- static Status ExecScalar(KernelContext* ctx, const Scalar& arg0, Datum* out) {
- Status st = Status::OK();
- Scalar* out_scalar = out->scalar().get();
+ static Status ExecScalar(KernelContext* ctx, const Scalar& arg0, Datum* out) {
+ Status st = Status::OK();
+ Scalar* out_scalar = out->scalar().get();
if (arg0.is_valid) {
Arg0Value arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
- out_scalar->is_valid = true;
- BoxScalar<OutType>::Box(Op::template Call<OutValue, Arg0Value>(ctx, arg0_val, &st),
- out_scalar);
+ out_scalar->is_valid = true;
+ BoxScalar<OutType>::Box(Op::template Call<OutValue, Arg0Value>(ctx, arg0_val, &st),
+ out_scalar);
} else {
- out_scalar->is_valid = false;
+ out_scalar->is_valid = false;
}
- return st;
+ return st;
}
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].kind() == Datum::ARRAY) {
- return ExecArray(ctx, *batch[0].array(), out);
+ return ExecArray(ctx, *batch[0].array(), out);
} else {
- return ExecScalar(ctx, *batch[0].scalar(), out);
+ return ExecScalar(ctx, *batch[0].scalar(), out);
}
}
};
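// Editorial sketch of an Op usable with ScalarUnary; NegateOp is a
// hypothetical name. Errors are reported by assigning to *st; leaving it
// untouched signals success, per the contract comment above.
//
//   struct NegateOp {
//     template <typename OutValue, typename Arg0Value>
//     static OutValue Call(KernelContext*, Arg0Value val, Status*) {
//       return static_cast<OutValue>(-val);
//     }
//   };
//   // e.g. ScalarUnary<Int64Type, Int64Type, NegateOp>::Exec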
@@ -658,69 +658,69 @@ struct ScalarUnaryNotNullStateful {
template <typename Type, typename Enable = void>
struct ArrayExec {
- static Status Exec(const ThisType& functor, KernelContext* ctx,
- const ExecBatch& batch, Datum* out) {
+ static Status Exec(const ThisType& functor, KernelContext* ctx,
+ const ExecBatch& batch, Datum* out) {
ARROW_LOG(FATAL) << "Missing ArrayExec specialization for output type "
<< out->type();
- return Status::NotImplemented("NYI");
+ return Status::NotImplemented("NYI");
}
};
template <typename Type>
struct ArrayExec<
Type, enable_if_t<has_c_type<Type>::value && !is_boolean_type<Type>::value>> {
- static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
- Datum* out) {
- Status st = Status::OK();
+ static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
+ Datum* out) {
+ Status st = Status::OK();
ArrayData* out_arr = out->mutable_array();
auto out_data = out_arr->GetMutableValues<OutValue>(1);
VisitArrayValuesInline<Arg0Type>(
arg0,
[&](Arg0Value v) {
- *out_data++ = functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st);
+ *out_data++ = functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st);
},
[&]() {
// null
- *out_data++ = OutValue{};
+ *out_data++ = OutValue{};
});
- return st;
+ return st;
}
};
template <typename Type>
struct ArrayExec<Type, enable_if_base_binary<Type>> {
- static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
- Datum* out) {
+ static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
+ Datum* out) {
// NOTE: This code is not currently used by any kernels and has
// suboptimal performance because it's recomputing the validity bitmap
// that is already computed by the kernel execution layer. Consider
// writing a lower-level "output adapter" for base binary types.
typename TypeTraits<Type>::BuilderType builder;
- Status st = Status::OK();
- RETURN_NOT_OK(VisitArrayValuesInline<Arg0Type>(
- arg0, [&](Arg0Value v) { return builder.Append(functor.op.Call(ctx, v, &st)); },
- [&]() { return builder.AppendNull(); }));
- if (st.ok()) {
+ Status st = Status::OK();
+ RETURN_NOT_OK(VisitArrayValuesInline<Arg0Type>(
+ arg0, [&](Arg0Value v) { return builder.Append(functor.op.Call(ctx, v, &st)); },
+ [&]() { return builder.AppendNull(); }));
+ if (st.ok()) {
std::shared_ptr<ArrayData> result;
- RETURN_NOT_OK(builder.FinishInternal(&result));
+ RETURN_NOT_OK(builder.FinishInternal(&result));
out->value = std::move(result);
}
- return st;
+ return st;
}
};
template <typename Type>
struct ArrayExec<Type, enable_if_t<is_boolean_type<Type>::value>> {
- static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
- Datum* out) {
- Status st = Status::OK();
+ static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
+ Datum* out) {
+ Status st = Status::OK();
ArrayData* out_arr = out->mutable_array();
FirstTimeBitmapWriter out_writer(out_arr->buffers[1]->mutable_data(),
out_arr->offset, out_arr->length);
VisitArrayValuesInline<Arg0Type>(
arg0,
[&](Arg0Value v) {
- if (functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st)) {
+ if (functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st)) {
out_writer.Set();
}
out_writer.Next();
@@ -731,49 +731,49 @@ struct ScalarUnaryNotNullStateful {
out_writer.Next();
});
out_writer.Finish();
- return st;
+ return st;
}
};
template <typename Type>
- struct ArrayExec<Type, enable_if_decimal<Type>> {
- static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
- Datum* out) {
- Status st = Status::OK();
+ struct ArrayExec<Type, enable_if_decimal<Type>> {
+ static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0,
+ Datum* out) {
+ Status st = Status::OK();
ArrayData* out_arr = out->mutable_array();
- // Decimal128 data buffers are not safely reinterpret_cast-able on big-endian
- using endian_agnostic =
- std::array<uint8_t, sizeof(typename TypeTraits<Type>::ScalarType::ValueType)>;
- auto out_data = out_arr->GetMutableValues<endian_agnostic>(1);
+ // Decimal128 data buffers are not safely reinterpret_cast-able on big-endian
+ using endian_agnostic =
+ std::array<uint8_t, sizeof(typename TypeTraits<Type>::ScalarType::ValueType)>;
+ auto out_data = out_arr->GetMutableValues<endian_agnostic>(1);
VisitArrayValuesInline<Arg0Type>(
arg0,
[&](Arg0Value v) {
- functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st)
- .ToBytes(out_data++->data());
+ functor.op.template Call<OutValue, Arg0Value>(ctx, v, &st)
+ .ToBytes(out_data++->data());
},
- [&]() {
- // null
- std::memset(out_data, 0, sizeof(*out_data));
- ++out_data;
- });
- return st;
+ [&]() {
+ // null
+ std::memset(out_data, 0, sizeof(*out_data));
+ ++out_data;
+ });
+ return st;
}
};
- Status Scalar(KernelContext* ctx, const Scalar& arg0, Datum* out) {
- Status st = Status::OK();
+ Status Scalar(KernelContext* ctx, const Scalar& arg0, Datum* out) {
+ Status st = Status::OK();
if (arg0.is_valid) {
Arg0Value arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
- BoxScalar<OutType>::Box(
- this->op.template Call<OutValue, Arg0Value>(ctx, arg0_val, &st),
- out->scalar().get());
+ BoxScalar<OutType>::Box(
+ this->op.template Call<OutValue, Arg0Value>(ctx, arg0_val, &st),
+ out->scalar().get());
}
- return st;
+ return st;
}
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].kind() == Datum::ARRAY) {
- return ArrayExec<OutType>::Exec(*this, ctx, *batch[0].array(), out);
+ return ArrayExec<OutType>::Exec(*this, ctx, *batch[0].array(), out);
} else {
return Scalar(ctx, *batch[0].scalar(), out);
}
@@ -788,7 +788,7 @@ struct ScalarUnaryNotNull {
using OutValue = typename GetOutputType<OutType>::T;
using Arg0Value = typename GetViewType<Arg0Type>::T;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// Seed kernel with dummy state
ScalarUnaryNotNullStateful<OutType, Arg0Type, Op> kernel({});
return kernel.Exec(ctx, batch, out);
@@ -807,11 +807,11 @@ struct ScalarUnaryNotNull {
//
// struct Op {
// template <typename OutValue, typename Arg0Value, typename Arg1Value>
-// static OutValue Call(KernelContext* ctx, Arg0Value arg0, Arg1Value arg1, Status* st)
-// {
+// static OutValue Call(KernelContext* ctx, Arg0Value arg0, Arg1Value arg1, Status* st)
+// {
// // implementation
-// // NOTE: "status" should only be populated with errors,
-// // leave it unmodified to indicate Status::OK()
+// // NOTE: "status" should only be populated with errors,
+// // leave it unmodified to indicate Status::OK()
// }
// };
template <typename OutType, typename Arg0Type, typename Arg1Type, typename Op>
@@ -820,56 +820,56 @@ struct ScalarBinary {
using Arg0Value = typename GetViewType<Arg0Type>::T;
using Arg1Value = typename GetViewType<Arg1Type>::T;
- static Status ArrayArray(KernelContext* ctx, const ArrayData& arg0,
- const ArrayData& arg1, Datum* out) {
- Status st = Status::OK();
+ static Status ArrayArray(KernelContext* ctx, const ArrayData& arg0,
+ const ArrayData& arg1, Datum* out) {
+ Status st = Status::OK();
ArrayIterator<Arg0Type> arg0_it(arg0);
ArrayIterator<Arg1Type> arg1_it(arg1);
- RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
- return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_it(), arg1_it(),
- &st);
- }));
- return st;
+ RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
+ return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_it(), arg1_it(),
+ &st);
+ }));
+ return st;
}
- static Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1,
- Datum* out) {
- Status st = Status::OK();
+ static Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1,
+ Datum* out) {
+ Status st = Status::OK();
ArrayIterator<Arg0Type> arg0_it(arg0);
auto arg1_val = UnboxScalar<Arg1Type>::Unbox(arg1);
- RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
- return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_it(), arg1_val,
- &st);
- }));
- return st;
+ RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
+ return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_it(), arg1_val,
+ &st);
+ }));
+ return st;
}
- static Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1,
- Datum* out) {
- Status st = Status::OK();
+ static Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1,
+ Datum* out) {
+ Status st = Status::OK();
auto arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
ArrayIterator<Arg1Type> arg1_it(arg1);
- RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
- return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_it(),
- &st);
- }));
- return st;
+ RETURN_NOT_OK(OutputAdapter<OutType>::Write(ctx, out, [&]() -> OutValue {
+ return Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_it(),
+ &st);
+ }));
+ return st;
}
- static Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1,
- Datum* out) {
- Status st = Status::OK();
+ static Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1,
+ Datum* out) {
+ Status st = Status::OK();
if (out->scalar()->is_valid) {
auto arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
auto arg1_val = UnboxScalar<Arg1Type>::Unbox(arg1);
- BoxScalar<OutType>::Box(
- Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_val, &st),
- out->scalar().get());
+ BoxScalar<OutType>::Box(
+ Op::template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_val, &st),
+ out->scalar().get());
}
- return st;
+ return st;
}
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].kind() == Datum::ARRAY) {
if (batch[1].kind() == Datum::ARRAY) {
return ArrayArray(ctx, *batch[0].array(), *batch[1].array(), out);
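// Editorial sketch of an Op for ScalarBinary; AddOp is a hypothetical name.
// The same Call template serves all four array/scalar combinations above, so
// it must accept both iterated values and unboxed scalar values.
//
//   struct AddOp {
//     template <typename OutValue, typename Arg0Value, typename Arg1Value>
//     static OutValue Call(KernelContext*, Arg0Value l, Arg1Value r, Status*) {
//       return static_cast<OutValue>(l + r);
//     }
//   };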
@@ -900,22 +900,22 @@ struct ScalarBinaryNotNullStateful {
// NOTE: In ArrayExec<Type>, Type is really OutputType
- Status ArrayArray(KernelContext* ctx, const ArrayData& arg0, const ArrayData& arg1,
- Datum* out) {
- Status st = Status::OK();
+ Status ArrayArray(KernelContext* ctx, const ArrayData& arg0, const ArrayData& arg1,
+ Datum* out) {
+ Status st = Status::OK();
OutputArrayWriter<OutType> writer(out->mutable_array());
VisitTwoArrayValuesInline<Arg0Type, Arg1Type>(
arg0, arg1,
[&](Arg0Value u, Arg1Value v) {
- writer.Write(op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, u, v, &st));
+ writer.Write(op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, u, v, &st));
},
[&]() { writer.WriteNull(); });
- return st;
+ return st;
}
- Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1,
- Datum* out) {
- Status st = Status::OK();
+ Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1,
+ Datum* out) {
+ Status st = Status::OK();
OutputArrayWriter<OutType> writer(out->mutable_array());
if (arg1.is_valid) {
const auto arg1_val = UnboxScalar<Arg1Type>::Unbox(arg1);
@@ -923,18 +923,18 @@ struct ScalarBinaryNotNullStateful {
arg0,
[&](Arg0Value u) {
writer.Write(
- op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, u, arg1_val, &st));
+ op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, u, arg1_val, &st));
},
[&]() { writer.WriteNull(); });
- } else {
- writer.WriteAllNull(out->mutable_array()->length);
+ } else {
+ writer.WriteAllNull(out->mutable_array()->length);
}
- return st;
+ return st;
}
- Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1,
- Datum* out) {
- Status st = Status::OK();
+ Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1,
+ Datum* out) {
+ Status st = Status::OK();
OutputArrayWriter<OutType> writer(out->mutable_array());
if (arg0.is_valid) {
const auto arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
@@ -942,29 +942,29 @@ struct ScalarBinaryNotNullStateful {
arg1,
[&](Arg1Value v) {
writer.Write(
- op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, v, &st));
+ op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, v, &st));
},
[&]() { writer.WriteNull(); });
- } else {
- writer.WriteAllNull(out->mutable_array()->length);
+ } else {
+ writer.WriteAllNull(out->mutable_array()->length);
}
- return st;
+ return st;
}
- Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1,
- Datum* out) {
- Status st = Status::OK();
+ Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1,
+ Datum* out) {
+ Status st = Status::OK();
if (arg0.is_valid && arg1.is_valid) {
const auto arg0_val = UnboxScalar<Arg0Type>::Unbox(arg0);
const auto arg1_val = UnboxScalar<Arg1Type>::Unbox(arg1);
BoxScalar<OutType>::Box(
- op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_val, &st),
+ op.template Call<OutValue, Arg0Value, Arg1Value>(ctx, arg0_val, arg1_val, &st),
out->scalar().get());
}
- return st;
+ return st;
}
- Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (batch[0].kind() == Datum::ARRAY) {
if (batch[1].kind() == Datum::ARRAY) {
return ArrayArray(ctx, *batch[0].array(), *batch[1].array(), out);
@@ -991,7 +991,7 @@ struct ScalarBinaryNotNull {
using Arg0Value = typename GetViewType<Arg0Type>::T;
using Arg1Value = typename GetViewType<Arg1Type>::T;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// Seed kernel with dummy state
ScalarBinaryNotNullStateful<OutType, Arg0Type, Arg1Type, Op> kernel({});
return kernel.Exec(ctx, batch, out);
@@ -1160,41 +1160,41 @@ ArrayKernelExec GeneratePhysicalInteger(detail::GetTypeId get_id) {
}
}
-template <template <typename... Args> class Generator, typename... Args>
-ArrayKernelExec GeneratePhysicalNumeric(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::INT8:
- return Generator<Int8Type, Args...>::Exec;
- case Type::INT16:
- return Generator<Int16Type, Args...>::Exec;
- case Type::INT32:
- case Type::DATE32:
- case Type::TIME32:
- return Generator<Int32Type, Args...>::Exec;
- case Type::INT64:
- case Type::DATE64:
- case Type::TIMESTAMP:
- case Type::TIME64:
- case Type::DURATION:
- return Generator<Int64Type, Args...>::Exec;
- case Type::UINT8:
- return Generator<UInt8Type, Args...>::Exec;
- case Type::UINT16:
- return Generator<UInt16Type, Args...>::Exec;
- case Type::UINT32:
- return Generator<UInt32Type, Args...>::Exec;
- case Type::UINT64:
- return Generator<UInt64Type, Args...>::Exec;
- case Type::FLOAT:
- return Generator<FloatType, Args...>::Exec;
- case Type::DOUBLE:
- return Generator<DoubleType, Args...>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
+template <template <typename... Args> class Generator, typename... Args>
+ArrayKernelExec GeneratePhysicalNumeric(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return Generator<Int8Type, Args...>::Exec;
+ case Type::INT16:
+ return Generator<Int16Type, Args...>::Exec;
+ case Type::INT32:
+ case Type::DATE32:
+ case Type::TIME32:
+ return Generator<Int32Type, Args...>::Exec;
+ case Type::INT64:
+ case Type::DATE64:
+ case Type::TIMESTAMP:
+ case Type::TIME64:
+ case Type::DURATION:
+ return Generator<Int64Type, Args...>::Exec;
+ case Type::UINT8:
+ return Generator<UInt8Type, Args...>::Exec;
+ case Type::UINT16:
+ return Generator<UInt16Type, Args...>::Exec;
+ case Type::UINT32:
+ return Generator<UInt32Type, Args...>::Exec;
+ case Type::UINT64:
+ return Generator<UInt64Type, Args...>::Exec;
+ case Type::FLOAT:
+ return Generator<FloatType, Args...>::Exec;
+ case Type::DOUBLE:
+ return Generator<DoubleType, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
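// Editorial usage sketch: resolving an exec from a runtime type id. MyKernel
// is a hypothetical functor template; note the dispatcher collapses temporal
// types onto their physical integer representations (e.g. TIMESTAMP -> int64).
//
//   ArrayKernelExec exec = GeneratePhysicalNumeric<MyKernel>(ty->id());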
// Generate a kernel given a templated functor for integer types
//
// See "Numeric" above for description of the generator functor
@@ -1222,26 +1222,26 @@ ArrayKernelExec GenerateSignedInteger(detail::GetTypeId get_id) {
// bits).
//
// See "Numeric" above for description of the generator functor
-template <template <typename...> class Generator, typename... Args>
+template <template <typename...> class Generator, typename... Args>
ArrayKernelExec GenerateTypeAgnosticPrimitive(detail::GetTypeId get_id) {
switch (get_id.id) {
case Type::NA:
- return Generator<NullType, Args...>::Exec;
+ return Generator<NullType, Args...>::Exec;
case Type::BOOL:
- return Generator<BooleanType, Args...>::Exec;
+ return Generator<BooleanType, Args...>::Exec;
case Type::UINT8:
case Type::INT8:
- return Generator<UInt8Type, Args...>::Exec;
+ return Generator<UInt8Type, Args...>::Exec;
case Type::UINT16:
case Type::INT16:
- return Generator<UInt16Type, Args...>::Exec;
+ return Generator<UInt16Type, Args...>::Exec;
case Type::UINT32:
case Type::INT32:
case Type::FLOAT:
case Type::DATE32:
case Type::TIME32:
- case Type::INTERVAL_MONTHS:
- return Generator<UInt32Type, Args...>::Exec;
+ case Type::INTERVAL_MONTHS:
+ return Generator<UInt32Type, Args...>::Exec;
case Type::UINT64:
case Type::INT64:
case Type::DOUBLE:
@@ -1249,30 +1249,30 @@ ArrayKernelExec GenerateTypeAgnosticPrimitive(detail::GetTypeId get_id) {
case Type::TIMESTAMP:
case Type::TIME64:
case Type::DURATION:
- case Type::INTERVAL_DAY_TIME:
- return Generator<UInt64Type, Args...>::Exec;
+ case Type::INTERVAL_DAY_TIME:
+ return Generator<UInt64Type, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+// Similar to GenerateTypeAgnosticPrimitive, but for variable-width binary types
+template <template <typename...> class Generator, typename... Args>
+ArrayKernelExec GenerateTypeAgnosticVarBinaryBase(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::BINARY:
+ case Type::STRING:
+ return Generator<BinaryType, Args...>::Exec;
+ case Type::LARGE_BINARY:
+ case Type::LARGE_STRING:
+ return Generator<LargeBinaryType, Args...>::Exec;
default:
DCHECK(false);
return ExecFail;
}
}
-// Similar to GenerateTypeAgnosticPrimitive, but for variable-width binary types
-template <template <typename...> class Generator, typename... Args>
-ArrayKernelExec GenerateTypeAgnosticVarBinaryBase(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::BINARY:
- case Type::STRING:
- return Generator<BinaryType, Args...>::Exec;
- case Type::LARGE_BINARY:
- case Type::LARGE_STRING:
- return Generator<LargeBinaryType, Args...>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
// Generate a kernel given a templated functor for base binary types. Generates
// a single kernel for binary/string and large binary / large string. If your
// kernel implementation needs access to the specific type at compile time,
@@ -1336,46 +1336,46 @@ ArrayKernelExec GenerateTemporal(detail::GetTypeId get_id) {
}
}
-// Generate a kernel given a templated functor for decimal types
-//
-// See "Numeric" above for description of the generator functor
-template <template <typename...> class Generator, typename Type0, typename... Args>
-ArrayKernelExec GenerateDecimal(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::DECIMAL128:
- return Generator<Type0, Decimal128Type, Args...>::Exec;
- case Type::DECIMAL256:
- return Generator<Type0, Decimal256Type, Args...>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
+// Generate a kernel given a templated functor for decimal types
+//
+// See "Numeric" above for description of the generator functor
+template <template <typename...> class Generator, typename Type0, typename... Args>
+ArrayKernelExec GenerateDecimal(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::DECIMAL128:
+ return Generator<Type0, Decimal128Type, Args...>::Exec;
+ case Type::DECIMAL256:
+ return Generator<Type0, Decimal256Type, Args...>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
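// Editorial usage sketch (DecimalCmp is hypothetical): Type0 is forwarded
// ahead of the decimal type, so a caller can pin, say, the output type while
// dispatching on decimal width:
//
//   // instantiates DecimalCmp<BooleanType, Decimal128Type>::Exec for DECIMAL128
//   ArrayKernelExec exec = GenerateDecimal<DecimalCmp, BooleanType>(ty->id());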
// END of kernel generator-dispatchers
// ----------------------------------------------------------------------
-ARROW_EXPORT
-void EnsureDictionaryDecoded(std::vector<ValueDescr>* descrs);
-
-ARROW_EXPORT
-void ReplaceNullWithOtherType(std::vector<ValueDescr>* descrs);
-
-ARROW_EXPORT
-void ReplaceTypes(const std::shared_ptr<DataType>&, std::vector<ValueDescr>* descrs);
-
-ARROW_EXPORT
-std::shared_ptr<DataType> CommonNumeric(const std::vector<ValueDescr>& descrs);
-
-ARROW_EXPORT
-std::shared_ptr<DataType> CommonNumeric(const ValueDescr* begin, size_t count);
-
-ARROW_EXPORT
-std::shared_ptr<DataType> CommonTimestamp(const std::vector<ValueDescr>& descrs);
-
-ARROW_EXPORT
-std::shared_ptr<DataType> CommonBinary(const std::vector<ValueDescr>& descrs);
-
+ARROW_EXPORT
+void EnsureDictionaryDecoded(std::vector<ValueDescr>* descrs);
+
+ARROW_EXPORT
+void ReplaceNullWithOtherType(std::vector<ValueDescr>* descrs);
+
+ARROW_EXPORT
+void ReplaceTypes(const std::shared_ptr<DataType>&, std::vector<ValueDescr>* descrs);
+
+ARROW_EXPORT
+std::shared_ptr<DataType> CommonNumeric(const std::vector<ValueDescr>& descrs);
+
+ARROW_EXPORT
+std::shared_ptr<DataType> CommonNumeric(const ValueDescr* begin, size_t count);
+
+ARROW_EXPORT
+std::shared_ptr<DataType> CommonTimestamp(const std::vector<ValueDescr>& descrs);
+
+ARROW_EXPORT
+std::shared_ptr<DataType> CommonBinary(const std::vector<ValueDescr>& descrs);
+
} // namespace internal
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc
index 63d41392203..ed40a6b1b8c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -1,1379 +1,1379 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "arrow/buffer_builder.h"
-#include "arrow/compute/api_aggregate.h"
-#include "arrow/compute/api_vector.h"
-#include "arrow/compute/exec/key_compare.h"
-#include "arrow/compute/exec/key_encode.h"
-#include "arrow/compute/exec/key_hash.h"
-#include "arrow/compute/exec/key_map.h"
-#include "arrow/compute/exec/util.h"
-#include "arrow/compute/exec_internal.h"
-#include "arrow/compute/kernel.h"
-#include "arrow/compute/kernels/aggregate_internal.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/bitmap_writer.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/cpu_info.h"
-#include "arrow/util/make_unique.h"
-#include "arrow/visitor_inline.h"
-
-namespace arrow {
-
-using internal::checked_cast;
-using internal::FirstTimeBitmapWriter;
-
-namespace compute {
-namespace internal {
-namespace {
-
-struct KeyEncoder {
- // the first byte of an encoded key is used to indicate nullity
- static constexpr bool kExtraByteForNull = true;
-
- static constexpr uint8_t kNullByte = 1;
- static constexpr uint8_t kValidByte = 0;
-
- virtual ~KeyEncoder() = default;
-
- virtual void AddLength(const ArrayData&, int32_t* lengths) = 0;
-
- virtual Status Encode(const ArrayData&, uint8_t** encoded_bytes) = 0;
-
- virtual Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes,
- int32_t length, MemoryPool*) = 0;
-
- // extract the null bitmap from the leading nullity bytes of encoded keys
- static Status DecodeNulls(MemoryPool* pool, int32_t length, uint8_t** encoded_bytes,
- std::shared_ptr<Buffer>* null_bitmap, int32_t* null_count) {
- // first count nulls to determine if a null bitmap is necessary
- *null_count = 0;
- for (int32_t i = 0; i < length; ++i) {
- *null_count += (encoded_bytes[i][0] == kNullByte);
- }
-
- if (*null_count > 0) {
- ARROW_ASSIGN_OR_RAISE(*null_bitmap, AllocateBitmap(length, pool));
- uint8_t* validity = (*null_bitmap)->mutable_data();
-
- FirstTimeBitmapWriter writer(validity, 0, length);
- for (int32_t i = 0; i < length; ++i) {
- if (encoded_bytes[i][0] == kValidByte) {
- writer.Set();
- } else {
- writer.Clear();
- }
- writer.Next();
- encoded_bytes[i] += 1;
- }
- writer.Finish();
- } else {
- for (int32_t i = 0; i < length; ++i) {
- encoded_bytes[i] += 1;
- }
- }
-    return Status::OK();
- }
-};
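// Editorial sketch of the encoded-key layout shared by the encoders below.
// Every value is prefixed with one nullity byte (kValidByte / kNullByte);
// var-length values additionally carry an Offset-sized length prefix.
// Little-endian bytes shown; a row is the concatenation of all key columns.
//
//   int32 7, valid:      [0x00][07 00 00 00]
//   int32, null:         [0x01][00 00 00 00]        (payload zeroed)
//   "ab" (BinaryType):   [0x00][02 00 00 00][61 62]
//   "",  null (binary):  [0x01][00 00 00 00]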
-
-struct BooleanKeyEncoder : KeyEncoder {
- static constexpr int kByteWidth = 1;
-
- void AddLength(const ArrayData& data, int32_t* lengths) override {
- for (int64_t i = 0; i < data.length; ++i) {
- lengths[i] += kByteWidth + kExtraByteForNull;
- }
- }
-
- Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
- VisitArrayDataInline<BooleanType>(
- data,
- [&](bool value) {
- auto& encoded_ptr = *encoded_bytes++;
- *encoded_ptr++ = kValidByte;
- *encoded_ptr++ = value;
- },
- [&] {
- auto& encoded_ptr = *encoded_bytes++;
- *encoded_ptr++ = kNullByte;
- *encoded_ptr++ = 0;
- });
- return Status::OK();
- }
-
- Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
- MemoryPool* pool) override {
- std::shared_ptr<Buffer> null_buf;
- int32_t null_count;
- RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
-
- ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBitmap(length, pool));
-
- uint8_t* raw_output = key_buf->mutable_data();
- for (int32_t i = 0; i < length; ++i) {
- auto& encoded_ptr = encoded_bytes[i];
- BitUtil::SetBitTo(raw_output, i, encoded_ptr[0] != 0);
- encoded_ptr += 1;
- }
-
- return ArrayData::Make(boolean(), length, {std::move(null_buf), std::move(key_buf)},
- null_count);
- }
-};
-
-struct FixedWidthKeyEncoder : KeyEncoder {
- explicit FixedWidthKeyEncoder(std::shared_ptr<DataType> type)
- : type_(std::move(type)),
- byte_width_(checked_cast<const FixedWidthType&>(*type_).bit_width() / 8) {}
-
- void AddLength(const ArrayData& data, int32_t* lengths) override {
- for (int64_t i = 0; i < data.length; ++i) {
- lengths[i] += byte_width_ + kExtraByteForNull;
- }
- }
-
- Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
- ArrayData viewed(fixed_size_binary(byte_width_), data.length, data.buffers,
- data.null_count, data.offset);
-
- VisitArrayDataInline<FixedSizeBinaryType>(
- viewed,
- [&](util::string_view bytes) {
- auto& encoded_ptr = *encoded_bytes++;
- *encoded_ptr++ = kValidByte;
- memcpy(encoded_ptr, bytes.data(), byte_width_);
- encoded_ptr += byte_width_;
- },
- [&] {
- auto& encoded_ptr = *encoded_bytes++;
- *encoded_ptr++ = kNullByte;
- memset(encoded_ptr, 0, byte_width_);
- encoded_ptr += byte_width_;
- });
- return Status::OK();
- }
-
- Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
- MemoryPool* pool) override {
- std::shared_ptr<Buffer> null_buf;
- int32_t null_count;
- RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
-
- ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length * byte_width_, pool));
-
- uint8_t* raw_output = key_buf->mutable_data();
- for (int32_t i = 0; i < length; ++i) {
- auto& encoded_ptr = encoded_bytes[i];
- std::memcpy(raw_output, encoded_ptr, byte_width_);
- encoded_ptr += byte_width_;
- raw_output += byte_width_;
- }
-
- return ArrayData::Make(type_, length, {std::move(null_buf), std::move(key_buf)},
- null_count);
- }
-
- std::shared_ptr<DataType> type_;
- int byte_width_;
-};
-
-struct DictionaryKeyEncoder : FixedWidthKeyEncoder {
- DictionaryKeyEncoder(std::shared_ptr<DataType> type, MemoryPool* pool)
- : FixedWidthKeyEncoder(std::move(type)), pool_(pool) {}
-
- Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
- auto dict = MakeArray(data.dictionary);
- if (dictionary_) {
- if (!dictionary_->Equals(dict)) {
- // TODO(bkietz) unify if necessary. For now, just error if any batch's dictionary
- // differs from the first we saw for this key
- return Status::NotImplemented("Unifying differing dictionaries");
- }
- } else {
- dictionary_ = std::move(dict);
- }
- return FixedWidthKeyEncoder::Encode(data, encoded_bytes);
- }
-
- Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
- MemoryPool* pool) override {
- ARROW_ASSIGN_OR_RAISE(auto data,
- FixedWidthKeyEncoder::Decode(encoded_bytes, length, pool));
-
- if (dictionary_) {
- data->dictionary = dictionary_->data();
- } else {
- ARROW_ASSIGN_OR_RAISE(auto dict, MakeArrayOfNull(type_, 0));
- data->dictionary = dict->data();
- }
-
- data->type = type_;
- return data;
- }
-
- MemoryPool* pool_;
- std::shared_ptr<Array> dictionary_;
-};
-
-template <typename T>
-struct VarLengthKeyEncoder : KeyEncoder {
- using Offset = typename T::offset_type;
-
- void AddLength(const ArrayData& data, int32_t* lengths) override {
- int64_t i = 0;
- VisitArrayDataInline<T>(
- data,
- [&](util::string_view bytes) {
- lengths[i++] +=
- kExtraByteForNull + sizeof(Offset) + static_cast<int32_t>(bytes.size());
- },
- [&] { lengths[i++] += kExtraByteForNull + sizeof(Offset); });
- }
-
- Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
- VisitArrayDataInline<T>(
- data,
- [&](util::string_view bytes) {
- auto& encoded_ptr = *encoded_bytes++;
- *encoded_ptr++ = kValidByte;
- util::SafeStore(encoded_ptr, static_cast<Offset>(bytes.size()));
- encoded_ptr += sizeof(Offset);
- memcpy(encoded_ptr, bytes.data(), bytes.size());
- encoded_ptr += bytes.size();
- },
- [&] {
- auto& encoded_ptr = *encoded_bytes++;
- *encoded_ptr++ = kNullByte;
- util::SafeStore(encoded_ptr, static_cast<Offset>(0));
- encoded_ptr += sizeof(Offset);
- });
- return Status::OK();
- }
-
- Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
- MemoryPool* pool) override {
- std::shared_ptr<Buffer> null_buf;
- int32_t null_count;
- RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
-
- Offset length_sum = 0;
- for (int32_t i = 0; i < length; ++i) {
- length_sum += util::SafeLoadAs<Offset>(encoded_bytes[i]);
- }
-
- ARROW_ASSIGN_OR_RAISE(auto offset_buf,
- AllocateBuffer(sizeof(Offset) * (1 + length), pool));
- ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length_sum));
-
- auto raw_offsets = reinterpret_cast<Offset*>(offset_buf->mutable_data());
- auto raw_keys = key_buf->mutable_data();
-
- Offset current_offset = 0;
- for (int32_t i = 0; i < length; ++i) {
- raw_offsets[i] = current_offset;
-
- auto key_length = util::SafeLoadAs<Offset>(encoded_bytes[i]);
- encoded_bytes[i] += sizeof(Offset);
-
- memcpy(raw_keys + current_offset, encoded_bytes[i], key_length);
- encoded_bytes[i] += key_length;
-
- current_offset += key_length;
- }
- raw_offsets[length] = current_offset;
-
- return ArrayData::Make(
- type_, length, {std::move(null_buf), std::move(offset_buf), std::move(key_buf)},
- null_count);
- }
-
- explicit VarLengthKeyEncoder(std::shared_ptr<DataType> type) : type_(std::move(type)) {}
-
- std::shared_ptr<DataType> type_;
-};
-
-struct GrouperImpl : Grouper {
- static Result<std::unique_ptr<GrouperImpl>> Make(const std::vector<ValueDescr>& keys,
- ExecContext* ctx) {
- auto impl = ::arrow::internal::make_unique<GrouperImpl>();
-
- impl->encoders_.resize(keys.size());
- impl->ctx_ = ctx;
-
- for (size_t i = 0; i < keys.size(); ++i) {
- const auto& key = keys[i].type;
-
- if (key->id() == Type::BOOL) {
- impl->encoders_[i] = ::arrow::internal::make_unique<BooleanKeyEncoder>();
- continue;
- }
-
- if (key->id() == Type::DICTIONARY) {
- impl->encoders_[i] =
- ::arrow::internal::make_unique<DictionaryKeyEncoder>(key, ctx->memory_pool());
- continue;
- }
-
- if (is_fixed_width(key->id())) {
- impl->encoders_[i] = ::arrow::internal::make_unique<FixedWidthKeyEncoder>(key);
- continue;
- }
-
- if (is_binary_like(key->id())) {
- impl->encoders_[i] =
- ::arrow::internal::make_unique<VarLengthKeyEncoder<BinaryType>>(key);
- continue;
- }
-
- if (is_large_binary_like(key->id())) {
- impl->encoders_[i] =
- ::arrow::internal::make_unique<VarLengthKeyEncoder<LargeBinaryType>>(key);
- continue;
- }
-
- return Status::NotImplemented("Keys of type ", *key);
- }
-
- return std::move(impl);
- }
-
- Result<Datum> Consume(const ExecBatch& batch) override {
- std::vector<int32_t> offsets_batch(batch.length + 1);
- for (int i = 0; i < batch.num_values(); ++i) {
- encoders_[i]->AddLength(*batch[i].array(), offsets_batch.data());
- }
-
- int32_t total_length = 0;
- for (int64_t i = 0; i < batch.length; ++i) {
- auto total_length_before = total_length;
- total_length += offsets_batch[i];
- offsets_batch[i] = total_length_before;
- }
- offsets_batch[batch.length] = total_length;
-
- std::vector<uint8_t> key_bytes_batch(total_length);
- std::vector<uint8_t*> key_buf_ptrs(batch.length);
- for (int64_t i = 0; i < batch.length; ++i) {
- key_buf_ptrs[i] = key_bytes_batch.data() + offsets_batch[i];
- }
-
- for (int i = 0; i < batch.num_values(); ++i) {
- RETURN_NOT_OK(encoders_[i]->Encode(*batch[i].array(), key_buf_ptrs.data()));
- }
-
- TypedBufferBuilder<uint32_t> group_ids_batch(ctx_->memory_pool());
- RETURN_NOT_OK(group_ids_batch.Resize(batch.length));
-
- for (int64_t i = 0; i < batch.length; ++i) {
- int32_t key_length = offsets_batch[i + 1] - offsets_batch[i];
- std::string key(
- reinterpret_cast<const char*>(key_bytes_batch.data() + offsets_batch[i]),
- key_length);
-
- auto it_success = map_.emplace(key, num_groups_);
- auto group_id = it_success.first->second;
-
- if (it_success.second) {
- // new key; update offsets and key_bytes
- ++num_groups_;
- auto next_key_offset = static_cast<int32_t>(key_bytes_.size());
- key_bytes_.resize(next_key_offset + key_length);
- offsets_.push_back(next_key_offset + key_length);
- memcpy(key_bytes_.data() + next_key_offset, key.c_str(), key_length);
- }
-
- group_ids_batch.UnsafeAppend(group_id);
- }
-
- ARROW_ASSIGN_OR_RAISE(auto group_ids, group_ids_batch.Finish());
- return Datum(UInt32Array(batch.length, std::move(group_ids)));
- }
-
- uint32_t num_groups() const override { return num_groups_; }
-
- Result<ExecBatch> GetUniques() override {
- ExecBatch out({}, num_groups_);
-
- std::vector<uint8_t*> key_buf_ptrs(num_groups_);
- for (int64_t i = 0; i < num_groups_; ++i) {
- key_buf_ptrs[i] = key_bytes_.data() + offsets_[i];
- }
-
- out.values.resize(encoders_.size());
- for (size_t i = 0; i < encoders_.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(
- out.values[i],
- encoders_[i]->Decode(key_buf_ptrs.data(), static_cast<int32_t>(num_groups_),
- ctx_->memory_pool()));
- }
-
- return out;
- }
-
- ExecContext* ctx_;
- std::unordered_map<std::string, uint32_t> map_;
- std::vector<int32_t> offsets_ = {0};
- std::vector<uint8_t> key_bytes_;
- uint32_t num_groups_ = 0;
- std::vector<std::unique_ptr<KeyEncoder>> encoders_;
-};
-
-struct GrouperFastImpl : Grouper {
- static constexpr int kBitmapPaddingForSIMD = 64; // bits
- static constexpr int kPaddingForSIMD = 32; // bytes
-
- static bool CanUse(const std::vector<ValueDescr>& keys) {
-#if ARROW_LITTLE_ENDIAN
- for (size_t i = 0; i < keys.size(); ++i) {
- const auto& key = keys[i].type;
- if (is_large_binary_like(key->id())) {
- return false;
- }
- }
- return true;
-#else
- return false;
-#endif
- }
-
- static Result<std::unique_ptr<GrouperFastImpl>> Make(
- const std::vector<ValueDescr>& keys, ExecContext* ctx) {
- auto impl = ::arrow::internal::make_unique<GrouperFastImpl>();
- impl->ctx_ = ctx;
-
- RETURN_NOT_OK(impl->temp_stack_.Init(ctx->memory_pool(), 64 * minibatch_size_max_));
- impl->encode_ctx_.hardware_flags =
- arrow::internal::CpuInfo::GetInstance()->hardware_flags();
- impl->encode_ctx_.stack = &impl->temp_stack_;
-
- auto num_columns = keys.size();
- impl->col_metadata_.resize(num_columns);
- impl->key_types_.resize(num_columns);
- impl->dictionaries_.resize(num_columns);
- for (size_t icol = 0; icol < num_columns; ++icol) {
- const auto& key = keys[icol].type;
- if (key->id() == Type::DICTIONARY) {
- auto bit_width = checked_cast<const FixedWidthType&>(*key).bit_width();
- ARROW_DCHECK(bit_width % 8 == 0);
- impl->col_metadata_[icol] =
- arrow::compute::KeyEncoder::KeyColumnMetadata(true, bit_width / 8);
- } else if (key->id() == Type::BOOL) {
- impl->col_metadata_[icol] =
- arrow::compute::KeyEncoder::KeyColumnMetadata(true, 0);
- } else if (is_fixed_width(key->id())) {
- impl->col_metadata_[icol] = arrow::compute::KeyEncoder::KeyColumnMetadata(
- true, checked_cast<const FixedWidthType&>(*key).bit_width() / 8);
- } else if (is_binary_like(key->id())) {
- impl->col_metadata_[icol] =
- arrow::compute::KeyEncoder::KeyColumnMetadata(false, sizeof(uint32_t));
- } else {
- return Status::NotImplemented("Keys of type ", *key);
- }
- impl->key_types_[icol] = key;
- }
-
- impl->encoder_.Init(impl->col_metadata_, &impl->encode_ctx_,
- /* row_alignment = */ sizeof(uint64_t),
- /* string_alignment = */ sizeof(uint64_t));
- RETURN_NOT_OK(impl->rows_.Init(ctx->memory_pool(), impl->encoder_.row_metadata()));
- RETURN_NOT_OK(
- impl->rows_minibatch_.Init(ctx->memory_pool(), impl->encoder_.row_metadata()));
- impl->minibatch_size_ = impl->minibatch_size_min_;
- GrouperFastImpl* impl_ptr = impl.get();
- auto equal_func = [impl_ptr](
- int num_keys_to_compare, const uint16_t* selection_may_be_null,
- const uint32_t* group_ids, uint32_t* out_num_keys_mismatch,
- uint16_t* out_selection_mismatch) {
- arrow::compute::KeyCompare::CompareRows(
- num_keys_to_compare, selection_may_be_null, group_ids, &impl_ptr->encode_ctx_,
- out_num_keys_mismatch, out_selection_mismatch, impl_ptr->rows_minibatch_,
- impl_ptr->rows_);
- };
- auto append_func = [impl_ptr](int num_keys, const uint16_t* selection) {
- return impl_ptr->rows_.AppendSelectionFrom(impl_ptr->rows_minibatch_, num_keys,
- selection);
- };
- RETURN_NOT_OK(impl->map_.init(impl->encode_ctx_.hardware_flags, ctx->memory_pool(),
- impl->encode_ctx_.stack, impl->log_minibatch_max_,
- equal_func, append_func));
- impl->cols_.resize(num_columns);
- impl->minibatch_hashes_.resize(impl->minibatch_size_max_ +
- kPaddingForSIMD / sizeof(uint32_t));
-
- return std::move(impl);
- }
-
- ~GrouperFastImpl() { map_.cleanup(); }
-
- Result<Datum> Consume(const ExecBatch& batch) override {
- int64_t num_rows = batch.length;
- int num_columns = batch.num_values();
-
- // Process dictionaries
- for (int icol = 0; icol < num_columns; ++icol) {
- if (key_types_[icol]->id() == Type::DICTIONARY) {
- auto data = batch[icol].array();
- auto dict = MakeArray(data->dictionary);
- if (dictionaries_[icol]) {
- if (!dictionaries_[icol]->Equals(dict)) {
- // TODO(bkietz) unify if necessary. For now, just error if any batch's
- // dictionary differs from the first we saw for this key
- return Status::NotImplemented("Unifying differing dictionaries");
- }
- } else {
- dictionaries_[icol] = std::move(dict);
- }
- }
- }
-
- std::shared_ptr<arrow::Buffer> group_ids;
- ARROW_ASSIGN_OR_RAISE(
- group_ids, AllocateBuffer(sizeof(uint32_t) * num_rows, ctx_->memory_pool()));
-
- for (int icol = 0; icol < num_columns; ++icol) {
- const uint8_t* non_nulls = nullptr;
- if (batch[icol].array()->buffers[0] != NULLPTR) {
- non_nulls = batch[icol].array()->buffers[0]->data();
- }
- const uint8_t* fixedlen = batch[icol].array()->buffers[1]->data();
- const uint8_t* varlen = nullptr;
- if (!col_metadata_[icol].is_fixed_length) {
- varlen = batch[icol].array()->buffers[2]->data();
- }
-
- int64_t offset = batch[icol].array()->offset;
-
- auto col_base = arrow::compute::KeyEncoder::KeyColumnArray(
- col_metadata_[icol], offset + num_rows, non_nulls, fixedlen, varlen);
-
- cols_[icol] =
- arrow::compute::KeyEncoder::KeyColumnArray(col_base, offset, num_rows);
- }
-
- // Split into smaller mini-batches
- for (uint32_t start_row = 0; start_row < num_rows;) {
- uint32_t batch_size_next = std::min(static_cast<uint32_t>(minibatch_size_),
- static_cast<uint32_t>(num_rows) - start_row);
-
- // Encode
- rows_minibatch_.Clean();
- RETURN_NOT_OK(encoder_.PrepareOutputForEncode(start_row, batch_size_next,
- &rows_minibatch_, cols_));
- encoder_.Encode(start_row, batch_size_next, &rows_minibatch_, cols_);
-
- // Compute hash
- if (encoder_.row_metadata().is_fixed_length) {
- Hashing::hash_fixed(encode_ctx_.hardware_flags, batch_size_next,
- encoder_.row_metadata().fixed_length, rows_minibatch_.data(1),
- minibatch_hashes_.data());
- } else {
- auto hash_temp_buf =
- util::TempVectorHolder<uint32_t>(&temp_stack_, 4 * batch_size_next);
- Hashing::hash_varlen(encode_ctx_.hardware_flags, batch_size_next,
- rows_minibatch_.offsets(), rows_minibatch_.data(2),
- hash_temp_buf.mutable_data(), minibatch_hashes_.data());
- }
-
- // Map
- RETURN_NOT_OK(
- map_.map(batch_size_next, minibatch_hashes_.data(),
- reinterpret_cast<uint32_t*>(group_ids->mutable_data()) + start_row));
-
- start_row += batch_size_next;
-
- if (minibatch_size_ * 2 <= minibatch_size_max_) {
- minibatch_size_ *= 2;
- }
- }
-
- return Datum(UInt32Array(batch.length, std::move(group_ids)));
- }
-
- uint32_t num_groups() const override { return static_cast<uint32_t>(rows_.length()); }
-
- // Make sure padded buffers end up with the right logical size
-
- Result<std::shared_ptr<Buffer>> AllocatePaddedBitmap(int64_t length) {
- ARROW_ASSIGN_OR_RAISE(
- std::shared_ptr<Buffer> buf,
- AllocateBitmap(length + kBitmapPaddingForSIMD, ctx_->memory_pool()));
- return SliceMutableBuffer(buf, 0, BitUtil::BytesForBits(length));
- }
-
- Result<std::shared_ptr<Buffer>> AllocatePaddedBuffer(int64_t size) {
- ARROW_ASSIGN_OR_RAISE(
- std::shared_ptr<Buffer> buf,
- AllocateBuffer(size + kBitmapPaddingForSIMD, ctx_->memory_pool()));
- return SliceMutableBuffer(buf, 0, size);
- }
-
- Result<ExecBatch> GetUniques() override {
- auto num_columns = static_cast<uint32_t>(col_metadata_.size());
- int64_t num_groups = rows_.length();
-
- std::vector<std::shared_ptr<Buffer>> non_null_bufs(num_columns);
- std::vector<std::shared_ptr<Buffer>> fixedlen_bufs(num_columns);
- std::vector<std::shared_ptr<Buffer>> varlen_bufs(num_columns);
-
- for (size_t i = 0; i < num_columns; ++i) {
- ARROW_ASSIGN_OR_RAISE(non_null_bufs[i], AllocatePaddedBitmap(num_groups));
- if (col_metadata_[i].is_fixed_length) {
- if (col_metadata_[i].fixed_length == 0) {
- ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i], AllocatePaddedBitmap(num_groups));
- } else {
- ARROW_ASSIGN_OR_RAISE(
- fixedlen_bufs[i],
- AllocatePaddedBuffer(num_groups * col_metadata_[i].fixed_length));
- }
- } else {
- ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i],
- AllocatePaddedBuffer((num_groups + 1) * sizeof(uint32_t)));
- }
- cols_[i] = arrow::compute::KeyEncoder::KeyColumnArray(
- col_metadata_[i], num_groups, non_null_bufs[i]->mutable_data(),
- fixedlen_bufs[i]->mutable_data(), nullptr);
- }
-
- for (int64_t start_row = 0; start_row < num_groups;) {
- int64_t batch_size_next =
- std::min(num_groups - start_row, static_cast<int64_t>(minibatch_size_max_));
- encoder_.DecodeFixedLengthBuffers(start_row, start_row, batch_size_next, rows_,
- &cols_);
- start_row += batch_size_next;
- }
-
- if (!rows_.metadata().is_fixed_length) {
- for (size_t i = 0; i < num_columns; ++i) {
- if (!col_metadata_[i].is_fixed_length) {
- auto varlen_size =
- reinterpret_cast<const uint32_t*>(fixedlen_bufs[i]->data())[num_groups];
- ARROW_ASSIGN_OR_RAISE(varlen_bufs[i], AllocatePaddedBuffer(varlen_size));
- cols_[i] = arrow::compute::KeyEncoder::KeyColumnArray(
- col_metadata_[i], num_groups, non_null_bufs[i]->mutable_data(),
- fixedlen_bufs[i]->mutable_data(), varlen_bufs[i]->mutable_data());
- }
- }
-
- for (int64_t start_row = 0; start_row < num_groups;) {
- int64_t batch_size_next =
- std::min(num_groups - start_row, static_cast<int64_t>(minibatch_size_max_));
- encoder_.DecodeVaryingLengthBuffers(start_row, start_row, batch_size_next, rows_,
- &cols_);
- start_row += batch_size_next;
- }
- }
-
- ExecBatch out({}, num_groups);
- out.values.resize(num_columns);
- for (size_t i = 0; i < num_columns; ++i) {
- auto valid_count = arrow::internal::CountSetBits(
- non_null_bufs[i]->data(), /*offset=*/0, static_cast<int64_t>(num_groups));
- int null_count = static_cast<int>(num_groups) - static_cast<int>(valid_count);
-
- if (col_metadata_[i].is_fixed_length) {
- out.values[i] = ArrayData::Make(
- key_types_[i], num_groups,
- {std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i])}, null_count);
- } else {
- out.values[i] =
- ArrayData::Make(key_types_[i], num_groups,
- {std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i]),
- std::move(varlen_bufs[i])},
- null_count);
- }
- }
-
- // Process dictionaries
- for (size_t icol = 0; icol < num_columns; ++icol) {
- if (key_types_[icol]->id() == Type::DICTIONARY) {
- if (dictionaries_[icol]) {
- out.values[icol].array()->dictionary = dictionaries_[icol]->data();
- } else {
- ARROW_ASSIGN_OR_RAISE(auto dict, MakeArrayOfNull(key_types_[icol], 0));
- out.values[icol].array()->dictionary = dict->data();
- }
- }
- }
-
- return out;
- }
-
- static constexpr int log_minibatch_max_ = 10;
- static constexpr int minibatch_size_max_ = 1 << log_minibatch_max_;
- static constexpr int minibatch_size_min_ = 128;
- int minibatch_size_;
-
- ExecContext* ctx_;
- arrow::util::TempVectorStack temp_stack_;
- arrow::compute::KeyEncoder::KeyEncoderContext encode_ctx_;
-
- std::vector<std::shared_ptr<arrow::DataType>> key_types_;
- std::vector<arrow::compute::KeyEncoder::KeyColumnMetadata> col_metadata_;
- std::vector<arrow::compute::KeyEncoder::KeyColumnArray> cols_;
- std::vector<uint32_t> minibatch_hashes_;
-
- std::vector<std::shared_ptr<Array>> dictionaries_;
-
- arrow::compute::KeyEncoder::KeyRowArray rows_;
- arrow::compute::KeyEncoder::KeyRowArray rows_minibatch_;
- arrow::compute::KeyEncoder encoder_;
- arrow::compute::SwissTable map_;
-};
-
-/// C++ abstract base class for the HashAggregateKernel interface.
-/// Implementations should be default constructible and perform initialization in
-/// Init().
-struct GroupedAggregator : KernelState {
- virtual Status Init(ExecContext*, const FunctionOptions*,
- const std::shared_ptr<DataType>&) = 0;
-
- virtual Status Consume(const ExecBatch& batch) = 0;
-
- virtual Result<Datum> Finalize() = 0;
-
- template <typename Reserve>
- Status MaybeReserve(int64_t old_num_groups, const ExecBatch& batch,
- const Reserve& reserve) {
- int64_t new_num_groups = batch[2].scalar_as<UInt32Scalar>().value;
- if (new_num_groups <= old_num_groups) {
- return Status::OK();
- }
- return reserve(new_num_groups - old_num_groups);
- }
-
- virtual std::shared_ptr<DataType> out_type() const = 0;
-};
-
-// ----------------------------------------------------------------------
-// Count implementation
-
-struct GroupedCountImpl : public GroupedAggregator {
- Status Init(ExecContext* ctx, const FunctionOptions* options,
- const std::shared_ptr<DataType>&) override {
- options_ = checked_cast<const ScalarAggregateOptions&>(*options);
- counts_ = BufferBuilder(ctx->memory_pool());
- return Status::OK();
- }
-
- Status Consume(const ExecBatch& batch) override {
- RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
- num_groups_ += added_groups;
- return counts_.Append(added_groups * sizeof(int64_t), 0);
- }));
-
- auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
- auto raw_counts = reinterpret_cast<int64_t*>(counts_.mutable_data());
-
- const auto& input = batch[0].array();
-
- if (!options_.skip_nulls) {
- if (input->GetNullCount() != 0) {
- for (int64_t i = 0, input_i = input->offset; i < input->length; ++i, ++input_i) {
- auto g = group_ids[i];
- raw_counts[g] += !BitUtil::GetBit(input->buffers[0]->data(), input_i);
- }
- }
- return Status::OK();
- }
-
- arrow::internal::VisitSetBitRunsVoid(
- input->buffers[0], input->offset, input->length,
- [&](int64_t begin, int64_t length) {
- for (int64_t input_i = begin, i = begin - input->offset;
- input_i < begin + length; ++input_i, ++i) {
- auto g = group_ids[i];
- raw_counts[g] += 1;
- }
- });
- return Status::OK();
- }
-
- Result<Datum> Finalize() override {
- ARROW_ASSIGN_OR_RAISE(auto counts, counts_.Finish());
- return std::make_shared<Int64Array>(num_groups_, std::move(counts));
- }
-
- std::shared_ptr<DataType> out_type() const override { return int64(); }
-
- int64_t num_groups_ = 0;
- ScalarAggregateOptions options_;
- BufferBuilder counts_;
-};
-
-// ----------------------------------------------------------------------
-// Sum implementation
-
-struct GroupedSumImpl : public GroupedAggregator {
- // NB: whether we are accumulating into double, int64_t, or uint64_t
- // we always have 64 bits per group in the sums buffer.
- static constexpr size_t kSumSize = sizeof(int64_t);
-
- using ConsumeImpl = std::function<void(const std::shared_ptr<ArrayData>&,
- const uint32_t*, void*, int64_t*)>;
-
- struct GetConsumeImpl {
- template <typename T, typename AccType = typename FindAccumulatorType<T>::Type>
- Status Visit(const T&) {
- consume_impl = [](const std::shared_ptr<ArrayData>& input, const uint32_t* group,
- void* boxed_sums, int64_t* counts) {
- auto sums = reinterpret_cast<typename TypeTraits<AccType>::CType*>(boxed_sums);
-
- VisitArrayDataInline<T>(
- *input,
- [&](typename TypeTraits<T>::CType value) {
- sums[*group] += value;
- counts[*group] += 1;
- ++group;
- },
- [&] { ++group; });
- };
- out_type = TypeTraits<AccType>::type_singleton();
- return Status::OK();
- }
-
- Status Visit(const HalfFloatType& type) {
- return Status::NotImplemented("Summing data of type ", type);
- }
-
- Status Visit(const DataType& type) {
- return Status::NotImplemented("Summing data of type ", type);
- }
-
- ConsumeImpl consume_impl;
- std::shared_ptr<DataType> out_type;
- };
-
- Status Init(ExecContext* ctx, const FunctionOptions*,
- const std::shared_ptr<DataType>& input_type) override {
- pool_ = ctx->memory_pool();
- sums_ = BufferBuilder(pool_);
- counts_ = BufferBuilder(pool_);
-
- GetConsumeImpl get_consume_impl;
- RETURN_NOT_OK(VisitTypeInline(*input_type, &get_consume_impl));
-
- consume_impl_ = std::move(get_consume_impl.consume_impl);
- out_type_ = std::move(get_consume_impl.out_type);
-
- return Status::OK();
- }
-
- Status Consume(const ExecBatch& batch) override {
- RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
- num_groups_ += added_groups;
- RETURN_NOT_OK(sums_.Append(added_groups * kSumSize, 0));
- RETURN_NOT_OK(counts_.Append(added_groups * sizeof(int64_t), 0));
- return Status::OK();
- }));
-
- auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
- consume_impl_(batch[0].array(), group_ids, sums_.mutable_data(),
- reinterpret_cast<int64_t*>(counts_.mutable_data()));
- return Status::OK();
- }
-
- Result<Datum> Finalize() override {
- std::shared_ptr<Buffer> null_bitmap;
- int64_t null_count = 0;
-
- for (int64_t i = 0; i < num_groups_; ++i) {
- if (reinterpret_cast<const int64_t*>(counts_.data())[i] > 0) continue;
-
- if (null_bitmap == nullptr) {
- ARROW_ASSIGN_OR_RAISE(null_bitmap, AllocateBitmap(num_groups_, pool_));
- BitUtil::SetBitsTo(null_bitmap->mutable_data(), 0, num_groups_, true);
- }
-
- null_count += 1;
- BitUtil::SetBitTo(null_bitmap->mutable_data(), i, false);
- }
-
- ARROW_ASSIGN_OR_RAISE(auto sums, sums_.Finish());
-
- return ArrayData::Make(std::move(out_type_), num_groups_,
- {std::move(null_bitmap), std::move(sums)}, null_count);
- }
-
- std::shared_ptr<DataType> out_type() const override { return out_type_; }
-
- // NB: counts are used here instead of a simple "has_values_" bitmap since
- // we expect to reuse this kernel to handle Mean
- int64_t num_groups_ = 0;
- BufferBuilder sums_, counts_;
- std::shared_ptr<DataType> out_type_;
- ConsumeImpl consume_impl_;
- MemoryPool* pool_;
-};
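-
-// Note: Finalize marks a group null only if its count is still zero, i.e. no
-// non-null value was ever consumed for it; a group that saw only nulls thus
-// sums to null rather than to 0.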
-
-// ----------------------------------------------------------------------
-// MinMax implementation
-
-template <typename CType>
-struct Extrema : std::numeric_limits<CType> {};
-
-template <>
-struct Extrema<float> {
- static constexpr float min() { return -std::numeric_limits<float>::infinity(); }
- static constexpr float max() { return std::numeric_limits<float>::infinity(); }
-};
-
-template <>
-struct Extrema<double> {
- static constexpr double min() { return -std::numeric_limits<double>::infinity(); }
- static constexpr double max() { return std::numeric_limits<double>::infinity(); }
-};
-
-struct GroupedMinMaxImpl : public GroupedAggregator {
- using ConsumeImpl =
- std::function<void(const std::shared_ptr<ArrayData>&, const uint32_t*, void*, void*,
- uint8_t*, uint8_t*)>;
-
- using ResizeImpl = std::function<Status(BufferBuilder*, int64_t)>;
-
- template <typename CType>
- static ResizeImpl MakeResizeImpl(CType anti_extreme) {
- // resize a min or max buffer, storing the correct anti-extreme
- return [anti_extreme](BufferBuilder* builder, int64_t added_groups) {
- TypedBufferBuilder<CType> typed_builder(std::move(*builder));
- RETURN_NOT_OK(typed_builder.Append(added_groups, anti_extreme));
- *builder = std::move(*typed_builder.bytes_builder());
- return Status::OK();
- };
- }
-
- struct GetImpl {
- template <typename T, typename CType = typename TypeTraits<T>::CType>
- enable_if_number<T, Status> Visit(const T&) {
- consume_impl = [](const std::shared_ptr<ArrayData>& input, const uint32_t* group,
- void* mins, void* maxes, uint8_t* has_values,
- uint8_t* has_nulls) {
- auto raw_mins = reinterpret_cast<CType*>(mins);
- auto raw_maxes = reinterpret_cast<CType*>(maxes);
-
- VisitArrayDataInline<T>(
- *input,
- [&](CType val) {
- raw_maxes[*group] = std::max(raw_maxes[*group], val);
- raw_mins[*group] = std::min(raw_mins[*group], val);
- BitUtil::SetBit(has_values, *group++);
- },
- [&] { BitUtil::SetBit(has_nulls, *group++); });
- };
-
- resize_min_impl = MakeResizeImpl(Extrema<CType>::max());
- resize_max_impl = MakeResizeImpl(Extrema<CType>::min());
- return Status::OK();
- }
-
- Status Visit(const BooleanType& type) {
- return Status::NotImplemented("Grouped MinMax data of type ", type);
- }
-
- Status Visit(const HalfFloatType& type) {
- return Status::NotImplemented("Grouped MinMax data of type ", type);
- }
-
- Status Visit(const DataType& type) {
- return Status::NotImplemented("Grouped MinMax data of type ", type);
- }
-
- ConsumeImpl consume_impl;
- ResizeImpl resize_min_impl, resize_max_impl;
- };
-
- Status Init(ExecContext* ctx, const FunctionOptions* options,
- const std::shared_ptr<DataType>& input_type) override {
- options_ = *checked_cast<const ScalarAggregateOptions*>(options);
- type_ = input_type;
-
- mins_ = BufferBuilder(ctx->memory_pool());
- maxes_ = BufferBuilder(ctx->memory_pool());
- has_values_ = TypedBufferBuilder<bool>(ctx->memory_pool());
- has_nulls_ = TypedBufferBuilder<bool>(ctx->memory_pool());
-
- GetImpl get_impl;
- RETURN_NOT_OK(VisitTypeInline(*input_type, &get_impl));
-
- consume_impl_ = std::move(get_impl.consume_impl);
- resize_min_impl_ = std::move(get_impl.resize_min_impl);
- resize_max_impl_ = std::move(get_impl.resize_max_impl);
-
- return Status::OK();
- }
-
- Status Consume(const ExecBatch& batch) override {
- RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
- num_groups_ += added_groups;
- RETURN_NOT_OK(resize_min_impl_(&mins_, added_groups));
- RETURN_NOT_OK(resize_max_impl_(&maxes_, added_groups));
- RETURN_NOT_OK(has_values_.Append(added_groups, false));
- RETURN_NOT_OK(has_nulls_.Append(added_groups, false));
- return Status::OK();
- }));
-
- auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
- consume_impl_(batch[0].array(), group_ids, mins_.mutable_data(),
- maxes_.mutable_data(), has_values_.mutable_data(),
- has_nulls_.mutable_data());
- return Status::OK();
- }
-
- Result<Datum> Finalize() override {
- // a group's aggregate is valid if at least one value was consumed for that group
- ARROW_ASSIGN_OR_RAISE(auto null_bitmap, has_values_.Finish());
-
- if (!options_.skip_nulls) {
- // ... and there were no nulls in that group
- ARROW_ASSIGN_OR_RAISE(auto has_nulls, has_nulls_.Finish());
- arrow::internal::BitmapAndNot(null_bitmap->data(), 0, has_nulls->data(), 0,
- num_groups_, 0, null_bitmap->mutable_data());
- }
-
- auto mins = ArrayData::Make(type_, num_groups_, {null_bitmap, nullptr});
- auto maxes = ArrayData::Make(type_, num_groups_, {std::move(null_bitmap), nullptr});
- ARROW_ASSIGN_OR_RAISE(mins->buffers[1], mins_.Finish());
- ARROW_ASSIGN_OR_RAISE(maxes->buffers[1], maxes_.Finish());
-
- return ArrayData::Make(out_type(), num_groups_, {nullptr},
- {std::move(mins), std::move(maxes)});
- }
-
- std::shared_ptr<DataType> out_type() const override {
- return struct_({field("min", type_), field("max", type_)});
- }
-
- int64_t num_groups_;
- BufferBuilder mins_, maxes_;
- TypedBufferBuilder<bool> has_values_, has_nulls_;
- std::shared_ptr<DataType> type_;
- ConsumeImpl consume_impl_;
- ResizeImpl resize_min_impl_, resize_max_impl_;
- ScalarAggregateOptions options_;
-};
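-
-// Note: new min/max slots are seeded with their anti-extreme (numeric max for
-// mins_, numeric min for maxes_, and +/-infinity for floating point), so the
-// first value consumed for a group always replaces the seed. E.g. a fresh
-// int32 min slot starts at 2147483647 and a fresh max slot at -2147483648.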
-
-template <typename Impl>
-HashAggregateKernel MakeKernel(InputType argument_type) {
- HashAggregateKernel kernel;
-
- kernel.init = [](KernelContext* ctx,
- const KernelInitArgs& args) -> Result<std::unique_ptr<KernelState>> {
- auto impl = ::arrow::internal::make_unique<Impl>();
- // FIXME(bkietz) Init should not take a type. That should be an unboxed template arg
- // for the Impl. Otherwise we're not exposing dispatch as well as we should.
- RETURN_NOT_OK(impl->Init(ctx->exec_context(), args.options, args.inputs[0].type));
- return std::move(impl);
- };
-
- kernel.signature = KernelSignature::Make(
- {std::move(argument_type), InputType::Array(Type::UINT32),
- InputType::Scalar(Type::UINT32)},
- OutputType(
- [](KernelContext* ctx, const std::vector<ValueDescr>&) -> Result<ValueDescr> {
- return checked_cast<GroupedAggregator*>(ctx->state())->out_type();
- }));
-
- kernel.consume = [](KernelContext* ctx, const ExecBatch& batch) {
- return checked_cast<GroupedAggregator*>(ctx->state())->Consume(batch);
- };
-
- kernel.merge = [](KernelContext* ctx, KernelState&&, KernelState*) {
- // TODO(ARROW-11840) merge two hash tables
- return Status::NotImplemented("Merge hashed aggregations");
- };
-
- kernel.finalize = [](KernelContext* ctx, Datum* out) {
- ARROW_ASSIGN_OR_RAISE(*out,
- checked_cast<GroupedAggregator*>(ctx->state())->Finalize());
- return Status::OK();
- };
-
- return kernel;
-}
-
-Result<std::vector<const HashAggregateKernel*>> GetKernels(
- ExecContext* ctx, const std::vector<Aggregate>& aggregates,
- const std::vector<ValueDescr>& in_descrs) {
- if (aggregates.size() != in_descrs.size()) {
- return Status::Invalid(aggregates.size(), " aggregate functions were specified but ",
- in_descrs.size(), " arguments were provided.");
- }
-
- std::vector<const HashAggregateKernel*> kernels(in_descrs.size());
-
- for (size_t i = 0; i < aggregates.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(auto function,
- ctx->func_registry()->GetFunction(aggregates[i].function));
- ARROW_ASSIGN_OR_RAISE(
- const Kernel* kernel,
- function->DispatchExact(
- {in_descrs[i], ValueDescr::Array(uint32()), ValueDescr::Scalar(uint32())}));
- kernels[i] = static_cast<const HashAggregateKernel*>(kernel);
- }
- return kernels;
-}
-
-Result<std::vector<std::unique_ptr<KernelState>>> InitKernels(
- const std::vector<const HashAggregateKernel*>& kernels, ExecContext* ctx,
- const std::vector<Aggregate>& aggregates, const std::vector<ValueDescr>& in_descrs) {
- std::vector<std::unique_ptr<KernelState>> states(kernels.size());
-
- for (size_t i = 0; i < aggregates.size(); ++i) {
- auto options = aggregates[i].options;
-
- if (options == nullptr) {
- // use known default options for the named function if possible
- auto maybe_function = ctx->func_registry()->GetFunction(aggregates[i].function);
- if (maybe_function.ok()) {
- options = maybe_function.ValueOrDie()->default_options();
- }
- }
-
- KernelContext kernel_ctx{ctx};
- ARROW_ASSIGN_OR_RAISE(
- states[i], kernels[i]->init(&kernel_ctx, KernelInitArgs{kernels[i],
- {
- in_descrs[i].type,
- uint32(),
- uint32(),
- },
- options}));
- }
-
- return std::move(states);
-}
-
-Result<FieldVector> ResolveKernels(
- const std::vector<Aggregate>& aggregates,
- const std::vector<const HashAggregateKernel*>& kernels,
- const std::vector<std::unique_ptr<KernelState>>& states, ExecContext* ctx,
- const std::vector<ValueDescr>& descrs) {
- FieldVector fields(descrs.size());
-
- for (size_t i = 0; i < kernels.size(); ++i) {
- KernelContext kernel_ctx{ctx};
- kernel_ctx.SetState(states[i].get());
-
- ARROW_ASSIGN_OR_RAISE(auto descr, kernels[i]->signature->out_type().Resolve(
- &kernel_ctx, {
- descrs[i].type,
- uint32(),
- uint32(),
- }));
- fields[i] = field(aggregates[i].function, std::move(descr.type));
- }
- return fields;
-}
-
-} // namespace
-
-Result<std::unique_ptr<Grouper>> Grouper::Make(const std::vector<ValueDescr>& descrs,
- ExecContext* ctx) {
- if (GrouperFastImpl::CanUse(descrs)) {
- return GrouperFastImpl::Make(descrs, ctx);
- }
- return GrouperImpl::Make(descrs, ctx);
-}
-
-Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
- const std::vector<Aggregate>& aggregates, ExecContext* ctx) {
- // Construct and initialize HashAggregateKernels
- ARROW_ASSIGN_OR_RAISE(auto argument_descrs,
- ExecBatch::Make(arguments).Map(
- [](ExecBatch batch) { return batch.GetDescriptors(); }));
-
- ARROW_ASSIGN_OR_RAISE(auto kernels, GetKernels(ctx, aggregates, argument_descrs));
-
- ARROW_ASSIGN_OR_RAISE(auto states,
- InitKernels(kernels, ctx, aggregates, argument_descrs));
-
- ARROW_ASSIGN_OR_RAISE(
- FieldVector out_fields,
- ResolveKernels(aggregates, kernels, states, ctx, argument_descrs));
-
- using arrow::compute::detail::ExecBatchIterator;
-
- ARROW_ASSIGN_OR_RAISE(auto argument_batch_iterator,
- ExecBatchIterator::Make(arguments, ctx->exec_chunksize()));
-
- // Construct Grouper
- ARROW_ASSIGN_OR_RAISE(auto key_descrs, ExecBatch::Make(keys).Map([](ExecBatch batch) {
- return batch.GetDescriptors();
- }));
-
- ARROW_ASSIGN_OR_RAISE(auto grouper, Grouper::Make(key_descrs, ctx));
-
- int i = 0;
- for (ValueDescr& key_descr : key_descrs) {
- out_fields.push_back(field("key_" + std::to_string(i++), std::move(key_descr.type)));
- }
-
- ARROW_ASSIGN_OR_RAISE(auto key_batch_iterator,
- ExecBatchIterator::Make(keys, ctx->exec_chunksize()));
-
- // start "streaming" execution
- ExecBatch key_batch, argument_batch;
- while (argument_batch_iterator->Next(&argument_batch) &&
- key_batch_iterator->Next(&key_batch)) {
- if (key_batch.length == 0) continue;
-
- // compute a batch of group ids
- ARROW_ASSIGN_OR_RAISE(Datum id_batch, grouper->Consume(key_batch));
-
- // consume group ids with HashAggregateKernels
- for (size_t i = 0; i < kernels.size(); ++i) {
- KernelContext batch_ctx{ctx};
- batch_ctx.SetState(states[i].get());
- ARROW_ASSIGN_OR_RAISE(auto batch, ExecBatch::Make({argument_batch[i], id_batch,
- Datum(grouper->num_groups())}));
- RETURN_NOT_OK(kernels[i]->consume(&batch_ctx, batch));
- }
- }
-
- // Finalize output
- ArrayDataVector out_data(arguments.size() + keys.size());
- auto it = out_data.begin();
-
- for (size_t i = 0; i < kernels.size(); ++i) {
- KernelContext batch_ctx{ctx};
- batch_ctx.SetState(states[i].get());
- Datum out;
- RETURN_NOT_OK(kernels[i]->finalize(&batch_ctx, &out));
- *it++ = out.array();
- }
-
- ARROW_ASSIGN_OR_RAISE(ExecBatch out_keys, grouper->GetUniques());
- for (const auto& key : out_keys.values) {
- *it++ = key.array();
- }
-
- int64_t length = out_data[0]->length;
- return ArrayData::Make(struct_(std::move(out_fields)), length,
- {/*null_bitmap=*/nullptr}, std::move(out_data),
- /*null_count=*/0);
-}
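-
-// A minimal usage sketch (column names hypothetical): summing one argument
-// column grouped by one key column
-//
-//   ARROW_ASSIGN_OR_RAISE(
-//       Datum out, GroupBy({values}, {keys}, {{"hash_sum", nullptr}}, ctx));
-//   // "out" is a struct array with one field per aggregate ("hash_sum")
-//   // followed by one field per key ("key_0")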
-
-Result<std::shared_ptr<ListArray>> Grouper::ApplyGroupings(const ListArray& groupings,
- const Array& array,
- ExecContext* ctx) {
- ARROW_ASSIGN_OR_RAISE(Datum sorted,
- compute::Take(array, groupings.data()->child_data[0],
- TakeOptions::NoBoundsCheck(), ctx));
-
- return std::make_shared<ListArray>(list(array.type()), groupings.length(),
- groupings.value_offsets(), sorted.make_array());
-}
-
-Result<std::shared_ptr<ListArray>> Grouper::MakeGroupings(const UInt32Array& ids,
- uint32_t num_groups,
- ExecContext* ctx) {
- if (ids.null_count() != 0) {
- return Status::Invalid("MakeGroupings with null ids");
- }
-
- ARROW_ASSIGN_OR_RAISE(auto offsets, AllocateBuffer(sizeof(int32_t) * (num_groups + 1),
- ctx->memory_pool()));
- auto raw_offsets = reinterpret_cast<int32_t*>(offsets->mutable_data());
-
- std::memset(raw_offsets, 0, offsets->size());
- for (int i = 0; i < ids.length(); ++i) {
- DCHECK_LT(ids.Value(i), num_groups);
- raw_offsets[ids.Value(i)] += 1;
- }
- int32_t length = 0;
- for (uint32_t id = 0; id < num_groups; ++id) {
- auto offset = raw_offsets[id];
- raw_offsets[id] = length;
- length += offset;
- }
- raw_offsets[num_groups] = length;
- DCHECK_EQ(ids.length(), length);
-
- ARROW_ASSIGN_OR_RAISE(auto offsets_copy,
- offsets->CopySlice(0, offsets->size(), ctx->memory_pool()));
- raw_offsets = reinterpret_cast<int32_t*>(offsets_copy->mutable_data());
-
- ARROW_ASSIGN_OR_RAISE(auto sort_indices, AllocateBuffer(sizeof(int32_t) * ids.length(),
- ctx->memory_pool()));
- auto raw_sort_indices = reinterpret_cast<int32_t*>(sort_indices->mutable_data());
- for (int i = 0; i < ids.length(); ++i) {
- raw_sort_indices[raw_offsets[ids.Value(i)]++] = i;
- }
-
- return std::make_shared<ListArray>(
- list(int32()), num_groups, std::move(offsets),
- std::make_shared<Int32Array>(ids.length(), std::move(sort_indices)));
-}
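-
-// Worked example: ids = [0, 1, 0, 2] with num_groups = 3 yields offsets
-// [0, 2, 3, 4] and sort indices [0, 2, 1, 3], i.e. the groupings
-// [[0, 2], [1], [3]] mapping each group id to the rows that carry it.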
-
-namespace {
-const FunctionDoc hash_count_doc{"Count the number of null / non-null values",
- ("By default, non-null values are counted.\n"
- "This can be changed through ScalarAggregateOptions."),
- {"array", "group_id_array", "group_count"},
- "ScalarAggregateOptions"};
-
-const FunctionDoc hash_sum_doc{"Sum values of a numeric array",
- ("Null values are ignored."),
- {"array", "group_id_array", "group_count"}};
-
-const FunctionDoc hash_min_max_doc{
- "Compute the minimum and maximum values of a numeric array",
- ("Null values are ignored by default.\n"
- "This can be changed through ScalarAggregateOptions."),
- {"array", "group_id_array", "group_count"},
- "ScalarAggregateOptions"};
-} // namespace
-
-void RegisterHashAggregateBasic(FunctionRegistry* registry) {
- {
- static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
- auto func = std::make_shared<HashAggregateFunction>(
- "hash_count", Arity::Ternary(), &hash_count_doc,
- &default_scalar_aggregate_options);
- DCHECK_OK(func->AddKernel(MakeKernel<GroupedCountImpl>(ValueDescr::ARRAY)));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-
- {
- auto func = std::make_shared<HashAggregateFunction>("hash_sum", Arity::Ternary(),
- &hash_sum_doc);
- DCHECK_OK(func->AddKernel(MakeKernel<GroupedSumImpl>(ValueDescr::ARRAY)));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-
- {
- static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
- auto func = std::make_shared<HashAggregateFunction>(
- "hash_min_max", Arity::Ternary(), &hash_min_max_doc,
- &default_scalar_aggregate_options);
- DCHECK_OK(func->AddKernel(MakeKernel<GroupedMinMaxImpl>(ValueDescr::ARRAY)));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/buffer_builder.h"
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/exec/key_compare.h"
+#include "arrow/compute/exec/key_encode.h"
+#include "arrow/compute/exec/key_hash.h"
+#include "arrow/compute/exec/key_map.h"
+#include "arrow/compute/exec/util.h"
+#include "arrow/compute/exec_internal.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::FirstTimeBitmapWriter;
+
+namespace compute {
+namespace internal {
+namespace {
+
+struct KeyEncoder {
+ // the first byte of an encoded key is used to indicate nullity
+ static constexpr bool kExtraByteForNull = true;
+
+ static constexpr uint8_t kNullByte = 1;
+ static constexpr uint8_t kValidByte = 0;
+
+ virtual ~KeyEncoder() = default;
+
+ virtual void AddLength(const ArrayData&, int32_t* lengths) = 0;
+
+ virtual Status Encode(const ArrayData&, uint8_t** encoded_bytes) = 0;
+
+ virtual Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes,
+ int32_t length, MemoryPool*) = 0;
+
+ // extract the null bitmap from the leading nullity bytes of encoded keys
+ static Status DecodeNulls(MemoryPool* pool, int32_t length, uint8_t** encoded_bytes,
+ std::shared_ptr<Buffer>* null_bitmap, int32_t* null_count) {
+ // first count nulls to determine if a null bitmap is necessary
+ *null_count = 0;
+ for (int32_t i = 0; i < length; ++i) {
+ *null_count += (encoded_bytes[i][0] == kNullByte);
+ }
+
+ if (*null_count > 0) {
+ ARROW_ASSIGN_OR_RAISE(*null_bitmap, AllocateBitmap(length, pool));
+ uint8_t* validity = (*null_bitmap)->mutable_data();
+
+ FirstTimeBitmapWriter writer(validity, 0, length);
+ for (int32_t i = 0; i < length; ++i) {
+ if (encoded_bytes[i][0] == kValidByte) {
+ writer.Set();
+ } else {
+ writer.Clear();
+ }
+ writer.Next();
+ encoded_bytes[i] += 1;
+ }
+ writer.Finish();
+ } else {
+ for (int32_t i = 0; i < length; ++i) {
+ encoded_bytes[i] += 1;
+ }
+ }
+ return Status::OK();
+ }
+};
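+
+// The concrete encoders below share one wire format: an encoded key is the
+// nullity byte (kValidByte or kNullByte) followed by the payload. Fixed-width
+// payloads are the raw value bytes; var-length payloads are prefixed with
+// their length, so rows can be concatenated and compared as flat byte strings.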
+
+struct BooleanKeyEncoder : KeyEncoder {
+ static constexpr int kByteWidth = 1;
+
+ void AddLength(const ArrayData& data, int32_t* lengths) override {
+ for (int64_t i = 0; i < data.length; ++i) {
+ lengths[i] += kByteWidth + kExtraByteForNull;
+ }
+ }
+
+ Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
+ VisitArrayDataInline<BooleanType>(
+ data,
+ [&](bool value) {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kValidByte;
+ *encoded_ptr++ = value;
+ },
+ [&] {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kNullByte;
+ *encoded_ptr++ = 0;
+ });
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
+ MemoryPool* pool) override {
+ std::shared_ptr<Buffer> null_buf;
+ int32_t null_count;
+ RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
+
+ ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBitmap(length, pool));
+
+ uint8_t* raw_output = key_buf->mutable_data();
+ for (int32_t i = 0; i < length; ++i) {
+ auto& encoded_ptr = encoded_bytes[i];
+ BitUtil::SetBitTo(raw_output, i, encoded_ptr[0] != 0);
+ encoded_ptr += 1;
+ }
+
+ return ArrayData::Make(boolean(), length, {std::move(null_buf), std::move(key_buf)},
+ null_count);
+ }
+};
+
+struct FixedWidthKeyEncoder : KeyEncoder {
+ explicit FixedWidthKeyEncoder(std::shared_ptr<DataType> type)
+ : type_(std::move(type)),
+ byte_width_(checked_cast<const FixedWidthType&>(*type_).bit_width() / 8) {}
+
+ void AddLength(const ArrayData& data, int32_t* lengths) override {
+ for (int64_t i = 0; i < data.length; ++i) {
+ lengths[i] += byte_width_ + kExtraByteForNull;
+ }
+ }
+
+ Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
+ ArrayData viewed(fixed_size_binary(byte_width_), data.length, data.buffers,
+ data.null_count, data.offset);
+
+ VisitArrayDataInline<FixedSizeBinaryType>(
+ viewed,
+ [&](util::string_view bytes) {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kValidByte;
+ memcpy(encoded_ptr, bytes.data(), byte_width_);
+ encoded_ptr += byte_width_;
+ },
+ [&] {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kNullByte;
+ memset(encoded_ptr, 0, byte_width_);
+ encoded_ptr += byte_width_;
+ });
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
+ MemoryPool* pool) override {
+ std::shared_ptr<Buffer> null_buf;
+ int32_t null_count;
+ RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
+
+ ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length * byte_width_, pool));
+
+ uint8_t* raw_output = key_buf->mutable_data();
+ for (int32_t i = 0; i < length; ++i) {
+ auto& encoded_ptr = encoded_bytes[i];
+ std::memcpy(raw_output, encoded_ptr, byte_width_);
+ encoded_ptr += byte_width_;
+ raw_output += byte_width_;
+ }
+
+ return ArrayData::Make(type_, length, {std::move(null_buf), std::move(key_buf)},
+ null_count);
+ }
+
+ std::shared_ptr<DataType> type_;
+ int byte_width_;
+};
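+
+// Worked example (little-endian): a non-null int32 key holding 7 encodes to
+// the five bytes 00 07 00 00 00 (kValidByte, then the value), while a null
+// int32 key encodes to 01 00 00 00 00 (kNullByte, then a zeroed payload).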
+
+struct DictionaryKeyEncoder : FixedWidthKeyEncoder {
+ DictionaryKeyEncoder(std::shared_ptr<DataType> type, MemoryPool* pool)
+ : FixedWidthKeyEncoder(std::move(type)), pool_(pool) {}
+
+ Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
+ auto dict = MakeArray(data.dictionary);
+ if (dictionary_) {
+ if (!dictionary_->Equals(dict)) {
+ // TODO(bkietz) unify if necessary. For now, just error if any batch's dictionary
+ // differs from the first we saw for this key
+ return Status::NotImplemented("Unifying differing dictionaries");
+ }
+ } else {
+ dictionary_ = std::move(dict);
+ }
+ return FixedWidthKeyEncoder::Encode(data, encoded_bytes);
+ }
+
+ Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
+ MemoryPool* pool) override {
+ ARROW_ASSIGN_OR_RAISE(auto data,
+ FixedWidthKeyEncoder::Decode(encoded_bytes, length, pool));
+
+ if (dictionary_) {
+ data->dictionary = dictionary_->data();
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto dict, MakeArrayOfNull(type_, 0));
+ data->dictionary = dict->data();
+ }
+
+ data->type = type_;
+ return data;
+ }
+
+ MemoryPool* pool_;
+ std::shared_ptr<Array> dictionary_;
+};
+
+template <typename T>
+struct VarLengthKeyEncoder : KeyEncoder {
+ using Offset = typename T::offset_type;
+
+ void AddLength(const ArrayData& data, int32_t* lengths) override {
+ int64_t i = 0;
+ VisitArrayDataInline<T>(
+ data,
+ [&](util::string_view bytes) {
+ lengths[i++] +=
+ kExtraByteForNull + sizeof(Offset) + static_cast<int32_t>(bytes.size());
+ },
+ [&] { lengths[i++] += kExtraByteForNull + sizeof(Offset); });
+ }
+
+ Status Encode(const ArrayData& data, uint8_t** encoded_bytes) override {
+ VisitArrayDataInline<T>(
+ data,
+ [&](util::string_view bytes) {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kValidByte;
+ util::SafeStore(encoded_ptr, static_cast<Offset>(bytes.size()));
+ encoded_ptr += sizeof(Offset);
+ memcpy(encoded_ptr, bytes.data(), bytes.size());
+ encoded_ptr += bytes.size();
+ },
+ [&] {
+ auto& encoded_ptr = *encoded_bytes++;
+ *encoded_ptr++ = kNullByte;
+ util::SafeStore(encoded_ptr, static_cast<Offset>(0));
+ encoded_ptr += sizeof(Offset);
+ });
+ return Status::OK();
+ }
+
+ Result<std::shared_ptr<ArrayData>> Decode(uint8_t** encoded_bytes, int32_t length,
+ MemoryPool* pool) override {
+ std::shared_ptr<Buffer> null_buf;
+ int32_t null_count;
+ RETURN_NOT_OK(DecodeNulls(pool, length, encoded_bytes, &null_buf, &null_count));
+
+ Offset length_sum = 0;
+ for (int32_t i = 0; i < length; ++i) {
+ length_sum += util::SafeLoadAs<Offset>(encoded_bytes[i]);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto offset_buf,
+ AllocateBuffer(sizeof(Offset) * (1 + length), pool));
+ ARROW_ASSIGN_OR_RAISE(auto key_buf, AllocateBuffer(length_sum));
+
+ auto raw_offsets = reinterpret_cast<Offset*>(offset_buf->mutable_data());
+ auto raw_keys = key_buf->mutable_data();
+
+ Offset current_offset = 0;
+ for (int32_t i = 0; i < length; ++i) {
+ raw_offsets[i] = current_offset;
+
+ auto key_length = util::SafeLoadAs<Offset>(encoded_bytes[i]);
+ encoded_bytes[i] += sizeof(Offset);
+
+ memcpy(raw_keys + current_offset, encoded_bytes[i], key_length);
+ encoded_bytes[i] += key_length;
+
+ current_offset += key_length;
+ }
+ raw_offsets[length] = current_offset;
+
+ return ArrayData::Make(
+ type_, length, {std::move(null_buf), std::move(offset_buf), std::move(key_buf)},
+ null_count);
+ }
+
+ explicit VarLengthKeyEncoder(std::shared_ptr<DataType> type) : type_(std::move(type)) {}
+
+ std::shared_ptr<DataType> type_;
+};
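+
+// Worked example (BinaryType, Offset = int32_t, little-endian): the value
+// "ab" encodes to 00 02 00 00 00 61 62 (kValidByte, the length 2, then the
+// bytes), and a null encodes to 01 00 00 00 00 (kNullByte, then length 0).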
+
+struct GrouperImpl : Grouper {
+ static Result<std::unique_ptr<GrouperImpl>> Make(const std::vector<ValueDescr>& keys,
+ ExecContext* ctx) {
+ auto impl = ::arrow::internal::make_unique<GrouperImpl>();
+
+ impl->encoders_.resize(keys.size());
+ impl->ctx_ = ctx;
+
+ for (size_t i = 0; i < keys.size(); ++i) {
+ const auto& key = keys[i].type;
+
+ if (key->id() == Type::BOOL) {
+ impl->encoders_[i] = ::arrow::internal::make_unique<BooleanKeyEncoder>();
+ continue;
+ }
+
+ if (key->id() == Type::DICTIONARY) {
+ impl->encoders_[i] =
+ ::arrow::internal::make_unique<DictionaryKeyEncoder>(key, ctx->memory_pool());
+ continue;
+ }
+
+ if (is_fixed_width(key->id())) {
+ impl->encoders_[i] = ::arrow::internal::make_unique<FixedWidthKeyEncoder>(key);
+ continue;
+ }
+
+ if (is_binary_like(key->id())) {
+ impl->encoders_[i] =
+ ::arrow::internal::make_unique<VarLengthKeyEncoder<BinaryType>>(key);
+ continue;
+ }
+
+ if (is_large_binary_like(key->id())) {
+ impl->encoders_[i] =
+ ::arrow::internal::make_unique<VarLengthKeyEncoder<LargeBinaryType>>(key);
+ continue;
+ }
+
+ return Status::NotImplemented("Keys of type ", *key);
+ }
+
+ return std::move(impl);
+ }
+
+ Result<Datum> Consume(const ExecBatch& batch) override {
+ std::vector<int32_t> offsets_batch(batch.length + 1);
+ for (int i = 0; i < batch.num_values(); ++i) {
+ encoders_[i]->AddLength(*batch[i].array(), offsets_batch.data());
+ }
+
+ int32_t total_length = 0;
+ for (int64_t i = 0; i < batch.length; ++i) {
+ auto total_length_before = total_length;
+ total_length += offsets_batch[i];
+ offsets_batch[i] = total_length_before;
+ }
+ offsets_batch[batch.length] = total_length;
+
+ std::vector<uint8_t> key_bytes_batch(total_length);
+ std::vector<uint8_t*> key_buf_ptrs(batch.length);
+ for (int64_t i = 0; i < batch.length; ++i) {
+ key_buf_ptrs[i] = key_bytes_batch.data() + offsets_batch[i];
+ }
+
+ for (int i = 0; i < batch.num_values(); ++i) {
+ RETURN_NOT_OK(encoders_[i]->Encode(*batch[i].array(), key_buf_ptrs.data()));
+ }
+
+ TypedBufferBuilder<uint32_t> group_ids_batch(ctx_->memory_pool());
+ RETURN_NOT_OK(group_ids_batch.Resize(batch.length));
+
+ for (int64_t i = 0; i < batch.length; ++i) {
+ int32_t key_length = offsets_batch[i + 1] - offsets_batch[i];
+ std::string key(
+ reinterpret_cast<const char*>(key_bytes_batch.data() + offsets_batch[i]),
+ key_length);
+
+ auto it_success = map_.emplace(key, num_groups_);
+ auto group_id = it_success.first->second;
+
+ if (it_success.second) {
+ // new key; update offsets and key_bytes
+ ++num_groups_;
+ auto next_key_offset = static_cast<int32_t>(key_bytes_.size());
+ key_bytes_.resize(next_key_offset + key_length);
+ offsets_.push_back(next_key_offset + key_length);
+ memcpy(key_bytes_.data() + next_key_offset, key.c_str(), key_length);
+ }
+
+ group_ids_batch.UnsafeAppend(group_id);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto group_ids, group_ids_batch.Finish());
+ return Datum(UInt32Array(batch.length, std::move(group_ids)));
+ }
+
+ uint32_t num_groups() const override { return num_groups_; }
+
+ Result<ExecBatch> GetUniques() override {
+ ExecBatch out({}, num_groups_);
+
+ std::vector<uint8_t*> key_buf_ptrs(num_groups_);
+ for (int64_t i = 0; i < num_groups_; ++i) {
+ key_buf_ptrs[i] = key_bytes_.data() + offsets_[i];
+ }
+
+ out.values.resize(encoders_.size());
+ for (size_t i = 0; i < encoders_.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(
+ out.values[i],
+ encoders_[i]->Decode(key_buf_ptrs.data(), static_cast<int32_t>(num_groups_),
+ ctx_->memory_pool()));
+ }
+
+ return out;
+ }
+
+ ExecContext* ctx_;
+ std::unordered_map<std::string, uint32_t> map_;
+ std::vector<int32_t> offsets_ = {0};
+ std::vector<uint8_t> key_bytes_;
+ uint32_t num_groups_ = 0;
+ std::vector<std::unique_ptr<KeyEncoder>> encoders_;
+};
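+
+// A minimal sketch of driving a Grouper, mirroring the GroupBy entry point
+// later in this file:
+//
+//   ARROW_ASSIGN_OR_RAISE(auto grouper, Grouper::Make(key_descrs, ctx));
+//   ARROW_ASSIGN_OR_RAISE(Datum id_batch, grouper->Consume(key_batch));
+//   // ... feed id_batch and grouper->num_groups() to the aggregators ...
+//   ARROW_ASSIGN_OR_RAISE(ExecBatch uniques, grouper->GetUniques());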
+
+struct GrouperFastImpl : Grouper {
+ static constexpr int kBitmapPaddingForSIMD = 64; // bits
+ static constexpr int kPaddingForSIMD = 32; // bytes
+
+ static bool CanUse(const std::vector<ValueDescr>& keys) {
+#if ARROW_LITTLE_ENDIAN
+ for (size_t i = 0; i < keys.size(); ++i) {
+ const auto& key = keys[i].type;
+ if (is_large_binary_like(key->id())) {
+ return false;
+ }
+ }
+ return true;
+#else
+ return false;
+#endif
+ }
+
+ static Result<std::unique_ptr<GrouperFastImpl>> Make(
+ const std::vector<ValueDescr>& keys, ExecContext* ctx) {
+ auto impl = ::arrow::internal::make_unique<GrouperFastImpl>();
+ impl->ctx_ = ctx;
+
+ RETURN_NOT_OK(impl->temp_stack_.Init(ctx->memory_pool(), 64 * minibatch_size_max_));
+ impl->encode_ctx_.hardware_flags =
+ arrow::internal::CpuInfo::GetInstance()->hardware_flags();
+ impl->encode_ctx_.stack = &impl->temp_stack_;
+
+ auto num_columns = keys.size();
+ impl->col_metadata_.resize(num_columns);
+ impl->key_types_.resize(num_columns);
+ impl->dictionaries_.resize(num_columns);
+ for (size_t icol = 0; icol < num_columns; ++icol) {
+ const auto& key = keys[icol].type;
+ if (key->id() == Type::DICTIONARY) {
+ auto bit_width = checked_cast<const FixedWidthType&>(*key).bit_width();
+ ARROW_DCHECK(bit_width % 8 == 0);
+ impl->col_metadata_[icol] =
+ arrow::compute::KeyEncoder::KeyColumnMetadata(true, bit_width / 8);
+ } else if (key->id() == Type::BOOL) {
+ impl->col_metadata_[icol] =
+ arrow::compute::KeyEncoder::KeyColumnMetadata(true, 0);
+ } else if (is_fixed_width(key->id())) {
+ impl->col_metadata_[icol] = arrow::compute::KeyEncoder::KeyColumnMetadata(
+ true, checked_cast<const FixedWidthType&>(*key).bit_width() / 8);
+ } else if (is_binary_like(key->id())) {
+ impl->col_metadata_[icol] =
+ arrow::compute::KeyEncoder::KeyColumnMetadata(false, sizeof(uint32_t));
+ } else {
+ return Status::NotImplemented("Keys of type ", *key);
+ }
+ impl->key_types_[icol] = key;
+ }
+
+ impl->encoder_.Init(impl->col_metadata_, &impl->encode_ctx_,
+ /* row_alignment = */ sizeof(uint64_t),
+ /* string_alignment = */ sizeof(uint64_t));
+ RETURN_NOT_OK(impl->rows_.Init(ctx->memory_pool(), impl->encoder_.row_metadata()));
+ RETURN_NOT_OK(
+ impl->rows_minibatch_.Init(ctx->memory_pool(), impl->encoder_.row_metadata()));
+ impl->minibatch_size_ = impl->minibatch_size_min_;
+ GrouperFastImpl* impl_ptr = impl.get();
+ auto equal_func = [impl_ptr](
+ int num_keys_to_compare, const uint16_t* selection_may_be_null,
+ const uint32_t* group_ids, uint32_t* out_num_keys_mismatch,
+ uint16_t* out_selection_mismatch) {
+ arrow::compute::KeyCompare::CompareRows(
+ num_keys_to_compare, selection_may_be_null, group_ids, &impl_ptr->encode_ctx_,
+ out_num_keys_mismatch, out_selection_mismatch, impl_ptr->rows_minibatch_,
+ impl_ptr->rows_);
+ };
+ auto append_func = [impl_ptr](int num_keys, const uint16_t* selection) {
+ return impl_ptr->rows_.AppendSelectionFrom(impl_ptr->rows_minibatch_, num_keys,
+ selection);
+ };
+ RETURN_NOT_OK(impl->map_.init(impl->encode_ctx_.hardware_flags, ctx->memory_pool(),
+ impl->encode_ctx_.stack, impl->log_minibatch_max_,
+ equal_func, append_func));
+ impl->cols_.resize(num_columns);
+ impl->minibatch_hashes_.resize(impl->minibatch_size_max_ +
+ kPaddingForSIMD / sizeof(uint32_t));
+
+ return std::move(impl);
+ }
+
+ ~GrouperFastImpl() { map_.cleanup(); }
+
+ Result<Datum> Consume(const ExecBatch& batch) override {
+ int64_t num_rows = batch.length;
+ int num_columns = batch.num_values();
+
+ // Process dictionaries
+ for (int icol = 0; icol < num_columns; ++icol) {
+ if (key_types_[icol]->id() == Type::DICTIONARY) {
+ auto data = batch[icol].array();
+ auto dict = MakeArray(data->dictionary);
+ if (dictionaries_[icol]) {
+ if (!dictionaries_[icol]->Equals(dict)) {
+ // TODO(bkietz) unify if necessary. For now, just error if any batch's
+ // dictionary differs from the first we saw for this key
+ return Status::NotImplemented("Unifying differing dictionaries");
+ }
+ } else {
+ dictionaries_[icol] = std::move(dict);
+ }
+ }
+ }
+
+ std::shared_ptr<arrow::Buffer> group_ids;
+ ARROW_ASSIGN_OR_RAISE(
+ group_ids, AllocateBuffer(sizeof(uint32_t) * num_rows, ctx_->memory_pool()));
+
+ for (int icol = 0; icol < num_columns; ++icol) {
+ const uint8_t* non_nulls = nullptr;
+ if (batch[icol].array()->buffers[0] != NULLPTR) {
+ non_nulls = batch[icol].array()->buffers[0]->data();
+ }
+ const uint8_t* fixedlen = batch[icol].array()->buffers[1]->data();
+ const uint8_t* varlen = nullptr;
+ if (!col_metadata_[icol].is_fixed_length) {
+ varlen = batch[icol].array()->buffers[2]->data();
+ }
+
+ int64_t offset = batch[icol].array()->offset;
+
+ auto col_base = arrow::compute::KeyEncoder::KeyColumnArray(
+ col_metadata_[icol], offset + num_rows, non_nulls, fixedlen, varlen);
+
+ cols_[icol] =
+ arrow::compute::KeyEncoder::KeyColumnArray(col_base, offset, num_rows);
+ }
+
+ // Split into smaller mini-batches
+ for (uint32_t start_row = 0; start_row < num_rows;) {
+ uint32_t batch_size_next = std::min(static_cast<uint32_t>(minibatch_size_),
+ static_cast<uint32_t>(num_rows) - start_row);
+
+ // Encode
+ rows_minibatch_.Clean();
+ RETURN_NOT_OK(encoder_.PrepareOutputForEncode(start_row, batch_size_next,
+ &rows_minibatch_, cols_));
+ encoder_.Encode(start_row, batch_size_next, &rows_minibatch_, cols_);
+
+ // Compute hash
+ if (encoder_.row_metadata().is_fixed_length) {
+ Hashing::hash_fixed(encode_ctx_.hardware_flags, batch_size_next,
+ encoder_.row_metadata().fixed_length, rows_minibatch_.data(1),
+ minibatch_hashes_.data());
+ } else {
+ auto hash_temp_buf =
+ util::TempVectorHolder<uint32_t>(&temp_stack_, 4 * batch_size_next);
+ Hashing::hash_varlen(encode_ctx_.hardware_flags, batch_size_next,
+ rows_minibatch_.offsets(), rows_minibatch_.data(2),
+ hash_temp_buf.mutable_data(), minibatch_hashes_.data());
+ }
+
+ // Map
+ RETURN_NOT_OK(
+ map_.map(batch_size_next, minibatch_hashes_.data(),
+ reinterpret_cast<uint32_t*>(group_ids->mutable_data()) + start_row));
+
+ start_row += batch_size_next;
+
+ if (minibatch_size_ * 2 <= minibatch_size_max_) {
+ minibatch_size_ *= 2;
+ }
+ }
+
+ return Datum(UInt32Array(batch.length, std::move(group_ids)));
+ }
+
+ uint32_t num_groups() const override { return static_cast<uint32_t>(rows_.length()); }
+
+ // Make sure padded buffers end up with the right logical size
+
+ Result<std::shared_ptr<Buffer>> AllocatePaddedBitmap(int64_t length) {
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<Buffer> buf,
+ AllocateBitmap(length + kBitmapPaddingForSIMD, ctx_->memory_pool()));
+ return SliceMutableBuffer(buf, 0, BitUtil::BytesForBits(length));
+ }
+
+ Result<std::shared_ptr<Buffer>> AllocatePaddedBuffer(int64_t size) {
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<Buffer> buf,
+ AllocateBuffer(size + kBitmapPaddingForSIMD, ctx_->memory_pool()));
+ return SliceMutableBuffer(buf, 0, size);
+ }
+
+ Result<ExecBatch> GetUniques() override {
+ auto num_columns = static_cast<uint32_t>(col_metadata_.size());
+ int64_t num_groups = rows_.length();
+
+ std::vector<std::shared_ptr<Buffer>> non_null_bufs(num_columns);
+ std::vector<std::shared_ptr<Buffer>> fixedlen_bufs(num_columns);
+ std::vector<std::shared_ptr<Buffer>> varlen_bufs(num_columns);
+
+ for (size_t i = 0; i < num_columns; ++i) {
+ ARROW_ASSIGN_OR_RAISE(non_null_bufs[i], AllocatePaddedBitmap(num_groups));
+ if (col_metadata_[i].is_fixed_length) {
+ if (col_metadata_[i].fixed_length == 0) {
+ ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i], AllocatePaddedBitmap(num_groups));
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ fixedlen_bufs[i],
+ AllocatePaddedBuffer(num_groups * col_metadata_[i].fixed_length));
+ }
+ } else {
+ ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i],
+ AllocatePaddedBuffer((num_groups + 1) * sizeof(uint32_t)));
+ }
+ cols_[i] = arrow::compute::KeyEncoder::KeyColumnArray(
+ col_metadata_[i], num_groups, non_null_bufs[i]->mutable_data(),
+ fixedlen_bufs[i]->mutable_data(), nullptr);
+ }
+
+ for (int64_t start_row = 0; start_row < num_groups;) {
+ int64_t batch_size_next =
+ std::min(num_groups - start_row, static_cast<int64_t>(minibatch_size_max_));
+ encoder_.DecodeFixedLengthBuffers(start_row, start_row, batch_size_next, rows_,
+ &cols_);
+ start_row += batch_size_next;
+ }
+
+ if (!rows_.metadata().is_fixed_length) {
+ for (size_t i = 0; i < num_columns; ++i) {
+ if (!col_metadata_[i].is_fixed_length) {
+ auto varlen_size =
+ reinterpret_cast<const uint32_t*>(fixedlen_bufs[i]->data())[num_groups];
+ ARROW_ASSIGN_OR_RAISE(varlen_bufs[i], AllocatePaddedBuffer(varlen_size));
+ cols_[i] = arrow::compute::KeyEncoder::KeyColumnArray(
+ col_metadata_[i], num_groups, non_null_bufs[i]->mutable_data(),
+ fixedlen_bufs[i]->mutable_data(), varlen_bufs[i]->mutable_data());
+ }
+ }
+
+ for (int64_t start_row = 0; start_row < num_groups;) {
+ int64_t batch_size_next =
+ std::min(num_groups - start_row, static_cast<int64_t>(minibatch_size_max_));
+ encoder_.DecodeVaryingLengthBuffers(start_row, start_row, batch_size_next, rows_,
+ &cols_);
+ start_row += batch_size_next;
+ }
+ }
+
+ ExecBatch out({}, num_groups);
+ out.values.resize(num_columns);
+ for (size_t i = 0; i < num_columns; ++i) {
+ auto valid_count = arrow::internal::CountSetBits(
+ non_null_bufs[i]->data(), /*offset=*/0, static_cast<int64_t>(num_groups));
+ int null_count = static_cast<int>(num_groups) - static_cast<int>(valid_count);
+
+ if (col_metadata_[i].is_fixed_length) {
+ out.values[i] = ArrayData::Make(
+ key_types_[i], num_groups,
+ {std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i])}, null_count);
+ } else {
+ out.values[i] =
+ ArrayData::Make(key_types_[i], num_groups,
+ {std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i]),
+ std::move(varlen_bufs[i])},
+ null_count);
+ }
+ }
+
+ // Process dictionaries
+ for (size_t icol = 0; icol < num_columns; ++icol) {
+ if (key_types_[icol]->id() == Type::DICTIONARY) {
+ if (dictionaries_[icol]) {
+ out.values[icol].array()->dictionary = dictionaries_[icol]->data();
+ } else {
+ ARROW_ASSIGN_OR_RAISE(auto dict, MakeArrayOfNull(key_types_[icol], 0));
+ out.values[icol].array()->dictionary = dict->data();
+ }
+ }
+ }
+
+ return out;
+ }
+
+ static constexpr int log_minibatch_max_ = 10;
+ static constexpr int minibatch_size_max_ = 1 << log_minibatch_max_;
+ static constexpr int minibatch_size_min_ = 128;
+ int minibatch_size_;
+
+ ExecContext* ctx_;
+ arrow::util::TempVectorStack temp_stack_;
+ arrow::compute::KeyEncoder::KeyEncoderContext encode_ctx_;
+
+ std::vector<std::shared_ptr<arrow::DataType>> key_types_;
+ std::vector<arrow::compute::KeyEncoder::KeyColumnMetadata> col_metadata_;
+ std::vector<arrow::compute::KeyEncoder::KeyColumnArray> cols_;
+ std::vector<uint32_t> minibatch_hashes_;
+
+ std::vector<std::shared_ptr<Array>> dictionaries_;
+
+ arrow::compute::KeyEncoder::KeyRowArray rows_;
+ arrow::compute::KeyEncoder::KeyRowArray rows_minibatch_;
+ arrow::compute::KeyEncoder encoder_;
+ arrow::compute::SwissTable map_;
+};
+
+/// C++ abstract base class for the HashAggregateKernel interface.
+/// Implementations should be default constructible and perform initialization in
+/// Init().
+struct GroupedAggregator : KernelState {
+ virtual Status Init(ExecContext*, const FunctionOptions*,
+ const std::shared_ptr<DataType>&) = 0;
+
+ virtual Status Consume(const ExecBatch& batch) = 0;
+
+ virtual Result<Datum> Finalize() = 0;
+
+ template <typename Reserve>
+ Status MaybeReserve(int64_t old_num_groups, const ExecBatch& batch,
+ const Reserve& reserve) {
+ int64_t new_num_groups = batch[2].scalar_as<UInt32Scalar>().value;
+ if (new_num_groups <= old_num_groups) {
+ return Status::OK();
+ }
+ return reserve(new_num_groups - old_num_groups);
+ }
+
+ virtual std::shared_ptr<DataType> out_type() const = 0;
+};
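+
+// A minimal usage sketch (illustrative only, not part of this file): the
+// hash-aggregate machinery below drives a GroupedAggregator through
+//
+//   auto agg = ::arrow::internal::make_unique<GroupedCountImpl>();
+//   RETURN_NOT_OK(agg->Init(ctx->exec_context(), options, input_type));
+//   RETURN_NOT_OK(agg->Consume(batch));  // batch = {values, group_ids, group_count}
+//   ARROW_ASSIGN_OR_RAISE(Datum result, agg->Finalize());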
+
+// ----------------------------------------------------------------------
+// Count implementation
+
+struct GroupedCountImpl : public GroupedAggregator {
+ Status Init(ExecContext* ctx, const FunctionOptions* options,
+ const std::shared_ptr<DataType>&) override {
+ options_ = checked_cast<const ScalarAggregateOptions&>(*options);
+ counts_ = BufferBuilder(ctx->memory_pool());
+ return Status::OK();
+ }
+
+ Status Consume(const ExecBatch& batch) override {
+ RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
+ num_groups_ += added_groups;
+ return counts_.Append(added_groups * sizeof(int64_t), 0);
+ }));
+
+ auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
+ auto raw_counts = reinterpret_cast<int64_t*>(counts_.mutable_data());
+
+ const auto& input = batch[0].array();
+
+ if (!options_.skip_nulls) {
+ if (input->GetNullCount() != 0) {
+ for (int64_t i = 0, input_i = input->offset; i < input->length; ++i, ++input_i) {
+ auto g = group_ids[i];
+ raw_counts[g] += !BitUtil::GetBit(input->buffers[0]->data(), input_i);
+ }
+ }
+ return Status::OK();
+ }
+
+ arrow::internal::VisitSetBitRunsVoid(
+ input->buffers[0], input->offset, input->length,
+ [&](int64_t begin, int64_t length) {
+ for (int64_t input_i = begin, i = begin - input->offset;
+ input_i < begin + length; ++input_i, ++i) {
+ auto g = group_ids[i];
+ raw_counts[g] += 1;
+ }
+ });
+ return Status::OK();
+ }
+
+ Result<Datum> Finalize() override {
+ ARROW_ASSIGN_OR_RAISE(auto counts, counts_.Finish());
+ return std::make_shared<Int64Array>(num_groups_, std::move(counts));
+ }
+
+ std::shared_ptr<DataType> out_type() const override { return int64(); }
+
+ int64_t num_groups_ = 0;
+ ScalarAggregateOptions options_;
+ BufferBuilder counts_;
+};
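+
+// Worked example (illustrative): values [1, null, 2, null] with group ids
+// [0, 0, 0, 1] and skip_nulls=false count only the nulls, so Finalize()
+// yields [1, 1]; with skip_nulls=true (the default) the non-null values are
+// counted instead, yielding [2, 0].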
+
+// ----------------------------------------------------------------------
+// Sum implementation
+
+struct GroupedSumImpl : public GroupedAggregator {
+ // NB: whether we are accumulating into double, int64_t, or uint64_t
+ // we always have 64 bits per group in the sums buffer.
+ static constexpr size_t kSumSize = sizeof(int64_t);
+
+ using ConsumeImpl = std::function<void(const std::shared_ptr<ArrayData>&,
+ const uint32_t*, void*, int64_t*)>;
+
+ struct GetConsumeImpl {
+ template <typename T, typename AccType = typename FindAccumulatorType<T>::Type>
+ Status Visit(const T&) {
+ consume_impl = [](const std::shared_ptr<ArrayData>& input, const uint32_t* group,
+ void* boxed_sums, int64_t* counts) {
+ auto sums = reinterpret_cast<typename TypeTraits<AccType>::CType*>(boxed_sums);
+
+ VisitArrayDataInline<T>(
+ *input,
+ [&](typename TypeTraits<T>::CType value) {
+ sums[*group] += value;
+ counts[*group] += 1;
+ ++group;
+ },
+ [&] { ++group; });
+ };
+ out_type = TypeTraits<AccType>::type_singleton();
+ return Status::OK();
+ }
+
+ Status Visit(const HalfFloatType& type) {
+ return Status::NotImplemented("Summing data of type ", type);
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("Summing data of type ", type);
+ }
+
+ ConsumeImpl consume_impl;
+ std::shared_ptr<DataType> out_type;
+ };
+
+ Status Init(ExecContext* ctx, const FunctionOptions*,
+ const std::shared_ptr<DataType>& input_type) override {
+ pool_ = ctx->memory_pool();
+ sums_ = BufferBuilder(pool_);
+ counts_ = BufferBuilder(pool_);
+
+ GetConsumeImpl get_consume_impl;
+ RETURN_NOT_OK(VisitTypeInline(*input_type, &get_consume_impl));
+
+ consume_impl_ = std::move(get_consume_impl.consume_impl);
+ out_type_ = std::move(get_consume_impl.out_type);
+
+ return Status::OK();
+ }
+
+ Status Consume(const ExecBatch& batch) override {
+ RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
+ num_groups_ += added_groups;
+ RETURN_NOT_OK(sums_.Append(added_groups * kSumSize, 0));
+ RETURN_NOT_OK(counts_.Append(added_groups * sizeof(int64_t), 0));
+ return Status::OK();
+ }));
+
+ auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
+ consume_impl_(batch[0].array(), group_ids, sums_.mutable_data(),
+ reinterpret_cast<int64_t*>(counts_.mutable_data()));
+ return Status::OK();
+ }
+
+ Result<Datum> Finalize() override {
+ std::shared_ptr<Buffer> null_bitmap;
+ int64_t null_count = 0;
+
+ for (int64_t i = 0; i < num_groups_; ++i) {
+ if (reinterpret_cast<const int64_t*>(counts_.data())[i] > 0) continue;
+
+ if (null_bitmap == nullptr) {
+ ARROW_ASSIGN_OR_RAISE(null_bitmap, AllocateBitmap(num_groups_, pool_));
+ BitUtil::SetBitsTo(null_bitmap->mutable_data(), 0, num_groups_, true);
+ }
+
+ null_count += 1;
+ BitUtil::SetBitTo(null_bitmap->mutable_data(), i, false);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto sums, sums_.Finish());
+
+ return ArrayData::Make(std::move(out_type_), num_groups_,
+ {std::move(null_bitmap), std::move(sums)}, null_count);
+ }
+
+ std::shared_ptr<DataType> out_type() const override { return out_type_; }
+
+ // NB: counts are used here instead of a simple "has_values_" bitmap since
+ // we expect to reuse this kernel to handle Mean
+ int64_t num_groups_ = 0;
+ BufferBuilder sums_, counts_;
+ std::shared_ptr<DataType> out_type_;
+ ConsumeImpl consume_impl_;
+ MemoryPool* pool_;
+};
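+
+// Illustrative note: FindAccumulatorType widens the input type, so e.g.
+// summing an int8 column accumulates into (and returns) int64, matching the
+// kSumSize invariant above that every group slot is 64 bits wide.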
+
+// ----------------------------------------------------------------------
+// MinMax implementation
+
+template <typename CType>
+struct Extrema : std::numeric_limits<CType> {};
+
+template <>
+struct Extrema<float> {
+ static constexpr float min() { return -std::numeric_limits<float>::infinity(); }
+ static constexpr float max() { return std::numeric_limits<float>::infinity(); }
+};
+
+template <>
+struct Extrema<double> {
+ static constexpr double min() { return -std::numeric_limits<double>::infinity(); }
+ static constexpr double max() { return std::numeric_limits<double>::infinity(); }
+};
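+
+// Why infinities (illustrative): min slots are seeded with Extrema<T>::max()
+// and max slots with Extrema<T>::min(), so the first value consumed replaces
+// both. The float/double specializations are needed because
+// std::numeric_limits<float>::min() is the smallest *positive* normal value,
+// not the most negative one.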
+
+struct GroupedMinMaxImpl : public GroupedAggregator {
+ using ConsumeImpl =
+ std::function<void(const std::shared_ptr<ArrayData>&, const uint32_t*, void*, void*,
+ uint8_t*, uint8_t*)>;
+
+ using ResizeImpl = std::function<Status(BufferBuilder*, int64_t)>;
+
+ template <typename CType>
+ static ResizeImpl MakeResizeImpl(CType anti_extreme) {
+ // Resize a min or max buffer, appending the correct anti-extreme for each added group
+ return [anti_extreme](BufferBuilder* builder, int64_t added_groups) {
+ TypedBufferBuilder<CType> typed_builder(std::move(*builder));
+ RETURN_NOT_OK(typed_builder.Append(added_groups, anti_extreme));
+ *builder = std::move(*typed_builder.bytes_builder());
+ return Status::OK();
+ };
+ }
+
+ struct GetImpl {
+ template <typename T, typename CType = typename TypeTraits<T>::CType>
+ enable_if_number<T, Status> Visit(const T&) {
+ consume_impl = [](const std::shared_ptr<ArrayData>& input, const uint32_t* group,
+ void* mins, void* maxes, uint8_t* has_values,
+ uint8_t* has_nulls) {
+ auto raw_mins = reinterpret_cast<CType*>(mins);
+ auto raw_maxes = reinterpret_cast<CType*>(maxes);
+
+ VisitArrayDataInline<T>(
+ *input,
+ [&](CType val) {
+ raw_maxes[*group] = std::max(raw_maxes[*group], val);
+ raw_mins[*group] = std::min(raw_mins[*group], val);
+ BitUtil::SetBit(has_values, *group++);
+ },
+ [&] { BitUtil::SetBit(has_nulls, *group++); });
+ };
+
+ resize_min_impl = MakeResizeImpl(Extrema<CType>::max());
+ resize_max_impl = MakeResizeImpl(Extrema<CType>::min());
+ return Status::OK();
+ }
+
+ Status Visit(const BooleanType& type) {
+ return Status::NotImplemented("Grouped MinMax data of type ", type);
+ }
+
+ Status Visit(const HalfFloatType& type) {
+ return Status::NotImplemented("Grouped MinMax data of type ", type);
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::NotImplemented("Grouped MinMax data of type ", type);
+ }
+
+ ConsumeImpl consume_impl;
+ ResizeImpl resize_min_impl, resize_max_impl;
+ };
+
+ Status Init(ExecContext* ctx, const FunctionOptions* options,
+ const std::shared_ptr<DataType>& input_type) override {
+ options_ = *checked_cast<const ScalarAggregateOptions*>(options);
+ type_ = input_type;
+
+ mins_ = BufferBuilder(ctx->memory_pool());
+ maxes_ = BufferBuilder(ctx->memory_pool());
+ has_values_ = TypedBufferBuilder<bool>(ctx->memory_pool());
+ has_nulls_ = TypedBufferBuilder<bool>(ctx->memory_pool());
+
+ GetImpl get_impl;
+ RETURN_NOT_OK(VisitTypeInline(*input_type, &get_impl));
+
+ consume_impl_ = std::move(get_impl.consume_impl);
+ resize_min_impl_ = std::move(get_impl.resize_min_impl);
+ resize_max_impl_ = std::move(get_impl.resize_max_impl);
+
+ return Status::OK();
+ }
+
+ Status Consume(const ExecBatch& batch) override {
+ RETURN_NOT_OK(MaybeReserve(num_groups_, batch, [&](int64_t added_groups) {
+ num_groups_ += added_groups;
+ RETURN_NOT_OK(resize_min_impl_(&mins_, added_groups));
+ RETURN_NOT_OK(resize_max_impl_(&maxes_, added_groups));
+ RETURN_NOT_OK(has_values_.Append(added_groups, false));
+ RETURN_NOT_OK(has_nulls_.Append(added_groups, false));
+ return Status::OK();
+ }));
+
+ auto group_ids = batch[1].array()->GetValues<uint32_t>(1);
+ consume_impl_(batch[0].array(), group_ids, mins_.mutable_data(),
+ maxes_.mutable_data(), has_values_.mutable_data(),
+ has_nulls_.mutable_data());
+ return Status::OK();
+ }
+
+ Result<Datum> Finalize() override {
+ // The aggregate for a group is valid only if at least one value was seen in that group
+ ARROW_ASSIGN_OR_RAISE(auto null_bitmap, has_values_.Finish());
+
+ if (!options_.skip_nulls) {
+ // ... and there were no nulls in that group
+ ARROW_ASSIGN_OR_RAISE(auto has_nulls, has_nulls_.Finish());
+ arrow::internal::BitmapAndNot(null_bitmap->data(), 0, has_nulls->data(), 0,
+ num_groups_, 0, null_bitmap->mutable_data());
+ }
+
+ auto mins = ArrayData::Make(type_, num_groups_, {null_bitmap, nullptr});
+ auto maxes = ArrayData::Make(type_, num_groups_, {std::move(null_bitmap), nullptr});
+ ARROW_ASSIGN_OR_RAISE(mins->buffers[1], mins_.Finish());
+ ARROW_ASSIGN_OR_RAISE(maxes->buffers[1], maxes_.Finish());
+
+ return ArrayData::Make(out_type(), num_groups_, {nullptr},
+ {std::move(mins), std::move(maxes)});
+ }
+
+ std::shared_ptr<DataType> out_type() const override {
+ return struct_({field("min", type_), field("max", type_)});
+ }
+
+ int64_t num_groups_;
+ BufferBuilder mins_, maxes_;
+ TypedBufferBuilder<bool> has_values_, has_nulls_;
+ std::shared_ptr<DataType> type_;
+ ConsumeImpl consume_impl_;
+ ResizeImpl resize_min_impl_, resize_max_impl_;
+ ScalarAggregateOptions options_;
+};
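+
+// Output shape sketch (illustrative): "hash_min_max" finalizes to one
+// struct<min: T, max: T> row per group. With skip_nulls=false a group that
+// saw any null becomes a null row (BitmapAndNot clears its validity bit);
+// with skip_nulls=true only empty groups are null.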
+
+template <typename Impl>
+HashAggregateKernel MakeKernel(InputType argument_type) {
+ HashAggregateKernel kernel;
+
+ kernel.init = [](KernelContext* ctx,
+ const KernelInitArgs& args) -> Result<std::unique_ptr<KernelState>> {
+ auto impl = ::arrow::internal::make_unique<Impl>();
+ // FIXME(bkietz) Init should not take a type. That should be an unboxed template arg
+ // for the Impl. Otherwise we're not exposing dispatch as well as we should.
+ RETURN_NOT_OK(impl->Init(ctx->exec_context(), args.options, args.inputs[0].type));
+ return std::move(impl);
+ };
+
+ kernel.signature = KernelSignature::Make(
+ {std::move(argument_type), InputType::Array(Type::UINT32),
+ InputType::Scalar(Type::UINT32)},
+ OutputType(
+ [](KernelContext* ctx, const std::vector<ValueDescr>&) -> Result<ValueDescr> {
+ return checked_cast<GroupedAggregator*>(ctx->state())->out_type();
+ }));
+
+ kernel.consume = [](KernelContext* ctx, const ExecBatch& batch) {
+ return checked_cast<GroupedAggregator*>(ctx->state())->Consume(batch);
+ };
+
+ kernel.merge = [](KernelContext* ctx, KernelState&&, KernelState*) {
+ // TODO(ARROW-11840) merge two hash tables
+ return Status::NotImplemented("Merge hashed aggregations");
+ };
+
+ kernel.finalize = [](KernelContext* ctx, Datum* out) {
+ ARROW_ASSIGN_OR_RAISE(*out,
+ checked_cast<GroupedAggregator*>(ctx->state())->Finalize());
+ return Status::OK();
+ };
+
+ return kernel;
+}
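+
+// Illustrative wiring: the kernels built here use the ternary signature
+// (argument, group_id_array, group_count), e.g.
+//
+//   auto kernel = MakeKernel<GroupedSumImpl>(ValueDescr::ARRAY);
+//   // kernel.init allocates a GroupedSumImpl; kernel.consume forwards each
+//   // ExecBatch {values, group_ids, num_groups} to GroupedAggregator::Consume.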
+
+Result<std::vector<const HashAggregateKernel*>> GetKernels(
+ ExecContext* ctx, const std::vector<Aggregate>& aggregates,
+ const std::vector<ValueDescr>& in_descrs) {
+ if (aggregates.size() != in_descrs.size()) {
+ return Status::Invalid(aggregates.size(), " aggregate functions were specified but ",
+ in_descrs.size(), " arguments were provided.");
+ }
+
+ std::vector<const HashAggregateKernel*> kernels(in_descrs.size());
+
+ for (size_t i = 0; i < aggregates.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(auto function,
+ ctx->func_registry()->GetFunction(aggregates[i].function));
+ ARROW_ASSIGN_OR_RAISE(
+ const Kernel* kernel,
+ function->DispatchExact(
+ {in_descrs[i], ValueDescr::Array(uint32()), ValueDescr::Scalar(uint32())}));
+ kernels[i] = static_cast<const HashAggregateKernel*>(kernel);
+ }
+ return kernels;
+}
+
+Result<std::vector<std::unique_ptr<KernelState>>> InitKernels(
+ const std::vector<const HashAggregateKernel*>& kernels, ExecContext* ctx,
+ const std::vector<Aggregate>& aggregates, const std::vector<ValueDescr>& in_descrs) {
+ std::vector<std::unique_ptr<KernelState>> states(kernels.size());
+
+ for (size_t i = 0; i < aggregates.size(); ++i) {
+ auto options = aggregates[i].options;
+
+ if (options == nullptr) {
+ // use known default options for the named function if possible
+ auto maybe_function = ctx->func_registry()->GetFunction(aggregates[i].function);
+ if (maybe_function.ok()) {
+ options = maybe_function.ValueOrDie()->default_options();
+ }
+ }
+
+ KernelContext kernel_ctx{ctx};
+ ARROW_ASSIGN_OR_RAISE(
+ states[i], kernels[i]->init(&kernel_ctx, KernelInitArgs{kernels[i],
+ {
+ in_descrs[i].type,
+ uint32(),
+ uint32(),
+ },
+ options}));
+ }
+
+ return std::move(states);
+}
+
+Result<FieldVector> ResolveKernels(
+ const std::vector<Aggregate>& aggregates,
+ const std::vector<const HashAggregateKernel*>& kernels,
+ const std::vector<std::unique_ptr<KernelState>>& states, ExecContext* ctx,
+ const std::vector<ValueDescr>& descrs) {
+ FieldVector fields(descrs.size());
+
+ for (size_t i = 0; i < kernels.size(); ++i) {
+ KernelContext kernel_ctx{ctx};
+ kernel_ctx.SetState(states[i].get());
+
+ ARROW_ASSIGN_OR_RAISE(auto descr, kernels[i]->signature->out_type().Resolve(
+ &kernel_ctx, {
+ descrs[i].type,
+ uint32(),
+ uint32(),
+ }));
+ fields[i] = field(aggregates[i].function, std::move(descr.type));
+ }
+ return fields;
+}
+
+} // namespace
+
+Result<std::unique_ptr<Grouper>> Grouper::Make(const std::vector<ValueDescr>& descrs,
+ ExecContext* ctx) {
+ if (GrouperFastImpl::CanUse(descrs)) {
+ return GrouperFastImpl::Make(descrs, ctx);
+ }
+ return GrouperImpl::Make(descrs, ctx);
+}
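+
+// Minimal Grouper sketch (illustrative; error handling elided):
+//
+//   ARROW_ASSIGN_OR_RAISE(auto grouper,
+//                         Grouper::Make({ValueDescr::Array(utf8())}, ctx));
+//   ARROW_ASSIGN_OR_RAISE(Datum ids, grouper->Consume(key_batch));
+//   // ids is a uint32 array mapping each row to a group id in
+//   // [0, grouper->num_groups()); GetUniques() returns one row per group.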
+
+Result<Datum> GroupBy(const std::vector<Datum>& arguments, const std::vector<Datum>& keys,
+ const std::vector<Aggregate>& aggregates, ExecContext* ctx) {
+ // Construct and initialize HashAggregateKernels
+ ARROW_ASSIGN_OR_RAISE(auto argument_descrs,
+ ExecBatch::Make(arguments).Map(
+ [](ExecBatch batch) { return batch.GetDescriptors(); }));
+
+ ARROW_ASSIGN_OR_RAISE(auto kernels, GetKernels(ctx, aggregates, argument_descrs));
+
+ ARROW_ASSIGN_OR_RAISE(auto states,
+ InitKernels(kernels, ctx, aggregates, argument_descrs));
+
+ ARROW_ASSIGN_OR_RAISE(
+ FieldVector out_fields,
+ ResolveKernels(aggregates, kernels, states, ctx, argument_descrs));
+
+ using arrow::compute::detail::ExecBatchIterator;
+
+ ARROW_ASSIGN_OR_RAISE(auto argument_batch_iterator,
+ ExecBatchIterator::Make(arguments, ctx->exec_chunksize()));
+
+ // Construct Grouper
+ ARROW_ASSIGN_OR_RAISE(auto key_descrs, ExecBatch::Make(keys).Map([](ExecBatch batch) {
+ return batch.GetDescriptors();
+ }));
+
+ ARROW_ASSIGN_OR_RAISE(auto grouper, Grouper::Make(key_descrs, ctx));
+
+ int i = 0;
+ for (ValueDescr& key_descr : key_descrs) {
+ out_fields.push_back(field("key_" + std::to_string(i++), std::move(key_descr.type)));
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto key_batch_iterator,
+ ExecBatchIterator::Make(keys, ctx->exec_chunksize()));
+
+ // start "streaming" execution
+ ExecBatch key_batch, argument_batch;
+ while (argument_batch_iterator->Next(&argument_batch) &&
+ key_batch_iterator->Next(&key_batch)) {
+ if (key_batch.length == 0) continue;
+
+ // compute a batch of group ids
+ ARROW_ASSIGN_OR_RAISE(Datum id_batch, grouper->Consume(key_batch));
+
+ // consume group ids with HashAggregateKernels
+ for (size_t i = 0; i < kernels.size(); ++i) {
+ KernelContext batch_ctx{ctx};
+ batch_ctx.SetState(states[i].get());
+ ARROW_ASSIGN_OR_RAISE(auto batch, ExecBatch::Make({argument_batch[i], id_batch,
+ Datum(grouper->num_groups())}));
+ RETURN_NOT_OK(kernels[i]->consume(&batch_ctx, batch));
+ }
+ }
+
+ // Finalize output
+ ArrayDataVector out_data(arguments.size() + keys.size());
+ auto it = out_data.begin();
+
+ for (size_t i = 0; i < kernels.size(); ++i) {
+ KernelContext batch_ctx{ctx};
+ batch_ctx.SetState(states[i].get());
+ Datum out;
+ RETURN_NOT_OK(kernels[i]->finalize(&batch_ctx, &out));
+ *it++ = out.array();
+ }
+
+ ARROW_ASSIGN_OR_RAISE(ExecBatch out_keys, grouper->GetUniques());
+ for (const auto& key : out_keys.values) {
+ *it++ = key.array();
+ }
+
+ int64_t length = out_data[0]->length;
+ return ArrayData::Make(struct_(std::move(out_fields)), length,
+ {/*null_bitmap=*/nullptr}, std::move(out_data),
+ /*null_count=*/0);
+}
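+
+// Illustrative call (assuming Aggregate aggregate-initializes as
+// {function_name, options}):
+//
+//   // roughly: SELECT count(v), sum(v), key FROM t GROUP BY key
+//   ARROW_ASSIGN_OR_RAISE(
+//       Datum out, GroupBy({v, v}, {key},
+//                          {{"hash_count", nullptr}, {"hash_sum", nullptr}}, ctx));
+//   // out is a struct array with one field per aggregate plus key_0, key_1, ...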
+
+Result<std::shared_ptr<ListArray>> Grouper::ApplyGroupings(const ListArray& groupings,
+ const Array& array,
+ ExecContext* ctx) {
+ ARROW_ASSIGN_OR_RAISE(Datum sorted,
+ compute::Take(array, groupings.data()->child_data[0],
+ TakeOptions::NoBoundsCheck(), ctx));
+
+ return std::make_shared<ListArray>(list(array.type()), groupings.length(),
+ groupings.value_offsets(), sorted.make_array());
+}
+
+Result<std::shared_ptr<ListArray>> Grouper::MakeGroupings(const UInt32Array& ids,
+ uint32_t num_groups,
+ ExecContext* ctx) {
+ if (ids.null_count() != 0) {
+ return Status::Invalid("MakeGroupings with null ids");
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto offsets, AllocateBuffer(sizeof(int32_t) * (num_groups + 1),
+ ctx->memory_pool()));
+ auto raw_offsets = reinterpret_cast<int32_t*>(offsets->mutable_data());
+
+ std::memset(raw_offsets, 0, offsets->size());
+ for (int i = 0; i < ids.length(); ++i) {
+ DCHECK_LT(ids.Value(i), num_groups);
+ raw_offsets[ids.Value(i)] += 1;
+ }
+ int32_t length = 0;
+ for (uint32_t id = 0; id < num_groups; ++id) {
+ auto offset = raw_offsets[id];
+ raw_offsets[id] = length;
+ length += offset;
+ }
+ raw_offsets[num_groups] = length;
+ DCHECK_EQ(ids.length(), length);
+
+ ARROW_ASSIGN_OR_RAISE(auto offsets_copy,
+ offsets->CopySlice(0, offsets->size(), ctx->memory_pool()));
+ raw_offsets = reinterpret_cast<int32_t*>(offsets_copy->mutable_data());
+
+ ARROW_ASSIGN_OR_RAISE(auto sort_indices, AllocateBuffer(sizeof(int32_t) * ids.length(),
+ ctx->memory_pool()));
+ auto raw_sort_indices = reinterpret_cast<int32_t*>(sort_indices->mutable_data());
+ for (int i = 0; i < ids.length(); ++i) {
+ raw_sort_indices[raw_offsets[ids.Value(i)]++] = i;
+ }
+
+ return std::make_shared<ListArray>(
+ list(int32()), num_groups, std::move(offsets),
+ std::make_shared<Int32Array>(ids.length(), std::move(sort_indices)));
+}
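+
+// Worked example (illustrative): for ids [0, 1, 0, 2] with num_groups 3, the
+// counting pass produces offsets [0, 2, 3, 4] and the scatter pass fills sort
+// indices [0, 2, 1, 3], i.e. groupings [[0, 2], [1], [3]]: row positions
+// bucketed by group id, ready for ApplyGroupings to Take() against.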
+
+namespace {
+const FunctionDoc hash_count_doc{"Count the number of null / non-null values",
+ ("By default, non-null values are counted.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array", "group_id_array", "group_count"},
+ "ScalarAggregateOptions"};
+
+const FunctionDoc hash_sum_doc{"Sum values of a numeric array",
+ ("Null values are ignored."),
+ {"array", "group_id_array", "group_count"}};
+
+const FunctionDoc hash_min_max_doc{
+ "Compute the minimum and maximum values of a numeric array",
+ ("Null values are ignored by default.\n"
+ "This can be changed through ScalarAggregateOptions."),
+ {"array", "group_id_array", "group_count"},
+ "ScalarAggregateOptions"};
+} // namespace
+
+void RegisterHashAggregateBasic(FunctionRegistry* registry) {
+ {
+ static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
+ auto func = std::make_shared<HashAggregateFunction>(
+ "hash_count", Arity::Ternary(), &hash_count_doc,
+ &default_scalar_aggregate_options);
+ DCHECK_OK(func->AddKernel(MakeKernel<GroupedCountImpl>(ValueDescr::ARRAY)));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+
+ {
+ auto func = std::make_shared<HashAggregateFunction>("hash_sum", Arity::Ternary(),
+ &hash_sum_doc);
+ DCHECK_OK(func->AddKernel(MakeKernel<GroupedSumImpl>(ValueDescr::ARRAY)));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+
+ {
+ static auto default_scalar_aggregate_options = ScalarAggregateOptions::Defaults();
+ auto func = std::make_shared<HashAggregateFunction>(
+ "hash_min_max", Arity::Ternary(), &hash_min_max_doc,
+ &default_scalar_aggregate_options);
+ DCHECK_OK(func->AddKernel(MakeKernel<GroupedMinMaxImpl>(ValueDescr::ARRAY)));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
index a8f1f82771b..a5d4a557740 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
@@ -15,17 +15,17 @@
// specific language governing permissions and limitations
// under the License.
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <utility>
-
-#include "arrow/compute/kernels/codegen_internal.h"
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <utility>
+
+#include "arrow/compute/kernels/codegen_internal.h"
#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/kernels/util_internal.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/decimal.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/decimal.h"
#include "arrow/util/int_util_internal.h"
#include "arrow/util/macros.h"
@@ -34,7 +34,7 @@ namespace arrow {
using internal::AddWithOverflow;
using internal::DivideWithOverflow;
using internal::MultiplyWithOverflow;
-using internal::NegateWithOverflow;
+using internal::NegateWithOverflow;
using internal::SubtractWithOverflow;
namespace compute {
@@ -42,8 +42,8 @@ namespace internal {
using applicator::ScalarBinaryEqualTypes;
using applicator::ScalarBinaryNotNullEqualTypes;
-using applicator::ScalarUnary;
-using applicator::ScalarUnaryNotNull;
+using applicator::ScalarUnary;
+using applicator::ScalarUnaryNotNull;
namespace {
@@ -55,169 +55,169 @@ template <typename T>
using is_signed_integer =
std::integral_constant<bool, std::is_integral<T>::value && std::is_signed<T>::value>;
-template <typename T, typename R = T>
-using enable_if_signed_integer = enable_if_t<is_signed_integer<T>::value, R>;
+template <typename T, typename R = T>
+using enable_if_signed_integer = enable_if_t<is_signed_integer<T>::value, R>;
-template <typename T, typename R = T>
-using enable_if_unsigned_integer = enable_if_t<is_unsigned_integer<T>::value, R>;
+template <typename T, typename R = T>
+using enable_if_unsigned_integer = enable_if_t<is_unsigned_integer<T>::value, R>;
-template <typename T, typename R = T>
+template <typename T, typename R = T>
using enable_if_integer =
- enable_if_t<is_signed_integer<T>::value || is_unsigned_integer<T>::value, R>;
+ enable_if_t<is_signed_integer<T>::value || is_unsigned_integer<T>::value, R>;
+
+template <typename T, typename R = T>
+using enable_if_floating_point = enable_if_t<std::is_floating_point<T>::value, R>;
-template <typename T, typename R = T>
-using enable_if_floating_point = enable_if_t<std::is_floating_point<T>::value, R>;
-
template <typename T>
-using enable_if_decimal =
- enable_if_t<std::is_same<Decimal128, T>::value || std::is_same<Decimal256, T>::value,
- T>;
+using enable_if_decimal =
+ enable_if_t<std::is_same<Decimal128, T>::value || std::is_same<Decimal256, T>::value,
+ T>;
template <typename T, typename Unsigned = typename std::make_unsigned<T>::type>
constexpr Unsigned to_unsigned(T signed_) {
return static_cast<Unsigned>(signed_);
}
-struct AbsoluteValue {
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, T arg, Status*) {
- return std::fabs(arg);
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, T arg, Status*) {
- return arg;
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_signed_integer<T> Call(KernelContext*, T arg, Status* st) {
- return (arg < 0) ? arrow::internal::SafeSignedNegate(arg) : arg;
- }
-};
-
-struct AbsoluteValueChecked {
- template <typename T, typename Arg>
- static enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == std::numeric_limits<Arg>::min()) {
- *st = Status::Invalid("overflow");
- return arg;
- }
- return std::abs(arg);
- }
-
- template <typename T, typename Arg>
- static enable_if_unsigned_integer<T> Call(KernelContext* ctx, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- return arg;
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- return std::fabs(arg);
- }
-};
-
+struct AbsoluteValue {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, T arg, Status*) {
+ return std::fabs(arg);
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, T arg, Status*) {
+ return arg;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, T arg, Status* st) {
+ return (arg < 0) ? arrow::internal::SafeSignedNegate(arg) : arg;
+ }
+};
+
+struct AbsoluteValueChecked {
+ template <typename T, typename Arg>
+ static enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == std::numeric_limits<Arg>::min()) {
+ *st = Status::Invalid("overflow");
+ return arg;
+ }
+ return std::abs(arg);
+ }
+
+ template <typename T, typename Arg>
+ static enable_if_unsigned_integer<T> Call(KernelContext* ctx, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ return arg;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ return std::fabs(arg);
+ }
+};
+
struct Add {
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
return left + right;
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 left,
- Arg1 right, Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 left,
+ Arg1 right, Status*) {
return left + right;
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
return arrow::internal::SafeSignedAdd(left, right);
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
- return left + right;
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left + right;
+ }
};
struct AddChecked {
template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
T result = 0;
if (ARROW_PREDICT_FALSE(AddWithOverflow(left, right, &result))) {
- *st = Status::Invalid("overflow");
+ *st = Status::Invalid("overflow");
}
return result;
}
template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
return left + right;
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
- return left + right;
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left + right;
+ }
};
struct Subtract {
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
return left - right;
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 left,
- Arg1 right, Status*) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 left,
+ Arg1 right, Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
return left - right;
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
return arrow::internal::SafeSignedSubtract(left, right);
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
- return left + (-right);
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left + (-right);
+ }
};
struct SubtractChecked {
template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
T result = 0;
if (ARROW_PREDICT_FALSE(SubtractWithOverflow(left, right, &result))) {
- *st = Status::Invalid("overflow");
+ *st = Status::Invalid("overflow");
}
return result;
}
template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
return left - right;
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
- return left + (-right);
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left + (-right);
+ }
};
struct Multiply {
@@ -230,23 +230,23 @@ struct Multiply {
static_assert(std::is_same<decltype(int64_t() * int64_t()), int64_t>::value, "");
static_assert(std::is_same<decltype(uint64_t() * uint64_t()), uint64_t>::value, "");
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, T left, T right,
- Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, T left, T right,
+ Status*) {
return left * right;
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_t<
- is_unsigned_integer<T>::value && !std::is_same<T, uint16_t>::value, T>
- Call(KernelContext*, T left, T right, Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_t<
+ is_unsigned_integer<T>::value && !std::is_same<T, uint16_t>::value, T>
+ Call(KernelContext*, T left, T right, Status*) {
return left * right;
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_t<
- is_signed_integer<T>::value && !std::is_same<T, int16_t>::value, T>
- Call(KernelContext*, T left, T right, Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_t<
+ is_signed_integer<T>::value && !std::is_same<T, int16_t>::value, T>
+ Call(KernelContext*, T left, T right, Status*) {
return to_unsigned(left) * to_unsigned(right);
}
@@ -254,593 +254,593 @@ struct Multiply {
// integer. However, some inputs may nevertheless overflow (which triggers undefined
// behaviour). Therefore we first cast to 32 bit unsigned integers where overflow is
// well defined.
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_same<T, int16_t, T> Call(KernelContext*, int16_t left,
- int16_t right, Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_same<T, int16_t, T> Call(KernelContext*, int16_t left,
+ int16_t right, Status*) {
return static_cast<uint32_t>(left) * static_cast<uint32_t>(right);
}
- template <typename T, typename Arg0, typename Arg1>
- static constexpr enable_if_same<T, uint16_t, T> Call(KernelContext*, uint16_t left,
- uint16_t right, Status*) {
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr enable_if_same<T, uint16_t, T> Call(KernelContext*, uint16_t left,
+ uint16_t right, Status*) {
return static_cast<uint32_t>(left) * static_cast<uint32_t>(right);
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
- return left * right;
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left * right;
+ }
};
struct MultiplyChecked {
template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
T result = 0;
if (ARROW_PREDICT_FALSE(MultiplyWithOverflow(left, right, &result))) {
- *st = Status::Invalid("overflow");
+ *st = Status::Invalid("overflow");
}
return result;
}
template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
return left * right;
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
- return left * right;
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status*) {
+ return left * right;
+ }
};
struct Divide {
template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status*) {
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status*) {
return left / right;
}
template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
T result;
if (ARROW_PREDICT_FALSE(DivideWithOverflow(left, right, &result))) {
if (right == 0) {
- *st = Status::Invalid("divide by zero");
+ *st = Status::Invalid("divide by zero");
} else {
result = 0;
}
}
return result;
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
- if (right == Arg1()) {
- *st = Status::Invalid("Divide by zero");
- return T();
- } else {
- return left / right;
- }
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ if (right == Arg1()) {
+ *st = Status::Invalid("Divide by zero");
+ return T();
+ } else {
+ return left / right;
+ }
+ }
};
struct DivideChecked {
template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
+ static enable_if_integer<T> Call(KernelContext*, Arg0 left, Arg1 right, Status* st) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
T result;
if (ARROW_PREDICT_FALSE(DivideWithOverflow(left, right, &result))) {
if (right == 0) {
- *st = Status::Invalid("divide by zero");
+ *st = Status::Invalid("divide by zero");
} else {
- *st = Status::Invalid("overflow");
+ *st = Status::Invalid("overflow");
}
}
return result;
}
template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
- Status* st) {
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 left, Arg1 right,
+ Status* st) {
static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
if (ARROW_PREDICT_FALSE(right == 0)) {
- *st = Status::Invalid("divide by zero");
+ *st = Status::Invalid("divide by zero");
return 0;
}
return left / right;
}
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_decimal<T> Call(KernelContext* ctx, Arg0 left, Arg1 right,
- Status* st) {
- return Divide::Call<T>(ctx, left, right, st);
- }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_decimal<T> Call(KernelContext* ctx, Arg0 left, Arg1 right,
+ Status* st) {
+ return Divide::Call<T>(ctx, left, right, st);
+ }
+};
+
+struct Negate {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return -arg;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg arg, Status*) {
+ return ~arg + 1;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status*) {
+ return arrow::internal::SafeSignedNegate(arg);
+ }
+};
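+
+// Note (illustrative): for unsigned integers ~arg + 1 is two's-complement
+// negation modulo 2^N, e.g. negating uint8_t{5} wraps to 251.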
+
+struct NegateChecked {
+ template <typename T, typename Arg>
+ static enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ T result = 0;
+ if (ARROW_PREDICT_FALSE(NegateWithOverflow(arg, &result))) {
+ *st = Status::Invalid("overflow");
+ }
+ return result;
+ }
+
+ template <typename T, typename Arg>
+ static enable_if_unsigned_integer<T> Call(KernelContext* ctx, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ DCHECK(false) << "This is included only for the purposes of instantiability from the "
+ "arithmetic kernel generator";
+ return 0;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ return -arg;
+ }
+};
+
+struct Power {
+ ARROW_NOINLINE
+ static uint64_t IntegerPower(uint64_t base, uint64_t exp) {
+ // right-to-left O(log n) exponentiation by squaring
+ uint64_t pow = 1;
+ while (exp) {
+ pow *= (exp & 1) ? base : 1;
+ base *= base;
+ exp >>= 1;
+ }
+ return pow;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(KernelContext*, T base, T exp, Status* st) {
+ if (exp < 0) {
+ *st = Status::Invalid("integers to negative integer powers are not allowed");
+ return 0;
+ }
+ return static_cast<T>(IntegerPower(base, exp));
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(KernelContext*, T base, T exp, Status*) {
+ return std::pow(base, exp);
+ }
+};
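+
+// Worked trace of IntegerPower (illustrative): base=3, exp=5 (binary 101):
+//   bit 0 set   -> pow = 3;   base = 9
+//   bit 1 clear -> pow = 3;   base = 81
+//   bit 2 set   -> pow = 243; base = 6561 (unused)
+// giving 3^5 = 243 in O(log exp) multiplications.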
+
+struct PowerChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(KernelContext*, Arg0 base, Arg1 exp, Status* st) {
+ if (exp < 0) {
+ *st = Status::Invalid("integers to negative integer powers are not allowed");
+ return 0;
+ } else if (exp == 0) {
+ return 1;
+ }
+ // left-to-right O(log n) exponentiation by squaring, with overflow checks
+ bool overflow = false;
+ uint64_t bitmask =
+ 1ULL << (63 - BitUtil::CountLeadingZeros(static_cast<uint64_t>(exp)));
+ T pow = 1;
+ while (bitmask) {
+ overflow |= MultiplyWithOverflow(pow, pow, &pow);
+ if (exp & bitmask) {
+ overflow |= MultiplyWithOverflow(pow, base, &pow);
+ }
+ bitmask >>= 1;
+ }
+ if (overflow) {
+ *st = Status::Invalid("overflow");
+ }
+ return pow;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(KernelContext*, Arg0 base, Arg1 exp, Status*) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
+ return std::pow(base, exp);
+ }
+};
+
+struct Sign {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return std::isnan(arg) ? arg : ((arg == 0) ? 0 : (std::signbit(arg) ? -1 : 1));
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg arg, Status*) {
+ return arg > 0;
+ }
+
+ template <typename T, typename Arg>
+ static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status*) {
+ return (arg > 0) ? 1 : ((arg == 0) ? 0 : -1);
+ }
+};
+
+// Bitwise operations
+
+struct BitWiseNot {
+ template <typename T, typename Arg>
+ static T Call(KernelContext*, Arg arg, Status*) {
+ return ~arg;
+ }
+};
+
+struct BitWiseAnd {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ return lhs & rhs;
+ }
+};
+
+struct BitWiseOr {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ return lhs | rhs;
+ }
+};
+
+struct BitWiseXor {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ return lhs ^ rhs;
+ }
+};
+
+struct ShiftLeft {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ using Unsigned = typename std::make_unsigned<Arg0>::type;
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ return lhs;
+ }
+ return static_cast<T>(static_cast<Unsigned>(lhs) << static_cast<Unsigned>(rhs));
+ }
+};
+
+// See SEI CERT C Coding Standard rule INT34-C
+struct ShiftLeftChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 lhs, Arg1 rhs,
+ Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
+ return lhs;
+ }
+ return lhs << rhs;
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_signed_integer<T> Call(KernelContext*, Arg0 lhs, Arg1 rhs,
+ Status* st) {
+ using Unsigned = typename std::make_unsigned<Arg0>::type;
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
+ return lhs;
+ }
+ // In C/C++ left shift of a negative number is undefined (C++11 standard 5.8.2)
+ // Mimic Java/etc. and treat left shift as based on two's complement representation
+ // Assumes two's complement machine
+ return static_cast<T>(static_cast<Unsigned>(lhs) << static_cast<Unsigned>(rhs));
+ }
+};
+
+struct ShiftRight {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ // Logical right shift when Arg0 is unsigned
+ // Arithmetic otherwise (this is implementation-defined but GCC and MSVC document this
+ // as arithmetic right shift)
+ // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
+ // https://docs.microsoft.com/en-us/cpp/cpp/left-shift-and-right-shift-operators-input-and-output?view=msvc-160
+ // Clang doesn't document their behavior.
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ return lhs;
+ }
+ return lhs >> rhs;
+ }
+};
+
+struct ShiftRightChecked {
+ template <typename T, typename Arg0, typename Arg1>
+ static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
+ *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
+ return lhs;
+ }
+ return lhs >> rhs;
+ }
+};
+
+struct Sin {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ return std::sin(val);
+ }
+};
+
+struct SinChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(std::isinf(val))) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ return std::sin(val);
+ }
+};
+
+struct Cos {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ return std::cos(val);
+ }
+};
+
+struct CosChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(std::isinf(val))) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ return std::cos(val);
+ }
+};
+
+struct Tan {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ return std::tan(val);
+ }
+};
+
+struct TanChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(std::isinf(val))) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ // Cannot raise range errors (overflow) since PI/2 is not exactly representable
+ return std::tan(val);
+ }
+};
+
+struct Asin {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(val < -1.0 || val > 1.0)) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::asin(val);
+ }
+};
+
+struct AsinChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE(val < -1.0 || val > 1.0)) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ return std::asin(val);
+ }
+};
+
+struct Acos {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE((val < -1.0 || val > 1.0))) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::acos(val);
+ }
+};
+
+struct AcosChecked {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ if (ARROW_PREDICT_FALSE((val < -1.0 || val > 1.0))) {
+ *st = Status::Invalid("domain error");
+ return val;
+ }
+ return std::acos(val);
+ }
+};
+
+struct Atan {
+ template <typename T, typename Arg0>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ return std::atan(val);
+ }
+};
+
+struct Atan2 {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 y, Arg1 x, Status*) {
+ static_assert(std::is_same<T, Arg0>::value, "");
+ static_assert(std::is_same<Arg0, Arg1>::value, "");
+ return std::atan2(y, x);
+ }
+};
+
+struct LogNatural {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ return -std::numeric_limits<T>::infinity();
+ } else if (arg < 0.0) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::log(arg);
+ }
+};
+
+struct LogNaturalChecked {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ *st = Status::Invalid("logarithm of zero");
+ return arg;
+ } else if (arg < 0.0) {
+ *st = Status::Invalid("logarithm of negative number");
+ return arg;
+ }
+ return std::log(arg);
+ }
+};
+
+struct Log10 {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ return -std::numeric_limits<T>::infinity();
+ } else if (arg < 0.0) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::log10(arg);
+ }
+};
+
+struct Log10Checked {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0) {
+ *st = Status::Invalid("logarithm of zero");
+ return arg;
+ } else if (arg < 0) {
+ *st = Status::Invalid("logarithm of negative number");
+ return arg;
+ }
+ return std::log10(arg);
+ }
+};
+
+struct Log2 {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ return -std::numeric_limits<T>::infinity();
+ } else if (arg < 0.0) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::log2(arg);
+ }
+};
+
+struct Log2Checked {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == 0.0) {
+ *st = Status::Invalid("logarithm of zero");
+ return arg;
+ } else if (arg < 0.0) {
+ *st = Status::Invalid("logarithm of negative number");
+ return arg;
+ }
+ return std::log2(arg);
+ }
+};
+
+struct Log1p {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == -1) {
+ return -std::numeric_limits<T>::infinity();
+ } else if (arg < -1) {
+ return std::numeric_limits<T>::quiet_NaN();
+ }
+ return std::log1p(arg);
+ }
+};
+
+struct Log1pChecked {
+ template <typename T, typename Arg>
+ static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
+ static_assert(std::is_same<T, Arg>::value, "");
+ if (arg == -1) {
+ *st = Status::Invalid("logarithm of zero");
+ return arg;
+ } else if (arg < -1) {
+ *st = Status::Invalid("logarithm of negative number");
+ return arg;
+ }
+ return std::log1p(arg);
+ }
+};
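+
+// Note (illustrative): log1p(arg) computes log(1 + arg), so arg == -1
+// corresponds to log(0) (hence "logarithm of zero") and arg < -1 to the
+// logarithm of a negative number.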
+
+struct Floor {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return std::floor(arg);
+ }
+};
+
+struct Ceil {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return std::ceil(arg);
+ }
+};
+
+struct Trunc {
+ template <typename T, typename Arg>
+ static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
+ return std::trunc(arg);
+ }
};
-struct Negate {
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
- return -arg;
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg arg, Status*) {
- return ~arg + 1;
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status*) {
- return arrow::internal::SafeSignedNegate(arg);
- }
-};
-
-struct NegateChecked {
- template <typename T, typename Arg>
- static enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- T result = 0;
- if (ARROW_PREDICT_FALSE(NegateWithOverflow(arg, &result))) {
- *st = Status::Invalid("overflow");
- }
- return result;
- }
-
- template <typename T, typename Arg>
- static enable_if_unsigned_integer<T> Call(KernelContext* ctx, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- DCHECK(false) << "This is included only for the purposes of instantiability from the "
- "arithmetic kernel generator";
- return 0;
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- return -arg;
- }
-};
-
-struct Power {
- ARROW_NOINLINE
- static uint64_t IntegerPower(uint64_t base, uint64_t exp) {
- // right-to-left O(log n) exponentiation by squaring
- uint64_t pow = 1;
- while (exp) {
- pow *= (exp & 1) ? base : 1;
- base *= base;
- exp >>= 1;
- }
- return pow;
- }
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, T base, T exp, Status* st) {
- if (exp < 0) {
- *st = Status::Invalid("integers to negative integer powers are not allowed");
- return 0;
- }
- return static_cast<T>(IntegerPower(base, exp));
- }
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, T base, T exp, Status*) {
- return std::pow(base, exp);
- }
-};
-
-struct PowerChecked {
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(KernelContext*, Arg0 base, Arg1 exp, Status* st) {
- if (exp < 0) {
- *st = Status::Invalid("integers to negative integer powers are not allowed");
- return 0;
- } else if (exp == 0) {
- return 1;
- }
-    // left-to-right O(log n) power with overflow checks
- bool overflow = false;
- uint64_t bitmask =
- 1ULL << (63 - BitUtil::CountLeadingZeros(static_cast<uint64_t>(exp)));
- T pow = 1;
- while (bitmask) {
- overflow |= MultiplyWithOverflow(pow, pow, &pow);
- if (exp & bitmask) {
- overflow |= MultiplyWithOverflow(pow, base, &pow);
- }
- bitmask >>= 1;
- }
- if (overflow) {
- *st = Status::Invalid("overflow");
- }
- return pow;
- }
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(KernelContext*, Arg0 base, Arg1 exp, Status*) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<T, Arg1>::value, "");
- return std::pow(base, exp);
- }
-};
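-
-// Worked example (illustrative) of the left-to-right scan above: starting at
-// the most significant exponent bit keeps the invariant pow == base^(bits
-// consumed so far), so every square and multiply goes through
-// MultiplyWithOverflow and overflow is detected at the step it occurs.
-// For base = 2, exp = 10 = 0b1010:
-//   bitmask 0b1000 (set):   pow = 1*1 * 2 = 2
-//   bitmask 0b0100 (clear): pow = 2*2     = 4
-//   bitmask 0b0010 (set):   pow = 4*4 * 2 = 32
-//   bitmask 0b0001 (clear): pow = 32*32   = 1024 -> 2^10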
-
-struct Sign {
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
- return std::isnan(arg) ? arg : ((arg == 0) ? 0 : (std::signbit(arg) ? -1 : 1));
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_unsigned_integer<T> Call(KernelContext*, Arg arg, Status*) {
- return arg > 0;
- }
-
- template <typename T, typename Arg>
- static constexpr enable_if_signed_integer<T> Call(KernelContext*, Arg arg, Status*) {
- return (arg > 0) ? 1 : ((arg == 0) ? 0 : -1);
- }
-};
-
-// Bitwise operations
-
-struct BitWiseNot {
- template <typename T, typename Arg>
- static T Call(KernelContext*, Arg arg, Status*) {
- return ~arg;
- }
-};
-
-struct BitWiseAnd {
- template <typename T, typename Arg0, typename Arg1>
- static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
- return lhs & rhs;
- }
-};
-
-struct BitWiseOr {
- template <typename T, typename Arg0, typename Arg1>
- static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
- return lhs | rhs;
- }
-};
-
-struct BitWiseXor {
- template <typename T, typename Arg0, typename Arg1>
- static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
- return lhs ^ rhs;
- }
-};
-
-struct ShiftLeft {
- template <typename T, typename Arg0, typename Arg1>
- static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
- using Unsigned = typename std::make_unsigned<Arg0>::type;
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
- return lhs;
- }
- return static_cast<T>(static_cast<Unsigned>(lhs) << static_cast<Unsigned>(rhs));
- }
-};
-
-// See SEI CERT C Coding Standard rule INT34-C
-struct ShiftLeftChecked {
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_unsigned_integer<T> Call(KernelContext*, Arg0 lhs, Arg1 rhs,
- Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
- *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
- return lhs;
- }
- return lhs << rhs;
- }
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_signed_integer<T> Call(KernelContext*, Arg0 lhs, Arg1 rhs,
- Status* st) {
- using Unsigned = typename std::make_unsigned<Arg0>::type;
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
- *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
- return lhs;
- }
- // In C/C++ left shift of a negative number is undefined (C++11 standard 5.8.2)
- // Mimic Java/etc. and treat left shift as based on two's complement representation
- // Assumes two's complement machine
- return static_cast<T>(static_cast<Unsigned>(lhs) << static_cast<Unsigned>(rhs));
- }
-};
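-
-// Illustrative example (assumes a two's complement machine, as above):
-// shifting the signed value directly would be undefined behavior per
-// INT34-C, so the kernel shifts the unsigned bit pattern and casts back:
-//   int8_t lhs = -1;  // bits 0xFF
-//   static_cast<int8_t>(static_cast<uint8_t>(lhs) << 1)  // bits 0xFE -> -2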
-
-struct ShiftRight {
- template <typename T, typename Arg0, typename Arg1>
- static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- // Logical right shift when Arg0 is unsigned
- // Arithmetic otherwise (this is implementation-defined but GCC and MSVC document this
- // as arithmetic right shift)
- // https://gcc.gnu.org/onlinedocs/gcc/Integers-implementation.html#Integers-implementation
- // https://docs.microsoft.com/en-us/cpp/cpp/left-shift-and-right-shift-operators-input-and-output?view=msvc-160
-    // Clang doesn't document its behavior.
- if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
- return lhs;
- }
- return lhs >> rhs;
- }
-};
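-
-// Illustrative example of the logical/arithmetic distinction noted above:
-//   int8_t(-8) >> 1     // arithmetic: bits 0xF8 -> 0xFC, i.e. -4
-//   uint8_t(0xF8) >> 1  // logical:    bits 0xF8 -> 0x7C, i.e. 124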
-
-struct ShiftRightChecked {
- template <typename T, typename Arg0, typename Arg1>
- static T Call(KernelContext*, Arg0 lhs, Arg1 rhs, Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(rhs < 0 || rhs >= std::numeric_limits<Arg0>::digits)) {
- *st = Status::Invalid("shift amount must be >= 0 and less than precision of type");
- return lhs;
- }
- return lhs >> rhs;
- }
-};
-
-struct Sin {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- return std::sin(val);
- }
-};
-
-struct SinChecked {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(std::isinf(val))) {
- *st = Status::Invalid("domain error");
- return val;
- }
- return std::sin(val);
- }
-};
-
-struct Cos {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- return std::cos(val);
- }
-};
-
-struct CosChecked {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(std::isinf(val))) {
- *st = Status::Invalid("domain error");
- return val;
- }
- return std::cos(val);
- }
-};
-
-struct Tan {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- return std::tan(val);
- }
-};
-
-struct TanChecked {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(std::isinf(val))) {
- *st = Status::Invalid("domain error");
- return val;
- }
- // Cannot raise range errors (overflow) since PI/2 is not exactly representable
- return std::tan(val);
- }
-};
-
-struct Asin {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(val < -1.0 || val > 1.0)) {
- return std::numeric_limits<T>::quiet_NaN();
- }
- return std::asin(val);
- }
-};
-
-struct AsinChecked {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE(val < -1.0 || val > 1.0)) {
- *st = Status::Invalid("domain error");
- return val;
- }
- return std::asin(val);
- }
-};
-
-struct Acos {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE((val < -1.0 || val > 1.0))) {
- return std::numeric_limits<T>::quiet_NaN();
- }
- return std::acos(val);
- }
-};
-
-struct AcosChecked {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status* st) {
- static_assert(std::is_same<T, Arg0>::value, "");
- if (ARROW_PREDICT_FALSE((val < -1.0 || val > 1.0))) {
- *st = Status::Invalid("domain error");
- return val;
- }
- return std::acos(val);
- }
-};
-
-struct Atan {
- template <typename T, typename Arg0>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 val, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- return std::atan(val);
- }
-};
-
-struct Atan2 {
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<Arg0, T> Call(KernelContext*, Arg0 y, Arg1 x, Status*) {
- static_assert(std::is_same<T, Arg0>::value, "");
- static_assert(std::is_same<Arg0, Arg1>::value, "");
- return std::atan2(y, x);
- }
-};
-
-struct LogNatural {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == 0.0) {
- return -std::numeric_limits<T>::infinity();
- } else if (arg < 0.0) {
- return std::numeric_limits<T>::quiet_NaN();
- }
- return std::log(arg);
- }
-};
-
-struct LogNaturalChecked {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == 0.0) {
- *st = Status::Invalid("logarithm of zero");
- return arg;
- } else if (arg < 0.0) {
- *st = Status::Invalid("logarithm of negative number");
- return arg;
- }
- return std::log(arg);
- }
-};
-
-struct Log10 {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == 0.0) {
- return -std::numeric_limits<T>::infinity();
- } else if (arg < 0.0) {
- return std::numeric_limits<T>::quiet_NaN();
- }
- return std::log10(arg);
- }
-};
-
-struct Log10Checked {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == 0) {
- *st = Status::Invalid("logarithm of zero");
- return arg;
- } else if (arg < 0) {
- *st = Status::Invalid("logarithm of negative number");
- return arg;
- }
- return std::log10(arg);
- }
-};
-
-struct Log2 {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == 0.0) {
- return -std::numeric_limits<T>::infinity();
- } else if (arg < 0.0) {
- return std::numeric_limits<T>::quiet_NaN();
- }
- return std::log2(arg);
- }
-};
-
-struct Log2Checked {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == 0.0) {
- *st = Status::Invalid("logarithm of zero");
- return arg;
- } else if (arg < 0.0) {
- *st = Status::Invalid("logarithm of negative number");
- return arg;
- }
- return std::log2(arg);
- }
-};
-
-struct Log1p {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status*) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == -1) {
- return -std::numeric_limits<T>::infinity();
- } else if (arg < -1) {
- return std::numeric_limits<T>::quiet_NaN();
- }
- return std::log1p(arg);
- }
-};
-
-struct Log1pChecked {
- template <typename T, typename Arg>
- static enable_if_floating_point<Arg, T> Call(KernelContext*, Arg arg, Status* st) {
- static_assert(std::is_same<T, Arg>::value, "");
- if (arg == -1) {
- *st = Status::Invalid("logarithm of zero");
- return arg;
- } else if (arg < -1) {
- *st = Status::Invalid("logarithm of negative number");
- return arg;
- }
- return std::log1p(arg);
- }
-};
-
-struct Floor {
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
- return std::floor(arg);
- }
-};
-
-struct Ceil {
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
- return std::ceil(arg);
- }
-};
-
-struct Trunc {
- template <typename T, typename Arg>
- static constexpr enable_if_floating_point<T> Call(KernelContext*, Arg arg, Status*) {
- return std::trunc(arg);
- }
-};
-
// Generate a kernel given an arithmetic functor
template <template <typename... Args> class KernelGenerator, typename Op>
-ArrayKernelExec ArithmeticExecFromOp(detail::GetTypeId get_id) {
+ArrayKernelExec ArithmeticExecFromOp(detail::GetTypeId get_id) {
switch (get_id.id) {
case Type::INT8:
return KernelGenerator<Int8Type, Int8Type, Op>::Exec;
@@ -869,321 +869,321 @@ ArrayKernelExec ArithmeticExecFromOp(detail::GetTypeId get_id) {
}
}
-// Generate a kernel given a bitwise arithmetic functor. Assumes the
-// functor treats all integer types of equal width identically
-template <template <typename... Args> class KernelGenerator, typename Op>
-ArrayKernelExec TypeAgnosticBitWiseExecFromOp(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::INT8:
- case Type::UINT8:
- return KernelGenerator<UInt8Type, UInt8Type, Op>::Exec;
- case Type::INT16:
- case Type::UINT16:
- return KernelGenerator<UInt16Type, UInt16Type, Op>::Exec;
- case Type::INT32:
- case Type::UINT32:
- return KernelGenerator<UInt32Type, UInt32Type, Op>::Exec;
- case Type::INT64:
- case Type::UINT64:
- return KernelGenerator<UInt64Type, UInt64Type, Op>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
-template <template <typename... Args> class KernelGenerator, typename Op>
-ArrayKernelExec ShiftExecFromOp(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::INT8:
- return KernelGenerator<Int8Type, Int8Type, Op>::Exec;
- case Type::UINT8:
- return KernelGenerator<UInt8Type, UInt8Type, Op>::Exec;
- case Type::INT16:
- return KernelGenerator<Int16Type, Int16Type, Op>::Exec;
- case Type::UINT16:
- return KernelGenerator<UInt16Type, UInt16Type, Op>::Exec;
- case Type::INT32:
- return KernelGenerator<Int32Type, Int32Type, Op>::Exec;
- case Type::UINT32:
- return KernelGenerator<UInt32Type, UInt32Type, Op>::Exec;
- case Type::INT64:
- return KernelGenerator<Int64Type, Int64Type, Op>::Exec;
- case Type::UINT64:
- return KernelGenerator<UInt64Type, UInt64Type, Op>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
-template <template <typename... Args> class KernelGenerator, typename Op>
-ArrayKernelExec GenerateArithmeticFloatingPoint(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::FLOAT:
- return KernelGenerator<FloatType, FloatType, Op>::Exec;
- case Type::DOUBLE:
- return KernelGenerator<DoubleType, DoubleType, Op>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
-Status CastBinaryDecimalArgs(const std::string& func_name,
- std::vector<ValueDescr>* values) {
- auto& left_type = (*values)[0].type;
- auto& right_type = (*values)[1].type;
- DCHECK(is_decimal(left_type->id()) || is_decimal(right_type->id()));
-
- // decimal + float = float
- if (is_floating(left_type->id())) {
- right_type = left_type;
- return Status::OK();
- } else if (is_floating(right_type->id())) {
- left_type = right_type;
- return Status::OK();
- }
-
- // precision, scale of left and right args
- int32_t p1, s1, p2, s2;
-
- // decimal + integer = decimal
- if (is_decimal(left_type->id())) {
- auto decimal = checked_cast<const DecimalType*>(left_type.get());
- p1 = decimal->precision();
- s1 = decimal->scale();
- } else {
- DCHECK(is_integer(left_type->id()));
- p1 = static_cast<int32_t>(std::ceil(std::log10(bit_width(left_type->id()))));
- s1 = 0;
- }
- if (is_decimal(right_type->id())) {
- auto decimal = checked_cast<const DecimalType*>(right_type.get());
- p2 = decimal->precision();
- s2 = decimal->scale();
- } else {
- DCHECK(is_integer(right_type->id()));
- p2 = static_cast<int32_t>(std::ceil(std::log10(bit_width(right_type->id()))));
- s2 = 0;
- }
- if (s1 < 0 || s2 < 0) {
- return Status::NotImplemented("Decimals with negative scales not supported");
- }
-
- // decimal128 + decimal256 = decimal256
- Type::type casted_type_id = Type::DECIMAL128;
- if (left_type->id() == Type::DECIMAL256 || right_type->id() == Type::DECIMAL256) {
- casted_type_id = Type::DECIMAL256;
- }
-
-  // decimal promotion rules compatible with Amazon Redshift
- // https://docs.aws.amazon.com/redshift/latest/dg/r_numeric_computations201.html
- int32_t left_scaleup, right_scaleup;
-
- // "add_checked" -> "add"
- const std::string op = func_name.substr(0, func_name.find("_"));
- if (op == "add" || op == "subtract") {
- left_scaleup = std::max(s1, s2) - s1;
- right_scaleup = std::max(s1, s2) - s2;
- } else if (op == "multiply") {
- left_scaleup = right_scaleup = 0;
- } else if (op == "divide") {
- left_scaleup = std::max(4, s1 + p2 - s2 + 1) + s2 - s1;
- right_scaleup = 0;
- } else {
- return Status::Invalid("Invalid decimal function: ", func_name);
- }
-
- ARROW_ASSIGN_OR_RAISE(
- left_type, DecimalType::Make(casted_type_id, p1 + left_scaleup, s1 + left_scaleup));
- ARROW_ASSIGN_OR_RAISE(right_type, DecimalType::Make(casted_type_id, p2 + right_scaleup,
- s2 + right_scaleup));
- return Status::OK();
-}
-
-// resolve decimal binary operation output type per *casted* args
-template <typename OutputGetter>
-Result<ValueDescr> ResolveDecimalBinaryOperationOutput(
- const std::vector<ValueDescr>& args, OutputGetter&& getter) {
- // casted args should be same size decimals
- auto left_type = checked_cast<const DecimalType*>(args[0].type.get());
- auto right_type = checked_cast<const DecimalType*>(args[1].type.get());
- DCHECK_EQ(left_type->id(), right_type->id());
-
- int32_t precision, scale;
- std::tie(precision, scale) = getter(left_type->precision(), left_type->scale(),
- right_type->precision(), right_type->scale());
- ARROW_ASSIGN_OR_RAISE(auto type, DecimalType::Make(left_type->id(), precision, scale));
- return ValueDescr(std::move(type), GetBroadcastShape(args));
-}
-
-Result<ValueDescr> ResolveDecimalAdditionOrSubtractionOutput(
- KernelContext*, const std::vector<ValueDescr>& args) {
- return ResolveDecimalBinaryOperationOutput(
- args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
- DCHECK_EQ(s1, s2);
- const int32_t scale = s1;
- const int32_t precision = std::max(p1 - s1, p2 - s2) + scale + 1;
- return std::make_pair(precision, scale);
- });
-}
-
-Result<ValueDescr> ResolveDecimalMultiplicationOutput(
- KernelContext*, const std::vector<ValueDescr>& args) {
- return ResolveDecimalBinaryOperationOutput(
- args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
- const int32_t scale = s1 + s2;
- const int32_t precision = p1 + p2 + 1;
- return std::make_pair(precision, scale);
- });
-}
-
-Result<ValueDescr> ResolveDecimalDivisionOutput(KernelContext*,
- const std::vector<ValueDescr>& args) {
- return ResolveDecimalBinaryOperationOutput(
- args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
- DCHECK_GE(s1, s2);
- const int32_t scale = s1 - s2;
- const int32_t precision = p1;
- return std::make_pair(precision, scale);
- });
-}
-
+// Generate a kernel given a bitwise arithmetic functor. Assumes the
+// functor treats all integer types of equal width identically
+template <template <typename... Args> class KernelGenerator, typename Op>
+ArrayKernelExec TypeAgnosticBitWiseExecFromOp(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ case Type::UINT8:
+ return KernelGenerator<UInt8Type, UInt8Type, Op>::Exec;
+ case Type::INT16:
+ case Type::UINT16:
+ return KernelGenerator<UInt16Type, UInt16Type, Op>::Exec;
+ case Type::INT32:
+ case Type::UINT32:
+ return KernelGenerator<UInt32Type, UInt32Type, Op>::Exec;
+ case Type::INT64:
+ case Type::UINT64:
+ return KernelGenerator<UInt64Type, UInt64Type, Op>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
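+
+// Illustrative consequence: "bit_wise_and" on two int8 arrays executes the
+// UInt8Type kernel. This is sound because &, |, ^ and ~ operate on the raw
+// bit pattern only, and reinterpreting int8 as uint8 (two's complement) is
+// lossless in both directions.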
+
+template <template <typename... Args> class KernelGenerator, typename Op>
+ArrayKernelExec ShiftExecFromOp(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return KernelGenerator<Int8Type, Int8Type, Op>::Exec;
+ case Type::UINT8:
+ return KernelGenerator<UInt8Type, UInt8Type, Op>::Exec;
+ case Type::INT16:
+ return KernelGenerator<Int16Type, Int16Type, Op>::Exec;
+ case Type::UINT16:
+ return KernelGenerator<UInt16Type, UInt16Type, Op>::Exec;
+ case Type::INT32:
+ return KernelGenerator<Int32Type, Int32Type, Op>::Exec;
+ case Type::UINT32:
+ return KernelGenerator<UInt32Type, UInt32Type, Op>::Exec;
+ case Type::INT64:
+ return KernelGenerator<Int64Type, Int64Type, Op>::Exec;
+ case Type::UINT64:
+ return KernelGenerator<UInt64Type, UInt64Type, Op>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+template <template <typename... Args> class KernelGenerator, typename Op>
+ArrayKernelExec GenerateArithmeticFloatingPoint(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::FLOAT:
+ return KernelGenerator<FloatType, FloatType, Op>::Exec;
+ case Type::DOUBLE:
+ return KernelGenerator<DoubleType, DoubleType, Op>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
+
+Status CastBinaryDecimalArgs(const std::string& func_name,
+ std::vector<ValueDescr>* values) {
+ auto& left_type = (*values)[0].type;
+ auto& right_type = (*values)[1].type;
+ DCHECK(is_decimal(left_type->id()) || is_decimal(right_type->id()));
+
+ // decimal + float = float
+ if (is_floating(left_type->id())) {
+ right_type = left_type;
+ return Status::OK();
+ } else if (is_floating(right_type->id())) {
+ left_type = right_type;
+ return Status::OK();
+ }
+
+ // precision, scale of left and right args
+ int32_t p1, s1, p2, s2;
+
+ // decimal + integer = decimal
+ if (is_decimal(left_type->id())) {
+ auto decimal = checked_cast<const DecimalType*>(left_type.get());
+ p1 = decimal->precision();
+ s1 = decimal->scale();
+ } else {
+ DCHECK(is_integer(left_type->id()));
+ p1 = static_cast<int32_t>(std::ceil(std::log10(bit_width(left_type->id()))));
+ s1 = 0;
+ }
+ if (is_decimal(right_type->id())) {
+ auto decimal = checked_cast<const DecimalType*>(right_type.get());
+ p2 = decimal->precision();
+ s2 = decimal->scale();
+ } else {
+ DCHECK(is_integer(right_type->id()));
+ p2 = static_cast<int32_t>(std::ceil(std::log10(bit_width(right_type->id()))));
+ s2 = 0;
+ }
+ if (s1 < 0 || s2 < 0) {
+ return Status::NotImplemented("Decimals with negative scales not supported");
+ }
+
+ // decimal128 + decimal256 = decimal256
+ Type::type casted_type_id = Type::DECIMAL128;
+ if (left_type->id() == Type::DECIMAL256 || right_type->id() == Type::DECIMAL256) {
+ casted_type_id = Type::DECIMAL256;
+ }
+
+  // decimal promotion rules compatible with Amazon Redshift
+ // https://docs.aws.amazon.com/redshift/latest/dg/r_numeric_computations201.html
+ int32_t left_scaleup, right_scaleup;
+
+ // "add_checked" -> "add"
+ const std::string op = func_name.substr(0, func_name.find("_"));
+ if (op == "add" || op == "subtract") {
+ left_scaleup = std::max(s1, s2) - s1;
+ right_scaleup = std::max(s1, s2) - s2;
+ } else if (op == "multiply") {
+ left_scaleup = right_scaleup = 0;
+ } else if (op == "divide") {
+ left_scaleup = std::max(4, s1 + p2 - s2 + 1) + s2 - s1;
+ right_scaleup = 0;
+ } else {
+ return Status::Invalid("Invalid decimal function: ", func_name);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(
+ left_type, DecimalType::Make(casted_type_id, p1 + left_scaleup, s1 + left_scaleup));
+ ARROW_ASSIGN_OR_RAISE(right_type, DecimalType::Make(casted_type_id, p2 + right_scaleup,
+ s2 + right_scaleup));
+ return Status::OK();
+}
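+
+// Worked example (illustrative): for "add" with decimal128(5, 2) and
+// decimal128(7, 4), max(s1, s2) = 4, so the left argument is rescaled by
+// 4 - 2 = 2 to decimal128(7, 4) and the right argument is unchanged; both
+// operands then share scale 4, as the decimal addition kernels require.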
+
+// resolve decimal binary operation output type per *casted* args
+template <typename OutputGetter>
+Result<ValueDescr> ResolveDecimalBinaryOperationOutput(
+ const std::vector<ValueDescr>& args, OutputGetter&& getter) {
+ // casted args should be same size decimals
+ auto left_type = checked_cast<const DecimalType*>(args[0].type.get());
+ auto right_type = checked_cast<const DecimalType*>(args[1].type.get());
+ DCHECK_EQ(left_type->id(), right_type->id());
+
+ int32_t precision, scale;
+ std::tie(precision, scale) = getter(left_type->precision(), left_type->scale(),
+ right_type->precision(), right_type->scale());
+ ARROW_ASSIGN_OR_RAISE(auto type, DecimalType::Make(left_type->id(), precision, scale));
+ return ValueDescr(std::move(type), GetBroadcastShape(args));
+}
+
+Result<ValueDescr> ResolveDecimalAdditionOrSubtractionOutput(
+ KernelContext*, const std::vector<ValueDescr>& args) {
+ return ResolveDecimalBinaryOperationOutput(
+ args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
+ DCHECK_EQ(s1, s2);
+ const int32_t scale = s1;
+ const int32_t precision = std::max(p1 - s1, p2 - s2) + scale + 1;
+ return std::make_pair(precision, scale);
+ });
+}
+
+Result<ValueDescr> ResolveDecimalMultiplicationOutput(
+ KernelContext*, const std::vector<ValueDescr>& args) {
+ return ResolveDecimalBinaryOperationOutput(
+ args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
+ const int32_t scale = s1 + s2;
+ const int32_t precision = p1 + p2 + 1;
+ return std::make_pair(precision, scale);
+ });
+}
+
+Result<ValueDescr> ResolveDecimalDivisionOutput(KernelContext*,
+ const std::vector<ValueDescr>& args) {
+ return ResolveDecimalBinaryOperationOutput(
+ args, [](int32_t p1, int32_t s1, int32_t p2, int32_t s2) {
+ DCHECK_GE(s1, s2);
+ const int32_t scale = s1 - s2;
+ const int32_t precision = p1;
+ return std::make_pair(precision, scale);
+ });
+}
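+
+// Worked examples for the resolvers above (illustrative), continuing with
+// decimal128(5, 2) and decimal128(7, 4) operands:
+//   add/subtract: args cast to (7, 4) first; scale = 4,
+//                 precision = max(7 - 4, 7 - 4) + 4 + 1 = 8 -> decimal128(8, 4)
+//   multiply:     args uncast; scale = 2 + 4 = 6,
+//                 precision = 5 + 7 + 1 = 13               -> decimal128(13, 6)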
+
+template <typename Op>
+void AddDecimalBinaryKernels(const std::string& name,
+ std::shared_ptr<ScalarFunction>* func) {
+ OutputType out_type(null());
+ const std::string op = name.substr(0, name.find("_"));
+ if (op == "add" || op == "subtract") {
+ out_type = OutputType(ResolveDecimalAdditionOrSubtractionOutput);
+ } else if (op == "multiply") {
+ out_type = OutputType(ResolveDecimalMultiplicationOutput);
+ } else if (op == "divide") {
+ out_type = OutputType(ResolveDecimalDivisionOutput);
+ } else {
+ DCHECK(false);
+ }
+
+ auto in_type128 = InputType(Type::DECIMAL128);
+ auto in_type256 = InputType(Type::DECIMAL256);
+ auto exec128 = ScalarBinaryNotNullEqualTypes<Decimal128Type, Decimal128Type, Op>::Exec;
+ auto exec256 = ScalarBinaryNotNullEqualTypes<Decimal256Type, Decimal256Type, Op>::Exec;
+ DCHECK_OK((*func)->AddKernel({in_type128, in_type128}, out_type, exec128));
+ DCHECK_OK((*func)->AddKernel({in_type256, in_type256}, out_type, exec256));
+}
+
+// Generate a kernel given an arithmetic functor
+template <template <typename...> class KernelGenerator, typename OutType, typename Op>
+ArrayKernelExec GenerateArithmeticWithFixedIntOutType(detail::GetTypeId get_id) {
+ switch (get_id.id) {
+ case Type::INT8:
+ return KernelGenerator<OutType, Int8Type, Op>::Exec;
+ case Type::UINT8:
+ return KernelGenerator<OutType, UInt8Type, Op>::Exec;
+ case Type::INT16:
+ return KernelGenerator<OutType, Int16Type, Op>::Exec;
+ case Type::UINT16:
+ return KernelGenerator<OutType, UInt16Type, Op>::Exec;
+ case Type::INT32:
+ return KernelGenerator<OutType, Int32Type, Op>::Exec;
+ case Type::UINT32:
+ return KernelGenerator<OutType, UInt32Type, Op>::Exec;
+ case Type::INT64:
+ case Type::TIMESTAMP:
+ return KernelGenerator<OutType, Int64Type, Op>::Exec;
+ case Type::UINT64:
+ return KernelGenerator<OutType, UInt64Type, Op>::Exec;
+ case Type::FLOAT:
+ return KernelGenerator<FloatType, FloatType, Op>::Exec;
+ case Type::DOUBLE:
+ return KernelGenerator<DoubleType, DoubleType, Op>::Exec;
+ default:
+ DCHECK(false);
+ return ExecFail;
+ }
+}
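+
+// Note: TIMESTAMP shares the Int64Type case above because timestamps are
+// stored as 64-bit integers, so the int64 kernel body can be reused while the
+// fixed integer OutType is still emitted; float and double inputs instead
+// keep their own type as the output.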
+
+struct ArithmeticFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+
+ RETURN_NOT_OK(CheckDecimals(values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ EnsureDictionaryDecoded(values);
+
+ // Only promote types for binary functions
+ if (values->size() == 2) {
+ ReplaceNullWithOtherType(values);
+
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ }
+ }
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+
+ Status CheckDecimals(std::vector<ValueDescr>* values) const {
+ bool has_decimal = false;
+ for (const auto& value : *values) {
+ if (is_decimal(value.type->id())) {
+ has_decimal = true;
+ break;
+ }
+ }
+ if (!has_decimal) return Status::OK();
+
+ if (values->size() == 2) {
+ return CastBinaryDecimalArgs(name(), values);
+ }
+ return Status::OK();
+ }
+};
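+
+// Illustrative dispatch trace (hypothetical call): for add(int8, float64)
+// there is no exact (int8, float64) kernel, so after dictionary decoding and
+// null replacement CommonNumeric() yields float64, both descriptors are
+// replaced with float64, and the (float64, float64) kernel is found on the
+// second DispatchExactImpl pass.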
+
+/// An ArithmeticFunction that promotes integer arguments to double.
+struct ArithmeticFloatingPointFunction : public ArithmeticFunction {
+ using ArithmeticFunction::ArithmeticFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+ RETURN_NOT_OK(CheckDecimals(values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ EnsureDictionaryDecoded(values);
+
+ if (values->size() == 2) {
+ ReplaceNullWithOtherType(values);
+ }
+
+ for (auto& descr : *values) {
+ if (is_integer(descr.type->id())) {
+ descr.type = float64();
+ }
+ }
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ }
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
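+
+// Illustrative consequence: for a unary function such as "ln", an int32
+// argument is rewritten to float64 before dispatch, so ln(int32) runs the
+// float64 kernel, while ln(float32) keeps float32 because only integer
+// descriptors are replaced.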
+
template <typename Op>
-void AddDecimalBinaryKernels(const std::string& name,
- std::shared_ptr<ScalarFunction>* func) {
- OutputType out_type(null());
- const std::string op = name.substr(0, name.find("_"));
- if (op == "add" || op == "subtract") {
- out_type = OutputType(ResolveDecimalAdditionOrSubtractionOutput);
- } else if (op == "multiply") {
- out_type = OutputType(ResolveDecimalMultiplicationOutput);
- } else if (op == "divide") {
- out_type = OutputType(ResolveDecimalDivisionOutput);
- } else {
- DCHECK(false);
- }
-
- auto in_type128 = InputType(Type::DECIMAL128);
- auto in_type256 = InputType(Type::DECIMAL256);
- auto exec128 = ScalarBinaryNotNullEqualTypes<Decimal128Type, Decimal128Type, Op>::Exec;
- auto exec256 = ScalarBinaryNotNullEqualTypes<Decimal256Type, Decimal256Type, Op>::Exec;
- DCHECK_OK((*func)->AddKernel({in_type128, in_type128}, out_type, exec128));
- DCHECK_OK((*func)->AddKernel({in_type256, in_type256}, out_type, exec256));
-}
-
-// Generate a kernel given an arithmetic functor
-template <template <typename...> class KernelGenerator, typename OutType, typename Op>
-ArrayKernelExec GenerateArithmeticWithFixedIntOutType(detail::GetTypeId get_id) {
- switch (get_id.id) {
- case Type::INT8:
- return KernelGenerator<OutType, Int8Type, Op>::Exec;
- case Type::UINT8:
- return KernelGenerator<OutType, UInt8Type, Op>::Exec;
- case Type::INT16:
- return KernelGenerator<OutType, Int16Type, Op>::Exec;
- case Type::UINT16:
- return KernelGenerator<OutType, UInt16Type, Op>::Exec;
- case Type::INT32:
- return KernelGenerator<OutType, Int32Type, Op>::Exec;
- case Type::UINT32:
- return KernelGenerator<OutType, UInt32Type, Op>::Exec;
- case Type::INT64:
- case Type::TIMESTAMP:
- return KernelGenerator<OutType, Int64Type, Op>::Exec;
- case Type::UINT64:
- return KernelGenerator<OutType, UInt64Type, Op>::Exec;
- case Type::FLOAT:
- return KernelGenerator<FloatType, FloatType, Op>::Exec;
- case Type::DOUBLE:
- return KernelGenerator<DoubleType, DoubleType, Op>::Exec;
- default:
- DCHECK(false);
- return ExecFail;
- }
-}
-
-struct ArithmeticFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- RETURN_NOT_OK(CheckArity(*values));
-
- RETURN_NOT_OK(CheckDecimals(values));
-
- using arrow::compute::detail::DispatchExactImpl;
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
-
- EnsureDictionaryDecoded(values);
-
- // Only promote types for binary functions
- if (values->size() == 2) {
- ReplaceNullWithOtherType(values);
-
- if (auto type = CommonNumeric(*values)) {
- ReplaceTypes(type, values);
- }
- }
-
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-
- Status CheckDecimals(std::vector<ValueDescr>* values) const {
- bool has_decimal = false;
- for (const auto& value : *values) {
- if (is_decimal(value.type->id())) {
- has_decimal = true;
- break;
- }
- }
- if (!has_decimal) return Status::OK();
-
- if (values->size() == 2) {
- return CastBinaryDecimalArgs(name(), values);
- }
- return Status::OK();
- }
-};
-
-/// An ArithmeticFunction that promotes integer arguments to double.
-struct ArithmeticFloatingPointFunction : public ArithmeticFunction {
- using ArithmeticFunction::ArithmeticFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- RETURN_NOT_OK(CheckArity(*values));
- RETURN_NOT_OK(CheckDecimals(values));
-
- using arrow::compute::detail::DispatchExactImpl;
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
-
- EnsureDictionaryDecoded(values);
-
- if (values->size() == 2) {
- ReplaceNullWithOtherType(values);
- }
-
- for (auto& descr : *values) {
- if (is_integer(descr.type->id())) {
- descr.type = float64();
- }
- }
- if (auto type = CommonNumeric(*values)) {
- ReplaceTypes(type, values);
- }
-
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-};
-
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeArithmeticFunction(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
+std::shared_ptr<ScalarFunction> MakeArithmeticFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
for (const auto& ty : NumericTypes()) {
- auto exec = ArithmeticExecFromOp<ScalarBinaryEqualTypes, Op>(ty);
+ auto exec = ArithmeticExecFromOp<ScalarBinaryEqualTypes, Op>(ty);
DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
}
return func;
@@ -1192,630 +1192,630 @@ std::shared_ptr<ScalarFunction> MakeArithmeticFunction(std::string name,
// Like MakeArithmeticFunction, but for arithmetic ops that need to run
// only on non-null output.
template <typename Op>
-std::shared_ptr<ScalarFunction> MakeArithmeticFunctionNotNull(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
+std::shared_ptr<ScalarFunction> MakeArithmeticFunctionNotNull(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ auto exec = ArithmeticExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ auto exec = ArithmeticExecFromOp<ScalarUnary, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, ty, exec));
+ }
+ return func;
+}
+
+// Like MakeUnaryArithmeticFunction, but for unary arithmetic ops with a fixed
+// output type for integral inputs.
+template <typename Op, typename IntOutType>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionWithFixedIntOutType(
+ std::string name, const FunctionDoc* doc) {
+ auto int_out_ty = TypeTraits<IntOutType>::type_singleton();
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ auto out_ty = arrow::is_floating(ty->id()) ? ty : int_out_ty;
+ auto exec = GenerateArithmeticWithFixedIntOutType<ScalarUnary, IntOutType, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, out_ty, exec));
+ }
+ return func;
+}
+
+// Like MakeUnaryArithmeticFunction, but for arithmetic ops that need to run
+// only on non-null output.
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionNotNull(
+ std::string name, const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
for (const auto& ty : NumericTypes()) {
- auto exec = ArithmeticExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
+ auto exec = ArithmeticExecFromOp<ScalarUnaryNotNull, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, ty, exec));
+ }
+ return func;
+}
+
+// Like MakeUnaryArithmeticFunction, but for signed arithmetic ops that need to run
+// only on non-null output.
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnarySignedArithmeticFunctionNotNull(
+ std::string name, const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : NumericTypes()) {
+ if (!arrow::is_unsigned_integer(ty->id())) {
+ auto exec = ArithmeticExecFromOp<ScalarUnaryNotNull, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, ty, exec));
+ }
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeBitWiseFunctionNotNull(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
+ for (const auto& ty : IntTypes()) {
+ auto exec = TypeAgnosticBitWiseExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
}
return func;
}
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunction(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
- for (const auto& ty : NumericTypes()) {
- auto exec = ArithmeticExecFromOp<ScalarUnary, Op>(ty);
- DCHECK_OK(func->AddKernel({ty}, ty, exec));
- }
- return func;
-}
-
-// Like MakeUnaryArithmeticFunction, but for unary arithmetic ops with a fixed
-// output type for integral inputs.
-template <typename Op, typename IntOutType>
-std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionWithFixedIntOutType(
- std::string name, const FunctionDoc* doc) {
- auto int_out_ty = TypeTraits<IntOutType>::type_singleton();
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
- for (const auto& ty : NumericTypes()) {
- auto out_ty = arrow::is_floating(ty->id()) ? ty : int_out_ty;
- auto exec = GenerateArithmeticWithFixedIntOutType<ScalarUnary, IntOutType, Op>(ty);
- DCHECK_OK(func->AddKernel({ty}, out_ty, exec));
- }
- return func;
-}
-
-// Like MakeUnaryArithmeticFunction, but for arithmetic ops that need to run
-// only on non-null output.
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionNotNull(
- std::string name, const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
- for (const auto& ty : NumericTypes()) {
- auto exec = ArithmeticExecFromOp<ScalarUnaryNotNull, Op>(ty);
- DCHECK_OK(func->AddKernel({ty}, ty, exec));
- }
- return func;
-}
-
-// Like MakeUnaryArithmeticFunction, but for signed arithmetic ops that need to run
-// only on non-null output.
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeUnarySignedArithmeticFunctionNotNull(
- std::string name, const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Unary(), doc);
- for (const auto& ty : NumericTypes()) {
- if (!arrow::is_unsigned_integer(ty->id())) {
- auto exec = ArithmeticExecFromOp<ScalarUnaryNotNull, Op>(ty);
- DCHECK_OK(func->AddKernel({ty}, ty, exec));
- }
- }
- return func;
-}
-
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeBitWiseFunctionNotNull(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
- for (const auto& ty : IntTypes()) {
- auto exec = TypeAgnosticBitWiseExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
- DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
- }
- return func;
-}
-
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeShiftFunctionNotNull(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
- for (const auto& ty : IntTypes()) {
- auto exec = ShiftExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
- DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
- }
- return func;
-}
-
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionFloatingPoint(
- std::string name, const FunctionDoc* doc) {
- auto func =
- std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Unary(), doc);
- for (const auto& ty : FloatingPointTypes()) {
- auto output = is_integer(ty->id()) ? float64() : ty;
- auto exec = GenerateArithmeticFloatingPoint<ScalarUnary, Op>(ty);
- DCHECK_OK(func->AddKernel({ty}, output, exec));
- }
- return func;
-}
-
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionFloatingPointNotNull(
- std::string name, const FunctionDoc* doc) {
- auto func =
- std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Unary(), doc);
- for (const auto& ty : FloatingPointTypes()) {
- auto output = is_integer(ty->id()) ? float64() : ty;
- auto exec = GenerateArithmeticFloatingPoint<ScalarUnaryNotNull, Op>(ty);
- DCHECK_OK(func->AddKernel({ty}, output, exec));
- }
- return func;
-}
-
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeArithmeticFunctionFloatingPoint(
- std::string name, const FunctionDoc* doc) {
- auto func =
- std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Binary(), doc);
- for (const auto& ty : FloatingPointTypes()) {
- auto output = is_integer(ty->id()) ? float64() : ty;
- auto exec = GenerateArithmeticFloatingPoint<ScalarBinaryEqualTypes, Op>(ty);
- DCHECK_OK(func->AddKernel({ty, ty}, output, exec));
- }
- return func;
-}
-
-const FunctionDoc absolute_value_doc{
- "Calculate the absolute value of the argument element-wise",
- ("Results will wrap around on integer overflow.\n"
- "Use function \"abs_checked\" if you want overflow\n"
- "to return an error."),
- {"x"}};
-
-const FunctionDoc absolute_value_checked_doc{
- "Calculate the absolute value of the argument element-wise",
- ("This function returns an error on overflow. For a variant that\n"
- "doesn't fail on overflow, use function \"abs\"."),
- {"x"}};
-
-const FunctionDoc add_doc{"Add the arguments element-wise",
- ("Results will wrap around on integer overflow.\n"
- "Use function \"add_checked\" if you want overflow\n"
- "to return an error."),
- {"x", "y"}};
-
-const FunctionDoc add_checked_doc{
- "Add the arguments element-wise",
- ("This function returns an error on overflow. For a variant that\n"
- "doesn't fail on overflow, use function \"add\"."),
- {"x", "y"}};
-
-const FunctionDoc sub_doc{"Subtract the arguments element-wise",
- ("Results will wrap around on integer overflow.\n"
- "Use function \"subtract_checked\" if you want overflow\n"
- "to return an error."),
- {"x", "y"}};
-
-const FunctionDoc sub_checked_doc{
- "Subtract the arguments element-wise",
- ("This function returns an error on overflow. For a variant that\n"
- "doesn't fail on overflow, use function \"subtract\"."),
- {"x", "y"}};
-
-const FunctionDoc mul_doc{"Multiply the arguments element-wise",
- ("Results will wrap around on integer overflow.\n"
- "Use function \"multiply_checked\" if you want overflow\n"
- "to return an error."),
- {"x", "y"}};
-
-const FunctionDoc mul_checked_doc{
- "Multiply the arguments element-wise",
- ("This function returns an error on overflow. For a variant that\n"
- "doesn't fail on overflow, use function \"multiply\"."),
- {"x", "y"}};
-
-const FunctionDoc div_doc{
- "Divide the arguments element-wise",
- ("Integer division by zero returns an error. However, integer overflow\n"
- "wraps around, and floating-point division by zero returns an infinite.\n"
- "Use function \"divide_checked\" if you want to get an error\n"
- "in all the aforementioned cases."),
- {"dividend", "divisor"}};
-
-const FunctionDoc div_checked_doc{
- "Divide the arguments element-wise",
- ("An error is returned when trying to divide by zero, or when\n"
- "integer overflow is encountered."),
- {"dividend", "divisor"}};
-
-const FunctionDoc negate_doc{"Negate the argument element-wise",
- ("Results will wrap around on integer overflow.\n"
- "Use function \"negate_checked\" if you want overflow\n"
- "to return an error."),
- {"x"}};
-
-const FunctionDoc negate_checked_doc{
- "Negate the arguments element-wise",
- ("This function returns an error on overflow. For a variant that\n"
- "doesn't fail on overflow, use function \"negate\"."),
- {"x"}};
-
-const FunctionDoc pow_doc{
- "Raise arguments to power element-wise",
- ("Integer to negative integer power returns an error. However, integer overflow\n"
- "wraps around. If either base or exponent is null the result will be null."),
- {"base", "exponent"}};
-
-const FunctionDoc pow_checked_doc{
- "Raise arguments to power element-wise",
- ("An error is returned when integer to negative integer power is encountered,\n"
- "or integer overflow is encountered."),
- {"base", "exponent"}};
-
-const FunctionDoc sign_doc{
- "Get the signedness of the arguments element-wise",
- ("Output is any of (-1,1) for nonzero inputs and 0 for zero input.\n"
- "NaN values return NaN. Integral values return signedness as Int8 and\n"
- "floating-point values return it with the same type as the input values."),
- {"x"}};
-
-const FunctionDoc bit_wise_not_doc{
- "Bit-wise negate the arguments element-wise", "Null values return null.", {"x"}};
-
-const FunctionDoc bit_wise_and_doc{
- "Bit-wise AND the arguments element-wise", "Null values return null.", {"x", "y"}};
-
-const FunctionDoc bit_wise_or_doc{
- "Bit-wise OR the arguments element-wise", "Null values return null.", {"x", "y"}};
-
-const FunctionDoc bit_wise_xor_doc{
- "Bit-wise XOR the arguments element-wise", "Null values return null.", {"x", "y"}};
-
-const FunctionDoc shift_left_doc{
- "Left shift `x` by `y`",
- ("This function will return `x` if `y` (the amount to shift by) is: "
- "(1) negative or (2) greater than or equal to the precision of `x`.\n"
- "The shift operates as if on the two's complement representation of the number. "
- "In other words, this is equivalent to multiplying `x` by 2 to the power `y`, "
- "even if overflow occurs.\n"
- "Use function \"shift_left_checked\" if you want an invalid shift amount to "
- "return an error."),
- {"x", "y"}};
-
-const FunctionDoc shift_left_checked_doc{
- "Left shift `x` by `y` with invalid shift check",
- ("This function will raise an error if `y` (the amount to shift by) is: "
- "(1) negative or (2) greater than or equal to the precision of `x`. "
- "The shift operates as if on the two's complement representation of the number. "
- "In other words, this is equivalent to multiplying `x` by 2 to the power `y`, "
- "even if overflow occurs.\n"
- "See \"shift_left\" for a variant that doesn't fail for an invalid shift amount."),
- {"x", "y"}};
-
-const FunctionDoc shift_right_doc{
- "Right shift `x` by `y`",
- ("Perform a logical shift for unsigned `x` and an arithmetic shift for signed `x`.\n"
- "This function will return `x` if `y` (the amount to shift by) is: "
- "(1) negative or (2) greater than or equal to the precision of `x`.\n"
- "Use function \"shift_right_checked\" if you want an invalid shift amount to return "
- "an error."),
- {"x", "y"}};
-
-const FunctionDoc shift_right_checked_doc{
- "Right shift `x` by `y` with invalid shift check",
- ("Perform a logical shift for unsigned `x` and an arithmetic shift for signed `x`.\n"
- "This function will raise an error if `y` (the amount to shift by) is: "
- "(1) negative or (2) greater than or equal to the precision of `x`.\n"
- "See \"shift_right\" for a variant that doesn't fail for an invalid shift amount"),
- {"x", "y"}};
-
-const FunctionDoc sin_doc{"Compute the sine of the argument element-wise",
- ("Integer arguments return double values. "
- "This function returns NaN on values outside its domain. "
- "To raise an error instead, see \"sin_checked\"."),
- {"x"}};
-
-const FunctionDoc sin_checked_doc{
- "Compute the sine of the elements argument-wise",
- ("Integer arguments return double values. "
- "This function raises an error on values outside its domain. "
- "To return NaN instead, see \"sin\"."),
- {"x"}};
-
-const FunctionDoc cos_doc{"Compute the cosine of the argument element-wise",
- ("Integer arguments return double values. "
- "This function returns NaN on values outside its domain. "
- "To raise an error instead, see \"cos_checked\"."),
- {"x"}};
-
-const FunctionDoc cos_checked_doc{
- "Compute the cosine of the elements argument-wise",
- ("Integer arguments return double values. "
- "This function raises an error on values outside its domain. "
- "To return NaN instead, see \"cos\"."),
- {"x"}};
-
-const FunctionDoc tan_doc{"Compute the tangent of the argument element-wise",
- ("Integer arguments return double values. "
- "This function returns NaN on values outside its domain. "
- "To raise an error instead, see \"tan_checked\"."),
- {"x"}};
-
-const FunctionDoc tan_checked_doc{
- "Compute the tangent of the elements argument-wise",
- ("Integer arguments return double values. "
- "This function raises an error on values outside its domain. "
- "To return NaN instead, see \"tan\"."),
- {"x"}};
-
-const FunctionDoc asin_doc{"Compute the inverse sine of the argument element-wise",
- ("Integer arguments return double values. "
- "This function returns NaN on values outside its domain. "
- "To raise an error instead, see \"asin_checked\"."),
- {"x"}};
-
-const FunctionDoc asin_checked_doc{
- "Compute the inverse sine of the elements argument-wise",
- ("Integer arguments return double values. "
- "This function raises an error on values outside its domain. "
- "To return NaN instead, see \"asin\"."),
- {"x"}};
-
-const FunctionDoc acos_doc{"Compute the inverse cosine of the argument element-wise",
- ("Integer arguments return double values. "
- "This function returns NaN on values outside its domain. "
- "To raise an error instead, see \"acos_checked\"."),
- {"x"}};
-
-const FunctionDoc acos_checked_doc{
- "Compute the inverse cosine of the elements argument-wise",
- ("Integer arguments return double values. "
- "This function raises an error on values outside its domain. "
- "To return NaN instead, see \"acos\"."),
- {"x"}};
-
-const FunctionDoc atan_doc{"Compute the principal value of the inverse tangent",
- "Integer arguments return double values.",
- {"x"}};
-
-const FunctionDoc atan2_doc{
- "Compute the inverse tangent using argument signs to determine the quadrant",
- "Integer arguments return double values.",
- {"y", "x"}};
-
-const FunctionDoc ln_doc{
- "Compute natural log of arguments element-wise",
- ("Non-positive values return -inf or NaN. Null values return null.\n"
- "Use function \"ln_checked\" if you want non-positive values to raise an error."),
- {"x"}};
-
-const FunctionDoc ln_checked_doc{
- "Compute natural log of arguments element-wise",
- ("Non-positive values return -inf or NaN. Null values return null.\n"
- "Use function \"ln\" if you want non-positive values to return "
- "-inf or NaN."),
- {"x"}};
-
-const FunctionDoc log10_doc{
- "Compute log base 10 of arguments element-wise",
- ("Non-positive values return -inf or NaN. Null values return null.\n"
- "Use function \"log10_checked\" if you want non-positive values to raise an error."),
- {"x"}};
-
-const FunctionDoc log10_checked_doc{
- "Compute log base 10 of arguments element-wise",
- ("Non-positive values return -inf or NaN. Null values return null.\n"
- "Use function \"log10\" if you want non-positive values to return "
- "-inf or NaN."),
- {"x"}};
-
-const FunctionDoc log2_doc{
- "Compute log base 2 of arguments element-wise",
- ("Non-positive values return -inf or NaN. Null values return null.\n"
- "Use function \"log2_checked\" if you want non-positive values to raise an error."),
- {"x"}};
-
-const FunctionDoc log2_checked_doc{
- "Compute log base 2 of arguments element-wise",
- ("Non-positive values return -inf or NaN. Null values return null.\n"
- "Use function \"log2\" if you want non-positive values to return "
- "-inf or NaN."),
- {"x"}};
-
-const FunctionDoc log1p_doc{
- "Compute natural log of (1+x) element-wise",
- ("Values <= -1 return -inf or NaN. Null values return null.\n"
- "This function may be more precise than log(1 + x) for x close to zero."
- "Use function \"log1p_checked\" if you want non-positive values to raise an error."),
- {"x"}};
-
-const FunctionDoc log1p_checked_doc{
- "Compute natural log of (1+x) element-wise",
- ("Values <= -1 return -inf or NaN. Null values return null.\n"
- "This function may be more precise than log(1 + x) for x close to zero."
- "Use function \"log1p\" if you want non-positive values to return "
- "-inf or NaN."),
- {"x"}};
-
-const FunctionDoc floor_doc{
- "Round down to the nearest integer",
- ("Calculate the nearest integer less than or equal in magnitude to the "
- "argument element-wise"),
- {"x"}};
-
-const FunctionDoc ceil_doc{
- "Round up to the nearest integer",
- ("Calculate the nearest integer greater than or equal in magnitude to the "
- "argument element-wise"),
- {"x"}};
-
-const FunctionDoc trunc_doc{
- "Get the integral part without fractional digits",
- ("Calculate the nearest integer not greater in magnitude than to the "
- "argument element-wise."),
- {"x"}};
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeShiftFunctionNotNull(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ArithmeticFunction>(name, Arity::Binary(), doc);
+ for (const auto& ty : IntTypes()) {
+ auto exec = ShiftExecFromOp<ScalarBinaryNotNullEqualTypes, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty, ty}, ty, exec));
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionFloatingPoint(
+ std::string name, const FunctionDoc* doc) {
+ auto func =
+ std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : FloatingPointTypes()) {
+ auto output = is_integer(ty->id()) ? float64() : ty;
+ auto exec = GenerateArithmeticFloatingPoint<ScalarUnary, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, output, exec));
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeUnaryArithmeticFunctionFloatingPointNotNull(
+ std::string name, const FunctionDoc* doc) {
+ auto func =
+ std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Unary(), doc);
+ for (const auto& ty : FloatingPointTypes()) {
+ auto output = is_integer(ty->id()) ? float64() : ty;
+ auto exec = GenerateArithmeticFloatingPoint<ScalarUnaryNotNull, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty}, output, exec));
+ }
+ return func;
+}
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeArithmeticFunctionFloatingPoint(
+ std::string name, const FunctionDoc* doc) {
+ auto func =
+ std::make_shared<ArithmeticFloatingPointFunction>(name, Arity::Binary(), doc);
+ for (const auto& ty : FloatingPointTypes()) {
+ auto output = is_integer(ty->id()) ? float64() : ty;
+ auto exec = GenerateArithmeticFloatingPoint<ScalarBinaryEqualTypes, Op>(ty);
+ DCHECK_OK(func->AddKernel({ty, ty}, output, exec));
+ }
+ return func;
+}
+
+const FunctionDoc absolute_value_doc{
+ "Calculate the absolute value of the argument element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"abs_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x"}};
+
+const FunctionDoc absolute_value_checked_doc{
+ "Calculate the absolute value of the argument element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"abs\"."),
+ {"x"}};
+
+const FunctionDoc add_doc{"Add the arguments element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"add_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x", "y"}};
+
+const FunctionDoc add_checked_doc{
+ "Add the arguments element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"add\"."),
+ {"x", "y"}};
+
+const FunctionDoc sub_doc{"Subtract the arguments element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"subtract_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x", "y"}};
+
+const FunctionDoc sub_checked_doc{
+ "Subtract the arguments element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"subtract\"."),
+ {"x", "y"}};
+
+const FunctionDoc mul_doc{"Multiply the arguments element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"multiply_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x", "y"}};
+
+const FunctionDoc mul_checked_doc{
+ "Multiply the arguments element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"multiply\"."),
+ {"x", "y"}};
+
+const FunctionDoc div_doc{
+ "Divide the arguments element-wise",
+ ("Integer division by zero returns an error. However, integer overflow\n"
+    "wraps around, and floating-point division by zero returns infinity.\n"
+ "Use function \"divide_checked\" if you want to get an error\n"
+ "in all the aforementioned cases."),
+ {"dividend", "divisor"}};
+
+const FunctionDoc div_checked_doc{
+ "Divide the arguments element-wise",
+ ("An error is returned when trying to divide by zero, or when\n"
+ "integer overflow is encountered."),
+ {"dividend", "divisor"}};
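+// Worked example: divide(1, 0) errors for integer inputs, divide(1.0, 0.0)
+// yields +inf, and divide_checked reports an error in both cases.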
+
+const FunctionDoc negate_doc{"Negate the argument element-wise",
+ ("Results will wrap around on integer overflow.\n"
+ "Use function \"negate_checked\" if you want overflow\n"
+ "to return an error."),
+ {"x"}};
+
+const FunctionDoc negate_checked_doc{
+    "Negate the argument element-wise",
+ ("This function returns an error on overflow. For a variant that\n"
+ "doesn't fail on overflow, use function \"negate\"."),
+ {"x"}};
+
+const FunctionDoc pow_doc{
+ "Raise arguments to power element-wise",
+ ("Integer to negative integer power returns an error. However, integer overflow\n"
+ "wraps around. If either base or exponent is null the result will be null."),
+ {"base", "exponent"}};
+
+const FunctionDoc pow_checked_doc{
+ "Raise arguments to power element-wise",
+ ("An error is returned when integer to negative integer power is encountered,\n"
+ "or integer overflow is encountered."),
+ {"base", "exponent"}};
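+// Worked example: power(2, -2) errors for integer inputs (negative
+// exponent), while power(2.0, -2.0) == 0.25 for floating-point inputs.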
+
+const FunctionDoc sign_doc{
+    "Get the signedness of the argument element-wise",
+    ("Output is any of (-1, 1) for nonzero inputs and 0 for zero input.\n"
+ "NaN values return NaN. Integral values return signedness as Int8 and\n"
+ "floating-point values return it with the same type as the input values."),
+ {"x"}};
+
+const FunctionDoc bit_wise_not_doc{
+    "Bit-wise negate the argument element-wise", "Null values return null.", {"x"}};
+
+const FunctionDoc bit_wise_and_doc{
+ "Bit-wise AND the arguments element-wise", "Null values return null.", {"x", "y"}};
+
+const FunctionDoc bit_wise_or_doc{
+ "Bit-wise OR the arguments element-wise", "Null values return null.", {"x", "y"}};
+
+const FunctionDoc bit_wise_xor_doc{
+ "Bit-wise XOR the arguments element-wise", "Null values return null.", {"x", "y"}};
+
+const FunctionDoc shift_left_doc{
+ "Left shift `x` by `y`",
+ ("This function will return `x` if `y` (the amount to shift by) is: "
+ "(1) negative or (2) greater than or equal to the precision of `x`.\n"
+ "The shift operates as if on the two's complement representation of the number. "
+ "In other words, this is equivalent to multiplying `x` by 2 to the power `y`, "
+ "even if overflow occurs.\n"
+ "Use function \"shift_left_checked\" if you want an invalid shift amount to "
+ "return an error."),
+ {"x", "y"}};
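+// Worked example: shift_left(1, 3) == 8, while shift_left(x, 64) on int64
+// returns `x` unchanged since the shift amount equals the precision of `x`.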
+
+const FunctionDoc shift_left_checked_doc{
+ "Left shift `x` by `y` with invalid shift check",
+ ("This function will raise an error if `y` (the amount to shift by) is: "
+ "(1) negative or (2) greater than or equal to the precision of `x`. "
+ "The shift operates as if on the two's complement representation of the number. "
+ "In other words, this is equivalent to multiplying `x` by 2 to the power `y`, "
+ "even if overflow occurs.\n"
+ "See \"shift_left\" for a variant that doesn't fail for an invalid shift amount."),
+ {"x", "y"}};
+
+const FunctionDoc shift_right_doc{
+ "Right shift `x` by `y`",
+ ("Perform a logical shift for unsigned `x` and an arithmetic shift for signed `x`.\n"
+ "This function will return `x` if `y` (the amount to shift by) is: "
+ "(1) negative or (2) greater than or equal to the precision of `x`.\n"
+ "Use function \"shift_right_checked\" if you want an invalid shift amount to return "
+ "an error."),
+ {"x", "y"}};
+
+const FunctionDoc shift_right_checked_doc{
+ "Right shift `x` by `y` with invalid shift check",
+ ("Perform a logical shift for unsigned `x` and an arithmetic shift for signed `x`.\n"
+ "This function will raise an error if `y` (the amount to shift by) is: "
+ "(1) negative or (2) greater than or equal to the precision of `x`.\n"
+    "See \"shift_right\" for a variant that doesn't fail for an invalid shift amount."),
+ {"x", "y"}};
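+// Worked example: shift_right(int8_t{-8}, 1) == -4 (arithmetic shift for
+// signed input), while shift_right(uint8_t{248}, 1) == 124 (logical shift).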
+
+const FunctionDoc sin_doc{"Compute the sine of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"sin_checked\"."),
+ {"x"}};
+
+const FunctionDoc sin_checked_doc{
+    "Compute the sine of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"sin\"."),
+ {"x"}};
+
+const FunctionDoc cos_doc{"Compute the cosine of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"cos_checked\"."),
+ {"x"}};
+
+const FunctionDoc cos_checked_doc{
+    "Compute the cosine of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"cos\"."),
+ {"x"}};
+
+const FunctionDoc tan_doc{"Compute the tangent of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"tan_checked\"."),
+ {"x"}};
+
+const FunctionDoc tan_checked_doc{
+    "Compute the tangent of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"tan\"."),
+ {"x"}};
+
+const FunctionDoc asin_doc{"Compute the inverse sine of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"asin_checked\"."),
+ {"x"}};
+
+const FunctionDoc asin_checked_doc{
+    "Compute the inverse sine of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"asin\"."),
+ {"x"}};
+
+const FunctionDoc acos_doc{"Compute the inverse cosine of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function returns NaN on values outside its domain. "
+ "To raise an error instead, see \"acos_checked\"."),
+ {"x"}};
+
+const FunctionDoc acos_checked_doc{
+    "Compute the inverse cosine of the argument element-wise",
+ ("Integer arguments return double values. "
+ "This function raises an error on values outside its domain. "
+ "To return NaN instead, see \"acos\"."),
+ {"x"}};
+
+const FunctionDoc atan_doc{"Compute the principal value of the inverse tangent",
+ "Integer arguments return double values.",
+ {"x"}};
+
+const FunctionDoc atan2_doc{
+ "Compute the inverse tangent using argument signs to determine the quadrant",
+ "Integer arguments return double values.",
+ {"y", "x"}};
+
+const FunctionDoc ln_doc{
+ "Compute natural log of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"ln_checked\" if you want non-positive values to raise an error."),
+ {"x"}};
+
+const FunctionDoc ln_checked_doc{
+ "Compute natural log of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"ln\" if you want non-positive values to return "
+ "-inf or NaN."),
+ {"x"}};
+
+const FunctionDoc log10_doc{
+ "Compute log base 10 of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"log10_checked\" if you want non-positive values to raise an error."),
+ {"x"}};
+
+const FunctionDoc log10_checked_doc{
+ "Compute log base 10 of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"log10\" if you want non-positive values to return "
+ "-inf or NaN."),
+ {"x"}};
+
+const FunctionDoc log2_doc{
+ "Compute log base 2 of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"log2_checked\" if you want non-positive values to raise an error."),
+ {"x"}};
+
+const FunctionDoc log2_checked_doc{
+ "Compute log base 2 of arguments element-wise",
+ ("Non-positive values return -inf or NaN. Null values return null.\n"
+ "Use function \"log2\" if you want non-positive values to return "
+ "-inf or NaN."),
+ {"x"}};
+
+const FunctionDoc log1p_doc{
+ "Compute natural log of (1+x) element-wise",
+ ("Values <= -1 return -inf or NaN. Null values return null.\n"
+    "This function may be more precise than log(1 + x) for x close to zero.\n"
+    "Use function \"log1p_checked\" if you want values <= -1 to raise an error."),
+ {"x"}};
+
+const FunctionDoc log1p_checked_doc{
+ "Compute natural log of (1+x) element-wise",
+ ("Values <= -1 return -inf or NaN. Null values return null.\n"
+    "This function may be more precise than log(1 + x) for x close to zero.\n"
+    "Use function \"log1p\" if you want values <= -1 to return "
+ "-inf or NaN."),
+ {"x"}};
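+// Worked example of the precision note above: log1p(1e-10) keeps full
+// precision, whereas computing ln(1 + 1e-10) directly rounds 1 + 1e-10
+// and loses most significant digits of the result.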
+
+const FunctionDoc floor_doc{
+ "Round down to the nearest integer",
+    ("Calculate the nearest integer less than or equal to the "
+     "argument element-wise."),
+ {"x"}};
+
+const FunctionDoc ceil_doc{
+ "Round up to the nearest integer",
+    ("Calculate the nearest integer greater than or equal to the "
+     "argument element-wise."),
+ {"x"}};
+
+const FunctionDoc trunc_doc{
+ "Get the integral part without fractional digits",
+    ("Calculate the nearest integer not greater in magnitude than the "
+ "argument element-wise."),
+ {"x"}};
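+// Worked example contrasting the three rounding modes on -1.5:
+// floor(-1.5) == -2, ceil(-1.5) == -1, trunc(-1.5) == -1 (truncation
+// drops the fractional part, rounding toward zero).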
} // namespace
void RegisterScalarArithmetic(FunctionRegistry* registry) {
// ----------------------------------------------------------------------
- auto absolute_value =
- MakeUnaryArithmeticFunction<AbsoluteValue>("abs", &absolute_value_doc);
- DCHECK_OK(registry->AddFunction(std::move(absolute_value)));
-
- // ----------------------------------------------------------------------
- auto absolute_value_checked = MakeUnaryArithmeticFunctionNotNull<AbsoluteValueChecked>(
- "abs_checked", &absolute_value_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(absolute_value_checked)));
-
- // ----------------------------------------------------------------------
- auto add = MakeArithmeticFunction<Add>("add", &add_doc);
- AddDecimalBinaryKernels<Add>("add", &add);
+ auto absolute_value =
+ MakeUnaryArithmeticFunction<AbsoluteValue>("abs", &absolute_value_doc);
+ DCHECK_OK(registry->AddFunction(std::move(absolute_value)));
+
+ // ----------------------------------------------------------------------
+ auto absolute_value_checked = MakeUnaryArithmeticFunctionNotNull<AbsoluteValueChecked>(
+ "abs_checked", &absolute_value_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(absolute_value_checked)));
+
+ // ----------------------------------------------------------------------
+ auto add = MakeArithmeticFunction<Add>("add", &add_doc);
+ AddDecimalBinaryKernels<Add>("add", &add);
DCHECK_OK(registry->AddFunction(std::move(add)));
// ----------------------------------------------------------------------
- auto add_checked =
- MakeArithmeticFunctionNotNull<AddChecked>("add_checked", &add_checked_doc);
- AddDecimalBinaryKernels<AddChecked>("add_checked", &add_checked);
+ auto add_checked =
+ MakeArithmeticFunctionNotNull<AddChecked>("add_checked", &add_checked_doc);
+ AddDecimalBinaryKernels<AddChecked>("add_checked", &add_checked);
DCHECK_OK(registry->AddFunction(std::move(add_checked)));
// ----------------------------------------------------------------------
- auto subtract = MakeArithmeticFunction<Subtract>("subtract", &sub_doc);
- AddDecimalBinaryKernels<Subtract>("subtract", &subtract);
+ auto subtract = MakeArithmeticFunction<Subtract>("subtract", &sub_doc);
+ AddDecimalBinaryKernels<Subtract>("subtract", &subtract);
// Add subtract(timestamp, timestamp) -> duration
for (auto unit : AllTimeUnits()) {
InputType in_type(match::TimestampTypeUnit(unit));
- auto exec = ArithmeticExecFromOp<ScalarBinaryEqualTypes, Subtract>(Type::TIMESTAMP);
+ auto exec = ArithmeticExecFromOp<ScalarBinaryEqualTypes, Subtract>(Type::TIMESTAMP);
DCHECK_OK(subtract->AddKernel({in_type, in_type}, duration(unit), std::move(exec)));
}
DCHECK_OK(registry->AddFunction(std::move(subtract)));
// ----------------------------------------------------------------------
- auto subtract_checked = MakeArithmeticFunctionNotNull<SubtractChecked>(
- "subtract_checked", &sub_checked_doc);
- AddDecimalBinaryKernels<SubtractChecked>("subtract_checked", &subtract_checked);
+ auto subtract_checked = MakeArithmeticFunctionNotNull<SubtractChecked>(
+ "subtract_checked", &sub_checked_doc);
+ AddDecimalBinaryKernels<SubtractChecked>("subtract_checked", &subtract_checked);
DCHECK_OK(registry->AddFunction(std::move(subtract_checked)));
// ----------------------------------------------------------------------
- auto multiply = MakeArithmeticFunction<Multiply>("multiply", &mul_doc);
- AddDecimalBinaryKernels<Multiply>("multiply", &multiply);
+ auto multiply = MakeArithmeticFunction<Multiply>("multiply", &mul_doc);
+ AddDecimalBinaryKernels<Multiply>("multiply", &multiply);
DCHECK_OK(registry->AddFunction(std::move(multiply)));
// ----------------------------------------------------------------------
- auto multiply_checked = MakeArithmeticFunctionNotNull<MultiplyChecked>(
- "multiply_checked", &mul_checked_doc);
- AddDecimalBinaryKernels<MultiplyChecked>("multiply_checked", &multiply_checked);
+ auto multiply_checked = MakeArithmeticFunctionNotNull<MultiplyChecked>(
+ "multiply_checked", &mul_checked_doc);
+ AddDecimalBinaryKernels<MultiplyChecked>("multiply_checked", &multiply_checked);
DCHECK_OK(registry->AddFunction(std::move(multiply_checked)));
// ----------------------------------------------------------------------
- auto divide = MakeArithmeticFunctionNotNull<Divide>("divide", &div_doc);
- AddDecimalBinaryKernels<Divide>("divide", &divide);
+ auto divide = MakeArithmeticFunctionNotNull<Divide>("divide", &div_doc);
+ AddDecimalBinaryKernels<Divide>("divide", &divide);
DCHECK_OK(registry->AddFunction(std::move(divide)));
// ----------------------------------------------------------------------
- auto divide_checked =
- MakeArithmeticFunctionNotNull<DivideChecked>("divide_checked", &div_checked_doc);
- AddDecimalBinaryKernels<DivideChecked>("divide_checked", &divide_checked);
+ auto divide_checked =
+ MakeArithmeticFunctionNotNull<DivideChecked>("divide_checked", &div_checked_doc);
+ AddDecimalBinaryKernels<DivideChecked>("divide_checked", &divide_checked);
DCHECK_OK(registry->AddFunction(std::move(divide_checked)));
-
- // ----------------------------------------------------------------------
- auto negate = MakeUnaryArithmeticFunction<Negate>("negate", &negate_doc);
- DCHECK_OK(registry->AddFunction(std::move(negate)));
-
- // ----------------------------------------------------------------------
- auto negate_checked = MakeUnarySignedArithmeticFunctionNotNull<NegateChecked>(
- "negate_checked", &negate_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(negate_checked)));
-
- // ----------------------------------------------------------------------
- auto power = MakeArithmeticFunction<Power>("power", &pow_doc);
- DCHECK_OK(registry->AddFunction(std::move(power)));
-
- // ----------------------------------------------------------------------
- auto power_checked =
- MakeArithmeticFunctionNotNull<PowerChecked>("power_checked", &pow_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(power_checked)));
-
- // ----------------------------------------------------------------------
- auto sign =
- MakeUnaryArithmeticFunctionWithFixedIntOutType<Sign, Int8Type>("sign", &sign_doc);
- DCHECK_OK(registry->AddFunction(std::move(sign)));
-
- // ----------------------------------------------------------------------
- // Bitwise functions
- {
- auto bit_wise_not = std::make_shared<ArithmeticFunction>(
- "bit_wise_not", Arity::Unary(), &bit_wise_not_doc);
- for (const auto& ty : IntTypes()) {
- auto exec = TypeAgnosticBitWiseExecFromOp<ScalarUnaryNotNull, BitWiseNot>(ty);
- DCHECK_OK(bit_wise_not->AddKernel({ty}, ty, exec));
- }
- DCHECK_OK(registry->AddFunction(std::move(bit_wise_not)));
- }
-
- auto bit_wise_and =
- MakeBitWiseFunctionNotNull<BitWiseAnd>("bit_wise_and", &bit_wise_and_doc);
- DCHECK_OK(registry->AddFunction(std::move(bit_wise_and)));
-
- auto bit_wise_or =
- MakeBitWiseFunctionNotNull<BitWiseOr>("bit_wise_or", &bit_wise_or_doc);
- DCHECK_OK(registry->AddFunction(std::move(bit_wise_or)));
-
- auto bit_wise_xor =
- MakeBitWiseFunctionNotNull<BitWiseXor>("bit_wise_xor", &bit_wise_xor_doc);
- DCHECK_OK(registry->AddFunction(std::move(bit_wise_xor)));
-
- auto shift_left = MakeShiftFunctionNotNull<ShiftLeft>("shift_left", &shift_left_doc);
- DCHECK_OK(registry->AddFunction(std::move(shift_left)));
-
- auto shift_left_checked = MakeShiftFunctionNotNull<ShiftLeftChecked>(
- "shift_left_checked", &shift_left_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(shift_left_checked)));
-
- auto shift_right =
- MakeShiftFunctionNotNull<ShiftRight>("shift_right", &shift_right_doc);
- DCHECK_OK(registry->AddFunction(std::move(shift_right)));
-
- auto shift_right_checked = MakeShiftFunctionNotNull<ShiftRightChecked>(
- "shift_right_checked", &shift_right_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(shift_right_checked)));
-
- // ----------------------------------------------------------------------
- // Trig functions
- auto sin = MakeUnaryArithmeticFunctionFloatingPoint<Sin>("sin", &sin_doc);
- DCHECK_OK(registry->AddFunction(std::move(sin)));
-
- auto sin_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<SinChecked>(
- "sin_checked", &sin_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(sin_checked)));
-
- auto cos = MakeUnaryArithmeticFunctionFloatingPoint<Cos>("cos", &cos_doc);
- DCHECK_OK(registry->AddFunction(std::move(cos)));
-
- auto cos_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<CosChecked>(
- "cos_checked", &cos_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(cos_checked)));
-
- auto tan = MakeUnaryArithmeticFunctionFloatingPoint<Tan>("tan", &tan_doc);
- DCHECK_OK(registry->AddFunction(std::move(tan)));
-
- auto tan_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<TanChecked>(
- "tan_checked", &tan_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(tan_checked)));
-
- auto asin = MakeUnaryArithmeticFunctionFloatingPoint<Asin>("asin", &asin_doc);
- DCHECK_OK(registry->AddFunction(std::move(asin)));
-
- auto asin_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<AsinChecked>(
- "asin_checked", &asin_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(asin_checked)));
-
- auto acos = MakeUnaryArithmeticFunctionFloatingPoint<Acos>("acos", &acos_doc);
- DCHECK_OK(registry->AddFunction(std::move(acos)));
-
- auto acos_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<AcosChecked>(
- "acos_checked", &acos_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(acos_checked)));
-
- auto atan = MakeUnaryArithmeticFunctionFloatingPoint<Atan>("atan", &atan_doc);
- DCHECK_OK(registry->AddFunction(std::move(atan)));
-
- auto atan2 = MakeArithmeticFunctionFloatingPoint<Atan2>("atan2", &atan2_doc);
- DCHECK_OK(registry->AddFunction(std::move(atan2)));
-
- // ----------------------------------------------------------------------
- // Logarithms
- auto ln = MakeUnaryArithmeticFunctionFloatingPoint<LogNatural>("ln", &ln_doc);
- DCHECK_OK(registry->AddFunction(std::move(ln)));
-
- auto ln_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<LogNaturalChecked>(
- "ln_checked", &ln_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(ln_checked)));
-
- auto log10 = MakeUnaryArithmeticFunctionFloatingPoint<Log10>("log10", &log10_doc);
- DCHECK_OK(registry->AddFunction(std::move(log10)));
-
- auto log10_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log10Checked>(
- "log10_checked", &log10_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(log10_checked)));
-
- auto log2 = MakeUnaryArithmeticFunctionFloatingPoint<Log2>("log2", &log2_doc);
- DCHECK_OK(registry->AddFunction(std::move(log2)));
-
- auto log2_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log2Checked>(
- "log2_checked", &log2_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(log2_checked)));
-
- auto log1p = MakeUnaryArithmeticFunctionFloatingPoint<Log1p>("log1p", &log1p_doc);
- DCHECK_OK(registry->AddFunction(std::move(log1p)));
-
- auto log1p_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log1pChecked>(
- "log1p_checked", &log1p_checked_doc);
- DCHECK_OK(registry->AddFunction(std::move(log1p_checked)));
-
- // ----------------------------------------------------------------------
- // Rounding functions
- auto floor = MakeUnaryArithmeticFunctionFloatingPoint<Floor>("floor", &floor_doc);
- DCHECK_OK(registry->AddFunction(std::move(floor)));
-
- auto ceil = MakeUnaryArithmeticFunctionFloatingPoint<Ceil>("ceil", &ceil_doc);
- DCHECK_OK(registry->AddFunction(std::move(ceil)));
-
- auto trunc = MakeUnaryArithmeticFunctionFloatingPoint<Trunc>("trunc", &trunc_doc);
- DCHECK_OK(registry->AddFunction(std::move(trunc)));
+
+ // ----------------------------------------------------------------------
+ auto negate = MakeUnaryArithmeticFunction<Negate>("negate", &negate_doc);
+ DCHECK_OK(registry->AddFunction(std::move(negate)));
+
+ // ----------------------------------------------------------------------
+ auto negate_checked = MakeUnarySignedArithmeticFunctionNotNull<NegateChecked>(
+ "negate_checked", &negate_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(negate_checked)));
+
+ // ----------------------------------------------------------------------
+ auto power = MakeArithmeticFunction<Power>("power", &pow_doc);
+ DCHECK_OK(registry->AddFunction(std::move(power)));
+
+ // ----------------------------------------------------------------------
+ auto power_checked =
+ MakeArithmeticFunctionNotNull<PowerChecked>("power_checked", &pow_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(power_checked)));
+
+ // ----------------------------------------------------------------------
+ auto sign =
+ MakeUnaryArithmeticFunctionWithFixedIntOutType<Sign, Int8Type>("sign", &sign_doc);
+ DCHECK_OK(registry->AddFunction(std::move(sign)));
+
+ // ----------------------------------------------------------------------
+ // Bitwise functions
+ {
+ auto bit_wise_not = std::make_shared<ArithmeticFunction>(
+ "bit_wise_not", Arity::Unary(), &bit_wise_not_doc);
+ for (const auto& ty : IntTypes()) {
+ auto exec = TypeAgnosticBitWiseExecFromOp<ScalarUnaryNotNull, BitWiseNot>(ty);
+ DCHECK_OK(bit_wise_not->AddKernel({ty}, ty, exec));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(bit_wise_not)));
+ }
+
+ auto bit_wise_and =
+ MakeBitWiseFunctionNotNull<BitWiseAnd>("bit_wise_and", &bit_wise_and_doc);
+ DCHECK_OK(registry->AddFunction(std::move(bit_wise_and)));
+
+ auto bit_wise_or =
+ MakeBitWiseFunctionNotNull<BitWiseOr>("bit_wise_or", &bit_wise_or_doc);
+ DCHECK_OK(registry->AddFunction(std::move(bit_wise_or)));
+
+ auto bit_wise_xor =
+ MakeBitWiseFunctionNotNull<BitWiseXor>("bit_wise_xor", &bit_wise_xor_doc);
+ DCHECK_OK(registry->AddFunction(std::move(bit_wise_xor)));
+
+ auto shift_left = MakeShiftFunctionNotNull<ShiftLeft>("shift_left", &shift_left_doc);
+ DCHECK_OK(registry->AddFunction(std::move(shift_left)));
+
+ auto shift_left_checked = MakeShiftFunctionNotNull<ShiftLeftChecked>(
+ "shift_left_checked", &shift_left_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(shift_left_checked)));
+
+ auto shift_right =
+ MakeShiftFunctionNotNull<ShiftRight>("shift_right", &shift_right_doc);
+ DCHECK_OK(registry->AddFunction(std::move(shift_right)));
+
+ auto shift_right_checked = MakeShiftFunctionNotNull<ShiftRightChecked>(
+ "shift_right_checked", &shift_right_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(shift_right_checked)));
+
+ // ----------------------------------------------------------------------
+ // Trig functions
+ auto sin = MakeUnaryArithmeticFunctionFloatingPoint<Sin>("sin", &sin_doc);
+ DCHECK_OK(registry->AddFunction(std::move(sin)));
+
+ auto sin_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<SinChecked>(
+ "sin_checked", &sin_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(sin_checked)));
+
+ auto cos = MakeUnaryArithmeticFunctionFloatingPoint<Cos>("cos", &cos_doc);
+ DCHECK_OK(registry->AddFunction(std::move(cos)));
+
+ auto cos_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<CosChecked>(
+ "cos_checked", &cos_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(cos_checked)));
+
+ auto tan = MakeUnaryArithmeticFunctionFloatingPoint<Tan>("tan", &tan_doc);
+ DCHECK_OK(registry->AddFunction(std::move(tan)));
+
+ auto tan_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<TanChecked>(
+ "tan_checked", &tan_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(tan_checked)));
+
+ auto asin = MakeUnaryArithmeticFunctionFloatingPoint<Asin>("asin", &asin_doc);
+ DCHECK_OK(registry->AddFunction(std::move(asin)));
+
+ auto asin_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<AsinChecked>(
+ "asin_checked", &asin_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(asin_checked)));
+
+ auto acos = MakeUnaryArithmeticFunctionFloatingPoint<Acos>("acos", &acos_doc);
+ DCHECK_OK(registry->AddFunction(std::move(acos)));
+
+ auto acos_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<AcosChecked>(
+ "acos_checked", &acos_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(acos_checked)));
+
+ auto atan = MakeUnaryArithmeticFunctionFloatingPoint<Atan>("atan", &atan_doc);
+ DCHECK_OK(registry->AddFunction(std::move(atan)));
+
+ auto atan2 = MakeArithmeticFunctionFloatingPoint<Atan2>("atan2", &atan2_doc);
+ DCHECK_OK(registry->AddFunction(std::move(atan2)));
+
+ // ----------------------------------------------------------------------
+ // Logarithms
+ auto ln = MakeUnaryArithmeticFunctionFloatingPoint<LogNatural>("ln", &ln_doc);
+ DCHECK_OK(registry->AddFunction(std::move(ln)));
+
+ auto ln_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<LogNaturalChecked>(
+ "ln_checked", &ln_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(ln_checked)));
+
+ auto log10 = MakeUnaryArithmeticFunctionFloatingPoint<Log10>("log10", &log10_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log10)));
+
+ auto log10_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log10Checked>(
+ "log10_checked", &log10_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log10_checked)));
+
+ auto log2 = MakeUnaryArithmeticFunctionFloatingPoint<Log2>("log2", &log2_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log2)));
+
+ auto log2_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log2Checked>(
+ "log2_checked", &log2_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log2_checked)));
+
+ auto log1p = MakeUnaryArithmeticFunctionFloatingPoint<Log1p>("log1p", &log1p_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log1p)));
+
+ auto log1p_checked = MakeUnaryArithmeticFunctionFloatingPointNotNull<Log1pChecked>(
+ "log1p_checked", &log1p_checked_doc);
+ DCHECK_OK(registry->AddFunction(std::move(log1p_checked)));
+
+ // ----------------------------------------------------------------------
+ // Rounding functions
+ auto floor = MakeUnaryArithmeticFunctionFloatingPoint<Floor>("floor", &floor_doc);
+ DCHECK_OK(registry->AddFunction(std::move(floor)));
+
+ auto ceil = MakeUnaryArithmeticFunctionFloatingPoint<Ceil>("ceil", &ceil_doc);
+ DCHECK_OK(registry->AddFunction(std::move(ceil)));
+
+ auto trunc = MakeUnaryArithmeticFunctionFloatingPoint<Trunc>("trunc", &trunc_doc);
+ DCHECK_OK(registry->AddFunction(std::move(trunc)));
}
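// Minimal usage sketch (assuming the public compute API, which this diff
// does not show): once registered, the kernels above are callable by name:
//
//   #include <arrow/compute/api.h>
//   arrow::Datum sum =
//       arrow::compute::CallFunction("add", {lhs, rhs}).ValueOrDie();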
} // namespace internal
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc
index 3a99c87dd99..7a0e3654edb 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_boolean.cc
@@ -33,180 +33,180 @@ namespace {
template <typename ComputeWord>
void ComputeKleene(ComputeWord&& compute_word, KernelContext* ctx, const ArrayData& left,
const ArrayData& right, ArrayData* out) {
- DCHECK(left.null_count != 0 || right.null_count != 0)
- << "ComputeKleene is unnecessarily expensive for the non-null case";
+ DCHECK(left.null_count != 0 || right.null_count != 0)
+ << "ComputeKleene is unnecessarily expensive for the non-null case";
- Bitmap left_valid_bm{left.buffers[0], left.offset, left.length};
- Bitmap left_data_bm{left.buffers[1], left.offset, left.length};
+ Bitmap left_valid_bm{left.buffers[0], left.offset, left.length};
+ Bitmap left_data_bm{left.buffers[1], left.offset, left.length};
- Bitmap right_valid_bm{right.buffers[0], right.offset, right.length};
- Bitmap right_data_bm{right.buffers[1], right.offset, right.length};
+ Bitmap right_valid_bm{right.buffers[0], right.offset, right.length};
+ Bitmap right_data_bm{right.buffers[1], right.offset, right.length};
- std::array<Bitmap, 2> out_bms{Bitmap(out->buffers[0], out->offset, out->length),
- Bitmap(out->buffers[1], out->offset, out->length)};
+ std::array<Bitmap, 2> out_bms{Bitmap(out->buffers[0], out->offset, out->length),
+ Bitmap(out->buffers[1], out->offset, out->length)};
auto apply = [&](uint64_t left_valid, uint64_t left_data, uint64_t right_valid,
- uint64_t right_data, uint64_t* out_validity, uint64_t* out_data) {
+ uint64_t right_data, uint64_t* out_validity, uint64_t* out_data) {
auto left_true = left_valid & left_data;
auto left_false = left_valid & ~left_data;
auto right_true = right_valid & right_data;
auto right_false = right_valid & ~right_data;
- compute_word(left_true, left_false, right_true, right_false, out_validity, out_data);
+ compute_word(left_true, left_false, right_true, right_false, out_validity, out_data);
};
- if (right.null_count == 0) {
- std::array<Bitmap, 3> in_bms{left_valid_bm, left_data_bm, right_data_bm};
- Bitmap::VisitWordsAndWrite(
- in_bms, &out_bms,
- [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
- apply(in[0], in[1], ~uint64_t(0), in[2], &(out->at(0)), &(out->at(1)));
- });
- return;
- }
-
- if (left.null_count == 0) {
- std::array<Bitmap, 3> in_bms{left_data_bm, right_valid_bm, right_data_bm};
- Bitmap::VisitWordsAndWrite(
- in_bms, &out_bms,
- [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
- apply(~uint64_t(0), in[0], in[1], in[2], &(out->at(0)), &(out->at(1)));
- });
- return;
+ if (right.null_count == 0) {
+ std::array<Bitmap, 3> in_bms{left_valid_bm, left_data_bm, right_data_bm};
+ Bitmap::VisitWordsAndWrite(
+ in_bms, &out_bms,
+ [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+ apply(in[0], in[1], ~uint64_t(0), in[2], &(out->at(0)), &(out->at(1)));
+ });
+ return;
}
-
- DCHECK(left.null_count != 0 && right.null_count != 0);
- std::array<Bitmap, 4> in_bms{left_valid_bm, left_data_bm, right_valid_bm,
- right_data_bm};
- Bitmap::VisitWordsAndWrite(
- in_bms, &out_bms,
- [&](const std::array<uint64_t, 4>& in, std::array<uint64_t, 2>* out) {
- apply(in[0], in[1], in[2], in[3], &(out->at(0)), &(out->at(1)));
- });
+
+ if (left.null_count == 0) {
+ std::array<Bitmap, 3> in_bms{left_data_bm, right_valid_bm, right_data_bm};
+ Bitmap::VisitWordsAndWrite(
+ in_bms, &out_bms,
+ [&](const std::array<uint64_t, 3>& in, std::array<uint64_t, 2>* out) {
+ apply(~uint64_t(0), in[0], in[1], in[2], &(out->at(0)), &(out->at(1)));
+ });
+ return;
+ }
+
+ DCHECK(left.null_count != 0 && right.null_count != 0);
+ std::array<Bitmap, 4> in_bms{left_valid_bm, left_data_bm, right_valid_bm,
+ right_data_bm};
+ Bitmap::VisitWordsAndWrite(
+ in_bms, &out_bms,
+ [&](const std::array<uint64_t, 4>& in, std::array<uint64_t, 2>* out) {
+ apply(in[0], in[1], in[2], in[3], &(out->at(0)), &(out->at(1)));
+ });
+}
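+// ComputeKleene packs each input into (valid, data) bitmap pairs so that
+// compute_word can derive 64 output validity/data bits per iteration using
+// only bitwise ops; the early-return branches specialize for inputs whose
+// validity bitmap is known to be all-set.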
+
+inline BooleanScalar InvertScalar(const Scalar& in) {
+ return in.is_valid ? BooleanScalar(!checked_cast<const BooleanScalar&>(in).value)
+ : BooleanScalar();
+}
+
+inline Bitmap GetBitmap(const ArrayData& arr, int index) {
+ return Bitmap{arr.buffers[index], arr.offset, arr.length};
}
-inline BooleanScalar InvertScalar(const Scalar& in) {
- return in.is_valid ? BooleanScalar(!checked_cast<const BooleanScalar&>(in).value)
- : BooleanScalar();
-}
-
-inline Bitmap GetBitmap(const ArrayData& arr, int index) {
- return Bitmap{arr.buffers[index], arr.offset, arr.length};
-}
-
-struct InvertOp {
- static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
- *checked_cast<BooleanScalar*>(out) = InvertScalar(in);
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
- GetBitmap(*out, 1).CopyFromInverted(GetBitmap(in, 1));
- return Status::OK();
- }
-};
-
-template <typename Op>
-struct Commutative {
- static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
- ArrayData* out) {
- return Op::Call(ctx, right, left, out);
- }
-};
-
-struct AndOp : Commutative<AndOp> {
- using Commutative<AndOp>::Call;
-
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- if (left.is_valid && right.is_valid) {
+struct InvertOp {
+ static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+ *checked_cast<BooleanScalar*>(out) = InvertScalar(in);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
+ GetBitmap(*out, 1).CopyFromInverted(GetBitmap(in, 1));
+ return Status::OK();
+ }
+};
+
+template <typename Op>
+struct Commutative {
+ static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
+ ArrayData* out) {
+ return Op::Call(ctx, right, left, out);
+ }
+};
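+// Commutative<Op> is a small CRTP helper: an op implements only the
+// (ArrayData, Scalar) overload and this forwarding Call supplies the
+// mirrored (Scalar, ArrayData) case by swapping the arguments.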
+
+struct AndOp : Commutative<AndOp> {
+ using Commutative<AndOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ if (left.is_valid && right.is_valid) {
checked_cast<BooleanScalar*>(out)->value =
- checked_cast<const BooleanScalar&>(left).value &&
- checked_cast<const BooleanScalar&>(right).value;
+ checked_cast<const BooleanScalar&>(left).value &&
+ checked_cast<const BooleanScalar&>(right).value;
}
- return Status::OK();
+ return Status::OK();
}
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- if (right.is_valid) {
- checked_cast<const BooleanScalar&>(right).value
- ? GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1))
- : GetBitmap(*out, 1).SetBitsTo(false);
- }
- return Status::OK();
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ if (right.is_valid) {
+ checked_cast<const BooleanScalar&>(right).value
+ ? GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1))
+ : GetBitmap(*out, 1).SetBitsTo(false);
+ }
+ return Status::OK();
}
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
::arrow::internal::BitmapAnd(left.buffers[1]->data(), left.offset,
right.buffers[1]->data(), right.offset, right.length,
out->offset, out->buffers[1]->mutable_data());
- return Status::OK();
+ return Status::OK();
}
};
-struct KleeneAndOp : Commutative<KleeneAndOp> {
- using Commutative<KleeneAndOp>::Call;
-
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
- bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
-
- bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
- bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
-
- checked_cast<BooleanScalar*>(out)->value = left_true && right_true;
- out->is_valid = left_false || right_false || (left_true && right_true);
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
- bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
-
- if (right_false) {
- out->null_count = 0;
- out->buffers[0] = nullptr;
- GetBitmap(*out, 1).SetBitsTo(false); // all false case
- return Status::OK();
- }
-
- if (right_true) {
- if (left.GetNullCount() == 0) {
- out->null_count = 0;
- out->buffers[0] = nullptr;
- } else {
- GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0));
- }
- GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
- return Status::OK();
- }
-
- // scalar was null: out[i] is valid iff left[i] was false
- if (left.GetNullCount() == 0) {
- ::arrow::internal::InvertBitmap(left.buffers[1]->data(), left.offset, left.length,
- out->buffers[0]->mutable_data(), out->offset);
- } else {
- ::arrow::internal::BitmapAndNot(left.buffers[0]->data(), left.offset,
- left.buffers[1]->data(), left.offset, left.length,
- out->offset, out->buffers[0]->mutable_data());
- }
- ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
- out->buffers[1]->mutable_data(), out->offset);
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
+struct KleeneAndOp : Commutative<KleeneAndOp> {
+ using Commutative<KleeneAndOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
+ bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
+
+ bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
+ bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
+
+ checked_cast<BooleanScalar*>(out)->value = left_true && right_true;
+ out->is_valid = left_false || right_false || (left_true && right_true);
+ return Status::OK();
+ }
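+  // For example: and_kleene(null, false) == false, but
+  // and_kleene(null, true) == null, matching the truth table in
+  // and_kleene_doc.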
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
+ bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
+
+ if (right_false) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ GetBitmap(*out, 1).SetBitsTo(false); // all false case
+ return Status::OK();
+ }
+
+ if (right_true) {
+ if (left.GetNullCount() == 0) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ } else {
+ GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0));
+ }
+ GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
+ return Status::OK();
+ }
+
+ // scalar was null: out[i] is valid iff left[i] was false
+ if (left.GetNullCount() == 0) {
+ ::arrow::internal::InvertBitmap(left.buffers[1]->data(), left.offset, left.length,
+ out->buffers[0]->mutable_data(), out->offset);
+ } else {
+ ::arrow::internal::BitmapAndNot(left.buffers[0]->data(), left.offset,
+ left.buffers[1]->data(), left.offset, left.length,
+ out->offset, out->buffers[0]->mutable_data());
+ }
+ ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
+ out->buffers[1]->mutable_data(), out->offset);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
- out->null_count = 0;
- // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
- BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
- return AndOp::Call(ctx, left, right, out);
+ out->null_count = 0;
+ // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
+ BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
+ return AndOp::Call(ctx, left, right, out);
}
auto compute_word = [](uint64_t left_true, uint64_t left_false, uint64_t right_true,
uint64_t right_false, uint64_t* out_valid,
@@ -215,104 +215,104 @@ struct KleeneAndOp : Commutative<KleeneAndOp> {
*out_valid = left_false | right_false | (left_true & right_true);
};
ComputeKleene(compute_word, ctx, left, right, out);
- return Status::OK();
+ return Status::OK();
}
};
-struct OrOp : Commutative<OrOp> {
- using Commutative<OrOp>::Call;
-
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- if (left.is_valid && right.is_valid) {
- checked_cast<BooleanScalar*>(out)->value =
- checked_cast<const BooleanScalar&>(left).value ||
- checked_cast<const BooleanScalar&>(right).value;
- }
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- if (right.is_valid) {
- checked_cast<const BooleanScalar&>(right).value
- ? GetBitmap(*out, 1).SetBitsTo(true)
- : GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
- }
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
+struct OrOp : Commutative<OrOp> {
+ using Commutative<OrOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ if (left.is_valid && right.is_valid) {
+ checked_cast<BooleanScalar*>(out)->value =
+ checked_cast<const BooleanScalar&>(left).value ||
+ checked_cast<const BooleanScalar&>(right).value;
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ if (right.is_valid) {
+ checked_cast<const BooleanScalar&>(right).value
+ ? GetBitmap(*out, 1).SetBitsTo(true)
+ : GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
::arrow::internal::BitmapOr(left.buffers[1]->data(), left.offset,
right.buffers[1]->data(), right.offset, right.length,
out->offset, out->buffers[1]->mutable_data());
- return Status::OK();
+ return Status::OK();
}
};
-struct KleeneOrOp : Commutative<KleeneOrOp> {
- using Commutative<KleeneOrOp>::Call;
-
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
- bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
-
- bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
- bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
-
- checked_cast<BooleanScalar*>(out)->value = left_true || right_true;
- out->is_valid = left_true || right_true || (left_false && right_false);
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
- bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
-
- if (right_true) {
- out->null_count = 0;
- out->buffers[0] = nullptr;
- GetBitmap(*out, 1).SetBitsTo(true); // all true case
- return Status::OK();
- }
-
- if (right_false) {
- if (left.GetNullCount() == 0) {
- out->null_count = 0;
- out->buffers[0] = nullptr;
- } else {
- GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0));
- }
- GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
- return Status::OK();
- }
-
- // scalar was null: out[i] is valid iff left[i] was true
- if (left.GetNullCount() == 0) {
- ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
- out->buffers[0]->mutable_data(), out->offset);
- } else {
- ::arrow::internal::BitmapAnd(left.buffers[0]->data(), left.offset,
- left.buffers[1]->data(), left.offset, left.length,
- out->offset, out->buffers[0]->mutable_data());
- }
- ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
- out->buffers[1]->mutable_data(), out->offset);
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
+struct KleeneOrOp : Commutative<KleeneOrOp> {
+ using Commutative<KleeneOrOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
+ bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
+
+ bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
+ bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
+
+ checked_cast<BooleanScalar*>(out)->value = left_true || right_true;
+ out->is_valid = left_true || right_true || (left_false && right_false);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ bool right_true = right.is_valid && checked_cast<const BooleanScalar&>(right).value;
+ bool right_false = right.is_valid && !checked_cast<const BooleanScalar&>(right).value;
+
+ if (right_true) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ GetBitmap(*out, 1).SetBitsTo(true); // all true case
+ return Status::OK();
+ }
+
+ if (right_false) {
+ if (left.GetNullCount() == 0) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ } else {
+ GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0));
+ }
+ GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
+ return Status::OK();
+ }
+
+ // scalar was null: out[i] is valid iff left[i] was true
+ if (left.GetNullCount() == 0) {
+ ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
+ out->buffers[0]->mutable_data(), out->offset);
+ } else {
+ ::arrow::internal::BitmapAnd(left.buffers[0]->data(), left.offset,
+ left.buffers[1]->data(), left.offset, left.length,
+ out->offset, out->buffers[0]->mutable_data());
+ }
+ ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length,
+ out->buffers[1]->mutable_data(), out->offset);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
- out->null_count = 0;
- // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
- BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
- return OrOp::Call(ctx, left, right, out);
+ out->null_count = 0;
+ // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
+ BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
+ return OrOp::Call(ctx, left, right, out);
}
-
+
static auto compute_word = [](uint64_t left_true, uint64_t left_false,
uint64_t right_true, uint64_t right_false,
uint64_t* out_valid, uint64_t* out_data) {
@@ -320,149 +320,149 @@ struct KleeneOrOp : Commutative<KleeneOrOp> {
*out_valid = left_true | right_true | (left_false & right_false);
};
- ComputeKleene(compute_word, ctx, left, right, out);
- return Status::OK();
+ ComputeKleene(compute_word, ctx, left, right, out);
+ return Status::OK();
}
};
-struct XorOp : Commutative<XorOp> {
- using Commutative<XorOp>::Call;
-
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- if (left.is_valid && right.is_valid) {
- checked_cast<BooleanScalar*>(out)->value =
- checked_cast<const BooleanScalar&>(left).value ^
- checked_cast<const BooleanScalar&>(right).value;
- }
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- if (right.is_valid) {
- checked_cast<const BooleanScalar&>(right).value
- ? GetBitmap(*out, 1).CopyFromInverted(GetBitmap(left, 1))
- : GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
- }
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
+struct XorOp : Commutative<XorOp> {
+ using Commutative<XorOp>::Call;
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ if (left.is_valid && right.is_valid) {
+ checked_cast<BooleanScalar*>(out)->value =
+ checked_cast<const BooleanScalar&>(left).value ^
+ checked_cast<const BooleanScalar&>(right).value;
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ if (right.is_valid) {
+ checked_cast<const BooleanScalar&>(right).value
+ ? GetBitmap(*out, 1).CopyFromInverted(GetBitmap(left, 1))
+ : GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1));
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
::arrow::internal::BitmapXor(left.buffers[1]->data(), left.offset,
right.buffers[1]->data(), right.offset, right.length,
out->offset, out->buffers[1]->mutable_data());
- return Status::OK();
+ return Status::OK();
+ }
+};
+
+struct AndNotOp {
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ return AndOp::Call(ctx, left, InvertScalar(right), out);
+ }
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
+ ArrayData* out) {
+ if (left.is_valid) {
+ checked_cast<const BooleanScalar&>(left).value
+ ? GetBitmap(*out, 1).CopyFromInverted(GetBitmap(right, 1))
+ : GetBitmap(*out, 1).SetBitsTo(false);
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ return AndOp::Call(ctx, left, InvertScalar(right), out);
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
+ ::arrow::internal::BitmapAndNot(left.buffers[1]->data(), left.offset,
+ right.buffers[1]->data(), right.offset, right.length,
+ out->offset, out->buffers[1]->mutable_data());
+ return Status::OK();
+ }
+};
+
+struct KleeneAndNotOp {
+ static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
+ Scalar* out) {
+ return KleeneAndOp::Call(ctx, left, InvertScalar(right), out);
+ }
+
+ static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
+ ArrayData* out) {
+ bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
+ bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
+
+ if (left_false) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ GetBitmap(*out, 1).SetBitsTo(false); // all false case
+ return Status::OK();
+ }
+
+ if (left_true) {
+ if (right.GetNullCount() == 0) {
+ out->null_count = 0;
+ out->buffers[0] = nullptr;
+ } else {
+ GetBitmap(*out, 0).CopyFrom(GetBitmap(right, 0));
+ }
+ GetBitmap(*out, 1).CopyFromInverted(GetBitmap(right, 1));
+ return Status::OK();
+ }
+
+ // scalar was null: out[i] is valid iff right[i] was true
+ if (right.GetNullCount() == 0) {
+ ::arrow::internal::CopyBitmap(right.buffers[1]->data(), right.offset, right.length,
+ out->buffers[0]->mutable_data(), out->offset);
+ } else {
+ ::arrow::internal::BitmapAnd(right.buffers[0]->data(), right.offset,
+ right.buffers[1]->data(), right.offset, right.length,
+ out->offset, out->buffers[0]->mutable_data());
+ }
+ ::arrow::internal::InvertBitmap(right.buffers[1]->data(), right.offset, right.length,
+ out->buffers[1]->mutable_data(), out->offset);
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
+ ArrayData* out) {
+ return KleeneAndOp::Call(ctx, left, InvertScalar(right), out);
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
+ ArrayData* out) {
+ if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
+ out->null_count = 0;
+ // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
+ BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
+ return AndNotOp::Call(ctx, left, right, out);
+ }
+
+ static auto compute_word = [](uint64_t left_true, uint64_t left_false,
+ uint64_t right_true, uint64_t right_false,
+ uint64_t* out_valid, uint64_t* out_data) {
+ *out_data = left_true & right_false;
+ *out_valid = left_false | right_true | (left_true & right_false);
+ };
+
+ ComputeKleene(compute_word, ctx, left, right, out);
+ return Status::OK();
}
};
-struct AndNotOp {
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- return AndOp::Call(ctx, left, InvertScalar(right), out);
- }
-
- static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
- ArrayData* out) {
- if (left.is_valid) {
- checked_cast<const BooleanScalar&>(left).value
- ? GetBitmap(*out, 1).CopyFromInverted(GetBitmap(right, 1))
- : GetBitmap(*out, 1).SetBitsTo(false);
- }
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- return AndOp::Call(ctx, left, InvertScalar(right), out);
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
- ::arrow::internal::BitmapAndNot(left.buffers[1]->data(), left.offset,
- right.buffers[1]->data(), right.offset, right.length,
- out->offset, out->buffers[1]->mutable_data());
- return Status::OK();
- }
-};
-
-struct KleeneAndNotOp {
- static Status Call(KernelContext* ctx, const Scalar& left, const Scalar& right,
- Scalar* out) {
- return KleeneAndOp::Call(ctx, left, InvertScalar(right), out);
- }
-
- static Status Call(KernelContext* ctx, const Scalar& left, const ArrayData& right,
- ArrayData* out) {
- bool left_true = left.is_valid && checked_cast<const BooleanScalar&>(left).value;
- bool left_false = left.is_valid && !checked_cast<const BooleanScalar&>(left).value;
-
- if (left_false) {
- out->null_count = 0;
- out->buffers[0] = nullptr;
- GetBitmap(*out, 1).SetBitsTo(false); // all false case
- return Status::OK();
- }
-
- if (left_true) {
- if (right.GetNullCount() == 0) {
- out->null_count = 0;
- out->buffers[0] = nullptr;
- } else {
- GetBitmap(*out, 0).CopyFrom(GetBitmap(right, 0));
- }
- GetBitmap(*out, 1).CopyFromInverted(GetBitmap(right, 1));
- return Status::OK();
- }
-
- // scalar was null: out[i] is valid iff right[i] was true
- if (right.GetNullCount() == 0) {
- ::arrow::internal::CopyBitmap(right.buffers[1]->data(), right.offset, right.length,
- out->buffers[0]->mutable_data(), out->offset);
- } else {
- ::arrow::internal::BitmapAnd(right.buffers[0]->data(), right.offset,
- right.buffers[1]->data(), right.offset, right.length,
- out->offset, out->buffers[0]->mutable_data());
- }
- ::arrow::internal::InvertBitmap(right.buffers[1]->data(), right.offset, right.length,
- out->buffers[1]->mutable_data(), out->offset);
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const Scalar& right,
- ArrayData* out) {
- return KleeneAndOp::Call(ctx, left, InvertScalar(right), out);
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right,
- ArrayData* out) {
- if (left.GetNullCount() == 0 && right.GetNullCount() == 0) {
- out->null_count = 0;
- // Kleene kernels have validity bitmap pre-allocated. Therefore, set it to 1
- BitUtil::SetBitmap(out->buffers[0]->mutable_data(), out->offset, out->length);
- return AndNotOp::Call(ctx, left, right, out);
- }
-
- static auto compute_word = [](uint64_t left_true, uint64_t left_false,
- uint64_t right_true, uint64_t right_false,
- uint64_t* out_valid, uint64_t* out_data) {
- *out_data = left_true & right_false;
- *out_valid = left_false | right_true | (left_true & right_false);
- };
-
- ComputeKleene(compute_word, ctx, left, right, out);
- return Status::OK();
- }
-};
-
-void MakeFunction(const std::string& name, int arity, ArrayKernelExec exec,
- const FunctionDoc* doc, FunctionRegistry* registry,
+void MakeFunction(const std::string& name, int arity, ArrayKernelExec exec,
+ const FunctionDoc* doc, FunctionRegistry* registry,
NullHandling::type null_handling = NullHandling::INTERSECTION) {
- auto func = std::make_shared<ScalarFunction>(name, Arity(arity), doc);
+ auto func = std::make_shared<ScalarFunction>(name, Arity(arity), doc);
// Scalar arguments not yet supported
- std::vector<InputType> in_types(arity, InputType(boolean()));
+ std::vector<InputType> in_types(arity, InputType(boolean()));
ScalarKernel kernel(std::move(in_types), boolean(), exec);
kernel.null_handling = null_handling;
@@ -470,92 +470,92 @@ void MakeFunction(const std::string& name, int arity, ArrayKernelExec exec,
DCHECK_OK(registry->AddFunction(std::move(func)));
}
-const FunctionDoc invert_doc{"Invert boolean values", "", {"values"}};
-
-const FunctionDoc and_doc{
- "Logical 'and' boolean values",
- ("When a null is encountered in either input, a null is output.\n"
- "For a different null behavior, see function \"and_kleene\"."),
- {"x", "y"}};
-
-const FunctionDoc and_not_doc{
- "Logical 'and not' boolean values",
- ("When a null is encountered in either input, a null is output.\n"
- "For a different null behavior, see function \"and_not_kleene\"."),
- {"x", "y"}};
-
-const FunctionDoc or_doc{
- "Logical 'or' boolean values",
- ("When a null is encountered in either input, a null is output.\n"
- "For a different null behavior, see function \"or_kleene\"."),
- {"x", "y"}};
-
-const FunctionDoc xor_doc{
- "Logical 'xor' boolean values",
- ("When a null is encountered in either input, a null is output."),
- {"x", "y"}};
-
-const FunctionDoc and_kleene_doc{
- "Logical 'and' boolean values (Kleene logic)",
- ("This function behaves as follows with nulls:\n\n"
- "- true and null = null\n"
- "- null and true = null\n"
- "- false and null = false\n"
- "- null and false = false\n"
- "- null and null = null\n"
- "\n"
- "In other words, in this context a null value really means \"unknown\",\n"
- "and an unknown value 'and' false is always false.\n"
- "For a different null behavior, see function \"and\"."),
- {"x", "y"}};
-
-const FunctionDoc and_not_kleene_doc{
- "Logical 'and not' boolean values (Kleene logic)",
- ("This function behaves as follows with nulls:\n\n"
- "- true and null = null\n"
- "- null and false = null\n"
- "- false and null = false\n"
- "- null and true = false\n"
- "- null and null = null\n"
- "\n"
- "In other words, in this context a null value really means \"unknown\",\n"
- "and an unknown value 'and not' true is always false, as is false\n"
- "'and not' an unknown value.\n"
- "For a different null behavior, see function \"and_not\"."),
- {"x", "y"}};
-
-const FunctionDoc or_kleene_doc{
- "Logical 'or' boolean values (Kleene logic)",
- ("This function behaves as follows with nulls:\n\n"
- "- true or null = true\n"
- "- null and true = true\n"
- "- false and null = null\n"
- "- null and false = null\n"
- "- null and null = null\n"
- "\n"
- "In other words, in this context a null value really means \"unknown\",\n"
- "and an unknown value 'or' true is always true.\n"
- "For a different null behavior, see function \"and\"."),
- {"x", "y"}};
-
+const FunctionDoc invert_doc{"Invert boolean values", "", {"values"}};
+
+const FunctionDoc and_doc{
+ "Logical 'and' boolean values",
+ ("When a null is encountered in either input, a null is output.\n"
+ "For a different null behavior, see function \"and_kleene\"."),
+ {"x", "y"}};
+
+const FunctionDoc and_not_doc{
+ "Logical 'and not' boolean values",
+ ("When a null is encountered in either input, a null is output.\n"
+ "For a different null behavior, see function \"and_not_kleene\"."),
+ {"x", "y"}};
+
+const FunctionDoc or_doc{
+ "Logical 'or' boolean values",
+ ("When a null is encountered in either input, a null is output.\n"
+ "For a different null behavior, see function \"or_kleene\"."),
+ {"x", "y"}};
+
+const FunctionDoc xor_doc{
+ "Logical 'xor' boolean values",
+ ("When a null is encountered in either input, a null is output."),
+ {"x", "y"}};
+
+const FunctionDoc and_kleene_doc{
+ "Logical 'and' boolean values (Kleene logic)",
+ ("This function behaves as follows with nulls:\n\n"
+ "- true and null = null\n"
+ "- null and true = null\n"
+ "- false and null = false\n"
+ "- null and false = false\n"
+ "- null and null = null\n"
+ "\n"
+ "In other words, in this context a null value really means \"unknown\",\n"
+ "and an unknown value 'and' false is always false.\n"
+ "For a different null behavior, see function \"and\"."),
+ {"x", "y"}};
+
+const FunctionDoc and_not_kleene_doc{
+ "Logical 'and not' boolean values (Kleene logic)",
+ ("This function behaves as follows with nulls:\n\n"
+ "- true and null = null\n"
+ "- null and false = null\n"
+ "- false and null = false\n"
+ "- null and true = false\n"
+ "- null and null = null\n"
+ "\n"
+ "In other words, in this context a null value really means \"unknown\",\n"
+ "and an unknown value 'and not' true is always false, as is false\n"
+ "'and not' an unknown value.\n"
+ "For a different null behavior, see function \"and_not\"."),
+ {"x", "y"}};
+
+const FunctionDoc or_kleene_doc{
+ "Logical 'or' boolean values (Kleene logic)",
+ ("This function behaves as follows with nulls:\n\n"
+ "- true or null = true\n"
+ "- null and true = true\n"
+ "- false and null = null\n"
+ "- null and false = null\n"
+ "- null and null = null\n"
+ "\n"
+ "In other words, in this context a null value really means \"unknown\",\n"
+ "and an unknown value 'or' true is always true.\n"
+ "For a different null behavior, see function \"and\"."),
+ {"x", "y"}};
+
} // namespace
namespace internal {
void RegisterScalarBoolean(FunctionRegistry* registry) {
// These functions can write into sliced output bitmaps
- MakeFunction("invert", 1, applicator::SimpleUnary<InvertOp>, &invert_doc, registry);
- MakeFunction("and", 2, applicator::SimpleBinary<AndOp>, &and_doc, registry);
- MakeFunction("and_not", 2, applicator::SimpleBinary<AndNotOp>, &and_not_doc, registry);
- MakeFunction("or", 2, applicator::SimpleBinary<OrOp>, &or_doc, registry);
- MakeFunction("xor", 2, applicator::SimpleBinary<XorOp>, &xor_doc, registry);
-
- MakeFunction("and_kleene", 2, applicator::SimpleBinary<KleeneAndOp>, &and_kleene_doc,
- registry, NullHandling::COMPUTED_PREALLOCATE);
- MakeFunction("and_not_kleene", 2, applicator::SimpleBinary<KleeneAndNotOp>,
- &and_not_kleene_doc, registry, NullHandling::COMPUTED_PREALLOCATE);
- MakeFunction("or_kleene", 2, applicator::SimpleBinary<KleeneOrOp>, &or_kleene_doc,
- registry, NullHandling::COMPUTED_PREALLOCATE);
+ MakeFunction("invert", 1, applicator::SimpleUnary<InvertOp>, &invert_doc, registry);
+ MakeFunction("and", 2, applicator::SimpleBinary<AndOp>, &and_doc, registry);
+ MakeFunction("and_not", 2, applicator::SimpleBinary<AndNotOp>, &and_not_doc, registry);
+ MakeFunction("or", 2, applicator::SimpleBinary<OrOp>, &or_doc, registry);
+ MakeFunction("xor", 2, applicator::SimpleBinary<XorOp>, &xor_doc, registry);
+
+ MakeFunction("and_kleene", 2, applicator::SimpleBinary<KleeneAndOp>, &and_kleene_doc,
+ registry, NullHandling::COMPUTED_PREALLOCATE);
+ MakeFunction("and_not_kleene", 2, applicator::SimpleBinary<KleeneAndNotOp>,
+ &and_not_kleene_doc, registry, NullHandling::COMPUTED_PREALLOCATE);
+ MakeFunction("or_kleene", 2, applicator::SimpleBinary<KleeneOrOp>, &or_kleene_doc,
+ registry, NullHandling::COMPUTED_PREALLOCATE);
}
} // namespace internal
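
The compute_word lambda in KleeneAndNotOp packs the Kleene truth table into three bitwise operations: a slot in out_valid is set exactly when the result is decidable despite any nulls. A minimal standalone sketch of one word of that computation, with an illustrative bit layout (plain C++, no Arrow dependency):

    #include <cstdint>
    #include <iostream>

    int main() {
      // Evaluate and_not_kleene(x, y) for x = {true, false, null}, y = null,
      // one element per bit. "null" means neither *_true nor *_false is set.
      uint64_t left_true = 0b001;    // bit 0: x is true
      uint64_t left_false = 0b010;   // bit 1: x is false
      uint64_t right_true = 0;       // y is null in every slot
      uint64_t right_false = 0;

      // Same formulas as the compute_word lambda in the kernel above.
      uint64_t out_data = left_true & right_false;
      uint64_t out_valid = left_false | right_true | (left_true & right_false);

      // Only bit 1 is valid: false 'and not' unknown is definitely false;
      // the other two slots stay null, matching the and_not_kleene_doc rows.
      std::cout << "data=" << out_data << " valid=" << out_valid << std::endl;
      return 0;
    }
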
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
index daee9cff79a..dad94c1ace7 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
@@ -17,7 +17,7 @@
// Cast types to boolean
-#include "arrow/array/builder_primitive.h"
+#include "arrow/array/builder_primitive.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/scalar_cast_internal.h"
#include "arrow/util/value_parsing.h"
@@ -31,17 +31,17 @@ namespace internal {
struct IsNonZero {
template <typename OutValue, typename Arg0Value>
- static OutValue Call(KernelContext*, Arg0Value val, Status*) {
+ static OutValue Call(KernelContext*, Arg0Value val, Status*) {
return val != 0;
}
};
struct ParseBooleanString {
template <typename OutValue, typename Arg0Value>
- static OutValue Call(KernelContext*, Arg0Value val, Status* st) {
+ static OutValue Call(KernelContext*, Arg0Value val, Status* st) {
bool result = false;
if (ARROW_PREDICT_FALSE(!ParseValue<BooleanType>(val.data(), val.size(), &result))) {
- *st = Status::Invalid("Failed to parse value: ", val);
+ *st = Status::Invalid("Failed to parse value: ", val);
}
return result;
}
@@ -50,7 +50,7 @@ struct ParseBooleanString {
std::vector<std::shared_ptr<CastFunction>> GetBooleanCasts() {
auto func = std::make_shared<CastFunction>("cast_boolean", Type::BOOL);
AddCommonCasts(Type::BOOL, boolean(), func.get());
- AddZeroCopyCast(Type::BOOL, boolean(), boolean(), func.get());
+ AddZeroCopyCast(Type::BOOL, boolean(), boolean(), func.get());
for (const auto& ty : NumericTypes()) {
ArrayKernelExec exec =
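
The IsNonZero and ParseBooleanString kernels above are reached through the public cast API. A hedged usage sketch, assuming only standard Arrow public calls (builders and arrow::compute::Cast); the expected output is noted in comments:

    #include <iostream>
    #include <memory>
    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Status CastToBooleanExample() {
      // IsNonZero semantics: 0 -> false, anything else -> true, nulls propagate.
      arrow::Int32Builder builder;
      ARROW_RETURN_NOT_OK(builder.AppendValues({0, 1, 5}));
      ARROW_RETURN_NOT_OK(builder.AppendNull());
      std::shared_ptr<arrow::Array> ints;
      ARROW_RETURN_NOT_OK(builder.Finish(&ints));

      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> bools,
                            arrow::compute::Cast(*ints, arrow::boolean()));
      std::cout << bools->ToString() << std::endl;  // [false, true, true, null]
      return arrow::Status::OK();
    }

    int main() {
      auto st = CastToBooleanExample();
      if (!st.ok()) {
        std::cerr << st.ToString() << std::endl;
        return 1;
      }
      return 0;
    }
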
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
index b8be4d78696..b1e1164fd34 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
@@ -1,126 +1,126 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Implementation of casting to dictionary type
-
-#include <arrow/util/bitmap_ops.h>
-#include <arrow/util/checked_cast.h>
-
-#include "arrow/array/builder_primitive.h"
-#include "arrow/compute/cast_internal.h"
-#include "arrow/compute/kernels/scalar_cast_internal.h"
-#include "arrow/compute/kernels/util_internal.h"
-#include "arrow/util/int_util.h"
-
-namespace arrow {
-using internal::CopyBitmap;
-
-namespace compute {
-namespace internal {
-
-Status CastDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const CastOptions& options = CastState::Get(ctx);
- auto out_type = std::static_pointer_cast<DictionaryType>(out->type());
-
- // if out type is same as in type, return input
- if (out_type->Equals(batch[0].type())) {
- *out = batch[0];
- return Status::OK();
- }
-
- if (batch[0].is_scalar()) { // if input is scalar
- auto in_scalar = checked_cast<const DictionaryScalar&>(*batch[0].scalar());
-
- // if invalid scalar, return null scalar
- if (!in_scalar.is_valid) {
- *out = MakeNullScalar(out_type);
- return Status::OK();
- }
-
- Datum casted_index, casted_dict;
- if (in_scalar.value.index->type->Equals(out_type->index_type())) {
- casted_index = in_scalar.value.index;
- } else {
- ARROW_ASSIGN_OR_RAISE(casted_index,
- Cast(in_scalar.value.index, out_type->index_type(), options,
- ctx->exec_context()));
- }
-
- if (in_scalar.value.dictionary->type()->Equals(out_type->value_type())) {
- casted_dict = in_scalar.value.dictionary;
- } else {
- ARROW_ASSIGN_OR_RAISE(
- casted_dict, Cast(in_scalar.value.dictionary, out_type->value_type(), options,
- ctx->exec_context()));
- }
-
- *out = std::static_pointer_cast<Scalar>(
- DictionaryScalar::Make(casted_index.scalar(), casted_dict.make_array()));
-
- return Status::OK();
- }
-
- // if input is array
- const std::shared_ptr<ArrayData>& in_array = batch[0].array();
- const auto& in_type = checked_cast<const DictionaryType&>(*in_array->type);
-
- ArrayData* out_array = out->mutable_array();
-
- if (in_type.index_type()->Equals(out_type->index_type())) {
- out_array->buffers[0] = in_array->buffers[0];
- out_array->buffers[1] = in_array->buffers[1];
- out_array->null_count = in_array->GetNullCount();
- out_array->offset = in_array->offset;
- } else {
- // for indices, create a dummy ArrayData with index_type()
- const std::shared_ptr<ArrayData>& indices_arr =
- ArrayData::Make(in_type.index_type(), in_array->length, in_array->buffers,
- in_array->GetNullCount(), in_array->offset);
- ARROW_ASSIGN_OR_RAISE(auto casted_indices, Cast(indices_arr, out_type->index_type(),
- options, ctx->exec_context()));
- out_array->buffers[0] = std::move(casted_indices.array()->buffers[0]);
- out_array->buffers[1] = std::move(casted_indices.array()->buffers[1]);
- }
-
- // data (dict)
- if (in_type.value_type()->Equals(out_type->value_type())) {
- out_array->dictionary = in_array->dictionary;
- } else {
- const std::shared_ptr<Array>& dict_arr = MakeArray(in_array->dictionary);
- ARROW_ASSIGN_OR_RAISE(auto casted_data, Cast(dict_arr, out_type->value_type(),
- options, ctx->exec_context()));
- out_array->dictionary = casted_data.array();
- }
- return Status::OK();
-}
-
-std::vector<std::shared_ptr<CastFunction>> GetDictionaryCasts() {
- auto func = std::make_shared<CastFunction>("cast_dictionary", Type::DICTIONARY);
-
- AddCommonCasts(Type::DICTIONARY, kOutputTargetType, func.get());
- ScalarKernel kernel({InputType(Type::DICTIONARY)}, kOutputTargetType, CastDictionary);
- kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
-
- DCHECK_OK(func->AddKernel(Type::DICTIONARY, std::move(kernel)));
-
- return {func};
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Implementation of casting to dictionary type
+
+#include <arrow/util/bitmap_ops.h>
+#include <arrow/util/checked_cast.h>
+
+#include "arrow/array/builder_primitive.h"
+#include "arrow/compute/cast_internal.h"
+#include "arrow/compute/kernels/scalar_cast_internal.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/util/int_util.h"
+
+namespace arrow {
+using internal::CopyBitmap;
+
+namespace compute {
+namespace internal {
+
+Status CastDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const CastOptions& options = CastState::Get(ctx);
+ auto out_type = std::static_pointer_cast<DictionaryType>(out->type());
+
+ // if out type is same as in type, return input
+ if (out_type->Equals(batch[0].type())) {
+ *out = batch[0];
+ return Status::OK();
+ }
+
+ if (batch[0].is_scalar()) { // if input is scalar
+ auto in_scalar = checked_cast<const DictionaryScalar&>(*batch[0].scalar());
+
+ // if invalid scalar, return null scalar
+ if (!in_scalar.is_valid) {
+ *out = MakeNullScalar(out_type);
+ return Status::OK();
+ }
+
+ Datum casted_index, casted_dict;
+ if (in_scalar.value.index->type->Equals(out_type->index_type())) {
+ casted_index = in_scalar.value.index;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(casted_index,
+ Cast(in_scalar.value.index, out_type->index_type(), options,
+ ctx->exec_context()));
+ }
+
+ if (in_scalar.value.dictionary->type()->Equals(out_type->value_type())) {
+ casted_dict = in_scalar.value.dictionary;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(
+ casted_dict, Cast(in_scalar.value.dictionary, out_type->value_type(), options,
+ ctx->exec_context()));
+ }
+
+ *out = std::static_pointer_cast<Scalar>(
+ DictionaryScalar::Make(casted_index.scalar(), casted_dict.make_array()));
+
+ return Status::OK();
+ }
+
+ // if input is array
+ const std::shared_ptr<ArrayData>& in_array = batch[0].array();
+ const auto& in_type = checked_cast<const DictionaryType&>(*in_array->type);
+
+ ArrayData* out_array = out->mutable_array();
+
+ if (in_type.index_type()->Equals(out_type->index_type())) {
+ out_array->buffers[0] = in_array->buffers[0];
+ out_array->buffers[1] = in_array->buffers[1];
+ out_array->null_count = in_array->GetNullCount();
+ out_array->offset = in_array->offset;
+ } else {
+ // for indices, create a dummy ArrayData with index_type()
+ const std::shared_ptr<ArrayData>& indices_arr =
+ ArrayData::Make(in_type.index_type(), in_array->length, in_array->buffers,
+ in_array->GetNullCount(), in_array->offset);
+ ARROW_ASSIGN_OR_RAISE(auto casted_indices, Cast(indices_arr, out_type->index_type(),
+ options, ctx->exec_context()));
+ out_array->buffers[0] = std::move(casted_indices.array()->buffers[0]);
+ out_array->buffers[1] = std::move(casted_indices.array()->buffers[1]);
+ }
+
+ // data (dict)
+ if (in_type.value_type()->Equals(out_type->value_type())) {
+ out_array->dictionary = in_array->dictionary;
+ } else {
+ const std::shared_ptr<Array>& dict_arr = MakeArray(in_array->dictionary);
+ ARROW_ASSIGN_OR_RAISE(auto casted_data, Cast(dict_arr, out_type->value_type(),
+ options, ctx->exec_context()));
+ out_array->dictionary = casted_data.array();
+ }
+ return Status::OK();
+}
+
+std::vector<std::shared_ptr<CastFunction>> GetDictionaryCasts() {
+ auto func = std::make_shared<CastFunction>("cast_dictionary", Type::DICTIONARY);
+
+ AddCommonCasts(Type::DICTIONARY, kOutputTargetType, func.get());
+ ScalarKernel kernel({InputType(Type::DICTIONARY)}, kOutputTargetType, CastDictionary);
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+
+ DCHECK_OK(func->AddKernel(Type::DICTIONARY, std::move(kernel)));
+
+ return {func};
+}
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
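
CastDictionary recasts the index and dictionary halves of a DictionaryArray independently, reusing buffers when a side already matches. A sketch of the array path that widens only the index type, using public helpers (DictionaryArray::FromArrays and arrow::compute::Cast); values are illustrative:

    #include <iostream>
    #include <memory>
    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Status WidenDictionaryIndices() {
      // dict<int8 -> utf8> array encoding ["a", "b", "a"].
      arrow::Int8Builder index_builder;
      ARROW_RETURN_NOT_OK(index_builder.AppendValues({0, 1, 0}));
      std::shared_ptr<arrow::Array> indices;
      ARROW_RETURN_NOT_OK(index_builder.Finish(&indices));

      arrow::StringBuilder dict_builder;
      ARROW_RETURN_NOT_OK(dict_builder.AppendValues({"a", "b"}));
      std::shared_ptr<arrow::Array> dict;
      ARROW_RETURN_NOT_OK(dict_builder.Finish(&dict));

      ARROW_ASSIGN_OR_RAISE(
          std::shared_ptr<arrow::Array> dict_array,
          arrow::DictionaryArray::FromArrays(
              arrow::dictionary(arrow::int8(), arrow::utf8()), indices, dict));

      // Exercises the index-cast branch of CastDictionary: int8 -> int32 indices,
      // while the utf8 dictionary is shared untouched.
      ARROW_ASSIGN_OR_RAISE(
          std::shared_ptr<arrow::Array> widened,
          arrow::compute::Cast(*dict_array,
                               arrow::dictionary(arrow::int32(), arrow::utf8())));
      std::cout << widened->ToString() << std::endl;
      return arrow::Status::OK();
    }

    int main() { return WidenDictionaryIndices().ok() ? 0 : 1; }
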
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
index 660250359c4..198c82bd97e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
@@ -148,40 +148,40 @@ void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, const Dat
// ----------------------------------------------------------------------
-Status UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- DCHECK(out->is_array());
-
+Status UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK(out->is_array());
+
DictionaryArray dict_arr(batch[0].array());
const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
const auto& dict_type = *dict_arr.dictionary()->type();
- if (!dict_type.Equals(options.to_type) && !CanCast(dict_type, *options.to_type)) {
- return Status::Invalid("Cast type ", options.to_type->ToString(),
- " incompatible with dictionary type ", dict_type.ToString());
+ if (!dict_type.Equals(options.to_type) && !CanCast(dict_type, *options.to_type)) {
+ return Status::Invalid("Cast type ", options.to_type->ToString(),
+ " incompatible with dictionary type ", dict_type.ToString());
}
- ARROW_ASSIGN_OR_RAISE(*out,
- Take(Datum(dict_arr.dictionary()), Datum(dict_arr.indices()),
- TakeOptions::Defaults(), ctx->exec_context()));
-
- if (!dict_type.Equals(options.to_type)) {
- ARROW_ASSIGN_OR_RAISE(*out, Cast(*out, options));
+ ARROW_ASSIGN_OR_RAISE(*out,
+ Take(Datum(dict_arr.dictionary()), Datum(dict_arr.indices()),
+ TakeOptions::Defaults(), ctx->exec_context()));
+
+ if (!dict_type.Equals(options.to_type)) {
+ ARROW_ASSIGN_OR_RAISE(*out, Cast(*out, options));
}
- return Status::OK();
+ return Status::OK();
}
-Status OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (out->is_scalar()) {
- out->scalar()->is_valid = false;
- } else {
- ArrayData* output = out->mutable_array();
- output->buffers = {nullptr};
- output->null_count = batch.length;
- }
- return Status::OK();
+Status OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (out->is_scalar()) {
+ out->scalar()->is_valid = false;
+ } else {
+ ArrayData* output = out->mutable_array();
+ output->buffers = {nullptr};
+ output->null_count = batch.length;
+ }
+ return Status::OK();
}
-Status CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const CastOptions& options = checked_cast<const CastState*>(ctx->state())->options;
const DataType& in_type = *batch[0].type();
@@ -190,20 +190,20 @@ Status CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out)
ExtensionArray extension(batch[0].array());
Datum casted_storage;
- RETURN_NOT_OK(Cast(*extension.storage(), out->type(), options, ctx->exec_context())
- .Value(&casted_storage));
+ RETURN_NOT_OK(Cast(*extension.storage(), out->type(), options, ctx->exec_context())
+ .Value(&casted_storage));
out->value = casted_storage.array();
- return Status::OK();
+ return Status::OK();
}
-Status CastFromNull(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (!batch[0].is_scalar()) {
- ArrayData* output = out->mutable_array();
- std::shared_ptr<Array> nulls;
- RETURN_NOT_OK(MakeArrayOfNull(output->type, batch.length).Value(&nulls));
- out->value = nulls->data();
- }
- return Status::OK();
+Status CastFromNull(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (!batch[0].is_scalar()) {
+ ArrayData* output = out->mutable_array();
+ std::shared_ptr<Array> nulls;
+ RETURN_NOT_OK(MakeArrayOfNull(output->type, batch.length).Value(&nulls));
+ out->value = nulls->data();
+ }
+ return Status::OK();
}
Result<ValueDescr> ResolveOutputFromOptions(KernelContext* ctx,
@@ -223,25 +223,25 @@ Result<ValueDescr> ResolveOutputFromOptions(KernelContext* ctx,
OutputType kOutputTargetType(ResolveOutputFromOptions);
-Status ZeroCopyCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
- // Make a copy of the buffers into a destination array without carrying
- // the type
- const ArrayData& input = *batch[0].array();
- ArrayData* output = out->mutable_array();
- output->length = input.length;
- output->SetNullCount(input.null_count);
- output->buffers = input.buffers;
- output->offset = input.offset;
- output->child_data = input.child_data;
- return Status::OK();
+Status ZeroCopyCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+ // Make a copy of the buffers into a destination array without carrying
+ // the type
+ const ArrayData& input = *batch[0].array();
+ ArrayData* output = out->mutable_array();
+ output->length = input.length;
+ output->SetNullCount(input.null_count);
+ output->buffers = input.buffers;
+ output->offset = input.offset;
+ output->child_data = input.child_data;
+ return Status::OK();
}
void AddZeroCopyCast(Type::type in_type_id, InputType in_type, OutputType out_type,
CastFunction* func) {
auto sig = KernelSignature::Make({in_type}, out_type);
ScalarKernel kernel;
- kernel.exec = TrivialScalarUnaryAsArraysExec(ZeroCopyCastExec);
+ kernel.exec = TrivialScalarUnaryAsArraysExec(ZeroCopyCastExec);
kernel.signature = sig;
kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
@@ -255,12 +255,12 @@ static bool CanCastFromDictionary(Type::type type_id) {
void AddCommonCasts(Type::type out_type_id, OutputType out_ty, CastFunction* func) {
// From null to this type
- ScalarKernel kernel;
- kernel.exec = CastFromNull;
- kernel.signature = KernelSignature::Make({null()}, out_ty);
- kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
- DCHECK_OK(func->AddKernel(Type::NA, std::move(kernel)));
+ ScalarKernel kernel;
+ kernel.exec = CastFromNull;
+ kernel.signature = KernelSignature::Make({null()}, out_ty);
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ DCHECK_OK(func->AddKernel(Type::NA, std::move(kernel)));
// From dictionary to this type
if (CanCastFromDictionary(out_type_id)) {
@@ -268,10 +268,10 @@ void AddCommonCasts(Type::type out_type_id, OutputType out_ty, CastFunction* fun
//
// XXX: Uses Take and does its own memory allocation for the moment. We can
// fix this later.
- DCHECK_OK(func->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, out_ty,
- TrivialScalarUnaryAsArraysExec(UnpackDictionary),
- NullHandling::COMPUTED_NO_PREALLOCATE,
- MemAllocation::NO_PREALLOCATE));
+ DCHECK_OK(func->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, out_ty,
+ TrivialScalarUnaryAsArraysExec(UnpackDictionary),
+ NullHandling::COMPUTED_NO_PREALLOCATE,
+ MemAllocation::NO_PREALLOCATE));
}
// From extension type to this type
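
UnpackDictionary above decodes via Take(dictionary, indices), with a follow-up cast when the target differs from the dictionary value type. A sketch that exercises it end to end through the public API (DictionaryEncode and Cast; values illustrative):

    #include <iostream>
    #include <memory>
    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Status DecodeDictionary() {
      // Encode ["x", "y", "x"] as a dictionary array with the public helper,
      // then decode it back to dense utf8, which routes through UnpackDictionary.
      arrow::StringBuilder builder;
      ARROW_RETURN_NOT_OK(builder.AppendValues({"x", "y", "x"}));
      std::shared_ptr<arrow::Array> dense;
      ARROW_RETURN_NOT_OK(builder.Finish(&dense));

      ARROW_ASSIGN_OR_RAISE(arrow::Datum encoded,
                            arrow::compute::DictionaryEncode(dense));
      ARROW_ASSIGN_OR_RAISE(arrow::Datum decoded,
                            arrow::compute::Cast(encoded, arrow::utf8()));
      std::cout << decoded.make_array()->ToString() << std::endl;  // ["x","y","x"]
      return arrow::Status::OK();
    }

    int main() { return DecodeDictionary().ok() ? 0 : 1; }
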
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
index bffa64988a6..2419d898a68 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_internal.h
@@ -21,7 +21,7 @@
#include "arrow/compute/cast.h" // IWYU pragma: export
#include "arrow/compute/cast_internal.h" // IWYU pragma: export
#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/compute/kernels/util_internal.h"
namespace arrow {
@@ -37,10 +37,10 @@ struct CastFunctor {};
template <typename O, typename I>
struct CastFunctor<
O, I, enable_if_t<std::is_same<O, I>::value && is_parameter_free_type<I>::value>> {
- static Status Exec(KernelContext*, const ExecBatch&, Datum*) { return Status::OK(); }
+ static Status Exec(KernelContext*, const ExecBatch&, Datum*) { return Status::OK(); }
};
-Status CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+Status CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out);
// Utility for numeric casts
void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, const Datum& input,
@@ -49,23 +49,23 @@ void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, const Dat
// ----------------------------------------------------------------------
// Dictionary to other things
-Status UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+Status UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out);
-Status OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+Status OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out);
-Status CastFromNull(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+Status CastFromNull(KernelContext* ctx, const ExecBatch& batch, Datum* out);
-// Adds a cast function where CastFunctor is specialized and the input and output
-// types are parameter free (have a type_singleton). Scalar inputs are handled by
-// wrapping with TrivialScalarUnaryAsArraysExec.
+// Adds a cast function where CastFunctor is specialized and the input and output
+// types are parameter free (have a type_singleton). Scalar inputs are handled by
+// wrapping with TrivialScalarUnaryAsArraysExec.
template <typename InType, typename OutType>
void AddSimpleCast(InputType in_ty, OutputType out_ty, CastFunction* func) {
- DCHECK_OK(func->AddKernel(
- InType::type_id, {in_ty}, out_ty,
- TrivialScalarUnaryAsArraysExec(CastFunctor<OutType, InType>::Exec)));
+ DCHECK_OK(func->AddKernel(
+ InType::type_id, {in_ty}, out_ty,
+ TrivialScalarUnaryAsArraysExec(CastFunctor<OutType, InType>::Exec)));
}
-Status ZeroCopyCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out);
+Status ZeroCopyCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out);
void AddZeroCopyCast(Type::type in_type_id, InputType in_type, OutputType out_type,
CastFunction* func);
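
ZeroCopyCastExec, declared above, only re-labels the type and shares the input buffers. A sketch that makes the sharing observable, under the assumption that date32 to int32 is one of the casts registered through AddZeroCopyCast in this version:

    #include <iostream>
    #include <memory>
    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Status ZeroCopyExample() {
      arrow::Date32Builder builder;
      ARROW_RETURN_NOT_OK(builder.AppendValues({0, 18628}));  // days since epoch
      std::shared_ptr<arrow::Array> dates;
      ARROW_RETURN_NOT_OK(builder.Finish(&dates));

      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> ints,
                            arrow::compute::Cast(*dates, arrow::int32()));
      // A zero-copy cast shares the value buffer instead of copying it.
      std::cout << std::boolalpha
                << (ints->data()->buffers[1] == dates->data()->buffers[1])
                << std::endl;  // expected: true
      return arrow::Status::OK();
    }

    int main() { return ZeroCopyExample().ok() ? 0 : 1; }
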
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
index c5fccf30311..ec92dbb5d60 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
@@ -20,74 +20,74 @@
#include <utility>
#include <vector>
-#include "arrow/array/builder_nested.h"
-#include "arrow/compute/api_scalar.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/compute/api_scalar.h"
#include "arrow/compute/cast.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/scalar_cast_internal.h"
-#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_ops.h"
namespace arrow {
-
-using internal::CopyBitmap;
-
+
+using internal::CopyBitmap;
+
namespace compute {
namespace internal {
template <typename Type>
-Status CastListExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- using offset_type = typename Type::offset_type;
- using ScalarType = typename TypeTraits<Type>::ScalarType;
-
- const CastOptions& options = CastState::Get(ctx);
-
- auto child_type = checked_cast<const Type&>(*out->type()).value_type();
-
- if (out->kind() == Datum::SCALAR) {
- const auto& in_scalar = checked_cast<const ScalarType&>(*batch[0].scalar());
- auto out_scalar = checked_cast<ScalarType*>(out->scalar().get());
-
- DCHECK(!out_scalar->is_valid);
- if (in_scalar.is_valid) {
- ARROW_ASSIGN_OR_RAISE(out_scalar->value, Cast(*in_scalar.value, child_type, options,
- ctx->exec_context()));
-
- out_scalar->is_valid = true;
- }
- return Status::OK();
+Status CastListExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ using offset_type = typename Type::offset_type;
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+
+ const CastOptions& options = CastState::Get(ctx);
+
+ auto child_type = checked_cast<const Type&>(*out->type()).value_type();
+
+ if (out->kind() == Datum::SCALAR) {
+ const auto& in_scalar = checked_cast<const ScalarType&>(*batch[0].scalar());
+ auto out_scalar = checked_cast<ScalarType*>(out->scalar().get());
+
+ DCHECK(!out_scalar->is_valid);
+ if (in_scalar.is_valid) {
+ ARROW_ASSIGN_OR_RAISE(out_scalar->value, Cast(*in_scalar.value, child_type, options,
+ ctx->exec_context()));
+
+ out_scalar->is_valid = true;
+ }
+ return Status::OK();
+ }
+
+ const ArrayData& in_array = *batch[0].array();
+ ArrayData* out_array = out->mutable_array();
+
+ // Copy from parent
+ out_array->buffers = in_array.buffers;
+ Datum values = in_array.child_data[0];
+
+ if (in_array.offset != 0) {
+ if (in_array.buffers[0]) {
+ ARROW_ASSIGN_OR_RAISE(out_array->buffers[0],
+ CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(),
+ in_array.offset, in_array.length));
+ }
+ ARROW_ASSIGN_OR_RAISE(out_array->buffers[1],
+ ctx->Allocate(sizeof(offset_type) * (in_array.length + 1)));
+
+ auto offsets = in_array.GetValues<offset_type>(1);
+ auto shifted_offsets = out_array->GetMutableValues<offset_type>(1);
+
+ for (int64_t i = 0; i < in_array.length + 1; ++i) {
+ shifted_offsets[i] = offsets[i] - offsets[0];
+ }
+ values = in_array.child_data[0]->Slice(offsets[0], offsets[in_array.length]);
}
- const ArrayData& in_array = *batch[0].array();
- ArrayData* out_array = out->mutable_array();
-
- // Copy from parent
- out_array->buffers = in_array.buffers;
- Datum values = in_array.child_data[0];
-
- if (in_array.offset != 0) {
- if (in_array.buffers[0]) {
- ARROW_ASSIGN_OR_RAISE(out_array->buffers[0],
- CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(),
- in_array.offset, in_array.length));
- }
- ARROW_ASSIGN_OR_RAISE(out_array->buffers[1],
- ctx->Allocate(sizeof(offset_type) * (in_array.length + 1)));
-
- auto offsets = in_array.GetValues<offset_type>(1);
- auto shifted_offsets = out_array->GetMutableValues<offset_type>(1);
-
- for (int64_t i = 0; i < in_array.length + 1; ++i) {
- shifted_offsets[i] = offsets[i] - offsets[0];
- }
- values = in_array.child_data[0]->Slice(offsets[0], offsets[in_array.length]);
- }
-
- ARROW_ASSIGN_OR_RAISE(Datum cast_values,
- Cast(values, child_type, options, ctx->exec_context()));
-
- DCHECK_EQ(Datum::ARRAY, cast_values.kind());
- out_array->child_data.push_back(cast_values.array());
- return Status::OK();
+ ARROW_ASSIGN_OR_RAISE(Datum cast_values,
+ Cast(values, child_type, options, ctx->exec_context()));
+
+ DCHECK_EQ(Datum::ARRAY, cast_values.kind());
+ out_array->child_data.push_back(cast_values.array());
+ return Status::OK();
}
template <typename Type>
@@ -120,12 +120,12 @@ std::vector<std::shared_ptr<CastFunction>> GetNestedCasts() {
auto cast_struct = std::make_shared<CastFunction>("cast_struct", Type::STRUCT);
AddCommonCasts(Type::STRUCT, kOutputTargetType, cast_struct.get());
- // So is dictionary
- auto cast_dictionary =
- std::make_shared<CastFunction>("cast_dictionary", Type::DICTIONARY);
- AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dictionary.get());
-
- return {cast_list, cast_large_list, cast_fsl, cast_struct, cast_dictionary};
+ // So is dictionary
+ auto cast_dictionary =
+ std::make_shared<CastFunction>("cast_dictionary", Type::DICTIONARY);
+ AddCommonCasts(Type::DICTIONARY, kOutputTargetType, cast_dictionary.get());
+
+ return {cast_list, cast_large_list, cast_fsl, cast_struct, cast_dictionary};
}
} // namespace internal
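
The offset-shifting loop in CastListExec rebases a sliced list's offsets so the first kept list starts at zero, then slices the child values accordingly. The arithmetic in isolation, with illustrative offsets (plain C++):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      // Offsets of a 3-element list array: lists [0,2), [2,4), [4,7).
      std::vector<int32_t> offsets = {0, 2, 4, 7};
      int64_t slice_offset = 1, slice_length = 2;  // keep the last two lists

      // shifted_offsets[i] = offsets[i] - offsets[0] over the sliced window,
      // as in the kernel's loop above.
      const int32_t* window = offsets.data() + slice_offset;
      std::vector<int32_t> shifted(slice_length + 1);
      for (int64_t i = 0; i < slice_length + 1; ++i) {
        shifted[i] = window[i] - window[0];
      }
      // shifted = {0, 2, 5}; the child values are then sliced starting at
      // old offset 2, so the rebased offsets index them correctly.
      for (int32_t v : shifted) std::cout << v << " ";
      std::cout << std::endl;
      return 0;
    }
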
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
index 4ada0b08afe..cc7b533f262 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
@@ -17,10 +17,10 @@
// Implementation of casting to integer, floating point, or decimal types
-#include "arrow/array/builder_primitive.h"
+#include "arrow/array/builder_primitive.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/scalar_cast_internal.h"
-#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/compute/kernels/util_internal.h"
#include "arrow/util/bit_block_counter.h"
#include "arrow/util/int_util.h"
#include "arrow/util/value_parsing.h"
@@ -36,18 +36,18 @@ using internal::ParseValue;
namespace compute {
namespace internal {
-Status CastIntegerToInteger(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status CastIntegerToInteger(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& options = checked_cast<const CastState*>(ctx->state())->options;
if (!options.allow_int_overflow) {
- RETURN_NOT_OK(IntegersCanFit(batch[0], *out->type()));
+ RETURN_NOT_OK(IntegersCanFit(batch[0], *out->type()));
}
CastNumberToNumberUnsafe(batch[0].type()->id(), out->type()->id(), batch[0], out);
- return Status::OK();
+ return Status::OK();
}
-Status CastFloatingToFloating(KernelContext*, const ExecBatch& batch, Datum* out) {
+Status CastFloatingToFloating(KernelContext*, const ExecBatch& batch, Datum* out) {
CastNumberToNumberUnsafe(batch[0].type()->id(), out->type()->id(), batch[0], out);
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
@@ -65,7 +65,7 @@ Status CheckFloatTruncation(const Datum& input, const Datum& output) {
return is_valid && static_cast<InT>(out_val) != in_val;
};
auto GetErrorMessage = [&](InT val) {
- return Status::Invalid("Float value ", val, " was truncated converting to ",
+ return Status::Invalid("Float value ", val, " was truncated converting to ",
*output.type());
};
@@ -170,13 +170,13 @@ Status CheckFloatToIntTruncation(const Datum& input, const Datum& output) {
return Status::OK();
}
-Status CastFloatingToInteger(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status CastFloatingToInteger(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& options = checked_cast<const CastState*>(ctx->state())->options;
CastNumberToNumberUnsafe(batch[0].type()->id(), out->type()->id(), batch[0], out);
if (!options.allow_float_truncate) {
- RETURN_NOT_OK(CheckFloatToIntTruncation(batch[0], *out));
+ RETURN_NOT_OK(CheckFloatToIntTruncation(batch[0], *out));
}
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
@@ -249,14 +249,14 @@ Status CheckForIntegerToFloatingTruncation(const Datum& input, Type::type out_ty
return Status::OK();
}
-Status CastIntegerToFloating(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status CastIntegerToFloating(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& options = checked_cast<const CastState*>(ctx->state())->options;
Type::type out_type = out->type()->id();
if (!options.allow_float_truncate) {
- RETURN_NOT_OK(CheckForIntegerToFloatingTruncation(batch[0], out_type));
+ RETURN_NOT_OK(CheckForIntegerToFloatingTruncation(batch[0], out_type));
}
CastNumberToNumberUnsafe(batch[0].type()->id(), out_type, batch[0], out);
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
@@ -264,7 +264,7 @@ Status CastIntegerToFloating(KernelContext* ctx, const ExecBatch& batch, Datum*
struct BooleanToNumber {
template <typename OutValue, typename Arg0Value>
- static OutValue Call(KernelContext*, Arg0Value val, Status*) {
+ static OutValue Call(KernelContext*, Arg0Value val, Status*) {
constexpr auto kOne = static_cast<OutValue>(1);
constexpr auto kZero = static_cast<OutValue>(0);
return val ? kOne : kZero;
@@ -273,9 +273,9 @@ struct BooleanToNumber {
template <typename O>
struct CastFunctor<O, BooleanType, enable_if_number<O>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return applicator::ScalarUnary<O, BooleanType, BooleanToNumber>::Exec(ctx, batch,
- out);
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return applicator::ScalarUnary<O, BooleanType, BooleanToNumber>::Exec(ctx, batch,
+ out);
}
};
@@ -285,11 +285,11 @@ struct CastFunctor<O, BooleanType, enable_if_number<O>> {
template <typename OutType>
struct ParseString {
template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
+ OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
OutValue result = OutValue(0);
if (ARROW_PREDICT_FALSE(!ParseValue<OutType>(val.data(), val.size(), &result))) {
- *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
- TypeTraits<OutType>::type_singleton()->ToString());
+ *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
+ TypeTraits<OutType>::type_singleton()->ToString());
}
return result;
}
@@ -297,8 +297,8 @@ struct ParseString {
template <typename O, typename I>
struct CastFunctor<O, I, enable_if_base_binary<I>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return applicator::ScalarUnaryNotNull<O, I, ParseString<O>>::Exec(ctx, batch, out);
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return applicator::ScalarUnaryNotNull<O, I, ParseString<O>>::Exec(ctx, batch, out);
}
};
@@ -306,13 +306,13 @@ struct CastFunctor<O, I, enable_if_base_binary<I>> {
// Decimal to integer
struct DecimalToIntegerMixin {
- template <typename OutValue, typename Arg0Value>
- OutValue ToInteger(KernelContext* ctx, const Arg0Value& val, Status* st) const {
+ template <typename OutValue, typename Arg0Value>
+ OutValue ToInteger(KernelContext* ctx, const Arg0Value& val, Status* st) const {
constexpr auto min_value = std::numeric_limits<OutValue>::min();
constexpr auto max_value = std::numeric_limits<OutValue>::max();
if (!allow_int_overflow_ && ARROW_PREDICT_FALSE(val < min_value || val > max_value)) {
- *st = Status::Invalid("Integer value out of bounds");
+ *st = Status::Invalid("Integer value out of bounds");
return OutValue{}; // Zero
} else {
return static_cast<OutValue>(val.low_bits());
@@ -330,8 +330,8 @@ struct UnsafeUpscaleDecimalToInteger : public DecimalToIntegerMixin {
using DecimalToIntegerMixin::DecimalToIntegerMixin;
template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
- return ToInteger<OutValue>(ctx, val.IncreaseScaleBy(-in_scale_), st);
+ OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
+ return ToInteger<OutValue>(ctx, val.IncreaseScaleBy(-in_scale_), st);
}
};
@@ -339,8 +339,8 @@ struct UnsafeDownscaleDecimalToInteger : public DecimalToIntegerMixin {
using DecimalToIntegerMixin::DecimalToIntegerMixin;
template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
- return ToInteger<OutValue>(ctx, val.ReduceScaleBy(in_scale_, false), st);
+ OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
+ return ToInteger<OutValue>(ctx, val.ReduceScaleBy(in_scale_, false), st);
}
};
@@ -348,44 +348,44 @@ struct SafeRescaleDecimalToInteger : public DecimalToIntegerMixin {
using DecimalToIntegerMixin::DecimalToIntegerMixin;
template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
+ OutValue Call(KernelContext* ctx, Arg0Value val, Status* st) const {
auto result = val.Rescale(in_scale_, 0);
if (ARROW_PREDICT_FALSE(!result.ok())) {
- *st = result.status();
+ *st = result.status();
return OutValue{}; // Zero
} else {
- return ToInteger<OutValue>(ctx, *result, st);
+ return ToInteger<OutValue>(ctx, *result, st);
}
}
};
-template <typename O, typename I>
-struct CastFunctor<O, I,
- enable_if_t<is_integer_type<O>::value && is_decimal_type<I>::value>> {
+template <typename O, typename I>
+struct CastFunctor<O, I,
+ enable_if_t<is_integer_type<O>::value && is_decimal_type<I>::value>> {
using out_type = typename O::c_type;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& options = checked_cast<const CastState*>(ctx->state())->options;
- const auto& in_type_inst = checked_cast<const I&>(*batch[0].type());
+ const auto& in_type_inst = checked_cast<const I&>(*batch[0].type());
const auto in_scale = in_type_inst.scale();
if (options.allow_decimal_truncate) {
if (in_scale < 0) {
// Unsafe upscale
- applicator::ScalarUnaryNotNullStateful<O, I, UnsafeUpscaleDecimalToInteger>
+ applicator::ScalarUnaryNotNullStateful<O, I, UnsafeUpscaleDecimalToInteger>
kernel(UnsafeUpscaleDecimalToInteger{in_scale, options.allow_int_overflow});
return kernel.Exec(ctx, batch, out);
} else {
// Unsafe downscale
- applicator::ScalarUnaryNotNullStateful<O, I, UnsafeDownscaleDecimalToInteger>
+ applicator::ScalarUnaryNotNullStateful<O, I, UnsafeDownscaleDecimalToInteger>
kernel(UnsafeDownscaleDecimalToInteger{in_scale, options.allow_int_overflow});
return kernel.Exec(ctx, batch, out);
}
} else {
// Safe rescale
- applicator::ScalarUnaryNotNullStateful<O, I, SafeRescaleDecimalToInteger> kernel(
- SafeRescaleDecimalToInteger{in_scale, options.allow_int_overflow});
+ applicator::ScalarUnaryNotNullStateful<O, I, SafeRescaleDecimalToInteger> kernel(
+ SafeRescaleDecimalToInteger{in_scale, options.allow_int_overflow});
return kernel.Exec(ctx, batch, out);
}
}
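
The three decimal-to-integer wrappers above differ only in how they reach scale zero: the safe path surfaces an error when fractional digits would be lost, while the unsafe paths rescale without checking. A standalone sketch with the public arrow::Decimal128 utilities (values illustrative):

    #include <iostream>
    #include <arrow/util/decimal.h>

    int main() {
      arrow::Decimal128 value;
      int32_t precision = 0, scale = 0;
      // "12.34" parses as unscaled 1234 with precision 4, scale 2.
      if (!arrow::Decimal128::FromString("12.34", &value, &precision, &scale).ok())
        return 1;

      // Safe path: Rescale(2, 0) reports an error, the fraction would be lost.
      auto rescaled = value.Rescale(scale, 0);
      std::cout << (rescaled.ok() ? "ok" : rescaled.status().ToString())
                << std::endl;

      // Unsafe downscale path: truncate, as the kernel's round = false does.
      arrow::Decimal128 truncated(value.ReduceScaleBy(scale, /*round=*/false));
      std::cout << truncated.ToString(0) << std::endl;  // 12
      return 0;
    }
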
@@ -394,104 +394,104 @@ struct CastFunctor<O, I,
// ----------------------------------------------------------------------
// Decimal to decimal
-// Helper that converts the input and output decimals
-// For instance, Decimal128 -> Decimal256 requires converting, then scaling
-// Decimal256 -> Decimal128 requires scaling, then truncating
-template <typename OutDecimal, typename InDecimal>
-struct DecimalConversions {};
-
-template <typename InDecimal>
-struct DecimalConversions<Decimal256, InDecimal> {
- // Convert then scale
- static Decimal256 ConvertInput(InDecimal&& val) { return Decimal256(val); }
- static Decimal256 ConvertOutput(Decimal256&& val) { return val; }
-};
-
-template <>
-struct DecimalConversions<Decimal128, Decimal256> {
- // Scale then truncate
- static Decimal256 ConvertInput(Decimal256&& val) { return val; }
- static Decimal128 ConvertOutput(Decimal256&& val) {
- return Decimal128(val.little_endian_array()[1], val.little_endian_array()[0]);
- }
-};
-
-template <>
-struct DecimalConversions<Decimal128, Decimal128> {
- static Decimal128 ConvertInput(Decimal128&& val) { return val; }
- static Decimal128 ConvertOutput(Decimal128&& val) { return val; }
+// Helper that converts the input and output decimals
+// For instance, Decimal128 -> Decimal256 requires converting, then scaling
+// Decimal256 -> Decimal128 requires scaling, then truncating
+template <typename OutDecimal, typename InDecimal>
+struct DecimalConversions {};
+
+template <typename InDecimal>
+struct DecimalConversions<Decimal256, InDecimal> {
+ // Convert then scale
+ static Decimal256 ConvertInput(InDecimal&& val) { return Decimal256(val); }
+ static Decimal256 ConvertOutput(Decimal256&& val) { return val; }
+};
+
+template <>
+struct DecimalConversions<Decimal128, Decimal256> {
+ // Scale then truncate
+ static Decimal256 ConvertInput(Decimal256&& val) { return val; }
+ static Decimal128 ConvertOutput(Decimal256&& val) {
+ return Decimal128(val.little_endian_array()[1], val.little_endian_array()[0]);
+ }
+};
+
+template <>
+struct DecimalConversions<Decimal128, Decimal128> {
+ static Decimal128 ConvertInput(Decimal128&& val) { return val; }
+ static Decimal128 ConvertOutput(Decimal128&& val) { return val; }
+};
+
+struct UnsafeUpscaleDecimal {
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext*, Arg0Value val, Status*) const {
+ using Conv = DecimalConversions<OutValue, Arg0Value>;
+ return Conv::ConvertOutput(Conv::ConvertInput(std::move(val)).IncreaseScaleBy(by_));
+ }
+ int32_t by_;
};
-struct UnsafeUpscaleDecimal {
- template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext*, Arg0Value val, Status*) const {
- using Conv = DecimalConversions<OutValue, Arg0Value>;
- return Conv::ConvertOutput(Conv::ConvertInput(std::move(val)).IncreaseScaleBy(by_));
- }
- int32_t by_;
-};
-
struct UnsafeDownscaleDecimal {
- template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext*, Arg0Value val, Status*) const {
- using Conv = DecimalConversions<OutValue, Arg0Value>;
- return Conv::ConvertOutput(
- Conv::ConvertInput(std::move(val)).ReduceScaleBy(by_, false));
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext*, Arg0Value val, Status*) const {
+ using Conv = DecimalConversions<OutValue, Arg0Value>;
+ return Conv::ConvertOutput(
+ Conv::ConvertInput(std::move(val)).ReduceScaleBy(by_, false));
}
- int32_t by_;
+ int32_t by_;
};
struct SafeRescaleDecimal {
- template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext*, Arg0Value val, Status* st) const {
- using Conv = DecimalConversions<OutValue, Arg0Value>;
- auto maybe_rescaled =
- Conv::ConvertInput(std::move(val)).Rescale(in_scale_, out_scale_);
- if (ARROW_PREDICT_FALSE(!maybe_rescaled.ok())) {
- *st = maybe_rescaled.status();
- return {}; // Zero
+ template <typename OutValue, typename Arg0Value>
+ OutValue Call(KernelContext*, Arg0Value val, Status* st) const {
+ using Conv = DecimalConversions<OutValue, Arg0Value>;
+ auto maybe_rescaled =
+ Conv::ConvertInput(std::move(val)).Rescale(in_scale_, out_scale_);
+ if (ARROW_PREDICT_FALSE(!maybe_rescaled.ok())) {
+ *st = maybe_rescaled.status();
+ return {}; // Zero
+ }
+
+ if (ARROW_PREDICT_TRUE(maybe_rescaled->FitsInPrecision(out_precision_))) {
+ return Conv::ConvertOutput(maybe_rescaled.MoveValueUnsafe());
}
-
- if (ARROW_PREDICT_TRUE(maybe_rescaled->FitsInPrecision(out_precision_))) {
- return Conv::ConvertOutput(maybe_rescaled.MoveValueUnsafe());
- }
-
- *st = Status::Invalid("Decimal value does not fit in precision ", out_precision_);
- return {}; // Zero
+
+ *st = Status::Invalid("Decimal value does not fit in precision ", out_precision_);
+ return {}; // Zero
}
int32_t out_scale_, out_precision_, in_scale_;
};
-template <typename O, typename I>
-struct CastFunctor<O, I,
- enable_if_t<is_decimal_type<O>::value && is_decimal_type<I>::value>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+template <typename O, typename I>
+struct CastFunctor<O, I,
+ enable_if_t<is_decimal_type<O>::value && is_decimal_type<I>::value>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& options = checked_cast<const CastState*>(ctx->state())->options;
- const auto& in_type = checked_cast<const I&>(*batch[0].type());
- const auto& out_type = checked_cast<const O&>(*out->type());
- const auto in_scale = in_type.scale();
- const auto out_scale = out_type.scale();
+ const auto& in_type = checked_cast<const I&>(*batch[0].type());
+ const auto& out_type = checked_cast<const O&>(*out->type());
+ const auto in_scale = in_type.scale();
+ const auto out_scale = out_type.scale();
if (options.allow_decimal_truncate) {
if (in_scale < out_scale) {
// Unsafe upscale
- applicator::ScalarUnaryNotNullStateful<O, I, UnsafeUpscaleDecimal> kernel(
- UnsafeUpscaleDecimal{out_scale - in_scale});
+ applicator::ScalarUnaryNotNullStateful<O, I, UnsafeUpscaleDecimal> kernel(
+ UnsafeUpscaleDecimal{out_scale - in_scale});
return kernel.Exec(ctx, batch, out);
} else {
// Unsafe downscale
- applicator::ScalarUnaryNotNullStateful<O, I, UnsafeDownscaleDecimal> kernel(
- UnsafeDownscaleDecimal{in_scale - out_scale});
+ applicator::ScalarUnaryNotNullStateful<O, I, UnsafeDownscaleDecimal> kernel(
+ UnsafeDownscaleDecimal{in_scale - out_scale});
return kernel.Exec(ctx, batch, out);
}
}
-
- // Safe rescale
- applicator::ScalarUnaryNotNullStateful<O, I, SafeRescaleDecimal> kernel(
- SafeRescaleDecimal{out_scale, out_type.precision(), in_scale});
- return kernel.Exec(ctx, batch, out);
+
+ // Safe rescale
+ applicator::ScalarUnaryNotNullStateful<O, I, SafeRescaleDecimal> kernel(
+ SafeRescaleDecimal{out_scale, out_type.precision(), in_scale});
+ return kernel.Exec(ctx, batch, out);
}
};
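
DecimalConversions fixes the operation order: widening converts to Decimal256 before rescaling, while narrowing rescales in 256-bit space and only then truncates to the low 128 bits, so intermediate digits survive. A sketch of the widening direction with the public decimal utilities (values illustrative):

    #include <iostream>
    #include <arrow/util/decimal.h>

    int main() {
      arrow::Decimal128 narrow;
      int32_t precision = 0, scale = 0;
      if (!arrow::Decimal128::FromString("1.5", &narrow, &precision, &scale).ok())
        return 1;

      // Widen first (Decimal128 -> Decimal256), then scale up, mirroring
      // DecimalConversions<Decimal256, Decimal128>: convert, then IncreaseScaleBy.
      arrow::Decimal256 wide(narrow);
      arrow::Decimal256 rescaled(wide.IncreaseScaleBy(2));  // scale 1 -> scale 3
      std::cout << rescaled.ToString(3) << std::endl;  // 1.500
      return 0;
    }
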
@@ -500,33 +500,33 @@ struct CastFunctor<O, I,
struct RealToDecimal {
template <typename OutValue, typename RealType>
- OutValue Call(KernelContext*, RealType val, Status* st) const {
- auto maybe_decimal = OutValue::FromReal(val, out_precision_, out_scale_);
-
- if (ARROW_PREDICT_TRUE(maybe_decimal.ok())) {
- return maybe_decimal.MoveValueUnsafe();
+ OutValue Call(KernelContext*, RealType val, Status* st) const {
+ auto maybe_decimal = OutValue::FromReal(val, out_precision_, out_scale_);
+
+ if (ARROW_PREDICT_TRUE(maybe_decimal.ok())) {
+ return maybe_decimal.MoveValueUnsafe();
}
-
- if (!allow_truncate_) {
- *st = maybe_decimal.status();
- }
- return {}; // Zero
+
+ if (!allow_truncate_) {
+ *st = maybe_decimal.status();
+ }
+ return {}; // Zero
}
int32_t out_scale_, out_precision_;
bool allow_truncate_;
};
-template <typename O, typename I>
-struct CastFunctor<O, I,
- enable_if_t<is_decimal_type<O>::value && is_floating_type<I>::value>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+template <typename O, typename I>
+struct CastFunctor<O, I,
+ enable_if_t<is_decimal_type<O>::value && is_floating_type<I>::value>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& options = checked_cast<const CastState*>(ctx->state())->options;
- const auto& out_type = checked_cast<const O&>(*out->type());
- const auto out_scale = out_type.scale();
- const auto out_precision = out_type.precision();
+ const auto& out_type = checked_cast<const O&>(*out->type());
+ const auto out_scale = out_type.scale();
+ const auto out_precision = out_type.precision();
- applicator::ScalarUnaryNotNullStateful<O, I, RealToDecimal> kernel(
+ applicator::ScalarUnaryNotNullStateful<O, I, RealToDecimal> kernel(
RealToDecimal{out_scale, out_precision, options.allow_decimal_truncate});
return kernel.Exec(ctx, batch, out);
}
@@ -537,21 +537,21 @@ struct CastFunctor<O, I,
struct DecimalToReal {
template <typename RealType, typename Arg0Value>
- RealType Call(KernelContext*, const Arg0Value& val, Status*) const {
- return val.template ToReal<RealType>(in_scale_);
+ RealType Call(KernelContext*, const Arg0Value& val, Status*) const {
+ return val.template ToReal<RealType>(in_scale_);
}
int32_t in_scale_;
};
-template <typename O, typename I>
-struct CastFunctor<O, I,
- enable_if_t<is_floating_type<O>::value && is_decimal_type<I>::value>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const auto& in_type = checked_cast<const I&>(*batch[0].type());
- const auto in_scale = in_type.scale();
+template <typename O, typename I>
+struct CastFunctor<O, I,
+ enable_if_t<is_floating_type<O>::value && is_decimal_type<I>::value>> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& in_type = checked_cast<const I&>(*batch[0].type());
+ const auto in_scale = in_type.scale();
- applicator::ScalarUnaryNotNullStateful<O, I, DecimalToReal> kernel(
+ applicator::ScalarUnaryNotNullStateful<O, I, DecimalToReal> kernel(
DecimalToReal{in_scale});
return kernel.Exec(ctx, batch, out);
}
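
RealToDecimal and DecimalToReal are thin per-element adapters over the decimal utilities: FromReal validates precision and scale, ToReal divides by 10^scale. A standalone round trip (public arrow::Decimal128 API; values illustrative):

    #include <iostream>
    #include <arrow/util/decimal.h>

    int main() {
      // RealToDecimal path: FromReal checks the target precision/scale,
      // exactly what the kernel forwards per element.
      auto maybe_dec =
          arrow::Decimal128::FromReal(2.5, /*precision=*/4, /*scale=*/2);
      if (!maybe_dec.ok()) return 1;
      std::cout << maybe_dec->ToString(2) << std::endl;  // 2.50

      // DecimalToReal path: ToReal divides by 10^scale.
      std::cout << maybe_dec->ToReal<double>(2) << std::endl;  // 2.5
      return 0;
    }
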
@@ -595,10 +595,10 @@ std::shared_ptr<CastFunction> GetCastToInteger(std::string name) {
AddCommonNumberCasts<OutType>(out_ty, func.get());
// From decimal to integer
- DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty,
+ DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty,
CastFunctor<OutType, Decimal128Type>::Exec));
- DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty,
- CastFunctor<OutType, Decimal256Type>::Exec));
+ DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty,
+ CastFunctor<OutType, Decimal256Type>::Exec));
return func;
}
@@ -621,18 +621,18 @@ std::shared_ptr<CastFunction> GetCastToFloating(std::string name) {
AddCommonNumberCasts<OutType>(out_ty, func.get());
// From decimal to floating point
- DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty,
+ DCHECK_OK(func->AddKernel(Type::DECIMAL, {InputType(Type::DECIMAL)}, out_ty,
CastFunctor<OutType, Decimal128Type>::Exec));
- DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty,
- CastFunctor<OutType, Decimal256Type>::Exec));
+ DCHECK_OK(func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, out_ty,
+ CastFunctor<OutType, Decimal256Type>::Exec));
return func;
}
-std::shared_ptr<CastFunction> GetCastToDecimal128() {
+std::shared_ptr<CastFunction> GetCastToDecimal128() {
OutputType sig_out_ty(ResolveOutputFromOptions);
- auto func = std::make_shared<CastFunction>("cast_decimal", Type::DECIMAL128);
- AddCommonCasts(Type::DECIMAL128, sig_out_ty, func.get());
+ auto func = std::make_shared<CastFunction>("cast_decimal", Type::DECIMAL128);
+ AddCommonCasts(Type::DECIMAL128, sig_out_ty, func.get());
// Cast from floating point
DCHECK_OK(func->AddKernel(Type::FLOAT, {float32()}, sig_out_ty,
@@ -643,36 +643,36 @@ std::shared_ptr<CastFunction> GetCastToDecimal128() {
// Cast from other decimal
auto exec = CastFunctor<Decimal128Type, Decimal128Type>::Exec;
// We resolve the output type of this kernel from the CastOptions
- DCHECK_OK(
- func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec));
- exec = CastFunctor<Decimal128Type, Decimal256Type>::Exec;
- DCHECK_OK(
- func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec));
+ DCHECK_OK(
+ func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec));
+ exec = CastFunctor<Decimal128Type, Decimal256Type>::Exec;
+ DCHECK_OK(
+ func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec));
+ return func;
+}
+
+std::shared_ptr<CastFunction> GetCastToDecimal256() {
+ OutputType sig_out_ty(ResolveOutputFromOptions);
+
+ auto func = std::make_shared<CastFunction>("cast_decimal256", Type::DECIMAL256);
+ AddCommonCasts(Type::DECIMAL256, sig_out_ty, func.get());
+
+ // Cast from floating point
+ DCHECK_OK(func->AddKernel(Type::FLOAT, {float32()}, sig_out_ty,
+ CastFunctor<Decimal256Type, FloatType>::Exec));
+ DCHECK_OK(func->AddKernel(Type::DOUBLE, {float64()}, sig_out_ty,
+ CastFunctor<Decimal256Type, DoubleType>::Exec));
+
+ // Cast from other decimal
+ auto exec = CastFunctor<Decimal256Type, Decimal128Type>::Exec;
+ DCHECK_OK(
+ func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec));
+ exec = CastFunctor<Decimal256Type, Decimal256Type>::Exec;
+ DCHECK_OK(
+ func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec));
return func;
}
-std::shared_ptr<CastFunction> GetCastToDecimal256() {
- OutputType sig_out_ty(ResolveOutputFromOptions);
-
- auto func = std::make_shared<CastFunction>("cast_decimal256", Type::DECIMAL256);
- AddCommonCasts(Type::DECIMAL256, sig_out_ty, func.get());
-
- // Cast from floating point
- DCHECK_OK(func->AddKernel(Type::FLOAT, {float32()}, sig_out_ty,
- CastFunctor<Decimal256Type, FloatType>::Exec));
- DCHECK_OK(func->AddKernel(Type::DOUBLE, {float64()}, sig_out_ty,
- CastFunctor<Decimal256Type, DoubleType>::Exec));
-
- // Cast from other decimal
- auto exec = CastFunctor<Decimal256Type, Decimal128Type>::Exec;
- DCHECK_OK(
- func->AddKernel(Type::DECIMAL128, {InputType(Type::DECIMAL128)}, sig_out_ty, exec));
- exec = CastFunctor<Decimal256Type, Decimal256Type>::Exec;
- DCHECK_OK(
- func->AddKernel(Type::DECIMAL256, {InputType(Type::DECIMAL256)}, sig_out_ty, exec));
- return func;
-}
-
} // namespace
std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
@@ -681,8 +681,8 @@ std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
// Make a cast to null that does not do much. Not sure why we need to be able
// to cast from dict<null> -> null but there are unit tests for it
auto cast_null = std::make_shared<CastFunction>("cast_null", Type::NA);
- DCHECK_OK(cast_null->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, null(),
- OutputAllNull));
+ DCHECK_OK(cast_null->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, null(),
+ OutputAllNull));
functions.push_back(cast_null);
functions.push_back(GetCastToInteger<Int8Type>("cast_int8"));
@@ -716,8 +716,8 @@ std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
functions.push_back(GetCastToFloating<FloatType>("cast_float"));
functions.push_back(GetCastToFloating<DoubleType>("cast_double"));
- functions.push_back(GetCastToDecimal128());
- functions.push_back(GetCastToDecimal256());
+ functions.push_back(GetCastToDecimal128());
+ functions.push_back(GetCastToDecimal256());
return functions;
}
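
A minimal sketch (not part of this diff; assumes Arrow's public C++ compute API) of how the decimal kernels registered above are reached through compute::Cast — DecimalToReal::Call divides the unscaled value by 10^scale:

#include <iostream>
#include "arrow/api.h"
#include "arrow/compute/api.h"

arrow::Status DecimalToDoubleExample() {
  // Build a decimal128(5, 2) array holding 123.45 (unscaled value 12345).
  arrow::Decimal128Builder builder(arrow::decimal128(5, 2));
  ARROW_RETURN_NOT_OK(builder.Append(arrow::Decimal128(12345)));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> decimals, builder.Finish());

  // Dispatches to CastFunctor<DoubleType, Decimal128Type>, registered by
  // GetCastToFloating above.
  ARROW_ASSIGN_OR_RAISE(arrow::Datum result,
                        arrow::compute::Cast(decimals, arrow::float64()));
  std::cout << result.make_array()->ToString() << std::endl;  // [123.45]
  return arrow::Status::OK();
}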
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
index e24d7fabf37..3ce537b7223 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
@@ -15,15 +15,15 @@
// specific language governing permissions and limitations
// under the License.
-#include <limits>
+#include <limits>
#include "arrow/array/array_base.h"
-#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_binary.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/scalar_cast_internal.h"
#include "arrow/result.h"
#include "arrow/util/formatting.h"
-#include "arrow/util/int_util.h"
+#include "arrow/util/int_util.h"
#include "arrow/util/optional.h"
#include "arrow/util/utf8.h"
#include "arrow/visitor_inline.h"
@@ -37,22 +37,22 @@ using util::ValidateUTF8;
namespace compute {
namespace internal {
-namespace {
-
+namespace {
+
// ----------------------------------------------------------------------
// Number / Boolean to String
-template <typename O, typename I>
-struct NumericToStringCastFunctor {
+template <typename O, typename I>
+struct NumericToStringCastFunctor {
using value_type = typename TypeTraits<I>::CType;
using BuilderType = typename TypeTraits<O>::BuilderType;
using FormatterType = StringFormatter<I>;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- DCHECK(out->is_array());
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK(out->is_array());
const ArrayData& input = *batch[0].array();
ArrayData* output = out->mutable_array();
- return Convert(ctx, input, output);
+ return Convert(ctx, input, output);
}
static Status Convert(KernelContext* ctx, const ArrayData& input, ArrayData* output) {
@@ -73,7 +73,7 @@ struct NumericToStringCastFunctor {
};
// ----------------------------------------------------------------------
-// Binary-like to binary-like
+// Binary-like to binary-like
//
#if defined(_MSC_VER)
@@ -94,152 +94,152 @@ struct Utf8Validator {
};
template <typename I, typename O>
-Status CastBinaryToBinaryOffsets(KernelContext* ctx, const ArrayData& input,
- ArrayData* output) {
- static_assert(std::is_same<I, O>::value, "Cast same-width offsets (no-op)");
- return Status::OK();
-}
+Status CastBinaryToBinaryOffsets(KernelContext* ctx, const ArrayData& input,
+ ArrayData* output) {
+ static_assert(std::is_same<I, O>::value, "Cast same-width offsets (no-op)");
+ return Status::OK();
+}
-// Upcast offsets
+// Upcast offsets
template <>
-Status CastBinaryToBinaryOffsets<int32_t, int64_t>(KernelContext* ctx,
- const ArrayData& input,
- ArrayData* output) {
- using input_offset_type = int32_t;
- using output_offset_type = int64_t;
- ARROW_ASSIGN_OR_RAISE(
- output->buffers[1],
- ctx->Allocate((output->length + output->offset + 1) * sizeof(output_offset_type)));
- memset(output->buffers[1]->mutable_data(), 0,
- output->offset * sizeof(output_offset_type));
- ::arrow::internal::CastInts(input.GetValues<input_offset_type>(1),
- output->GetMutableValues<output_offset_type>(1),
- output->length + 1);
- return Status::OK();
-}
-
-// Downcast offsets
+Status CastBinaryToBinaryOffsets<int32_t, int64_t>(KernelContext* ctx,
+ const ArrayData& input,
+ ArrayData* output) {
+ using input_offset_type = int32_t;
+ using output_offset_type = int64_t;
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[1],
+ ctx->Allocate((output->length + output->offset + 1) * sizeof(output_offset_type)));
+ memset(output->buffers[1]->mutable_data(), 0,
+ output->offset * sizeof(output_offset_type));
+ ::arrow::internal::CastInts(input.GetValues<input_offset_type>(1),
+ output->GetMutableValues<output_offset_type>(1),
+ output->length + 1);
+ return Status::OK();
+}
+
+// Downcast offsets
template <>
-Status CastBinaryToBinaryOffsets<int64_t, int32_t>(KernelContext* ctx,
- const ArrayData& input,
- ArrayData* output) {
- using input_offset_type = int64_t;
- using output_offset_type = int32_t;
-
- constexpr input_offset_type kMaxOffset = std::numeric_limits<output_offset_type>::max();
-
- auto input_offsets = input.GetValues<input_offset_type>(1);
-
- // Binary offsets are ascending, so it's enough to check the last one for overflow.
- if (input_offsets[input.length] > kMaxOffset) {
- return Status::Invalid("Failed casting from ", input.type->ToString(), " to ",
- output->type->ToString(), ": input array too large");
- } else {
- ARROW_ASSIGN_OR_RAISE(output->buffers[1],
- ctx->Allocate((output->length + output->offset + 1) *
- sizeof(output_offset_type)));
- memset(output->buffers[1]->mutable_data(), 0,
- output->offset * sizeof(output_offset_type));
- ::arrow::internal::CastInts(input.GetValues<input_offset_type>(1),
- output->GetMutableValues<output_offset_type>(1),
- output->length + 1);
- return Status::OK();
- }
-}
-
-template <typename O, typename I>
-Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- DCHECK(out->is_array());
- const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
- const ArrayData& input = *batch[0].array();
-
- if (!I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8) {
- InitializeUTF8();
-
- ArrayDataVisitor<I> visitor;
- Utf8Validator validator;
- RETURN_NOT_OK(visitor.Visit(input, &validator));
- }
-
-  // Start with a zero-copy cast, then rewrite the offsets to the expected width
- RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
- return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
- ctx, input, out->mutable_array());
-}
-
+Status CastBinaryToBinaryOffsets<int64_t, int32_t>(KernelContext* ctx,
+ const ArrayData& input,
+ ArrayData* output) {
+ using input_offset_type = int64_t;
+ using output_offset_type = int32_t;
+
+ constexpr input_offset_type kMaxOffset = std::numeric_limits<output_offset_type>::max();
+
+ auto input_offsets = input.GetValues<input_offset_type>(1);
+
+ // Binary offsets are ascending, so it's enough to check the last one for overflow.
+ if (input_offsets[input.length] > kMaxOffset) {
+ return Status::Invalid("Failed casting from ", input.type->ToString(), " to ",
+ output->type->ToString(), ": input array too large");
+ } else {
+ ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+ ctx->Allocate((output->length + output->offset + 1) *
+ sizeof(output_offset_type)));
+ memset(output->buffers[1]->mutable_data(), 0,
+ output->offset * sizeof(output_offset_type));
+ ::arrow::internal::CastInts(input.GetValues<input_offset_type>(1),
+ output->GetMutableValues<output_offset_type>(1),
+ output->length + 1);
+ return Status::OK();
+ }
+}
+
+template <typename O, typename I>
+Status BinaryToBinaryCastExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK(out->is_array());
+ const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
+ const ArrayData& input = *batch[0].array();
+
+ if (!I::is_utf8 && O::is_utf8 && !options.allow_invalid_utf8) {
+ InitializeUTF8();
+
+ ArrayDataVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+
+  // Start with a zero-copy cast, then rewrite the offsets to the expected width
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
+ ctx, input, out->mutable_array());
+}
+
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
-// ----------------------------------------------------------------------
-// Cast functions registration
+// ----------------------------------------------------------------------
+// Cast functions registration
template <typename OutType>
-void AddNumberToStringCasts(CastFunction* func) {
- auto out_ty = TypeTraits<OutType>::type_singleton();
-
+void AddNumberToStringCasts(CastFunction* func) {
+ auto out_ty = TypeTraits<OutType>::type_singleton();
+
DCHECK_OK(func->AddKernel(Type::BOOL, {boolean()}, out_ty,
- TrivialScalarUnaryAsArraysExec(
- NumericToStringCastFunctor<OutType, BooleanType>::Exec),
+ TrivialScalarUnaryAsArraysExec(
+ NumericToStringCastFunctor<OutType, BooleanType>::Exec),
NullHandling::COMPUTED_NO_PREALLOCATE));
for (const std::shared_ptr<DataType>& in_ty : NumericTypes()) {
- DCHECK_OK(
- func->AddKernel(in_ty->id(), {in_ty}, out_ty,
- TrivialScalarUnaryAsArraysExec(
- GenerateNumeric<NumericToStringCastFunctor, OutType>(*in_ty)),
- NullHandling::COMPUTED_NO_PREALLOCATE));
+ DCHECK_OK(
+ func->AddKernel(in_ty->id(), {in_ty}, out_ty,
+ TrivialScalarUnaryAsArraysExec(
+ GenerateNumeric<NumericToStringCastFunctor, OutType>(*in_ty)),
+ NullHandling::COMPUTED_NO_PREALLOCATE));
}
}
-template <typename OutType, typename InType>
-void AddBinaryToBinaryCast(CastFunction* func) {
- auto in_ty = TypeTraits<InType>::type_singleton();
- auto out_ty = TypeTraits<OutType>::type_singleton();
-
- DCHECK_OK(func->AddKernel(
- InType::type_id, {in_ty}, out_ty,
- TrivialScalarUnaryAsArraysExec(BinaryToBinaryCastExec<OutType, InType>),
- NullHandling::COMPUTED_NO_PREALLOCATE));
-}
-
-template <typename OutType>
-void AddBinaryToBinaryCast(CastFunction* func) {
- AddBinaryToBinaryCast<OutType, StringType>(func);
- AddBinaryToBinaryCast<OutType, BinaryType>(func);
- AddBinaryToBinaryCast<OutType, LargeStringType>(func);
- AddBinaryToBinaryCast<OutType, LargeBinaryType>(func);
-}
-
-} // namespace
-
+template <typename OutType, typename InType>
+void AddBinaryToBinaryCast(CastFunction* func) {
+ auto in_ty = TypeTraits<InType>::type_singleton();
+ auto out_ty = TypeTraits<OutType>::type_singleton();
+
+ DCHECK_OK(func->AddKernel(
+ InType::type_id, {in_ty}, out_ty,
+ TrivialScalarUnaryAsArraysExec(BinaryToBinaryCastExec<OutType, InType>),
+ NullHandling::COMPUTED_NO_PREALLOCATE));
+}
+
+template <typename OutType>
+void AddBinaryToBinaryCast(CastFunction* func) {
+ AddBinaryToBinaryCast<OutType, StringType>(func);
+ AddBinaryToBinaryCast<OutType, BinaryType>(func);
+ AddBinaryToBinaryCast<OutType, LargeStringType>(func);
+ AddBinaryToBinaryCast<OutType, LargeBinaryType>(func);
+}
+
+} // namespace
+
std::vector<std::shared_ptr<CastFunction>> GetBinaryLikeCasts() {
auto cast_binary = std::make_shared<CastFunction>("cast_binary", Type::BINARY);
AddCommonCasts(Type::BINARY, binary(), cast_binary.get());
- AddBinaryToBinaryCast<BinaryType>(cast_binary.get());
+ AddBinaryToBinaryCast<BinaryType>(cast_binary.get());
auto cast_large_binary =
std::make_shared<CastFunction>("cast_large_binary", Type::LARGE_BINARY);
AddCommonCasts(Type::LARGE_BINARY, large_binary(), cast_large_binary.get());
- AddBinaryToBinaryCast<LargeBinaryType>(cast_large_binary.get());
+ AddBinaryToBinaryCast<LargeBinaryType>(cast_large_binary.get());
auto cast_string = std::make_shared<CastFunction>("cast_string", Type::STRING);
AddCommonCasts(Type::STRING, utf8(), cast_string.get());
- AddNumberToStringCasts<StringType>(cast_string.get());
- AddBinaryToBinaryCast<StringType>(cast_string.get());
+ AddNumberToStringCasts<StringType>(cast_string.get());
+ AddBinaryToBinaryCast<StringType>(cast_string.get());
auto cast_large_string =
std::make_shared<CastFunction>("cast_large_string", Type::LARGE_STRING);
AddCommonCasts(Type::LARGE_STRING, large_utf8(), cast_large_string.get());
- AddNumberToStringCasts<LargeStringType>(cast_large_string.get());
- AddBinaryToBinaryCast<LargeStringType>(cast_large_string.get());
-
- auto cast_fsb =
- std::make_shared<CastFunction>("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY);
- AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions),
- cast_fsb.get());
-
- return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb};
+ AddNumberToStringCasts<LargeStringType>(cast_large_string.get());
+ AddBinaryToBinaryCast<LargeStringType>(cast_large_string.get());
+
+ auto cast_fsb =
+ std::make_shared<CastFunction>("cast_fixed_size_binary", Type::FIXED_SIZE_BINARY);
+ AddCommonCasts(Type::FIXED_SIZE_BINARY, OutputType(ResolveOutputFromOptions),
+ cast_fsb.get());
+
+ return {cast_binary, cast_large_binary, cast_string, cast_large_string, cast_fsb};
}
} // namespace internal
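
A minimal sketch (assumed usage of the public Cast API, not taken from this diff) of the binary-to-binary path above: the value buffer is zero-copied and only the offsets are rewritten, with the int64 -> int32 downcast returning Status::Invalid when the final offset would overflow:

#include "arrow/api.h"
#include "arrow/compute/api.h"

arrow::Status LargeStringToStringExample() {
  arrow::LargeStringBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Append("hello"));
  ARROW_RETURN_NOT_OK(builder.Append("arrow"));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> large, builder.Finish());

  // BinaryToBinaryCastExec zero-copies the data buffer, then
  // CastBinaryToBinaryOffsets<int64_t, int32_t> narrows the offsets; an input
  // larger than INT32_MAX bytes would instead fail with Status::Invalid.
  ARROW_ASSIGN_OR_RAISE(arrow::Datum result,
                        arrow::compute::Cast(large, arrow::utf8()));
  return arrow::Status::OK();
}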
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
index b5271e02413..1a58fce7c74 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
@@ -19,7 +19,7 @@
#include <limits>
-#include "arrow/array/builder_time.h"
+#include "arrow/array/builder_time.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/compute/kernels/scalar_cast_internal.h"
#include "arrow/util/bitmap_reader.h"
@@ -39,10 +39,10 @@ constexpr int64_t kMillisecondsInDay = 86400000;
// From one timestamp to another
template <typename in_type, typename out_type>
-Status ShiftTime(KernelContext* ctx, const util::DivideOrMultiply factor_op,
- const int64_t factor, const ArrayData& input, ArrayData* output) {
+Status ShiftTime(KernelContext* ctx, const util::DivideOrMultiply factor_op,
+ const int64_t factor, const ArrayData& input, ArrayData* output) {
const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
- auto in_data = input.GetValues<in_type>(1);
+ auto in_data = input.GetValues<in_type>(1);
auto out_data = output->GetMutableValues<out_type>(1);
if (factor == 1) {
@@ -55,10 +55,10 @@ Status ShiftTime(KernelContext* ctx, const util::DivideOrMultiply factor_op,
out_data[i] = static_cast<out_type>(in_data[i] * factor);
}
} else {
-#define RAISE_OVERFLOW_CAST(VAL) \
- return Status::Invalid("Casting from ", input.type->ToString(), " to ", \
- output->type->ToString(), " would result in ", \
- "out of bounds timestamp: ", VAL);
+#define RAISE_OVERFLOW_CAST(VAL) \
+ return Status::Invalid("Casting from ", input.type->ToString(), " to ", \
+ output->type->ToString(), " would result in ", \
+ "out of bounds timestamp: ", VAL);
int64_t max_val = std::numeric_limits<int64_t>::max() / factor;
int64_t min_val = std::numeric_limits<int64_t>::min() / factor;
@@ -88,9 +88,9 @@ Status ShiftTime(KernelContext* ctx, const util::DivideOrMultiply factor_op,
out_data[i] = static_cast<out_type>(in_data[i] / factor);
}
} else {
-#define RAISE_INVALID_CAST(VAL) \
- return Status::Invalid("Casting from ", input.type->ToString(), " to ", \
- output->type->ToString(), " would lose data: ", VAL);
+#define RAISE_INVALID_CAST(VAL) \
+ return Status::Invalid("Casting from ", input.type->ToString(), " to ", \
+ output->type->ToString(), " would lose data: ", VAL);
if (input.null_count != 0) {
BitmapReader bit_reader(input.buffers[0]->data(), input.offset, input.length);
@@ -113,8 +113,8 @@ Status ShiftTime(KernelContext* ctx, const util::DivideOrMultiply factor_op,
#undef RAISE_INVALID_CAST
}
}
-
- return Status::OK();
+
+ return Status::OK();
}
// <TimestampType, TimestampType> and <DurationType, DurationType>
@@ -123,7 +123,7 @@ struct CastFunctor<
O, I,
enable_if_t<(is_timestamp_type<O>::value && is_timestamp_type<I>::value) ||
(is_duration_type<O>::value && is_duration_type<I>::value)>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
const ArrayData& input = *batch[0].array();
@@ -137,14 +137,14 @@ struct CastFunctor<
// lengths to make this zero copy in the future but we leave it for now
auto conversion = util::GetTimestampConversion(in_type.unit(), out_type.unit());
- return ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second, input,
- output);
+ return ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second, input,
+ output);
}
};
template <>
struct CastFunctor<Date32Type, TimestampType> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
const ArrayData& input = *batch[0].array();
@@ -160,13 +160,13 @@ struct CastFunctor<Date32Type, TimestampType> {
};
const int64_t factor = kTimestampToDateFactors[static_cast<int>(in_type.unit())];
- return ShiftTime<int64_t, int32_t>(ctx, util::DIVIDE, factor, input, output);
+ return ShiftTime<int64_t, int32_t>(ctx, util::DIVIDE, factor, input, output);
}
};
template <>
struct CastFunctor<Date64Type, TimestampType> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options;
@@ -175,8 +175,8 @@ struct CastFunctor<Date64Type, TimestampType> {
const auto& in_type = checked_cast<const TimestampType&>(*input.type);
auto conversion = util::GetTimestampConversion(in_type.unit(), TimeUnit::MILLI);
- RETURN_NOT_OK((ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second,
- input, output)));
+ RETURN_NOT_OK((ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second,
+ input, output)));
// Ensure that intraday milliseconds have been zeroed out
auto out_data = output->GetMutableValues<int64_t>(1);
@@ -188,7 +188,7 @@ struct CastFunctor<Date64Type, TimestampType> {
const int64_t remainder = out_data[i] % kMillisecondsInDay;
if (ARROW_PREDICT_FALSE(!options.allow_time_truncate && bit_reader.IsSet() &&
remainder > 0)) {
- return Status::Invalid("Timestamp value had non-zero intraday milliseconds");
+ return Status::Invalid("Timestamp value had non-zero intraday milliseconds");
}
out_data[i] -= remainder;
bit_reader.Next();
@@ -197,13 +197,13 @@ struct CastFunctor<Date64Type, TimestampType> {
for (int64_t i = 0; i < input.length; ++i) {
const int64_t remainder = out_data[i] % kMillisecondsInDay;
if (ARROW_PREDICT_FALSE(!options.allow_time_truncate && remainder > 0)) {
- return Status::Invalid("Timestamp value had non-zero intraday milliseconds");
+ return Status::Invalid("Timestamp value had non-zero intraday milliseconds");
}
out_data[i] -= remainder;
}
}
-
- return Status::OK();
+
+ return Status::OK();
}
};
@@ -215,7 +215,7 @@ struct CastFunctor<O, I, enable_if_t<is_time_type<I>::value && is_time_type<O>::
using in_t = typename I::c_type;
using out_t = typename O::c_type;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
const ArrayData& input = *batch[0].array();
@@ -226,8 +226,8 @@ struct CastFunctor<O, I, enable_if_t<is_time_type<I>::value && is_time_type<O>::
const auto& out_type = checked_cast<const O&>(*output->type);
DCHECK_NE(in_type.unit(), out_type.unit()) << "Do not cast equal types";
auto conversion = util::GetTimestampConversion(in_type.unit(), out_type.unit());
- return ShiftTime<in_t, out_t>(ctx, conversion.first, conversion.second, input,
- output);
+ return ShiftTime<in_t, out_t>(ctx, conversion.first, conversion.second, input,
+ output);
}
};
@@ -236,68 +236,68 @@ struct CastFunctor<O, I, enable_if_t<is_time_type<I>::value && is_time_type<O>::
template <>
struct CastFunctor<Date64Type, Date32Type> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
- return ShiftTime<int32_t, int64_t>(ctx, util::MULTIPLY, kMillisecondsInDay,
- *batch[0].array(), out->mutable_array());
+ return ShiftTime<int32_t, int64_t>(ctx, util::MULTIPLY, kMillisecondsInDay,
+ *batch[0].array(), out->mutable_array());
}
};
template <>
struct CastFunctor<Date32Type, Date64Type> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
- return ShiftTime<int64_t, int32_t>(ctx, util::DIVIDE, kMillisecondsInDay,
- *batch[0].array(), out->mutable_array());
+ return ShiftTime<int64_t, int32_t>(ctx, util::DIVIDE, kMillisecondsInDay,
+ *batch[0].array(), out->mutable_array());
+ }
+};
+
+// ----------------------------------------------------------------------
+// date32, date64 to timestamp
+
+template <>
+struct CastFunctor<TimestampType, Date32Type> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+
+ const auto& out_type = checked_cast<const TimestampType&>(*out->type());
+ // get conversion SECOND -> unit
+ auto conversion = util::GetTimestampConversion(TimeUnit::SECOND, out_type.unit());
+ DCHECK_EQ(conversion.first, util::MULTIPLY);
+
+ // multiply to achieve days -> unit
+ conversion.second *= kMillisecondsInDay / 1000;
+ return ShiftTime<int32_t, int64_t>(ctx, util::MULTIPLY, conversion.second,
+ *batch[0].array(), out->mutable_array());
+ }
+};
+
+template <>
+struct CastFunctor<TimestampType, Date64Type> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+
+ const auto& out_type = checked_cast<const TimestampType&>(*out->type());
+
+ // date64 is ms since epoch
+ auto conversion = util::GetTimestampConversion(TimeUnit::MILLI, out_type.unit());
+ return ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second,
+ *batch[0].array(), out->mutable_array());
}
};
// ----------------------------------------------------------------------
-// date32, date64 to timestamp
-
-template <>
-struct CastFunctor<TimestampType, Date32Type> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
-
- const auto& out_type = checked_cast<const TimestampType&>(*out->type());
- // get conversion SECOND -> unit
- auto conversion = util::GetTimestampConversion(TimeUnit::SECOND, out_type.unit());
- DCHECK_EQ(conversion.first, util::MULTIPLY);
-
- // multiply to achieve days -> unit
- conversion.second *= kMillisecondsInDay / 1000;
- return ShiftTime<int32_t, int64_t>(ctx, util::MULTIPLY, conversion.second,
- *batch[0].array(), out->mutable_array());
- }
-};
-
-template <>
-struct CastFunctor<TimestampType, Date64Type> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
-
- const auto& out_type = checked_cast<const TimestampType&>(*out->type());
-
- // date64 is ms since epoch
- auto conversion = util::GetTimestampConversion(TimeUnit::MILLI, out_type.unit());
- return ShiftTime<int64_t, int64_t>(ctx, conversion.first, conversion.second,
- *batch[0].array(), out->mutable_array());
- }
-};
-
-// ----------------------------------------------------------------------
// String to Timestamp
struct ParseTimestamp {
template <typename OutValue, typename Arg0Value>
- OutValue Call(KernelContext*, Arg0Value val, Status* st) const {
+ OutValue Call(KernelContext*, Arg0Value val, Status* st) const {
OutValue result = 0;
if (ARROW_PREDICT_FALSE(!ParseValue(type, val.data(), val.size(), &result))) {
- *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
- type.ToString());
+ *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
+ type.ToString());
}
return result;
}
@@ -307,7 +307,7 @@ struct ParseTimestamp {
template <typename I>
struct CastFunctor<TimestampType, I, enable_if_t<is_base_binary_type<I>::value>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const auto& out_type = checked_cast<const TimestampType&>(*out->type());
applicator::ScalarUnaryNotNullStateful<TimestampType, I, ParseTimestamp> kernel(
ParseTimestamp{out_type});
@@ -318,7 +318,7 @@ struct CastFunctor<TimestampType, I, enable_if_t<is_base_binary_type<I>::value>>
template <typename Type>
void AddCrossUnitCast(CastFunction* func) {
ScalarKernel kernel;
- kernel.exec = TrivialScalarUnaryAsArraysExec(CastFunctor<Type, Type>::Exec);
+ kernel.exec = TrivialScalarUnaryAsArraysExec(CastFunctor<Type, Type>::Exec);
kernel.signature = KernelSignature::Make({InputType(Type::type_id)}, kOutputTargetType);
DCHECK_OK(func->AddKernel(Type::type_id, std::move(kernel)));
}
@@ -417,11 +417,11 @@ std::shared_ptr<CastFunction> GetTimestampCast() {
AddZeroCopyCast(Type::INT64, /*in_type=*/int64(), kOutputTargetType, func.get());
// From date types
- // TODO: ARROW-8876, these casts are not directly tested
- AddSimpleCast<Date32Type, TimestampType>(InputType(Type::DATE32), kOutputTargetType,
- func.get());
- AddSimpleCast<Date64Type, TimestampType>(InputType(Type::DATE64), kOutputTargetType,
- func.get());
+ // TODO: ARROW-8876, these casts are not directly tested
+ AddSimpleCast<Date32Type, TimestampType>(InputType(Type::DATE32), kOutputTargetType,
+ func.get());
+ AddSimpleCast<Date64Type, TimestampType>(InputType(Type::DATE64), kOutputTargetType,
+ func.get());
// string -> timestamp
AddSimpleCast<StringType, TimestampType>(utf8(), kOutputTargetType, func.get());
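
A minimal sketch (assumed usage, not from this diff) of the date-to-timestamp kernels registered above; CastFunctor<TimestampType, Date32Type> multiplies the day count by 86400 times the target-unit factor via ShiftTime:

#include "arrow/api.h"
#include "arrow/compute/api.h"

arrow::Status Date32ToTimestampExample() {
  arrow::Date32Builder builder;
  ARROW_RETURN_NOT_OK(builder.Append(1));  // 1970-01-02 as days since epoch
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> days, builder.Finish());

  // days -> seconds: ShiftTime<int32_t, int64_t> with util::MULTIPLY and
  // factor 86400 (kMillisecondsInDay / 1000).
  ARROW_ASSIGN_OR_RAISE(
      arrow::Datum result,
      arrow::compute::Cast(days, arrow::timestamp(arrow::TimeUnit::SECOND)));
  // result holds [86400] as timestamp[s]
  return arrow::Status::OK();
}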
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc
index 777a7c9d5ee..4342d776c38 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_compare.cc
@@ -15,12 +15,12 @@
// specific language governing permissions and limitations
// under the License.
-#include <cmath>
-#include <limits>
-
-#include "arrow/compute/api_scalar.h"
+#include <cmath>
+#include <limits>
+
+#include "arrow/compute/api_scalar.h"
#include "arrow/compute/kernels/common.h"
-#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_ops.h"
namespace arrow {
@@ -34,110 +34,110 @@ namespace internal {
namespace {
struct Equal {
- template <typename T, typename Arg0, typename Arg1>
- static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
- static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
+ static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
return left == right;
}
};
struct NotEqual {
- template <typename T, typename Arg0, typename Arg1>
- static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
- static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
+ static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
return left != right;
}
};
struct Greater {
- template <typename T, typename Arg0, typename Arg1>
- static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
- static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
+ static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
return left > right;
}
};
struct GreaterEqual {
- template <typename T, typename Arg0, typename Arg1>
- static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
- static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
+ template <typename T, typename Arg0, typename Arg1>
+ static constexpr T Call(KernelContext*, const Arg0& left, const Arg1& right, Status*) {
+ static_assert(std::is_same<T, bool>::value && std::is_same<Arg0, Arg1>::value, "");
return left >= right;
}
};
-template <typename T>
-using is_unsigned_integer = std::integral_constant<bool, std::is_integral<T>::value &&
- std::is_unsigned<T>::value>;
-
-template <typename T>
-using is_signed_integer =
- std::integral_constant<bool, std::is_integral<T>::value && std::is_signed<T>::value>;
-
-template <typename T>
-using enable_if_integer =
- enable_if_t<is_signed_integer<T>::value || is_unsigned_integer<T>::value, T>;
-
-template <typename T>
-using enable_if_floating_point = enable_if_t<std::is_floating_point<T>::value, T>;
-
-struct Minimum {
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(Arg0 left, Arg1 right) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
- return std::fmin(left, right);
- }
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(Arg0 left, Arg1 right) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
- return std::min(left, right);
- }
-
- template <typename T>
- static constexpr enable_if_t<std::is_same<float, T>::value, T> antiextreme() {
- return std::nanf("");
- }
-
- template <typename T>
- static constexpr enable_if_t<std::is_same<double, T>::value, T> antiextreme() {
- return std::nan("");
- }
-
- template <typename T>
- static constexpr enable_if_integer<T> antiextreme() {
- return std::numeric_limits<T>::max();
- }
-};
-
-struct Maximum {
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_floating_point<T> Call(Arg0 left, Arg1 right) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
- return std::fmax(left, right);
- }
-
- template <typename T, typename Arg0, typename Arg1>
- static enable_if_integer<T> Call(Arg0 left, Arg1 right) {
- static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
- return std::max(left, right);
- }
-
- template <typename T>
- static constexpr enable_if_t<std::is_same<float, T>::value, T> antiextreme() {
- return std::nanf("");
- }
-
- template <typename T>
- static constexpr enable_if_t<std::is_same<double, T>::value, T> antiextreme() {
- return std::nan("");
- }
-
- template <typename T>
- static constexpr enable_if_integer<T> antiextreme() {
- return std::numeric_limits<T>::min();
- }
-};
-
+template <typename T>
+using is_unsigned_integer = std::integral_constant<bool, std::is_integral<T>::value &&
+ std::is_unsigned<T>::value>;
+
+template <typename T>
+using is_signed_integer =
+ std::integral_constant<bool, std::is_integral<T>::value && std::is_signed<T>::value>;
+
+template <typename T>
+using enable_if_integer =
+ enable_if_t<is_signed_integer<T>::value || is_unsigned_integer<T>::value, T>;
+
+template <typename T>
+using enable_if_floating_point = enable_if_t<std::is_floating_point<T>::value, T>;
+
+struct Minimum {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(Arg0 left, Arg1 right) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
+ return std::fmin(left, right);
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(Arg0 left, Arg1 right) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
+ return std::min(left, right);
+ }
+
+ template <typename T>
+ static constexpr enable_if_t<std::is_same<float, T>::value, T> antiextreme() {
+ return std::nanf("");
+ }
+
+ template <typename T>
+ static constexpr enable_if_t<std::is_same<double, T>::value, T> antiextreme() {
+ return std::nan("");
+ }
+
+ template <typename T>
+ static constexpr enable_if_integer<T> antiextreme() {
+ return std::numeric_limits<T>::max();
+ }
+};
+
+struct Maximum {
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_floating_point<T> Call(Arg0 left, Arg1 right) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
+ return std::fmax(left, right);
+ }
+
+ template <typename T, typename Arg0, typename Arg1>
+ static enable_if_integer<T> Call(Arg0 left, Arg1 right) {
+ static_assert(std::is_same<T, Arg0>::value && std::is_same<Arg0, Arg1>::value, "");
+ return std::max(left, right);
+ }
+
+ template <typename T>
+ static constexpr enable_if_t<std::is_same<float, T>::value, T> antiextreme() {
+ return std::nanf("");
+ }
+
+ template <typename T>
+ static constexpr enable_if_t<std::is_same<double, T>::value, T> antiextreme() {
+ return std::nan("");
+ }
+
+ template <typename T>
+ static constexpr enable_if_integer<T> antiextreme() {
+ return std::numeric_limits<T>::min();
+ }
+};
+
// Implement Less, LessEqual by flipping arguments to Greater, GreaterEqual
template <typename Op>
@@ -154,57 +154,57 @@ void AddGenericCompare(const std::shared_ptr<DataType>& ty, ScalarFunction* func
applicator::ScalarBinaryEqualTypes<BooleanType, InType, Op>::Exec));
}
-struct CompareFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- RETURN_NOT_OK(CheckArity(*values));
-
- using arrow::compute::detail::DispatchExactImpl;
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
-
- EnsureDictionaryDecoded(values);
- ReplaceNullWithOtherType(values);
-
- if (auto type = CommonNumeric(*values)) {
- ReplaceTypes(type, values);
- } else if (auto type = CommonTimestamp(*values)) {
- ReplaceTypes(type, values);
- } else if (auto type = CommonBinary(*values)) {
- ReplaceTypes(type, values);
- }
-
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-};
-
-struct VarArgsCompareFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- RETURN_NOT_OK(CheckArity(*values));
-
- using arrow::compute::detail::DispatchExactImpl;
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
-
- EnsureDictionaryDecoded(values);
-
- if (auto type = CommonNumeric(*values)) {
- ReplaceTypes(type, values);
- } else if (auto type = CommonTimestamp(*values)) {
- ReplaceTypes(type, values);
- }
-
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-};
-
+struct CompareFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ EnsureDictionaryDecoded(values);
+ ReplaceNullWithOtherType(values);
+
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ } else if (auto type = CommonTimestamp(*values)) {
+ ReplaceTypes(type, values);
+ } else if (auto type = CommonBinary(*values)) {
+ ReplaceTypes(type, values);
+ }
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
+
+struct VarArgsCompareFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ EnsureDictionaryDecoded(values);
+
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ } else if (auto type = CommonTimestamp(*values)) {
+ ReplaceTypes(type, values);
+ }
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
+
template <typename Op>
-std::shared_ptr<ScalarFunction> MakeCompareFunction(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<CompareFunction>(name, Arity::Binary(), doc);
+std::shared_ptr<ScalarFunction> MakeCompareFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<CompareFunction>(name, Arity::Binary(), doc);
DCHECK_OK(func->AddKernel(
{boolean(), boolean()}, boolean(),
@@ -263,9 +263,9 @@ std::shared_ptr<ScalarFunction> MakeCompareFunction(std::string name,
}
std::shared_ptr<ScalarFunction> MakeFlippedFunction(std::string name,
- const ScalarFunction& func,
- const FunctionDoc* doc) {
- auto flipped_func = std::make_shared<CompareFunction>(name, Arity::Binary(), doc);
+ const ScalarFunction& func,
+ const FunctionDoc* doc) {
+ auto flipped_func = std::make_shared<CompareFunction>(name, Arity::Binary(), doc);
for (const ScalarKernel* kernel : func.kernels()) {
ScalarKernel flipped_kernel = *kernel;
flipped_kernel.exec = MakeFlippedBinaryExec(kernel->exec);
@@ -274,249 +274,249 @@ std::shared_ptr<ScalarFunction> MakeFlippedFunction(std::string name,
return flipped_func;
}
-using MinMaxState = OptionsWrapper<ElementWiseAggregateOptions>;
-
-// Implement a variadic scalar min/max kernel.
-template <typename OutType, typename Op>
-struct ScalarMinMax {
- using OutValue = typename GetOutputType<OutType>::T;
-
- static void ExecScalar(const ExecBatch& batch,
- const ElementWiseAggregateOptions& options, Scalar* out) {
- // All arguments are scalar
- OutValue value{};
- bool valid = false;
- for (const auto& arg : batch.values) {
-      // Ignore non-scalar arguments so this helper also works in the mixed-scalar-and-array case
- if (!arg.is_scalar()) continue;
- const auto& scalar = *arg.scalar();
- if (!scalar.is_valid) {
- if (options.skip_nulls) continue;
- out->is_valid = false;
- return;
- }
- if (!valid) {
- value = UnboxScalar<OutType>::Unbox(scalar);
- valid = true;
- } else {
- value = Op::template Call<OutValue, OutValue, OutValue>(
- value, UnboxScalar<OutType>::Unbox(scalar));
- }
- }
- out->is_valid = valid;
- if (valid) {
- BoxScalar<OutType>::Box(value, out);
- }
- }
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
- const auto descrs = batch.GetDescriptors();
- const size_t scalar_count =
- static_cast<size_t>(std::count_if(batch.values.begin(), batch.values.end(),
- [](const Datum& d) { return d.is_scalar(); }));
- if (scalar_count == batch.values.size()) {
- ExecScalar(batch, options, out->scalar().get());
- return Status::OK();
- }
-
- ArrayData* output = out->mutable_array();
-
- // At least one array, two or more arguments
- ArrayDataVector arrays;
- for (const auto& arg : batch.values) {
- if (!arg.is_array()) continue;
- arrays.push_back(arg.array());
- }
-
- bool initialize_output = true;
- if (scalar_count > 0) {
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> temp_scalar,
- MakeScalar(out->type(), 0));
- ExecScalar(batch, options, temp_scalar.get());
- if (temp_scalar->is_valid) {
- const auto value = UnboxScalar<OutType>::Unbox(*temp_scalar);
- initialize_output = false;
- OutValue* out = output->GetMutableValues<OutValue>(1);
- std::fill(out, out + batch.length, value);
- } else if (!options.skip_nulls) {
- // Abort early
- ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(*temp_scalar, batch.length,
- ctx->memory_pool()));
- *output = *array->data();
- return Status::OK();
- }
- }
-
- if (initialize_output) {
- OutValue* out = output->GetMutableValues<OutValue>(1);
- std::fill(out, out + batch.length, Op::template antiextreme<OutValue>());
- }
-
- // Precompute the validity buffer
- if (options.skip_nulls && initialize_output) {
- // OR together the validity buffers of all arrays
- if (std::all_of(arrays.begin(), arrays.end(),
- [](const std::shared_ptr<ArrayData>& arr) {
- return arr->MayHaveNulls();
- })) {
- for (const auto& arr : arrays) {
- if (!arr->MayHaveNulls()) continue;
- if (!output->buffers[0]) {
- ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(batch.length));
-          ::arrow::internal::CopyBitmap(arr->buffers[0]->data(), arr->offset,
-                                        batch.length,
-                                        output->buffers[0]->mutable_data(),
-                                        /*dest_offset=*/0);
- } else {
- ::arrow::internal::BitmapOr(
- output->buffers[0]->data(), /*left_offset=*/0, arr->buffers[0]->data(),
- arr->offset, batch.length,
- /*out_offset=*/0, output->buffers[0]->mutable_data());
- }
- }
- }
- } else if (!options.skip_nulls) {
- // AND together the validity buffers of all arrays
- for (const auto& arr : arrays) {
- if (!arr->MayHaveNulls()) continue;
- if (!output->buffers[0]) {
- ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(batch.length));
- ::arrow::internal::CopyBitmap(arr->buffers[0]->data(), arr->offset,
- batch.length, output->buffers[0]->mutable_data(),
- /*dest_offset=*/0);
- } else {
- ::arrow::internal::BitmapAnd(output->buffers[0]->data(), /*left_offset=*/0,
- arr->buffers[0]->data(), arr->offset, batch.length,
- /*out_offset=*/0,
- output->buffers[0]->mutable_data());
- }
- }
- }
-
- for (const auto& array : arrays) {
- OutputArrayWriter<OutType> writer(out->mutable_array());
- ArrayIterator<OutType> out_it(*output);
- int64_t index = 0;
- VisitArrayValuesInline<OutType>(
- *array,
- [&](OutValue value) {
- auto u = out_it();
- if (!output->buffers[0] ||
- BitUtil::GetBit(output->buffers[0]->data(), index)) {
- writer.Write(Op::template Call<OutValue, OutValue, OutValue>(u, value));
- } else {
- writer.Write(value);
- }
- index++;
- },
- [&]() {
- // RHS is null, preserve the LHS
- writer.values++;
- index++;
- out_it();
- });
- }
- output->null_count = output->buffers[0] ? -1 : 0;
- return Status::OK();
- }
-};
-
-template <typename Op>
-std::shared_ptr<ScalarFunction> MakeScalarMinMax(std::string name,
- const FunctionDoc* doc) {
- static auto default_element_wise_aggregate_options =
- ElementWiseAggregateOptions::Defaults();
-
- auto func = std::make_shared<VarArgsCompareFunction>(
- name, Arity::VarArgs(), doc, &default_element_wise_aggregate_options);
- for (const auto& ty : NumericTypes()) {
- auto exec = GeneratePhysicalNumeric<ScalarMinMax, Op>(ty);
- ScalarKernel kernel{KernelSignature::Make({ty}, ty, /*is_varargs=*/true), exec,
- MinMaxState::Init};
- kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
- for (const auto& ty : TemporalTypes()) {
- auto exec = GeneratePhysicalNumeric<ScalarMinMax, Op>(ty);
- ScalarKernel kernel{KernelSignature::Make({ty}, ty, /*is_varargs=*/true), exec,
- MinMaxState::Init};
- kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
- return func;
-}
-
-const FunctionDoc equal_doc{"Compare values for equality (x == y)",
- ("A null on either side emits a null comparison result."),
- {"x", "y"}};
-
-const FunctionDoc not_equal_doc{"Compare values for inequality (x != y)",
- ("A null on either side emits a null comparison result."),
- {"x", "y"}};
-
-const FunctionDoc greater_doc{"Compare values for ordered inequality (x > y)",
- ("A null on either side emits a null comparison result."),
- {"x", "y"}};
-
-const FunctionDoc greater_equal_doc{
- "Compare values for ordered inequality (x >= y)",
- ("A null on either side emits a null comparison result."),
- {"x", "y"}};
-
-const FunctionDoc less_doc{"Compare values for ordered inequality (x < y)",
- ("A null on either side emits a null comparison result."),
- {"x", "y"}};
-
-const FunctionDoc less_equal_doc{
- "Compare values for ordered inequality (x <= y)",
- ("A null on either side emits a null comparison result."),
- {"x", "y"}};
-
-const FunctionDoc min_element_wise_doc{
- "Find the element-wise minimum value",
- ("Nulls will be ignored (default) or propagated. "
- "NaN will be taken over null, but not over any valid float."),
- {"*args"},
- "ElementWiseAggregateOptions"};
-
-const FunctionDoc max_element_wise_doc{
- "Find the element-wise maximum value",
- ("Nulls will be ignored (default) or propagated. "
- "NaN will be taken over null, but not over any valid float."),
- {"*args"},
- "ElementWiseAggregateOptions"};
+using MinMaxState = OptionsWrapper<ElementWiseAggregateOptions>;
+
+// Implement a variadic scalar min/max kernel.
+template <typename OutType, typename Op>
+struct ScalarMinMax {
+ using OutValue = typename GetOutputType<OutType>::T;
+
+ static void ExecScalar(const ExecBatch& batch,
+ const ElementWiseAggregateOptions& options, Scalar* out) {
+ // All arguments are scalar
+ OutValue value{};
+ bool valid = false;
+ for (const auto& arg : batch.values) {
+      // Ignore non-scalar arguments so this helper also works in the mixed-scalar-and-array case
+ if (!arg.is_scalar()) continue;
+ const auto& scalar = *arg.scalar();
+ if (!scalar.is_valid) {
+ if (options.skip_nulls) continue;
+ out->is_valid = false;
+ return;
+ }
+ if (!valid) {
+ value = UnboxScalar<OutType>::Unbox(scalar);
+ valid = true;
+ } else {
+ value = Op::template Call<OutValue, OutValue, OutValue>(
+ value, UnboxScalar<OutType>::Unbox(scalar));
+ }
+ }
+ out->is_valid = valid;
+ if (valid) {
+ BoxScalar<OutType>::Box(value, out);
+ }
+ }
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
+ const auto descrs = batch.GetDescriptors();
+ const size_t scalar_count =
+ static_cast<size_t>(std::count_if(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); }));
+ if (scalar_count == batch.values.size()) {
+ ExecScalar(batch, options, out->scalar().get());
+ return Status::OK();
+ }
+
+ ArrayData* output = out->mutable_array();
+
+ // At least one array, two or more arguments
+ ArrayDataVector arrays;
+ for (const auto& arg : batch.values) {
+ if (!arg.is_array()) continue;
+ arrays.push_back(arg.array());
+ }
+
+ bool initialize_output = true;
+ if (scalar_count > 0) {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> temp_scalar,
+ MakeScalar(out->type(), 0));
+ ExecScalar(batch, options, temp_scalar.get());
+ if (temp_scalar->is_valid) {
+ const auto value = UnboxScalar<OutType>::Unbox(*temp_scalar);
+ initialize_output = false;
+ OutValue* out = output->GetMutableValues<OutValue>(1);
+ std::fill(out, out + batch.length, value);
+ } else if (!options.skip_nulls) {
+ // Abort early
+ ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(*temp_scalar, batch.length,
+ ctx->memory_pool()));
+ *output = *array->data();
+ return Status::OK();
+ }
+ }
+
+ if (initialize_output) {
+ OutValue* out = output->GetMutableValues<OutValue>(1);
+ std::fill(out, out + batch.length, Op::template antiextreme<OutValue>());
+ }
+
+ // Precompute the validity buffer
+ if (options.skip_nulls && initialize_output) {
+ // OR together the validity buffers of all arrays
+ if (std::all_of(arrays.begin(), arrays.end(),
+ [](const std::shared_ptr<ArrayData>& arr) {
+ return arr->MayHaveNulls();
+ })) {
+ for (const auto& arr : arrays) {
+ if (!arr->MayHaveNulls()) continue;
+ if (!output->buffers[0]) {
+ ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(batch.length));
+          ::arrow::internal::CopyBitmap(arr->buffers[0]->data(), arr->offset,
+                                        batch.length,
+                                        output->buffers[0]->mutable_data(),
+                                        /*dest_offset=*/0);
+ } else {
+ ::arrow::internal::BitmapOr(
+ output->buffers[0]->data(), /*left_offset=*/0, arr->buffers[0]->data(),
+ arr->offset, batch.length,
+ /*out_offset=*/0, output->buffers[0]->mutable_data());
+ }
+ }
+ }
+ } else if (!options.skip_nulls) {
+ // AND together the validity buffers of all arrays
+ for (const auto& arr : arrays) {
+ if (!arr->MayHaveNulls()) continue;
+ if (!output->buffers[0]) {
+ ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(batch.length));
+ ::arrow::internal::CopyBitmap(arr->buffers[0]->data(), arr->offset,
+ batch.length, output->buffers[0]->mutable_data(),
+ /*dest_offset=*/0);
+ } else {
+ ::arrow::internal::BitmapAnd(output->buffers[0]->data(), /*left_offset=*/0,
+ arr->buffers[0]->data(), arr->offset, batch.length,
+ /*out_offset=*/0,
+ output->buffers[0]->mutable_data());
+ }
+ }
+ }
+
+ for (const auto& array : arrays) {
+ OutputArrayWriter<OutType> writer(out->mutable_array());
+ ArrayIterator<OutType> out_it(*output);
+ int64_t index = 0;
+ VisitArrayValuesInline<OutType>(
+ *array,
+ [&](OutValue value) {
+ auto u = out_it();
+ if (!output->buffers[0] ||
+ BitUtil::GetBit(output->buffers[0]->data(), index)) {
+ writer.Write(Op::template Call<OutValue, OutValue, OutValue>(u, value));
+ } else {
+ writer.Write(value);
+ }
+ index++;
+ },
+ [&]() {
+ // RHS is null, preserve the LHS
+ writer.values++;
+ index++;
+ out_it();
+ });
+ }
+ output->null_count = output->buffers[0] ? -1 : 0;
+ return Status::OK();
+ }
+};
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeScalarMinMax(std::string name,
+ const FunctionDoc* doc) {
+ static auto default_element_wise_aggregate_options =
+ ElementWiseAggregateOptions::Defaults();
+
+ auto func = std::make_shared<VarArgsCompareFunction>(
+ name, Arity::VarArgs(), doc, &default_element_wise_aggregate_options);
+ for (const auto& ty : NumericTypes()) {
+ auto exec = GeneratePhysicalNumeric<ScalarMinMax, Op>(ty);
+ ScalarKernel kernel{KernelSignature::Make({ty}, ty, /*is_varargs=*/true), exec,
+ MinMaxState::Init};
+ kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ for (const auto& ty : TemporalTypes()) {
+ auto exec = GeneratePhysicalNumeric<ScalarMinMax, Op>(ty);
+ ScalarKernel kernel{KernelSignature::Make({ty}, ty, /*is_varargs=*/true), exec,
+ MinMaxState::Init};
+ kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ return func;
+}
+
+const FunctionDoc equal_doc{"Compare values for equality (x == y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc not_equal_doc{"Compare values for inequality (x != y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc greater_doc{"Compare values for ordered inequality (x > y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc greater_equal_doc{
+ "Compare values for ordered inequality (x >= y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc less_doc{"Compare values for ordered inequality (x < y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc less_equal_doc{
+ "Compare values for ordered inequality (x <= y)",
+ ("A null on either side emits a null comparison result."),
+ {"x", "y"}};
+
+const FunctionDoc min_element_wise_doc{
+ "Find the element-wise minimum value",
+ ("Nulls will be ignored (default) or propagated. "
+ "NaN will be taken over null, but not over any valid float."),
+ {"*args"},
+ "ElementWiseAggregateOptions"};
+
+const FunctionDoc max_element_wise_doc{
+ "Find the element-wise maximum value",
+ ("Nulls will be ignored (default) or propagated. "
+ "NaN will be taken over null, but not over any valid float."),
+ {"*args"},
+ "ElementWiseAggregateOptions"};
} // namespace
void RegisterScalarComparison(FunctionRegistry* registry) {
- DCHECK_OK(registry->AddFunction(MakeCompareFunction<Equal>("equal", &equal_doc)));
- DCHECK_OK(
- registry->AddFunction(MakeCompareFunction<NotEqual>("not_equal", &not_equal_doc)));
+ DCHECK_OK(registry->AddFunction(MakeCompareFunction<Equal>("equal", &equal_doc)));
+ DCHECK_OK(
+ registry->AddFunction(MakeCompareFunction<NotEqual>("not_equal", &not_equal_doc)));
- auto greater = MakeCompareFunction<Greater>("greater", &greater_doc);
- auto greater_equal =
- MakeCompareFunction<GreaterEqual>("greater_equal", &greater_equal_doc);
+ auto greater = MakeCompareFunction<Greater>("greater", &greater_doc);
+ auto greater_equal =
+ MakeCompareFunction<GreaterEqual>("greater_equal", &greater_equal_doc);
- auto less = MakeFlippedFunction("less", *greater, &less_doc);
- auto less_equal = MakeFlippedFunction("less_equal", *greater_equal, &less_equal_doc);
+ auto less = MakeFlippedFunction("less", *greater, &less_doc);
+ auto less_equal = MakeFlippedFunction("less_equal", *greater_equal, &less_equal_doc);
DCHECK_OK(registry->AddFunction(std::move(less)));
DCHECK_OK(registry->AddFunction(std::move(less_equal)));
DCHECK_OK(registry->AddFunction(std::move(greater)));
DCHECK_OK(registry->AddFunction(std::move(greater_equal)));
-
- // ----------------------------------------------------------------------
- // Variadic element-wise functions
-
- auto min_element_wise =
- MakeScalarMinMax<Minimum>("min_element_wise", &min_element_wise_doc);
- DCHECK_OK(registry->AddFunction(std::move(min_element_wise)));
-
- auto max_element_wise =
- MakeScalarMinMax<Maximum>("max_element_wise", &max_element_wise_doc);
- DCHECK_OK(registry->AddFunction(std::move(max_element_wise)));
+
+ // ----------------------------------------------------------------------
+ // Variadic element-wise functions
+
+ auto min_element_wise =
+ MakeScalarMinMax<Minimum>("min_element_wise", &min_element_wise_doc);
+ DCHECK_OK(registry->AddFunction(std::move(min_element_wise)));
+
+ auto max_element_wise =
+ MakeScalarMinMax<Maximum>("max_element_wise", &max_element_wise_doc);
+ DCHECK_OK(registry->AddFunction(std::move(max_element_wise)));
}
} // namespace internal
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc
index d29c3984b7a..cf22b0de3dc 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_fill_null.cc
@@ -37,13 +37,13 @@ namespace {
template <typename Type, typename Enable = void>
struct FillNullFunctor {};
-// Numeric inputs
-
+// Numeric inputs
+
template <typename Type>
struct FillNullFunctor<Type, enable_if_t<is_number_type<Type>::value>> {
using T = typename TypeTraits<Type>::CType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const ArrayData& data = *batch[0].array();
const Scalar& fill_value = *batch[1].scalar();
ArrayData* output = out->mutable_array();
@@ -54,8 +54,8 @@ struct FillNullFunctor<Type, enable_if_t<is_number_type<Type>::value>> {
T value = UnboxScalar<Type>::Unbox(fill_value);
if (data.MayHaveNulls() != 0 && fill_value.is_valid) {
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
- ctx->Allocate(data.length * sizeof(T)));
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
+ ctx->Allocate(data.length * sizeof(T)));
const uint8_t* is_valid = data.buffers[0]->data();
const T* in_values = data.GetValues<T>(1);
@@ -80,28 +80,28 @@ struct FillNullFunctor<Type, enable_if_t<is_number_type<Type>::value>> {
in_values += block.length;
}
output->buffers[1] = out_buf;
- output->null_count = 0;
+ output->null_count = 0;
} else {
*output = data;
}
-
- return Status::OK();
+
+ return Status::OK();
}
};
-// Boolean input
-
+// Boolean input
+
template <typename Type>
struct FillNullFunctor<Type, enable_if_t<is_boolean_type<Type>::value>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const ArrayData& data = *batch[0].array();
const Scalar& fill_value = *batch[1].scalar();
ArrayData* output = out->mutable_array();
bool value = UnboxScalar<BooleanType>::Unbox(fill_value);
if (data.MayHaveNulls() != 0 && fill_value.is_valid) {
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
- ctx->AllocateBitmap(data.length));
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> out_buf,
+ ctx->AllocateBitmap(data.length));
const uint8_t* is_valid = data.buffers[0]->data();
const uint8_t* data_bitmap = data.buffers[1]->data();
@@ -132,68 +132,68 @@ struct FillNullFunctor<Type, enable_if_t<is_boolean_type<Type>::value>> {
out_offset += block.length;
}
output->buffers[1] = out_buf;
- output->null_count = 0;
+ output->null_count = 0;
} else {
*output = data;
}
-
- return Status::OK();
+
+ return Status::OK();
}
};
-// Null input
-
+// Null input
+
template <typename Type>
struct FillNullFunctor<Type, enable_if_t<is_null_type<Type>::value>> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// Nothing preallocated, so we assign into the output
*out->mutable_array() = *batch[0].array();
- return Status::OK();
+ return Status::OK();
+ }
+};
+
+// Binary-like input
+
+template <typename Type>
+struct FillNullFunctor<Type, enable_if_t<is_base_binary_type<Type>::value>> {
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ArrayData& input = *batch[0].array();
+ const auto& fill_value_scalar =
+ checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+ ArrayData* output = out->mutable_array();
+
+ // Ensure the kernel is configured properly to have no validity bitmap /
+ // null count 0 unless we explicitly propagate it below.
+ DCHECK(output->buffers[0] == nullptr);
+
+ const int64_t null_count = input.GetNullCount();
+
+ if (null_count > 0 && fill_value_scalar.is_valid) {
+ util::string_view fill_value(*fill_value_scalar.value);
+ BuilderType builder(input.type, ctx->memory_pool());
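+      // (added note) The ReserveData call below reserves the input data size
+      // plus one copy of the fill value per null slot, a conservative upper
+      // bound on the output data size.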
+ RETURN_NOT_OK(builder.ReserveData(input.buffers[2]->size() +
+ fill_value.length() * null_count));
+ RETURN_NOT_OK(builder.Resize(input.length));
+
+ VisitArrayDataInline<Type>(
+ input, [&](util::string_view s) { builder.UnsafeAppend(s); },
+ [&]() { builder.UnsafeAppend(fill_value); });
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ *output = *string_array->data();
+ // The builder does not match the logical type, due to
+ // GenerateTypeAgnosticVarBinaryBase
+ output->type = input.type;
+ } else {
+ *output = input;
+ }
+
+ return Status::OK();
}
};
-// Binary-like input
-
-template <typename Type>
-struct FillNullFunctor<Type, enable_if_t<is_base_binary_type<Type>::value>> {
- using BuilderType = typename TypeTraits<Type>::BuilderType;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const ArrayData& input = *batch[0].array();
- const auto& fill_value_scalar =
- checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
- ArrayData* output = out->mutable_array();
-
- // Ensure the kernel is configured properly to have no validity bitmap /
- // null count 0 unless we explicitly propagate it below.
- DCHECK(output->buffers[0] == nullptr);
-
- const int64_t null_count = input.GetNullCount();
-
- if (null_count > 0 && fill_value_scalar.is_valid) {
- util::string_view fill_value(*fill_value_scalar.value);
- BuilderType builder(input.type, ctx->memory_pool());
- RETURN_NOT_OK(builder.ReserveData(input.buffers[2]->size() +
- fill_value.length() * null_count));
- RETURN_NOT_OK(builder.Resize(input.length));
-
- VisitArrayDataInline<Type>(
- input, [&](util::string_view s) { builder.UnsafeAppend(s); },
- [&]() { builder.UnsafeAppend(fill_value); });
- std::shared_ptr<Array> string_array;
- RETURN_NOT_OK(builder.Finish(&string_array));
- *output = *string_array->data();
- // The builder does not match the logical type, due to
- // GenerateTypeAgnosticVarBinaryBase
- output->type = input.type;
- } else {
- *output = input;
- }
-
- return Status::OK();
- }
-};
-
void AddBasicFillNullKernels(ScalarKernel kernel, ScalarFunction* func) {
auto AddKernels = [&](const std::vector<std::shared_ptr<DataType>>& types) {
for (const std::shared_ptr<DataType>& ty : types) {
@@ -208,22 +208,22 @@ void AddBasicFillNullKernels(ScalarKernel kernel, ScalarFunction* func) {
AddKernels({boolean(), null()});
}
-void AddBinaryFillNullKernels(ScalarKernel kernel, ScalarFunction* func) {
- for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
- kernel.signature =
- KernelSignature::Make({InputType::Array(ty), InputType::Scalar(ty)}, ty);
- kernel.exec = GenerateTypeAgnosticVarBinaryBase<FillNullFunctor>(*ty);
- DCHECK_OK(func->AddKernel(kernel));
- }
-}
-
-const FunctionDoc fill_null_doc{
- "Replace null elements",
- ("`fill_value` must be a scalar of the same type as `values`.\n"
- "Each non-null value in `values` is emitted as-is.\n"
- "Each null value in `values` is replaced with `fill_value`."),
- {"values", "fill_value"}};
-
+void AddBinaryFillNullKernels(ScalarKernel kernel, ScalarFunction* func) {
+ for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
+ kernel.signature =
+ KernelSignature::Make({InputType::Array(ty), InputType::Scalar(ty)}, ty);
+ kernel.exec = GenerateTypeAgnosticVarBinaryBase<FillNullFunctor>(*ty);
+ DCHECK_OK(func->AddKernel(kernel));
+ }
+}
+
+const FunctionDoc fill_null_doc{
+ "Replace null elements",
+ ("`fill_value` must be a scalar of the same type as `values`.\n"
+ "Each non-null value in `values` is emitted as-is.\n"
+ "Each null value in `values` is replaced with `fill_value`."),
+ {"values", "fill_value"}};
+
} // namespace
void RegisterScalarFillNull(FunctionRegistry* registry) {
@@ -231,10 +231,10 @@ void RegisterScalarFillNull(FunctionRegistry* registry) {
ScalarKernel fill_null_base;
fill_null_base.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
fill_null_base.mem_allocation = MemAllocation::NO_PREALLOCATE;
- auto fill_null =
- std::make_shared<ScalarFunction>("fill_null", Arity::Binary(), &fill_null_doc);
+ auto fill_null =
+ std::make_shared<ScalarFunction>("fill_null", Arity::Binary(), &fill_null_doc);
AddBasicFillNullKernels(fill_null_base, fill_null.get());
- AddBinaryFillNullKernels(fill_null_base, fill_null.get());
+ AddBinaryFillNullKernels(fill_null_base, fill_null.get());
DCHECK_OK(registry->AddFunction(fill_null));
}
}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index 74fdc062930..ff308a673a3 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -1,1730 +1,1730 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <arrow/compute/api.h>
-#include <arrow/compute/kernels/codegen_internal.h>
-#include <arrow/compute/util_internal.h>
-#include <arrow/util/bit_block_counter.h>
-#include <arrow/util/bitmap.h>
-#include <arrow/util/bitmap_ops.h>
-#include <arrow/util/bitmap_reader.h>
-
-namespace arrow {
-using internal::BitBlockCount;
-using internal::BitBlockCounter;
-using internal::Bitmap;
-using internal::BitmapWordReader;
-
-namespace compute {
-namespace internal {
-
-namespace {
-
-constexpr uint64_t kAllNull = 0;
-constexpr uint64_t kAllValid = ~kAllNull;
-
-util::optional<uint64_t> GetConstantValidityWord(const Datum& data) {
- if (data.is_scalar()) {
- return data.scalar()->is_valid ? kAllValid : kAllNull;
- }
-
- if (data.array()->null_count == data.array()->length) return kAllNull;
-
- if (!data.array()->MayHaveNulls()) return kAllValid;
-
- // no constant validity word available
- return {};
-}
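-// (added note) A valid scalar or a null-free array maps to kAllValid (~0), an
-// all-null input maps to kAllNull (0); arrays with mixed validity return an
-// empty optional and are visited word by word below.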
-
-inline Bitmap GetBitmap(const Datum& datum, int i) {
- if (datum.is_scalar()) return {};
- const ArrayData& a = *datum.array();
- return Bitmap{a.buffers[i], a.offset, a.length};
-}
-
-// If the condition is null then the output is null; otherwise validity is taken
-// from the selected argument,
-// i.e. cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
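-// Worked example (added, illustrative): cond.valid = 1, cond.data = 1,
-// left.valid = 0, right.valid = 1 gives 1 & ((1 & 0) | (0 & 1)) = 0, i.e.
-// selecting a null left value yields a null output slot.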
-template <typename AllocateNullBitmap>
-Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum& left_d,
- const Datum& right_d, ArrayData* output) {
- auto cond_const = GetConstantValidityWord(cond_d);
- auto left_const = GetConstantValidityWord(left_d);
- auto right_const = GetConstantValidityWord(right_d);
-
- enum { COND_CONST = 1, LEFT_CONST = 2, RIGHT_CONST = 4 };
- auto flag = COND_CONST * cond_const.has_value() | LEFT_CONST * left_const.has_value() |
- RIGHT_CONST * right_const.has_value();
-
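-  // (added note) e.g. flag == (COND_CONST | RIGHT_CONST) means only the left
-  // validity varies per element, so the switch below visits just cond.data
-  // and left_valid.
-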
- const ArrayData& cond = *cond_d.array();
- // cond.data will always be available
- Bitmap cond_data{cond.buffers[1], cond.offset, cond.length};
- Bitmap cond_valid{cond.buffers[0], cond.offset, cond.length};
- Bitmap left_valid = GetBitmap(left_d, 0);
- Bitmap right_valid = GetBitmap(right_d, 0);
-
- // cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
-  // In the following cases, we don't need to allocate the out_valid bitmap
-
- // if cond & left & right all ones, then output is all valid.
-  // if the output validity buffer is already allocated (NullHandling::
-  // COMPUTED_PREALLOCATE) -> set all bits;
-  // else, set the output validity buffer to nullptr
- if (cond_const == kAllValid && left_const == kAllValid && right_const == kAllValid) {
- if (AllocateNullBitmap::value) { // NullHandling::COMPUTED_NO_PREALLOCATE
- output->buffers[0] = nullptr;
- } else { // NullHandling::COMPUTED_PREALLOCATE
- BitUtil::SetBitmap(output->buffers[0]->mutable_data(), output->offset,
- output->length);
- }
- return Status::OK();
- }
-
- if (left_const == kAllValid && right_const == kAllValid) {
- // if both left and right are valid, no need to calculate out_valid bitmap. Copy
- // cond validity buffer
- if (AllocateNullBitmap::value) { // NullHandling::COMPUTED_NO_PREALLOCATE
- // if there's an offset, copy bitmap (cannot slice a bitmap)
- if (cond.offset) {
- ARROW_ASSIGN_OR_RAISE(
- output->buffers[0],
- arrow::internal::CopyBitmap(ctx->memory_pool(), cond.buffers[0]->data(),
- cond.offset, cond.length));
- } else { // just copy assign cond validity buffer
- output->buffers[0] = cond.buffers[0];
- }
- } else { // NullHandling::COMPUTED_PREALLOCATE
- arrow::internal::CopyBitmap(cond.buffers[0]->data(), cond.offset, cond.length,
- output->buffers[0]->mutable_data(), output->offset);
- }
- return Status::OK();
- }
-
- // lambda function that will be used inside the visitor
- auto apply = [&](uint64_t c_valid, uint64_t c_data, uint64_t l_valid,
- uint64_t r_valid) {
- return c_valid & ((c_data & l_valid) | (~c_data & r_valid));
- };
-
- if (AllocateNullBitmap::value) {
-    // The following cases require a separate out_valid buffer; COMPUTED_NO_PREALLOCATE
-    // would not have allocated a buffer for it.
- ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(cond.length));
- }
-
- std::array<Bitmap, 1> out_bitmaps{
- Bitmap{output->buffers[0], output->offset, output->length}};
-
- switch (flag) {
- case COND_CONST | LEFT_CONST | RIGHT_CONST: {
- std::array<Bitmap, 1> bitmaps{cond_data};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 1>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(*cond_const, words_in[0],
- *left_const, *right_const);
- });
- break;
- }
- case LEFT_CONST | RIGHT_CONST: {
- std::array<Bitmap, 2> bitmaps{cond_valid, cond_data};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 2>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(words_in[0], words_in[1],
- *left_const, *right_const);
- });
- break;
- }
- case COND_CONST | RIGHT_CONST: {
- // bitmaps[C_VALID], bitmaps[R_VALID] might be null; override to make it safe for
- // Visit()
- std::array<Bitmap, 2> bitmaps{cond_data, left_valid};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 2>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(*cond_const, words_in[0],
- words_in[1], *right_const);
- });
- break;
- }
- case RIGHT_CONST: {
- // bitmaps[R_VALID] might be null; override to make it safe for Visit()
- std::array<Bitmap, 3> bitmaps{cond_valid, cond_data, left_valid};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 3>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(words_in[0], words_in[1],
- words_in[2], *right_const);
- });
- break;
- }
- case COND_CONST | LEFT_CONST: {
- // bitmaps[C_VALID], bitmaps[L_VALID] might be null; override to make it safe for
- // Visit()
- std::array<Bitmap, 2> bitmaps{cond_data, right_valid};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 2>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(*cond_const, words_in[0],
- *left_const, words_in[1]);
- });
- break;
- }
- case LEFT_CONST: {
- // bitmaps[L_VALID] might be null; override to make it safe for Visit()
- std::array<Bitmap, 3> bitmaps{cond_valid, cond_data, right_valid};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 3>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(words_in[0], words_in[1],
- *left_const, words_in[2]);
- });
- break;
- }
- case COND_CONST: {
- // bitmaps[C_VALID] might be null; override to make it safe for Visit()
- std::array<Bitmap, 3> bitmaps{cond_data, left_valid, right_valid};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 3>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(*cond_const, words_in[0],
- words_in[1], words_in[2]);
- });
- break;
- }
- case 0: {
- std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, left_valid, right_valid};
- Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
- [&](const std::array<uint64_t, 4>& words_in,
- std::array<uint64_t, 1>* word_out) {
- word_out->at(0) = apply(words_in[0], words_in[1],
- words_in[2], words_in[3]);
- });
- break;
- }
- }
- return Status::OK();
-}
-
-using Word = uint64_t;
-static constexpr int64_t word_len = sizeof(Word) * 8;
-
-/// Runs the main if_else loop. Here, it is expected that the right data has already
-/// been copied to the output.
-/// `invert` is meant to invert the cond.data. If it is set to `true`, then the
-/// buffer will be inverted before calling the handle_block or handle_each functions.
-/// This is useful when left is an array and right is a scalar. Then, rather than
-/// copying data from the right to the output, we can copy the left data to the output
-/// and invert the cond data to fill in the right values. Filling with a scalar is
-/// presumed to be more efficient than filling with an array.
-///
-/// `HandleBlock` has the signature:
-/// [](int64_t offset, int64_t length){...}
-/// It should copy `length` elements from the source array to the output array,
-/// starting at `offset` in both arrays.
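-/// For a fixed-width type T the handler is typically a contiguous copy, e.g.
-/// (added, illustrative):
-///   [&](int64_t offset, int64_t length) {
-///     std::memcpy(out_values + offset, left_data + offset, length * sizeof(T));
-///   }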
-template <typename HandleBlock, bool invert = false>
-void RunIfElseLoop(const ArrayData& cond, const HandleBlock& handle_block) {
- int64_t data_offset = 0;
- int64_t bit_offset = cond.offset;
- const auto* cond_data = cond.buffers[1]->data(); // this is a BoolArray
-
- BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
-
- constexpr Word pickAll = invert ? 0 : UINT64_MAX;
- constexpr Word pickNone = ~pickAll;
-
- int64_t cnt = cond_reader.words();
- while (cnt--) {
- Word word = cond_reader.NextWord();
-
- if (word == pickAll) {
- handle_block(data_offset, word_len);
- } else if (word != pickNone) {
- for (int64_t i = 0; i < word_len; ++i) {
- if (BitUtil::GetBit(cond_data, bit_offset + i) != invert) {
- handle_block(data_offset + i, 1);
- }
- }
- }
- data_offset += word_len;
- bit_offset += word_len;
- }
-
- constexpr uint8_t pickAllByte = invert ? 0 : UINT8_MAX;
-  // bit-wise inversion of a byte promotes to int, hence the XOR with 0xff
- constexpr uint8_t pickNoneByte = pickAllByte ^ 0xff;
-
- cnt = cond_reader.trailing_bytes();
- while (cnt--) {
- int valid_bits;
- uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
-
- if (byte == pickAllByte && valid_bits == 8) {
- handle_block(data_offset, 8);
- } else if (byte != pickNoneByte) {
- for (int i = 0; i < valid_bits; ++i) {
- if (BitUtil::GetBit(cond_data, bit_offset + i) != invert) {
- handle_block(data_offset + i, 1);
- }
- }
- }
- data_offset += 8;
- bit_offset += 8;
- }
-}
-
-template <typename HandleBlock>
-void RunIfElseLoopInverted(const ArrayData& cond, const HandleBlock& handle_block) {
- RunIfElseLoop<HandleBlock, true>(cond, handle_block);
-}
-
-/// Runs if-else when cond is a scalar. Two special functions are required:
-/// 1. CopyArrayData, 2. BroadcastScalar
-template <typename CopyArrayData, typename BroadcastScalar>
-Status RunIfElseScalar(const BooleanScalar& cond, const Datum& left, const Datum& right,
- Datum* out, const CopyArrayData& copy_array_data,
- const BroadcastScalar& broadcast_scalar) {
- if (left.is_scalar() && right.is_scalar()) { // output will be a scalar
- if (cond.is_valid) {
- *out = cond.value ? left.scalar() : right.scalar();
- } else {
- *out = MakeNullScalar(left.type());
- }
- return Status::OK();
- }
-
-  // either left or right is an array. Output is always an array
- const std::shared_ptr<ArrayData>& out_array = out->array();
- if (!cond.is_valid) {
- // cond is null; output is all null --> clear validity buffer
- BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
- out_array->length);
- return Status::OK();
- }
-
- // cond is a non-null scalar
- const auto& valid_data = cond.value ? left : right;
- if (valid_data.is_array()) {
- // valid_data is an array. Hence copy data to the output buffers
- const auto& valid_array = valid_data.array();
- if (valid_array->MayHaveNulls()) {
- arrow::internal::CopyBitmap(
- valid_array->buffers[0]->data(), valid_array->offset, valid_array->length,
- out_array->buffers[0]->mutable_data(), out_array->offset);
- } else { // validity buffer is nullptr --> set all bits
- BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
- out_array->length);
- }
- copy_array_data(*valid_array, out_array.get());
- return Status::OK();
-
- } else { // valid data is scalar
- // valid data is a scalar that needs to be broadcasted
- const auto& valid_scalar = *valid_data.scalar();
- if (valid_scalar.is_valid) { // if the scalar is non-null, broadcast
- BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
- out_array->length);
- broadcast_scalar(*valid_data.scalar(), out_array.get());
- } else { // scalar is null, clear the output validity buffer
- BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
- out_array->length);
- }
- return Status::OK();
- }
-}
-
-template <typename Type, typename Enable = void>
-struct IfElseFunctor {};
-
-// Only number types need to be handled among the fixed-size primitive data types,
-// because internal::GenerateTypeAgnosticPrimitive forwards types to the corresponding
-// unsigned int type
-template <typename Type>
-struct IfElseFunctor<Type, enable_if_number<Type>> {
- using T = typename TypeTraits<Type>::CType;
- // A - Array, S - Scalar, X = Array/Scalar
-
- // SXX
- static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
- const Datum& right, Datum* out) {
- return RunIfElseScalar(
- cond, left, right, out,
- /*CopyArrayData*/
- [&](const ArrayData& valid_array, ArrayData* out_array) {
- std::memcpy(out_array->GetMutableValues<T>(1), valid_array.GetValues<T>(1),
- valid_array.length * sizeof(T));
- },
- /*BroadcastScalar*/
- [&](const Scalar& scalar, ArrayData* out_array) {
- T scalar_data = internal::UnboxScalar<Type>::Unbox(scalar);
- std::fill(out_array->GetMutableValues<T>(1),
- out_array->GetMutableValues<T>(1) + out_array->length, scalar_data);
- });
- }
-
- // AAA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const ArrayData& right, ArrayData* out) {
- T* out_values = out->template GetMutableValues<T>(1);
-
- // copy right data to out_buff
- const T* right_data = right.GetValues<T>(1);
- std::memcpy(out_values, right_data, right.length * sizeof(T));
-
- // selectively copy values from left data
- const T* left_data = left.GetValues<T>(1);
-
- RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
- std::memcpy(out_values + data_offset, left_data + data_offset,
- num_elems * sizeof(T));
- });
-
- return Status::OK();
- }
-
- // ASA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const ArrayData& right, ArrayData* out) {
- T* out_values = out->template GetMutableValues<T>(1);
-
- // copy right data to out_buff
- const T* right_data = right.GetValues<T>(1);
- std::memcpy(out_values, right_data, right.length * sizeof(T));
-
- // selectively copy values from left data
- T left_data = internal::UnboxScalar<Type>::Unbox(left);
-
- RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
- std::fill(out_values + data_offset, out_values + data_offset + num_elems,
- left_data);
- });
-
- return Status::OK();
- }
-
- // AAS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const Scalar& right, ArrayData* out) {
- T* out_values = out->template GetMutableValues<T>(1);
-
- // copy left data to out_buff
- const T* left_data = left.GetValues<T>(1);
- std::memcpy(out_values, left_data, left.length * sizeof(T));
-
- T right_data = internal::UnboxScalar<Type>::Unbox(right);
-
- RunIfElseLoopInverted(cond, [&](int64_t data_offset, int64_t num_elems) {
- std::fill(out_values + data_offset, out_values + data_offset + num_elems,
- right_data);
- });
-
- return Status::OK();
- }
-
- // ASS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const Scalar& right, ArrayData* out) {
- T* out_values = out->template GetMutableValues<T>(1);
-
- // copy right data to out_buff
- T right_data = internal::UnboxScalar<Type>::Unbox(right);
- std::fill(out_values, out_values + cond.length, right_data);
-
- // selectively copy values from left data
- T left_data = internal::UnboxScalar<Type>::Unbox(left);
- RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
- std::fill(out_values + data_offset, out_values + data_offset + num_elems,
- left_data);
- });
-
- return Status::OK();
- }
-};
-
-template <typename Type>
-struct IfElseFunctor<Type, enable_if_boolean<Type>> {
- // A - Array, S - Scalar, X = Array/Scalar
-
- // SXX
- static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
- const Datum& right, Datum* out) {
- return RunIfElseScalar(
- cond, left, right, out,
- /*CopyArrayData*/
- [&](const ArrayData& valid_array, ArrayData* out_array) {
- arrow::internal::CopyBitmap(
- valid_array.buffers[1]->data(), valid_array.offset, valid_array.length,
- out_array->buffers[1]->mutable_data(), out_array->offset);
- },
- /*BroadcastScalar*/
- [&](const Scalar& scalar, ArrayData* out_array) {
- bool scalar_data = internal::UnboxScalar<Type>::Unbox(scalar);
- BitUtil::SetBitsTo(out_array->buffers[1]->mutable_data(), out_array->offset,
- out_array->length, scalar_data);
- });
- }
-
- // AAA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const ArrayData& right, ArrayData* out) {
- // out_buff = right & ~cond
- const auto& out_buf = out->buffers[1];
- arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
- cond.buffers[1]->data(), cond.offset, cond.length,
- out->offset, out_buf->mutable_data());
-
- // out_buff = left & cond
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> temp_buf,
- arrow::internal::BitmapAnd(
- ctx->memory_pool(), left.buffers[1]->data(), left.offset,
- cond.buffers[1]->data(), cond.offset, cond.length, 0));
-
- arrow::internal::BitmapOr(out_buf->data(), out->offset, temp_buf->data(), 0,
- cond.length, out->offset, out_buf->mutable_data());
-
- return Status::OK();
- }
-
- // ASA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const ArrayData& right, ArrayData* out) {
- // out_buff = right & ~cond
- const auto& out_buf = out->buffers[1];
- arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
- cond.buffers[1]->data(), cond.offset, cond.length,
- out->offset, out_buf->mutable_data());
-
- // out_buff = left & cond
- bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
- if (left_data) {
- arrow::internal::BitmapOr(out_buf->data(), out->offset, cond.buffers[1]->data(),
- cond.offset, cond.length, out->offset,
- out_buf->mutable_data());
- }
-
- return Status::OK();
- }
-
- // AAS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const Scalar& right, ArrayData* out) {
- // out_buff = left & cond
- const auto& out_buf = out->buffers[1];
- arrow::internal::BitmapAnd(left.buffers[1]->data(), left.offset,
- cond.buffers[1]->data(), cond.offset, cond.length,
- out->offset, out_buf->mutable_data());
-
- bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);
-
- // out_buff = left & cond | right & ~cond
- if (right_data) {
- arrow::internal::BitmapOrNot(out_buf->data(), out->offset, cond.buffers[1]->data(),
- cond.offset, cond.length, out->offset,
- out_buf->mutable_data());
- }
-
- return Status::OK();
- }
-
- // ASS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const Scalar& right, ArrayData* out) {
- bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
- bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);
-
- const auto& out_buf = out->buffers[1];
-
- // out_buf = left & cond | right & ~cond
- if (left_data) {
- if (right_data) {
- // out_buf = ones
- BitUtil::SetBitmap(out_buf->mutable_data(), out->offset, cond.length);
- } else {
- // out_buf = cond
- arrow::internal::CopyBitmap(cond.buffers[1]->data(), cond.offset, cond.length,
- out_buf->mutable_data(), out->offset);
- }
- } else {
- if (right_data) {
- // out_buf = ~cond
- arrow::internal::InvertBitmap(cond.buffers[1]->data(), cond.offset, cond.length,
- out_buf->mutable_data(), out->offset);
- } else {
- // out_buf = zeros
- BitUtil::ClearBitmap(out_buf->mutable_data(), out->offset, cond.length);
- }
- }
-
- return Status::OK();
- }
-};
-
-template <typename Type>
-struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
- using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- using BuilderType = typename TypeTraits<Type>::BuilderType;
-
- // A - Array, S - Scalar, X = Array/Scalar
-
- // SXX
- static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
- const Datum& right, Datum* out) {
- if (left.is_scalar() && right.is_scalar()) {
- if (cond.is_valid) {
- *out = cond.value ? left.scalar() : right.scalar();
- } else {
- *out = MakeNullScalar(left.type());
- }
- return Status::OK();
- }
- // either left or right is an array. Output is always an array
- int64_t out_arr_len = std::max(left.length(), right.length());
- if (!cond.is_valid) {
- // cond is null; just create a null array
-      ARROW_ASSIGN_OR_RAISE(*out,
-                            MakeArrayOfNull(left.type(), out_arr_len, ctx->memory_pool()));
- return Status::OK();
- }
-
- const auto& valid_data = cond.value ? left : right;
- if (valid_data.is_array()) {
- *out = valid_data;
- } else {
- // valid data is a scalar that needs to be broadcasted
- ARROW_ASSIGN_OR_RAISE(*out, MakeArrayFromScalar(*valid_data.scalar(), out_arr_len,
- ctx->memory_pool()));
- }
- return Status::OK();
- }
-
- // AAA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const ArrayData& right, ArrayData* out) {
- const auto* left_offsets = left.GetValues<OffsetType>(1);
- const uint8_t* left_data = left.buffers[2]->data();
- const auto* right_offsets = right.GetValues<OffsetType>(1);
- const uint8_t* right_data = right.buffers[2]->data();
-
- // allocate data buffer conservatively
- int64_t data_buff_alloc = left_offsets[left.length] - left_offsets[0] +
- right_offsets[right.length] - right_offsets[0];
-
- BuilderType builder(ctx->memory_pool());
- ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
- ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
-
- RunLoop(
- cond, *out,
- [&](int64_t i) {
- builder.UnsafeAppend(left_data + left_offsets[i],
- left_offsets[i + 1] - left_offsets[i]);
- },
- [&](int64_t i) {
- builder.UnsafeAppend(right_data + right_offsets[i],
- right_offsets[i + 1] - right_offsets[i]);
- },
- [&]() { builder.UnsafeAppendNull(); });
- ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
-
- out->SetNullCount(out_arr->data()->null_count);
- out->buffers[0] = std::move(out_arr->data()->buffers[0]);
- out->buffers[1] = std::move(out_arr->data()->buffers[1]);
- out->buffers[2] = std::move(out_arr->data()->buffers[2]);
- return Status::OK();
- }
-
- // ASA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const ArrayData& right, ArrayData* out) {
- util::string_view left_data = internal::UnboxScalar<Type>::Unbox(left);
- auto left_size = static_cast<OffsetType>(left_data.size());
-
- const auto* right_offsets = right.GetValues<OffsetType>(1);
- const uint8_t* right_data = right.buffers[2]->data();
-
- // allocate data buffer conservatively
- int64_t data_buff_alloc =
- left_size * cond.length + right_offsets[right.length] - right_offsets[0];
-
- BuilderType builder(ctx->memory_pool());
- ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
- ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
-
- RunLoop(
- cond, *out, [&](int64_t i) { builder.UnsafeAppend(left_data.data(), left_size); },
- [&](int64_t i) {
- builder.UnsafeAppend(right_data + right_offsets[i],
- right_offsets[i + 1] - right_offsets[i]);
- },
- [&]() { builder.UnsafeAppendNull(); });
- ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
-
- out->SetNullCount(out_arr->data()->null_count);
- out->buffers[0] = std::move(out_arr->data()->buffers[0]);
- out->buffers[1] = std::move(out_arr->data()->buffers[1]);
- out->buffers[2] = std::move(out_arr->data()->buffers[2]);
- return Status::OK();
- }
-
- // AAS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const Scalar& right, ArrayData* out) {
- const auto* left_offsets = left.GetValues<OffsetType>(1);
- const uint8_t* left_data = left.buffers[2]->data();
-
- util::string_view right_data = internal::UnboxScalar<Type>::Unbox(right);
- auto right_size = static_cast<OffsetType>(right_data.size());
-
- // allocate data buffer conservatively
- int64_t data_buff_alloc =
- right_size * cond.length + left_offsets[left.length] - left_offsets[0];
-
- BuilderType builder(ctx->memory_pool());
- ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
- ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
-
- RunLoop(
- cond, *out,
- [&](int64_t i) {
- builder.UnsafeAppend(left_data + left_offsets[i],
- left_offsets[i + 1] - left_offsets[i]);
- },
- [&](int64_t i) { builder.UnsafeAppend(right_data.data(), right_size); },
- [&]() { builder.UnsafeAppendNull(); });
- ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
-
- out->SetNullCount(out_arr->data()->null_count);
- out->buffers[0] = std::move(out_arr->data()->buffers[0]);
- out->buffers[1] = std::move(out_arr->data()->buffers[1]);
- out->buffers[2] = std::move(out_arr->data()->buffers[2]);
- return Status::OK();
- }
-
- // ASS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const Scalar& right, ArrayData* out) {
- util::string_view left_data = internal::UnboxScalar<Type>::Unbox(left);
- auto left_size = static_cast<OffsetType>(left_data.size());
-
- util::string_view right_data = internal::UnboxScalar<Type>::Unbox(right);
- auto right_size = static_cast<OffsetType>(right_data.size());
-
- // allocate data buffer conservatively
- int64_t data_buff_alloc = std::max(right_size, left_size) * cond.length;
- BuilderType builder(ctx->memory_pool());
- ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
- ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
-
- RunLoop(
- cond, *out, [&](int64_t i) { builder.UnsafeAppend(left_data.data(), left_size); },
- [&](int64_t i) { builder.UnsafeAppend(right_data.data(), right_size); },
- [&]() { builder.UnsafeAppendNull(); });
- ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
-
- out->SetNullCount(out_arr->data()->null_count);
- out->buffers[0] = std::move(out_arr->data()->buffers[0]);
- out->buffers[1] = std::move(out_arr->data()->buffers[1]);
- out->buffers[2] = std::move(out_arr->data()->buffers[2]);
- return Status::OK();
- }
-
- template <typename HandleLeft, typename HandleRight, typename HandleNull>
- static void RunLoop(const ArrayData& cond, const ArrayData& output,
- HandleLeft&& handle_left, HandleRight&& handle_right,
- HandleNull&& handle_null) {
- const auto* cond_data = cond.buffers[1]->data();
-
- if (output.buffers[0]) { // output may have nulls
-      // The output validity buffer is allocated internally by the IfElseFunctor, so
-      // it has length cond.length and offset 0.
- const auto* out_valid = output.buffers[0]->data();
-
- for (int64_t i = 0; i < cond.length; i++) {
- if (BitUtil::GetBit(out_valid, i)) {
- BitUtil::GetBit(cond_data, cond.offset + i) ? handle_left(i) : handle_right(i);
- } else {
- handle_null();
- }
- }
- } else { // output is all valid (no nulls)
- for (int64_t i = 0; i < cond.length; i++) {
- BitUtil::GetBit(cond_data, cond.offset + i) ? handle_left(i) : handle_right(i);
- }
- }
- }
-};
-
-template <typename Type>
-struct IfElseFunctor<Type, enable_if_fixed_size_binary<Type>> {
- // A - Array, S - Scalar, X = Array/Scalar
-
- // SXX
- static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
- const Datum& right, Datum* out) {
- ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type(), *right.type()));
- return RunIfElseScalar(
- cond, left, right, out,
- /*CopyArrayData*/
- [&](const ArrayData& valid_array, ArrayData* out_array) {
- std::memcpy(
- out_array->buffers[1]->mutable_data() + out_array->offset * byte_width,
- valid_array.buffers[1]->data() + valid_array.offset * byte_width,
- valid_array.length * byte_width);
- },
- /*BroadcastScalar*/
- [&](const Scalar& scalar, ArrayData* out_array) {
- const util::string_view& scalar_data =
- internal::UnboxScalar<FixedSizeBinaryType>::Unbox(scalar);
- uint8_t* start =
- out_array->buffers[1]->mutable_data() + out_array->offset * byte_width;
- for (int64_t i = 0; i < out_array->length; i++) {
- std::memcpy(start + i * byte_width, scalar_data.data(), scalar_data.size());
- }
- });
- }
-
- // AAA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const ArrayData& right, ArrayData* out) {
- ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
- auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
-
- // copy right data to out_buff
- const uint8_t* right_data = right.buffers[1]->data() + right.offset * byte_width;
- std::memcpy(out_values, right_data, right.length * byte_width);
-
- // selectively copy values from left data
- const uint8_t* left_data = left.buffers[1]->data() + left.offset * byte_width;
-
- RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
- std::memcpy(out_values + data_offset * byte_width,
- left_data + data_offset * byte_width, num_elems * byte_width);
- });
-
- return Status::OK();
- }
-
- // ASA
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const ArrayData& right, ArrayData* out) {
- ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
- auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
-
- // copy right data to out_buff
- const uint8_t* right_data = right.buffers[1]->data() + right.offset * byte_width;
- std::memcpy(out_values, right_data, right.length * byte_width);
-
- // selectively copy values from left data
- const util::string_view& left_data =
- internal::UnboxScalar<FixedSizeBinaryType>::Unbox(left);
-
- RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
- if (left_data.data()) {
- for (int64_t i = 0; i < num_elems; i++) {
- std::memcpy(out_values + (data_offset + i) * byte_width, left_data.data(),
- left_data.size());
- }
- }
- });
-
- return Status::OK();
- }
-
- // AAS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
- const Scalar& right, ArrayData* out) {
- ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
- auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
-
- // copy left data to out_buff
- const uint8_t* left_data = left.buffers[1]->data() + left.offset * byte_width;
- std::memcpy(out_values, left_data, left.length * byte_width);
-
- const util::string_view& right_data =
- internal::UnboxScalar<FixedSizeBinaryType>::Unbox(right);
-
- RunIfElseLoopInverted(cond, [&](int64_t data_offset, int64_t num_elems) {
- if (right_data.data()) {
- for (int64_t i = 0; i < num_elems; i++) {
- std::memcpy(out_values + (data_offset + i) * byte_width, right_data.data(),
- right_data.size());
- }
- }
- });
-
- return Status::OK();
- }
-
- // ASS
- static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
- const Scalar& right, ArrayData* out) {
- ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
- auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
-
- // copy right data to out_buff
- const util::string_view& right_data =
- internal::UnboxScalar<FixedSizeBinaryType>::Unbox(right);
- if (right_data.data()) {
- for (int64_t i = 0; i < cond.length; i++) {
- std::memcpy(out_values + i * byte_width, right_data.data(), right_data.size());
- }
- }
-
- // selectively copy values from left data
- const util::string_view& left_data =
- internal::UnboxScalar<FixedSizeBinaryType>::Unbox(left);
-
- RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
- if (left_data.data()) {
- for (int64_t i = 0; i < num_elems; i++) {
- std::memcpy(out_values + (data_offset + i) * byte_width, left_data.data(),
- left_data.size());
- }
- }
- });
-
- return Status::OK();
- }
-
- static Result<int32_t> GetByteWidth(const DataType& left_type,
- const DataType& right_type) {
- int width = checked_cast<const FixedSizeBinaryType&>(left_type).byte_width();
- if (width == checked_cast<const FixedSizeBinaryType&>(right_type).byte_width()) {
- return width;
- } else {
- return Status::Invalid("FixedSizeBinaryType byte_widths should be equal");
- }
- }
-};
-
-template <typename Type, typename AllocateMem>
-struct ResolveIfElseExec {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // cond is scalar
- if (batch[0].is_scalar()) {
- const auto& cond = batch[0].scalar_as<BooleanScalar>();
- return IfElseFunctor<Type>::Call(ctx, cond, batch[1], batch[2], out);
- }
-
- // cond is array. Use functors to sort things out
- ARROW_RETURN_NOT_OK(PromoteNullsVisitor<AllocateMem>(ctx, batch[0], batch[1],
- batch[2], out->mutable_array()));
-
- if (batch[1].kind() == Datum::ARRAY) {
- if (batch[2].kind() == Datum::ARRAY) { // AAA
- return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].array(),
- *batch[2].array(), out->mutable_array());
- } else { // AAS
- return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].array(),
- *batch[2].scalar(), out->mutable_array());
- }
- } else {
- if (batch[2].kind() == Datum::ARRAY) { // ASA
- return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].scalar(),
- *batch[2].array(), out->mutable_array());
- } else { // ASS
- return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].scalar(),
- *batch[2].scalar(), out->mutable_array());
- }
- }
- }
-};
-
-template <typename AllocateMem>
-struct ResolveIfElseExec<NullType, AllocateMem> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // if all are scalars, return a null scalar
- if (batch[0].is_scalar() && batch[1].is_scalar() && batch[2].is_scalar()) {
- *out = MakeNullScalar(null());
- } else {
- ARROW_ASSIGN_OR_RAISE(*out,
- MakeArrayOfNull(null(), batch.length, ctx->memory_pool()));
- }
- return Status::OK();
- }
-};
-
-struct IfElseFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- RETURN_NOT_OK(CheckArity(*values));
-
- using arrow::compute::detail::DispatchExactImpl;
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
-
- // if 0th descriptor is null, replace with bool
- if (values->at(0).type->id() == Type::NA) {
- values->at(0).type = boolean();
- }
-
-    // if_else's 0th descriptor is bool, so skip it
- std::vector<ValueDescr> values_copy(values->begin() + 1, values->end());
- internal::EnsureDictionaryDecoded(&values_copy);
- internal::ReplaceNullWithOtherType(&values_copy);
-
- if (auto type = internal::CommonNumeric(values_copy)) {
- internal::ReplaceTypes(type, &values_copy);
- }
-
- std::move(values_copy.begin(), values_copy.end(), values->begin() + 1);
-
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
-
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-};
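-
-// (added note) DispatchBest above allows, e.g., if_else(cond, int32, float64)
-// to resolve by promoting both value arguments to float64 via CommonNumeric
-// before retrying the exact-kernel lookup.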
-
-void AddNullIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_function) {
- ScalarKernel kernel({boolean(), null(), null()}, null(),
- ResolveIfElseExec<NullType,
- /*AllocateMem=*/std::true_type>::Exec);
- kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
- kernel.can_write_into_slices = false;
-
- DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
-}
-
-void AddPrimitiveIfElseKernels(const std::shared_ptr<ScalarFunction>& scalar_function,
- const std::vector<std::shared_ptr<DataType>>& types) {
- for (auto&& type : types) {
- auto exec =
- internal::GenerateTypeAgnosticPrimitive<ResolveIfElseExec,
- /*AllocateMem=*/std::false_type>(*type);
- // cond array needs to be boolean always
- ScalarKernel kernel({boolean(), type, type}, type, exec);
- kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::PREALLOCATE;
- kernel.can_write_into_slices = true;
-
- DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
- }
-}
-
-void AddBinaryIfElseKernels(const std::shared_ptr<IfElseFunction>& scalar_function,
- const std::vector<std::shared_ptr<DataType>>& types) {
- for (auto&& type : types) {
- auto exec =
- internal::GenerateTypeAgnosticVarBinaryBase<ResolveIfElseExec,
- /*AllocateMem=*/std::true_type>(
- *type);
- // cond array needs to be boolean always
- ScalarKernel kernel({boolean(), type, type}, type, exec);
- kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
- kernel.can_write_into_slices = false;
-
- DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
- }
-}
-
-void AddFSBinaryIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_function) {
- // cond array needs to be boolean always
- ScalarKernel kernel(
- {boolean(), InputType(Type::FIXED_SIZE_BINARY), InputType(Type::FIXED_SIZE_BINARY)},
- OutputType([](KernelContext*, const std::vector<ValueDescr>& descrs) {
- return ValueDescr(descrs[1].type, ValueDescr::ANY);
- }),
- ResolveIfElseExec<FixedSizeBinaryType, /*AllocateMem=*/std::false_type>::Exec);
- kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::PREALLOCATE;
- kernel.can_write_into_slices = true;
-
- DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
-}
-
-// Helper to copy or broadcast fixed-width values between buffers.
-template <typename Type, typename Enable = void>
-struct CopyFixedWidth {};
-template <>
-struct CopyFixedWidth<BooleanType> {
- static void CopyScalar(const Scalar& scalar, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- const bool value = UnboxScalar<BooleanType>::Unbox(scalar);
- BitUtil::SetBitsTo(raw_out_values, out_offset, length, value);
- }
- static void CopyArray(const DataType&, const uint8_t* in_values,
- const int64_t in_offset, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- arrow::internal::CopyBitmap(in_values, in_offset, length, raw_out_values, out_offset);
- }
-};
-template <typename Type>
-struct CopyFixedWidth<Type, enable_if_number<Type>> {
- using CType = typename TypeTraits<Type>::CType;
- static void CopyScalar(const Scalar& scalar, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- CType* out_values = reinterpret_cast<CType*>(raw_out_values);
- const CType value = UnboxScalar<Type>::Unbox(scalar);
- std::fill(out_values + out_offset, out_values + out_offset + length, value);
- }
- static void CopyArray(const DataType&, const uint8_t* in_values,
- const int64_t in_offset, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- std::memcpy(raw_out_values + out_offset * sizeof(CType),
- in_values + in_offset * sizeof(CType), length * sizeof(CType));
- }
-};
-template <typename Type>
-struct CopyFixedWidth<Type, enable_if_same<Type, FixedSizeBinaryType>> {
- static void CopyScalar(const Scalar& values, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- const int32_t width =
- checked_cast<const FixedSizeBinaryType&>(*values.type).byte_width();
- uint8_t* next = raw_out_values + (width * out_offset);
- const auto& scalar = checked_cast<const FixedSizeBinaryScalar&>(values);
-    // The scalar may have a null value buffer
- if (!scalar.value) {
- std::memset(next, 0x00, width * length);
- } else {
- DCHECK_EQ(scalar.value->size(), width);
- for (int i = 0; i < length; i++) {
- std::memcpy(next, scalar.value->data(), width);
- next += width;
- }
- }
- }
- static void CopyArray(const DataType& type, const uint8_t* in_values,
- const int64_t in_offset, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- const int32_t width = checked_cast<const FixedSizeBinaryType&>(type).byte_width();
- uint8_t* next = raw_out_values + (width * out_offset);
- std::memcpy(next, in_values + in_offset * width, length * width);
- }
-};
-template <typename Type>
-struct CopyFixedWidth<Type, enable_if_decimal<Type>> {
- using ScalarType = typename TypeTraits<Type>::ScalarType;
- static void CopyScalar(const Scalar& values, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- const int32_t width =
- checked_cast<const FixedSizeBinaryType&>(*values.type).byte_width();
- uint8_t* next = raw_out_values + (width * out_offset);
- const auto& scalar = checked_cast<const ScalarType&>(values);
- const auto value = scalar.value.ToBytes();
- for (int i = 0; i < length; i++) {
- std::memcpy(next, value.data(), width);
- next += width;
- }
- }
- static void CopyArray(const DataType& type, const uint8_t* in_values,
- const int64_t in_offset, const int64_t length,
- uint8_t* raw_out_values, const int64_t out_offset) {
- const int32_t width = checked_cast<const FixedSizeBinaryType&>(type).byte_width();
- uint8_t* next = raw_out_values + (width * out_offset);
- std::memcpy(next, in_values + in_offset * width, length * width);
- }
-};
-// Copy fixed-width values from a scalar/array datum into an output values buffer
-template <typename Type>
-void CopyValues(const Datum& in_values, const int64_t in_offset, const int64_t length,
- uint8_t* out_valid, uint8_t* out_values, const int64_t out_offset) {
- if (in_values.is_scalar()) {
- const auto& scalar = *in_values.scalar();
- if (out_valid) {
- BitUtil::SetBitsTo(out_valid, out_offset, length, scalar.is_valid);
- }
- CopyFixedWidth<Type>::CopyScalar(scalar, length, out_values, out_offset);
- } else {
- const ArrayData& array = *in_values.array();
- if (out_valid) {
- if (array.MayHaveNulls()) {
- if (length == 1) {
- // CopyBitmap is slow for short runs
- BitUtil::SetBitTo(
- out_valid, out_offset,
- BitUtil::GetBit(array.buffers[0]->data(), array.offset + in_offset));
- } else {
- arrow::internal::CopyBitmap(array.buffers[0]->data(), array.offset + in_offset,
- length, out_valid, out_offset);
- }
- } else {
- BitUtil::SetBitsTo(out_valid, out_offset, length, true);
- }
- }
- CopyFixedWidth<Type>::CopyArray(*array.type, array.buffers[1]->data(),
- array.offset + in_offset, length, out_values,
- out_offset);
- }
-}
-
-// Specialized helper to copy a single value from a source array. Allows avoiding
-// repeatedly calling MayHaveNulls and Buffer::data() which have internal checks that
-// add up when called in a loop.
-template <typename Type>
-void CopyOneArrayValue(const DataType& type, const uint8_t* in_valid,
- const uint8_t* in_values, const int64_t in_offset,
- uint8_t* out_valid, uint8_t* out_values,
- const int64_t out_offset) {
- if (out_valid) {
- BitUtil::SetBitTo(out_valid, out_offset,
- !in_valid || BitUtil::GetBit(in_valid, in_offset));
- }
- CopyFixedWidth<Type>::CopyArray(type, in_values, in_offset, /*length=*/1, out_values,
- out_offset);
-}
-
-struct CaseWhenFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
-    // The first argument is a struct of booleans, where the number of fields in the
- // struct is either equal to the number of other arguments or is one less.
- RETURN_NOT_OK(CheckArity(*values));
- EnsureDictionaryDecoded(values);
- auto first_type = (*values)[0].type;
- if (first_type->id() != Type::STRUCT) {
- return Status::TypeError("case_when: first argument must be STRUCT, not ",
- *first_type);
- }
- auto num_fields = static_cast<size_t>(first_type->num_fields());
- if (num_fields < values->size() - 2 || num_fields >= values->size()) {
- return Status::Invalid(
- "case_when: number of struct fields must be equal to or one less than count of "
- "remaining arguments (",
- values->size() - 1, "), got: ", first_type->num_fields());
- }
- for (const auto& field : first_type->fields()) {
- if (field->type()->id() != Type::BOOL) {
- return Status::TypeError(
- "case_when: all fields of first argument must be BOOL, but ", field->name(),
- " was of type: ", *field->type());
- }
- }
-
- if (auto type = CommonNumeric(values->data() + 1, values->size() - 1)) {
- for (auto it = values->begin() + 1; it != values->end(); it++) {
- it->type = type;
- }
- }
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-};
-
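-// Illustrative call shape (added note): the first argument is a struct of
-// boolean condition arrays, followed by one value argument per field, with an
-// optional trailing value acting as the ELSE branch, e.g.
-//   CallFunction("case_when", {conds_struct, v1, v2, v_else})
-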
-// Implement a 'case when' (SQL)/'select' (NumPy) function for any scalar conditions
-template <typename Type>
-Status ExecScalarCaseWhen(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const auto& conds = checked_cast<const StructScalar&>(*batch.values[0].scalar());
- if (!conds.is_valid) {
- return Status::Invalid("cond struct must not be null");
- }
- Datum result;
- for (size_t i = 0; i < batch.values.size() - 1; i++) {
- if (i < conds.value.size()) {
- const Scalar& cond = *conds.value[i];
- if (cond.is_valid && internal::UnboxScalar<BooleanType>::Unbox(cond)) {
- result = batch[i + 1];
- break;
- }
- } else {
- // ELSE clause
- result = batch[i + 1];
- break;
- }
- }
- if (out->is_scalar()) {
- *out = result.is_scalar() ? result.scalar() : MakeNullScalar(out->type());
- return Status::OK();
- }
- ArrayData* output = out->mutable_array();
- if (!result.is_value()) {
- // All conditions false, no 'else' argument
- result = MakeNullScalar(out->type());
- }
- CopyValues<Type>(result, /*in_offset=*/0, batch.length,
- output->GetMutableValues<uint8_t>(0, 0),
- output->GetMutableValues<uint8_t>(1, 0), output->offset);
- return Status::OK();
-}
-
-// Implement 'case when' for any mix of scalar/array arguments for any fixed-width type,
-// given helper functions to copy data from a source array to a target array
-template <typename Type>
-Status ExecArrayCaseWhen(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const auto& conds_array = *batch.values[0].array();
- if (conds_array.GetNullCount() > 0) {
- return Status::Invalid("cond struct must not have top-level nulls");
- }
- ArrayData* output = out->mutable_array();
- const int64_t out_offset = output->offset;
- const auto num_value_args = batch.values.size() - 1;
- const bool have_else_arg =
- static_cast<size_t>(conds_array.type->num_fields()) < num_value_args;
- uint8_t* out_valid = output->buffers[0]->mutable_data();
- uint8_t* out_values = output->buffers[1]->mutable_data();
- if (have_else_arg) {
- // Copy 'else' value into output
- CopyValues<Type>(batch.values.back(), /*in_offset=*/0, batch.length, out_valid,
- out_values, out_offset);
- } else {
- // There's no 'else' argument, so we should have an all-null validity bitmap
- BitUtil::SetBitsTo(out_valid, out_offset, batch.length, false);
- }
-
- // Allocate a temporary bitmap to determine which elements still need setting.
- ARROW_ASSIGN_OR_RAISE(auto mask_buffer, ctx->AllocateBitmap(batch.length));
- uint8_t* mask = mask_buffer->mutable_data();
- std::memset(mask, 0xFF, mask_buffer->size());
-
- // Then iterate through each argument in turn and set elements.
- for (size_t i = 0; i < batch.values.size() - (have_else_arg ? 2 : 1); i++) {
- const ArrayData& cond_array = *conds_array.child_data[i];
- const int64_t cond_offset = conds_array.offset + cond_array.offset;
- const uint8_t* cond_values = cond_array.buffers[1]->data();
- const Datum& values_datum = batch[i + 1];
- int64_t offset = 0;
-
- if (cond_array.GetNullCount() == 0) {
-      // cond has no nulls: visit the mask & cond bitmaps simultaneously
- BinaryBitBlockCounter counter(mask, /*start_offset=*/0, cond_values, cond_offset,
- batch.length);
- while (offset < batch.length) {
- const auto block = counter.NextAndWord();
- if (block.AllSet()) {
- CopyValues<Type>(values_datum, offset, block.length, out_valid, out_values,
- out_offset + offset);
- BitUtil::SetBitsTo(mask, offset, block.length, false);
- } else if (block.popcount) {
- for (int64_t j = 0; j < block.length; ++j) {
- if (BitUtil::GetBit(mask, offset + j) &&
- BitUtil::GetBit(cond_values, cond_offset + offset + j)) {
- CopyValues<Type>(values_datum, offset + j, /*length=*/1, out_valid,
- out_values, out_offset + offset + j);
- BitUtil::SetBitTo(mask, offset + j, false);
- }
- }
- }
- offset += block.length;
- }
- } else {
- // Visit mask & cond bitmap & cond validity
- const uint8_t* cond_valid = cond_array.buffers[0]->data();
- Bitmap bitmaps[3] = {{mask, /*offset=*/0, batch.length},
- {cond_values, cond_offset, batch.length},
- {cond_valid, cond_offset, batch.length}};
- Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 3> words) {
- const uint64_t word = words[0] & words[1] & words[2];
- const int64_t block_length = std::min<int64_t>(64, batch.length - offset);
- if (word == std::numeric_limits<uint64_t>::max()) {
- CopyValues<Type>(values_datum, offset, block_length, out_valid, out_values,
- out_offset + offset);
- BitUtil::SetBitsTo(mask, offset, block_length, false);
- } else if (word) {
- for (int64_t j = 0; j < block_length; ++j) {
- if (BitUtil::GetBit(mask, offset + j) &&
- BitUtil::GetBit(cond_valid, cond_offset + offset + j) &&
- BitUtil::GetBit(cond_values, cond_offset + offset + j)) {
- CopyValues<Type>(values_datum, offset + j, /*length=*/1, out_valid,
- out_values, out_offset + offset + j);
- BitUtil::SetBitTo(mask, offset + j, false);
- }
- }
- }
- });
- }
- }
- if (!have_else_arg) {
- // Need to initialize any remaining null slots (uninitialized memory)
- BitBlockCounter counter(mask, /*offset=*/0, batch.length);
- int64_t offset = 0;
- auto bit_width = checked_cast<const FixedWidthType&>(*out->type()).bit_width();
- auto byte_width = BitUtil::BytesForBits(bit_width);
- while (offset < batch.length) {
- const auto block = counter.NextWord();
- if (block.AllSet()) {
- if (bit_width == 1) {
- BitUtil::SetBitsTo(out_values, out_offset + offset, block.length, false);
- } else {
- std::memset(out_values + (out_offset + offset) * byte_width, 0x00,
- byte_width * block.length);
- }
- } else if (!block.NoneSet()) {
- for (int64_t j = 0; j < block.length; ++j) {
- if (BitUtil::GetBit(out_valid, out_offset + offset + j)) continue;
- if (bit_width == 1) {
- BitUtil::ClearBit(out_values, out_offset + offset + j);
- } else {
- std::memset(out_values + (out_offset + offset + j) * byte_width, 0x00,
- byte_width);
- }
- }
- }
- offset += block.length;
- }
- }
- return Status::OK();
-}
-
-template <typename Type, typename Enable = void>
-struct CaseWhenFunctor {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (batch.values[0].is_array()) {
- return ExecArrayCaseWhen<Type>(ctx, batch, out);
- }
- return ExecScalarCaseWhen<Type>(ctx, batch, out);
- }
-};
-
-template <>
-struct CaseWhenFunctor<NullType> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
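-    // Nothing to compute: for NullType the output is entirely null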
- return Status::OK();
- }
-};
-
-struct CoalesceFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- RETURN_NOT_OK(CheckArity(*values));
- using arrow::compute::detail::DispatchExactImpl;
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- EnsureDictionaryDecoded(values);
- if (auto type = CommonNumeric(*values)) {
- ReplaceTypes(type, values);
- }
- if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
- return arrow::compute::detail::NoMatchingKernel(this, *values);
- }
-};
-
-// Implement a 'coalesce' (SQL) operator for any number of scalar inputs
-Status ExecScalarCoalesce(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- for (const auto& datum : batch.values) {
- if (datum.scalar()->is_valid) {
- *out = datum;
- break;
- }
- }
- return Status::OK();
-}
-
-// Helper: copy from a source datum into all null slots of the output
-template <typename Type>
-void CopyValuesAllValid(Datum source, uint8_t* out_valid, uint8_t* out_values,
- const int64_t out_offset, const int64_t length) {
- BitBlockCounter counter(out_valid, out_offset, length);
- int64_t offset = 0;
- while (offset < length) {
- const auto block = counter.NextWord();
- if (block.NoneSet()) {
- CopyValues<Type>(source, offset, block.length, out_valid, out_values,
- out_offset + offset);
- } else if (!block.AllSet()) {
- for (int64_t j = 0; j < block.length; ++j) {
- if (!BitUtil::GetBit(out_valid, out_offset + offset + j)) {
- CopyValues<Type>(source, offset + j, 1, out_valid, out_values,
- out_offset + offset + j);
- }
- }
- }
- offset += block.length;
- }
-}
-
-// Helper: zero the values buffer of the output wherever the slot is null
-void InitializeNullSlots(const DataType& type, uint8_t* out_valid, uint8_t* out_values,
- const int64_t out_offset, const int64_t length) {
- BitBlockCounter counter(out_valid, out_offset, length);
- int64_t offset = 0;
- auto bit_width = checked_cast<const FixedWidthType&>(type).bit_width();
- auto byte_width = BitUtil::BytesForBits(bit_width);
- while (offset < length) {
- const auto block = counter.NextWord();
- if (block.NoneSet()) {
- if (bit_width == 1) {
- BitUtil::SetBitsTo(out_values, out_offset + offset, block.length, false);
- } else {
- std::memset(out_values + (out_offset + offset) * byte_width, 0x00,
- byte_width * block.length);
- }
- } else if (!block.AllSet()) {
- for (int64_t j = 0; j < block.length; ++j) {
- if (BitUtil::GetBit(out_valid, out_offset + offset + j)) continue;
- if (bit_width == 1) {
- BitUtil::ClearBit(out_values, out_offset + offset + j);
- } else {
- std::memset(out_values + (out_offset + offset + j) * byte_width, 0x00,
- byte_width);
- }
- }
- }
- offset += block.length;
- }
-}
-
-// Implement 'coalesce' for any mix of scalar/array arguments for any fixed-width type
-template <typename Type>
-Status ExecArrayCoalesce(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- ArrayData* output = out->mutable_array();
- const int64_t out_offset = output->offset;
- // Use output validity buffer as mask to decide what values to copy
- uint8_t* out_valid = output->buffers[0]->mutable_data();
-  // Clear output validity bitmap - no values are set initially
- BitUtil::SetBitsTo(out_valid, out_offset, batch.length, false);
- uint8_t* out_values = output->buffers[1]->mutable_data();
-
- for (const auto& datum : batch.values) {
- if ((datum.is_scalar() && datum.scalar()->is_valid) ||
- (datum.is_array() && !datum.array()->MayHaveNulls())) {
- // Valid scalar, or all-valid array
- CopyValuesAllValid<Type>(datum, out_valid, out_values, out_offset, batch.length);
- break;
- } else if (datum.is_array()) {
- // Array with nulls
- const ArrayData& arr = *datum.array();
- const DataType& type = *datum.type();
- const uint8_t* in_valid = arr.buffers[0]->data();
- const uint8_t* in_values = arr.buffers[1]->data();
- BinaryBitBlockCounter counter(in_valid, arr.offset, out_valid, out_offset,
- batch.length);
- int64_t offset = 0;
- while (offset < batch.length) {
- const auto block = counter.NextAndNotWord();
- if (block.AllSet()) {
- CopyValues<Type>(datum, offset, block.length, out_valid, out_values,
- out_offset + offset);
- } else if (block.popcount) {
- for (int64_t j = 0; j < block.length; ++j) {
- if (!BitUtil::GetBit(out_valid, out_offset + offset + j) &&
- BitUtil::GetBit(in_valid, arr.offset + offset + j)) {
- // This version lets us avoid calling MayHaveNulls() on every iteration
- // (which does an atomic load and can add up)
- CopyOneArrayValue<Type>(type, in_valid, in_values, arr.offset + offset + j,
- out_valid, out_values, out_offset + offset + j);
- }
- }
- }
- offset += block.length;
- }
- }
- }
-
- // Initialize any remaining null slots (uninitialized memory)
- InitializeNullSlots(*out->type(), out_valid, out_values, out_offset, batch.length);
- return Status::OK();
-}
-
-template <typename Type, typename Enable = void>
-struct CoalesceFunctor {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- for (const auto& datum : batch.values) {
- if (datum.is_array()) {
- return ExecArrayCoalesce<Type>(ctx, batch, out);
- }
- }
- return ExecScalarCoalesce(ctx, batch, out);
- }
-};
-
-template <>
-struct CoalesceFunctor<NullType> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
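-    // Nothing to compute: coalescing all-null inputs yields an all-null output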
- return Status::OK();
- }
-};
-
-template <typename Type>
-struct CoalesceFunctor<Type, enable_if_base_binary<Type>> {
- using offset_type = typename Type::offset_type;
- using BuilderType = typename TypeTraits<Type>::BuilderType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- for (const auto& datum : batch.values) {
- if (datum.is_array()) {
- return ExecArray(ctx, batch, out);
- }
- }
- return ExecScalarCoalesce(ctx, batch, out);
- }
-
- static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-    // Special case: if the first argument is a non-null scalar or an all-valid
-    // array, return it directly
- for (const auto& datum : batch.values) {
- if (datum.is_scalar()) {
- if (!datum.scalar()->is_valid) continue;
- ARROW_ASSIGN_OR_RAISE(
- *out, MakeArrayFromScalar(*datum.scalar(), batch.length, ctx->memory_pool()));
- return Status::OK();
- } else if (datum.is_array() && !datum.array()->MayHaveNulls()) {
- *out = datum;
- return Status::OK();
- }
- break;
- }
- ArrayData* output = out->mutable_array();
- BuilderType builder(batch[0].type(), ctx->memory_pool());
- RETURN_NOT_OK(builder.Reserve(batch.length));
- for (int64_t i = 0; i < batch.length; i++) {
- bool set = false;
- for (const auto& datum : batch.values) {
- if (datum.is_scalar()) {
- if (datum.scalar()->is_valid) {
- RETURN_NOT_OK(builder.Append(UnboxScalar<Type>::Unbox(*datum.scalar())));
- set = true;
- break;
- }
- } else {
- const ArrayData& source = *datum.array();
- if (!source.MayHaveNulls() ||
- BitUtil::GetBit(source.buffers[0]->data(), source.offset + i)) {
- const uint8_t* data = source.buffers[2]->data();
- const offset_type* offsets = source.GetValues<offset_type>(1);
- const offset_type offset0 = offsets[i];
- const offset_type offset1 = offsets[i + 1];
- RETURN_NOT_OK(builder.Append(data + offset0, offset1 - offset0));
- set = true;
- break;
- }
- }
- }
- if (!set) RETURN_NOT_OK(builder.AppendNull());
- }
- ARROW_ASSIGN_OR_RAISE(auto temp_output, builder.Finish());
- *output = *temp_output->data();
- // Builder type != logical type due to GenerateTypeAgnosticVarBinaryBase
- output->type = batch[0].type();
- return Status::OK();
- }
-};
-
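-// Resolves the output descriptor for case_when: the type of the last argument,
-// broadcast to array shape if any argument is an array (via GetBroadcastShape)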
-Result<ValueDescr> LastType(KernelContext*, const std::vector<ValueDescr>& descrs) {
- ValueDescr result = descrs.back();
- result.shape = GetBroadcastShape(descrs);
- return result;
-}
-
-void AddCaseWhenKernel(const std::shared_ptr<CaseWhenFunction>& scalar_function,
- detail::GetTypeId get_id, ArrayKernelExec exec) {
- ScalarKernel kernel(
- KernelSignature::Make({InputType(Type::STRUCT), InputType(get_id.id)},
- OutputType(LastType),
- /*is_varargs=*/true),
- exec);
- kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::PREALLOCATE;
- kernel.can_write_into_slices = is_fixed_width(get_id.id);
- DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
-}
-
-void AddPrimitiveCaseWhenKernels(const std::shared_ptr<CaseWhenFunction>& scalar_function,
- const std::vector<std::shared_ptr<DataType>>& types) {
- for (auto&& type : types) {
- auto exec = GenerateTypeAgnosticPrimitive<CaseWhenFunctor>(*type);
- AddCaseWhenKernel(scalar_function, type, std::move(exec));
- }
-}
-
-void AddCoalesceKernel(const std::shared_ptr<ScalarFunction>& scalar_function,
- detail::GetTypeId get_id, ArrayKernelExec exec) {
- ScalarKernel kernel(KernelSignature::Make({InputType(get_id.id)}, OutputType(FirstType),
- /*is_varargs=*/true),
- exec);
- kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::PREALLOCATE;
- kernel.can_write_into_slices = is_fixed_width(get_id.id);
- DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
-}
-
-void AddPrimitiveCoalesceKernels(const std::shared_ptr<ScalarFunction>& scalar_function,
- const std::vector<std::shared_ptr<DataType>>& types) {
- for (auto&& type : types) {
- auto exec = GenerateTypeAgnosticPrimitive<CoalesceFunctor>(*type);
- AddCoalesceKernel(scalar_function, type, std::move(exec));
- }
-}
-
-const FunctionDoc if_else_doc{"Choose values based on a condition",
-                              ("`cond` must be a Boolean scalar or array.\n"
-                               "`left` and `right` must be scalars or arrays of "
-                               "the same type.\nNull values in `cond` will be "
-                               "promoted to the output."),
- {"cond", "left", "right"}};
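-
-// For instance (illustrative only): if_else([true, false, null], [1, 2, 3],
-// [4, 5, 6]) yields [1, 5, null]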
-
-const FunctionDoc case_when_doc{
- "Choose values based on multiple conditions",
- ("`cond` must be a struct of Boolean values. `cases` can be a mix "
- "of scalar and array arguments (of any type, but all must be the "
- "same type or castable to a common type), with either exactly one "
- "datum per child of `cond`, or one more `cases` than children of "
- "`cond` (in which case we have an \"else\" value).\n"
- "Each row of the output will be the corresponding value of the "
- "first datum in `cases` for which the corresponding child of `cond` "
- "is true, or otherwise the \"else\" value (if given), or null. "
- "Essentially, this implements a switch-case or if-else, if-else... "
- "statement."),
- {"cond", "*cases"}};
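-
-// For instance (illustrative only): case_when({c0: [true, false, false],
-// c1: [false, true, false]}, x, y, z) yields [x[0], y[1], z[2]], with z acting
-// as the "else" value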
-
-const FunctionDoc coalesce_doc{
- "Select the first non-null value in each slot",
- ("Each row of the output will be the value from the first corresponding input "
- "for which the value is not null. If all inputs are null in a row, the output "
- "will be null."),
- {"*values"}};
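-
-// For instance (illustrative only): coalesce([null, 1, null], [2, null, null])
-// yields [2, 1, null]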
-} // namespace
-
-void RegisterScalarIfElse(FunctionRegistry* registry) {
- {
- auto func =
- std::make_shared<IfElseFunction>("if_else", Arity::Ternary(), &if_else_doc);
-
- AddPrimitiveIfElseKernels(func, NumericTypes());
- AddPrimitiveIfElseKernels(func, TemporalTypes());
- AddPrimitiveIfElseKernels(func, {boolean(), day_time_interval(), month_interval()});
- AddNullIfElseKernel(func);
- AddBinaryIfElseKernels(func, BaseBinaryTypes());
- AddFSBinaryIfElseKernel(func);
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
- {
- auto func = std::make_shared<CaseWhenFunction>(
- "case_when", Arity::VarArgs(/*min_args=*/1), &case_when_doc);
- AddPrimitiveCaseWhenKernels(func, NumericTypes());
- AddPrimitiveCaseWhenKernels(func, TemporalTypes());
- AddPrimitiveCaseWhenKernels(
- func, {boolean(), null(), day_time_interval(), month_interval()});
- AddCaseWhenKernel(func, Type::FIXED_SIZE_BINARY,
- CaseWhenFunctor<FixedSizeBinaryType>::Exec);
- AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor<Decimal128Type>::Exec);
- AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor<Decimal256Type>::Exec);
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
- {
- auto func = std::make_shared<CoalesceFunction>(
- "coalesce", Arity::VarArgs(/*min_args=*/1), &coalesce_doc);
- AddPrimitiveCoalesceKernels(func, NumericTypes());
- AddPrimitiveCoalesceKernels(func, TemporalTypes());
- AddPrimitiveCoalesceKernels(
- func, {boolean(), null(), day_time_interval(), month_interval()});
- AddCoalesceKernel(func, Type::FIXED_SIZE_BINARY,
- CoalesceFunctor<FixedSizeBinaryType>::Exec);
- AddCoalesceKernel(func, Type::DECIMAL128, CoalesceFunctor<Decimal128Type>::Exec);
- AddCoalesceKernel(func, Type::DECIMAL256, CoalesceFunctor<Decimal256Type>::Exec);
- for (const auto& ty : BaseBinaryTypes()) {
- AddCoalesceKernel(func, ty, GenerateTypeAgnosticVarBinaryBase<CoalesceFunctor>(ty));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/compute/api.h>
+#include <arrow/compute/kernels/codegen_internal.h>
+#include <arrow/compute/util_internal.h>
+#include <arrow/util/bit_block_counter.h>
+#include <arrow/util/bitmap.h>
+#include <arrow/util/bitmap_ops.h>
+#include <arrow/util/bitmap_reader.h>
+
+namespace arrow {
+using internal::BitBlockCount;
+using internal::BitBlockCounter;
+using internal::Bitmap;
+using internal::BitmapWordReader;
+
+namespace compute {
+namespace internal {
+
+namespace {
+
+constexpr uint64_t kAllNull = 0;
+constexpr uint64_t kAllValid = ~kAllNull;
+
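+// Returns kAllValid/kAllNull when a datum's validity is constant (a scalar, an
+// all-null array, or an array without nulls); otherwise returns nullopt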
+util::optional<uint64_t> GetConstantValidityWord(const Datum& data) {
+ if (data.is_scalar()) {
+ return data.scalar()->is_valid ? kAllValid : kAllNull;
+ }
+
+ if (data.array()->null_count == data.array()->length) return kAllNull;
+
+ if (!data.array()->MayHaveNulls()) return kAllValid;
+
+ // no constant validity word available
+ return {};
+}
+
+inline Bitmap GetBitmap(const Datum& datum, int i) {
+ if (datum.is_scalar()) return {};
+ const ArrayData& a = *datum.array();
+ return Bitmap{a.buffers[i], a.offset, a.length};
+}
+
+// If the condition is null, the output is null; otherwise, validity is taken from
+// the selected argument,
+// i.e. cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
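+// For example, when cond.valid=1 and cond.data=1 the output validity equals
+// left.valid; when cond.valid=0 the output is null regardless of the other words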
+template <typename AllocateNullBitmap>
+Status PromoteNullsVisitor(KernelContext* ctx, const Datum& cond_d, const Datum& left_d,
+ const Datum& right_d, ArrayData* output) {
+ auto cond_const = GetConstantValidityWord(cond_d);
+ auto left_const = GetConstantValidityWord(left_d);
+ auto right_const = GetConstantValidityWord(right_d);
+
+ enum { COND_CONST = 1, LEFT_CONST = 2, RIGHT_CONST = 4 };
+ auto flag = COND_CONST * cond_const.has_value() | LEFT_CONST * left_const.has_value() |
+ RIGHT_CONST * right_const.has_value();
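+  // e.g. flag == (LEFT_CONST | RIGHT_CONST) means only cond's validity varies
+  // per word, which selects the matching case in the switch below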
+
+ const ArrayData& cond = *cond_d.array();
+ // cond.data will always be available
+ Bitmap cond_data{cond.buffers[1], cond.offset, cond.length};
+ Bitmap cond_valid{cond.buffers[0], cond.offset, cond.length};
+ Bitmap left_valid = GetBitmap(left_d, 0);
+ Bitmap right_valid = GetBitmap(right_d, 0);
+
+ // cond.valid & (cond.data & left.valid | ~cond.data & right.valid)
+  // In the following cases, we don't need to allocate the out_valid bitmap
+
+ // if cond & left & right all ones, then output is all valid.
+ // if output validity buffer is already allocated (NullHandling::
+ // COMPUTED_PREALLOCATE) -> set all bits
+ // else, return nullptr
+ if (cond_const == kAllValid && left_const == kAllValid && right_const == kAllValid) {
+ if (AllocateNullBitmap::value) { // NullHandling::COMPUTED_NO_PREALLOCATE
+ output->buffers[0] = nullptr;
+ } else { // NullHandling::COMPUTED_PREALLOCATE
+ BitUtil::SetBitmap(output->buffers[0]->mutable_data(), output->offset,
+ output->length);
+ }
+ return Status::OK();
+ }
+
+ if (left_const == kAllValid && right_const == kAllValid) {
+ // if both left and right are valid, no need to calculate out_valid bitmap. Copy
+ // cond validity buffer
+ if (AllocateNullBitmap::value) { // NullHandling::COMPUTED_NO_PREALLOCATE
+      // if there's an offset, copy the bitmap (a bitmap cannot be zero-copy
+      // sliced at a bit offset)
+ if (cond.offset) {
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[0],
+ arrow::internal::CopyBitmap(ctx->memory_pool(), cond.buffers[0]->data(),
+ cond.offset, cond.length));
+ } else { // just copy assign cond validity buffer
+ output->buffers[0] = cond.buffers[0];
+ }
+ } else { // NullHandling::COMPUTED_PREALLOCATE
+ arrow::internal::CopyBitmap(cond.buffers[0]->data(), cond.offset, cond.length,
+ output->buffers[0]->mutable_data(), output->offset);
+ }
+ return Status::OK();
+ }
+
+ // lambda function that will be used inside the visitor
+ auto apply = [&](uint64_t c_valid, uint64_t c_data, uint64_t l_valid,
+ uint64_t r_valid) {
+ return c_valid & ((c_data & l_valid) | (~c_data & r_valid));
+ };
+
+ if (AllocateNullBitmap::value) {
+    // The following cases require a separate out_valid buffer;
+    // COMPUTED_NO_PREALLOCATE will not have allocated one for it.
+ ARROW_ASSIGN_OR_RAISE(output->buffers[0], ctx->AllocateBitmap(cond.length));
+ }
+
+ std::array<Bitmap, 1> out_bitmaps{
+ Bitmap{output->buffers[0], output->offset, output->length}};
+
+ switch (flag) {
+ case COND_CONST | LEFT_CONST | RIGHT_CONST: {
+ std::array<Bitmap, 1> bitmaps{cond_data};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 1>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(*cond_const, words_in[0],
+ *left_const, *right_const);
+ });
+ break;
+ }
+ case LEFT_CONST | RIGHT_CONST: {
+ std::array<Bitmap, 2> bitmaps{cond_valid, cond_data};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 2>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(words_in[0], words_in[1],
+ *left_const, *right_const);
+ });
+ break;
+ }
+ case COND_CONST | RIGHT_CONST: {
+ // bitmaps[C_VALID], bitmaps[R_VALID] might be null; override to make it safe for
+ // Visit()
+ std::array<Bitmap, 2> bitmaps{cond_data, left_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 2>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(*cond_const, words_in[0],
+ words_in[1], *right_const);
+ });
+ break;
+ }
+ case RIGHT_CONST: {
+ // bitmaps[R_VALID] might be null; override to make it safe for Visit()
+ std::array<Bitmap, 3> bitmaps{cond_valid, cond_data, left_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 3>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(words_in[0], words_in[1],
+ words_in[2], *right_const);
+ });
+ break;
+ }
+ case COND_CONST | LEFT_CONST: {
+ // bitmaps[C_VALID], bitmaps[L_VALID] might be null; override to make it safe for
+ // Visit()
+ std::array<Bitmap, 2> bitmaps{cond_data, right_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 2>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(*cond_const, words_in[0],
+ *left_const, words_in[1]);
+ });
+ break;
+ }
+ case LEFT_CONST: {
+ // bitmaps[L_VALID] might be null; override to make it safe for Visit()
+ std::array<Bitmap, 3> bitmaps{cond_valid, cond_data, right_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 3>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(words_in[0], words_in[1],
+ *left_const, words_in[2]);
+ });
+ break;
+ }
+ case COND_CONST: {
+ // bitmaps[C_VALID] might be null; override to make it safe for Visit()
+ std::array<Bitmap, 3> bitmaps{cond_data, left_valid, right_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 3>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(*cond_const, words_in[0],
+ words_in[1], words_in[2]);
+ });
+ break;
+ }
+ case 0: {
+ std::array<Bitmap, 4> bitmaps{cond_valid, cond_data, left_valid, right_valid};
+ Bitmap::VisitWordsAndWrite(bitmaps, &out_bitmaps,
+ [&](const std::array<uint64_t, 4>& words_in,
+ std::array<uint64_t, 1>* word_out) {
+ word_out->at(0) = apply(words_in[0], words_in[1],
+ words_in[2], words_in[3]);
+ });
+ break;
+ }
+ }
+ return Status::OK();
+}
+
+using Word = uint64_t;
+static constexpr int64_t word_len = sizeof(Word) * 8;
+
+/// Runs the main if_else loop. Here, it is expected that the right data has already
+/// been copied to the output.
+/// `invert` inverts the meaning of cond.data: when set to `true`, the condition
+/// buffer is treated as inverted before calling the handle_block function.
+/// This is useful when left is an array and right is a scalar: rather than
+/// copying data from the right to the output, we can copy left data to the output
+/// and invert the cond data to fill in the right values. Filling with a scalar is
+/// presumed to be more efficient than filling with an array.
+///
+/// `HandleBlock` has the signature:
+///   [](int64_t offset, int64_t length){...}
+/// It should copy `length` elements from the source array to the output array,
+/// starting at `offset` in both arrays.
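+///
+/// For example (an illustrative sketch mirroring the numeric functor below):
+///   [&](int64_t data_offset, int64_t num_elems) {
+///     std::memcpy(out_values + data_offset, left_data + data_offset,
+///                 num_elems * sizeof(T));
+///   }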
+template <typename HandleBlock, bool invert = false>
+void RunIfElseLoop(const ArrayData& cond, const HandleBlock& handle_block) {
+ int64_t data_offset = 0;
+ int64_t bit_offset = cond.offset;
+ const auto* cond_data = cond.buffers[1]->data(); // this is a BoolArray
+
+ BitmapWordReader<Word> cond_reader(cond_data, cond.offset, cond.length);
+
+ constexpr Word pickAll = invert ? 0 : UINT64_MAX;
+ constexpr Word pickNone = ~pickAll;
+
+ int64_t cnt = cond_reader.words();
+ while (cnt--) {
+ Word word = cond_reader.NextWord();
+
+ if (word == pickAll) {
+ handle_block(data_offset, word_len);
+ } else if (word != pickNone) {
+ for (int64_t i = 0; i < word_len; ++i) {
+ if (BitUtil::GetBit(cond_data, bit_offset + i) != invert) {
+ handle_block(data_offset + i, 1);
+ }
+ }
+ }
+ data_offset += word_len;
+ bit_offset += word_len;
+ }
+
+ constexpr uint8_t pickAllByte = invert ? 0 : UINT8_MAX;
+ // byte bit-wise inversion is int-wide. Hence XOR with 0xff
+ constexpr uint8_t pickNoneByte = pickAllByte ^ 0xff;
+
+ cnt = cond_reader.trailing_bytes();
+ while (cnt--) {
+ int valid_bits;
+ uint8_t byte = cond_reader.NextTrailingByte(valid_bits);
+
+ if (byte == pickAllByte && valid_bits == 8) {
+ handle_block(data_offset, 8);
+ } else if (byte != pickNoneByte) {
+ for (int i = 0; i < valid_bits; ++i) {
+ if (BitUtil::GetBit(cond_data, bit_offset + i) != invert) {
+ handle_block(data_offset + i, 1);
+ }
+ }
+ }
+ data_offset += 8;
+ bit_offset += 8;
+ }
+}
+
+template <typename HandleBlock>
+void RunIfElseLoopInverted(const ArrayData& cond, const HandleBlock& handle_block) {
+ RunIfElseLoop<HandleBlock, true>(cond, handle_block);
+}
+
+/// Runs if_else when cond is a scalar. Two helper functions are required:
+/// 1. CopyArrayData, 2. BroadcastScalar
+template <typename CopyArrayData, typename BroadcastScalar>
+Status RunIfElseScalar(const BooleanScalar& cond, const Datum& left, const Datum& right,
+ Datum* out, const CopyArrayData& copy_array_data,
+ const BroadcastScalar& broadcast_scalar) {
+ if (left.is_scalar() && right.is_scalar()) { // output will be a scalar
+ if (cond.is_valid) {
+ *out = cond.value ? left.scalar() : right.scalar();
+ } else {
+ *out = MakeNullScalar(left.type());
+ }
+ return Status::OK();
+ }
+
+  // either left or right is an array, so the output is always an array
+ const std::shared_ptr<ArrayData>& out_array = out->array();
+ if (!cond.is_valid) {
+ // cond is null; output is all null --> clear validity buffer
+ BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+ out_array->length);
+ return Status::OK();
+ }
+
+ // cond is a non-null scalar
+ const auto& valid_data = cond.value ? left : right;
+ if (valid_data.is_array()) {
+ // valid_data is an array. Hence copy data to the output buffers
+ const auto& valid_array = valid_data.array();
+ if (valid_array->MayHaveNulls()) {
+ arrow::internal::CopyBitmap(
+ valid_array->buffers[0]->data(), valid_array->offset, valid_array->length,
+ out_array->buffers[0]->mutable_data(), out_array->offset);
+ } else { // validity buffer is nullptr --> set all bits
+ BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+ out_array->length);
+ }
+ copy_array_data(*valid_array, out_array.get());
+ return Status::OK();
+
+ } else { // valid data is scalar
+ // valid data is a scalar that needs to be broadcasted
+ const auto& valid_scalar = *valid_data.scalar();
+ if (valid_scalar.is_valid) { // if the scalar is non-null, broadcast
+ BitUtil::SetBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+ out_array->length);
+ broadcast_scalar(*valid_data.scalar(), out_array.get());
+ } else { // scalar is null, clear the output validity buffer
+ BitUtil::ClearBitmap(out_array->buffers[0]->mutable_data(), out_array->offset,
+ out_array->length);
+ }
+ return Status::OK();
+ }
+}
+
+template <typename Type, typename Enable = void>
+struct IfElseFunctor {};
+
+// Only number types need to be handled for fixed-size primitive data types, because
+// internal::GenerateTypeAgnosticPrimitive forwards types to the corresponding
+// unsigned int type
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_number<Type>> {
+ using T = typename TypeTraits<Type>::CType;
+ // A - Array, S - Scalar, X = Array/Scalar
+
+ // SXX
+ static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+ const Datum& right, Datum* out) {
+ return RunIfElseScalar(
+ cond, left, right, out,
+ /*CopyArrayData*/
+ [&](const ArrayData& valid_array, ArrayData* out_array) {
+ std::memcpy(out_array->GetMutableValues<T>(1), valid_array.GetValues<T>(1),
+ valid_array.length * sizeof(T));
+ },
+ /*BroadcastScalar*/
+ [&](const Scalar& scalar, ArrayData* out_array) {
+ T scalar_data = internal::UnboxScalar<Type>::Unbox(scalar);
+ std::fill(out_array->GetMutableValues<T>(1),
+ out_array->GetMutableValues<T>(1) + out_array->length, scalar_data);
+ });
+ }
+
+ // AAA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const ArrayData& right, ArrayData* out) {
+ T* out_values = out->template GetMutableValues<T>(1);
+
+ // copy right data to out_buff
+ const T* right_data = right.GetValues<T>(1);
+ std::memcpy(out_values, right_data, right.length * sizeof(T));
+
+ // selectively copy values from left data
+ const T* left_data = left.GetValues<T>(1);
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::memcpy(out_values + data_offset, left_data + data_offset,
+ num_elems * sizeof(T));
+ });
+
+ return Status::OK();
+ }
+
+ // ASA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const ArrayData& right, ArrayData* out) {
+ T* out_values = out->template GetMutableValues<T>(1);
+
+ // copy right data to out_buff
+ const T* right_data = right.GetValues<T>(1);
+ std::memcpy(out_values, right_data, right.length * sizeof(T));
+
+ // selectively copy values from left data
+ T left_data = internal::UnboxScalar<Type>::Unbox(left);
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::fill(out_values + data_offset, out_values + data_offset + num_elems,
+ left_data);
+ });
+
+ return Status::OK();
+ }
+
+ // AAS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const Scalar& right, ArrayData* out) {
+ T* out_values = out->template GetMutableValues<T>(1);
+
+ // copy left data to out_buff
+ const T* left_data = left.GetValues<T>(1);
+ std::memcpy(out_values, left_data, left.length * sizeof(T));
+
+ T right_data = internal::UnboxScalar<Type>::Unbox(right);
+
+ RunIfElseLoopInverted(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::fill(out_values + data_offset, out_values + data_offset + num_elems,
+ right_data);
+ });
+
+ return Status::OK();
+ }
+
+ // ASS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const Scalar& right, ArrayData* out) {
+ T* out_values = out->template GetMutableValues<T>(1);
+
+ // copy right data to out_buff
+ T right_data = internal::UnboxScalar<Type>::Unbox(right);
+ std::fill(out_values, out_values + cond.length, right_data);
+
+ // selectively copy values from left data
+ T left_data = internal::UnboxScalar<Type>::Unbox(left);
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::fill(out_values + data_offset, out_values + data_offset + num_elems,
+ left_data);
+ });
+
+ return Status::OK();
+ }
+};
+
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_boolean<Type>> {
+ // A - Array, S - Scalar, X = Array/Scalar
+
+ // SXX
+ static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+ const Datum& right, Datum* out) {
+ return RunIfElseScalar(
+ cond, left, right, out,
+ /*CopyArrayData*/
+ [&](const ArrayData& valid_array, ArrayData* out_array) {
+ arrow::internal::CopyBitmap(
+ valid_array.buffers[1]->data(), valid_array.offset, valid_array.length,
+ out_array->buffers[1]->mutable_data(), out_array->offset);
+ },
+ /*BroadcastScalar*/
+ [&](const Scalar& scalar, ArrayData* out_array) {
+ bool scalar_data = internal::UnboxScalar<Type>::Unbox(scalar);
+ BitUtil::SetBitsTo(out_array->buffers[1]->mutable_data(), out_array->offset,
+ out_array->length, scalar_data);
+ });
+ }
+
+ // AAA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const ArrayData& right, ArrayData* out) {
+ // out_buff = right & ~cond
+ const auto& out_buf = out->buffers[1];
+ arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
+ cond.buffers[1]->data(), cond.offset, cond.length,
+ out->offset, out_buf->mutable_data());
+
+ // out_buff = left & cond
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> temp_buf,
+ arrow::internal::BitmapAnd(
+ ctx->memory_pool(), left.buffers[1]->data(), left.offset,
+ cond.buffers[1]->data(), cond.offset, cond.length, 0));
+
+ arrow::internal::BitmapOr(out_buf->data(), out->offset, temp_buf->data(), 0,
+ cond.length, out->offset, out_buf->mutable_data());
+
+ return Status::OK();
+ }
+
+ // ASA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const ArrayData& right, ArrayData* out) {
+ // out_buff = right & ~cond
+ const auto& out_buf = out->buffers[1];
+ arrow::internal::BitmapAndNot(right.buffers[1]->data(), right.offset,
+ cond.buffers[1]->data(), cond.offset, cond.length,
+ out->offset, out_buf->mutable_data());
+
+ // out_buff = left & cond
+ bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
+ if (left_data) {
+ arrow::internal::BitmapOr(out_buf->data(), out->offset, cond.buffers[1]->data(),
+ cond.offset, cond.length, out->offset,
+ out_buf->mutable_data());
+ }
+
+ return Status::OK();
+ }
+
+ // AAS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const Scalar& right, ArrayData* out) {
+ // out_buff = left & cond
+ const auto& out_buf = out->buffers[1];
+ arrow::internal::BitmapAnd(left.buffers[1]->data(), left.offset,
+ cond.buffers[1]->data(), cond.offset, cond.length,
+ out->offset, out_buf->mutable_data());
+
+ bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);
+
+ // out_buff = left & cond | right & ~cond
+ if (right_data) {
+ arrow::internal::BitmapOrNot(out_buf->data(), out->offset, cond.buffers[1]->data(),
+ cond.offset, cond.length, out->offset,
+ out_buf->mutable_data());
+ }
+
+ return Status::OK();
+ }
+
+ // ASS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const Scalar& right, ArrayData* out) {
+ bool left_data = internal::UnboxScalar<BooleanType>::Unbox(left);
+ bool right_data = internal::UnboxScalar<BooleanType>::Unbox(right);
+
+ const auto& out_buf = out->buffers[1];
+
+ // out_buf = left & cond | right & ~cond
+ if (left_data) {
+ if (right_data) {
+ // out_buf = ones
+ BitUtil::SetBitmap(out_buf->mutable_data(), out->offset, cond.length);
+ } else {
+ // out_buf = cond
+ arrow::internal::CopyBitmap(cond.buffers[1]->data(), cond.offset, cond.length,
+ out_buf->mutable_data(), out->offset);
+ }
+ } else {
+ if (right_data) {
+ // out_buf = ~cond
+ arrow::internal::InvertBitmap(cond.buffers[1]->data(), cond.offset, cond.length,
+ out_buf->mutable_data(), out->offset);
+ } else {
+ // out_buf = zeros
+ BitUtil::ClearBitmap(out_buf->mutable_data(), out->offset, cond.length);
+ }
+ }
+
+ return Status::OK();
+ }
+};
+
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
+ using OffsetType = typename TypeTraits<Type>::OffsetType::c_type;
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+ // A - Array, S - Scalar, X = Array/Scalar
+
+ // SXX
+ static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+ const Datum& right, Datum* out) {
+ if (left.is_scalar() && right.is_scalar()) {
+ if (cond.is_valid) {
+ *out = cond.value ? left.scalar() : right.scalar();
+ } else {
+ *out = MakeNullScalar(left.type());
+ }
+ return Status::OK();
+ }
+ // either left or right is an array. Output is always an array
+ int64_t out_arr_len = std::max(left.length(), right.length());
+ if (!cond.is_valid) {
+ // cond is null; just create a null array
+      ARROW_ASSIGN_OR_RAISE(
+          *out, MakeArrayOfNull(left.type(), out_arr_len, ctx->memory_pool()));
+ return Status::OK();
+ }
+
+ const auto& valid_data = cond.value ? left : right;
+ if (valid_data.is_array()) {
+ *out = valid_data;
+ } else {
+ // valid data is a scalar that needs to be broadcasted
+ ARROW_ASSIGN_OR_RAISE(*out, MakeArrayFromScalar(*valid_data.scalar(), out_arr_len,
+ ctx->memory_pool()));
+ }
+ return Status::OK();
+ }
+
+ // AAA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const ArrayData& right, ArrayData* out) {
+ const auto* left_offsets = left.GetValues<OffsetType>(1);
+ const uint8_t* left_data = left.buffers[2]->data();
+ const auto* right_offsets = right.GetValues<OffsetType>(1);
+ const uint8_t* right_data = right.buffers[2]->data();
+
+ // allocate data buffer conservatively
+ int64_t data_buff_alloc = left_offsets[left.length] - left_offsets[0] +
+ right_offsets[right.length] - right_offsets[0];
+
+ BuilderType builder(ctx->memory_pool());
+ ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
+ ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
+
+ RunLoop(
+ cond, *out,
+ [&](int64_t i) {
+ builder.UnsafeAppend(left_data + left_offsets[i],
+ left_offsets[i + 1] - left_offsets[i]);
+ },
+ [&](int64_t i) {
+ builder.UnsafeAppend(right_data + right_offsets[i],
+ right_offsets[i + 1] - right_offsets[i]);
+ },
+ [&]() { builder.UnsafeAppendNull(); });
+ ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
+
+ out->SetNullCount(out_arr->data()->null_count);
+ out->buffers[0] = std::move(out_arr->data()->buffers[0]);
+ out->buffers[1] = std::move(out_arr->data()->buffers[1]);
+ out->buffers[2] = std::move(out_arr->data()->buffers[2]);
+ return Status::OK();
+ }
+
+ // ASA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const ArrayData& right, ArrayData* out) {
+ util::string_view left_data = internal::UnboxScalar<Type>::Unbox(left);
+ auto left_size = static_cast<OffsetType>(left_data.size());
+
+ const auto* right_offsets = right.GetValues<OffsetType>(1);
+ const uint8_t* right_data = right.buffers[2]->data();
+
+ // allocate data buffer conservatively
+ int64_t data_buff_alloc =
+ left_size * cond.length + right_offsets[right.length] - right_offsets[0];
+
+ BuilderType builder(ctx->memory_pool());
+ ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
+ ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
+
+ RunLoop(
+ cond, *out, [&](int64_t i) { builder.UnsafeAppend(left_data.data(), left_size); },
+ [&](int64_t i) {
+ builder.UnsafeAppend(right_data + right_offsets[i],
+ right_offsets[i + 1] - right_offsets[i]);
+ },
+ [&]() { builder.UnsafeAppendNull(); });
+ ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
+
+ out->SetNullCount(out_arr->data()->null_count);
+ out->buffers[0] = std::move(out_arr->data()->buffers[0]);
+ out->buffers[1] = std::move(out_arr->data()->buffers[1]);
+ out->buffers[2] = std::move(out_arr->data()->buffers[2]);
+ return Status::OK();
+ }
+
+ // AAS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const Scalar& right, ArrayData* out) {
+ const auto* left_offsets = left.GetValues<OffsetType>(1);
+ const uint8_t* left_data = left.buffers[2]->data();
+
+ util::string_view right_data = internal::UnboxScalar<Type>::Unbox(right);
+ auto right_size = static_cast<OffsetType>(right_data.size());
+
+ // allocate data buffer conservatively
+ int64_t data_buff_alloc =
+ right_size * cond.length + left_offsets[left.length] - left_offsets[0];
+
+ BuilderType builder(ctx->memory_pool());
+ ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
+ ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
+
+ RunLoop(
+ cond, *out,
+ [&](int64_t i) {
+ builder.UnsafeAppend(left_data + left_offsets[i],
+ left_offsets[i + 1] - left_offsets[i]);
+ },
+ [&](int64_t i) { builder.UnsafeAppend(right_data.data(), right_size); },
+ [&]() { builder.UnsafeAppendNull(); });
+ ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
+
+ out->SetNullCount(out_arr->data()->null_count);
+ out->buffers[0] = std::move(out_arr->data()->buffers[0]);
+ out->buffers[1] = std::move(out_arr->data()->buffers[1]);
+ out->buffers[2] = std::move(out_arr->data()->buffers[2]);
+ return Status::OK();
+ }
+
+ // ASS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const Scalar& right, ArrayData* out) {
+ util::string_view left_data = internal::UnboxScalar<Type>::Unbox(left);
+ auto left_size = static_cast<OffsetType>(left_data.size());
+
+ util::string_view right_data = internal::UnboxScalar<Type>::Unbox(right);
+ auto right_size = static_cast<OffsetType>(right_data.size());
+
+ // allocate data buffer conservatively
+ int64_t data_buff_alloc = std::max(right_size, left_size) * cond.length;
+ BuilderType builder(ctx->memory_pool());
+ ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
+ ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
+
+ RunLoop(
+ cond, *out, [&](int64_t i) { builder.UnsafeAppend(left_data.data(), left_size); },
+ [&](int64_t i) { builder.UnsafeAppend(right_data.data(), right_size); },
+ [&]() { builder.UnsafeAppendNull(); });
+ ARROW_ASSIGN_OR_RAISE(auto out_arr, builder.Finish());
+
+ out->SetNullCount(out_arr->data()->null_count);
+ out->buffers[0] = std::move(out_arr->data()->buffers[0]);
+ out->buffers[1] = std::move(out_arr->data()->buffers[1]);
+ out->buffers[2] = std::move(out_arr->data()->buffers[2]);
+ return Status::OK();
+ }
+
+ template <typename HandleLeft, typename HandleRight, typename HandleNull>
+ static void RunLoop(const ArrayData& cond, const ArrayData& output,
+ HandleLeft&& handle_left, HandleRight&& handle_right,
+ HandleNull&& handle_null) {
+ const auto* cond_data = cond.buffers[1]->data();
+
+ if (output.buffers[0]) { // output may have nulls
+      // The output validity buffer is allocated internally by the IfElseFunctor,
+      // so it is cond.length long with zero offset.
+ const auto* out_valid = output.buffers[0]->data();
+
+ for (int64_t i = 0; i < cond.length; i++) {
+ if (BitUtil::GetBit(out_valid, i)) {
+ BitUtil::GetBit(cond_data, cond.offset + i) ? handle_left(i) : handle_right(i);
+ } else {
+ handle_null();
+ }
+ }
+ } else { // output is all valid (no nulls)
+ for (int64_t i = 0; i < cond.length; i++) {
+ BitUtil::GetBit(cond_data, cond.offset + i) ? handle_left(i) : handle_right(i);
+ }
+ }
+ }
+};
+
+template <typename Type>
+struct IfElseFunctor<Type, enable_if_fixed_size_binary<Type>> {
+ // A - Array, S - Scalar, X = Array/Scalar
+
+ // SXX
+ static Status Call(KernelContext* ctx, const BooleanScalar& cond, const Datum& left,
+ const Datum& right, Datum* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type(), *right.type()));
+ return RunIfElseScalar(
+ cond, left, right, out,
+ /*CopyArrayData*/
+ [&](const ArrayData& valid_array, ArrayData* out_array) {
+ std::memcpy(
+ out_array->buffers[1]->mutable_data() + out_array->offset * byte_width,
+ valid_array.buffers[1]->data() + valid_array.offset * byte_width,
+ valid_array.length * byte_width);
+ },
+ /*BroadcastScalar*/
+ [&](const Scalar& scalar, ArrayData* out_array) {
+ const util::string_view& scalar_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(scalar);
+ uint8_t* start =
+ out_array->buffers[1]->mutable_data() + out_array->offset * byte_width;
+ for (int64_t i = 0; i < out_array->length; i++) {
+ std::memcpy(start + i * byte_width, scalar_data.data(), scalar_data.size());
+ }
+ });
+ }
+
+ // AAA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const ArrayData& right, ArrayData* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
+ auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
+
+ // copy right data to out_buff
+ const uint8_t* right_data = right.buffers[1]->data() + right.offset * byte_width;
+ std::memcpy(out_values, right_data, right.length * byte_width);
+
+ // selectively copy values from left data
+ const uint8_t* left_data = left.buffers[1]->data() + left.offset * byte_width;
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ std::memcpy(out_values + data_offset * byte_width,
+ left_data + data_offset * byte_width, num_elems * byte_width);
+ });
+
+ return Status::OK();
+ }
+
+ // ASA
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const ArrayData& right, ArrayData* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
+ auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
+
+ // copy right data to out_buff
+ const uint8_t* right_data = right.buffers[1]->data() + right.offset * byte_width;
+ std::memcpy(out_values, right_data, right.length * byte_width);
+
+ // selectively copy values from left data
+ const util::string_view& left_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(left);
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ if (left_data.data()) {
+ for (int64_t i = 0; i < num_elems; i++) {
+ std::memcpy(out_values + (data_offset + i) * byte_width, left_data.data(),
+ left_data.size());
+ }
+ }
+ });
+
+ return Status::OK();
+ }
+
+ // AAS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const ArrayData& left,
+ const Scalar& right, ArrayData* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
+ auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
+
+ // copy left data to out_buff
+ const uint8_t* left_data = left.buffers[1]->data() + left.offset * byte_width;
+ std::memcpy(out_values, left_data, left.length * byte_width);
+
+ const util::string_view& right_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(right);
+
+ RunIfElseLoopInverted(cond, [&](int64_t data_offset, int64_t num_elems) {
+ if (right_data.data()) {
+ for (int64_t i = 0; i < num_elems; i++) {
+ std::memcpy(out_values + (data_offset + i) * byte_width, right_data.data(),
+ right_data.size());
+ }
+ }
+ });
+
+ return Status::OK();
+ }
+
+ // ASS
+ static Status Call(KernelContext* ctx, const ArrayData& cond, const Scalar& left,
+ const Scalar& right, ArrayData* out) {
+ ARROW_ASSIGN_OR_RAISE(auto byte_width, GetByteWidth(*left.type, *right.type));
+ auto* out_values = out->buffers[1]->mutable_data() + out->offset * byte_width;
+
+ // copy right data to out_buff
+ const util::string_view& right_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(right);
+ if (right_data.data()) {
+ for (int64_t i = 0; i < cond.length; i++) {
+ std::memcpy(out_values + i * byte_width, right_data.data(), right_data.size());
+ }
+ }
+
+ // selectively copy values from left data
+ const util::string_view& left_data =
+ internal::UnboxScalar<FixedSizeBinaryType>::Unbox(left);
+
+ RunIfElseLoop(cond, [&](int64_t data_offset, int64_t num_elems) {
+ if (left_data.data()) {
+ for (int64_t i = 0; i < num_elems; i++) {
+ std::memcpy(out_values + (data_offset + i) * byte_width, left_data.data(),
+ left_data.size());
+ }
+ }
+ });
+
+ return Status::OK();
+ }
+
+ static Result<int32_t> GetByteWidth(const DataType& left_type,
+ const DataType& right_type) {
+ int width = checked_cast<const FixedSizeBinaryType&>(left_type).byte_width();
+ if (width == checked_cast<const FixedSizeBinaryType&>(right_type).byte_width()) {
+ return width;
+ } else {
+ return Status::Invalid("FixedSizeBinaryType byte_widths should be equal");
+ }
+ }
+};
+
+template <typename Type, typename AllocateMem>
+struct ResolveIfElseExec {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // cond is scalar
+ if (batch[0].is_scalar()) {
+ const auto& cond = batch[0].scalar_as<BooleanScalar>();
+ return IfElseFunctor<Type>::Call(ctx, cond, batch[1], batch[2], out);
+ }
+
+    // cond is an array; promote nulls, then dispatch on the shapes of left and right
+ ARROW_RETURN_NOT_OK(PromoteNullsVisitor<AllocateMem>(ctx, batch[0], batch[1],
+ batch[2], out->mutable_array()));
+
+ if (batch[1].kind() == Datum::ARRAY) {
+ if (batch[2].kind() == Datum::ARRAY) { // AAA
+ return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].array(),
+ *batch[2].array(), out->mutable_array());
+ } else { // AAS
+ return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].array(),
+ *batch[2].scalar(), out->mutable_array());
+ }
+ } else {
+ if (batch[2].kind() == Datum::ARRAY) { // ASA
+ return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].scalar(),
+ *batch[2].array(), out->mutable_array());
+ } else { // ASS
+ return IfElseFunctor<Type>::Call(ctx, *batch[0].array(), *batch[1].scalar(),
+ *batch[2].scalar(), out->mutable_array());
+ }
+ }
+ }
+};
+
+template <typename AllocateMem>
+struct ResolveIfElseExec<NullType, AllocateMem> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // if all are scalars, return a null scalar
+ if (batch[0].is_scalar() && batch[1].is_scalar() && batch[2].is_scalar()) {
+ *out = MakeNullScalar(null());
+ } else {
+ ARROW_ASSIGN_OR_RAISE(*out,
+ MakeArrayOfNull(null(), batch.length, ctx->memory_pool()));
+ }
+ return Status::OK();
+ }
+};
+
+struct IfElseFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ // if 0th descriptor is null, replace with bool
+ if (values->at(0).type->id() == Type::NA) {
+ values->at(0).type = boolean();
+ }
+
+    // the 0th descriptor of if_else is always bool, so skip it
+ std::vector<ValueDescr> values_copy(values->begin() + 1, values->end());
+ internal::EnsureDictionaryDecoded(&values_copy);
+ internal::ReplaceNullWithOtherType(&values_copy);
+
+ if (auto type = internal::CommonNumeric(values_copy)) {
+ internal::ReplaceTypes(type, &values_copy);
+ }
+
+ std::move(values_copy.begin(), values_copy.end(), values->begin() + 1);
+
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
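+
+// For instance (illustrative only): dispatching if_else on (boolean, int32,
+// float64) promotes the value arguments via CommonNumeric and selects the
+// float64 kernel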
+
+void AddNullIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_function) {
+ ScalarKernel kernel({boolean(), null(), null()}, null(),
+ ResolveIfElseExec<NullType,
+ /*AllocateMem=*/std::true_type>::Exec);
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ kernel.can_write_into_slices = false;
+
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+}
+
+void AddPrimitiveIfElseKernels(const std::shared_ptr<ScalarFunction>& scalar_function,
+ const std::vector<std::shared_ptr<DataType>>& types) {
+ for (auto&& type : types) {
+ auto exec =
+ internal::GenerateTypeAgnosticPrimitive<ResolveIfElseExec,
+ /*AllocateMem=*/std::false_type>(*type);
+ // cond array needs to be boolean always
+ ScalarKernel kernel({boolean(), type, type}, type, exec);
+ kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::PREALLOCATE;
+ kernel.can_write_into_slices = true;
+
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+ }
+}
+
+void AddBinaryIfElseKernels(const std::shared_ptr<IfElseFunction>& scalar_function,
+ const std::vector<std::shared_ptr<DataType>>& types) {
+ for (auto&& type : types) {
+ auto exec =
+ internal::GenerateTypeAgnosticVarBinaryBase<ResolveIfElseExec,
+ /*AllocateMem=*/std::true_type>(
+ *type);
+ // cond array needs to be boolean always
+ ScalarKernel kernel({boolean(), type, type}, type, exec);
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ kernel.can_write_into_slices = false;
+
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+ }
+}
+
+void AddFSBinaryIfElseKernel(const std::shared_ptr<IfElseFunction>& scalar_function) {
+ // cond array needs to be boolean always
+ ScalarKernel kernel(
+ {boolean(), InputType(Type::FIXED_SIZE_BINARY), InputType(Type::FIXED_SIZE_BINARY)},
+ OutputType([](KernelContext*, const std::vector<ValueDescr>& descrs) {
+ return ValueDescr(descrs[1].type, ValueDescr::ANY);
+ }),
+ ResolveIfElseExec<FixedSizeBinaryType, /*AllocateMem=*/std::false_type>::Exec);
+ kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::PREALLOCATE;
+ kernel.can_write_into_slices = true;
+
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+}
+
+// Helper to copy or broadcast fixed-width values between buffers.
+template <typename Type, typename Enable = void>
+struct CopyFixedWidth {};
+template <>
+struct CopyFixedWidth<BooleanType> {
+ static void CopyScalar(const Scalar& scalar, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const bool value = UnboxScalar<BooleanType>::Unbox(scalar);
+ BitUtil::SetBitsTo(raw_out_values, out_offset, length, value);
+ }
+ static void CopyArray(const DataType&, const uint8_t* in_values,
+ const int64_t in_offset, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ arrow::internal::CopyBitmap(in_values, in_offset, length, raw_out_values, out_offset);
+ }
+};
+template <typename Type>
+struct CopyFixedWidth<Type, enable_if_number<Type>> {
+ using CType = typename TypeTraits<Type>::CType;
+ static void CopyScalar(const Scalar& scalar, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ CType* out_values = reinterpret_cast<CType*>(raw_out_values);
+ const CType value = UnboxScalar<Type>::Unbox(scalar);
+ std::fill(out_values + out_offset, out_values + out_offset + length, value);
+ }
+ static void CopyArray(const DataType&, const uint8_t* in_values,
+ const int64_t in_offset, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ std::memcpy(raw_out_values + out_offset * sizeof(CType),
+ in_values + in_offset * sizeof(CType), length * sizeof(CType));
+ }
+};
+template <typename Type>
+struct CopyFixedWidth<Type, enable_if_same<Type, FixedSizeBinaryType>> {
+ static void CopyScalar(const Scalar& values, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const int32_t width =
+ checked_cast<const FixedSizeBinaryType&>(*values.type).byte_width();
+ uint8_t* next = raw_out_values + (width * out_offset);
+ const auto& scalar = checked_cast<const FixedSizeBinaryScalar&>(values);
+ // Scalar may have null value buffer
+ if (!scalar.value) {
+ std::memset(next, 0x00, width * length);
+ } else {
+ DCHECK_EQ(scalar.value->size(), width);
+ for (int i = 0; i < length; i++) {
+ std::memcpy(next, scalar.value->data(), width);
+ next += width;
+ }
+ }
+ }
+ static void CopyArray(const DataType& type, const uint8_t* in_values,
+ const int64_t in_offset, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(type).byte_width();
+ uint8_t* next = raw_out_values + (width * out_offset);
+ std::memcpy(next, in_values + in_offset * width, length * width);
+ }
+};
+template <typename Type>
+struct CopyFixedWidth<Type, enable_if_decimal<Type>> {
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ static void CopyScalar(const Scalar& values, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const int32_t width =
+ checked_cast<const FixedSizeBinaryType&>(*values.type).byte_width();
+ uint8_t* next = raw_out_values + (width * out_offset);
+ const auto& scalar = checked_cast<const ScalarType&>(values);
+ const auto value = scalar.value.ToBytes();
+ for (int i = 0; i < length; i++) {
+ std::memcpy(next, value.data(), width);
+ next += width;
+ }
+ }
+ static void CopyArray(const DataType& type, const uint8_t* in_values,
+ const int64_t in_offset, const int64_t length,
+ uint8_t* raw_out_values, const int64_t out_offset) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(type).byte_width();
+ uint8_t* next = raw_out_values + (width * out_offset);
+ std::memcpy(next, in_values + in_offset * width, length * width);
+ }
+};
+// Copy fixed-width values from a scalar/array datum into an output values buffer
+template <typename Type>
+void CopyValues(const Datum& in_values, const int64_t in_offset, const int64_t length,
+ uint8_t* out_valid, uint8_t* out_values, const int64_t out_offset) {
+ if (in_values.is_scalar()) {
+ const auto& scalar = *in_values.scalar();
+ if (out_valid) {
+ BitUtil::SetBitsTo(out_valid, out_offset, length, scalar.is_valid);
+ }
+ CopyFixedWidth<Type>::CopyScalar(scalar, length, out_values, out_offset);
+ } else {
+ const ArrayData& array = *in_values.array();
+ if (out_valid) {
+ if (array.MayHaveNulls()) {
+ if (length == 1) {
+ // CopyBitmap is slow for short runs
+ BitUtil::SetBitTo(
+ out_valid, out_offset,
+ BitUtil::GetBit(array.buffers[0]->data(), array.offset + in_offset));
+ } else {
+ arrow::internal::CopyBitmap(array.buffers[0]->data(), array.offset + in_offset,
+ length, out_valid, out_offset);
+ }
+ } else {
+ BitUtil::SetBitsTo(out_valid, out_offset, length, true);
+ }
+ }
+ CopyFixedWidth<Type>::CopyArray(*array.type, array.buffers[1]->data(),
+ array.offset + in_offset, length, out_values,
+ out_offset);
+ }
+}
+
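+// Illustrative sketch (not part of the upstream sources): broadcasting a valid
+// int32 scalar into an 8-slot preallocated output with CopyValues. The buffer
+// names are hypothetical; in the kernels below they point into the output
+// ArrayData's validity and values buffers.
+//
+//   Datum value(std::make_shared<Int32Scalar>(42));
+//   // out_valid/out_values are mutable pointers into preallocated buffers
+//   CopyValues<Int32Type>(value, /*in_offset=*/0, /*length=*/8,
+//                         out_valid, out_values, /*out_offset=*/0);
+//   // => validity bits [0, 8) set to 1, values [0, 8) filled with 42
+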
+// Specialized helper to copy a single value from a source array. Allows avoiding
+// repeatedly calling MayHaveNulls and Buffer::data() which have internal checks that
+// add up when called in a loop.
+template <typename Type>
+void CopyOneArrayValue(const DataType& type, const uint8_t* in_valid,
+ const uint8_t* in_values, const int64_t in_offset,
+ uint8_t* out_valid, uint8_t* out_values,
+ const int64_t out_offset) {
+ if (out_valid) {
+ BitUtil::SetBitTo(out_valid, out_offset,
+ !in_valid || BitUtil::GetBit(in_valid, in_offset));
+ }
+ CopyFixedWidth<Type>::CopyArray(type, in_values, in_offset, /*length=*/1, out_values,
+ out_offset);
+}
+
+struct CaseWhenFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+    // The first argument is a struct of booleans, where the number of fields in the
+ // struct is either equal to the number of other arguments or is one less.
+ RETURN_NOT_OK(CheckArity(*values));
+ EnsureDictionaryDecoded(values);
+ auto first_type = (*values)[0].type;
+ if (first_type->id() != Type::STRUCT) {
+ return Status::TypeError("case_when: first argument must be STRUCT, not ",
+ *first_type);
+ }
+ auto num_fields = static_cast<size_t>(first_type->num_fields());
+ if (num_fields < values->size() - 2 || num_fields >= values->size()) {
+ return Status::Invalid(
+ "case_when: number of struct fields must be equal to or one less than count of "
+ "remaining arguments (",
+ values->size() - 1, "), got: ", first_type->num_fields());
+ }
+ for (const auto& field : first_type->fields()) {
+ if (field->type()->id() != Type::BOOL) {
+ return Status::TypeError(
+ "case_when: all fields of first argument must be BOOL, but ", field->name(),
+ " was of type: ", *field->type());
+ }
+ }
+
+ if (auto type = CommonNumeric(values->data() + 1, values->size() - 1)) {
+ for (auto it = values->begin() + 1; it != values->end(); it++) {
+ it->type = type;
+ }
+ }
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
+
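+// Illustrative sketch (not part of the upstream sources): what DispatchBest
+// above accepts. `conds` is a struct-of-boolean datum; all names here are
+// hypothetical.
+//
+//   // conds: struct<a: bool, b: bool>, two cases plus one "else" value
+//   ARROW_ASSIGN_OR_RAISE(
+//       Datum result, CallFunction("case_when", {conds, case_a, case_b, otherwise}));
+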
+// Implement a 'case when' (SQL)/'select' (NumPy) function for any scalar conditions
+template <typename Type>
+Status ExecScalarCaseWhen(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& conds = checked_cast<const StructScalar&>(*batch.values[0].scalar());
+ if (!conds.is_valid) {
+ return Status::Invalid("cond struct must not be null");
+ }
+ Datum result;
+ for (size_t i = 0; i < batch.values.size() - 1; i++) {
+ if (i < conds.value.size()) {
+ const Scalar& cond = *conds.value[i];
+ if (cond.is_valid && internal::UnboxScalar<BooleanType>::Unbox(cond)) {
+ result = batch[i + 1];
+ break;
+ }
+ } else {
+ // ELSE clause
+ result = batch[i + 1];
+ break;
+ }
+ }
+ if (out->is_scalar()) {
+ *out = result.is_scalar() ? result.scalar() : MakeNullScalar(out->type());
+ return Status::OK();
+ }
+ ArrayData* output = out->mutable_array();
+ if (!result.is_value()) {
+ // All conditions false, no 'else' argument
+ result = MakeNullScalar(out->type());
+ }
+ CopyValues<Type>(result, /*in_offset=*/0, batch.length,
+ output->GetMutableValues<uint8_t>(0, 0),
+ output->GetMutableValues<uint8_t>(1, 0), output->offset);
+ return Status::OK();
+}
+
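+// Illustrative trace (not part of the upstream sources): with scalar conds
+// {false, true} and batch values {cond, x, y, z}, the loop above selects y
+// (the second case); with conds {false, false} it falls through to z, the
+// "else" argument; and with no "else" argument at all it yields a null scalar.
+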
+// Implement 'case when' for any mix of scalar/array arguments for any fixed-width type,
+// given helper functions to copy data from a source array to a target array
+template <typename Type>
+Status ExecArrayCaseWhen(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& conds_array = *batch.values[0].array();
+ if (conds_array.GetNullCount() > 0) {
+ return Status::Invalid("cond struct must not have top-level nulls");
+ }
+ ArrayData* output = out->mutable_array();
+ const int64_t out_offset = output->offset;
+ const auto num_value_args = batch.values.size() - 1;
+ const bool have_else_arg =
+ static_cast<size_t>(conds_array.type->num_fields()) < num_value_args;
+ uint8_t* out_valid = output->buffers[0]->mutable_data();
+ uint8_t* out_values = output->buffers[1]->mutable_data();
+ if (have_else_arg) {
+ // Copy 'else' value into output
+ CopyValues<Type>(batch.values.back(), /*in_offset=*/0, batch.length, out_valid,
+ out_values, out_offset);
+ } else {
+ // There's no 'else' argument, so we should have an all-null validity bitmap
+ BitUtil::SetBitsTo(out_valid, out_offset, batch.length, false);
+ }
+
+ // Allocate a temporary bitmap to determine which elements still need setting.
+ ARROW_ASSIGN_OR_RAISE(auto mask_buffer, ctx->AllocateBitmap(batch.length));
+ uint8_t* mask = mask_buffer->mutable_data();
+ std::memset(mask, 0xFF, mask_buffer->size());
+
+ // Then iterate through each argument in turn and set elements.
+ for (size_t i = 0; i < batch.values.size() - (have_else_arg ? 2 : 1); i++) {
+ const ArrayData& cond_array = *conds_array.child_data[i];
+ const int64_t cond_offset = conds_array.offset + cond_array.offset;
+ const uint8_t* cond_values = cond_array.buffers[1]->data();
+ const Datum& values_datum = batch[i + 1];
+ int64_t offset = 0;
+
+ if (cond_array.GetNullCount() == 0) {
+      // No validity buffer: visit the mask & cond bitmaps simultaneously
+ BinaryBitBlockCounter counter(mask, /*start_offset=*/0, cond_values, cond_offset,
+ batch.length);
+ while (offset < batch.length) {
+ const auto block = counter.NextAndWord();
+ if (block.AllSet()) {
+ CopyValues<Type>(values_datum, offset, block.length, out_valid, out_values,
+ out_offset + offset);
+ BitUtil::SetBitsTo(mask, offset, block.length, false);
+ } else if (block.popcount) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (BitUtil::GetBit(mask, offset + j) &&
+ BitUtil::GetBit(cond_values, cond_offset + offset + j)) {
+ CopyValues<Type>(values_datum, offset + j, /*length=*/1, out_valid,
+ out_values, out_offset + offset + j);
+ BitUtil::SetBitTo(mask, offset + j, false);
+ }
+ }
+ }
+ offset += block.length;
+ }
+ } else {
+ // Visit mask & cond bitmap & cond validity
+ const uint8_t* cond_valid = cond_array.buffers[0]->data();
+ Bitmap bitmaps[3] = {{mask, /*offset=*/0, batch.length},
+ {cond_values, cond_offset, batch.length},
+ {cond_valid, cond_offset, batch.length}};
+ Bitmap::VisitWords(bitmaps, [&](std::array<uint64_t, 3> words) {
+ const uint64_t word = words[0] & words[1] & words[2];
+ const int64_t block_length = std::min<int64_t>(64, batch.length - offset);
+ if (word == std::numeric_limits<uint64_t>::max()) {
+ CopyValues<Type>(values_datum, offset, block_length, out_valid, out_values,
+ out_offset + offset);
+ BitUtil::SetBitsTo(mask, offset, block_length, false);
+ } else if (word) {
+ for (int64_t j = 0; j < block_length; ++j) {
+ if (BitUtil::GetBit(mask, offset + j) &&
+ BitUtil::GetBit(cond_valid, cond_offset + offset + j) &&
+ BitUtil::GetBit(cond_values, cond_offset + offset + j)) {
+ CopyValues<Type>(values_datum, offset + j, /*length=*/1, out_valid,
+ out_values, out_offset + offset + j);
+ BitUtil::SetBitTo(mask, offset + j, false);
+ }
+ }
+ }
+ });
+ }
+ }
+ if (!have_else_arg) {
+ // Need to initialize any remaining null slots (uninitialized memory)
+ BitBlockCounter counter(mask, /*offset=*/0, batch.length);
+ int64_t offset = 0;
+ auto bit_width = checked_cast<const FixedWidthType&>(*out->type()).bit_width();
+ auto byte_width = BitUtil::BytesForBits(bit_width);
+ while (offset < batch.length) {
+ const auto block = counter.NextWord();
+ if (block.AllSet()) {
+ if (bit_width == 1) {
+ BitUtil::SetBitsTo(out_values, out_offset + offset, block.length, false);
+ } else {
+ std::memset(out_values + (out_offset + offset) * byte_width, 0x00,
+ byte_width * block.length);
+ }
+ } else if (!block.NoneSet()) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (BitUtil::GetBit(out_valid, out_offset + offset + j)) continue;
+ if (bit_width == 1) {
+ BitUtil::ClearBit(out_values, out_offset + offset + j);
+ } else {
+ std::memset(out_values + (out_offset + offset + j) * byte_width, 0x00,
+ byte_width);
+ }
+ }
+ }
+ offset += block.length;
+ }
+ }
+ return Status::OK();
+}
+
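+// Illustrative trace (not part of the upstream sources): for batch.length = 4,
+// conds struct<a, b> with a = [1,0,0,1], b = [1,1,0,0] and cases {x, y}:
+//   the mask starts as 1111; the pass over `a` copies x into slots 0 and 3
+//   (mask becomes 0110), the pass over `b` copies y into slot 1 (mask 0010);
+//   slot 2 is never set, stays null, and has its value bytes zeroed by the
+//   cleanup loop above.
+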
+template <typename Type, typename Enable = void>
+struct CaseWhenFunctor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch.values[0].is_array()) {
+ return ExecArrayCaseWhen<Type>(ctx, batch, out);
+ }
+ return ExecScalarCaseWhen<Type>(ctx, batch, out);
+ }
+};
+
+template <>
+struct CaseWhenFunctor<NullType> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return Status::OK();
+ }
+};
+
+struct CoalesceFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ RETURN_NOT_OK(CheckArity(*values));
+ using arrow::compute::detail::DispatchExactImpl;
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ EnsureDictionaryDecoded(values);
+ if (auto type = CommonNumeric(*values)) {
+ ReplaceTypes(type, values);
+ }
+ if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+ return arrow::compute::detail::NoMatchingKernel(this, *values);
+ }
+};
+
+// Implement a 'coalesce' (SQL) operator for any number of scalar inputs
+Status ExecScalarCoalesce(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ for (const auto& datum : batch.values) {
+ if (datum.scalar()->is_valid) {
+ *out = datum;
+ break;
+ }
+ }
+ return Status::OK();
+}
+
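+// Illustrative sketch (not part of the upstream sources): with all-scalar
+// inputs, coalesce simply picks the first valid one; datum names are
+// hypothetical.
+//
+//   // null_i32 is a null int32 scalar; five and three are valid scalars
+//   ARROW_ASSIGN_OR_RAISE(Datum d, CallFunction("coalesce", {null_i32, five, three}));
+//   // d is the scalar 5; if every input is null, the loop above leaves the
+//   // preallocated null output untouched
+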
+// Helper: copy from a source datum into all null slots of the output
+template <typename Type>
+void CopyValuesAllValid(Datum source, uint8_t* out_valid, uint8_t* out_values,
+ const int64_t out_offset, const int64_t length) {
+ BitBlockCounter counter(out_valid, out_offset, length);
+ int64_t offset = 0;
+ while (offset < length) {
+ const auto block = counter.NextWord();
+ if (block.NoneSet()) {
+ CopyValues<Type>(source, offset, block.length, out_valid, out_values,
+ out_offset + offset);
+ } else if (!block.AllSet()) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (!BitUtil::GetBit(out_valid, out_offset + offset + j)) {
+ CopyValues<Type>(source, offset + j, 1, out_valid, out_values,
+ out_offset + offset + j);
+ }
+ }
+ }
+ offset += block.length;
+ }
+}
+
+// Helper: zero the values buffer of the output wherever the slot is null
+void InitializeNullSlots(const DataType& type, uint8_t* out_valid, uint8_t* out_values,
+ const int64_t out_offset, const int64_t length) {
+ BitBlockCounter counter(out_valid, out_offset, length);
+ int64_t offset = 0;
+ auto bit_width = checked_cast<const FixedWidthType&>(type).bit_width();
+ auto byte_width = BitUtil::BytesForBits(bit_width);
+ while (offset < length) {
+ const auto block = counter.NextWord();
+ if (block.NoneSet()) {
+ if (bit_width == 1) {
+ BitUtil::SetBitsTo(out_values, out_offset + offset, block.length, false);
+ } else {
+ std::memset(out_values + (out_offset + offset) * byte_width, 0x00,
+ byte_width * block.length);
+ }
+ } else if (!block.AllSet()) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (BitUtil::GetBit(out_valid, out_offset + offset + j)) continue;
+ if (bit_width == 1) {
+ BitUtil::ClearBit(out_values, out_offset + offset + j);
+ } else {
+ std::memset(out_values + (out_offset + offset + j) * byte_width, 0x00,
+ byte_width);
+ }
+ }
+ }
+ offset += block.length;
+ }
+}
+
+// Implement 'coalesce' for any mix of scalar/array arguments for any fixed-width type
+template <typename Type>
+Status ExecArrayCoalesce(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ArrayData* output = out->mutable_array();
+ const int64_t out_offset = output->offset;
+ // Use output validity buffer as mask to decide what values to copy
+ uint8_t* out_valid = output->buffers[0]->mutable_data();
+ // Clear output buffer - no values are set initially
+ BitUtil::SetBitsTo(out_valid, out_offset, batch.length, false);
+ uint8_t* out_values = output->buffers[1]->mutable_data();
+
+ for (const auto& datum : batch.values) {
+ if ((datum.is_scalar() && datum.scalar()->is_valid) ||
+ (datum.is_array() && !datum.array()->MayHaveNulls())) {
+ // Valid scalar, or all-valid array
+ CopyValuesAllValid<Type>(datum, out_valid, out_values, out_offset, batch.length);
+ break;
+ } else if (datum.is_array()) {
+ // Array with nulls
+ const ArrayData& arr = *datum.array();
+ const DataType& type = *datum.type();
+ const uint8_t* in_valid = arr.buffers[0]->data();
+ const uint8_t* in_values = arr.buffers[1]->data();
+ BinaryBitBlockCounter counter(in_valid, arr.offset, out_valid, out_offset,
+ batch.length);
+ int64_t offset = 0;
+ while (offset < batch.length) {
+ const auto block = counter.NextAndNotWord();
+ if (block.AllSet()) {
+ CopyValues<Type>(datum, offset, block.length, out_valid, out_values,
+ out_offset + offset);
+ } else if (block.popcount) {
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (!BitUtil::GetBit(out_valid, out_offset + offset + j) &&
+ BitUtil::GetBit(in_valid, arr.offset + offset + j)) {
+ // This version lets us avoid calling MayHaveNulls() on every iteration
+ // (which does an atomic load and can add up)
+ CopyOneArrayValue<Type>(type, in_valid, in_values, arr.offset + offset + j,
+ out_valid, out_values, out_offset + offset + j);
+ }
+ }
+ }
+ offset += block.length;
+ }
+ }
+ }
+
+ // Initialize any remaining null slots (uninitialized memory)
+ InitializeNullSlots(*out->type(), out_valid, out_values, out_offset, batch.length);
+ return Status::OK();
+}
+
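+// Illustrative sketch (not part of the upstream sources): element-wise
+// coalesce over arrays; the values shown are hypothetical.
+//
+//   // a = [null, 2, null], b = [1, null, null], c = [7, 7, 7]
+//   ARROW_ASSIGN_OR_RAISE(Datum d, CallFunction("coalesce", {a, b, c}));
+//   // d = [1, 2, 7]
+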
+template <typename Type, typename Enable = void>
+struct CoalesceFunctor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ for (const auto& datum : batch.values) {
+ if (datum.is_array()) {
+ return ExecArrayCoalesce<Type>(ctx, batch, out);
+ }
+ }
+ return ExecScalarCoalesce(ctx, batch, out);
+ }
+};
+
+template <>
+struct CoalesceFunctor<NullType> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return Status::OK();
+ }
+};
+
+template <typename Type>
+struct CoalesceFunctor<Type, enable_if_base_binary<Type>> {
+ using offset_type = typename Type::offset_type;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ for (const auto& datum : batch.values) {
+ if (datum.is_array()) {
+ return ExecArray(ctx, batch, out);
+ }
+ }
+ return ExecScalarCoalesce(ctx, batch, out);
+ }
+
+ static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // Special case: grab any leading non-null scalar or array arguments
+ for (const auto& datum : batch.values) {
+ if (datum.is_scalar()) {
+ if (!datum.scalar()->is_valid) continue;
+ ARROW_ASSIGN_OR_RAISE(
+ *out, MakeArrayFromScalar(*datum.scalar(), batch.length, ctx->memory_pool()));
+ return Status::OK();
+ } else if (datum.is_array() && !datum.array()->MayHaveNulls()) {
+ *out = datum;
+ return Status::OK();
+ }
+ break;
+ }
+ ArrayData* output = out->mutable_array();
+ BuilderType builder(batch[0].type(), ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ for (int64_t i = 0; i < batch.length; i++) {
+ bool set = false;
+ for (const auto& datum : batch.values) {
+ if (datum.is_scalar()) {
+ if (datum.scalar()->is_valid) {
+ RETURN_NOT_OK(builder.Append(UnboxScalar<Type>::Unbox(*datum.scalar())));
+ set = true;
+ break;
+ }
+ } else {
+ const ArrayData& source = *datum.array();
+ if (!source.MayHaveNulls() ||
+ BitUtil::GetBit(source.buffers[0]->data(), source.offset + i)) {
+ const uint8_t* data = source.buffers[2]->data();
+ const offset_type* offsets = source.GetValues<offset_type>(1);
+ const offset_type offset0 = offsets[i];
+ const offset_type offset1 = offsets[i + 1];
+ RETURN_NOT_OK(builder.Append(data + offset0, offset1 - offset0));
+ set = true;
+ break;
+ }
+ }
+ }
+ if (!set) RETURN_NOT_OK(builder.AppendNull());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto temp_output, builder.Finish());
+ *output = *temp_output->data();
+ // Builder type != logical type due to GenerateTypeAgnosticVarBinaryBase
+ output->type = batch[0].type();
+ return Status::OK();
+ }
+};
+
+Result<ValueDescr> LastType(KernelContext*, const std::vector<ValueDescr>& descrs) {
+ ValueDescr result = descrs.back();
+ result.shape = GetBroadcastShape(descrs);
+ return result;
+}
+
+void AddCaseWhenKernel(const std::shared_ptr<CaseWhenFunction>& scalar_function,
+ detail::GetTypeId get_id, ArrayKernelExec exec) {
+ ScalarKernel kernel(
+ KernelSignature::Make({InputType(Type::STRUCT), InputType(get_id.id)},
+ OutputType(LastType),
+ /*is_varargs=*/true),
+ exec);
+ kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::PREALLOCATE;
+ kernel.can_write_into_slices = is_fixed_width(get_id.id);
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+}
+
+void AddPrimitiveCaseWhenKernels(const std::shared_ptr<CaseWhenFunction>& scalar_function,
+ const std::vector<std::shared_ptr<DataType>>& types) {
+ for (auto&& type : types) {
+ auto exec = GenerateTypeAgnosticPrimitive<CaseWhenFunctor>(*type);
+ AddCaseWhenKernel(scalar_function, type, std::move(exec));
+ }
+}
+
+void AddCoalesceKernel(const std::shared_ptr<ScalarFunction>& scalar_function,
+ detail::GetTypeId get_id, ArrayKernelExec exec) {
+ ScalarKernel kernel(KernelSignature::Make({InputType(get_id.id)}, OutputType(FirstType),
+ /*is_varargs=*/true),
+ exec);
+ kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::PREALLOCATE;
+ kernel.can_write_into_slices = is_fixed_width(get_id.id);
+ DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+}
+
+void AddPrimitiveCoalesceKernels(const std::shared_ptr<ScalarFunction>& scalar_function,
+ const std::vector<std::shared_ptr<DataType>>& types) {
+ for (auto&& type : types) {
+ auto exec = GenerateTypeAgnosticPrimitive<CoalesceFunctor>(*type);
+ AddCoalesceKernel(scalar_function, type, std::move(exec));
+ }
+}
+
+const FunctionDoc if_else_doc{"Choose values based on a condition",
+ ("`cond` must be a Boolean scalar/ array. \n`left` or "
+ "`right` must be of the same type scalar/ array.\n"
+ "`null` values in `cond` will be promoted to the"
+ " output."),
+ {"cond", "left", "right"}};
+
+const FunctionDoc case_when_doc{
+ "Choose values based on multiple conditions",
+ ("`cond` must be a struct of Boolean values. `cases` can be a mix "
+ "of scalar and array arguments (of any type, but all must be the "
+ "same type or castable to a common type), with either exactly one "
+ "datum per child of `cond`, or one more `cases` than children of "
+ "`cond` (in which case we have an \"else\" value).\n"
+ "Each row of the output will be the corresponding value of the "
+ "first datum in `cases` for which the corresponding child of `cond` "
+ "is true, or otherwise the \"else\" value (if given), or null. "
+ "Essentially, this implements a switch-case or if-else, if-else... "
+ "statement."),
+ {"cond", "*cases"}};
+
+const FunctionDoc coalesce_doc{
+ "Select the first non-null value in each slot",
+ ("Each row of the output will be the value from the first corresponding input "
+ "for which the value is not null. If all inputs are null in a row, the output "
+ "will be null."),
+ {"*values"}};
+} // namespace
+
+void RegisterScalarIfElse(FunctionRegistry* registry) {
+ {
+ auto func =
+ std::make_shared<IfElseFunction>("if_else", Arity::Ternary(), &if_else_doc);
+
+ AddPrimitiveIfElseKernels(func, NumericTypes());
+ AddPrimitiveIfElseKernels(func, TemporalTypes());
+ AddPrimitiveIfElseKernels(func, {boolean(), day_time_interval(), month_interval()});
+ AddNullIfElseKernel(func);
+ AddBinaryIfElseKernels(func, BaseBinaryTypes());
+ AddFSBinaryIfElseKernel(func);
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<CaseWhenFunction>(
+ "case_when", Arity::VarArgs(/*min_args=*/1), &case_when_doc);
+ AddPrimitiveCaseWhenKernels(func, NumericTypes());
+ AddPrimitiveCaseWhenKernels(func, TemporalTypes());
+ AddPrimitiveCaseWhenKernels(
+ func, {boolean(), null(), day_time_interval(), month_interval()});
+ AddCaseWhenKernel(func, Type::FIXED_SIZE_BINARY,
+ CaseWhenFunctor<FixedSizeBinaryType>::Exec);
+ AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor<Decimal128Type>::Exec);
+ AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor<Decimal256Type>::Exec);
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<CoalesceFunction>(
+ "coalesce", Arity::VarArgs(/*min_args=*/1), &coalesce_doc);
+ AddPrimitiveCoalesceKernels(func, NumericTypes());
+ AddPrimitiveCoalesceKernels(func, TemporalTypes());
+ AddPrimitiveCoalesceKernels(
+ func, {boolean(), null(), day_time_interval(), month_interval()});
+ AddCoalesceKernel(func, Type::FIXED_SIZE_BINARY,
+ CoalesceFunctor<FixedSizeBinaryType>::Exec);
+ AddCoalesceKernel(func, Type::DECIMAL128, CoalesceFunctor<Decimal128Type>::Exec);
+ AddCoalesceKernel(func, Type::DECIMAL256, CoalesceFunctor<Decimal256Type>::Exec);
+ for (const auto& ty : BaseBinaryTypes()) {
+ AddCoalesceKernel(func, ty, GenerateTypeAgnosticVarBinaryBase<CoalesceFunctor>(ty));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+}
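+
+// Illustrative sketch (not part of the upstream sources): once registered,
+// these functions are reachable through the generic compute entry point:
+//
+//   // cond = [true, false, null], left = [1, 2, 3], right = [10, 20, 30]
+//   ARROW_ASSIGN_OR_RAISE(Datum d, CallFunction("if_else", {cond, left, right}));
+//   // d = [1, 20, null]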
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc
index 7810577b1fe..e9f0696c8fd 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_nested.cc
@@ -18,7 +18,7 @@
// Vector kernels involving nested types
#include "arrow/array/array_base.h"
-#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/api_scalar.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/result.h"
#include "arrow/util/bit_block_counter.h"
@@ -29,7 +29,7 @@ namespace internal {
namespace {
template <typename Type, typename offset_type = typename Type::offset_type>
-Status ListValueLength(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status ListValueLength(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
using ScalarType = typename TypeTraits<Type>::ScalarType;
using OffsetScalarType = typename TypeTraits<Type>::OffsetScalarType;
@@ -51,131 +51,131 @@ Status ListValueLength(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
static_cast<offset_type>(arg0.value->length());
}
}
-
- return Status::OK();
+
+ return Status::OK();
+}
+
+const FunctionDoc list_value_length_doc{
+ "Compute list lengths",
+ ("`lists` must have a list-like type.\n"
+ "For each non-null value in `lists`, its length is emitted.\n"
+ "Null values emit a null in the output."),
+ {"lists"}};
+
+Result<ValueDescr> MakeStructResolve(KernelContext* ctx,
+ const std::vector<ValueDescr>& descrs) {
+ auto names = OptionsWrapper<MakeStructOptions>::Get(ctx).field_names;
+ auto nullable = OptionsWrapper<MakeStructOptions>::Get(ctx).field_nullability;
+ auto metadata = OptionsWrapper<MakeStructOptions>::Get(ctx).field_metadata;
+
+ if (names.size() == 0) {
+ names.resize(descrs.size());
+ nullable.resize(descrs.size(), true);
+ metadata.resize(descrs.size(), nullptr);
+ int i = 0;
+ for (auto& name : names) {
+ name = std::to_string(i++);
+ }
+ } else if (names.size() != descrs.size() || nullable.size() != descrs.size() ||
+ metadata.size() != descrs.size()) {
+ return Status::Invalid("make_struct() was passed ", descrs.size(), " arguments but ",
+ names.size(), " field names, ", nullable.size(),
+ " nullability bits, and ", metadata.size(),
+ " metadata dictionaries.");
+ }
+
+ size_t i = 0;
+ FieldVector fields(descrs.size());
+
+ ValueDescr::Shape shape = ValueDescr::SCALAR;
+ for (const ValueDescr& descr : descrs) {
+ if (descr.shape != ValueDescr::SCALAR) {
+ shape = ValueDescr::ARRAY;
+ } else {
+ switch (descr.type->id()) {
+ case Type::EXTENSION:
+ case Type::DENSE_UNION:
+ case Type::SPARSE_UNION:
+ return Status::NotImplemented("Broadcasting scalars of type ", *descr.type);
+ default:
+ break;
+ }
+ }
+
+ fields[i] =
+ field(std::move(names[i]), descr.type, nullable[i], std::move(metadata[i]));
+ ++i;
+ }
+
+ return ValueDescr{struct_(std::move(fields)), shape};
+}
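+
+// Illustrative sketch (not part of the upstream sources): with no field names
+// in MakeStructOptions, the resolver above numbers the fields "0", "1", ...;
+// explicit names can be supplied instead (datum names are hypothetical):
+//
+//   MakeStructOptions opts({"x", "y"});
+//   ARROW_ASSIGN_OR_RAISE(Datum s, CallFunction("make_struct", {a, b}, &opts));
+//   // s has type struct<x: ..., y: ...>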
+
+Status MakeStructExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ARROW_ASSIGN_OR_RAISE(auto descr, MakeStructResolve(ctx, batch.GetDescriptors()));
+
+ for (int i = 0; i < batch.num_values(); ++i) {
+ const auto& field = checked_cast<const StructType&>(*descr.type).field(i);
+ if (batch[i].null_count() > 0 && !field->nullable()) {
+ return Status::Invalid("Output field ", field, " (#", i,
+ ") does not allow nulls but the corresponding "
+ "argument was not entirely valid.");
+ }
+ }
+
+ if (descr.shape == ValueDescr::SCALAR) {
+ ScalarVector scalars(batch.num_values());
+ for (int i = 0; i < batch.num_values(); ++i) {
+ scalars[i] = batch[i].scalar();
+ }
+
+ *out =
+ Datum(std::make_shared<StructScalar>(std::move(scalars), std::move(descr.type)));
+ return Status::OK();
+ }
+
+ ArrayVector arrays(batch.num_values());
+ for (int i = 0; i < batch.num_values(); ++i) {
+ if (batch[i].is_array()) {
+ arrays[i] = batch[i].make_array();
+ continue;
+ }
+
+ ARROW_ASSIGN_OR_RAISE(arrays[i], MakeArrayFromScalar(*batch[i].scalar(), batch.length,
+ ctx->memory_pool()));
+ }
+
+ *out = std::make_shared<StructArray>(descr.type, batch.length, std::move(arrays));
+ return Status::OK();
}
-const FunctionDoc list_value_length_doc{
- "Compute list lengths",
- ("`lists` must have a list-like type.\n"
- "For each non-null value in `lists`, its length is emitted.\n"
- "Null values emit a null in the output."),
- {"lists"}};
-
-Result<ValueDescr> MakeStructResolve(KernelContext* ctx,
- const std::vector<ValueDescr>& descrs) {
- auto names = OptionsWrapper<MakeStructOptions>::Get(ctx).field_names;
- auto nullable = OptionsWrapper<MakeStructOptions>::Get(ctx).field_nullability;
- auto metadata = OptionsWrapper<MakeStructOptions>::Get(ctx).field_metadata;
-
- if (names.size() == 0) {
- names.resize(descrs.size());
- nullable.resize(descrs.size(), true);
- metadata.resize(descrs.size(), nullptr);
- int i = 0;
- for (auto& name : names) {
- name = std::to_string(i++);
- }
- } else if (names.size() != descrs.size() || nullable.size() != descrs.size() ||
- metadata.size() != descrs.size()) {
- return Status::Invalid("make_struct() was passed ", descrs.size(), " arguments but ",
- names.size(), " field names, ", nullable.size(),
- " nullability bits, and ", metadata.size(),
- " metadata dictionaries.");
- }
-
- size_t i = 0;
- FieldVector fields(descrs.size());
-
- ValueDescr::Shape shape = ValueDescr::SCALAR;
- for (const ValueDescr& descr : descrs) {
- if (descr.shape != ValueDescr::SCALAR) {
- shape = ValueDescr::ARRAY;
- } else {
- switch (descr.type->id()) {
- case Type::EXTENSION:
- case Type::DENSE_UNION:
- case Type::SPARSE_UNION:
- return Status::NotImplemented("Broadcasting scalars of type ", *descr.type);
- default:
- break;
- }
- }
-
- fields[i] =
- field(std::move(names[i]), descr.type, nullable[i], std::move(metadata[i]));
- ++i;
- }
-
- return ValueDescr{struct_(std::move(fields)), shape};
-}
-
-Status MakeStructExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- ARROW_ASSIGN_OR_RAISE(auto descr, MakeStructResolve(ctx, batch.GetDescriptors()));
-
- for (int i = 0; i < batch.num_values(); ++i) {
- const auto& field = checked_cast<const StructType&>(*descr.type).field(i);
- if (batch[i].null_count() > 0 && !field->nullable()) {
- return Status::Invalid("Output field ", field, " (#", i,
- ") does not allow nulls but the corresponding "
- "argument was not entirely valid.");
- }
- }
-
- if (descr.shape == ValueDescr::SCALAR) {
- ScalarVector scalars(batch.num_values());
- for (int i = 0; i < batch.num_values(); ++i) {
- scalars[i] = batch[i].scalar();
- }
-
- *out =
- Datum(std::make_shared<StructScalar>(std::move(scalars), std::move(descr.type)));
- return Status::OK();
- }
-
- ArrayVector arrays(batch.num_values());
- for (int i = 0; i < batch.num_values(); ++i) {
- if (batch[i].is_array()) {
- arrays[i] = batch[i].make_array();
- continue;
- }
-
- ARROW_ASSIGN_OR_RAISE(arrays[i], MakeArrayFromScalar(*batch[i].scalar(), batch.length,
- ctx->memory_pool()));
- }
-
- *out = std::make_shared<StructArray>(descr.type, batch.length, std::move(arrays));
- return Status::OK();
-}
-
-const FunctionDoc make_struct_doc{"Wrap Arrays into a StructArray",
- ("Names of the StructArray's fields are\n"
- "specified through MakeStructOptions."),
- {"*args"},
- "MakeStructOptions"};
-
+const FunctionDoc make_struct_doc{"Wrap Arrays into a StructArray",
+ ("Names of the StructArray's fields are\n"
+ "specified through MakeStructOptions."),
+ {"*args"},
+ "MakeStructOptions"};
+
} // namespace
void RegisterScalarNested(FunctionRegistry* registry) {
- auto list_value_length = std::make_shared<ScalarFunction>(
- "list_value_length", Arity::Unary(), &list_value_length_doc);
+ auto list_value_length = std::make_shared<ScalarFunction>(
+ "list_value_length", Arity::Unary(), &list_value_length_doc);
DCHECK_OK(list_value_length->AddKernel({InputType(Type::LIST)}, int32(),
ListValueLength<ListType>));
DCHECK_OK(list_value_length->AddKernel({InputType(Type::LARGE_LIST)}, int64(),
ListValueLength<LargeListType>));
DCHECK_OK(registry->AddFunction(std::move(list_value_length)));
-
- static MakeStructOptions kDefaultMakeStructOptions;
- auto make_struct_function = std::make_shared<ScalarFunction>(
- "make_struct", Arity::VarArgs(), &make_struct_doc, &kDefaultMakeStructOptions);
-
- ScalarKernel kernel{KernelSignature::Make({InputType{}}, OutputType{MakeStructResolve},
- /*is_varargs=*/true),
- MakeStructExec, OptionsWrapper<MakeStructOptions>::Init};
- kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
- DCHECK_OK(make_struct_function->AddKernel(std::move(kernel)));
- DCHECK_OK(registry->AddFunction(std::move(make_struct_function)));
+
+ static MakeStructOptions kDefaultMakeStructOptions;
+ auto make_struct_function = std::make_shared<ScalarFunction>(
+ "make_struct", Arity::VarArgs(), &make_struct_doc, &kDefaultMakeStructOptions);
+
+ ScalarKernel kernel{KernelSignature::Make({InputType{}}, OutputType{MakeStructResolve},
+ /*is_varargs=*/true),
+ MakeStructExec, OptionsWrapper<MakeStructOptions>::Init};
+ kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ DCHECK_OK(make_struct_function->AddKernel(std::move(kernel)));
+ DCHECK_OK(registry->AddFunction(std::move(make_struct_function)));
}
} // namespace internal
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
index 8fe28aae920..3e2e95e5401 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
@@ -18,9 +18,9 @@
#include "arrow/array/array_base.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/compute/api_scalar.h"
-#include "arrow/compute/cast.h"
+#include "arrow/compute/cast.h"
#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/compute/kernels/util_internal.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_writer.h"
#include "arrow/util/hashing.h"
@@ -37,68 +37,68 @@ namespace {
template <typename Type>
struct SetLookupState : public KernelState {
- explicit SetLookupState(MemoryPool* pool) : lookup_table(pool, 0) {}
+ explicit SetLookupState(MemoryPool* pool) : lookup_table(pool, 0) {}
Status Init(const SetLookupOptions& options) {
- if (options.value_set.kind() == Datum::ARRAY) {
- const ArrayData& value_set = *options.value_set.array();
- memo_index_to_value_index.reserve(value_set.length);
- RETURN_NOT_OK(AddArrayValueSet(options, *options.value_set.array()));
- } else if (options.value_set.kind() == Datum::CHUNKED_ARRAY) {
- const ChunkedArray& value_set = *options.value_set.chunked_array();
- memo_index_to_value_index.reserve(value_set.length());
- int64_t offset = 0;
- for (const std::shared_ptr<Array>& chunk : value_set.chunks()) {
- RETURN_NOT_OK(AddArrayValueSet(options, *chunk->data(), offset));
- offset += chunk->length();
- }
- } else {
- return Status::Invalid("value_set should be an array or chunked array");
- }
- if (!options.skip_nulls && lookup_table.GetNull() >= 0) {
- null_index = memo_index_to_value_index[lookup_table.GetNull()];
- }
- return Status::OK();
- }
-
- Status AddArrayValueSet(const SetLookupOptions& options, const ArrayData& data,
- int64_t start_index = 0) {
+ if (options.value_set.kind() == Datum::ARRAY) {
+ const ArrayData& value_set = *options.value_set.array();
+ memo_index_to_value_index.reserve(value_set.length);
+ RETURN_NOT_OK(AddArrayValueSet(options, *options.value_set.array()));
+ } else if (options.value_set.kind() == Datum::CHUNKED_ARRAY) {
+ const ChunkedArray& value_set = *options.value_set.chunked_array();
+ memo_index_to_value_index.reserve(value_set.length());
+ int64_t offset = 0;
+ for (const std::shared_ptr<Array>& chunk : value_set.chunks()) {
+ RETURN_NOT_OK(AddArrayValueSet(options, *chunk->data(), offset));
+ offset += chunk->length();
+ }
+ } else {
+ return Status::Invalid("value_set should be an array or chunked array");
+ }
+ if (!options.skip_nulls && lookup_table.GetNull() >= 0) {
+ null_index = memo_index_to_value_index[lookup_table.GetNull()];
+ }
+ return Status::OK();
+ }
+
+ Status AddArrayValueSet(const SetLookupOptions& options, const ArrayData& data,
+ int64_t start_index = 0) {
using T = typename GetViewType<Type>::T;
- int32_t index = static_cast<int32_t>(start_index);
+ int32_t index = static_cast<int32_t>(start_index);
auto visit_valid = [&](T v) {
- const auto memo_size = static_cast<int32_t>(memo_index_to_value_index.size());
+ const auto memo_size = static_cast<int32_t>(memo_index_to_value_index.size());
int32_t unused_memo_index;
- auto on_found = [&](int32_t memo_index) { DCHECK_LT(memo_index, memo_size); };
- auto on_not_found = [&](int32_t memo_index) {
- DCHECK_EQ(memo_index, memo_size);
- memo_index_to_value_index.push_back(index);
- };
- RETURN_NOT_OK(lookup_table.GetOrInsert(
- v, std::move(on_found), std::move(on_not_found), &unused_memo_index));
- ++index;
- return Status::OK();
+ auto on_found = [&](int32_t memo_index) { DCHECK_LT(memo_index, memo_size); };
+ auto on_not_found = [&](int32_t memo_index) {
+ DCHECK_EQ(memo_index, memo_size);
+ memo_index_to_value_index.push_back(index);
+ };
+ RETURN_NOT_OK(lookup_table.GetOrInsert(
+ v, std::move(on_found), std::move(on_not_found), &unused_memo_index));
+ ++index;
+ return Status::OK();
};
auto visit_null = [&]() {
- const auto memo_size = static_cast<int32_t>(memo_index_to_value_index.size());
- auto on_found = [&](int32_t memo_index) { DCHECK_LT(memo_index, memo_size); };
- auto on_not_found = [&](int32_t memo_index) {
- DCHECK_EQ(memo_index, memo_size);
- memo_index_to_value_index.push_back(index);
- };
- lookup_table.GetOrInsertNull(std::move(on_found), std::move(on_not_found));
- ++index;
+ const auto memo_size = static_cast<int32_t>(memo_index_to_value_index.size());
+ auto on_found = [&](int32_t memo_index) { DCHECK_LT(memo_index, memo_size); };
+ auto on_not_found = [&](int32_t memo_index) {
+ DCHECK_EQ(memo_index, memo_size);
+ memo_index_to_value_index.push_back(index);
+ };
+ lookup_table.GetOrInsertNull(std::move(on_found), std::move(on_not_found));
+ ++index;
return Status::OK();
};
-
- return VisitArrayDataInline<Type>(data, visit_valid, visit_null);
+
+ return VisitArrayDataInline<Type>(data, visit_valid, visit_null);
}
using MemoTable = typename HashTraits<Type>::MemoTableType;
MemoTable lookup_table;
- // When there are duplicates in value_set, the MemoTable indices must
- // be mapped back to indices in the value_set.
- std::vector<int32_t> memo_index_to_value_index;
- int32_t null_index = -1;
+ // When there are duplicates in value_set, the MemoTable indices must
+ // be mapped back to indices in the value_set.
+ std::vector<int32_t> memo_index_to_value_index;
+ int32_t null_index = -1;
};
template <>
@@ -106,11 +106,11 @@ struct SetLookupState<NullType> : public KernelState {
explicit SetLookupState(MemoryPool*) {}
Status Init(const SetLookupOptions& options) {
- value_set_has_null = (options.value_set.length() > 0) && !options.skip_nulls;
+ value_set_has_null = (options.value_set.length() > 0) && !options.skip_nulls;
return Status::OK();
}
- bool value_set_has_null;
+ bool value_set_has_null;
};
// TODO: Put this concept somewhere reusable
@@ -140,20 +140,20 @@ struct UnsignedIntType<8> {
// Constructing the type requires a type parameter
struct InitStateVisitor {
KernelContext* ctx;
- SetLookupOptions options;
- const std::shared_ptr<DataType>& arg_type;
+ SetLookupOptions options;
+ const std::shared_ptr<DataType>& arg_type;
std::unique_ptr<KernelState> result;
- InitStateVisitor(KernelContext* ctx, const KernelInitArgs& args)
- : ctx(ctx),
- options(*checked_cast<const SetLookupOptions*>(args.options)),
- arg_type(args.inputs[0].type) {}
+ InitStateVisitor(KernelContext* ctx, const KernelInitArgs& args)
+ : ctx(ctx),
+ options(*checked_cast<const SetLookupOptions*>(args.options)),
+ arg_type(args.inputs[0].type) {}
template <typename Type>
Status Init() {
using StateType = SetLookupState<Type>;
result.reset(new StateType(ctx->exec_context()->memory_pool()));
- return static_cast<StateType*>(result.get())->Init(options);
+ return static_cast<StateType*>(result.get())->Init(options);
}
Status Visit(const DataType&) { return Init<NullType>(); }
@@ -177,26 +177,26 @@ struct InitStateVisitor {
// Handle Decimal128Type, FixedSizeBinaryType
Status Visit(const FixedSizeBinaryType& type) { return Init<FixedSizeBinaryType>(); }
- Result<std::unique_ptr<KernelState>> GetResult() {
- if (!options.value_set.type()->Equals(arg_type)) {
- ARROW_ASSIGN_OR_RAISE(
- options.value_set,
- Cast(options.value_set, CastOptions::Safe(arg_type), ctx->exec_context()));
- }
-
- RETURN_NOT_OK(VisitTypeInline(*arg_type, this));
- return std::move(result);
+ Result<std::unique_ptr<KernelState>> GetResult() {
+ if (!options.value_set.type()->Equals(arg_type)) {
+ ARROW_ASSIGN_OR_RAISE(
+ options.value_set,
+ Cast(options.value_set, CastOptions::Safe(arg_type), ctx->exec_context()));
+ }
+
+ RETURN_NOT_OK(VisitTypeInline(*arg_type, this));
+ return std::move(result);
}
};
-Result<std::unique_ptr<KernelState>> InitSetLookup(KernelContext* ctx,
- const KernelInitArgs& args) {
- if (args.options == nullptr) {
- return Status::Invalid(
- "Attempted to call a set lookup function without SetLookupOptions");
- }
-
- return InitStateVisitor{ctx, args}.GetResult();
+Result<std::unique_ptr<KernelState>> InitSetLookup(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ if (args.options == nullptr) {
+ return Status::Invalid(
+ "Attempted to call a set lookup function without SetLookupOptions");
+ }
+
+ return InitStateVisitor{ctx, args}.GetResult();
}
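+
+// Illustrative sketch (not part of the upstream sources): set lookup functions
+// receive their value set through SetLookupOptions, e.g.
+//
+//   SetLookupOptions options(/*value_set=*/needles, /*skip_nulls=*/false);
+//   ARROW_ASSIGN_OR_RAISE(Datum mask, CallFunction("is_in", {haystack}, &options));
+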
struct IndexInVisitor {
@@ -208,18 +208,18 @@ struct IndexInVisitor {
IndexInVisitor(KernelContext* ctx, const ArrayData& data, Datum* out)
: ctx(ctx), data(data), out(out), builder(ctx->exec_context()->memory_pool()) {}
- Status Visit(const DataType& type) {
- DCHECK_EQ(type.id(), Type::NA);
+ Status Visit(const DataType& type) {
+ DCHECK_EQ(type.id(), Type::NA);
const auto& state = checked_cast<const SetLookupState<NullType>&>(*ctx->state());
if (data.length != 0) {
- // skip_nulls is honored for consistency with other types
- if (state.value_set_has_null) {
+ // skip_nulls is honored for consistency with other types
+ if (state.value_set_has_null) {
RETURN_NOT_OK(this->builder.Reserve(data.length));
for (int64_t i = 0; i < data.length; ++i) {
this->builder.UnsafeAppend(0);
}
- } else {
- RETURN_NOT_OK(this->builder.AppendNulls(data.length));
+ } else {
+ RETURN_NOT_OK(this->builder.AppendNulls(data.length));
}
}
return Status::OK();
@@ -238,16 +238,16 @@ struct IndexInVisitor {
int32_t index = state.lookup_table.Get(v);
if (index != -1) {
// matching needle; output index from value_set
- this->builder.UnsafeAppend(state.memo_index_to_value_index[index]);
+ this->builder.UnsafeAppend(state.memo_index_to_value_index[index]);
} else {
// no matching needle; output null
this->builder.UnsafeAppendNull();
}
},
[&]() {
- if (state.null_index != -1) {
+ if (state.null_index != -1) {
// value_set included null
- this->builder.UnsafeAppend(state.null_index);
+ this->builder.UnsafeAppend(state.null_index);
} else {
// value_set does not include null; output null
this->builder.UnsafeAppendNull();
@@ -290,13 +290,13 @@ struct IndexInVisitor {
}
};
-Status ExecIndexIn(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return IndexInVisitor(ctx, *batch[0].array(), out).Execute();
+Status ExecIndexIn(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return IndexInVisitor(ctx, *batch[0].array(), out).Execute();
}
// ----------------------------------------------------------------------
-// IsIn writes the results into a preallocated boolean data bitmap
+// IsIn writes the results into a preallocated boolean data bitmap
struct IsInVisitor {
KernelContext* ctx;
const ArrayData& data;
@@ -305,13 +305,13 @@ struct IsInVisitor {
IsInVisitor(KernelContext* ctx, const ArrayData& data, Datum* out)
: ctx(ctx), data(data), out(out) {}
- Status Visit(const DataType& type) {
- DCHECK_EQ(type.id(), Type::NA);
+ Status Visit(const DataType& type) {
+ DCHECK_EQ(type.id(), Type::NA);
const auto& state = checked_cast<const SetLookupState<NullType>&>(*ctx->state());
ArrayData* output = out->mutable_array();
- // skip_nulls is honored for consistency with other types
- BitUtil::SetBitsTo(output->buffers[1]->mutable_data(), output->offset, output->length,
- state.value_set_has_null);
+ // skip_nulls is honored for consistency with other types
+ BitUtil::SetBitsTo(output->buffers[1]->mutable_data(), output->offset, output->length,
+ state.value_set_has_null);
return Status::OK();
}
@@ -323,7 +323,7 @@ struct IsInVisitor {
FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(), output->offset,
output->length);
-
+
VisitArrayDataInline<Type>(
this->data,
[&](T v) {
@@ -335,11 +335,11 @@ struct IsInVisitor {
writer.Next();
},
[&]() {
- if (state.null_index != -1) {
- writer.Set();
- } else {
- writer.Clear();
- }
+ if (state.null_index != -1) {
+ writer.Set();
+ } else {
+ writer.Clear();
+ }
writer.Next();
});
writer.Finish();
@@ -370,8 +370,8 @@ struct IsInVisitor {
Status Execute() { return VisitTypeInline(*data.type, this); }
};
-Status ExecIsIn(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return IsInVisitor(ctx, *batch[0].array(), out).Execute();
+Status ExecIsIn(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return IsInVisitor(ctx, *batch[0].array(), out).Execute();
}
// Unary set lookup kernels available for the following input types
@@ -408,8 +408,8 @@ void AddBasicSetLookupKernels(ScalarKernel kernel,
// Enables calling is_in with CallFunction as though it were binary.
class IsInMetaBinary : public MetaFunction {
public:
- IsInMetaBinary()
- : MetaFunction("is_in_meta_binary", Arity::Binary(), /*doc=*/nullptr) {}
+ IsInMetaBinary()
+ : MetaFunction("is_in_meta_binary", Arity::Binary(), /*doc=*/nullptr) {}
Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
const FunctionOptions* options,
@@ -424,8 +424,8 @@ class IsInMetaBinary : public MetaFunction {
// Enables calling index_in with CallFunction as though it were binary.
class IndexInMetaBinary : public MetaFunction {
public:
- IndexInMetaBinary()
- : MetaFunction("index_in_meta_binary", Arity::Binary(), /*doc=*/nullptr) {}
+ IndexInMetaBinary()
+ : MetaFunction("index_in_meta_binary", Arity::Binary(), /*doc=*/nullptr) {}
Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
const FunctionOptions* options,
@@ -437,46 +437,46 @@ class IndexInMetaBinary : public MetaFunction {
}
};
-struct SetLookupFunction : ScalarFunction {
- using ScalarFunction::ScalarFunction;
-
- Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
- EnsureDictionaryDecoded(values);
- return DispatchExact(*values);
- }
-};
-
-const FunctionDoc is_in_doc{
- "Find each element in a set of values",
- ("For each element in `values`, return true if it is found in a given\n"
- "set of values, false otherwise.\n"
- "The set of values to look for must be given in SetLookupOptions.\n"
- "By default, nulls are matched against the value set, this can be\n"
- "changed in SetLookupOptions."),
- {"values"},
- "SetLookupOptions"};
-
-const FunctionDoc index_in_doc{
- "Return index of each element in a set of values",
- ("For each element in `values`, return its index in a given set of\n"
- "values, or null if it is not found there.\n"
- "The set of values to look for must be given in SetLookupOptions.\n"
- "By default, nulls are matched against the value set, this can be\n"
- "changed in SetLookupOptions."),
- {"values"},
- "SetLookupOptions"};
-
+struct SetLookupFunction : ScalarFunction {
+ using ScalarFunction::ScalarFunction;
+
+ Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const override {
+ EnsureDictionaryDecoded(values);
+ return DispatchExact(*values);
+ }
+};
+
+const FunctionDoc is_in_doc{
+ "Find each element in a set of values",
+ ("For each element in `values`, return true if it is found in a given\n"
+ "set of values, false otherwise.\n"
+ "The set of values to look for must be given in SetLookupOptions.\n"
+ "By default, nulls are matched against the value set, this can be\n"
+ "changed in SetLookupOptions."),
+ {"values"},
+ "SetLookupOptions"};
+
+const FunctionDoc index_in_doc{
+ "Return index of each element in a set of values",
+ ("For each element in `values`, return its index in a given set of\n"
+ "values, or null if it is not found there.\n"
+ "The set of values to look for must be given in SetLookupOptions.\n"
+ "By default, nulls are matched against the value set, this can be\n"
+ "changed in SetLookupOptions."),
+ {"values"},
+ "SetLookupOptions"};
+
} // namespace
void RegisterScalarSetLookup(FunctionRegistry* registry) {
- // IsIn writes its boolean output into preallocated memory
+ // IsIn writes its boolean output into preallocated memory
{
ScalarKernel isin_base;
isin_base.init = InitSetLookup;
- isin_base.exec =
- TrivialScalarUnaryAsArraysExec(ExecIsIn, NullHandling::OUTPUT_NOT_NULL);
- isin_base.null_handling = NullHandling::OUTPUT_NOT_NULL;
- auto is_in = std::make_shared<SetLookupFunction>("is_in", Arity::Unary(), &is_in_doc);
+ isin_base.exec =
+ TrivialScalarUnaryAsArraysExec(ExecIsIn, NullHandling::OUTPUT_NOT_NULL);
+ isin_base.null_handling = NullHandling::OUTPUT_NOT_NULL;
+ auto is_in = std::make_shared<SetLookupFunction>("is_in", Arity::Unary(), &is_in_doc);
AddBasicSetLookupKernels(isin_base, /*output_type=*/boolean(), is_in.get());
@@ -491,12 +491,12 @@ void RegisterScalarSetLookup(FunctionRegistry* registry) {
{
ScalarKernel index_in_base;
index_in_base.init = InitSetLookup;
- index_in_base.exec = TrivialScalarUnaryAsArraysExec(
- ExecIndexIn, NullHandling::COMPUTED_NO_PREALLOCATE);
+ index_in_base.exec = TrivialScalarUnaryAsArraysExec(
+ ExecIndexIn, NullHandling::COMPUTED_NO_PREALLOCATE);
index_in_base.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
index_in_base.mem_allocation = MemAllocation::NO_PREALLOCATE;
- auto index_in =
- std::make_shared<SetLookupFunction>("index_in", Arity::Unary(), &index_in_doc);
+ auto index_in =
+ std::make_shared<SetLookupFunction>("index_in", Arity::Unary(), &index_in_doc);
AddBasicSetLookupKernels(index_in_base, /*output_type=*/int32(), index_in.get());
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc
index ce37b089b6f..ab0a490eeb3 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -17,55 +17,55 @@
#include <algorithm>
#include <cctype>
-#include <iterator>
+#include <iterator>
#include <string>
#ifdef ARROW_WITH_UTF8PROC
#include <utf8proc.h>
#endif
-#ifdef ARROW_WITH_RE2
-#include <re2/re2.h>
-#endif
-
-#include "arrow/array/builder_binary.h"
-#include "arrow/array/builder_nested.h"
-#include "arrow/buffer_builder.h"
-
-#include "arrow/builder.h"
+#ifdef ARROW_WITH_RE2
+#include <re2/re2.h>
+#endif
+
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/buffer_builder.h"
+
+#include "arrow/builder.h"
#include "arrow/compute/api_scalar.h"
#include "arrow/compute/kernels/common.h"
-#include "arrow/util/checked_cast.h"
+#include "arrow/util/checked_cast.h"
#include "arrow/util/utf8.h"
#include "arrow/util/value_parsing.h"
-#include "arrow/visitor_inline.h"
+#include "arrow/visitor_inline.h"
namespace arrow {
-
-using internal::checked_cast;
-
+
+using internal::checked_cast;
+
namespace compute {
namespace internal {
namespace {
-#ifdef ARROW_WITH_RE2
-util::string_view ToStringView(re2::StringPiece piece) {
- return {piece.data(), piece.length()};
-}
-
-re2::StringPiece ToStringPiece(util::string_view view) {
- return {view.data(), view.length()};
-}
-
-Status RegexStatus(const RE2& regex) {
- if (!regex.ok()) {
- return Status::Invalid("Invalid regular expression: ", regex.error());
- }
- return Status::OK();
-}
-#endif
-
+#ifdef ARROW_WITH_RE2
+util::string_view ToStringView(re2::StringPiece piece) {
+ return {piece.data(), piece.length()};
+}
+
+re2::StringPiece ToStringPiece(util::string_view view) {
+ return {view.data(), view.length()};
+}
+
+Status RegexStatus(const RE2& regex) {
+ if (!regex.ok()) {
+ return Status::Invalid("Invalid regular expression: ", regex.error());
+ }
+ return Status::OK();
+}
+#endif
+
// Code units in the range [a-z] can only be an encoding of an ascii
// character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of a different
// codepoint. This is guaranteed by the non-overlap design of the unicode standard. (see
@@ -88,20 +88,20 @@ static inline bool IsAsciiCharacter(T character) {
struct BinaryLength {
template <typename OutValue, typename Arg0Value = util::string_view>
- static OutValue Call(KernelContext*, Arg0Value val, Status*) {
+ static OutValue Call(KernelContext*, Arg0Value val, Status*) {
return static_cast<OutValue>(val.size());
}
};
-struct Utf8Length {
- template <typename OutValue, typename Arg0Value = util::string_view>
- static OutValue Call(KernelContext*, Arg0Value val, Status*) {
- auto str = reinterpret_cast<const uint8_t*>(val.data());
- auto strlen = val.size();
- return static_cast<OutValue>(util::UTF8Length(str, str + strlen));
- }
-};
-
+struct Utf8Length {
+ template <typename OutValue, typename Arg0Value = util::string_view>
+ static OutValue Call(KernelContext*, Arg0Value val, Status*) {
+ auto str = reinterpret_cast<const uint8_t*>(val.data());
+ auto strlen = val.size();
+ return static_cast<OutValue>(util::UTF8Length(str, str + strlen));
+ }
+};
+
#ifdef ARROW_WITH_UTF8PROC
// Direct lookup tables for unicode properties
@@ -124,239 +124,239 @@ void EnsureLookupTablesFilled() {
});
}
-#else
-
-void EnsureLookupTablesFilled() {}
-
-#endif // ARROW_WITH_UTF8PROC
-
-constexpr int64_t kTransformError = -1;
-
-struct StringTransformBase {
- virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return Status::OK();
- }
-
- // Return the maximum total size of the output in codeunits (i.e. bytes)
- // given input characteristics.
- virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
- return input_ncodeunits;
- }
-
- virtual Status InvalidStatus() {
- return Status::Invalid("Invalid UTF8 sequence in input");
- }
-
- // Derived classes should also define this method:
- // int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- // uint8_t* output);
-};
-
-template <typename Type, typename StringTransform>
-struct StringTransformExecBase {
+#else
+
+void EnsureLookupTablesFilled() {}
+
+#endif // ARROW_WITH_UTF8PROC
+
+constexpr int64_t kTransformError = -1;
+
+struct StringTransformBase {
+ virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return Status::OK();
+ }
+
+ // Return the maximum total size of the output in codeunits (i.e. bytes)
+ // given input characteristics.
+ virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
+ return input_ncodeunits;
+ }
+
+ virtual Status InvalidStatus() {
+ return Status::Invalid("Invalid UTF8 sequence in input");
+ }
+
+ // Derived classes should also define this method:
+ // int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ // uint8_t* output);
+};
+
+template <typename Type, typename StringTransform>
+struct StringTransformExecBase {
using offset_type = typename Type::offset_type;
using ArrayType = typename TypeTraits<Type>::ArrayType;
- static Status Execute(KernelContext* ctx, StringTransform* transform,
- const ExecBatch& batch, Datum* out) {
- if (batch[0].kind() == Datum::ARRAY) {
- return ExecArray(ctx, transform, batch[0].array(), out);
+ static Status Execute(KernelContext* ctx, StringTransform* transform,
+ const ExecBatch& batch, Datum* out) {
+ if (batch[0].kind() == Datum::ARRAY) {
+ return ExecArray(ctx, transform, batch[0].array(), out);
}
- DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
- return ExecScalar(ctx, transform, batch[0].scalar(), out);
+ DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
+ return ExecScalar(ctx, transform, batch[0].scalar(), out);
}
- static Status ExecArray(KernelContext* ctx, StringTransform* transform,
- const std::shared_ptr<ArrayData>& data, Datum* out) {
- ArrayType input(data);
- ArrayData* output = out->mutable_array();
+ static Status ExecArray(KernelContext* ctx, StringTransform* transform,
+ const std::shared_ptr<ArrayData>& data, Datum* out) {
+ ArrayType input(data);
+ ArrayData* output = out->mutable_array();
- const int64_t input_ncodeunits = input.total_values_length();
- const int64_t input_nstrings = input.length();
+ const int64_t input_ncodeunits = input.total_values_length();
+ const int64_t input_nstrings = input.length();
- const int64_t output_ncodeunits_max =
- transform->MaxCodeunits(input_nstrings, input_ncodeunits);
- if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
- return Status::CapacityError(
- "Result might not fit in a 32bit utf8 array, convert to large_utf8");
- }
+ const int64_t output_ncodeunits_max =
+ transform->MaxCodeunits(input_nstrings, input_ncodeunits);
+ if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+ return Status::CapacityError(
+ "Result might not fit in a 32bit utf8 array, convert to large_utf8");
+ }
- ARROW_ASSIGN_OR_RAISE(auto values_buffer, ctx->Allocate(output_ncodeunits_max));
- output->buffers[2] = values_buffer;
+ ARROW_ASSIGN_OR_RAISE(auto values_buffer, ctx->Allocate(output_ncodeunits_max));
+ output->buffers[2] = values_buffer;
- // String offsets are preallocated
- offset_type* output_string_offsets = output->GetMutableValues<offset_type>(1);
- uint8_t* output_str = output->buffers[2]->mutable_data();
- offset_type output_ncodeunits = 0;
+ // String offsets are preallocated
+ offset_type* output_string_offsets = output->GetMutableValues<offset_type>(1);
+ uint8_t* output_str = output->buffers[2]->mutable_data();
+ offset_type output_ncodeunits = 0;
- output_string_offsets[0] = 0;
- for (int64_t i = 0; i < input_nstrings; i++) {
- if (!input.IsNull(i)) {
+ output_string_offsets[0] = 0;
+ for (int64_t i = 0; i < input_nstrings; i++) {
+ if (!input.IsNull(i)) {
offset_type input_string_ncodeunits;
- const uint8_t* input_string = input.GetValue(i, &input_string_ncodeunits);
- auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
- input_string, input_string_ncodeunits, output_str + output_ncodeunits));
- if (encoded_nbytes < 0) {
- return transform->InvalidStatus();
+ const uint8_t* input_string = input.GetValue(i, &input_string_ncodeunits);
+ auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
+ input_string, input_string_ncodeunits, output_str + output_ncodeunits));
+ if (encoded_nbytes < 0) {
+ return transform->InvalidStatus();
}
output_ncodeunits += encoded_nbytes;
}
- output_string_offsets[i + 1] = output_ncodeunits;
- }
- DCHECK_LE(output_ncodeunits, output_ncodeunits_max);
-
- // Trim the codepoint buffer, since we allocated too much
- return values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true);
- }
-
- static Status ExecScalar(KernelContext* ctx, StringTransform* transform,
- const std::shared_ptr<Scalar>& scalar, Datum* out) {
- const auto& input = checked_cast<const BaseBinaryScalar&>(*scalar);
- if (!input.is_valid) {
- return Status::OK();
+ output_string_offsets[i + 1] = output_ncodeunits;
+ }
+ DCHECK_LE(output_ncodeunits, output_ncodeunits_max);
+
+ // Trim the codepoint buffer, since we allocated too much
+ return values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true);
+ }
+
+ static Status ExecScalar(KernelContext* ctx, StringTransform* transform,
+ const std::shared_ptr<Scalar>& scalar, Datum* out) {
+ const auto& input = checked_cast<const BaseBinaryScalar&>(*scalar);
+ if (!input.is_valid) {
+ return Status::OK();
+ }
+ auto* result = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ result->is_valid = true;
+ const int64_t data_nbytes = static_cast<int64_t>(input.value->size());
+
+ const int64_t output_ncodeunits_max = transform->MaxCodeunits(1, data_nbytes);
+ if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+ return Status::CapacityError(
+ "Result might not fit in a 32bit utf8 array, convert to large_utf8");
}
- auto* result = checked_cast<BaseBinaryScalar*>(out->scalar().get());
- result->is_valid = true;
- const int64_t data_nbytes = static_cast<int64_t>(input.value->size());
-
- const int64_t output_ncodeunits_max = transform->MaxCodeunits(1, data_nbytes);
- if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
- return Status::CapacityError(
- "Result might not fit in a 32bit utf8 array, convert to large_utf8");
- }
- ARROW_ASSIGN_OR_RAISE(auto value_buffer, ctx->Allocate(output_ncodeunits_max));
- result->value = value_buffer;
- auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
- input.value->data(), data_nbytes, value_buffer->mutable_data()));
- if (encoded_nbytes < 0) {
- return transform->InvalidStatus();
- }
- DCHECK_LE(encoded_nbytes, output_ncodeunits_max);
- return value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true);
+ ARROW_ASSIGN_OR_RAISE(auto value_buffer, ctx->Allocate(output_ncodeunits_max));
+ result->value = value_buffer;
+ auto encoded_nbytes = static_cast<offset_type>(transform->Transform(
+ input.value->data(), data_nbytes, value_buffer->mutable_data()));
+ if (encoded_nbytes < 0) {
+ return transform->InvalidStatus();
+ }
+ DCHECK_LE(encoded_nbytes, output_ncodeunits_max);
+ return value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true);
}
};
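[Editor's note] The pair of templates above is the core of the one-string-in, one-string-out kernel machinery: MaxCodeunits supplies an upper bound used for a single output allocation, and Transform writes into that buffer, returning the number of bytes written or kTransformError. A minimal sketch of a transform written against this contract (the struct name is hypothetical, not part of the file):

    // Hypothetical example: copies each string through unchanged.
    // The inherited MaxCodeunits (output <= input bytes) is already exact here.
    struct IdentityTransform : public StringTransformBase {
      int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                        uint8_t* output) {
        std::copy(input, input + input_string_ncodeunits, output);
        return input_string_ncodeunits;  // bytes actually written
      }
    };
    template <typename Type>
    using Identity = StringTransformExec<Type, IdentityTransform>;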
-template <typename Type, typename StringTransform>
-struct StringTransformExec : public StringTransformExecBase<Type, StringTransform> {
- using StringTransformExecBase<Type, StringTransform>::Execute;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- StringTransform transform;
- RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
- return Execute(ctx, &transform, batch, out);
- }
-};
-
-template <typename Type, typename StringTransform>
-struct StringTransformExecWithState
- : public StringTransformExecBase<Type, StringTransform> {
- using State = typename StringTransform::State;
- using StringTransformExecBase<Type, StringTransform>::Execute;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- StringTransform transform(State::Get(ctx));
- RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
- return Execute(ctx, &transform, batch, out);
- }
-};
-
-#ifdef ARROW_WITH_UTF8PROC
-
-template <typename CodepointTransform>
-struct StringTransformCodepoint : public StringTransformBase {
- Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
- EnsureLookupTablesFilled();
- return Status::OK();
- }
-
- int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
- return CodepointTransform::MaxCodeunits(ninputs, input_ncodeunits);
- }
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- uint8_t* output_start = output;
- if (ARROW_PREDICT_FALSE(
- !arrow::util::UTF8Transform(input, input + input_string_ncodeunits, &output,
- CodepointTransform::TransformCodepoint))) {
- return kTransformError;
- }
- return output - output_start;
- }
-};
-
-// struct CaseMappingMixin {
-struct CaseMappingTransform {
- static int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
-    // Section 5.18 of the Unicode spec claims that the number of codepoints for case
-    // mapping can grow by a factor of 3, which would mean a factor of 3 in bytes too.
-    // However, since we don't support all casings (SpecialCasing.txt), the growth
-    // in bytes is actually at most 3/2 (as covered by the unittest).
- // Note that rounding down the 3/2 is ok, since only codepoints encoded by
- // two code units (even) can grow to 3 code units.
- return static_cast<int64_t>(input_ncodeunits) * 3 / 2;
- }
-};
-
-struct UTF8UpperTransform : public CaseMappingTransform {
- static uint32_t TransformCodepoint(uint32_t codepoint) {
+template <typename Type, typename StringTransform>
+struct StringTransformExec : public StringTransformExecBase<Type, StringTransform> {
+ using StringTransformExecBase<Type, StringTransform>::Execute;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ StringTransform transform;
+ RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
+ return Execute(ctx, &transform, batch, out);
+ }
+};
+
+template <typename Type, typename StringTransform>
+struct StringTransformExecWithState
+ : public StringTransformExecBase<Type, StringTransform> {
+ using State = typename StringTransform::State;
+ using StringTransformExecBase<Type, StringTransform>::Execute;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ StringTransform transform(State::Get(ctx));
+ RETURN_NOT_OK(transform.PreExec(ctx, batch, out));
+ return Execute(ctx, &transform, batch, out);
+ }
+};
+
+#ifdef ARROW_WITH_UTF8PROC
+
+template <typename CodepointTransform>
+struct StringTransformCodepoint : public StringTransformBase {
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ EnsureLookupTablesFilled();
+ return Status::OK();
+ }
+
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ return CodepointTransform::MaxCodeunits(ninputs, input_ncodeunits);
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ uint8_t* output_start = output;
+ if (ARROW_PREDICT_FALSE(
+ !arrow::util::UTF8Transform(input, input + input_string_ncodeunits, &output,
+ CodepointTransform::TransformCodepoint))) {
+ return kTransformError;
+ }
+ return output - output_start;
+ }
+};
+
+// struct CaseMappingMixin {
+struct CaseMappingTransform {
+ static int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) {
+    // Section 5.18 of the Unicode spec claims that the number of codepoints for case
+    // mapping can grow by a factor of 3, which would mean a factor of 3 in bytes too.
+    // However, since we don't support all casings (SpecialCasing.txt), the growth
+    // in bytes is actually at most 3/2 (as covered by the unittest).
+ // Note that rounding down the 3/2 is ok, since only codepoints encoded by
+ // two code units (even) can grow to 3 code units.
+ return static_cast<int64_t>(input_ncodeunits) * 3 / 2;
+ }
+};
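[Editor's note] A concrete instance of the 3/2 bound (an illustration relying on standard Unicode case mappings, not anything specific to this file): U+0250 "ɐ" takes two UTF-8 bytes and uppercases to U+2C6F "Ɐ", which takes three.

    // n such characters: input_ncodeunits = 2*n
    // bound = 2*n * 3 / 2 = 3*n bytes, exactly the output size in this case.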
+
+struct UTF8UpperTransform : public CaseMappingTransform {
+ static uint32_t TransformCodepoint(uint32_t codepoint) {
return codepoint <= kMaxCodepointLookup ? lut_upper_codepoint[codepoint]
: utf8proc_toupper(codepoint);
}
};
template <typename Type>
-using UTF8Upper = StringTransformExec<Type, StringTransformCodepoint<UTF8UpperTransform>>;
-
-struct UTF8LowerTransform : public CaseMappingTransform {
+using UTF8Upper = StringTransformExec<Type, StringTransformCodepoint<UTF8UpperTransform>>;
+
+struct UTF8LowerTransform : public CaseMappingTransform {
static uint32_t TransformCodepoint(uint32_t codepoint) {
return codepoint <= kMaxCodepointLookup ? lut_lower_codepoint[codepoint]
: utf8proc_tolower(codepoint);
}
};
-template <typename Type>
-using UTF8Lower = StringTransformExec<Type, StringTransformCodepoint<UTF8LowerTransform>>;
+template <typename Type>
+using UTF8Lower = StringTransformExec<Type, StringTransformCodepoint<UTF8LowerTransform>>;
#endif // ARROW_WITH_UTF8PROC
-struct AsciiReverseTransform : public StringTransformBase {
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- uint8_t utf8_char_found = 0;
- for (int64_t i = 0; i < input_string_ncodeunits; i++) {
-      // record any non-ASCII byte (high bit set) in utf8_char_found
- utf8_char_found |= input[i] & 0x80;
- output[input_string_ncodeunits - i - 1] = input[i];
- }
- return utf8_char_found ? kTransformError : input_string_ncodeunits;
- }
-
- Status InvalidStatus() override {
- return Status::Invalid("Non-ASCII sequence in input");
- }
-};
-
-template <typename Type>
-using AsciiReverse = StringTransformExec<Type, AsciiReverseTransform>;
-
-struct Utf8ReverseTransform : public StringTransformBase {
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- int64_t i = 0;
- while (i < input_string_ncodeunits) {
- int64_t char_end = std::min(i + util::ValidUtf8CodepointByteSize(input + i),
- input_string_ncodeunits);
- std::copy(input + i, input + char_end, output + input_string_ncodeunits - char_end);
- i = char_end;
- }
- return input_string_ncodeunits;
- }
-};
-
-template <typename Type>
-using Utf8Reverse = StringTransformExec<Type, Utf8ReverseTransform>;
-
+struct AsciiReverseTransform : public StringTransformBase {
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ uint8_t utf8_char_found = 0;
+ for (int64_t i = 0; i < input_string_ncodeunits; i++) {
+      // record any non-ASCII byte (high bit set) in utf8_char_found
+ utf8_char_found |= input[i] & 0x80;
+ output[input_string_ncodeunits - i - 1] = input[i];
+ }
+ return utf8_char_found ? kTransformError : input_string_ncodeunits;
+ }
+
+ Status InvalidStatus() override {
+ return Status::Invalid("Non-ASCII sequence in input");
+ }
+};
+
+template <typename Type>
+using AsciiReverse = StringTransformExec<Type, AsciiReverseTransform>;
+
+struct Utf8ReverseTransform : public StringTransformBase {
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ int64_t i = 0;
+ while (i < input_string_ncodeunits) {
+ int64_t char_end = std::min(i + util::ValidUtf8CodepointByteSize(input + i),
+ input_string_ncodeunits);
+ std::copy(input + i, input + char_end, output + input_string_ncodeunits - char_end);
+ i = char_end;
+ }
+ return input_string_ncodeunits;
+ }
+};
+
+template <typename Type>
+using Utf8Reverse = StringTransformExec<Type, Utf8ReverseTransform>;
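[Editor's note] Utf8ReverseTransform reverses codepoint-by-codepoint rather than byte-by-byte so that multi-byte sequences survive intact; a naive byte reversal would emit invalid UTF-8. A small illustration:

    // "abé" is 61 62 C3 A9 in UTF-8.
    //   codepoint reversal (this kernel): C3 A9 62 61 = "éba"   (valid UTF-8)
    //   naive byte reversal:              A9 C3 62 61           (invalid UTF-8)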
+
using TransformFunc = std::function<void(const uint8_t*, int64_t, uint8_t*)>;
// Transform a buffer of offsets to one which begins with 0 and has same
@@ -378,8 +378,8 @@ Status GetShiftedOffsets(KernelContext* ctx, const Buffer& input_buffer, int64_t
// Apply `transform` to input character data - this function cannot change the
// length
template <typename Type>
-Status StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
- TransformFunc transform, Datum* out) {
+Status StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
+ TransformFunc transform, Datum* out) {
using ArrayType = typename TypeTraits<Type>::ArrayType;
using offset_type = typename Type::offset_type;
@@ -395,13 +395,13 @@ Status StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
} else {
DCHECK(input.buffers[1]);
// We must allocate new space for the offsets and shift the existing offsets
- RETURN_NOT_OK(GetShiftedOffsets<offset_type>(ctx, *input.buffers[1], input.offset,
- input.length, &out_arr->buffers[1]));
+ RETURN_NOT_OK(GetShiftedOffsets<offset_type>(ctx, *input.buffers[1], input.offset,
+ input.length, &out_arr->buffers[1]));
}
// Allocate space for output data
int64_t data_nbytes = input_boxed.total_values_length();
- RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&out_arr->buffers[2]));
+ RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&out_arr->buffers[2]));
if (input.length > 0) {
transform(input.buffers[2]->data() + input_boxed.value_offset(0), data_nbytes,
out_arr->buffers[2]->mutable_data());
@@ -412,13 +412,13 @@ Status StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
if (input.is_valid) {
result->is_valid = true;
int64_t data_nbytes = input.value->size();
- RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&result->value));
+ RETURN_NOT_OK(ctx->Allocate(data_nbytes).Value(&result->value));
transform(input.value->data(), data_nbytes, result->value->mutable_data());
}
- out->value = result;
+ out->value = result;
}
-
- return Status::OK();
+
+ return Status::OK();
}
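[Editor's note] StringDataTransform reuses the input offsets (shifting them first when the array has a non-zero offset) and rewrites only the character data, which is why it can express only length-preserving transforms such as ASCII case mapping. A hedged usage sketch through the compute API, assuming the kernels below are registered under names like "ascii_upper" as elsewhere in Arrow's registry:

    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Result<arrow::Datum> UpperAscii(const std::shared_ptr<arrow::Array>& arr) {
      // No options needed; nulls propagate to the output.
      return arrow::compute::CallFunction("ascii_upper", {arr});
    }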
void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) {
@@ -427,8 +427,8 @@ void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output)
template <typename Type>
struct AsciiUpper {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return StringDataTransform<Type>(ctx, batch, TransformAsciiUpper, out);
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return StringDataTransform<Type>(ctx, batch, TransformAsciiUpper, out);
}
};
@@ -438,8 +438,8 @@ void TransformAsciiLower(const uint8_t* input, int64_t length, uint8_t* output)
template <typename Type>
struct AsciiLower {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return StringDataTransform<Type>(ctx, batch, TransformAsciiLower, out);
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return StringDataTransform<Type>(ctx, batch, TransformAsciiLower, out);
}
};
@@ -473,881 +473,881 @@ void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch,
static_cast<offset_type>(input.value->size())};
transform(offsets.data(), input.value->data(), 1, /*output_offset=*/0,
&result_value);
- out->value = std::make_shared<BooleanScalar>(result_value > 0);
+ out->value = std::make_shared<BooleanScalar>(result_value > 0);
}
}
}
-using MatchSubstringState = OptionsWrapper<MatchSubstringOptions>;
-
-// This is an implementation of the Knuth-Morris-Pratt algorithm
-struct PlainSubstringMatcher {
- const MatchSubstringOptions& options_;
- std::vector<int64_t> prefix_table;
-
- static Result<std::unique_ptr<PlainSubstringMatcher>> Make(
- const MatchSubstringOptions& options) {
- // Should be handled by partial template specialization below
- DCHECK(!options.ignore_case);
- return ::arrow::internal::make_unique<PlainSubstringMatcher>(options);
- }
-
- explicit PlainSubstringMatcher(const MatchSubstringOptions& options)
- : options_(options) {
- // Phase 1: Build the prefix table
- const auto pattern_length = options_.pattern.size();
- prefix_table.resize(pattern_length + 1, /*value=*/0);
- int64_t prefix_length = -1;
- prefix_table[0] = -1;
- for (size_t pos = 0; pos < pattern_length; ++pos) {
- // The prefix cannot be expanded, reset.
- while (prefix_length >= 0 &&
- options_.pattern[pos] != options_.pattern[prefix_length]) {
- prefix_length = prefix_table[prefix_length];
- }
- prefix_length++;
- prefix_table[pos + 1] = prefix_length;
+using MatchSubstringState = OptionsWrapper<MatchSubstringOptions>;
+
+// This is an implementation of the Knuth-Morris-Pratt algorithm
+struct PlainSubstringMatcher {
+ const MatchSubstringOptions& options_;
+ std::vector<int64_t> prefix_table;
+
+ static Result<std::unique_ptr<PlainSubstringMatcher>> Make(
+ const MatchSubstringOptions& options) {
+ // Should be handled by partial template specialization below
+ DCHECK(!options.ignore_case);
+ return ::arrow::internal::make_unique<PlainSubstringMatcher>(options);
+ }
+
+ explicit PlainSubstringMatcher(const MatchSubstringOptions& options)
+ : options_(options) {
+ // Phase 1: Build the prefix table
+ const auto pattern_length = options_.pattern.size();
+ prefix_table.resize(pattern_length + 1, /*value=*/0);
+ int64_t prefix_length = -1;
+ prefix_table[0] = -1;
+ for (size_t pos = 0; pos < pattern_length; ++pos) {
+ // The prefix cannot be expanded, reset.
+ while (prefix_length >= 0 &&
+ options_.pattern[pos] != options_.pattern[prefix_length]) {
+ prefix_length = prefix_table[prefix_length];
+ }
+ prefix_length++;
+ prefix_table[pos + 1] = prefix_length;
}
}
- int64_t Find(util::string_view current) const {
- // Phase 2: Find the prefix in the data
- const auto pattern_length = options_.pattern.size();
+ int64_t Find(util::string_view current) const {
+ // Phase 2: Find the prefix in the data
+ const auto pattern_length = options_.pattern.size();
int64_t pattern_pos = 0;
- int64_t pos = 0;
- if (pattern_length == 0) return 0;
- for (const auto c : current) {
- while ((pattern_pos >= 0) && (options_.pattern[pattern_pos] != c)) {
+ int64_t pos = 0;
+ if (pattern_length == 0) return 0;
+ for (const auto c : current) {
+ while ((pattern_pos >= 0) && (options_.pattern[pattern_pos] != c)) {
pattern_pos = prefix_table[pattern_pos];
}
pattern_pos++;
- if (static_cast<size_t>(pattern_pos) == pattern_length) {
- return pos + 1 - pattern_length;
+ if (static_cast<size_t>(pattern_pos) == pattern_length) {
+ return pos + 1 - pattern_length;
}
- pos++;
+ pos++;
}
- return -1;
- }
-
- bool Match(util::string_view current) const { return Find(current) >= 0; }
-};
-
-struct PlainStartsWithMatcher {
- const MatchSubstringOptions& options_;
-
- explicit PlainStartsWithMatcher(const MatchSubstringOptions& options)
- : options_(options) {}
-
- static Result<std::unique_ptr<PlainStartsWithMatcher>> Make(
- const MatchSubstringOptions& options) {
- // Should be handled by partial template specialization below
- DCHECK(!options.ignore_case);
- return ::arrow::internal::make_unique<PlainStartsWithMatcher>(options);
- }
-
- bool Match(util::string_view current) const {
- // string_view::starts_with is C++20
- return current.substr(0, options_.pattern.size()) == options_.pattern;
- }
-};
-
-struct PlainEndsWithMatcher {
- const MatchSubstringOptions& options_;
-
- explicit PlainEndsWithMatcher(const MatchSubstringOptions& options)
- : options_(options) {}
-
- static Result<std::unique_ptr<PlainEndsWithMatcher>> Make(
- const MatchSubstringOptions& options) {
- // Should be handled by partial template specialization below
- DCHECK(!options.ignore_case);
- return ::arrow::internal::make_unique<PlainEndsWithMatcher>(options);
- }
-
- bool Match(util::string_view current) const {
- // string_view::ends_with is C++20
- return current.size() >= options_.pattern.size() &&
- current.substr(current.size() - options_.pattern.size(),
- options_.pattern.size()) == options_.pattern;
- }
-};
-
-#ifdef ARROW_WITH_RE2
-struct RegexSubstringMatcher {
- const MatchSubstringOptions& options_;
- const RE2 regex_match_;
-
- static Result<std::unique_ptr<RegexSubstringMatcher>> Make(
- const MatchSubstringOptions& options, bool literal = false) {
- auto matcher =
- ::arrow::internal::make_unique<RegexSubstringMatcher>(options, literal);
- RETURN_NOT_OK(RegexStatus(matcher->regex_match_));
- return std::move(matcher);
- }
-
- explicit RegexSubstringMatcher(const MatchSubstringOptions& options,
- bool literal = false)
- : options_(options),
- regex_match_(options_.pattern, MakeRE2Options(options, literal)) {}
-
- bool Match(util::string_view current) const {
- auto piece = re2::StringPiece(current.data(), current.length());
- return re2::RE2::PartialMatch(piece, regex_match_);
- }
-
- static RE2::RE2::Options MakeRE2Options(const MatchSubstringOptions& options,
- bool literal) {
- RE2::RE2::Options re2_options(RE2::Quiet);
- re2_options.set_case_sensitive(!options.ignore_case);
- re2_options.set_literal(literal);
- return re2_options;
- }
-};
-#endif
-
-template <typename Type, typename Matcher>
-struct MatchSubstringImpl {
+ return -1;
+ }
+
+ bool Match(util::string_view current) const { return Find(current) >= 0; }
+};
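[Editor's note] A worked example of the two KMP phases above, for pattern "abab":

    // prefix_table = [-1, 0, 0, 1, 2]
    // Find("cabab"): the 'c' mismatch resets pattern_pos via the table; the
    // next four characters then match in order, so Find returns 5 - 4 = 1,
    // the 0-based byte index of the first occurrence.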
+
+struct PlainStartsWithMatcher {
+ const MatchSubstringOptions& options_;
+
+ explicit PlainStartsWithMatcher(const MatchSubstringOptions& options)
+ : options_(options) {}
+
+ static Result<std::unique_ptr<PlainStartsWithMatcher>> Make(
+ const MatchSubstringOptions& options) {
+ // Should be handled by partial template specialization below
+ DCHECK(!options.ignore_case);
+ return ::arrow::internal::make_unique<PlainStartsWithMatcher>(options);
+ }
+
+ bool Match(util::string_view current) const {
+ // string_view::starts_with is C++20
+ return current.substr(0, options_.pattern.size()) == options_.pattern;
+ }
+};
+
+struct PlainEndsWithMatcher {
+ const MatchSubstringOptions& options_;
+
+ explicit PlainEndsWithMatcher(const MatchSubstringOptions& options)
+ : options_(options) {}
+
+ static Result<std::unique_ptr<PlainEndsWithMatcher>> Make(
+ const MatchSubstringOptions& options) {
+ // Should be handled by partial template specialization below
+ DCHECK(!options.ignore_case);
+ return ::arrow::internal::make_unique<PlainEndsWithMatcher>(options);
+ }
+
+ bool Match(util::string_view current) const {
+ // string_view::ends_with is C++20
+ return current.size() >= options_.pattern.size() &&
+ current.substr(current.size() - options_.pattern.size(),
+ options_.pattern.size()) == options_.pattern;
+ }
+};
+
+#ifdef ARROW_WITH_RE2
+struct RegexSubstringMatcher {
+ const MatchSubstringOptions& options_;
+ const RE2 regex_match_;
+
+ static Result<std::unique_ptr<RegexSubstringMatcher>> Make(
+ const MatchSubstringOptions& options, bool literal = false) {
+ auto matcher =
+ ::arrow::internal::make_unique<RegexSubstringMatcher>(options, literal);
+ RETURN_NOT_OK(RegexStatus(matcher->regex_match_));
+ return std::move(matcher);
+ }
+
+ explicit RegexSubstringMatcher(const MatchSubstringOptions& options,
+ bool literal = false)
+ : options_(options),
+ regex_match_(options_.pattern, MakeRE2Options(options, literal)) {}
+
+ bool Match(util::string_view current) const {
+ auto piece = re2::StringPiece(current.data(), current.length());
+ return re2::RE2::PartialMatch(piece, regex_match_);
+ }
+
+ static RE2::RE2::Options MakeRE2Options(const MatchSubstringOptions& options,
+ bool literal) {
+ RE2::RE2::Options re2_options(RE2::Quiet);
+ re2_options.set_case_sensitive(!options.ignore_case);
+ re2_options.set_literal(literal);
+ return re2_options;
+ }
+};
+#endif
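[Editor's note] MakeRE2Options is reused by the find/count kernels below as well; literal=true makes RE2 treat the pattern as plain bytes, which is how case-insensitive "plain" matching is implemented without a second code path. A standalone illustration of the same options (not code from this file):

    #include <re2/re2.h>

    re2::RE2::Options opts(re2::RE2::Quiet);
    opts.set_case_sensitive(false);  // ignore_case
    opts.set_literal(true);          // "a.b" matches the three bytes 'a' '.' 'b'
    re2::RE2 re("a.b", opts);
    bool hit = re2::RE2::PartialMatch("xA.bY", re);  // true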
+
+template <typename Type, typename Matcher>
+struct MatchSubstringImpl {
using offset_type = typename Type::offset_type;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out,
- const Matcher* matcher) {
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out,
+ const Matcher* matcher) {
StringBoolTransform<Type>(
ctx, batch,
- [&matcher](const void* raw_offsets, const uint8_t* data, int64_t length,
- int64_t output_offset, uint8_t* output) {
- const offset_type* offsets = reinterpret_cast<const offset_type*>(raw_offsets);
- FirstTimeBitmapWriter bitmap_writer(output, output_offset, length);
- for (int64_t i = 0; i < length; ++i) {
- const char* current_data = reinterpret_cast<const char*>(data + offsets[i]);
- int64_t current_length = offsets[i + 1] - offsets[i];
- if (matcher->Match(util::string_view(current_data, current_length))) {
- bitmap_writer.Set();
- }
- bitmap_writer.Next();
- }
- bitmap_writer.Finish();
+ [&matcher](const void* raw_offsets, const uint8_t* data, int64_t length,
+ int64_t output_offset, uint8_t* output) {
+ const offset_type* offsets = reinterpret_cast<const offset_type*>(raw_offsets);
+ FirstTimeBitmapWriter bitmap_writer(output, output_offset, length);
+ for (int64_t i = 0; i < length; ++i) {
+ const char* current_data = reinterpret_cast<const char*>(data + offsets[i]);
+ int64_t current_length = offsets[i + 1] - offsets[i];
+ if (matcher->Match(util::string_view(current_data, current_length))) {
+ bitmap_writer.Set();
+ }
+ bitmap_writer.Next();
+ }
+ bitmap_writer.Finish();
},
out);
- return Status::OK();
+ return Status::OK();
+ }
+};
+
+template <typename Type, typename Matcher>
+struct MatchSubstring {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // TODO Cache matcher across invocations (for regex compilation)
+ ARROW_ASSIGN_OR_RAISE(auto matcher, Matcher::Make(MatchSubstringState::Get(ctx)));
+ return MatchSubstringImpl<Type, Matcher>::Exec(ctx, batch, out, matcher.get());
+ }
+};
+
+template <typename Type>
+struct MatchSubstring<Type, PlainSubstringMatcher> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ ARROW_ASSIGN_OR_RAISE(auto matcher,
+ RegexSubstringMatcher::Make(options, /*literal=*/true));
+ return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ ARROW_ASSIGN_OR_RAISE(auto matcher, PlainSubstringMatcher::Make(options));
+ return MatchSubstringImpl<Type, PlainSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+ }
+};
+
+template <typename Type>
+struct MatchSubstring<Type, PlainStartsWithMatcher> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ MatchSubstringOptions converted_options = options;
+ converted_options.pattern = "^" + RE2::QuoteMeta(options.pattern);
+ ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
+ return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ ARROW_ASSIGN_OR_RAISE(auto matcher, PlainStartsWithMatcher::Make(options));
+ return MatchSubstringImpl<Type, PlainStartsWithMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+ }
+};
+
+template <typename Type>
+struct MatchSubstring<Type, PlainEndsWithMatcher> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ MatchSubstringOptions converted_options = options;
+ converted_options.pattern = RE2::QuoteMeta(options.pattern) + "$";
+ ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
+ return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ ARROW_ASSIGN_OR_RAISE(auto matcher, PlainEndsWithMatcher::Make(options));
+ return MatchSubstringImpl<Type, PlainEndsWithMatcher>::Exec(ctx, batch, out,
+ matcher.get());
}
};
-template <typename Type, typename Matcher>
-struct MatchSubstring {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // TODO Cache matcher across invocations (for regex compilation)
- ARROW_ASSIGN_OR_RAISE(auto matcher, Matcher::Make(MatchSubstringState::Get(ctx)));
- return MatchSubstringImpl<Type, Matcher>::Exec(ctx, batch, out, matcher.get());
- }
-};
-
-template <typename Type>
-struct MatchSubstring<Type, PlainSubstringMatcher> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- auto options = MatchSubstringState::Get(ctx);
- if (options.ignore_case) {
-#ifdef ARROW_WITH_RE2
- ARROW_ASSIGN_OR_RAISE(auto matcher,
- RegexSubstringMatcher::Make(options, /*literal=*/true));
- return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
- matcher.get());
-#else
- return Status::NotImplemented("ignore_case requires RE2");
-#endif
- }
- ARROW_ASSIGN_OR_RAISE(auto matcher, PlainSubstringMatcher::Make(options));
- return MatchSubstringImpl<Type, PlainSubstringMatcher>::Exec(ctx, batch, out,
- matcher.get());
- }
-};
-
-template <typename Type>
-struct MatchSubstring<Type, PlainStartsWithMatcher> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- auto options = MatchSubstringState::Get(ctx);
- if (options.ignore_case) {
-#ifdef ARROW_WITH_RE2
- MatchSubstringOptions converted_options = options;
- converted_options.pattern = "^" + RE2::QuoteMeta(options.pattern);
- ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
- return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
- matcher.get());
-#else
- return Status::NotImplemented("ignore_case requires RE2");
-#endif
- }
- ARROW_ASSIGN_OR_RAISE(auto matcher, PlainStartsWithMatcher::Make(options));
- return MatchSubstringImpl<Type, PlainStartsWithMatcher>::Exec(ctx, batch, out,
- matcher.get());
- }
-};
-
-template <typename Type>
-struct MatchSubstring<Type, PlainEndsWithMatcher> {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- auto options = MatchSubstringState::Get(ctx);
- if (options.ignore_case) {
-#ifdef ARROW_WITH_RE2
- MatchSubstringOptions converted_options = options;
- converted_options.pattern = RE2::QuoteMeta(options.pattern) + "$";
- ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
- return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
- matcher.get());
-#else
- return Status::NotImplemented("ignore_case requires RE2");
-#endif
- }
- ARROW_ASSIGN_OR_RAISE(auto matcher, PlainEndsWithMatcher::Make(options));
- return MatchSubstringImpl<Type, PlainEndsWithMatcher>::Exec(ctx, batch, out,
- matcher.get());
- }
-};
-
-const FunctionDoc match_substring_doc(
- "Match strings against literal pattern",
- ("For each string in `strings`, emit true iff it contains a given pattern.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
- "If ignore_case is set, only simple case folding is performed."),
- {"strings"}, "MatchSubstringOptions");
-
-const FunctionDoc starts_with_doc(
- "Check if strings start with a literal pattern",
- ("For each string in `strings`, emit true iff it starts with a given pattern.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
- "If ignore_case is set, only simple case folding is performed."),
- {"strings"}, "MatchSubstringOptions");
-
-const FunctionDoc ends_with_doc(
- "Check if strings end with a literal pattern",
- ("For each string in `strings`, emit true iff it ends with a given pattern.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
- "If ignore_case is set, only simple case folding is performed."),
- {"strings"}, "MatchSubstringOptions");
-
-#ifdef ARROW_WITH_RE2
-const FunctionDoc match_substring_regex_doc(
- "Match strings against regex pattern",
- ("For each string in `strings`, emit true iff it matches a given pattern at any "
- "position.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
- "If ignore_case is set, only simple case folding is performed."),
- {"strings"}, "MatchSubstringOptions");
-
-// SQL LIKE match
-
-/// Convert a SQL-style LIKE pattern (using '%' and '_') into a regex pattern
-std::string MakeLikeRegex(const MatchSubstringOptions& options) {
- // Allow . to match \n
- std::string like_pattern = "(?s:^";
- like_pattern.reserve(options.pattern.size() + 7);
- bool escaped = false;
- for (const char c : options.pattern) {
- if (!escaped && c == '%') {
- like_pattern.append(".*");
- } else if (!escaped && c == '_') {
- like_pattern.append(".");
- } else if (!escaped && c == '\\') {
- escaped = true;
- } else {
- switch (c) {
- case '.':
- case '?':
- case '+':
- case '*':
- case '^':
- case '$':
- case '\\':
- case '[':
- case '{':
- case '(':
- case ')':
- case '|': {
- like_pattern.push_back('\\');
- like_pattern.push_back(c);
- escaped = false;
- break;
- }
- default: {
- like_pattern.push_back(c);
- escaped = false;
- break;
- }
- }
- }
- }
- like_pattern.append("$)");
- return like_pattern;
-}
-
-// Evaluate a SQL-like LIKE pattern by translating it to a regexp or
-// substring search as appropriate. See what Apache Impala does:
-// https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
-template <typename StringType>
-struct MatchLike {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-    // NOTE: keep these constants function-local to avoid compiling the regexes at startup
- // A LIKE pattern matching this regex can be translated into a substring search.
- static const RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)");
- // A LIKE pattern matching this regex can be translated into a prefix search.
- static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)");
- // A LIKE pattern matching this regex can be translated into a suffix search.
- static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))");
-
- auto original_options = MatchSubstringState::Get(ctx);
- auto original_state = ctx->state();
-
- Status status;
- std::string pattern;
- if (!original_options.ignore_case &&
- re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
- &pattern)) {
- MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
- MatchSubstringState converted_state(converted_options);
- ctx->SetState(&converted_state);
- status = MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out);
- } else if (!original_options.ignore_case &&
- re2::RE2::FullMatch(original_options.pattern, kLikePatternIsStartsWith,
- &pattern)) {
- MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
- MatchSubstringState converted_state(converted_options);
- ctx->SetState(&converted_state);
- status = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec(ctx, batch, out);
- } else if (!original_options.ignore_case &&
- re2::RE2::FullMatch(original_options.pattern, kLikePatternIsEndsWith,
- &pattern)) {
- MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
- MatchSubstringState converted_state(converted_options);
- ctx->SetState(&converted_state);
- status = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec(ctx, batch, out);
- } else {
- MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
- original_options.ignore_case};
- MatchSubstringState converted_state(converted_options);
- ctx->SetState(&converted_state);
- status = MatchSubstring<StringType, RegexSubstringMatcher>::Exec(ctx, batch, out);
- }
- ctx->SetState(original_state);
- return status;
- }
-};
-
-const FunctionDoc match_like_doc(
- "Match strings against SQL-style LIKE pattern",
-    ("For each string in `strings`, emit true iff it fully matches a given "
-     "pattern. That is, '%' will match any number of characters, '_' will "
- "match exactly one character, and any other character matches itself. To "
- "match a literal '%', '_', or '\\', precede the character with a backslash.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
- {"strings"}, "MatchSubstringOptions");
-
-#endif
-
+const FunctionDoc match_substring_doc(
+ "Match strings against literal pattern",
+ ("For each string in `strings`, emit true iff it contains a given pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+const FunctionDoc starts_with_doc(
+ "Check if strings start with a literal pattern",
+ ("For each string in `strings`, emit true iff it starts with a given pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+const FunctionDoc ends_with_doc(
+ "Check if strings end with a literal pattern",
+ ("For each string in `strings`, emit true iff it ends with a given pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+#ifdef ARROW_WITH_RE2
+const FunctionDoc match_substring_regex_doc(
+ "Match strings against regex pattern",
+ ("For each string in `strings`, emit true iff it matches a given pattern at any "
+ "position.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+// SQL LIKE match
+
+/// Convert a SQL-style LIKE pattern (using '%' and '_') into a regex pattern
+std::string MakeLikeRegex(const MatchSubstringOptions& options) {
+ // Allow . to match \n
+ std::string like_pattern = "(?s:^";
+ like_pattern.reserve(options.pattern.size() + 7);
+ bool escaped = false;
+ for (const char c : options.pattern) {
+ if (!escaped && c == '%') {
+ like_pattern.append(".*");
+ } else if (!escaped && c == '_') {
+ like_pattern.append(".");
+ } else if (!escaped && c == '\\') {
+ escaped = true;
+ } else {
+ switch (c) {
+ case '.':
+ case '?':
+ case '+':
+ case '*':
+ case '^':
+ case '$':
+ case '\\':
+ case '[':
+ case '{':
+ case '(':
+ case ')':
+ case '|': {
+ like_pattern.push_back('\\');
+ like_pattern.push_back(c);
+ escaped = false;
+ break;
+ }
+ default: {
+ like_pattern.push_back(c);
+ escaped = false;
+ break;
+ }
+ }
+ }
+ }
+ like_pattern.append("$)");
+ return like_pattern;
+}
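[Editor's note] Two worked translations of MakeLikeRegex:

    // "foo%bar_"  ->  "(?s:^foo.*bar.$)"
    // "100\%"     ->  "(?s:^100%$)"    (the escaped '%' is matched literally)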
+
+// Evaluate a SQL-like LIKE pattern by translating it to a regexp or
+// substring search as appropriate. See what Apache Impala does:
+// https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
+template <typename StringType>
+struct MatchLike {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // NOTE: keep these constants function-local to avoid compiling the regexes at startup
+ // A LIKE pattern matching this regex can be translated into a substring search.
+ static const RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)");
+ // A LIKE pattern matching this regex can be translated into a prefix search.
+ static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)");
+ // A LIKE pattern matching this regex can be translated into a suffix search.
+ static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))");
+
+ auto original_options = MatchSubstringState::Get(ctx);
+ auto original_state = ctx->state();
+
+ Status status;
+ std::string pattern;
+ if (!original_options.ignore_case &&
+ re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
+ &pattern)) {
+ MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out);
+ } else if (!original_options.ignore_case &&
+ re2::RE2::FullMatch(original_options.pattern, kLikePatternIsStartsWith,
+ &pattern)) {
+ MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec(ctx, batch, out);
+ } else if (!original_options.ignore_case &&
+ re2::RE2::FullMatch(original_options.pattern, kLikePatternIsEndsWith,
+ &pattern)) {
+ MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec(ctx, batch, out);
+ } else {
+ MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
+ original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring<StringType, RegexSubstringMatcher>::Exec(ctx, batch, out);
+ }
+ ctx->SetState(original_state);
+ return status;
+ }
+};
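[Editor's note] The effect of the three classifier regexes on a few representative patterns (derived from the code above, not an exhaustive specification):

    // "%needle%"  ->  PlainSubstringMatcher  on "needle"
    // "needle%"   ->  PlainStartsWithMatcher on "needle"
    // "%needle"   ->  PlainEndsWithMatcher   on "needle"
    // "ne%dle"    ->  RegexSubstringMatcher  on "(?s:^ne.*dle$)"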
+
+const FunctionDoc match_like_doc(
+ "Match strings against SQL-style LIKE pattern",
+    ("For each string in `strings`, emit true iff it fully matches a given "
+     "pattern. That is, '%' will match any number of characters, '_' will "
+ "match exactly one character, and any other character matches itself. To "
+ "match a literal '%', '_', or '\\', precede the character with a backslash.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+
+#endif
+
void AddMatchSubstring(FunctionRegistry* registry) {
- {
- auto func = std::make_shared<ScalarFunction>("match_substring", Arity::Unary(),
- &match_substring_doc);
- auto exec_32 = MatchSubstring<StringType, PlainSubstringMatcher>::Exec;
- auto exec_64 = MatchSubstring<LargeStringType, PlainSubstringMatcher>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
- {
- auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(),
- &match_substring_doc);
- auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec;
- auto exec_64 = MatchSubstring<LargeStringType, PlainStartsWithMatcher>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
- {
- auto func = std::make_shared<ScalarFunction>("ends_with", Arity::Unary(),
- &match_substring_doc);
- auto exec_32 = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec;
- auto exec_64 = MatchSubstring<LargeStringType, PlainEndsWithMatcher>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-#ifdef ARROW_WITH_RE2
- {
- auto func = std::make_shared<ScalarFunction>("match_substring_regex", Arity::Unary(),
- &match_substring_regex_doc);
- auto exec_32 = MatchSubstring<StringType, RegexSubstringMatcher>::Exec;
- auto exec_64 = MatchSubstring<LargeStringType, RegexSubstringMatcher>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
- {
- auto func =
- std::make_shared<ScalarFunction>("match_like", Arity::Unary(), &match_like_doc);
- auto exec_32 = MatchLike<StringType>::Exec;
- auto exec_64 = MatchLike<LargeStringType>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-#endif
-}
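[Editor's note] A hedged usage sketch of the functions registered above ("starts_with" and MatchSubstringOptions are as registered in this file; the helper is illustrative):

    #include <arrow/api.h>
    #include <arrow/compute/api.h>

    arrow::Result<arrow::Datum> StartsWithFoo(
        const std::shared_ptr<arrow::Array>& strings) {
      arrow::compute::MatchSubstringOptions opts("foo", /*ignore_case=*/false);
      return arrow::compute::CallFunction("starts_with", {strings}, &opts);
    }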
-
-// Substring find - lfind/index/etc.
-
-struct FindSubstring {
- const PlainSubstringMatcher matcher_;
-
- explicit FindSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
-
- template <typename OutValue, typename... Ignored>
- OutValue Call(KernelContext*, util::string_view val, Status*) const {
- return static_cast<OutValue>(matcher_.Find(val));
- }
-};
-
-#ifdef ARROW_WITH_RE2
-struct FindSubstringRegex {
- std::unique_ptr<RE2> regex_match_;
-
- explicit FindSubstringRegex(const MatchSubstringOptions& options,
- bool literal = false) {
- std::string regex = "(";
- regex.reserve(options.pattern.length() + 2);
- regex += literal ? RE2::QuoteMeta(options.pattern) : options.pattern;
- regex += ")";
- regex_match_.reset(new RE2(std::move(regex), RegexSubstringMatcher::MakeRE2Options(
- options, /*literal=*/false)));
- }
-
- template <typename OutValue, typename... Ignored>
- OutValue Call(KernelContext*, util::string_view val, Status*) const {
- re2::StringPiece piece(val.data(), val.length());
- re2::StringPiece match;
- if (re2::RE2::PartialMatch(piece, *regex_match_, &match)) {
- return static_cast<OutValue>(match.data() - piece.data());
- }
- return -1;
- }
-};
-#endif
-
-template <typename InputType>
-struct FindSubstringExec {
- using OffsetType = typename TypeTraits<InputType>::OffsetType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
- if (options.ignore_case) {
-#ifdef ARROW_WITH_RE2
- applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
- kernel{FindSubstringRegex(options, /*literal=*/true)};
- return kernel.Exec(ctx, batch, out);
-#endif
- return Status::NotImplemented("ignore_case requires RE2");
- }
- applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstring> kernel{
- FindSubstring(PlainSubstringMatcher(options))};
- return kernel.Exec(ctx, batch, out);
- }
-};
-
-const FunctionDoc find_substring_doc(
- "Find first occurrence of substring",
- ("For each string in `strings`, emit the index of the first occurrence of the given "
- "pattern, or -1 if not found.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
- {"strings"}, "MatchSubstringOptions");
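[Editor's note] Behavioral sketch of the kernel above, following the KMP Find semantics (indices are byte offsets into the string):

    // find_substring("mississippi", pattern="ss")  ->  2
    // find_substring("banana",      pattern="xy")  ->  -1   (not found)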
-
-#ifdef ARROW_WITH_RE2
-template <typename InputType>
-struct FindSubstringRegexExec {
- using OffsetType = typename TypeTraits<InputType>::OffsetType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
- applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
- kernel{FindSubstringRegex(options, /*literal=*/false)};
- return kernel.Exec(ctx, batch, out);
- }
-};
-
-const FunctionDoc find_substring_regex_doc(
- "Find location of first match of regex pattern",
- ("For each string in `strings`, emit the index of the first match of the given "
- "pattern, or -1 if not found.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
- {"strings"}, "MatchSubstringOptions");
-#endif
-
-void AddFindSubstring(FunctionRegistry* registry) {
- {
- auto func = std::make_shared<ScalarFunction>("find_substring", Arity::Unary(),
- &find_substring_doc);
- for (const auto& ty : BaseBinaryTypes()) {
- auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
- DCHECK_OK(func->AddKernel({ty}, offset_type,
- GenerateTypeAgnosticVarBinaryBase<FindSubstringExec>(ty),
- MatchSubstringState::Init));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-#ifdef ARROW_WITH_RE2
- {
- auto func = std::make_shared<ScalarFunction>("find_substring_regex", Arity::Unary(),
- &find_substring_regex_doc);
- for (const auto& ty : BaseBinaryTypes()) {
- auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
- DCHECK_OK(
- func->AddKernel({ty}, offset_type,
- GenerateTypeAgnosticVarBinaryBase<FindSubstringRegexExec>(ty),
- MatchSubstringState::Init));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-#endif
-}
-
-// Substring count
-
-struct CountSubstring {
- const PlainSubstringMatcher matcher_;
-
- explicit CountSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
-
- template <typename OutValue, typename... Ignored>
- OutValue Call(KernelContext*, util::string_view val, Status*) const {
- OutValue count = 0;
- uint64_t start = 0;
- const auto pattern_size = std::max<uint64_t>(1, matcher_.options_.pattern.size());
- while (start <= val.size()) {
- const int64_t index = matcher_.Find(val.substr(start));
- if (index >= 0) {
- count++;
- start += index + pattern_size;
- } else {
- break;
- }
- }
- return count;
- }
-};
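[Editor's note] The counting is non-overlapping: start jumps past each match by index + pattern_size. A worked example:

    // count_substring("aaaa", pattern="aa")  ->  2
    // (matches at 0 and 2; the overlapping match at 1 is skipped)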
-
-#ifdef ARROW_WITH_RE2
-struct CountSubstringRegex {
- std::unique_ptr<RE2> regex_match_;
-
- explicit CountSubstringRegex(const MatchSubstringOptions& options, bool literal = false)
- : regex_match_(new RE2(options.pattern,
- RegexSubstringMatcher::MakeRE2Options(options, literal))) {}
-
- static Result<CountSubstringRegex> Make(const MatchSubstringOptions& options,
- bool literal = false) {
- CountSubstringRegex counter(options, literal);
- RETURN_NOT_OK(RegexStatus(*counter.regex_match_));
- return std::move(counter);
- }
-
- template <typename OutValue, typename... Ignored>
- OutValue Call(KernelContext*, util::string_view val, Status*) const {
- OutValue count = 0;
- re2::StringPiece input(val.data(), val.size());
- auto last_size = input.size();
- while (re2::RE2::FindAndConsume(&input, *regex_match_)) {
- count++;
- if (last_size == input.size()) {
- // 0-length match
- if (input.size() > 0) {
- input.remove_prefix(1);
- } else {
- break;
- }
- }
- last_size = input.size();
- }
- return count;
- }
-};
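[Editor's note] The last_size bookkeeping above guards against zero-length regex matches: if FindAndConsume matched but consumed nothing, one byte is dropped manually so the loop always makes progress.

    // e.g. pattern "a*" against "bb": every match is empty, so remove_prefix(1)
    // advances past each position instead of looping on the same spot.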
-
-template <typename InputType>
-struct CountSubstringRegexExec {
- using OffsetType = typename TypeTraits<InputType>::OffsetType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
- ARROW_ASSIGN_OR_RAISE(auto counter, CountSubstringRegex::Make(options));
- applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
- kernel{std::move(counter)};
- return kernel.Exec(ctx, batch, out);
- }
-};
-#endif
-
-template <typename InputType>
-struct CountSubstringExec {
- using OffsetType = typename TypeTraits<InputType>::OffsetType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
- if (options.ignore_case) {
-#ifdef ARROW_WITH_RE2
- ARROW_ASSIGN_OR_RAISE(auto counter,
- CountSubstringRegex::Make(options, /*literal=*/true));
- applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
- kernel{std::move(counter)};
- return kernel.Exec(ctx, batch, out);
-#else
- return Status::NotImplemented("ignore_case requires RE2");
-#endif
- }
- applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstring> kernel{
- CountSubstring(PlainSubstringMatcher(options))};
- return kernel.Exec(ctx, batch, out);
- }
-};
-
-const FunctionDoc count_substring_doc(
- "Count occurrences of substring",
- ("For each string in `strings`, emit the number of occurrences of the given "
- "pattern.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
- {"strings"}, "MatchSubstringOptions");
-
-#ifdef ARROW_WITH_RE2
-const FunctionDoc count_substring_regex_doc(
-    "Count occurrences of regex pattern",
- ("For each string in `strings`, emit the number of occurrences of the given "
- "regex pattern.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
- {"strings"}, "MatchSubstringOptions");
-#endif
-
-void AddCountSubstring(FunctionRegistry* registry) {
- {
- auto func = std::make_shared<ScalarFunction>("count_substring", Arity::Unary(),
- &count_substring_doc);
- for (const auto& ty : BaseBinaryTypes()) {
- auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
- DCHECK_OK(func->AddKernel({ty}, offset_type,
- GenerateTypeAgnosticVarBinaryBase<CountSubstringExec>(ty),
- MatchSubstringState::Init));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-#ifdef ARROW_WITH_RE2
- {
- auto func = std::make_shared<ScalarFunction>("count_substring_regex", Arity::Unary(),
- &count_substring_regex_doc);
- for (const auto& ty : BaseBinaryTypes()) {
- auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
- DCHECK_OK(
- func->AddKernel({ty}, offset_type,
- GenerateTypeAgnosticVarBinaryBase<CountSubstringRegexExec>(ty),
- MatchSubstringState::Init));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-#endif
-}
-
-// Slicing
-
-struct SliceTransformBase : public StringTransformBase {
- using State = OptionsWrapper<SliceOptions>;
-
- const SliceOptions* options;
-
- Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
- options = &State::Get(ctx);
- if (options->step == 0) {
- return Status::Invalid("Slice step cannot be zero");
- }
- return Status::OK();
- }
-};
-
-struct SliceCodeunitsTransform : SliceTransformBase {
- int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
- const SliceOptions& opt = *this->options;
- if ((opt.start >= 0) != (opt.stop >= 0)) {
- // If start and stop don't have the same sign, we can't guess an upper bound
- // on the resulting slice lengths, so return a worst case estimate.
- return input_ncodeunits;
- }
- int64_t max_slice_codepoints = (opt.stop - opt.start + opt.step - 1) / opt.step;
- // The maximum UTF8 byte size of a codepoint is 4
- return std::min(input_ncodeunits,
- 4 * ninputs * std::max<int64_t>(0, max_slice_codepoints));
- }
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- if (options->step >= 1) {
- return SliceForward(input, input_string_ncodeunits, output);
- }
- return SliceBackward(input, input_string_ncodeunits, output);
- }
-
-#define RETURN_IF_UTF8_ERROR(expr) \
- do { \
- if (ARROW_PREDICT_FALSE(!expr)) { \
- return kTransformError; \
- } \
- } while (0)
-
- int64_t SliceForward(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- // Slice in forward order (step > 0)
- const SliceOptions& opt = *this->options;
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t* begin_sliced = begin;
- const uint8_t* end_sliced = end;
-
- // First, compute begin_sliced and end_sliced
- if (opt.start >= 0) {
- // start counting from the left
- RETURN_IF_UTF8_ERROR(
- arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start));
- if (opt.stop > opt.start) {
- // continue counting from begin_sliced
- const int64_t length = opt.stop - opt.start;
- RETURN_IF_UTF8_ERROR(
- arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length));
- } else if (opt.stop < 0) {
-        // or from the end (end_sliced will never end up before begin_sliced)
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
- begin_sliced, end, &end_sliced, -opt.stop));
- } else {
- // zero length slice
- return 0;
- }
- } else {
- // start counting from the right
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
- begin, end, &begin_sliced, -opt.start));
- if (opt.stop > 0) {
- // continue counting from the left, we cannot start from begin_sliced because we
- // don't know how many codepoints are between begin and begin_sliced
- RETURN_IF_UTF8_ERROR(
- arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop));
- // and therefore we also need this check
- if (end_sliced <= begin_sliced) {
- // zero length slice
- return 0;
- }
- } else if ((opt.stop < 0) && (opt.stop > opt.start)) {
- // stop is negative, but larger than start, so we count again from the right
- // in some cases we can optimize this, depending on the shortest path (from end
- // or begin_sliced), but begin_sliced and opt.start can be 'out of sync',
- // for instance when start=-100 and the string length is only 10.
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
- begin_sliced, end, &end_sliced, -opt.stop));
- } else {
- // zero length slice
- return 0;
- }
- }
-
- // Second, copy computed slice to output
- DCHECK(begin_sliced <= end_sliced);
- if (opt.step == 1) {
- // fast case, where we can simply finish with a memcpy
- std::copy(begin_sliced, end_sliced, output);
- return end_sliced - begin_sliced;
- }
- uint8_t* dest = output;
- const uint8_t* i = begin_sliced;
-
- while (i < end_sliced) {
- uint32_t codepoint = 0;
- // write a single codepoint
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
- dest = arrow::util::UTF8Encode(dest, codepoint);
- // and skip the remainder
- int64_t skips = opt.step - 1;
- while ((skips--) && (i < end_sliced)) {
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
- }
- }
- return dest - output;
- }
-
- int64_t SliceBackward(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- // Slice in reverse order (step < 0)
- const SliceOptions& opt = *this->options;
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t* begin_sliced = begin;
- const uint8_t* end_sliced = end;
-
- // Serious +1 -1 kung fu because begin_sliced and end_sliced act like
- // reverse iterators.
- if (opt.start >= 0) {
- // +1 because begin_sliced acts as the end of a reverse iterator
- RETURN_IF_UTF8_ERROR(
- arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start + 1));
- } else {
- // -1 because start=-1 means the last codepoint, i.e. 0 advances from the end
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
- begin, end, &begin_sliced, -opt.start - 1));
- }
- // make it point at the last codeunit of the preceding codepoint
- begin_sliced--;
-
- // similar to opt.start
- if (opt.stop >= 0) {
- RETURN_IF_UTF8_ERROR(
- arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop + 1));
- } else {
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
- begin, end, &end_sliced, -opt.stop - 1));
- }
- end_sliced--;
-
- // Copy computed slice to output
- uint8_t* dest = output;
- const uint8_t* i = begin_sliced;
- while (i > end_sliced) {
- uint32_t codepoint = 0;
- // write a single codepoint
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
- dest = arrow::util::UTF8Encode(dest, codepoint);
- // and skip the remainder
- int64_t skips = -opt.step - 1;
- while ((skips--) && (i > end_sliced)) {
- RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
- }
- }
- return dest - output;
- }
-
-#undef RETURN_IF_UTF8_ERROR
-};
-
-template <typename Type>
-using SliceCodeunits = StringTransformExec<Type, SliceCodeunitsTransform>;
-
-const FunctionDoc utf8_slice_codeunits_doc(
- "Slice string",
- ("For each string in `strings`, slice into a substring defined by\n"
- "(`start`, `stop`, `step`) as given by `SliceOptions`, where `start` is\n"
- "inclusive and `stop` is exclusive, both measured in codeunits. If `step` is\n"
- "negative, the string is traversed in reverse order. A `step` of zero is\n"
- "considered an error.\n"
- "Null inputs emit null."),
- {"strings"}, "SliceOptions");
-
-void AddSlice(FunctionRegistry* registry) {
- auto func = std::make_shared<ScalarFunction>("utf8_slice_codeunits", Arity::Unary(),
- &utf8_slice_codeunits_doc);
- using t32 = SliceCodeunits<StringType>;
- using t64 = SliceCodeunits<LargeStringType>;
+ {
+ auto func = std::make_shared<ScalarFunction>("match_substring", Arity::Unary(),
+ &match_substring_doc);
+ auto exec_32 = MatchSubstring<StringType, PlainSubstringMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, PlainSubstringMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(),
+ &match_substring_doc);
+ auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, PlainStartsWithMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<ScalarFunction>("ends_with", Arity::Unary(),
+ &match_substring_doc);
+ auto exec_32 = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, PlainEndsWithMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#ifdef ARROW_WITH_RE2
+ {
+ auto func = std::make_shared<ScalarFunction>("match_substring_regex", Arity::Unary(),
+ &match_substring_regex_doc);
+ auto exec_32 = MatchSubstring<StringType, RegexSubstringMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, RegexSubstringMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func =
+ std::make_shared<ScalarFunction>("match_like", Arity::Unary(), &match_like_doc);
+ auto exec_32 = MatchLike<StringType>::Exec;
+ auto exec_64 = MatchLike<LargeStringType>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#endif
+}
+
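+// Usage sketch (editorial note, not from the upstream sources): once
+// registered, the predicates above are reachable through the generic
+// compute entry point. Assumes arrow::compute::CallFunction and
+// MatchSubstringOptions from "arrow/compute/api.h".
+//
+//   #include "arrow/compute/api.h"
+//
+//   arrow::Result<arrow::Datum> MatchExample(
+//       const std::shared_ptr<arrow::Array>& strings) {
+//     arrow::compute::MatchSubstringOptions opts("foo", /*ignore_case=*/false);
+//     // Boolean output: true where "foo" occurs in the input string.
+//     // "starts_with" and "ends_with" accept the same options.
+//     return arrow::compute::CallFunction("match_substring", {strings}, &opts);
+//   }
+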
+// Substring find - lfind/index/etc.
+
+struct FindSubstring {
+ const PlainSubstringMatcher matcher_;
+
+ explicit FindSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
+
+ template <typename OutValue, typename... Ignored>
+ OutValue Call(KernelContext*, util::string_view val, Status*) const {
+ return static_cast<OutValue>(matcher_.Find(val));
+ }
+};
+
+#ifdef ARROW_WITH_RE2
+struct FindSubstringRegex {
+ std::unique_ptr<RE2> regex_match_;
+
+ explicit FindSubstringRegex(const MatchSubstringOptions& options,
+ bool literal = false) {
+ std::string regex = "(";
+ regex.reserve(options.pattern.length() + 2);
+ regex += literal ? RE2::QuoteMeta(options.pattern) : options.pattern;
+ regex += ")";
+ regex_match_.reset(new RE2(std::move(regex), RegexSubstringMatcher::MakeRE2Options(
+ options, /*literal=*/false)));
+ }
+
+ template <typename OutValue, typename... Ignored>
+ OutValue Call(KernelContext*, util::string_view val, Status*) const {
+ re2::StringPiece piece(val.data(), val.length());
+ re2::StringPiece match;
+ if (re2::RE2::PartialMatch(piece, *regex_match_, &match)) {
+ return static_cast<OutValue>(match.data() - piece.data());
+ }
+ return -1;
+ }
+};
+#endif
+
+template <typename InputType>
+struct FindSubstringExec {
+ using OffsetType = typename TypeTraits<InputType>::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
+ kernel{FindSubstringRegex(options, /*literal=*/true)};
+ return kernel.Exec(ctx, batch, out);
+#endif
+ return Status::NotImplemented("ignore_case requires RE2");
+ }
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstring> kernel{
+ FindSubstring(PlainSubstringMatcher(options))};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+const FunctionDoc find_substring_doc(
+ "Find first occurrence of substring",
+ ("For each string in `strings`, emit the index of the first occurrence of the given "
+ "pattern, or -1 if not found.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+
+#ifdef ARROW_WITH_RE2
+template <typename InputType>
+struct FindSubstringRegexExec {
+ using OffsetType = typename TypeTraits<InputType>::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
+ kernel{FindSubstringRegex(options, /*literal=*/false)};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+const FunctionDoc find_substring_regex_doc(
+ "Find location of first match of regex pattern",
+ ("For each string in `strings`, emit the index of the first match of the given "
+ "pattern, or -1 if not found.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+#endif
+
+void AddFindSubstring(FunctionRegistry* registry) {
+ {
+ auto func = std::make_shared<ScalarFunction>("find_substring", Arity::Unary(),
+ &find_substring_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
+ DCHECK_OK(func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase<FindSubstringExec>(ty),
+ MatchSubstringState::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#ifdef ARROW_WITH_RE2
+ {
+ auto func = std::make_shared<ScalarFunction>("find_substring_regex", Arity::Unary(),
+ &find_substring_regex_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
+ DCHECK_OK(
+ func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase<FindSubstringRegexExec>(ty),
+ MatchSubstringState::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#endif
+}
+
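+// Usage sketch (editorial note; assumes arrow::compute::CallFunction from
+// "arrow/compute/api.h"):
+//
+//   arrow::Result<arrow::Datum> FindExample(
+//       const std::shared_ptr<arrow::Array>& strings) {
+//     arrow::compute::MatchSubstringOptions opts("ab");
+//     // Per-string offset of the first occurrence, or -1 when absent:
+//     // ["abc", "cab", "xyz"] -> [0, 1, -1]. The output is int32 for utf8
+//     // and int64 for large_utf8, per the kernels registered above.
+//     return arrow::compute::CallFunction("find_substring", {strings}, &opts);
+//   }
+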
+// Substring count
+
+struct CountSubstring {
+ const PlainSubstringMatcher matcher_;
+
+ explicit CountSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}
+
+ template <typename OutValue, typename... Ignored>
+ OutValue Call(KernelContext*, util::string_view val, Status*) const {
+ OutValue count = 0;
+ uint64_t start = 0;
+ const auto pattern_size = std::max<uint64_t>(1, matcher_.options_.pattern.size());
+ while (start <= val.size()) {
+ const int64_t index = matcher_.Find(val.substr(start));
+ if (index >= 0) {
+ count++;
+ start += index + pattern_size;
+ } else {
+ break;
+ }
+ }
+ return count;
+ }
+};
+
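+// Worked example of the scan above (editorial note): matches are counted
+// without overlap, since `start` jumps past each full match, so counting
+// "aa" in "aaaa" yields 2 rather than 3. For an empty pattern the advance
+// is clamped to one code unit, so it matches at every position plus the
+// end: N+1 hits for an N-byte string, matching Python's str.count("").
+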
+#ifdef ARROW_WITH_RE2
+struct CountSubstringRegex {
+ std::unique_ptr<RE2> regex_match_;
+
+ explicit CountSubstringRegex(const MatchSubstringOptions& options, bool literal = false)
+ : regex_match_(new RE2(options.pattern,
+ RegexSubstringMatcher::MakeRE2Options(options, literal))) {}
+
+ static Result<CountSubstringRegex> Make(const MatchSubstringOptions& options,
+ bool literal = false) {
+ CountSubstringRegex counter(options, literal);
+ RETURN_NOT_OK(RegexStatus(*counter.regex_match_));
+ return std::move(counter);
+ }
+
+ template <typename OutValue, typename... Ignored>
+ OutValue Call(KernelContext*, util::string_view val, Status*) const {
+ OutValue count = 0;
+ re2::StringPiece input(val.data(), val.size());
+ auto last_size = input.size();
+ while (re2::RE2::FindAndConsume(&input, *regex_match_)) {
+ count++;
+ if (last_size == input.size()) {
+ // 0-length match
+ if (input.size() > 0) {
+ input.remove_prefix(1);
+ } else {
+ break;
+ }
+ }
+ last_size = input.size();
+ }
+ return count;
+ }
+};
+
+template <typename InputType>
+struct CountSubstringRegexExec {
+ using OffsetType = typename TypeTraits<InputType>::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+ ARROW_ASSIGN_OR_RAISE(auto counter, CountSubstringRegex::Make(options));
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
+ kernel{std::move(counter)};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+#endif
+
+template <typename InputType>
+struct CountSubstringExec {
+ using OffsetType = typename TypeTraits<InputType>::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ ARROW_ASSIGN_OR_RAISE(auto counter,
+ CountSubstringRegex::Make(options, /*literal=*/true));
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstringRegex>
+ kernel{std::move(counter)};
+ return kernel.Exec(ctx, batch, out);
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, CountSubstring> kernel{
+ CountSubstring(PlainSubstringMatcher(options))};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+const FunctionDoc count_substring_doc(
+ "Count occurrences of substring",
+ ("For each string in `strings`, emit the number of occurrences of the given "
+ "pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+
+#ifdef ARROW_WITH_RE2
+const FunctionDoc count_substring_regex_doc(
+ "Count occurrences of substring",
+ ("For each string in `strings`, emit the number of occurrences of the given "
+ "regex pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+#endif
+
+void AddCountSubstring(FunctionRegistry* registry) {
+ {
+ auto func = std::make_shared<ScalarFunction>("count_substring", Arity::Unary(),
+ &count_substring_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
+ DCHECK_OK(func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase<CountSubstringExec>(ty),
+ MatchSubstringState::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#ifdef ARROW_WITH_RE2
+ {
+ auto func = std::make_shared<ScalarFunction>("count_substring_regex", Arity::Unary(),
+ &count_substring_regex_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
+ DCHECK_OK(
+ func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase<CountSubstringRegexExec>(ty),
+ MatchSubstringState::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#endif
+}
+
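+// Usage sketch (editorial note; assumes arrow::compute::CallFunction from
+// "arrow/compute/api.h"):
+//
+//   arrow::Result<arrow::Datum> CountExample(
+//       const std::shared_ptr<arrow::Array>& strings) {
+//     arrow::compute::MatchSubstringOptions opts("na");
+//     // ["banana", null] -> [2, null]; the count type follows the input's
+//     // offset width (int32 for utf8, int64 for large_utf8).
+//     return arrow::compute::CallFunction("count_substring", {strings}, &opts);
+//   }
+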
+// Slicing
+
+struct SliceTransformBase : public StringTransformBase {
+ using State = OptionsWrapper<SliceOptions>;
+
+ const SliceOptions* options;
+
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ options = &State::Get(ctx);
+ if (options->step == 0) {
+ return Status::Invalid("Slice step cannot be zero");
+ }
+ return Status::OK();
+ }
+};
+
+struct SliceCodeunitsTransform : SliceTransformBase {
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ const SliceOptions& opt = *this->options;
+ if ((opt.start >= 0) != (opt.stop >= 0)) {
+ // If start and stop don't have the same sign, we can't guess an upper bound
+ // on the resulting slice lengths, so return a worst case estimate.
+ return input_ncodeunits;
+ }
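+ // Ceiling division in integer arithmetic: at most
+ // ceil((stop - start) / step) codepoints can be selected per slice.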
+ int64_t max_slice_codepoints = (opt.stop - opt.start + opt.step - 1) / opt.step;
+ // The maximum UTF8 byte size of a codepoint is 4
+ return std::min(input_ncodeunits,
+ 4 * ninputs * std::max<int64_t>(0, max_slice_codepoints));
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ if (options->step >= 1) {
+ return SliceForward(input, input_string_ncodeunits, output);
+ }
+ return SliceBackward(input, input_string_ncodeunits, output);
+ }
+
+#define RETURN_IF_UTF8_ERROR(expr) \
+ do { \
+ if (ARROW_PREDICT_FALSE(!expr)) { \
+ return kTransformError; \
+ } \
+ } while (0)
+
+ int64_t SliceForward(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ // Slice in forward order (step > 0)
+ const SliceOptions& opt = *this->options;
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* begin_sliced = begin;
+ const uint8_t* end_sliced = end;
+
+ // First, compute begin_sliced and end_sliced
+ if (opt.start >= 0) {
+ // start counting from the left
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start));
+ if (opt.stop > opt.start) {
+ // continue counting from begin_sliced
+ const int64_t length = opt.stop - opt.start;
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length));
+ } else if (opt.stop < 0) {
+ // or count from the end (but we will never need to go before begin_sliced)
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin_sliced, end, &end_sliced, -opt.stop));
+ } else {
+ // zero length slice
+ return 0;
+ }
+ } else {
+ // start counting from the right
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin, end, &begin_sliced, -opt.start));
+ if (opt.stop > 0) {
+ // continue counting from the left; we cannot start from begin_sliced because we
+ // don't know how many codepoints lie between begin and begin_sliced
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop));
+ // and therefore we also need this check
+ if (end_sliced <= begin_sliced) {
+ // zero length slice
+ return 0;
+ }
+ } else if ((opt.stop < 0) && (opt.stop > opt.start)) {
+ // stop is negative, but larger than start, so we count again from the right
+ // in some cases we can optimize this, depending on the shortest path (from end
+ // or begin_sliced), but begin_sliced and opt.start can be 'out of sync',
+ // for instance when start=-100 and the string length is only 10.
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin_sliced, end, &end_sliced, -opt.stop));
+ } else {
+ // zero length slice
+ return 0;
+ }
+ }
+
+ // Second, copy computed slice to output
+ DCHECK(begin_sliced <= end_sliced);
+ if (opt.step == 1) {
+ // fast case, where we can simply finish with a memcpy
+ std::copy(begin_sliced, end_sliced, output);
+ return end_sliced - begin_sliced;
+ }
+ uint8_t* dest = output;
+ const uint8_t* i = begin_sliced;
+
+ while (i < end_sliced) {
+ uint32_t codepoint = 0;
+ // write a single codepoint
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
+ dest = arrow::util::UTF8Encode(dest, codepoint);
+ // and skip the remainder
+ int64_t skips = opt.step - 1;
+ while ((skips--) && (i < end_sliced)) {
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8Decode(&i, &codepoint));
+ }
+ }
+ return dest - output;
+ }
+
+ int64_t SliceBackward(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ // Slice in reverse order (step < 0)
+ const SliceOptions& opt = *this->options;
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* begin_sliced = begin;
+ const uint8_t* end_sliced = end;
+
+ // Serious +1 -1 kung fu because begin_sliced and end_sliced act like
+ // reverse iterators.
+ if (opt.start >= 0) {
+ // +1 because begin_sliced acts as the end of a reverse iterator
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opt.start + 1));
+ } else {
+ // -1 because start=-1 means the last codepoint, i.e. 0 advances from the end
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin, end, &begin_sliced, -opt.start - 1));
+ }
+ // make it point at the last codeunit of the preceding codepoint
+ begin_sliced--;
+
+ // similar to opt.start
+ if (opt.stop >= 0) {
+ RETURN_IF_UTF8_ERROR(
+ arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop + 1));
+ } else {
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
+ begin, end, &end_sliced, -opt.stop - 1));
+ }
+ end_sliced--;
+
+ // Copy computed slice to output
+ uint8_t* dest = output;
+ const uint8_t* i = begin_sliced;
+ while (i > end_sliced) {
+ uint32_t codepoint = 0;
+ // write a single codepoint
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
+ dest = arrow::util::UTF8Encode(dest, codepoint);
+ // and skip the remainder
+ int64_t skips = -opt.step - 1;
+ while ((skips--) && (i > end_sliced)) {
+ RETURN_IF_UTF8_ERROR(arrow::util::UTF8DecodeReverse(&i, &codepoint));
+ }
+ }
+ return dest - output;
+ }
+
+#undef RETURN_IF_UTF8_ERROR
+};
+
+template <typename Type>
+using SliceCodeunits = StringTransformExec<Type, SliceCodeunitsTransform>;
+
+const FunctionDoc utf8_slice_codeunits_doc(
+ "Slice string",
+ ("For each string in `strings`, slice into a substring defined by\n"
+ "(`start`, `stop`, `step`) as given by `SliceOptions`, where `start` is\n"
+ "inclusive and `stop` is exclusive, both measured in codeunits. If `step` is\n"
+ "negative, the string is traversed in reverse order. A `step` of zero is\n"
+ "considered an error.\n"
+ "Null inputs emit null."),
+ {"strings"}, "SliceOptions");
+
+void AddSlice(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("utf8_slice_codeunits", Arity::Unary(),
+ &utf8_slice_codeunits_doc);
+ using t32 = SliceCodeunits<StringType>;
+ using t64 = SliceCodeunits<LargeStringType>;
DCHECK_OK(
- func->AddKernel({utf8()}, utf8(), t32::Exec, SliceCodeunitsTransform::State::Init));
- DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), t64::Exec,
- SliceCodeunitsTransform::State::Init));
+ func->AddKernel({utf8()}, utf8(), t32::Exec, SliceCodeunitsTransform::State::Init));
+ DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), t64::Exec,
+ SliceCodeunitsTransform::State::Init));
DCHECK_OK(registry->AddFunction(std::move(func)));
}
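// Usage sketch (editorial note; assumes arrow::compute::CallFunction and the
// public SliceOptions(start, stop, step) constructor):
//
//   arrow::Result<arrow::Datum> SliceExample(
//       const std::shared_ptr<arrow::Array>& strings) {
//     arrow::compute::SliceOptions opts(/*start=*/1, /*stop=*/4, /*step=*/1);
//     // "hello" -> "ell": start inclusive, stop exclusive, in codeunits.
//     // A negative step walks the string in reverse.
//     return arrow::compute::CallFunction("utf8_slice_codeunits",
//                                         {strings}, &opts);
//   }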
@@ -1496,8 +1496,8 @@ static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
template <typename Derived, bool allow_empty = false>
struct CharacterPredicateUnicode {
- static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
- Status* st) {
+ static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
+ Status* st) {
if (allow_empty && input_string_ncodeunits == 0) {
return true;
}
@@ -1508,7 +1508,7 @@ struct CharacterPredicateUnicode {
any |= Derived::PredicateCharacterAny(codepoint);
return Derived::PredicateCharacterAll(codepoint);
}))) {
- *st = Status::Invalid("Invalid UTF8 sequence in input");
+ *st = Status::Invalid("Invalid UTF8 sequence in input");
return false;
}
return all & any;
@@ -1521,8 +1521,8 @@ struct CharacterPredicateUnicode {
template <typename Derived, bool allow_empty = false>
struct CharacterPredicateAscii {
- static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
- Status*) {
+ static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
+ Status*) {
if (allow_empty && input_string_ncodeunits == 0) {
return true;
}
@@ -1599,8 +1599,8 @@ struct IsNumericUnicode : CharacterPredicateUnicode<IsNumericUnicode> {
#endif
struct IsAscii {
- static bool Call(KernelContext*, const uint8_t* input,
- size_t input_string_nascii_characters, Status*) {
+ static bool Call(KernelContext*, const uint8_t* input,
+ size_t input_string_nascii_characters, Status*) {
return std::all_of(input, input + input_string_nascii_characters,
IsAsciiCharacter<uint8_t>);
}
@@ -1661,8 +1661,8 @@ struct IsSpaceAscii : CharacterPredicateAscii<IsSpaceAscii> {
#ifdef ARROW_WITH_UTF8PROC
struct IsTitleUnicode {
- static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
- Status* st) {
+ static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
+ Status* st) {
// rules:
// * 1: lower case follows cased
// * 2: upper case follows uncased
@@ -1689,7 +1689,7 @@ struct IsTitleUnicode {
return true;
});
if (!ARROW_PREDICT_TRUE(status)) {
- *st = Status::Invalid("Invalid UTF8 sequence in input");
+ *st = Status::Invalid("Invalid UTF8 sequence in input");
return false;
}
return rules_1_and_2 & rule_3;
@@ -1698,8 +1698,8 @@ struct IsTitleUnicode {
#endif
struct IsTitleAscii {
- static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
- Status*) {
+ static bool Call(KernelContext*, const uint8_t* input, size_t input_string_ncodeunits,
+ Status*) {
// rules:
// * 1: lower case follows cased
// * 2: upper case follows uncased
@@ -1758,1021 +1758,1021 @@ struct IsUpperAscii : CharacterPredicateAscii<IsUpperAscii> {
}
};
-// Splitting
-
-template <typename Options>
-struct SplitFinderBase {
- virtual Status PreExec(const Options& options) { return Status::OK(); }
-
- // Derived classes should also define these methods:
- // static bool Find(const uint8_t* begin, const uint8_t* end,
- // const uint8_t** separator_begin,
- // const uint8_t** separator_end,
- // const SplitPatternOptions& options);
- //
- // static bool FindReverse(const uint8_t* begin, const uint8_t* end,
- // const uint8_t** separator_begin,
- // const uint8_t** separator_end,
- // const SplitPatternOptions& options);
-};
-
-template <typename Type, typename ListType, typename SplitFinder,
- typename Options = typename SplitFinder::Options>
-struct SplitExec {
- using string_offset_type = typename Type::offset_type;
- using list_offset_type = typename ListType::offset_type;
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- using ArrayListType = typename TypeTraits<ListType>::ArrayType;
- using ListScalarType = typename TypeTraits<ListType>::ScalarType;
- using ScalarType = typename TypeTraits<Type>::ScalarType;
- using BuilderType = typename TypeTraits<Type>::BuilderType;
- using ListOffsetsBuilderType = TypedBufferBuilder<list_offset_type>;
- using State = OptionsWrapper<Options>;
-
- // Keep the temporary storage across individual values to minimize reallocations
- std::vector<util::string_view> parts;
- Options options;
-
- explicit SplitExec(const Options& options) : options(options) {}
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return SplitExec{State::Get(ctx)}.Execute(ctx, batch, out);
- }
-
- Status Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- SplitFinder finder;
- RETURN_NOT_OK(finder.PreExec(options));
- if (batch[0].kind() == Datum::ARRAY) {
- return Execute(ctx, &finder, batch[0].array(), out);
- }
- DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
- return Execute(ctx, &finder, batch[0].scalar(), out);
- }
-
- Status Execute(KernelContext* ctx, SplitFinder* finder,
- const std::shared_ptr<ArrayData>& data, Datum* out) {
- const ArrayType input(data);
-
- BuilderType builder(input.type(), ctx->memory_pool());
- // A slight overestimate of the data needed
- RETURN_NOT_OK(builder.ReserveData(input.total_values_length()));
- // The minimum amount of strings needed
- RETURN_NOT_OK(builder.Resize(input.length() - input.null_count()));
-
- ArrayData* output_list = out->mutable_array();
- // List offsets were preallocated
- auto* list_offsets = output_list->GetMutableValues<list_offset_type>(1);
- DCHECK_NE(list_offsets, nullptr);
- // Initial value
- *list_offsets++ = 0;
- for (int64_t i = 0; i < input.length(); ++i) {
- if (!input.IsNull(i)) {
- RETURN_NOT_OK(SplitString(input.GetView(i), finder, &builder));
- if (ARROW_PREDICT_FALSE(builder.length() >
- std::numeric_limits<list_offset_type>::max())) {
- return Status::CapacityError("List offset does not fit into 32 bit");
- }
- }
- *list_offsets++ = static_cast<list_offset_type>(builder.length());
- }
- // Assign string array to list child data
- std::shared_ptr<Array> string_array;
- RETURN_NOT_OK(builder.Finish(&string_array));
- output_list->child_data.push_back(string_array->data());
- return Status::OK();
- }
-
- Status Execute(KernelContext* ctx, SplitFinder* finder,
- const std::shared_ptr<Scalar>& scalar, Datum* out) {
- const auto& input = checked_cast<const ScalarType&>(*scalar);
- auto result = checked_cast<ListScalarType*>(out->scalar().get());
- if (input.is_valid) {
- result->is_valid = true;
- BuilderType builder(input.type, ctx->memory_pool());
- util::string_view s(*input.value);
- RETURN_NOT_OK(SplitString(s, finder, &builder));
- RETURN_NOT_OK(builder.Finish(&result->value));
- }
- return Status::OK();
- }
-
- Status SplitString(const util::string_view& s, SplitFinder* finder,
- BuilderType* builder) {
- const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.data());
- const uint8_t* end = begin + s.length();
-
- int64_t max_splits = options.max_splits;
- // if there is no max_splits limit, reversing does not make sense (and is
- // probably less efficient), but it is useful for testing
- if (options.reverse) {
- // note that i points one past the 'current' position
- const uint8_t* i = end;
- // we will record the parts in reverse order
- parts.clear();
- if (max_splits > -1) {
- parts.reserve(max_splits + 1);
- }
- while (max_splits != 0) {
- const uint8_t *separator_begin, *separator_end;
- // find with whatever algo the part we will 'cut out'
- if (finder->FindReverse(begin, i, &separator_begin, &separator_end, options)) {
- parts.emplace_back(reinterpret_cast<const char*>(separator_end),
- i - separator_end);
- i = separator_begin;
- max_splits--;
- } else {
- // if we cannot find a separator, we're done
- break;
- }
- }
- parts.emplace_back(reinterpret_cast<const char*>(begin), i - begin);
- // now we do the copying
- for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
- RETURN_NOT_OK(builder->Append(*it));
- }
- } else {
- const uint8_t* i = begin;
- while (max_splits != 0) {
- const uint8_t *separator_begin, *separator_end;
- // find with whatever algo the part we will 'cut out'
- if (finder->Find(i, end, &separator_begin, &separator_end, options)) {
- // the part till the beginning of the 'cut'
- RETURN_NOT_OK(
- builder->Append(i, static_cast<string_offset_type>(separator_begin - i)));
- i = separator_end;
- max_splits--;
- } else {
- // if we cannot find a separator, we're done
- break;
- }
- }
- // trailing part
- RETURN_NOT_OK(builder->Append(i, static_cast<string_offset_type>(end - i)));
- }
- return Status::OK();
- }
-};
-
-struct SplitPatternFinder : public SplitFinderBase<SplitPatternOptions> {
- using Options = SplitPatternOptions;
-
- Status PreExec(const SplitPatternOptions& options) override {
- if (options.pattern.length() == 0) {
- return Status::Invalid("Empty separator");
- }
- return Status::OK();
- }
-
- static bool Find(const uint8_t* begin, const uint8_t* end,
- const uint8_t** separator_begin, const uint8_t** separator_end,
- const SplitPatternOptions& options) {
- const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
- const int64_t pattern_length = options.pattern.length();
- const uint8_t* i = begin;
- // this is O(n*m) complexity; we could use the Knuth-Morris-Pratt algorithm used in
- // the match kernel
- while ((i + pattern_length <= end)) {
- i = std::search(i, end, pattern, pattern + pattern_length);
- if (i != end) {
- *separator_begin = i;
- *separator_end = i + pattern_length;
- return true;
- }
- }
- return false;
- }
-
- static bool FindReverse(const uint8_t* begin, const uint8_t* end,
- const uint8_t** separator_begin, const uint8_t** separator_end,
- const SplitPatternOptions& options) {
- const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
- const int64_t pattern_length = options.pattern.length();
- // this is O(n*m) complexity; we could use the Knuth-Morris-Pratt algorithm used in
- // the match kernel
- std::reverse_iterator<const uint8_t*> ri(end);
- std::reverse_iterator<const uint8_t*> rend(begin);
- std::reverse_iterator<const uint8_t*> pattern_rbegin(pattern + pattern_length);
- std::reverse_iterator<const uint8_t*> pattern_rend(pattern);
- while (begin <= ri.base() - pattern_length) {
- ri = std::search(ri, rend, pattern_rbegin, pattern_rend);
- if (ri != rend) {
- *separator_begin = ri.base() - pattern_length;
- *separator_end = ri.base();
- return true;
- }
- }
- return false;
- }
-};
-
-template <typename Type, typename ListType>
-using SplitPatternExec = SplitExec<Type, ListType, SplitPatternFinder>;
-
-const FunctionDoc split_pattern_doc(
- "Split string according to separator",
- ("Split each string according to the exact `pattern` defined in\n"
- "SplitPatternOptions. The output for each string input is a list\n"
- "of strings.\n"
- "\n"
- "The maximum number of splits and direction of splitting\n"
- "(forward, reverse) can optionally be defined in SplitPatternOptions."),
- {"strings"}, "SplitPatternOptions");
-
-const FunctionDoc ascii_split_whitespace_doc(
- "Split string according to any ASCII whitespace",
- ("Split each string according any non-zero length sequence of ASCII\n"
- "whitespace characters. The output for each string input is a list\n"
- "of strings.\n"
- "\n"
- "The maximum number of splits and direction of splitting\n"
- "(forward, reverse) can optionally be defined in SplitOptions."),
- {"strings"}, "SplitOptions");
-
-const FunctionDoc utf8_split_whitespace_doc(
- "Split string according to any Unicode whitespace",
- ("Split each string according any non-zero length sequence of Unicode\n"
- "whitespace characters. The output for each string input is a list\n"
- "of strings.\n"
- "\n"
- "The maximum number of splits and direction of splitting\n"
- "(forward, reverse) can optionally be defined in SplitOptions."),
- {"strings"}, "SplitOptions");
-
-void AddSplitPattern(FunctionRegistry* registry) {
- auto func = std::make_shared<ScalarFunction>("split_pattern", Arity::Unary(),
- &split_pattern_doc);
- using t32 = SplitPatternExec<StringType, ListType>;
- using t64 = SplitPatternExec<LargeStringType, ListType>;
- DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-
-struct SplitWhitespaceAsciiFinder : public SplitFinderBase<SplitOptions> {
- using Options = SplitOptions;
-
- static bool Find(const uint8_t* begin, const uint8_t* end,
- const uint8_t** separator_begin, const uint8_t** separator_end,
- const SplitOptions& options) {
- const uint8_t* i = begin;
- while (i < end) {
- if (IsSpaceCharacterAscii(*i)) {
- *separator_begin = i;
- do {
- i++;
- } while (IsSpaceCharacterAscii(*i) && i < end);
- *separator_end = i;
- return true;
- }
- i++;
- }
- return false;
- }
-
- static bool FindReverse(const uint8_t* begin, const uint8_t* end,
- const uint8_t** separator_begin, const uint8_t** separator_end,
- const SplitOptions& options) {
- const uint8_t* i = end - 1;
- while ((i >= begin)) {
- if (IsSpaceCharacterAscii(*i)) {
- *separator_end = i + 1;
- do {
- i--;
- } while (IsSpaceCharacterAscii(*i) && i >= begin);
- *separator_begin = i + 1;
- return true;
- }
- i--;
- }
- return false;
- }
-};
-
-template <typename Type, typename ListType>
-using SplitWhitespaceAsciiExec = SplitExec<Type, ListType, SplitWhitespaceAsciiFinder>;
-
-void AddSplitWhitespaceAscii(FunctionRegistry* registry) {
- static const SplitOptions default_options{};
- auto func =
- std::make_shared<ScalarFunction>("ascii_split_whitespace", Arity::Unary(),
- &ascii_split_whitespace_doc, &default_options);
- using t32 = SplitWhitespaceAsciiExec<StringType, ListType>;
- using t64 = SplitWhitespaceAsciiExec<LargeStringType, ListType>;
- DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-
-#ifdef ARROW_WITH_UTF8PROC
-struct SplitWhitespaceUtf8Finder : public SplitFinderBase<SplitOptions> {
- using Options = SplitOptions;
-
- Status PreExec(const SplitOptions& options) override {
- EnsureLookupTablesFilled();
- return Status::OK();
- }
-
- bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
- const uint8_t** separator_end, const SplitOptions& options) {
- const uint8_t* i = begin;
- while ((i < end)) {
- uint32_t codepoint = 0;
- *separator_begin = i;
- if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
- return false;
- }
- if (IsSpaceCharacterUnicode(codepoint)) {
- do {
- *separator_end = i;
- if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
- return false;
- }
- } while (IsSpaceCharacterUnicode(codepoint) && i < end);
- return true;
- }
- }
- return false;
- }
-
- bool FindReverse(const uint8_t* begin, const uint8_t* end,
- const uint8_t** separator_begin, const uint8_t** separator_end,
- const SplitOptions& options) {
- const uint8_t* i = end - 1;
- while ((i >= begin)) {
- uint32_t codepoint = 0;
- *separator_end = i + 1;
- if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
- return false;
- }
- if (IsSpaceCharacterUnicode(codepoint)) {
- do {
- *separator_begin = i + 1;
- if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
- return false;
- }
- } while (IsSpaceCharacterUnicode(codepoint) && i >= begin);
- return true;
- }
- }
- return false;
- }
-};
-
-template <typename Type, typename ListType>
-using SplitWhitespaceUtf8Exec = SplitExec<Type, ListType, SplitWhitespaceUtf8Finder>;
-
-void AddSplitWhitespaceUTF8(FunctionRegistry* registry) {
- static const SplitOptions default_options{};
- auto func =
- std::make_shared<ScalarFunction>("utf8_split_whitespace", Arity::Unary(),
- &utf8_split_whitespace_doc, &default_options);
- using t32 = SplitWhitespaceUtf8Exec<StringType, ListType>;
- using t64 = SplitWhitespaceUtf8Exec<LargeStringType, ListType>;
- DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-#endif // ARROW_WITH_UTF8PROC
-
-#ifdef ARROW_WITH_RE2
-struct SplitRegexFinder : public SplitFinderBase<SplitPatternOptions> {
- using Options = SplitPatternOptions;
-
- util::optional<RE2> regex_split;
-
- Status PreExec(const SplitPatternOptions& options) override {
- if (options.reverse) {
- return Status::NotImplemented("Cannot split in reverse with regex");
- }
- // RE2 does *not* give you the full match! Must wrap the regex in a capture group
- // There is FindAndConsume, but it would give only the end of the separator
- std::string pattern = "(";
- pattern.reserve(options.pattern.size() + 2);
- pattern += options.pattern;
- pattern += ')';
- regex_split.emplace(std::move(pattern));
- return RegexStatus(*regex_split);
- }
-
- bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
- const uint8_t** separator_end, const SplitPatternOptions& options) {
- re2::StringPiece piece(reinterpret_cast<const char*>(begin),
- std::distance(begin, end));
- // "StringPiece is mutated to point to matched piece"
- re2::StringPiece result;
- if (!re2::RE2::PartialMatch(piece, *regex_split, &result)) {
- return false;
- }
- *separator_begin = reinterpret_cast<const uint8_t*>(result.data());
- *separator_end = reinterpret_cast<const uint8_t*>(result.data() + result.size());
- return true;
- }
-
- bool FindReverse(const uint8_t* begin, const uint8_t* end,
- const uint8_t** separator_begin, const uint8_t** separator_end,
- const SplitPatternOptions& options) {
- // Unsupported (see PreExec)
- return false;
- }
-};
-
-template <typename Type, typename ListType>
-using SplitRegexExec = SplitExec<Type, ListType, SplitRegexFinder>;
-
-const FunctionDoc split_pattern_regex_doc(
- "Split string according to regex pattern",
- ("Split each string according to the regex `pattern` defined in\n"
- "SplitPatternOptions. The output for each string input is a list\n"
- "of strings.\n"
- "\n"
- "The maximum number of splits and direction of splitting\n"
- "(forward, reverse) can optionally be defined in SplitPatternOptions."),
- {"strings"}, "SplitPatternOptions");
-
-void AddSplitRegex(FunctionRegistry* registry) {
- auto func = std::make_shared<ScalarFunction>("split_pattern_regex", Arity::Unary(),
- &split_pattern_regex_doc);
- using t32 = SplitRegexExec<StringType, ListType>;
- using t64 = SplitRegexExec<LargeStringType, ListType>;
- DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-#endif // ARROW_WITH_RE2
-
-void AddSplit(FunctionRegistry* registry) {
- AddSplitPattern(registry);
- AddSplitWhitespaceAscii(registry);
-#ifdef ARROW_WITH_UTF8PROC
- AddSplitWhitespaceUTF8(registry);
-#endif
-#ifdef ARROW_WITH_RE2
- AddSplitRegex(registry);
-#endif
-}
-
+// Splitting
+
+template <typename Options>
+struct SplitFinderBase {
+ virtual Status PreExec(const Options& options) { return Status::OK(); }
+
+ // Derived classes should also define these methods:
+ // static bool Find(const uint8_t* begin, const uint8_t* end,
+ // const uint8_t** separator_begin,
+ // const uint8_t** separator_end,
+ // const SplitPatternOptions& options);
+ //
+ // static bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ // const uint8_t** separator_begin,
+ // const uint8_t** separator_end,
+ // const SplitPatternOptions& options);
+};
+
+template <typename Type, typename ListType, typename SplitFinder,
+ typename Options = typename SplitFinder::Options>
+struct SplitExec {
+ using string_offset_type = typename Type::offset_type;
+ using list_offset_type = typename ListType::offset_type;
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using ArrayListType = typename TypeTraits<ListType>::ArrayType;
+ using ListScalarType = typename TypeTraits<ListType>::ScalarType;
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using ListOffsetsBuilderType = TypedBufferBuilder<list_offset_type>;
+ using State = OptionsWrapper<Options>;
+
+ // Keep the temporary storage across individual values to minimize reallocations
+ std::vector<util::string_view> parts;
+ Options options;
+
+ explicit SplitExec(const Options& options) : options(options) {}
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return SplitExec{State::Get(ctx)}.Execute(ctx, batch, out);
+ }
+
+ Status Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ SplitFinder finder;
+ RETURN_NOT_OK(finder.PreExec(options));
+ if (batch[0].kind() == Datum::ARRAY) {
+ return Execute(ctx, &finder, batch[0].array(), out);
+ }
+ DCHECK_EQ(batch[0].kind(), Datum::SCALAR);
+ return Execute(ctx, &finder, batch[0].scalar(), out);
+ }
+
+ Status Execute(KernelContext* ctx, SplitFinder* finder,
+ const std::shared_ptr<ArrayData>& data, Datum* out) {
+ const ArrayType input(data);
+
+ BuilderType builder(input.type(), ctx->memory_pool());
+ // A slight overestimate of the data needed
+ RETURN_NOT_OK(builder.ReserveData(input.total_values_length()));
+ // The minimum amount of strings needed
+ RETURN_NOT_OK(builder.Resize(input.length() - input.null_count()));
+
+ ArrayData* output_list = out->mutable_array();
+ // List offsets were preallocated
+ auto* list_offsets = output_list->GetMutableValues<list_offset_type>(1);
+ DCHECK_NE(list_offsets, nullptr);
+ // Initial value
+ *list_offsets++ = 0;
+ for (int64_t i = 0; i < input.length(); ++i) {
+ if (!input.IsNull(i)) {
+ RETURN_NOT_OK(SplitString(input.GetView(i), finder, &builder));
+ if (ARROW_PREDICT_FALSE(builder.length() >
+ std::numeric_limits<list_offset_type>::max())) {
+ return Status::CapacityError("List offset does not fit into 32 bit");
+ }
+ }
+ *list_offsets++ = static_cast<list_offset_type>(builder.length());
+ }
+ // Assign string array to list child data
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ output_list->child_data.push_back(string_array->data());
+ return Status::OK();
+ }
+
+ Status Execute(KernelContext* ctx, SplitFinder* finder,
+ const std::shared_ptr<Scalar>& scalar, Datum* out) {
+ const auto& input = checked_cast<const ScalarType&>(*scalar);
+ auto result = checked_cast<ListScalarType*>(out->scalar().get());
+ if (input.is_valid) {
+ result->is_valid = true;
+ BuilderType builder(input.type, ctx->memory_pool());
+ util::string_view s(*input.value);
+ RETURN_NOT_OK(SplitString(s, finder, &builder));
+ RETURN_NOT_OK(builder.Finish(&result->value));
+ }
+ return Status::OK();
+ }
+
+ Status SplitString(const util::string_view& s, SplitFinder* finder,
+ BuilderType* builder) {
+ const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.data());
+ const uint8_t* end = begin + s.length();
+
+ int64_t max_splits = options.max_splits;
+ // if there is no max_splits limit, reversing does not make sense (and is
+ // probably less efficient), but it is useful for testing
+ if (options.reverse) {
+ // note that i points one past the 'current' position
+ const uint8_t* i = end;
+ // we will record the parts in reverse order
+ parts.clear();
+ if (max_splits > -1) {
+ parts.reserve(max_splits + 1);
+ }
+ while (max_splits != 0) {
+ const uint8_t *separator_begin, *separator_end;
+ // find with whatever algo the part we will 'cut out'
+ if (finder->FindReverse(begin, i, &separator_begin, &separator_end, options)) {
+ parts.emplace_back(reinterpret_cast<const char*>(separator_end),
+ i - separator_end);
+ i = separator_begin;
+ max_splits--;
+ } else {
+ // if we cannot find a separator, we're done
+ break;
+ }
+ }
+ parts.emplace_back(reinterpret_cast<const char*>(begin), i - begin);
+ // now we do the copying
+ for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
+ RETURN_NOT_OK(builder->Append(*it));
+ }
+ } else {
+ const uint8_t* i = begin;
+ while (max_splits != 0) {
+ const uint8_t *separator_begin, *separator_end;
+ // find with whatever algo the part we will 'cut out'
+ if (finder->Find(i, end, &separator_begin, &separator_end, options)) {
+ // the part till the beginning of the 'cut'
+ RETURN_NOT_OK(
+ builder->Append(i, static_cast<string_offset_type>(separator_begin - i)));
+ i = separator_end;
+ max_splits--;
+ } else {
+ // if we cannot find a separator, we're done
+ break;
+ }
+ }
+ // trailing part
+ RETURN_NOT_OK(builder->Append(i, static_cast<string_offset_type>(end - i)));
+ }
+ return Status::OK();
+ }
+};
+
+struct SplitPatternFinder : public SplitFinderBase<SplitPatternOptions> {
+ using Options = SplitPatternOptions;
+
+ Status PreExec(const SplitPatternOptions& options) override {
+ if (options.pattern.length() == 0) {
+ return Status::Invalid("Empty separator");
+ }
+ return Status::OK();
+ }
+
+ static bool Find(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitPatternOptions& options) {
+ const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
+ const int64_t pattern_length = options.pattern.length();
+ const uint8_t* i = begin;
+ // this is O(n*m) complexity; we could use the Knuth-Morris-Pratt algorithm used in
+ // the match kernel
+ while ((i + pattern_length <= end)) {
+ i = std::search(i, end, pattern, pattern + pattern_length);
+ if (i != end) {
+ *separator_begin = i;
+ *separator_end = i + pattern_length;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ static bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitPatternOptions& options) {
+ const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
+ const int64_t pattern_length = options.pattern.length();
+ // this is O(n*m) complexity; we could use the Knuth-Morris-Pratt algorithm used in
+ // the match kernel
+ std::reverse_iterator<const uint8_t*> ri(end);
+ std::reverse_iterator<const uint8_t*> rend(begin);
+ std::reverse_iterator<const uint8_t*> pattern_rbegin(pattern + pattern_length);
+ std::reverse_iterator<const uint8_t*> pattern_rend(pattern);
+ while (begin <= ri.base() - pattern_length) {
+ ri = std::search(ri, rend, pattern_rbegin, pattern_rend);
+ if (ri != rend) {
+ *separator_begin = ri.base() - pattern_length;
+ *separator_end = ri.base();
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
+template <typename Type, typename ListType>
+using SplitPatternExec = SplitExec<Type, ListType, SplitPatternFinder>;
+
+const FunctionDoc split_pattern_doc(
+ "Split string according to separator",
+ ("Split each string according to the exact `pattern` defined in\n"
+ "SplitPatternOptions. The output for each string input is a list\n"
+ "of strings.\n"
+ "\n"
+ "The maximum number of splits and direction of splitting\n"
+ "(forward, reverse) can optionally be defined in SplitPatternOptions."),
+ {"strings"}, "SplitPatternOptions");
+
+const FunctionDoc ascii_split_whitespace_doc(
+ "Split string according to any ASCII whitespace",
+ ("Split each string according any non-zero length sequence of ASCII\n"
+ "whitespace characters. The output for each string input is a list\n"
+ "of strings.\n"
+ "\n"
+ "The maximum number of splits and direction of splitting\n"
+ "(forward, reverse) can optionally be defined in SplitOptions."),
+ {"strings"}, "SplitOptions");
+
+const FunctionDoc utf8_split_whitespace_doc(
+ "Split string according to any Unicode whitespace",
+ ("Split each string according any non-zero length sequence of Unicode\n"
+ "whitespace characters. The output for each string input is a list\n"
+ "of strings.\n"
+ "\n"
+ "The maximum number of splits and direction of splitting\n"
+ "(forward, reverse) can optionally be defined in SplitOptions."),
+ {"strings"}, "SplitOptions");
+
+void AddSplitPattern(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("split_pattern", Arity::Unary(),
+ &split_pattern_doc);
+ using t32 = SplitPatternExec<StringType, ListType>;
+ using t64 = SplitPatternExec<LargeStringType, ListType>;
+ DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
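+// Usage sketch (editorial note; assumes arrow::compute::CallFunction and the
+// public SplitPatternOptions(pattern, max_splits, reverse) constructor):
+//
+//   arrow::Result<arrow::Datum> SplitExample(
+//       const std::shared_ptr<arrow::Array>& strings) {
+//     arrow::compute::SplitPatternOptions opts("--", /*max_splits=*/1,
+//                                              /*reverse=*/true);
+//     // "a--b--c" -> ["a--b", "c"]: the single allowed split is taken at
+//     // the last separator; the result type is list(utf8).
+//     return arrow::compute::CallFunction("split_pattern", {strings}, &opts);
+//   }
+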
+struct SplitWhitespaceAsciiFinder : public SplitFinderBase<SplitOptions> {
+ using Options = SplitOptions;
+
+ static bool Find(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitOptions& options) {
+ const uint8_t* i = begin;
+ while (i < end) {
+ if (IsSpaceCharacterAscii(*i)) {
+ *separator_begin = i;
+ do {
+ i++;
+ } while (IsSpaceCharacterAscii(*i) && i < end);
+ *separator_end = i;
+ return true;
+ }
+ i++;
+ }
+ return false;
+ }
+
+ static bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitOptions& options) {
+ const uint8_t* i = end - 1;
+ while ((i >= begin)) {
+ if (IsSpaceCharacterAscii(*i)) {
+ *separator_end = i + 1;
+ do {
+ i--;
+ } while (IsSpaceCharacterAscii(*i) && i >= begin);
+ *separator_begin = i + 1;
+ return true;
+ }
+ i--;
+ }
+ return false;
+ }
+};
+
+template <typename Type, typename ListType>
+using SplitWhitespaceAsciiExec = SplitExec<Type, ListType, SplitWhitespaceAsciiFinder>;
+
+void AddSplitWhitespaceAscii(FunctionRegistry* registry) {
+ static const SplitOptions default_options{};
+ auto func =
+ std::make_shared<ScalarFunction>("ascii_split_whitespace", Arity::Unary(),
+ &ascii_split_whitespace_doc, &default_options);
+ using t32 = SplitWhitespaceAsciiExec<StringType, ListType>;
+ using t64 = SplitWhitespaceAsciiExec<LargeStringType, ListType>;
+ DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
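+// Behavioral note (editorial): SplitString always appends the part before
+// each separator plus a trailing part, so leading or trailing whitespace
+// runs produce empty strings, e.g. " a  b " -> ["", "a", "b", ""] (unlike
+// Python's str.split(), which drops them).
+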
+#ifdef ARROW_WITH_UTF8PROC
+struct SplitWhitespaceUtf8Finder : public SplitFinderBase<SplitOptions> {
+ using Options = SplitOptions;
+
+ Status PreExec(const SplitOptions& options) override {
+ EnsureLookupTablesFilled();
+ return Status::OK();
+ }
+
+ bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
+ const uint8_t** separator_end, const SplitOptions& options) {
+ const uint8_t* i = begin;
+ while ((i < end)) {
+ uint32_t codepoint = 0;
+ *separator_begin = i;
+ if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+ if (IsSpaceCharacterUnicode(codepoint)) {
+ do {
+ *separator_end = i;
+ if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+ } while (IsSpaceCharacterUnicode(codepoint) && i < end);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitOptions& options) {
+ const uint8_t* i = end - 1;
+ while ((i >= begin)) {
+ uint32_t codepoint = 0;
+ *separator_end = i + 1;
+ if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
+ return false;
+ }
+ if (IsSpaceCharacterUnicode(codepoint)) {
+ do {
+ *separator_begin = i + 1;
+ if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
+ return false;
+ }
+ } while (IsSpaceCharacterUnicode(codepoint) && i >= begin);
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
+template <typename Type, typename ListType>
+using SplitWhitespaceUtf8Exec = SplitExec<Type, ListType, SplitWhitespaceUtf8Finder>;
+
+void AddSplitWhitespaceUTF8(FunctionRegistry* registry) {
+ static const SplitOptions default_options{};
+ auto func =
+ std::make_shared<ScalarFunction>("utf8_split_whitespace", Arity::Unary(),
+ &utf8_split_whitespace_doc, &default_options);
+ using t32 = SplitWhitespaceUtf8Exec<StringType, ListType>;
+ using t64 = SplitWhitespaceUtf8Exec<LargeStringType, ListType>;
+ DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+#endif // ARROW_WITH_UTF8PROC
+
+#ifdef ARROW_WITH_RE2
+struct SplitRegexFinder : public SplitFinderBase<SplitPatternOptions> {
+ using Options = SplitPatternOptions;
+
+ util::optional<RE2> regex_split;
+
+ Status PreExec(const SplitPatternOptions& options) override {
+ if (options.reverse) {
+ return Status::NotImplemented("Cannot split in reverse with regex");
+ }
+ // RE2 does *not* give you the full match! Must wrap the regex in a capture group
+ // There is FindAndConsume, but it would give only the end of the separator
+ std::string pattern = "(";
+ pattern.reserve(options.pattern.size() + 2);
+ pattern += options.pattern;
+ pattern += ')';
+ regex_split.emplace(std::move(pattern));
+ return RegexStatus(*regex_split);
+ }
+
+ bool Find(const uint8_t* begin, const uint8_t* end, const uint8_t** separator_begin,
+ const uint8_t** separator_end, const SplitPatternOptions& options) {
+ re2::StringPiece piece(reinterpret_cast<const char*>(begin),
+ std::distance(begin, end));
+ // "StringPiece is mutated to point to matched piece"
+ re2::StringPiece result;
+ if (!re2::RE2::PartialMatch(piece, *regex_split, &result)) {
+ return false;
+ }
+ *separator_begin = reinterpret_cast<const uint8_t*>(result.data());
+ *separator_end = reinterpret_cast<const uint8_t*>(result.data() + result.size());
+ return true;
+ }
+
+ bool FindReverse(const uint8_t* begin, const uint8_t* end,
+ const uint8_t** separator_begin, const uint8_t** separator_end,
+ const SplitPatternOptions& options) {
+ // Unsupported (see PreExec)
+ return false;
+ }
+};
+
+template <typename Type, typename ListType>
+using SplitRegexExec = SplitExec<Type, ListType, SplitRegexFinder>;
+
+const FunctionDoc split_pattern_regex_doc(
+ "Split string according to regex pattern",
+ ("Split each string according to the regex `pattern` defined in\n"
+ "SplitPatternOptions. The output for each string input is a list\n"
+ "of strings.\n"
+ "\n"
+     "The maximum number of splits can optionally be defined in\n"
+     "SplitPatternOptions; splitting in reverse is not supported."),
+ {"strings"}, "SplitPatternOptions");
+
+void AddSplitRegex(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("split_pattern_regex", Arity::Unary(),
+ &split_pattern_regex_doc);
+ using t32 = SplitRegexExec<StringType, ListType>;
+ using t64 = SplitRegexExec<LargeStringType, ListType>;
+ DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
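+
+// A minimal usage sketch (assumes <arrow/compute/api.h>; `strings` and
+// `opts` are illustrative names):
+//
+//   arrow::compute::SplitPatternOptions opts("[0-9]+");
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("split_pattern_regex", {strings}, &opts));
+//   // "a1b22c" -> ["a", "b", "c"]: every regex match acts as a separator.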
+#endif // ARROW_WITH_RE2
+
+void AddSplit(FunctionRegistry* registry) {
+ AddSplitPattern(registry);
+ AddSplitWhitespaceAscii(registry);
+#ifdef ARROW_WITH_UTF8PROC
+ AddSplitWhitespaceUTF8(registry);
+#endif
+#ifdef ARROW_WITH_RE2
+ AddSplitRegex(registry);
+#endif
+}
+
+// ----------------------------------------------------------------------
+// Replace substring (plain, regex)
+
+template <typename Type, typename Replacer>
+struct ReplaceSubString {
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ using offset_type = typename Type::offset_type;
+ using ValueDataBuilder = TypedBufferBuilder<uint8_t>;
+ using OffsetBuilder = TypedBufferBuilder<offset_type>;
+ using State = OptionsWrapper<ReplaceSubstringOptions>;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // TODO Cache replacer across invocations (for regex compilation)
+ ARROW_ASSIGN_OR_RAISE(auto replacer, Replacer::Make(State::Get(ctx)));
+ return Replace(ctx, batch, *replacer, out);
+ }
+
+ static Status Replace(KernelContext* ctx, const ExecBatch& batch,
+ const Replacer& replacer, Datum* out) {
+ ValueDataBuilder value_data_builder(ctx->memory_pool());
+ OffsetBuilder offset_builder(ctx->memory_pool());
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ // We already know how many strings we have, so we can use Reserve/UnsafeAppend
+ RETURN_NOT_OK(offset_builder.Reserve(batch[0].array()->length + 1));
+ offset_builder.UnsafeAppend(0); // offsets start at 0
+
+ const ArrayData& input = *batch[0].array();
+ RETURN_NOT_OK(VisitArrayDataInline<Type>(
+ input,
+ [&](util::string_view s) {
+ RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
+ offset_builder.UnsafeAppend(
+ static_cast<offset_type>(value_data_builder.length()));
+ return Status::OK();
+ },
+ [&]() {
+ // offset for null value
+ offset_builder.UnsafeAppend(
+ static_cast<offset_type>(value_data_builder.length()));
+ return Status::OK();
+ }));
+ ArrayData* output = out->mutable_array();
+ RETURN_NOT_OK(value_data_builder.Finish(&output->buffers[2]));
+ RETURN_NOT_OK(offset_builder.Finish(&output->buffers[1]));
+ } else {
+ const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
+ auto result = std::make_shared<ScalarType>();
+ if (input.is_valid) {
+ util::string_view s = static_cast<util::string_view>(*input.value);
+ RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
+ RETURN_NOT_OK(value_data_builder.Finish(&result->value));
+ result->is_valid = true;
+ }
+ out->value = result;
+ }
+
+ return Status::OK();
+ }
+};
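+
+// For example (an illustrative sketch of the buffer layout, not code from
+// this file): replacing "X" -> "YY" over the input array ["aXa", null, "b"]
+// appends value data "aYYa" then "b", and the offsets buffer becomes
+// [0, 4, 4, 5] -- a null advances no bytes but still emits an offset.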
+
+struct PlainSubStringReplacer {
+ const ReplaceSubstringOptions& options_;
+
+ static Result<std::unique_ptr<PlainSubStringReplacer>> Make(
+ const ReplaceSubstringOptions& options) {
+ return arrow::internal::make_unique<PlainSubStringReplacer>(options);
+ }
+
+ explicit PlainSubStringReplacer(const ReplaceSubstringOptions& options)
+ : options_(options) {}
+
+ Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
+ const char* i = s.begin();
+ const char* end = s.end();
+ int64_t max_replacements = options_.max_replacements;
+ while ((i < end) && (max_replacements != 0)) {
+ const char* pos =
+ std::search(i, end, options_.pattern.begin(), options_.pattern.end());
+ if (pos == end) {
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(end - i)));
+ i = end;
+ } else {
+ // the string before the pattern
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(pos - i)));
+ // the replacement
+ RETURN_NOT_OK(
+ builder->Append(reinterpret_cast<const uint8_t*>(options_.replacement.data()),
+ options_.replacement.length()));
+ // skip pattern
+ i = pos + options_.pattern.length();
+ max_replacements--;
+ }
+ }
+ // if we exited early due to max_replacements, add the trailing part
+ return builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(end - i));
+ }
+};
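+
+// A minimal usage sketch of the plain replacer through the registry
+// (assumes <arrow/compute/api.h>; `strings` and `opts` are illustrative):
+//
+//   arrow::compute::ReplaceSubstringOptions opts("na", "*",
+//                                                /*max_replacements=*/1);
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("replace_substring", {strings}, &opts));
+//   // "banana" -> "ba*na": only the leftmost match is replaced because
+//   // max_replacements == 1; the trailing part is appended verbatim.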
+
+#ifdef ARROW_WITH_RE2
+struct RegexSubStringReplacer {
+ const ReplaceSubstringOptions& options_;
+ const RE2 regex_find_;
+ const RE2 regex_replacement_;
+
+ static Result<std::unique_ptr<RegexSubStringReplacer>> Make(
+ const ReplaceSubstringOptions& options) {
+ auto replacer = arrow::internal::make_unique<RegexSubStringReplacer>(options);
+
+ RETURN_NOT_OK(RegexStatus(replacer->regex_find_));
+ RETURN_NOT_OK(RegexStatus(replacer->regex_replacement_));
+
+ std::string replacement_error;
+ if (!replacer->regex_replacement_.CheckRewriteString(replacer->options_.replacement,
+ &replacement_error)) {
+ return Status::Invalid("Invalid replacement string: ",
+ std::move(replacement_error));
+ }
+
+ return std::move(replacer);
+ }
+
+ // Using RE2::FindAndConsume we can only find the pattern if it is a group, therefore
+ // we have 2 regexes, one with () around it, one without.
+ explicit RegexSubStringReplacer(const ReplaceSubstringOptions& options)
+ : options_(options),
+ regex_find_("(" + options_.pattern + ")", RE2::Quiet),
+ regex_replacement_(options_.pattern, RE2::Quiet) {}
+
+ Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
+ re2::StringPiece replacement(options_.replacement);
+
+ if (options_.max_replacements == -1) {
+ std::string s_copy(s.to_string());
+ re2::RE2::GlobalReplace(&s_copy, regex_replacement_, replacement);
+ return builder->Append(reinterpret_cast<const uint8_t*>(s_copy.data()),
+ s_copy.length());
+ }
+
+ // Since RE2 does not have the concept of max_replacements, we have to do some work
+ // ourselves.
+ // We might do this faster similar to RE2::GlobalReplace using Match and Rewrite
+ const char* i = s.begin();
+ const char* end = s.end();
+ re2::StringPiece piece(s.data(), s.length());
+
+ int64_t max_replacements = options_.max_replacements;
+ while ((i < end) && (max_replacements != 0)) {
+ std::string found;
+ if (!re2::RE2::FindAndConsume(&piece, regex_find_, &found)) {
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(end - i)));
+ i = end;
+ } else {
+ // wind back to the beginning of the match
+ const char* pos = piece.begin() - found.length();
+ // the string before the pattern
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(pos - i)));
+ // replace the pattern in what we found
+ if (!re2::RE2::Replace(&found, regex_replacement_, replacement)) {
+ return Status::Invalid("Regex found, but replacement failed");
+ }
+ RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(found.data()),
+ static_cast<int64_t>(found.length())));
+ // skip pattern
+ i = piece.begin();
+ max_replacements--;
+ }
+ }
+ // If we exited early due to max_replacements, add the trailing part
+ return builder->Append(reinterpret_cast<const uint8_t*>(i),
+ static_cast<int64_t>(end - i));
+ }
+};
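+
+// A minimal usage sketch with a backreference in the rewrite string
+// (assumes <arrow/compute/api.h>; `strings` and `opts` are illustrative):
+//
+//   arrow::compute::ReplaceSubstringOptions opts("(\\d+)", "<\\1>");
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("replace_substring_regex", {strings},
+//                                    &opts));
+//   // "v1.2" -> "v<1>.<2>"; CheckRewriteString above rejects rewrites that
+//   // reference more groups than the pattern defines (e.g. "\\2" here).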
+#endif
+
+template <typename Type>
+using ReplaceSubStringPlain = ReplaceSubString<Type, PlainSubStringReplacer>;
+
+const FunctionDoc replace_substring_doc(
+ "Replace non-overlapping substrings that match pattern by replacement",
+ ("For each string in `strings`, replace non-overlapping substrings that match\n"
+ "`pattern` by `replacement`. If `max_replacements != -1`, it determines the\n"
+     "maximum number of replacements made, counting from the left. Null values emit\n"
+ "null."),
+ {"strings"}, "ReplaceSubstringOptions");
+
+#ifdef ARROW_WITH_RE2
+template <typename Type>
+using ReplaceSubStringRegex = ReplaceSubString<Type, RegexSubStringReplacer>;
+
+const FunctionDoc replace_substring_regex_doc(
+ "Replace non-overlapping substrings that match regex `pattern` by `replacement`",
+ ("For each string in `strings`, replace non-overlapping substrings that match the\n"
+ "regular expression `pattern` by `replacement` using the Google RE2 library.\n"
+     "If `max_replacements != -1`, it determines the maximum number of replacements\n"
+     "made, counting from the left. Note that if the pattern contains groups,\n"
+     "backreferencing can be used. Null values emit null."),
+ {"strings"}, "ReplaceSubstringOptions");
+#endif
+
+// ----------------------------------------------------------------------
+// Replace slice
+
+struct ReplaceSliceTransformBase : public StringTransformBase {
+ using State = OptionsWrapper<ReplaceSliceOptions>;
+
+ const ReplaceSliceOptions* options;
+
+ explicit ReplaceSliceTransformBase(const ReplaceSliceOptions& options)
+ : options{&options} {}
+
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ return ninputs * options->replacement.size() + input_ncodeunits;
+ }
+};
+
+struct BinaryReplaceSliceTransform : ReplaceSliceTransformBase {
+ using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const auto& opts = *options;
+ int64_t before_slice = 0;
+ int64_t after_slice = 0;
+ uint8_t* output_start = output;
+
+ if (opts.start >= 0) {
+ // Count from left
+ before_slice = std::min<int64_t>(input_string_ncodeunits, opts.start);
+ } else {
+ // Count from right
+ before_slice = std::max<int64_t>(0, input_string_ncodeunits + opts.start);
+ }
+ // Mimic Pandas: if stop would be before start, treat as 0-length slice
+ if (opts.stop >= 0) {
+ // Count from left
+ after_slice =
+ std::min<int64_t>(input_string_ncodeunits, std::max(before_slice, opts.stop));
+ } else {
+ // Count from right
+ after_slice = std::max<int64_t>(before_slice, input_string_ncodeunits + opts.stop);
+ }
+ output = std::copy(input, input + before_slice, output);
+ output = std::copy(opts.replacement.begin(), opts.replacement.end(), output);
+ output = std::copy(input + after_slice, input + input_string_ncodeunits, output);
+ return output - output_start;
+ }
+};
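+
+// Worked examples of the byte-oriented slice logic above (illustrative):
+//   start=1, stop=3, replacement="XYZ": "hello" -> "h" + "XYZ" + "lo"
+//   start=-2, stop=-1:                  "hello" -> "hel" + replacement + "o"
+//   start=4, stop=2 (stop before start): a zero-length slice is replaced,
+//   i.e. the replacement is inserted at byte offset 4.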
+
+struct Utf8ReplaceSliceTransform : ReplaceSliceTransformBase {
+ using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const auto& opts = *options;
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t *begin_sliced, *end_sliced;
+ uint8_t* output_start = output;
+
+ // Mimic Pandas: if stop would be before start, treat as 0-length slice
+ if (opts.start >= 0) {
+ // Count from left
+ if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opts.start)) {
+ return kTransformError;
+ }
+      if (opts.stop > opts.start) {
+        // Continue counting from left
+        const int64_t length = opts.stop - opts.start;
+ if (!arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length)) {
+ return kTransformError;
+ }
+ } else if (opts.stop < 0) {
+ // Count from right
+ if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
+ -opts.stop)) {
+ return kTransformError;
+ }
+ } else {
+ // Zero-length slice
+ end_sliced = begin_sliced;
+ }
+ } else {
+ // Count from right
+ if (!arrow::util::UTF8AdvanceCodepointsReverse(begin, end, &begin_sliced,
+ -opts.start)) {
+ return kTransformError;
+ }
+ if (opts.stop >= 0) {
+ // Restart counting from left
+ if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opts.stop)) {
+ return kTransformError;
+ }
+ if (end_sliced <= begin_sliced) {
+ // Zero-length slice
+ end_sliced = begin_sliced;
+ }
+      } else if ((opts.stop < 0) && (opts.stop > opts.start)) {
+ // Count from right
+ if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
+ -opts.stop)) {
+ return kTransformError;
+ }
+ } else {
+ // zero-length slice
+ end_sliced = begin_sliced;
+ }
+ }
+ output = std::copy(begin, begin_sliced, output);
+    output = std::copy(opts.replacement.begin(), opts.replacement.end(), output);
+ output = std::copy(end_sliced, end, output);
+ return output - output_start;
+ }
+};
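+
+// Worked example of the codepoint-oriented variant (illustrative): with
+// start=1, stop=2 and replacement "_", the 3-codepoint string "aéc" becomes
+// "a_c" -- "é" occupies two bytes in UTF8, but slicing advances by
+// codepoints, so exactly one character is replaced.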
+
+template <typename Type>
+using BinaryReplaceSlice =
+ StringTransformExecWithState<Type, BinaryReplaceSliceTransform>;
+template <typename Type>
+using Utf8ReplaceSlice = StringTransformExecWithState<Type, Utf8ReplaceSliceTransform>;
+
+const FunctionDoc binary_replace_slice_doc(
+ "Replace a slice of a binary string with `replacement`",
+    ("For each string in `strings`, replace a slice of the string defined by `start`\n"
+     "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive,\n"
+     "and both are measured in bytes.\n"
+ "Null values emit null."),
+ {"strings"}, "ReplaceSliceOptions");
+
+const FunctionDoc utf8_replace_slice_doc(
+ "Replace a slice of a string with `replacement`",
+    ("For each string in `strings`, replace a slice of the string defined by `start`\n"
+     "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive,\n"
+     "and both are measured in codepoints.\n"
+ "Null values emit null."),
+ {"strings"}, "ReplaceSliceOptions");
+
+void AddReplaceSlice(FunctionRegistry* registry) {
+ {
+ auto func = std::make_shared<ScalarFunction>("binary_replace_slice", Arity::Unary(),
+ &binary_replace_slice_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ DCHECK_OK(func->AddKernel({ty}, ty,
+ GenerateTypeAgnosticVarBinaryBase<BinaryReplaceSlice>(ty),
+ ReplaceSliceTransformBase::State::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+
+ {
+ auto func = std::make_shared<ScalarFunction>("utf8_replace_slice", Arity::Unary(),
+ &utf8_replace_slice_doc);
+ DCHECK_OK(func->AddKernel({utf8()}, utf8(), Utf8ReplaceSlice<StringType>::Exec,
+ ReplaceSliceTransformBase::State::Init));
+ DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(),
+ Utf8ReplaceSlice<LargeStringType>::Exec,
+ ReplaceSliceTransformBase::State::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+}
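+
+// A minimal usage sketch for the kernels registered above (assumes
+// <arrow/compute/api.h>; `strings` and `opts` are illustrative):
+//
+//   arrow::compute::ReplaceSliceOptions opts(/*start=*/0, /*stop=*/2, "??");
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("utf8_replace_slice", {strings}, &opts));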
+
+// ----------------------------------------------------------------------
+// Extract with regex
+
+#ifdef ARROW_WITH_RE2
+
+// TODO cache this once per ExtractRegexOptions
+struct ExtractRegexData {
+ // Use unique_ptr<> because RE2 is non-movable
+ std::unique_ptr<RE2> regex;
+ std::vector<std::string> group_names;
+
+ static Result<ExtractRegexData> Make(const ExtractRegexOptions& options) {
+ ExtractRegexData data(options.pattern);
+ RETURN_NOT_OK(RegexStatus(*data.regex));
+
+ const int group_count = data.regex->NumberOfCapturingGroups();
+ const auto& name_map = data.regex->CapturingGroupNames();
+ data.group_names.reserve(group_count);
+
+ for (int i = 0; i < group_count; i++) {
+ auto item = name_map.find(i + 1); // re2 starts counting from 1
+ if (item == name_map.end()) {
+ // XXX should we instead just create fields with an empty name?
+ return Status::Invalid("Regular expression contains unnamed groups");
+ }
+ data.group_names.emplace_back(item->second);
+ }
+ return std::move(data);
+ }
+
+ Result<ValueDescr> ResolveOutputType(const std::vector<ValueDescr>& args) const {
+ const auto& input_type = args[0].type;
+ if (input_type == nullptr) {
+ // No input type specified => propagate shape
+ return args[0];
+ }
+ // Input type is either String or LargeString and is also the type of each
+ // field in the output struct type.
+ DCHECK(input_type->id() == Type::STRING || input_type->id() == Type::LARGE_STRING);
+ FieldVector fields;
+ fields.reserve(group_names.size());
+ std::transform(group_names.begin(), group_names.end(), std::back_inserter(fields),
+ [&](const std::string& name) { return field(name, input_type); });
+ return struct_(std::move(fields));
+ }
+
+ private:
+ explicit ExtractRegexData(const std::string& pattern)
+ : regex(new RE2(pattern, RE2::Quiet)) {}
+};
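+
+// For example (illustrative): the pattern "(?P<letter>[ab])(?P<digit>\\d)"
+// yields group_names {"letter", "digit"}, and ResolveOutputType maps a
+// utf8() input to struct_({field("letter", utf8()), field("digit", utf8())}).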
+
+Result<ValueDescr> ResolveExtractRegexOutput(KernelContext* ctx,
+ const std::vector<ValueDescr>& args) {
+ using State = OptionsWrapper<ExtractRegexOptions>;
+ ExtractRegexOptions options = State::Get(ctx);
+ ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
+ return data.ResolveOutputType(args);
+}
+
+struct ExtractRegexBase {
+ const ExtractRegexData& data;
+ const int group_count;
+ std::vector<re2::StringPiece> found_values;
+ std::vector<re2::RE2::Arg> args;
+ std::vector<const re2::RE2::Arg*> args_pointers;
+ const re2::RE2::Arg** args_pointers_start;
+ const re2::RE2::Arg* null_arg = nullptr;
+
+ explicit ExtractRegexBase(const ExtractRegexData& data)
+ : data(data),
+ group_count(static_cast<int>(data.group_names.size())),
+ found_values(group_count) {
+ args.reserve(group_count);
+ args_pointers.reserve(group_count);
+
+ for (int i = 0; i < group_count; i++) {
+ args.emplace_back(&found_values[i]);
+ // Since we reserved capacity, we're guaranteed the pointer remains valid
+ args_pointers.push_back(&args[i]);
+ }
+ // Avoid null pointer if there is no capture group
+ args_pointers_start = (group_count > 0) ? args_pointers.data() : &null_arg;
+ }
+
+ bool Match(util::string_view s) {
+ return re2::RE2::PartialMatchN(ToStringPiece(s), *data.regex, args_pointers_start,
+ group_count);
+ }
+};
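+
+// For example (illustrative): given a pattern with two capture groups,
+// Match("x1") calls PartialMatchN with two bound RE2::Arg slots; on
+// success, found_values holds StringPieces into the input, e.g. {"x", "1"}.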
+
+template <typename Type>
+struct ExtractRegex : public ExtractRegexBase {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using State = OptionsWrapper<ExtractRegexOptions>;
+
+ using ExtractRegexBase::ExtractRegexBase;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ExtractRegexOptions options = State::Get(ctx);
+ ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
+ return ExtractRegex{data}.Extract(ctx, batch, out);
+ }
+
+ Status Extract(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ARROW_ASSIGN_OR_RAISE(auto descr, data.ResolveOutputType(batch.GetDescriptors()));
+ DCHECK_NE(descr.type, nullptr);
+ const auto& type = descr.type;
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ std::unique_ptr<ArrayBuilder> array_builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), type, &array_builder));
+ StructBuilder* struct_builder = checked_cast<StructBuilder*>(array_builder.get());
+
+ std::vector<BuilderType*> field_builders;
+ field_builders.reserve(group_count);
+ for (int i = 0; i < group_count; i++) {
+ field_builders.push_back(
+ checked_cast<BuilderType*>(struct_builder->field_builder(i)));
+ }
+
+ auto visit_null = [&]() { return struct_builder->AppendNull(); };
+ auto visit_value = [&](util::string_view s) {
+ if (Match(s)) {
+ for (int i = 0; i < group_count; i++) {
+ RETURN_NOT_OK(field_builders[i]->Append(ToStringView(found_values[i])));
+ }
+ return struct_builder->Append();
+ } else {
+ return struct_builder->AppendNull();
+ }
+ };
+ const ArrayData& input = *batch[0].array();
+ RETURN_NOT_OK(VisitArrayDataInline<Type>(input, visit_value, visit_null));
+
+ std::shared_ptr<Array> out_array;
+ RETURN_NOT_OK(struct_builder->Finish(&out_array));
+ *out = std::move(out_array);
+ } else {
+ const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
+ auto result = std::make_shared<StructScalar>(type);
+ if (input.is_valid && Match(util::string_view(*input.value))) {
+ result->value.reserve(group_count);
+ for (int i = 0; i < group_count; i++) {
+ result->value.push_back(
+ std::make_shared<ScalarType>(found_values[i].as_string()));
+ }
+ result->is_valid = true;
+ } else {
+ result->is_valid = false;
+ }
+ out->value = std::move(result);
+ }
+
+ return Status::OK();
+ }
+};
+
+const FunctionDoc extract_regex_doc(
+ "Extract substrings captured by a regex pattern",
+ ("For each string in `strings`, match the regular expression and, if\n"
+ "successful, emit a struct with field names and values coming from the\n"
+ "regular expression's named capture groups. If the input is null or the\n"
+ "regular expression fails matching, a null output value is emitted.\n"
+ "\n"
+ "Regular expression matching is done using the Google RE2 library."),
+ {"strings"}, "ExtractRegexOptions");
+
+void AddExtractRegex(FunctionRegistry* registry) {
+ auto func = std::make_shared<ScalarFunction>("extract_regex", Arity::Unary(),
+ &extract_regex_doc);
+ using t32 = ExtractRegex<StringType>;
+ using t64 = ExtractRegex<LargeStringType>;
+ OutputType out_ty(ResolveExtractRegexOutput);
+ ScalarKernel kernel;
+
+ // Null values will be computed based on regex match or not
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ kernel.signature.reset(new KernelSignature({utf8()}, out_ty));
+ kernel.exec = t32::Exec;
+ kernel.init = t32::State::Init;
+ DCHECK_OK(func->AddKernel(kernel));
+ kernel.signature.reset(new KernelSignature({large_utf8()}, out_ty));
+ kernel.exec = t64::Exec;
+ kernel.init = t64::State::Init;
+ DCHECK_OK(func->AddKernel(kernel));
+
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
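+
+// A minimal usage sketch (assumes <arrow/compute/api.h>; `strings` and
+// `opts` are illustrative names):
+//
+//   arrow::compute::ExtractRegexOptions opts("(?P<key>\\w+)=(?P<value>\\w+)");
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("extract_regex", {strings}, &opts));
+//   // "a=1" -> {key: "a", value: "1"}; strings that do not match emit null.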
+#endif // ARROW_WITH_RE2
+
// ----------------------------------------------------------------------
-// Replace substring (plain, regex)
-
-template <typename Type, typename Replacer>
-struct ReplaceSubString {
- using ScalarType = typename TypeTraits<Type>::ScalarType;
- using offset_type = typename Type::offset_type;
- using ValueDataBuilder = TypedBufferBuilder<uint8_t>;
- using OffsetBuilder = TypedBufferBuilder<offset_type>;
- using State = OptionsWrapper<ReplaceSubstringOptions>;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // TODO Cache replacer across invocations (for regex compilation)
- ARROW_ASSIGN_OR_RAISE(auto replacer, Replacer::Make(State::Get(ctx)));
- return Replace(ctx, batch, *replacer, out);
- }
-
- static Status Replace(KernelContext* ctx, const ExecBatch& batch,
- const Replacer& replacer, Datum* out) {
- ValueDataBuilder value_data_builder(ctx->memory_pool());
- OffsetBuilder offset_builder(ctx->memory_pool());
-
- if (batch[0].kind() == Datum::ARRAY) {
- // We already know how many strings we have, so we can use Reserve/UnsafeAppend
- RETURN_NOT_OK(offset_builder.Reserve(batch[0].array()->length + 1));
- offset_builder.UnsafeAppend(0); // offsets start at 0
-
- const ArrayData& input = *batch[0].array();
- RETURN_NOT_OK(VisitArrayDataInline<Type>(
- input,
- [&](util::string_view s) {
- RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
- offset_builder.UnsafeAppend(
- static_cast<offset_type>(value_data_builder.length()));
- return Status::OK();
- },
- [&]() {
- // offset for null value
- offset_builder.UnsafeAppend(
- static_cast<offset_type>(value_data_builder.length()));
- return Status::OK();
- }));
- ArrayData* output = out->mutable_array();
- RETURN_NOT_OK(value_data_builder.Finish(&output->buffers[2]));
- RETURN_NOT_OK(offset_builder.Finish(&output->buffers[1]));
- } else {
- const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
- auto result = std::make_shared<ScalarType>();
- if (input.is_valid) {
- util::string_view s = static_cast<util::string_view>(*input.value);
- RETURN_NOT_OK(replacer.ReplaceString(s, &value_data_builder));
- RETURN_NOT_OK(value_data_builder.Finish(&result->value));
- result->is_valid = true;
- }
- out->value = result;
- }
-
- return Status::OK();
- }
-};
-
-struct PlainSubStringReplacer {
- const ReplaceSubstringOptions& options_;
-
- static Result<std::unique_ptr<PlainSubStringReplacer>> Make(
- const ReplaceSubstringOptions& options) {
- return arrow::internal::make_unique<PlainSubStringReplacer>(options);
- }
-
- explicit PlainSubStringReplacer(const ReplaceSubstringOptions& options)
- : options_(options) {}
-
- Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
- const char* i = s.begin();
- const char* end = s.end();
- int64_t max_replacements = options_.max_replacements;
- while ((i < end) && (max_replacements != 0)) {
- const char* pos =
- std::search(i, end, options_.pattern.begin(), options_.pattern.end());
- if (pos == end) {
- RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
- static_cast<int64_t>(end - i)));
- i = end;
- } else {
- // the string before the pattern
- RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
- static_cast<int64_t>(pos - i)));
- // the replacement
- RETURN_NOT_OK(
- builder->Append(reinterpret_cast<const uint8_t*>(options_.replacement.data()),
- options_.replacement.length()));
- // skip pattern
- i = pos + options_.pattern.length();
- max_replacements--;
- }
- }
- // if we exited early due to max_replacements, add the trailing part
- return builder->Append(reinterpret_cast<const uint8_t*>(i),
- static_cast<int64_t>(end - i));
- }
-};
-
-#ifdef ARROW_WITH_RE2
-struct RegexSubStringReplacer {
- const ReplaceSubstringOptions& options_;
- const RE2 regex_find_;
- const RE2 regex_replacement_;
-
- static Result<std::unique_ptr<RegexSubStringReplacer>> Make(
- const ReplaceSubstringOptions& options) {
- auto replacer = arrow::internal::make_unique<RegexSubStringReplacer>(options);
-
- RETURN_NOT_OK(RegexStatus(replacer->regex_find_));
- RETURN_NOT_OK(RegexStatus(replacer->regex_replacement_));
-
- std::string replacement_error;
- if (!replacer->regex_replacement_.CheckRewriteString(replacer->options_.replacement,
- &replacement_error)) {
- return Status::Invalid("Invalid replacement string: ",
- std::move(replacement_error));
- }
-
- return std::move(replacer);
- }
-
- // Using RE2::FindAndConsume we can only find the pattern if it is a group, therefore
- // we have 2 regexes, one with () around it, one without.
- explicit RegexSubStringReplacer(const ReplaceSubstringOptions& options)
- : options_(options),
- regex_find_("(" + options_.pattern + ")", RE2::Quiet),
- regex_replacement_(options_.pattern, RE2::Quiet) {}
-
- Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) const {
- re2::StringPiece replacement(options_.replacement);
-
- if (options_.max_replacements == -1) {
- std::string s_copy(s.to_string());
- re2::RE2::GlobalReplace(&s_copy, regex_replacement_, replacement);
- return builder->Append(reinterpret_cast<const uint8_t*>(s_copy.data()),
- s_copy.length());
- }
-
- // Since RE2 does not have the concept of max_replacements, we have to do some work
- // ourselves.
- // We might do this faster similar to RE2::GlobalReplace using Match and Rewrite
- const char* i = s.begin();
- const char* end = s.end();
- re2::StringPiece piece(s.data(), s.length());
-
- int64_t max_replacements = options_.max_replacements;
- while ((i < end) && (max_replacements != 0)) {
- std::string found;
- if (!re2::RE2::FindAndConsume(&piece, regex_find_, &found)) {
- RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
- static_cast<int64_t>(end - i)));
- i = end;
- } else {
- // wind back to the beginning of the match
- const char* pos = piece.begin() - found.length();
- // the string before the pattern
- RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
- static_cast<int64_t>(pos - i)));
- // replace the pattern in what we found
- if (!re2::RE2::Replace(&found, regex_replacement_, replacement)) {
- return Status::Invalid("Regex found, but replacement failed");
- }
- RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(found.data()),
- static_cast<int64_t>(found.length())));
- // skip pattern
- i = piece.begin();
- max_replacements--;
- }
- }
- // If we exited early due to max_replacements, add the trailing part
- return builder->Append(reinterpret_cast<const uint8_t*>(i),
- static_cast<int64_t>(end - i));
- }
-};
-#endif
-
-template <typename Type>
-using ReplaceSubStringPlain = ReplaceSubString<Type, PlainSubStringReplacer>;
-
-const FunctionDoc replace_substring_doc(
- "Replace non-overlapping substrings that match pattern by replacement",
- ("For each string in `strings`, replace non-overlapping substrings that match\n"
- "`pattern` by `replacement`. If `max_replacements != -1`, it determines the\n"
-     "maximum number of replacements made, counting from the left. Null values emit\n"
- "null."),
- {"strings"}, "ReplaceSubstringOptions");
-
-#ifdef ARROW_WITH_RE2
-template <typename Type>
-using ReplaceSubStringRegex = ReplaceSubString<Type, RegexSubStringReplacer>;
-
-const FunctionDoc replace_substring_regex_doc(
- "Replace non-overlapping substrings that match regex `pattern` by `replacement`",
- ("For each string in `strings`, replace non-overlapping substrings that match the\n"
- "regular expression `pattern` by `replacement` using the Google RE2 library.\n"
-     "If `max_replacements != -1`, it determines the maximum number of replacements\n"
-     "made, counting from the left. Note that if the pattern contains groups,\n"
-     "backreferencing can be used. Null values emit null."),
- {"strings"}, "ReplaceSubstringOptions");
-#endif
-
-// ----------------------------------------------------------------------
-// Replace slice
-
-struct ReplaceSliceTransformBase : public StringTransformBase {
- using State = OptionsWrapper<ReplaceSliceOptions>;
-
- const ReplaceSliceOptions* options;
-
- explicit ReplaceSliceTransformBase(const ReplaceSliceOptions& options)
- : options{&options} {}
-
- int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
- return ninputs * options->replacement.size() + input_ncodeunits;
- }
-};
-
-struct BinaryReplaceSliceTransform : ReplaceSliceTransformBase {
- using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const auto& opts = *options;
- int64_t before_slice = 0;
- int64_t after_slice = 0;
- uint8_t* output_start = output;
-
- if (opts.start >= 0) {
- // Count from left
- before_slice = std::min<int64_t>(input_string_ncodeunits, opts.start);
- } else {
- // Count from right
- before_slice = std::max<int64_t>(0, input_string_ncodeunits + opts.start);
- }
- // Mimic Pandas: if stop would be before start, treat as 0-length slice
- if (opts.stop >= 0) {
- // Count from left
- after_slice =
- std::min<int64_t>(input_string_ncodeunits, std::max(before_slice, opts.stop));
- } else {
- // Count from right
- after_slice = std::max<int64_t>(before_slice, input_string_ncodeunits + opts.stop);
- }
- output = std::copy(input, input + before_slice, output);
- output = std::copy(opts.replacement.begin(), opts.replacement.end(), output);
- output = std::copy(input + after_slice, input + input_string_ncodeunits, output);
- return output - output_start;
- }
-};
-
-struct Utf8ReplaceSliceTransform : ReplaceSliceTransformBase {
- using ReplaceSliceTransformBase::ReplaceSliceTransformBase;
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const auto& opts = *options;
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t *begin_sliced, *end_sliced;
- uint8_t* output_start = output;
-
- // Mimic Pandas: if stop would be before start, treat as 0-length slice
- if (opts.start >= 0) {
- // Count from left
- if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, opts.start)) {
- return kTransformError;
- }
-      if (opts.stop > opts.start) {
-        // Continue counting from left
-        const int64_t length = opts.stop - opts.start;
- if (!arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, length)) {
- return kTransformError;
- }
- } else if (opts.stop < 0) {
- // Count from right
- if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
- -opts.stop)) {
- return kTransformError;
- }
- } else {
- // Zero-length slice
- end_sliced = begin_sliced;
- }
- } else {
- // Count from right
- if (!arrow::util::UTF8AdvanceCodepointsReverse(begin, end, &begin_sliced,
- -opts.start)) {
- return kTransformError;
- }
- if (opts.stop >= 0) {
- // Restart counting from left
- if (!arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opts.stop)) {
- return kTransformError;
- }
- if (end_sliced <= begin_sliced) {
- // Zero-length slice
- end_sliced = begin_sliced;
- }
-      } else if ((opts.stop < 0) && (opts.stop > opts.start)) {
- // Count from right
- if (!arrow::util::UTF8AdvanceCodepointsReverse(begin_sliced, end, &end_sliced,
- -opts.stop)) {
- return kTransformError;
- }
- } else {
- // zero-length slice
- end_sliced = begin_sliced;
- }
- }
- output = std::copy(begin, begin_sliced, output);
-    output = std::copy(opts.replacement.begin(), opts.replacement.end(), output);
- output = std::copy(end_sliced, end, output);
- return output - output_start;
- }
-};
-
-template <typename Type>
-using BinaryReplaceSlice =
- StringTransformExecWithState<Type, BinaryReplaceSliceTransform>;
-template <typename Type>
-using Utf8ReplaceSlice = StringTransformExecWithState<Type, Utf8ReplaceSliceTransform>;
-
-const FunctionDoc binary_replace_slice_doc(
- "Replace a slice of a binary string with `replacement`",
-    ("For each string in `strings`, replace a slice of the string defined by `start`\n"
-     "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive,\n"
-     "and both are measured in bytes.\n"
- "Null values emit null."),
- {"strings"}, "ReplaceSliceOptions");
-
-const FunctionDoc utf8_replace_slice_doc(
- "Replace a slice of a string with `replacement`",
-    ("For each string in `strings`, replace a slice of the string defined by `start`\n"
-     "and `stop` with `replacement`. `start` is inclusive and `stop` is exclusive,\n"
-     "and both are measured in codepoints.\n"
- "Null values emit null."),
- {"strings"}, "ReplaceSliceOptions");
-
-void AddReplaceSlice(FunctionRegistry* registry) {
- {
- auto func = std::make_shared<ScalarFunction>("binary_replace_slice", Arity::Unary(),
- &binary_replace_slice_doc);
- for (const auto& ty : BaseBinaryTypes()) {
- DCHECK_OK(func->AddKernel({ty}, ty,
- GenerateTypeAgnosticVarBinaryBase<BinaryReplaceSlice>(ty),
- ReplaceSliceTransformBase::State::Init));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-
- {
- auto func = std::make_shared<ScalarFunction>("utf8_replace_slice", Arity::Unary(),
- &utf8_replace_slice_doc);
- DCHECK_OK(func->AddKernel({utf8()}, utf8(), Utf8ReplaceSlice<StringType>::Exec,
- ReplaceSliceTransformBase::State::Init));
- DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(),
- Utf8ReplaceSlice<LargeStringType>::Exec,
- ReplaceSliceTransformBase::State::Init));
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-}
-
-// ----------------------------------------------------------------------
-// Extract with regex
-
-#ifdef ARROW_WITH_RE2
-
-// TODO cache this once per ExtractRegexOptions
-struct ExtractRegexData {
- // Use unique_ptr<> because RE2 is non-movable
- std::unique_ptr<RE2> regex;
- std::vector<std::string> group_names;
-
- static Result<ExtractRegexData> Make(const ExtractRegexOptions& options) {
- ExtractRegexData data(options.pattern);
- RETURN_NOT_OK(RegexStatus(*data.regex));
-
- const int group_count = data.regex->NumberOfCapturingGroups();
- const auto& name_map = data.regex->CapturingGroupNames();
- data.group_names.reserve(group_count);
-
- for (int i = 0; i < group_count; i++) {
- auto item = name_map.find(i + 1); // re2 starts counting from 1
- if (item == name_map.end()) {
- // XXX should we instead just create fields with an empty name?
- return Status::Invalid("Regular expression contains unnamed groups");
- }
- data.group_names.emplace_back(item->second);
- }
- return std::move(data);
- }
-
- Result<ValueDescr> ResolveOutputType(const std::vector<ValueDescr>& args) const {
- const auto& input_type = args[0].type;
- if (input_type == nullptr) {
- // No input type specified => propagate shape
- return args[0];
- }
- // Input type is either String or LargeString and is also the type of each
- // field in the output struct type.
- DCHECK(input_type->id() == Type::STRING || input_type->id() == Type::LARGE_STRING);
- FieldVector fields;
- fields.reserve(group_names.size());
- std::transform(group_names.begin(), group_names.end(), std::back_inserter(fields),
- [&](const std::string& name) { return field(name, input_type); });
- return struct_(std::move(fields));
- }
-
- private:
- explicit ExtractRegexData(const std::string& pattern)
- : regex(new RE2(pattern, RE2::Quiet)) {}
-};
-
-Result<ValueDescr> ResolveExtractRegexOutput(KernelContext* ctx,
- const std::vector<ValueDescr>& args) {
- using State = OptionsWrapper<ExtractRegexOptions>;
- ExtractRegexOptions options = State::Get(ctx);
- ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
- return data.ResolveOutputType(args);
-}
-
-struct ExtractRegexBase {
- const ExtractRegexData& data;
- const int group_count;
- std::vector<re2::StringPiece> found_values;
- std::vector<re2::RE2::Arg> args;
- std::vector<const re2::RE2::Arg*> args_pointers;
- const re2::RE2::Arg** args_pointers_start;
- const re2::RE2::Arg* null_arg = nullptr;
-
- explicit ExtractRegexBase(const ExtractRegexData& data)
- : data(data),
- group_count(static_cast<int>(data.group_names.size())),
- found_values(group_count) {
- args.reserve(group_count);
- args_pointers.reserve(group_count);
-
- for (int i = 0; i < group_count; i++) {
- args.emplace_back(&found_values[i]);
- // Since we reserved capacity, we're guaranteed the pointer remains valid
- args_pointers.push_back(&args[i]);
- }
- // Avoid null pointer if there is no capture group
- args_pointers_start = (group_count > 0) ? args_pointers.data() : &null_arg;
- }
-
- bool Match(util::string_view s) {
- return re2::RE2::PartialMatchN(ToStringPiece(s), *data.regex, args_pointers_start,
- group_count);
- }
-};
-
-template <typename Type>
-struct ExtractRegex : public ExtractRegexBase {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- using ScalarType = typename TypeTraits<Type>::ScalarType;
- using BuilderType = typename TypeTraits<Type>::BuilderType;
- using State = OptionsWrapper<ExtractRegexOptions>;
-
- using ExtractRegexBase::ExtractRegexBase;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- ExtractRegexOptions options = State::Get(ctx);
- ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
- return ExtractRegex{data}.Extract(ctx, batch, out);
- }
-
- Status Extract(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- ARROW_ASSIGN_OR_RAISE(auto descr, data.ResolveOutputType(batch.GetDescriptors()));
- DCHECK_NE(descr.type, nullptr);
- const auto& type = descr.type;
-
- if (batch[0].kind() == Datum::ARRAY) {
- std::unique_ptr<ArrayBuilder> array_builder;
- RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), type, &array_builder));
- StructBuilder* struct_builder = checked_cast<StructBuilder*>(array_builder.get());
-
- std::vector<BuilderType*> field_builders;
- field_builders.reserve(group_count);
- for (int i = 0; i < group_count; i++) {
- field_builders.push_back(
- checked_cast<BuilderType*>(struct_builder->field_builder(i)));
- }
-
- auto visit_null = [&]() { return struct_builder->AppendNull(); };
- auto visit_value = [&](util::string_view s) {
- if (Match(s)) {
- for (int i = 0; i < group_count; i++) {
- RETURN_NOT_OK(field_builders[i]->Append(ToStringView(found_values[i])));
- }
- return struct_builder->Append();
- } else {
- return struct_builder->AppendNull();
- }
- };
- const ArrayData& input = *batch[0].array();
- RETURN_NOT_OK(VisitArrayDataInline<Type>(input, visit_value, visit_null));
-
- std::shared_ptr<Array> out_array;
- RETURN_NOT_OK(struct_builder->Finish(&out_array));
- *out = std::move(out_array);
- } else {
- const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
- auto result = std::make_shared<StructScalar>(type);
- if (input.is_valid && Match(util::string_view(*input.value))) {
- result->value.reserve(group_count);
- for (int i = 0; i < group_count; i++) {
- result->value.push_back(
- std::make_shared<ScalarType>(found_values[i].as_string()));
- }
- result->is_valid = true;
- } else {
- result->is_valid = false;
- }
- out->value = std::move(result);
- }
-
- return Status::OK();
- }
-};
-
-const FunctionDoc extract_regex_doc(
- "Extract substrings captured by a regex pattern",
- ("For each string in `strings`, match the regular expression and, if\n"
- "successful, emit a struct with field names and values coming from the\n"
- "regular expression's named capture groups. If the input is null or the\n"
- "regular expression fails matching, a null output value is emitted.\n"
- "\n"
- "Regular expression matching is done using the Google RE2 library."),
- {"strings"}, "ExtractRegexOptions");
-
-void AddExtractRegex(FunctionRegistry* registry) {
- auto func = std::make_shared<ScalarFunction>("extract_regex", Arity::Unary(),
- &extract_regex_doc);
- using t32 = ExtractRegex<StringType>;
- using t64 = ExtractRegex<LargeStringType>;
- OutputType out_ty(ResolveExtractRegexOutput);
- ScalarKernel kernel;
-
- // Null values will be computed based on regex match or not
- kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
- kernel.signature.reset(new KernelSignature({utf8()}, out_ty));
- kernel.exec = t32::Exec;
- kernel.init = t32::State::Init;
- DCHECK_OK(func->AddKernel(kernel));
- kernel.signature.reset(new KernelSignature({large_utf8()}, out_ty));
- kernel.exec = t64::Exec;
- kernel.init = t64::State::Init;
- DCHECK_OK(func->AddKernel(kernel));
-
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-#endif // ARROW_WITH_RE2
-
-// ----------------------------------------------------------------------
// strptime string parsing
using StrptimeState = OptionsWrapper<StrptimeOptions>;
@@ -2782,11 +2782,11 @@ struct ParseStrptime {
: parser(TimestampParser::MakeStrptime(options.format)), unit(options.unit) {}
template <typename... Ignored>
- int64_t Call(KernelContext*, util::string_view val, Status* st) const {
+ int64_t Call(KernelContext*, util::string_view val, Status* st) const {
int64_t result = 0;
if (!(*parser)(val.data(), val.size(), unit, &result)) {
- *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
- TimestampType(unit).ToString());
+ *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
+ TimestampType(unit).ToString());
}
return result;
}
@@ -2796,7 +2796,7 @@ struct ParseStrptime {
};
template <typename InputType>
-Status StrptimeExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status StrptimeExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
applicator::ScalarUnaryNotNullStateful<TimestampType, InputType, ParseStrptime> kernel{
ParseStrptime(StrptimeState::Get(ctx))};
return kernel.Exec(ctx, batch, out);
@@ -2810,471 +2810,471 @@ Result<ValueDescr> StrptimeResolve(KernelContext* ctx, const std::vector<ValueDe
return Status::Invalid("strptime does not provide default StrptimeOptions");
}
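+
+// A minimal usage sketch for strptime (assumes <arrow/compute/api.h>;
+// `strings` and `opts` are illustrative names):
+//
+//   arrow::compute::StrptimeOptions opts("%Y-%m-%d", arrow::TimeUnit::SECOND);
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum out,
+//       arrow::compute::CallFunction("strptime", {strings}, &opts));
+//   // "2021-06-01" parses to a timestamp(s) scalar; a non-null string that
+//   // fails to parse raises Invalid (see ParseStrptime::Call above).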
-// ----------------------------------------------------------------------
-// string padding
-
-template <bool PadLeft, bool PadRight>
-struct AsciiPadTransform : public StringTransformBase {
- using State = OptionsWrapper<PadOptions>;
-
- const PadOptions& options_;
-
- explicit AsciiPadTransform(const PadOptions& options) : options_(options) {}
-
- Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
- if (options_.padding.size() != 1) {
- return Status::Invalid("Padding must be one byte, got '", options_.padding, "'");
- }
- return Status::OK();
- }
-
- int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
- // This is likely very overallocated but hard to do better without
- // actually looking at each string (because of strings that may be
- // longer than the given width)
- return input_ncodeunits + ninputs * options_.width;
- }
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- if (input_string_ncodeunits >= options_.width) {
- std::copy(input, input + input_string_ncodeunits, output);
- return input_string_ncodeunits;
- }
- const int64_t spaces = options_.width - input_string_ncodeunits;
- int64_t left = 0;
- int64_t right = 0;
- if (PadLeft && PadRight) {
- // If odd number of spaces, put the extra space on the right
- left = spaces / 2;
- right = spaces - left;
- } else if (PadLeft) {
- left = spaces;
- } else if (PadRight) {
- right = spaces;
- } else {
- DCHECK(false) << "unreachable";
- return 0;
- }
- std::fill(output, output + left, options_.padding[0]);
- output += left;
- output = std::copy(input, input + input_string_ncodeunits, output);
- std::fill(output, output + right, options_.padding[0]);
- return options_.width;
- }
-};
-
-template <bool PadLeft, bool PadRight>
-struct Utf8PadTransform : public StringTransformBase {
- using State = OptionsWrapper<PadOptions>;
-
- const PadOptions& options_;
-
- explicit Utf8PadTransform(const PadOptions& options) : options_(options) {}
-
- Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
- auto str = reinterpret_cast<const uint8_t*>(options_.padding.data());
- auto strlen = options_.padding.size();
- if (util::UTF8Length(str, str + strlen) != 1) {
- return Status::Invalid("Padding must be one codepoint, got '", options_.padding,
- "'");
- }
- return Status::OK();
- }
-
- int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
- // This is likely very overallocated but hard to do better without
- // actually looking at each string (because of strings that may be
- // longer than the given width)
- // One codepoint may be up to 4 bytes
- return input_ncodeunits + 4 * ninputs * options_.width;
- }
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const int64_t input_width = util::UTF8Length(input, input + input_string_ncodeunits);
- if (input_width >= options_.width) {
- std::copy(input, input + input_string_ncodeunits, output);
- return input_string_ncodeunits;
- }
- const int64_t spaces = options_.width - input_width;
- int64_t left = 0;
- int64_t right = 0;
- if (PadLeft && PadRight) {
- // If odd number of spaces, put the extra space on the right
- left = spaces / 2;
- right = spaces - left;
- } else if (PadLeft) {
- left = spaces;
- } else if (PadRight) {
- right = spaces;
- } else {
- DCHECK(false) << "unreachable";
- return 0;
- }
- uint8_t* start = output;
- while (left) {
- output = std::copy(options_.padding.begin(), options_.padding.end(), output);
- left--;
- }
- output = std::copy(input, input + input_string_ncodeunits, output);
- while (right) {
- output = std::copy(options_.padding.begin(), options_.padding.end(), output);
- right--;
- }
- return output - start;
- }
-};
-
-template <typename Type>
-using AsciiLPad = StringTransformExecWithState<Type, AsciiPadTransform<true, false>>;
-template <typename Type>
-using AsciiRPad = StringTransformExecWithState<Type, AsciiPadTransform<false, true>>;
-template <typename Type>
-using AsciiCenter = StringTransformExecWithState<Type, AsciiPadTransform<true, true>>;
-template <typename Type>
-using Utf8LPad = StringTransformExecWithState<Type, Utf8PadTransform<true, false>>;
-template <typename Type>
-using Utf8RPad = StringTransformExecWithState<Type, Utf8PadTransform<false, true>>;
-template <typename Type>
-using Utf8Center = StringTransformExecWithState<Type, Utf8PadTransform<true, true>>;
-
-// ----------------------------------------------------------------------
-// string trimming
-
-#ifdef ARROW_WITH_UTF8PROC
-
-template <bool TrimLeft, bool TrimRight>
-struct UTF8TrimWhitespaceTransform : public StringTransformBase {
- Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
- EnsureLookupTablesFilled();
- return Status::OK();
- }
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t* end_trimmed = end;
- const uint8_t* begin_trimmed = begin;
-
- auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
- if (TrimLeft && !ARROW_PREDICT_TRUE(
- arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
- return kTransformError;
- }
- if (TrimRight && begin_trimmed < end) {
- if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
- predicate, &end_trimmed))) {
- return kTransformError;
- }
- }
- std::copy(begin_trimmed, end_trimmed, output);
- return end_trimmed - begin_trimmed;
- }
-};
-
-template <typename Type>
-using UTF8TrimWhitespace =
- StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, true>>;
-
-template <typename Type>
-using UTF8LTrimWhitespace =
- StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, false>>;
-
-template <typename Type>
-using UTF8RTrimWhitespace =
- StringTransformExec<Type, UTF8TrimWhitespaceTransform<false, true>>;
-
-struct UTF8TrimState {
- TrimOptions options_;
- std::vector<bool> codepoints_;
- Status status_ = Status::OK();
-
- explicit UTF8TrimState(KernelContext* ctx, TrimOptions options)
- : options_(std::move(options)) {
- if (!ARROW_PREDICT_TRUE(
- arrow::util::UTF8ForEach(options_.characters, [&](uint32_t c) {
- codepoints_.resize(
- std::max(c + 1, static_cast<uint32_t>(codepoints_.size())));
- codepoints_.at(c) = true;
- }))) {
- status_ = Status::Invalid("Invalid UTF8 sequence in input");
- }
- }
-};
-
-template <bool TrimLeft, bool TrimRight>
-struct UTF8TrimTransform : public StringTransformBase {
- using State = KernelStateFromFunctionOptions<UTF8TrimState, TrimOptions>;
-
- const UTF8TrimState& state_;
-
- explicit UTF8TrimTransform(const UTF8TrimState& state) : state_(state) {}
-
- Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
- return state_.status_;
- }
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t* end_trimmed = end;
- const uint8_t* begin_trimmed = begin;
-
- auto predicate = [&](uint32_t c) { return !state_.codepoints_[c]; };
- if (TrimLeft && !ARROW_PREDICT_TRUE(
- arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
- return kTransformError;
- }
- if (TrimRight && begin_trimmed < end) {
- if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
- predicate, &end_trimmed))) {
- return kTransformError;
- }
- }
- std::copy(begin_trimmed, end_trimmed, output);
- return end_trimmed - begin_trimmed;
- }
-};
-
-template <typename Type>
-using UTF8Trim = StringTransformExecWithState<Type, UTF8TrimTransform<true, true>>;
-
-template <typename Type>
-using UTF8LTrim = StringTransformExecWithState<Type, UTF8TrimTransform<true, false>>;
-
-template <typename Type>
-using UTF8RTrim = StringTransformExecWithState<Type, UTF8TrimTransform<false, true>>;
-
-#endif
-
-template <bool TrimLeft, bool TrimRight>
-struct AsciiTrimWhitespaceTransform : public StringTransformBase {
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t* end_trimmed = end;
- const uint8_t* begin_trimmed = begin;
-
- auto predicate = [](unsigned char c) { return !IsSpaceCharacterAscii(c); };
- if (TrimLeft) {
- begin_trimmed = std::find_if(begin, end, predicate);
- }
- if (TrimRight && begin_trimmed < end) {
- std::reverse_iterator<const uint8_t*> rbegin(end);
- std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
- end_trimmed = std::find_if(rbegin, rend, predicate).base();
- }
- std::copy(begin_trimmed, end_trimmed, output);
- return end_trimmed - begin_trimmed;
- }
-};
-
-template <typename Type>
-using AsciiTrimWhitespace =
- StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, true>>;
-
-template <typename Type>
-using AsciiLTrimWhitespace =
- StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, false>>;
-
-template <typename Type>
-using AsciiRTrimWhitespace =
- StringTransformExec<Type, AsciiTrimWhitespaceTransform<false, true>>;
-
-struct AsciiTrimState {
- TrimOptions options_;
- std::vector<bool> characters_;
-
- explicit AsciiTrimState(KernelContext* ctx, TrimOptions options)
- : options_(std::move(options)), characters_(256) {
- for (const auto c : options_.characters) {
- characters_[static_cast<unsigned char>(c)] = true;
- }
- }
-};
-
-template <bool TrimLeft, bool TrimRight>
-struct AsciiTrimTransform : public StringTransformBase {
- using State = KernelStateFromFunctionOptions<AsciiTrimState, TrimOptions>;
-
- const AsciiTrimState& state_;
-
- explicit AsciiTrimTransform(const AsciiTrimState& state) : state_(state) {}
-
- int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
- uint8_t* output) {
- const uint8_t* begin = input;
- const uint8_t* end = input + input_string_ncodeunits;
- const uint8_t* end_trimmed = end;
- const uint8_t* begin_trimmed = begin;
-
- auto predicate = [&](uint8_t c) { return !state_.characters_[c]; };
- if (TrimLeft) {
- begin_trimmed = std::find_if(begin, end, predicate);
- }
- if (TrimRight && begin_trimmed < end) {
- std::reverse_iterator<const uint8_t*> rbegin(end);
- std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
- end_trimmed = std::find_if(rbegin, rend, predicate).base();
- }
- std::copy(begin_trimmed, end_trimmed, output);
- return end_trimmed - begin_trimmed;
- }
-};
-
-template <typename Type>
-using AsciiTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, true>>;
-
-template <typename Type>
-using AsciiLTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, false>>;
-
-template <typename Type>
-using AsciiRTrim = StringTransformExecWithState<Type, AsciiTrimTransform<false, true>>;
-
-const FunctionDoc utf8_center_doc(
- "Center strings by padding with a given character",
-    ("For each string in `strings`, emit a centered string by padding both sides\n"
-     "with the given UTF8 codepoint.\nNull values emit null."),
- {"strings"}, "PadOptions");
-
-const FunctionDoc utf8_lpad_doc(
- "Right-align strings by padding with a given character",
-    ("For each string in `strings`, emit a right-aligned string by prepending\n"
-     "the given UTF8 codepoint.\nNull values emit null."),
- {"strings"}, "PadOptions");
-
-const FunctionDoc utf8_rpad_doc(
- "Left-align strings by padding with a given character",
-    ("For each string in `strings`, emit a left-aligned string by appending\n"
-     "the given UTF8 codepoint.\nNull values emit null."),
- {"strings"}, "PadOptions");
-
-const FunctionDoc ascii_center_doc(
-    utf8_center_doc.summary + "",
- ("For each string in `strings`, emit a centered string by padding both sides \n"
- "with the given ASCII character.\nNull values emit null."),
- {"strings"}, "PadOptions");
-
-const FunctionDoc ascii_lpad_doc(
-    utf8_lpad_doc.summary + "",
- ("For each string in `strings`, emit a right-aligned string by prepending \n"
- "the given ASCII character.\nNull values emit null."),
- {"strings"}, "PadOptions");
-
-const FunctionDoc ascii_rpad_doc(
-    utf8_rpad_doc.summary + "",
- ("For each string in `strings`, emit a left-aligned string by appending \n"
- "the given ASCII character.\nNull values emit null."),
- {"strings"}, "PadOptions");
-
-const FunctionDoc utf8_trim_whitespace_doc(
- "Trim leading and trailing whitespace characters",
- ("For each string in `strings`, emit a string with leading and trailing whitespace\n"
- "characters removed, where whitespace characters are defined by the Unicode\n"
- "standard. Null values emit null."),
- {"strings"});
-
-const FunctionDoc utf8_ltrim_whitespace_doc(
- "Trim leading whitespace characters",
- ("For each string in `strings`, emit a string with leading whitespace\n"
- "characters removed, where whitespace characters are defined by the Unicode\n"
- "standard. Null values emit null."),
- {"strings"});
-
-const FunctionDoc utf8_rtrim_whitespace_doc(
- "Trim trailing whitespace characters",
- ("For each string in `strings`, emit a string with trailing whitespace\n"
- "characters removed, where whitespace characters are defined by the Unicode\n"
- "standard. Null values emit null."),
- {"strings"});
-
-const FunctionDoc ascii_trim_whitespace_doc(
- "Trim leading and trailing ASCII whitespace characters",
- ("For each string in `strings`, emit a string with leading and trailing ASCII\n"
- "whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode\n"
- "whitespace characters. Null values emit null."),
- {"strings"});
-
-const FunctionDoc ascii_ltrim_whitespace_doc(
- "Trim leading ASCII whitespace characters",
- ("For each string in `strings`, emit a string with leading ASCII whitespace\n"
- "characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode\n"
- "whitespace characters. Null values emit null."),
- {"strings"});
-
-const FunctionDoc ascii_rtrim_whitespace_doc(
- "Trim trailing ASCII whitespace characters",
- ("For each string in `strings`, emit a string with trailing ASCII whitespace\n"
- "characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode\n"
- "whitespace characters. Null values emit null."),
- {"strings"});
-
-const FunctionDoc utf8_trim_doc(
- "Trim leading and trailing characters present in the `characters` arguments",
- ("For each string in `strings`, emit a string with leading and trailing\n"
- "characters removed that are present in the `characters` argument. Null values\n"
- "emit null."),
- {"strings"}, "TrimOptions");
-
-const FunctionDoc utf8_ltrim_doc(
- "Trim leading characters present in the `characters` arguments",
- ("For each string in `strings`, emit a string with leading\n"
- "characters removed that are present in the `characters` argument. Null values\n"
- "emit null."),
- {"strings"}, "TrimOptions");
-
-const FunctionDoc utf8_rtrim_doc(
- "Trim trailing characters present in the `characters` arguments",
- ("For each string in `strings`, emit a string with leading "
- "characters removed that are present in the `characters` argument. Null values\n"
- "emit null."),
- {"strings"}, "TrimOptions");
-
-const FunctionDoc ascii_trim_doc(
- utf8_trim_doc.summary + "",
- utf8_trim_doc.description +
- ("\nBoth the input string as the `characters` argument are interepreted as\n"
- "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
- {"strings"}, "TrimOptions");
-
-const FunctionDoc ascii_ltrim_doc(
- utf8_ltrim_doc.summary + "",
- utf8_ltrim_doc.description +
- ("\nBoth the input string as the `characters` argument are interepreted as\n"
- "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
- {"strings"}, "TrimOptions");
-
-const FunctionDoc ascii_rtrim_doc(
- utf8_rtrim_doc.summary + "",
- utf8_rtrim_doc.description +
- ("\nBoth the input string as the `characters` argument are interepreted as\n"
- "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
- {"strings"}, "TrimOptions");
-
-const FunctionDoc strptime_doc(
- "Parse timestamps",
- ("For each string in `strings`, parse it as a timestamp.\n"
- "The timestamp unit and the expected string pattern must be given\n"
- "in StrptimeOptions. Null inputs emit null. If a non-null string\n"
- "fails parsing, an error is returned."),
- {"strings"}, "StrptimeOptions");
-
-const FunctionDoc binary_length_doc(
- "Compute string lengths",
- ("For each string in `strings`, emit the number of bytes. Null values emit null."),
- {"strings"});
-
-const FunctionDoc utf8_length_doc("Compute UTF8 string lengths",
- ("For each string in `strings`, emit the number of "
- "UTF8 characters. Null values emit null."),
- {"strings"});
-
+// ----------------------------------------------------------------------
+// string padding
+
+template <bool PadLeft, bool PadRight>
+struct AsciiPadTransform : public StringTransformBase {
+ using State = OptionsWrapper<PadOptions>;
+
+ const PadOptions& options_;
+
+ explicit AsciiPadTransform(const PadOptions& options) : options_(options) {}
+
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ if (options_.padding.size() != 1) {
+ return Status::Invalid("Padding must be one byte, got '", options_.padding, "'");
+ }
+ return Status::OK();
+ }
+
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ // This is likely very overallocated but hard to do better without
+ // actually looking at each string (because of strings that may be
+ // longer than the given width)
+ return input_ncodeunits + ninputs * options_.width;
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ if (input_string_ncodeunits >= options_.width) {
+ std::copy(input, input + input_string_ncodeunits, output);
+ return input_string_ncodeunits;
+ }
+ const int64_t spaces = options_.width - input_string_ncodeunits;
+ int64_t left = 0;
+ int64_t right = 0;
+ if (PadLeft && PadRight) {
+ // If odd number of spaces, put the extra space on the right
+ left = spaces / 2;
+ right = spaces - left;
+ } else if (PadLeft) {
+ left = spaces;
+ } else if (PadRight) {
+ right = spaces;
+ } else {
+ DCHECK(false) << "unreachable";
+ return 0;
+ }
+ std::fill(output, output + left, options_.padding[0]);
+ output += left;
+ output = std::copy(input, input + input_string_ncodeunits, output);
+ std::fill(output, output + right, options_.padding[0]);
+ return options_.width;
+ }
+};
+
+template <bool PadLeft, bool PadRight>
+struct Utf8PadTransform : public StringTransformBase {
+ using State = OptionsWrapper<PadOptions>;
+
+ const PadOptions& options_;
+
+ explicit Utf8PadTransform(const PadOptions& options) : options_(options) {}
+
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ auto str = reinterpret_cast<const uint8_t*>(options_.padding.data());
+ auto strlen = options_.padding.size();
+ if (util::UTF8Length(str, str + strlen) != 1) {
+ return Status::Invalid("Padding must be one codepoint, got '", options_.padding,
+ "'");
+ }
+ return Status::OK();
+ }
+
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ // This is likely very overallocated but hard to do better without
+ // actually looking at each string (because of strings that may be
+ // longer than the given width)
+ // One codepoint may be up to 4 bytes
+ return input_ncodeunits + 4 * ninputs * options_.width;
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const int64_t input_width = util::UTF8Length(input, input + input_string_ncodeunits);
+ if (input_width >= options_.width) {
+ std::copy(input, input + input_string_ncodeunits, output);
+ return input_string_ncodeunits;
+ }
+ const int64_t spaces = options_.width - input_width;
+ int64_t left = 0;
+ int64_t right = 0;
+ if (PadLeft && PadRight) {
+ // If odd number of spaces, put the extra space on the right
+ left = spaces / 2;
+ right = spaces - left;
+ } else if (PadLeft) {
+ left = spaces;
+ } else if (PadRight) {
+ right = spaces;
+ } else {
+ DCHECK(false) << "unreachable";
+ return 0;
+ }
+ uint8_t* start = output;
+ while (left) {
+ output = std::copy(options_.padding.begin(), options_.padding.end(), output);
+ left--;
+ }
+ output = std::copy(input, input + input_string_ncodeunits, output);
+ while (right) {
+ output = std::copy(options_.padding.begin(), options_.padding.end(), output);
+ right--;
+ }
+ return output - start;
+ }
+};
+
+template <typename Type>
+using AsciiLPad = StringTransformExecWithState<Type, AsciiPadTransform<true, false>>;
+template <typename Type>
+using AsciiRPad = StringTransformExecWithState<Type, AsciiPadTransform<false, true>>;
+template <typename Type>
+using AsciiCenter = StringTransformExecWithState<Type, AsciiPadTransform<true, true>>;
+template <typename Type>
+using Utf8LPad = StringTransformExecWithState<Type, Utf8PadTransform<true, false>>;
+template <typename Type>
+using Utf8RPad = StringTransformExecWithState<Type, Utf8PadTransform<false, true>>;
+template <typename Type>
+using Utf8Center = StringTransformExecWithState<Type, Utf8PadTransform<true, true>>;
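+
+// Illustrative usage sketch (an assumption for exposition, not part of this
+// file): with the public compute API from arrow/compute/api.h and a Datum
+// `input` holding a utf8 array, these kernels are reachable by name:
+//
+//   arrow::compute::PadOptions options(/*width=*/5, /*padding=*/"*");
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum padded,
+//       arrow::compute::CallFunction("utf8_center", {input}, &options));
+//   // "ab" -> "*ab**": spaces = 5 - 2 = 3, left = spaces / 2 = 1, right = 2,
+//   // so the extra padding codepoint lands on the right.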
+
+// ----------------------------------------------------------------------
+// string trimming
+
+#ifdef ARROW_WITH_UTF8PROC
+
+template <bool TrimLeft, bool TrimRight>
+struct UTF8TrimWhitespaceTransform : public StringTransformBase {
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ EnsureLookupTablesFilled();
+ return Status::OK();
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* end_trimmed = end;
+ const uint8_t* begin_trimmed = begin;
+
+ auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
+ if (TrimLeft && !ARROW_PREDICT_TRUE(
+ arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
+ return kTransformError;
+ }
+ if (TrimRight && begin_trimmed < end) {
+ if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
+ predicate, &end_trimmed))) {
+ return kTransformError;
+ }
+ }
+ std::copy(begin_trimmed, end_trimmed, output);
+ return end_trimmed - begin_trimmed;
+ }
+};
+
+template <typename Type>
+using UTF8TrimWhitespace =
+ StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, true>>;
+
+template <typename Type>
+using UTF8LTrimWhitespace =
+ StringTransformExec<Type, UTF8TrimWhitespaceTransform<true, false>>;
+
+template <typename Type>
+using UTF8RTrimWhitespace =
+ StringTransformExec<Type, UTF8TrimWhitespaceTransform<false, true>>;
+
+struct UTF8TrimState {
+ TrimOptions options_;
+ std::vector<bool> codepoints_;
+ Status status_ = Status::OK();
+
+ explicit UTF8TrimState(KernelContext* ctx, TrimOptions options)
+ : options_(std::move(options)) {
+ if (!ARROW_PREDICT_TRUE(
+ arrow::util::UTF8ForEach(options_.characters, [&](uint32_t c) {
+ codepoints_.resize(
+ std::max(c + 1, static_cast<uint32_t>(codepoints_.size())));
+ codepoints_.at(c) = true;
+ }))) {
+ status_ = Status::Invalid("Invalid UTF8 sequence in input");
+ }
+ }
+};
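+
+// Design note: codepoints_ is a bitmap sized to the largest codepoint listed
+// in TrimOptions::characters; membership tests in the transform below are a
+// single indexed load, with out-of-range codepoints treated as not trimmed.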
+
+template <bool TrimLeft, bool TrimRight>
+struct UTF8TrimTransform : public StringTransformBase {
+ using State = KernelStateFromFunctionOptions<UTF8TrimState, TrimOptions>;
+
+ const UTF8TrimState& state_;
+
+ explicit UTF8TrimTransform(const UTF8TrimState& state) : state_(state) {}
+
+ Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) override {
+ return state_.status_;
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* end_trimmed = end;
+ const uint8_t* begin_trimmed = begin;
+
+ auto predicate = [&](uint32_t c) { return !state_.codepoints_[c]; };
+ if (TrimLeft && !ARROW_PREDICT_TRUE(
+ arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
+ return kTransformError;
+ }
+ if (TrimRight && begin_trimmed < end) {
+ if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
+ predicate, &end_trimmed))) {
+ return kTransformError;
+ }
+ }
+ std::copy(begin_trimmed, end_trimmed, output);
+ return end_trimmed - begin_trimmed;
+ }
+};
+
+template <typename Type>
+using UTF8Trim = StringTransformExecWithState<Type, UTF8TrimTransform<true, true>>;
+
+template <typename Type>
+using UTF8LTrim = StringTransformExecWithState<Type, UTF8TrimTransform<true, false>>;
+
+template <typename Type>
+using UTF8RTrim = StringTransformExecWithState<Type, UTF8TrimTransform<false, true>>;
+
+#endif
+
+template <bool TrimLeft, bool TrimRight>
+struct AsciiTrimWhitespaceTransform : public StringTransformBase {
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* end_trimmed = end;
+ const uint8_t* begin_trimmed = begin;
+
+ auto predicate = [](unsigned char c) { return !IsSpaceCharacterAscii(c); };
+ if (TrimLeft) {
+ begin_trimmed = std::find_if(begin, end, predicate);
+ }
+ if (TrimRight && begin_trimmed < end) {
+ std::reverse_iterator<const uint8_t*> rbegin(end);
+ std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
+ end_trimmed = std::find_if(rbegin, rend, predicate).base();
+ }
+ std::copy(begin_trimmed, end_trimmed, output);
+ return end_trimmed - begin_trimmed;
+ }
+};
+
+template <typename Type>
+using AsciiTrimWhitespace =
+ StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, true>>;
+
+template <typename Type>
+using AsciiLTrimWhitespace =
+ StringTransformExec<Type, AsciiTrimWhitespaceTransform<true, false>>;
+
+template <typename Type>
+using AsciiRTrimWhitespace =
+ StringTransformExec<Type, AsciiTrimWhitespaceTransform<false, true>>;
+
+struct AsciiTrimState {
+ TrimOptions options_;
+ std::vector<bool> characters_;
+
+ explicit AsciiTrimState(KernelContext* ctx, TrimOptions options)
+ : options_(std::move(options)), characters_(256) {
+ for (const auto c : options_.characters) {
+ characters_[static_cast<unsigned char>(c)] = true;
+ }
+ }
+};
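+
+// Design note: characters_ is a 256-entry lookup table indexed by byte value,
+// so the trim predicate below costs one table load per input byte.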
+
+template <bool TrimLeft, bool TrimRight>
+struct AsciiTrimTransform : public StringTransformBase {
+ using State = KernelStateFromFunctionOptions<AsciiTrimState, TrimOptions>;
+
+ const AsciiTrimState& state_;
+
+ explicit AsciiTrimTransform(const AsciiTrimState& state) : state_(state) {}
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const uint8_t* begin = input;
+ const uint8_t* end = input + input_string_ncodeunits;
+ const uint8_t* end_trimmed = end;
+ const uint8_t* begin_trimmed = begin;
+
+ auto predicate = [&](uint8_t c) { return !state_.characters_[c]; };
+ if (TrimLeft) {
+ begin_trimmed = std::find_if(begin, end, predicate);
+ }
+ if (TrimRight && begin_trimmed < end) {
+ std::reverse_iterator<const uint8_t*> rbegin(end);
+ std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
+ end_trimmed = std::find_if(rbegin, rend, predicate).base();
+ }
+ std::copy(begin_trimmed, end_trimmed, output);
+ return end_trimmed - begin_trimmed;
+ }
+};
+
+template <typename Type>
+using AsciiTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, true>>;
+
+template <typename Type>
+using AsciiLTrim = StringTransformExecWithState<Type, AsciiTrimTransform<true, false>>;
+
+template <typename Type>
+using AsciiRTrim = StringTransformExecWithState<Type, AsciiTrimTransform<false, true>>;
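+
+// Illustrative usage sketch (an assumption for exposition): the characters to
+// strip come in through TrimOptions, e.g.
+//
+//   arrow::compute::TrimOptions options("xy");
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum trimmed,
+//       arrow::compute::CallFunction("ascii_trim", {input}, &options));
+//   // "xxhixyx" -> "hi": every leading/trailing byte contained in "xy" is
+//   // removed; interior occurrences are untouched.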
+
+const FunctionDoc utf8_center_doc(
+ "Center strings by padding with a given character",
+ ("For each string in `strings`, emit a centered string by padding both sides \n"
+ "with the given UTF8 codeunit.\nNull values emit null."),
+ {"strings"}, "PadOptions");
+
+const FunctionDoc utf8_lpad_doc(
+ "Right-align strings by padding with a given character",
+ ("For each string in `strings`, emit a right-aligned string by prepending \n"
+ "the given UTF8 codeunit.\nNull values emit null."),
+ {"strings"}, "PadOptions");
+
+const FunctionDoc utf8_rpad_doc(
+ "Left-align strings by padding with a given character",
+ ("For each string in `strings`, emit a left-aligned string by appending \n"
+ "the given UTF8 codeunit.\nNull values emit null."),
+ {"strings"}, "PadOptions");
+
+const FunctionDoc ascii_center_doc(
+ utf8_center_doc.summary + "",
+ ("For each string in `strings`, emit a centered string by padding both sides\n"
+ "with the given ASCII character.\nNull values emit null."),
+ {"strings"}, "PadOptions");
+
+const FunctionDoc ascii_lpad_doc(
+ utf8_lpad_doc.summary + "",
+ ("For each string in `strings`, emit a right-aligned string by prepending\n"
+ "the given ASCII character.\nNull values emit null."),
+ {"strings"}, "PadOptions");
+
+const FunctionDoc ascii_rpad_doc(
+ utf8_rpad_doc.summary + "",
+ ("For each string in `strings`, emit a left-aligned string by appending\n"
+ "the given ASCII character.\nNull values emit null."),
+ {"strings"}, "PadOptions");
+
+const FunctionDoc utf8_trim_whitespace_doc(
+ "Trim leading and trailing whitespace characters",
+ ("For each string in `strings`, emit a string with leading and trailing whitespace\n"
+ "characters removed, where whitespace characters are defined by the Unicode\n"
+ "standard. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc utf8_ltrim_whitespace_doc(
+ "Trim leading whitespace characters",
+ ("For each string in `strings`, emit a string with leading whitespace\n"
+ "characters removed, where whitespace characters are defined by the Unicode\n"
+ "standard. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc utf8_rtrim_whitespace_doc(
+ "Trim trailing whitespace characters",
+ ("For each string in `strings`, emit a string with trailing whitespace\n"
+ "characters removed, where whitespace characters are defined by the Unicode\n"
+ "standard. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc ascii_trim_whitespace_doc(
+ "Trim leading and trailing ASCII whitespace characters",
+ ("For each string in `strings`, emit a string with leading and trailing ASCII\n"
+ "whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode\n"
+ "whitespace characters. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc ascii_ltrim_whitespace_doc(
+ "Trim leading ASCII whitespace characters",
+ ("For each string in `strings`, emit a string with leading ASCII whitespace\n"
+ "characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode\n"
+ "whitespace characters. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc ascii_rtrim_whitespace_doc(
+ "Trim trailing ASCII whitespace characters",
+ ("For each string in `strings`, emit a string with trailing ASCII whitespace\n"
+ "characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode\n"
+ "whitespace characters. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc utf8_trim_doc(
+ "Trim leading and trailing characters present in the `characters` arguments",
+ ("For each string in `strings`, emit a string with leading and trailing\n"
+ "characters removed that are present in the `characters` argument. Null values\n"
+ "emit null."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc utf8_ltrim_doc(
+ "Trim leading characters present in the `characters` arguments",
+ ("For each string in `strings`, emit a string with leading\n"
+ "characters removed that are present in the `characters` argument. Null values\n"
+ "emit null."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc utf8_rtrim_doc(
+ "Trim trailing characters present in the `characters` arguments",
+ ("For each string in `strings`, emit a string with leading "
+ "characters removed that are present in the `characters` argument. Null values\n"
+ "emit null."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc ascii_trim_doc(
+ utf8_trim_doc.summary + "",
+ utf8_trim_doc.description +
+ ("\nBoth the input string as the `characters` argument are interepreted as\n"
+ "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc ascii_ltrim_doc(
+ utf8_ltrim_doc.summary + "",
+ utf8_ltrim_doc.description +
+ ("\nBoth the input string as the `characters` argument are interepreted as\n"
+ "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc ascii_rtrim_doc(
+ utf8_rtrim_doc.summary + "",
+ utf8_rtrim_doc.description +
+ ("\nBoth the input string as the `characters` argument are interepreted as\n"
+ "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
+ {"strings"}, "TrimOptions");
+
+const FunctionDoc strptime_doc(
+ "Parse timestamps",
+ ("For each string in `strings`, parse it as a timestamp.\n"
+ "The timestamp unit and the expected string pattern must be given\n"
+ "in StrptimeOptions. Null inputs emit null. If a non-null string\n"
+ "fails parsing, an error is returned."),
+ {"strings"}, "StrptimeOptions");
+
+const FunctionDoc binary_length_doc(
+ "Compute string lengths",
+ ("For each string in `strings`, emit the number of bytes. Null values emit null."),
+ {"strings"});
+
+const FunctionDoc utf8_length_doc("Compute UTF8 string lengths",
+ ("For each string in `strings`, emit the number of "
+ "UTF8 characters. Null values emit null."),
+ {"strings"});
+
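+// Illustrative usage sketch (an assumption for exposition): the pattern and
+// unit are mandatory and are passed through StrptimeOptions, e.g.
+//
+//   arrow::compute::StrptimeOptions options("%Y-%m-%d",
+//                                           arrow::TimeUnit::SECOND);
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum ts,
+//       arrow::compute::CallFunction("strptime", {input}, &options));
+//   // "2021-05-01" parses to a timestamp(s) value; a malformed non-null
+//   // input raises an error instead of emitting null.
+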
void AddStrptime(FunctionRegistry* registry) {
- auto func = std::make_shared<ScalarFunction>("strptime", Arity::Unary(), &strptime_doc);
+ auto func = std::make_shared<ScalarFunction>("strptime", Arity::Unary(), &strptime_doc);
DCHECK_OK(func->AddKernel({utf8()}, OutputType(StrptimeResolve),
StrptimeExec<StringType>, StrptimeState::Init));
DCHECK_OK(func->AddKernel({large_utf8()}, OutputType(StrptimeResolve),
@@ -3283,8 +3283,8 @@ void AddStrptime(FunctionRegistry* registry) {
}
void AddBinaryLength(FunctionRegistry* registry) {
- auto func = std::make_shared<ScalarFunction>("binary_length", Arity::Unary(),
- &binary_length_doc);
+ auto func = std::make_shared<ScalarFunction>("binary_length", Arity::Unary(),
+ &binary_length_doc);
ArrayKernelExec exec_offset_32 =
applicator::ScalarUnaryNotNull<Int32Type, StringType, BinaryLength>::Exec;
ArrayKernelExec exec_offset_64 =
@@ -3298,575 +3298,575 @@ void AddBinaryLength(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::move(func)));
}
-void AddUtf8Length(FunctionRegistry* registry) {
- auto func =
- std::make_shared<ScalarFunction>("utf8_length", Arity::Unary(), &utf8_length_doc);
-
- ArrayKernelExec exec_offset_32 =
- applicator::ScalarUnaryNotNull<Int32Type, StringType, Utf8Length>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, int32(), std::move(exec_offset_32)));
-
- ArrayKernelExec exec_offset_64 =
- applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, Utf8Length>::Exec;
- DCHECK_OK(func->AddKernel({large_utf8()}, int64(), std::move(exec_offset_64)));
-
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-
-template <typename BinaryType, typename ListType>
-struct BinaryJoin {
- using ArrayType = typename TypeTraits<BinaryType>::ArrayType;
- using ListArrayType = typename TypeTraits<ListType>::ArrayType;
- using ListScalarType = typename TypeTraits<ListType>::ScalarType;
- using ListOffsetType = typename ListArrayType::offset_type;
- using BuilderType = typename TypeTraits<BinaryType>::BuilderType;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- if (batch[0].kind() == Datum::SCALAR) {
- if (batch[1].kind() == Datum::SCALAR) {
- return ExecScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out);
- }
- DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
- return ExecScalarArray(ctx, *batch[0].scalar(), batch[1].array(), out);
- }
- DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
- if (batch[1].kind() == Datum::SCALAR) {
- return ExecArrayScalar(ctx, batch[0].array(), *batch[1].scalar(), out);
- }
- DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
- return ExecArrayArray(ctx, batch[0].array(), batch[1].array(), out);
- }
-
- struct ListScalarOffsetLookup {
- const ArrayType& values;
-
- int64_t GetStart(int64_t i) { return 0; }
- int64_t GetStop(int64_t i) { return values.length(); }
- bool IsNull(int64_t i) { return false; }
- };
-
- struct ListArrayOffsetLookup {
- explicit ListArrayOffsetLookup(const ListArrayType& lists)
- : lists_(lists), offsets_(lists.raw_value_offsets()) {}
-
- int64_t GetStart(int64_t i) { return offsets_[i]; }
- int64_t GetStop(int64_t i) { return offsets_[i + 1]; }
- bool IsNull(int64_t i) { return lists_.IsNull(i); }
-
- private:
- const ListArrayType& lists_;
- const ListOffsetType* offsets_;
- };
-
- struct SeparatorScalarLookup {
- const util::string_view separator;
-
- bool IsNull(int64_t i) { return false; }
- util::string_view GetView(int64_t i) { return separator; }
- };
-
- struct SeparatorArrayLookup {
- const ArrayType& separators;
-
- bool IsNull(int64_t i) { return separators.IsNull(i); }
- util::string_view GetView(int64_t i) { return separators.GetView(i); }
- };
-
- // Scalar, scalar -> scalar
- static Status ExecScalarScalar(KernelContext* ctx, const Scalar& left,
- const Scalar& right, Datum* out) {
- const auto& list = checked_cast<const ListScalarType&>(left);
- const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
- if (!list.is_valid || !separator_scalar.is_valid) {
- return Status::OK();
- }
- util::string_view separator(*separator_scalar.value);
-
- const auto& strings = checked_cast<const ArrayType&>(*list.value);
- if (strings.null_count() > 0) {
- out->scalar()->is_valid = false;
- return Status::OK();
- }
-
- TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
- auto Append = [&](util::string_view value) {
- return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
- static_cast<int64_t>(value.size()));
- };
- if (strings.length() > 0) {
- auto data_length =
- strings.total_values_length() + (strings.length() - 1) * separator.length();
- RETURN_NOT_OK(builder.Reserve(data_length));
- RETURN_NOT_OK(Append(strings.GetView(0)));
- for (int64_t j = 1; j < strings.length(); j++) {
- RETURN_NOT_OK(Append(separator));
- RETURN_NOT_OK(Append(strings.GetView(j)));
- }
- }
- auto out_scalar = checked_cast<BaseBinaryScalar*>(out->scalar().get());
- return builder.Finish(&out_scalar->value);
- }
-
- // Scalar, array -> array
- static Status ExecScalarArray(KernelContext* ctx, const Scalar& left,
- const std::shared_ptr<ArrayData>& right, Datum* out) {
- const auto& list_scalar = checked_cast<const BaseListScalar&>(left);
- if (!list_scalar.is_valid) {
- ARROW_ASSIGN_OR_RAISE(
- auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
- *out = *nulls->data();
- return Status::OK();
- }
- const auto& strings = checked_cast<const ArrayType&>(*list_scalar.value);
- if (strings.null_count() != 0) {
- ARROW_ASSIGN_OR_RAISE(
- auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
- *out = *nulls->data();
- return Status::OK();
- }
- const ArrayType separators(right);
-
- BuilderType builder(ctx->memory_pool());
- RETURN_NOT_OK(builder.Reserve(separators.length()));
-
- // Presize data to avoid multiple reallocations when joining strings
- int64_t total_data_length = 0;
- const int64_t list_length = strings.length();
- if (list_length) {
- const int64_t string_length = strings.total_values_length();
- total_data_length +=
- string_length * (separators.length() - separators.null_count());
- for (int64_t i = 0; i < separators.length(); ++i) {
- if (separators.IsNull(i)) {
- continue;
- }
- total_data_length += (list_length - 1) * separators.value_length(i);
- }
- }
- RETURN_NOT_OK(builder.ReserveData(total_data_length));
-
- return JoinStrings(separators.length(), strings, ListScalarOffsetLookup{strings},
- SeparatorArrayLookup{separators}, &builder, out);
- }
-
- // Array, scalar -> array
- static Status ExecArrayScalar(KernelContext* ctx,
- const std::shared_ptr<ArrayData>& left,
- const Scalar& right, Datum* out) {
- const ListArrayType lists(left);
- const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
-
- if (!separator_scalar.is_valid) {
- ARROW_ASSIGN_OR_RAISE(
- auto nulls,
- MakeArrayOfNull(lists.value_type(), lists.length(), ctx->memory_pool()));
- *out = *nulls->data();
- return Status::OK();
- }
-
- util::string_view separator(*separator_scalar.value);
- const auto& strings = checked_cast<const ArrayType&>(*lists.values());
- const auto list_offsets = lists.raw_value_offsets();
-
- BuilderType builder(ctx->memory_pool());
- RETURN_NOT_OK(builder.Reserve(lists.length()));
-
- // Presize data to avoid multiple reallocations when joining strings
- int64_t total_data_length = strings.total_values_length();
- for (int64_t i = 0; i < lists.length(); ++i) {
- const auto start = list_offsets[i], end = list_offsets[i + 1];
- if (end > start && !ValuesContainNull(strings, start, end)) {
- total_data_length += (end - start - 1) * separator.length();
- }
- }
- RETURN_NOT_OK(builder.ReserveData(total_data_length));
-
- return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
- SeparatorScalarLookup{separator}, &builder, out);
- }
-
- // Array, array -> array
- static Status ExecArrayArray(KernelContext* ctx, const std::shared_ptr<ArrayData>& left,
- const std::shared_ptr<ArrayData>& right, Datum* out) {
- const ListArrayType lists(left);
- const auto& strings = checked_cast<const ArrayType&>(*lists.values());
- const auto list_offsets = lists.raw_value_offsets();
- const auto string_offsets = strings.raw_value_offsets();
- const ArrayType separators(right);
-
- BuilderType builder(ctx->memory_pool());
- RETURN_NOT_OK(builder.Reserve(lists.length()));
-
- // Presize data to avoid multiple reallocations when joining strings
- int64_t total_data_length = 0;
- for (int64_t i = 0; i < lists.length(); ++i) {
- if (separators.IsNull(i)) {
- continue;
- }
- const auto start = list_offsets[i], end = list_offsets[i + 1];
- if (end > start && !ValuesContainNull(strings, start, end)) {
- total_data_length += string_offsets[end] - string_offsets[start];
- total_data_length += (end - start - 1) * separators.value_length(i);
- }
- }
- RETURN_NOT_OK(builder.ReserveData(total_data_length));
-
- struct SeparatorLookup {
- const ArrayType& separators;
-
- bool IsNull(int64_t i) { return separators.IsNull(i); }
- util::string_view GetView(int64_t i) { return separators.GetView(i); }
- };
- return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
- SeparatorArrayLookup{separators}, &builder, out);
- }
-
- template <typename ListOffsetLookup, typename SeparatorLookup>
- static Status JoinStrings(int64_t length, const ArrayType& strings,
- ListOffsetLookup&& list_offsets, SeparatorLookup&& separators,
- BuilderType* builder, Datum* out) {
- for (int64_t i = 0; i < length; ++i) {
- if (list_offsets.IsNull(i) || separators.IsNull(i)) {
- builder->UnsafeAppendNull();
- continue;
- }
- const auto j_start = list_offsets.GetStart(i), j_end = list_offsets.GetStop(i);
- if (j_start == j_end) {
- builder->UnsafeAppendEmptyValue();
- continue;
- }
- if (ValuesContainNull(strings, j_start, j_end)) {
- builder->UnsafeAppendNull();
- continue;
- }
- builder->UnsafeAppend(strings.GetView(j_start));
- for (int64_t j = j_start + 1; j < j_end; ++j) {
- builder->UnsafeExtendCurrent(separators.GetView(i));
- builder->UnsafeExtendCurrent(strings.GetView(j));
- }
- }
-
- std::shared_ptr<Array> string_array;
- RETURN_NOT_OK(builder->Finish(&string_array));
- *out = *string_array->data();
- // Correct the output type based on the input
- out->mutable_array()->type = strings.type();
- return Status::OK();
- }
-
- static bool ValuesContainNull(const ArrayType& values, int64_t start, int64_t end) {
- if (values.null_count() == 0) {
- return false;
- }
- for (int64_t i = start; i < end; ++i) {
- if (values.IsNull(i)) {
- return true;
- }
- }
- return false;
- }
-};
-
-using BinaryJoinElementWiseState = OptionsWrapper<JoinOptions>;
-
-template <typename Type>
-struct BinaryJoinElementWise {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- using BuilderType = typename TypeTraits<Type>::BuilderType;
- using offset_type = typename Type::offset_type;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- JoinOptions options = BinaryJoinElementWiseState::Get(ctx);
- // Last argument is the separator (for consistency with binary_join)
- if (std::all_of(batch.values.begin(), batch.values.end(),
- [](const Datum& d) { return d.is_scalar(); })) {
- return ExecOnlyScalar(ctx, options, batch, out);
- }
- return ExecContainingArrays(ctx, options, batch, out);
- }
-
- static Status ExecOnlyScalar(KernelContext* ctx, const JoinOptions& options,
- const ExecBatch& batch, Datum* out) {
- BaseBinaryScalar* output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
- const size_t num_args = batch.values.size();
- if (num_args == 1) {
- // Only separator, no values
- ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0));
- output->is_valid = batch.values[0].scalar()->is_valid;
- return Status::OK();
- }
-
- int64_t final_size = CalculateRowSize(options, batch, 0);
- if (final_size < 0) {
- ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0));
- output->is_valid = false;
- return Status::OK();
- }
- ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(final_size));
- const auto separator = UnboxScalar<Type>::Unbox(*batch.values.back().scalar());
- uint8_t* buf = output->value->mutable_data();
- bool first = true;
- for (size_t i = 0; i < num_args - 1; i++) {
- const Scalar& scalar = *batch[i].scalar();
- util::string_view s;
- if (scalar.is_valid) {
- s = UnboxScalar<Type>::Unbox(scalar);
- } else {
- switch (options.null_handling) {
- case JoinOptions::EMIT_NULL:
- // Handled by CalculateRowSize
- DCHECK(false) << "unreachable";
- break;
- case JoinOptions::SKIP:
- continue;
- case JoinOptions::REPLACE:
- s = options.null_replacement;
- break;
- }
- }
- if (!first) {
- buf = std::copy(separator.begin(), separator.end(), buf);
- }
- first = false;
- buf = std::copy(s.begin(), s.end(), buf);
- }
- output->is_valid = true;
- DCHECK_EQ(final_size, buf - output->value->mutable_data());
- return Status::OK();
- }
-
- static Status ExecContainingArrays(KernelContext* ctx, const JoinOptions& options,
- const ExecBatch& batch, Datum* out) {
- // Presize data to avoid reallocations
- int64_t final_size = 0;
- for (int64_t i = 0; i < batch.length; i++) {
- auto size = CalculateRowSize(options, batch, i);
- if (size > 0) final_size += size;
- }
- BuilderType builder(ctx->memory_pool());
- RETURN_NOT_OK(builder.Reserve(batch.length));
- RETURN_NOT_OK(builder.ReserveData(final_size));
-
- std::vector<util::string_view> valid_cols(batch.values.size());
- for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
- size_t num_valid = 0; // Not counting separator
- for (size_t col = 0; col < batch.values.size(); col++) {
- if (batch[col].is_scalar()) {
- const auto& scalar = *batch[col].scalar();
- if (scalar.is_valid) {
- valid_cols[col] = UnboxScalar<Type>::Unbox(scalar);
- if (col < batch.values.size() - 1) num_valid++;
- } else {
- valid_cols[col] = util::string_view();
- }
- } else {
- const ArrayData& array = *batch[col].array();
- if (!array.MayHaveNulls() ||
- BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) {
- const offset_type* offsets = array.GetValues<offset_type>(1);
- const uint8_t* data = array.GetValues<uint8_t>(2, /*absolute_offset=*/0);
- const int64_t length = offsets[row + 1] - offsets[row];
- valid_cols[col] = util::string_view(
- reinterpret_cast<const char*>(data + offsets[row]), length);
- if (col < batch.values.size() - 1) num_valid++;
- } else {
- valid_cols[col] = util::string_view();
- }
- }
- }
-
- if (!valid_cols.back().data()) {
- // Separator is null
- builder.UnsafeAppendNull();
- continue;
- } else if (batch.values.size() == 1) {
- // Only given separator
- builder.UnsafeAppendEmptyValue();
- continue;
- } else if (num_valid < batch.values.size() - 1) {
- // We had some nulls
- if (options.null_handling == JoinOptions::EMIT_NULL) {
- builder.UnsafeAppendNull();
- continue;
- }
- }
- const auto separator = valid_cols.back();
- bool first = true;
- for (size_t col = 0; col < batch.values.size() - 1; col++) {
- util::string_view value = valid_cols[col];
- if (!value.data()) {
- switch (options.null_handling) {
- case JoinOptions::EMIT_NULL:
- DCHECK(false) << "unreachable";
- break;
- case JoinOptions::SKIP:
- continue;
- case JoinOptions::REPLACE:
- value = options.null_replacement;
- break;
- }
- }
- if (first) {
- builder.UnsafeAppend(value);
- first = false;
- continue;
- }
- builder.UnsafeExtendCurrent(separator);
- builder.UnsafeExtendCurrent(value);
- }
- }
-
- std::shared_ptr<Array> string_array;
- RETURN_NOT_OK(builder.Finish(&string_array));
- *out = *string_array->data();
- out->mutable_array()->type = batch[0].type();
- DCHECK_EQ(batch.length, out->array()->length);
- DCHECK_EQ(final_size,
- checked_cast<const ArrayType&>(*string_array).total_values_length());
- return Status::OK();
- }
-
- // Compute the length of the output for the given position, or -1 if it would be null.
- static int64_t CalculateRowSize(const JoinOptions& options, const ExecBatch& batch,
- const int64_t index) {
- const auto num_args = batch.values.size();
- int64_t final_size = 0;
- int64_t num_non_null_args = 0;
- for (size_t i = 0; i < num_args; i++) {
- int64_t element_size = 0;
- bool valid = true;
- if (batch[i].is_scalar()) {
- const Scalar& scalar = *batch[i].scalar();
- valid = scalar.is_valid;
- element_size = UnboxScalar<Type>::Unbox(scalar).size();
- } else {
- const ArrayData& array = *batch[i].array();
- valid = !array.MayHaveNulls() ||
- BitUtil::GetBit(array.buffers[0]->data(), array.offset + index);
- const offset_type* offsets = array.GetValues<offset_type>(1);
- element_size = offsets[index + 1] - offsets[index];
- }
- if (i == num_args - 1) {
- if (!valid) return -1;
- if (num_non_null_args > 1) {
- // Add separator size (only if there were values to join)
- final_size += (num_non_null_args - 1) * element_size;
- }
- break;
- }
- if (!valid) {
- switch (options.null_handling) {
- case JoinOptions::EMIT_NULL:
- return -1;
- case JoinOptions::SKIP:
- continue;
- case JoinOptions::REPLACE:
- element_size = options.null_replacement.size();
- break;
- }
- }
- num_non_null_args++;
- final_size += element_size;
- }
- return final_size;
- }
-};
-
-const FunctionDoc binary_join_doc(
- "Join a list of strings together with a `separator` to form a single string",
- ("Insert `separator` between `list` elements, and concatenate them.\n"
- "Any null input and any null `list` element emits a null output.\n"),
- {"list", "separator"});
-
-const FunctionDoc binary_join_element_wise_doc(
- "Join string arguments into one, using the last argument as the separator",
- ("Insert the last argument of `strings` between the rest of the elements, "
- "and concatenate them.\n"
- "Any null separator element emits a null output. Null elements either "
- "emit a null (the default), are skipped, or replaced with a given string.\n"),
- {"*strings"}, "JoinOptions");
-
-const auto kDefaultJoinOptions = JoinOptions::Defaults();
-
-template <typename ListType>
-void AddBinaryJoinForListType(ScalarFunction* func) {
- for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
- auto exec = GenerateTypeAgnosticVarBinaryBase<BinaryJoin, ListType>(*ty);
- auto list_ty = std::make_shared<ListType>(ty);
- DCHECK_OK(func->AddKernel({InputType(list_ty), InputType(ty)}, ty, exec));
- }
-}
-
-void AddBinaryJoin(FunctionRegistry* registry) {
- {
- auto func = std::make_shared<ScalarFunction>("binary_join", Arity::Binary(),
- &binary_join_doc);
- AddBinaryJoinForListType<ListType>(func.get());
- AddBinaryJoinForListType<LargeListType>(func.get());
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
- {
- auto func = std::make_shared<ScalarFunction>(
- "binary_join_element_wise", Arity::VarArgs(/*min_args=*/1),
- &binary_join_element_wise_doc, &kDefaultJoinOptions);
- for (const auto& ty : BaseBinaryTypes()) {
- ScalarKernel kernel{KernelSignature::Make({InputType(ty)}, ty, /*is_varargs=*/true),
- GenerateTypeAgnosticVarBinaryBase<BinaryJoinElementWise>(ty),
- BinaryJoinElementWiseState::Init};
- kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
- kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
- }
-}
-
+void AddUtf8Length(FunctionRegistry* registry) {
+ auto func =
+ std::make_shared<ScalarFunction>("utf8_length", Arity::Unary(), &utf8_length_doc);
+
+ ArrayKernelExec exec_offset_32 =
+ applicator::ScalarUnaryNotNull<Int32Type, StringType, Utf8Length>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, int32(), std::move(exec_offset_32)));
+
+ ArrayKernelExec exec_offset_64 =
+ applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, Utf8Length>::Exec;
+ DCHECK_OK(func->AddKernel({large_utf8()}, int64(), std::move(exec_offset_64)));
+
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
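+// Illustrative note (an assumption for exposition): binary_length counts
+// bytes while utf8_length counts codepoints, so they disagree on non-ASCII
+// data:
+//
+//   // input: ["f\u00fcr"]  ("fur" with u-umlaut: 4 bytes, 3 codepoints)
+//   ARROW_ASSIGN_OR_RAISE(arrow::Datum nbytes,
+//                         arrow::compute::CallFunction("binary_length", {input}));
+//   ARROW_ASSIGN_OR_RAISE(arrow::Datum nchars,
+//                         arrow::compute::CallFunction("utf8_length", {input}));
+//   // nbytes -> [4], nchars -> [3]
+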
+template <typename BinaryType, typename ListType>
+struct BinaryJoin {
+ using ArrayType = typename TypeTraits<BinaryType>::ArrayType;
+ using ListArrayType = typename TypeTraits<ListType>::ArrayType;
+ using ListScalarType = typename TypeTraits<ListType>::ScalarType;
+ using ListOffsetType = typename ListArrayType::offset_type;
+ using BuilderType = typename TypeTraits<BinaryType>::BuilderType;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ if (batch[0].kind() == Datum::SCALAR) {
+ if (batch[1].kind() == Datum::SCALAR) {
+ return ExecScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out);
+ }
+ DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
+ return ExecScalarArray(ctx, *batch[0].scalar(), batch[1].array(), out);
+ }
+ DCHECK_EQ(batch[0].kind(), Datum::ARRAY);
+ if (batch[1].kind() == Datum::SCALAR) {
+ return ExecArrayScalar(ctx, batch[0].array(), *batch[1].scalar(), out);
+ }
+ DCHECK_EQ(batch[1].kind(), Datum::ARRAY);
+ return ExecArrayArray(ctx, batch[0].array(), batch[1].array(), out);
+ }
+
+ struct ListScalarOffsetLookup {
+ const ArrayType& values;
+
+ int64_t GetStart(int64_t i) { return 0; }
+ int64_t GetStop(int64_t i) { return values.length(); }
+ bool IsNull(int64_t i) { return false; }
+ };
+
+ struct ListArrayOffsetLookup {
+ explicit ListArrayOffsetLookup(const ListArrayType& lists)
+ : lists_(lists), offsets_(lists.raw_value_offsets()) {}
+
+ int64_t GetStart(int64_t i) { return offsets_[i]; }
+ int64_t GetStop(int64_t i) { return offsets_[i + 1]; }
+ bool IsNull(int64_t i) { return lists_.IsNull(i); }
+
+ private:
+ const ListArrayType& lists_;
+ const ListOffsetType* offsets_;
+ };
+
+ struct SeparatorScalarLookup {
+ const util::string_view separator;
+
+ bool IsNull(int64_t i) { return false; }
+ util::string_view GetView(int64_t i) { return separator; }
+ };
+
+ struct SeparatorArrayLookup {
+ const ArrayType& separators;
+
+ bool IsNull(int64_t i) { return separators.IsNull(i); }
+ util::string_view GetView(int64_t i) { return separators.GetView(i); }
+ };
+
+ // Scalar, scalar -> scalar
+ static Status ExecScalarScalar(KernelContext* ctx, const Scalar& left,
+ const Scalar& right, Datum* out) {
+ const auto& list = checked_cast<const ListScalarType&>(left);
+ const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
+ if (!list.is_valid || !separator_scalar.is_valid) {
+ return Status::OK();
+ }
+ util::string_view separator(*separator_scalar.value);
+
+ const auto& strings = checked_cast<const ArrayType&>(*list.value);
+ if (strings.null_count() > 0) {
+ out->scalar()->is_valid = false;
+ return Status::OK();
+ }
+
+ TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
+ auto Append = [&](util::string_view value) {
+ return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
+ static_cast<int64_t>(value.size()));
+ };
+ if (strings.length() > 0) {
+ auto data_length =
+ strings.total_values_length() + (strings.length() - 1) * separator.length();
+ RETURN_NOT_OK(builder.Reserve(data_length));
+ RETURN_NOT_OK(Append(strings.GetView(0)));
+ for (int64_t j = 1; j < strings.length(); j++) {
+ RETURN_NOT_OK(Append(separator));
+ RETURN_NOT_OK(Append(strings.GetView(j)));
+ }
+ }
+ auto out_scalar = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ return builder.Finish(&out_scalar->value);
+ }
+
+ // Scalar, array -> array
+ static Status ExecScalarArray(KernelContext* ctx, const Scalar& left,
+ const std::shared_ptr<ArrayData>& right, Datum* out) {
+ const auto& list_scalar = checked_cast<const BaseListScalar&>(left);
+ if (!list_scalar.is_valid) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
+ *out = *nulls->data();
+ return Status::OK();
+ }
+ const auto& strings = checked_cast<const ArrayType&>(*list_scalar.value);
+ if (strings.null_count() != 0) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto nulls, MakeArrayOfNull(right->type, right->length, ctx->memory_pool()));
+ *out = *nulls->data();
+ return Status::OK();
+ }
+ const ArrayType separators(right);
+
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(separators.length()));
+
+ // Presize data to avoid multiple reallocations when joining strings
+ int64_t total_data_length = 0;
+ const int64_t list_length = strings.length();
+ if (list_length) {
+ const int64_t string_length = strings.total_values_length();
+ total_data_length +=
+ string_length * (separators.length() - separators.null_count());
+ for (int64_t i = 0; i < separators.length(); ++i) {
+ if (separators.IsNull(i)) {
+ continue;
+ }
+ total_data_length += (list_length - 1) * separators.value_length(i);
+ }
+ }
+ RETURN_NOT_OK(builder.ReserveData(total_data_length));
+
+ return JoinStrings(separators.length(), strings, ListScalarOffsetLookup{strings},
+ SeparatorArrayLookup{separators}, &builder, out);
+ }
+
+ // Array, scalar -> array
+ static Status ExecArrayScalar(KernelContext* ctx,
+ const std::shared_ptr<ArrayData>& left,
+ const Scalar& right, Datum* out) {
+ const ListArrayType lists(left);
+ const auto& separator_scalar = checked_cast<const BaseBinaryScalar&>(right);
+
+ if (!separator_scalar.is_valid) {
+ ARROW_ASSIGN_OR_RAISE(
+ auto nulls,
+ MakeArrayOfNull(lists.value_type(), lists.length(), ctx->memory_pool()));
+ *out = *nulls->data();
+ return Status::OK();
+ }
+
+ util::string_view separator(*separator_scalar.value);
+ const auto& strings = checked_cast<const ArrayType&>(*lists.values());
+ const auto list_offsets = lists.raw_value_offsets();
+
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(lists.length()));
+
+ // Presize data to avoid multiple reallocations when joining strings
+ int64_t total_data_length = strings.total_values_length();
+ for (int64_t i = 0; i < lists.length(); ++i) {
+ const auto start = list_offsets[i], end = list_offsets[i + 1];
+ if (end > start && !ValuesContainNull(strings, start, end)) {
+ total_data_length += (end - start - 1) * separator.length();
+ }
+ }
+ RETURN_NOT_OK(builder.ReserveData(total_data_length));
+
+ return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
+ SeparatorScalarLookup{separator}, &builder, out);
+ }
+
+ // Array, array -> array
+ static Status ExecArrayArray(KernelContext* ctx, const std::shared_ptr<ArrayData>& left,
+ const std::shared_ptr<ArrayData>& right, Datum* out) {
+ const ListArrayType lists(left);
+ const auto& strings = checked_cast<const ArrayType&>(*lists.values());
+ const auto list_offsets = lists.raw_value_offsets();
+ const auto string_offsets = strings.raw_value_offsets();
+ const ArrayType separators(right);
+
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(lists.length()));
+
+ // Presize data to avoid multiple reallocations when joining strings
+ int64_t total_data_length = 0;
+ for (int64_t i = 0; i < lists.length(); ++i) {
+ if (separators.IsNull(i)) {
+ continue;
+ }
+ const auto start = list_offsets[i], end = list_offsets[i + 1];
+ if (end > start && !ValuesContainNull(strings, start, end)) {
+ total_data_length += string_offsets[end] - string_offsets[start];
+ total_data_length += (end - start - 1) * separators.value_length(i);
+ }
+ }
+ RETURN_NOT_OK(builder.ReserveData(total_data_length));
+
+ struct SeparatorLookup {
+ const ArrayType& separators;
+
+ bool IsNull(int64_t i) { return separators.IsNull(i); }
+ util::string_view GetView(int64_t i) { return separators.GetView(i); }
+ };
+ return JoinStrings(lists.length(), strings, ListArrayOffsetLookup{lists},
+ SeparatorArrayLookup{separators}, &builder, out);
+ }
+
+ template <typename ListOffsetLookup, typename SeparatorLookup>
+ static Status JoinStrings(int64_t length, const ArrayType& strings,
+ ListOffsetLookup&& list_offsets, SeparatorLookup&& separators,
+ BuilderType* builder, Datum* out) {
+ for (int64_t i = 0; i < length; ++i) {
+ if (list_offsets.IsNull(i) || separators.IsNull(i)) {
+ builder->UnsafeAppendNull();
+ continue;
+ }
+ const auto j_start = list_offsets.GetStart(i), j_end = list_offsets.GetStop(i);
+ if (j_start == j_end) {
+ builder->UnsafeAppendEmptyValue();
+ continue;
+ }
+ if (ValuesContainNull(strings, j_start, j_end)) {
+ builder->UnsafeAppendNull();
+ continue;
+ }
+ builder->UnsafeAppend(strings.GetView(j_start));
+ for (int64_t j = j_start + 1; j < j_end; ++j) {
+ builder->UnsafeExtendCurrent(separators.GetView(i));
+ builder->UnsafeExtendCurrent(strings.GetView(j));
+ }
+ }
+
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder->Finish(&string_array));
+ *out = *string_array->data();
+ // Correct the output type based on the input
+ out->mutable_array()->type = strings.type();
+ return Status::OK();
+ }
+
+ static bool ValuesContainNull(const ArrayType& values, int64_t start, int64_t end) {
+ if (values.null_count() == 0) {
+ return false;
+ }
+ for (int64_t i = start; i < end; ++i) {
+ if (values.IsNull(i)) {
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
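+// Illustrative usage sketch (an assumption for exposition): binary_join takes
+// a list-of-strings Datum and a separator Datum, e.g.
+//
+//   // lists:     [["a", "b", "c"], ["d", null], null]
+//   // separator: "-"
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum joined,
+//       arrow::compute::CallFunction("binary_join", {lists, separator}));
+//   // -> ["a-b-c", null, null]: a null list or a null list element emits null.
+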
+using BinaryJoinElementWiseState = OptionsWrapper<JoinOptions>;
+
+template <typename Type>
+struct BinaryJoinElementWise {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using offset_type = typename Type::offset_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ JoinOptions options = BinaryJoinElementWiseState::Get(ctx);
+ // Last argument is the separator (for consistency with binary_join)
+ if (std::all_of(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); })) {
+ return ExecOnlyScalar(ctx, options, batch, out);
+ }
+ return ExecContainingArrays(ctx, options, batch, out);
+ }
+
+ static Status ExecOnlyScalar(KernelContext* ctx, const JoinOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ BaseBinaryScalar* output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ const size_t num_args = batch.values.size();
+ if (num_args == 1) {
+ // Only separator, no values
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0));
+ output->is_valid = batch.values[0].scalar()->is_valid;
+ return Status::OK();
+ }
+
+ int64_t final_size = CalculateRowSize(options, batch, 0);
+ if (final_size < 0) {
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(0));
+ output->is_valid = false;
+ return Status::OK();
+ }
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(final_size));
+ const auto separator = UnboxScalar<Type>::Unbox(*batch.values.back().scalar());
+ uint8_t* buf = output->value->mutable_data();
+ bool first = true;
+ for (size_t i = 0; i < num_args - 1; i++) {
+ const Scalar& scalar = *batch[i].scalar();
+ util::string_view s;
+ if (scalar.is_valid) {
+ s = UnboxScalar<Type>::Unbox(scalar);
+ } else {
+ switch (options.null_handling) {
+ case JoinOptions::EMIT_NULL:
+ // Handled by CalculateRowSize
+ DCHECK(false) << "unreachable";
+ break;
+ case JoinOptions::SKIP:
+ continue;
+ case JoinOptions::REPLACE:
+ s = options.null_replacement;
+ break;
+ }
+ }
+ if (!first) {
+ buf = std::copy(separator.begin(), separator.end(), buf);
+ }
+ first = false;
+ buf = std::copy(s.begin(), s.end(), buf);
+ }
+ output->is_valid = true;
+ DCHECK_EQ(final_size, buf - output->value->mutable_data());
+ return Status::OK();
+ }
+
+ static Status ExecContainingArrays(KernelContext* ctx, const JoinOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ // Presize data to avoid reallocations
+ int64_t final_size = 0;
+ for (int64_t i = 0; i < batch.length; i++) {
+ auto size = CalculateRowSize(options, batch, i);
+ if (size > 0) final_size += size;
+ }
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ RETURN_NOT_OK(builder.ReserveData(final_size));
+
+ std::vector<util::string_view> valid_cols(batch.values.size());
+ for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
+ size_t num_valid = 0; // Not counting separator
+ for (size_t col = 0; col < batch.values.size(); col++) {
+ if (batch[col].is_scalar()) {
+ const auto& scalar = *batch[col].scalar();
+ if (scalar.is_valid) {
+ valid_cols[col] = UnboxScalar<Type>::Unbox(scalar);
+ if (col < batch.values.size() - 1) num_valid++;
+ } else {
+ valid_cols[col] = util::string_view();
+ }
+ } else {
+ const ArrayData& array = *batch[col].array();
+ if (!array.MayHaveNulls() ||
+ BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) {
+ const offset_type* offsets = array.GetValues<offset_type>(1);
+ const uint8_t* data = array.GetValues<uint8_t>(2, /*absolute_offset=*/0);
+ const int64_t length = offsets[row + 1] - offsets[row];
+ valid_cols[col] = util::string_view(
+ reinterpret_cast<const char*>(data + offsets[row]), length);
+ if (col < batch.values.size() - 1) num_valid++;
+ } else {
+ valid_cols[col] = util::string_view();
+ }
+ }
+ }
+
+ if (!valid_cols.back().data()) {
+ // Separator is null
+ builder.UnsafeAppendNull();
+ continue;
+ } else if (batch.values.size() == 1) {
+ // Only given separator
+ builder.UnsafeAppendEmptyValue();
+ continue;
+ } else if (num_valid < batch.values.size() - 1) {
+ // We had some nulls
+ if (options.null_handling == JoinOptions::EMIT_NULL) {
+ builder.UnsafeAppendNull();
+ continue;
+ }
+ }
+ const auto separator = valid_cols.back();
+ bool first = true;
+ for (size_t col = 0; col < batch.values.size() - 1; col++) {
+ util::string_view value = valid_cols[col];
+ if (!value.data()) {
+ switch (options.null_handling) {
+ case JoinOptions::EMIT_NULL:
+ DCHECK(false) << "unreachable";
+ break;
+ case JoinOptions::SKIP:
+ continue;
+ case JoinOptions::REPLACE:
+ value = options.null_replacement;
+ break;
+ }
+ }
+ if (first) {
+ builder.UnsafeAppend(value);
+ first = false;
+ continue;
+ }
+ builder.UnsafeExtendCurrent(separator);
+ builder.UnsafeExtendCurrent(value);
+ }
+ }
+
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ *out = *string_array->data();
+ out->mutable_array()->type = batch[0].type();
+ DCHECK_EQ(batch.length, out->array()->length);
+ DCHECK_EQ(final_size,
+ checked_cast<const ArrayType&>(*string_array).total_values_length());
+ return Status::OK();
+ }
+
+ // Compute the length of the output for the given position, or -1 if it would be null.
+ static int64_t CalculateRowSize(const JoinOptions& options, const ExecBatch& batch,
+ const int64_t index) {
+ const auto num_args = batch.values.size();
+ int64_t final_size = 0;
+ int64_t num_non_null_args = 0;
+ for (size_t i = 0; i < num_args; i++) {
+ int64_t element_size = 0;
+ bool valid = true;
+ if (batch[i].is_scalar()) {
+ const Scalar& scalar = *batch[i].scalar();
+ valid = scalar.is_valid;
+ element_size = UnboxScalar<Type>::Unbox(scalar).size();
+ } else {
+ const ArrayData& array = *batch[i].array();
+ valid = !array.MayHaveNulls() ||
+ BitUtil::GetBit(array.buffers[0]->data(), array.offset + index);
+ const offset_type* offsets = array.GetValues<offset_type>(1);
+ element_size = offsets[index + 1] - offsets[index];
+ }
+ if (i == num_args - 1) {
+ if (!valid) return -1;
+ if (num_non_null_args > 1) {
+ // Add separator size (only if there were values to join)
+ final_size += (num_non_null_args - 1) * element_size;
+ }
+ break;
+ }
+ if (!valid) {
+ switch (options.null_handling) {
+ case JoinOptions::EMIT_NULL:
+ return -1;
+ case JoinOptions::SKIP:
+ continue;
+ case JoinOptions::REPLACE:
+ element_size = options.null_replacement.size();
+ break;
+ }
+ }
+ num_non_null_args++;
+ final_size += element_size;
+ }
+ return final_size;
+ }
+};
+
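+// Illustrative usage sketch (an assumption for exposition): the last argument
+// is the separator and JoinOptions controls how nulls are handled, e.g.
+//
+//   arrow::compute::JoinOptions options(
+//       arrow::compute::JoinOptions::REPLACE, /*null_replacement=*/"?");
+//   ARROW_ASSIGN_OR_RAISE(
+//       arrow::Datum joined,
+//       arrow::compute::CallFunction("binary_join_element_wise",
+//                                    {a, b, separator}, &options));
+//   // a = ["x", null], b = ["y", "z"], separator = "-"  ->  ["x-y", "?-z"]
+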
+const FunctionDoc binary_join_doc(
+ "Join a list of strings together with a `separator` to form a single string",
+ ("Insert `separator` between `list` elements, and concatenate them.\n"
+ "Any null input and any null `list` element emits a null output.\n"),
+ {"list", "separator"});
+
+const FunctionDoc binary_join_element_wise_doc(
+ "Join string arguments into one, using the last argument as the separator",
+ ("Insert the last argument of `strings` between the rest of the elements, "
+ "and concatenate them.\n"
+ "Any null separator element emits a null output. Null elements either "
+ "emit a null (the default), are skipped, or replaced with a given string.\n"),
+ {"*strings"}, "JoinOptions");
+
+const auto kDefaultJoinOptions = JoinOptions::Defaults();
+
+template <typename ListType>
+void AddBinaryJoinForListType(ScalarFunction* func) {
+ for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
+ auto exec = GenerateTypeAgnosticVarBinaryBase<BinaryJoin, ListType>(*ty);
+ auto list_ty = std::make_shared<ListType>(ty);
+ DCHECK_OK(func->AddKernel({InputType(list_ty), InputType(ty)}, ty, exec));
+ }
+}
+
+void AddBinaryJoin(FunctionRegistry* registry) {
+ {
+ auto func = std::make_shared<ScalarFunction>("binary_join", Arity::Binary(),
+ &binary_join_doc);
+ AddBinaryJoinForListType<ListType>(func.get());
+ AddBinaryJoinForListType<LargeListType>(func.get());
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<ScalarFunction>(
+ "binary_join_element_wise", Arity::VarArgs(/*min_args=*/1),
+ &binary_join_element_wise_doc, &kDefaultJoinOptions);
+ for (const auto& ty : BaseBinaryTypes()) {
+ ScalarKernel kernel{KernelSignature::Make({InputType(ty)}, ty, /*is_varargs=*/true),
+ GenerateTypeAgnosticVarBinaryBase<BinaryJoinElementWise>(ty),
+ BinaryJoinElementWiseState::Init};
+ kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+ kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+}
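// ---- Editorial sketch (not part of this diff) ------------------------------
// The matching call for the list-based "binary_join" registered above: a
// single separator scalar is broadcast across all rows. Illustrative only.
arrow::Result<arrow::Datum> JoinLists(const arrow::Datum& list_of_strings) {
  // e.g. [["a", "b"], ["c"], null] -> ["a, b", "c", null]
  arrow::Datum sep(arrow::MakeScalar(std::string(", ")));
  return arrow::compute::CallFunction("binary_join", {list_of_strings, sep});
}
// ----------------------------------------------------------------------------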
+
+template <template <typename> class ExecFunctor>
+void MakeUnaryStringBatchKernel(
+ std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
+ MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+ {
+ auto exec_32 = ExecFunctor<StringType>::Exec;
+ ScalarKernel kernel{{utf8()}, utf8(), exec_32};
+ kernel.mem_allocation = mem_allocation;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ {
+ auto exec_64 = ExecFunctor<LargeStringType>::Exec;
+ ScalarKernel kernel{{large_utf8()}, large_utf8(), exec_64};
+ kernel.mem_allocation = mem_allocation;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+}
+
template <template <typename> class ExecFunctor>
-void MakeUnaryStringBatchKernel(
- std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
- MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
- {
- auto exec_32 = ExecFunctor<StringType>::Exec;
- ScalarKernel kernel{{utf8()}, utf8(), exec_32};
- kernel.mem_allocation = mem_allocation;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
- {
- auto exec_64 = ExecFunctor<LargeStringType>::Exec;
- ScalarKernel kernel{{large_utf8()}, large_utf8(), exec_64};
- kernel.mem_allocation = mem_allocation;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
+void MakeUnaryStringBatchKernelWithState(
+ std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
+ MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+ {
+ using t32 = ExecFunctor<StringType>;
+ ScalarKernel kernel{{utf8()}, utf8(), t32::Exec, t32::State::Init};
+ kernel.mem_allocation = mem_allocation;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
+ {
+ using t64 = ExecFunctor<LargeStringType>;
+ ScalarKernel kernel{{large_utf8()}, large_utf8(), t64::Exec, t64::State::Init};
+ kernel.mem_allocation = mem_allocation;
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ }
DCHECK_OK(registry->AddFunction(std::move(func)));
}
-template <template <typename> class ExecFunctor>
-void MakeUnaryStringBatchKernelWithState(
- std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
- MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
- {
- using t32 = ExecFunctor<StringType>;
- ScalarKernel kernel{{utf8()}, utf8(), t32::Exec, t32::State::Init};
- kernel.mem_allocation = mem_allocation;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
- {
- using t64 = ExecFunctor<LargeStringType>;
- ScalarKernel kernel{{large_utf8()}, large_utf8(), t64::Exec, t64::State::Init};
- kernel.mem_allocation = mem_allocation;
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- }
- DCHECK_OK(registry->AddFunction(std::move(func)));
-}
-
#ifdef ARROW_WITH_UTF8PROC
template <template <typename> class Transformer>
-void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* registry,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* registry,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
ArrayKernelExec exec_32 = Transformer<StringType>::Exec;
ArrayKernelExec exec_64 = Transformer<LargeStringType>::Exec;
DCHECK_OK(func->AddKernel({utf8()}, utf8(), exec_32));
@@ -3876,15 +3876,15 @@ void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* regi
#endif
-// NOTE: Predicate should only populate 'status' with errors,
-// leave it unmodified to indicate Status::OK()
-using StringPredicate =
- std::function<bool(KernelContext*, const uint8_t*, size_t, Status*)>;
+// NOTE: Predicate should only populate 'status' with errors,
+// leave it unmodified to indicate Status::OK()
+using StringPredicate =
+ std::function<bool(KernelContext*, const uint8_t*, size_t, Status*)>;
template <typename Type>
-Status ApplyPredicate(KernelContext* ctx, const ExecBatch& batch,
- StringPredicate predicate, Datum* out) {
- Status st = Status::OK();
+Status ApplyPredicate(KernelContext* ctx, const ExecBatch& batch,
+ StringPredicate predicate, Datum* out) {
+ Status st = Status::OK();
EnsureLookupTablesFilled();
if (batch[0].kind() == Datum::ARRAY) {
const ArrayData& input = *batch[0].array();
@@ -3894,250 +3894,250 @@ Status ApplyPredicate(KernelContext* ctx, const ExecBatch& batch,
out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length,
[&]() -> bool {
util::string_view val = input_it();
- return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()), val.size(),
- &st);
+ return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()), val.size(),
+ &st);
});
} else {
const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
if (input.is_valid) {
- bool boolean_result = predicate(ctx, input.value->data(),
- static_cast<size_t>(input.value->size()), &st);
- // UTF decoding can lead to issues
- if (st.ok()) {
- out->value = std::make_shared<BooleanScalar>(boolean_result);
+ bool boolean_result = predicate(ctx, input.value->data(),
+ static_cast<size_t>(input.value->size()), &st);
+ // UTF decoding can lead to issues
+ if (st.ok()) {
+ out->value = std::make_shared<BooleanScalar>(boolean_result);
}
}
}
- return st;
+ return st;
}
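// ---- Editorial sketch (not part of this diff) ------------------------------
// A callable compatible with the StringPredicate signature above; it returns
// false on any non-ASCII byte and leaves *st untouched, which ApplyPredicate
// reads as Status::OK().
bool IsAsciiPredicateSketch(arrow::compute::KernelContext*, const uint8_t* data,
                            size_t length, arrow::Status* /*st*/) {
  for (size_t i = 0; i < length; ++i) {
    if (data[i] & 0x80) return false;  // high bit set -> not ASCII
  }
  return true;
}
// ----------------------------------------------------------------------------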
template <typename Predicate>
-void AddUnaryStringPredicate(std::string name, FunctionRegistry* registry,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+void AddUnaryStringPredicate(std::string name, FunctionRegistry* registry,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
auto exec_32 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return ApplyPredicate<StringType>(ctx, batch, Predicate::Call, out);
+ return ApplyPredicate<StringType>(ctx, batch, Predicate::Call, out);
};
auto exec_64 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- return ApplyPredicate<LargeStringType>(ctx, batch, Predicate::Call, out);
+ return ApplyPredicate<LargeStringType>(ctx, batch, Predicate::Call, out);
};
DCHECK_OK(func->AddKernel({utf8()}, boolean(), std::move(exec_32)));
DCHECK_OK(func->AddKernel({large_utf8()}, boolean(), std::move(exec_64)));
DCHECK_OK(registry->AddFunction(std::move(func)));
}
-FunctionDoc StringPredicateDoc(std::string summary, std::string description) {
- return FunctionDoc{std::move(summary), std::move(description), {"strings"}};
-}
-
-FunctionDoc StringClassifyDoc(std::string class_summary, std::string class_desc,
- bool non_empty) {
- std::string summary, description;
- {
- std::stringstream ss;
- ss << "Classify strings as " << class_summary;
- summary = ss.str();
- }
- {
- std::stringstream ss;
- if (non_empty) {
- ss
- << ("For each string in `strings`, emit true iff the string is non-empty\n"
- "and consists only of ");
- } else {
- ss
- << ("For each string in `strings`, emit true iff the string consists only\n"
- "of ");
- }
- ss << class_desc << ". Null strings emit null.";
- description = ss.str();
- }
- return StringPredicateDoc(std::move(summary), std::move(description));
-}
-
-const auto string_is_ascii_doc = StringClassifyDoc("ASCII", "ASCII characters", false);
-
-const auto ascii_is_alnum_doc =
- StringClassifyDoc("ASCII alphanumeric", "alphanumeric ASCII characters", true);
-const auto ascii_is_alpha_doc =
- StringClassifyDoc("ASCII alphabetic", "alphabetic ASCII characters", true);
-const auto ascii_is_decimal_doc =
- StringClassifyDoc("ASCII decimal", "decimal ASCII characters", true);
-const auto ascii_is_lower_doc =
- StringClassifyDoc("ASCII lowercase", "lowercase ASCII characters", true);
-const auto ascii_is_printable_doc =
- StringClassifyDoc("ASCII printable", "printable ASCII characters", true);
-const auto ascii_is_space_doc =
- StringClassifyDoc("ASCII whitespace", "whitespace ASCII characters", true);
-const auto ascii_is_upper_doc =
- StringClassifyDoc("ASCII uppercase", "uppercase ASCII characters", true);
-
-const auto ascii_is_title_doc = StringPredicateDoc(
- "Classify strings as ASCII titlecase",
- ("For each string in `strings`, emit true iff the string is title-cased,\n"
- "i.e. it has at least one cased character, each uppercase character\n"
- "follows a non-cased character, and each lowercase character follows\n"
- "an uppercase character.\n"));
-
-const auto utf8_is_alnum_doc =
- StringClassifyDoc("alphanumeric", "alphanumeric Unicode characters", true);
-const auto utf8_is_alpha_doc =
- StringClassifyDoc("alphabetic", "alphabetic Unicode characters", true);
-const auto utf8_is_decimal_doc =
- StringClassifyDoc("decimal", "decimal Unicode characters", true);
-const auto utf8_is_digit_doc = StringClassifyDoc("digits", "Unicode digits", true);
-const auto utf8_is_lower_doc =
- StringClassifyDoc("lowercase", "lowercase Unicode characters", true);
-const auto utf8_is_numeric_doc =
- StringClassifyDoc("numeric", "numeric Unicode characters", true);
-const auto utf8_is_printable_doc =
- StringClassifyDoc("printable", "printable Unicode characters", true);
-const auto utf8_is_space_doc =
- StringClassifyDoc("whitespace", "whitespace Unicode characters", true);
-const auto utf8_is_upper_doc =
- StringClassifyDoc("uppercase", "uppercase Unicode characters", true);
-
-const auto utf8_is_title_doc = StringPredicateDoc(
- "Classify strings as titlecase",
- ("For each string in `strings`, emit true iff the string is title-cased,\n"
- "i.e. it has at least one cased character, each uppercase character\n"
- "follows a non-cased character, and each lowercase character follows\n"
- "an uppercase character.\n"));
-
-const FunctionDoc ascii_upper_doc(
- "Transform ASCII input to uppercase",
- ("For each string in `strings`, return an uppercase version.\n\n"
- "This function assumes the input is fully ASCII. It it may contain\n"
- "non-ASCII characters, use \"utf8_upper\" instead."),
- {"strings"});
-
-const FunctionDoc ascii_lower_doc(
- "Transform ASCII input to lowercase",
- ("For each string in `strings`, return a lowercase version.\n\n"
- "This function assumes the input is fully ASCII. If it may contain\n"
- "non-ASCII characters, use \"utf8_lower\" instead."),
- {"strings"});
-
-const FunctionDoc utf8_upper_doc(
- "Transform input to uppercase",
- ("For each string in `strings`, return an uppercase version."), {"strings"});
-
-const FunctionDoc utf8_lower_doc(
- "Transform input to lowercase",
- ("For each string in `strings`, return a lowercase version."), {"strings"});
-
-const FunctionDoc ascii_reverse_doc(
- "Reverse ASCII input",
- ("For each ASCII string in `strings`, return a reversed version.\n\n"
- "This function assumes the input is fully ASCII. If it may contain\n"
- "non-ASCII characters, use \"utf8_reverse\" instead."),
- {"strings"});
-
-const FunctionDoc utf8_reverse_doc(
- "Reverse utf8 input",
- ("For each utf8 string in `strings`, return a reversed version.\n\n"
- "This function operates on codepoints/UTF-8 code units, not grapheme\n"
- "clusters. Hence, it will not correctly reverse grapheme clusters\n"
- "composed of multiple codepoints."),
- {"strings"});
-
+FunctionDoc StringPredicateDoc(std::string summary, std::string description) {
+ return FunctionDoc{std::move(summary), std::move(description), {"strings"}};
+}
+
+FunctionDoc StringClassifyDoc(std::string class_summary, std::string class_desc,
+ bool non_empty) {
+ std::string summary, description;
+ {
+ std::stringstream ss;
+ ss << "Classify strings as " << class_summary;
+ summary = ss.str();
+ }
+ {
+ std::stringstream ss;
+ if (non_empty) {
+ ss
+ << ("For each string in `strings`, emit true iff the string is non-empty\n"
+ "and consists only of ");
+ } else {
+ ss
+ << ("For each string in `strings`, emit true iff the string consists only\n"
+ "of ");
+ }
+ ss << class_desc << ". Null strings emit null.";
+ description = ss.str();
+ }
+ return StringPredicateDoc(std::move(summary), std::move(description));
+}
+
+const auto string_is_ascii_doc = StringClassifyDoc("ASCII", "ASCII characters", false);
+
+const auto ascii_is_alnum_doc =
+ StringClassifyDoc("ASCII alphanumeric", "alphanumeric ASCII characters", true);
+const auto ascii_is_alpha_doc =
+ StringClassifyDoc("ASCII alphabetic", "alphabetic ASCII characters", true);
+const auto ascii_is_decimal_doc =
+ StringClassifyDoc("ASCII decimal", "decimal ASCII characters", true);
+const auto ascii_is_lower_doc =
+ StringClassifyDoc("ASCII lowercase", "lowercase ASCII characters", true);
+const auto ascii_is_printable_doc =
+ StringClassifyDoc("ASCII printable", "printable ASCII characters", true);
+const auto ascii_is_space_doc =
+ StringClassifyDoc("ASCII whitespace", "whitespace ASCII characters", true);
+const auto ascii_is_upper_doc =
+ StringClassifyDoc("ASCII uppercase", "uppercase ASCII characters", true);
+
+const auto ascii_is_title_doc = StringPredicateDoc(
+ "Classify strings as ASCII titlecase",
+ ("For each string in `strings`, emit true iff the string is title-cased,\n"
+ "i.e. it has at least one cased character, each uppercase character\n"
+ "follows a non-cased character, and each lowercase character follows\n"
+ "an uppercase character.\n"));
+
+const auto utf8_is_alnum_doc =
+ StringClassifyDoc("alphanumeric", "alphanumeric Unicode characters", true);
+const auto utf8_is_alpha_doc =
+ StringClassifyDoc("alphabetic", "alphabetic Unicode characters", true);
+const auto utf8_is_decimal_doc =
+ StringClassifyDoc("decimal", "decimal Unicode characters", true);
+const auto utf8_is_digit_doc = StringClassifyDoc("digits", "Unicode digits", true);
+const auto utf8_is_lower_doc =
+ StringClassifyDoc("lowercase", "lowercase Unicode characters", true);
+const auto utf8_is_numeric_doc =
+ StringClassifyDoc("numeric", "numeric Unicode characters", true);
+const auto utf8_is_printable_doc =
+ StringClassifyDoc("printable", "printable Unicode characters", true);
+const auto utf8_is_space_doc =
+ StringClassifyDoc("whitespace", "whitespace Unicode characters", true);
+const auto utf8_is_upper_doc =
+ StringClassifyDoc("uppercase", "uppercase Unicode characters", true);
+
+const auto utf8_is_title_doc = StringPredicateDoc(
+ "Classify strings as titlecase",
+ ("For each string in `strings`, emit true iff the string is title-cased,\n"
+ "i.e. it has at least one cased character, each uppercase character\n"
+ "follows a non-cased character, and each lowercase character follows\n"
+ "an uppercase character.\n"));
+
+const FunctionDoc ascii_upper_doc(
+ "Transform ASCII input to uppercase",
+ ("For each string in `strings`, return an uppercase version.\n\n"
+ "This function assumes the input is fully ASCII. It it may contain\n"
+ "non-ASCII characters, use \"utf8_upper\" instead."),
+ {"strings"});
+
+const FunctionDoc ascii_lower_doc(
+ "Transform ASCII input to lowercase",
+ ("For each string in `strings`, return a lowercase version.\n\n"
+ "This function assumes the input is fully ASCII. If it may contain\n"
+ "non-ASCII characters, use \"utf8_lower\" instead."),
+ {"strings"});
+
+const FunctionDoc utf8_upper_doc(
+ "Transform input to uppercase",
+ ("For each string in `strings`, return an uppercase version."), {"strings"});
+
+const FunctionDoc utf8_lower_doc(
+ "Transform input to lowercase",
+ ("For each string in `strings`, return a lowercase version."), {"strings"});
+
+const FunctionDoc ascii_reverse_doc(
+ "Reverse ASCII input",
+ ("For each ASCII string in `strings`, return a reversed version.\n\n"
+ "This function assumes the input is fully ASCII. If it may contain\n"
+ "non-ASCII characters, use \"utf8_reverse\" instead."),
+ {"strings"});
+
+const FunctionDoc utf8_reverse_doc(
+ "Reverse utf8 input",
+ ("For each utf8 string in `strings`, return a reversed version.\n\n"
+ "This function operates on codepoints/UTF-8 code units, not grapheme\n"
+ "clusters. Hence, it will not correctly reverse grapheme clusters\n"
+ "composed of multiple codepoints."),
+ {"strings"});
+
} // namespace
void RegisterScalarStringAscii(FunctionRegistry* registry) {
- // ascii_upper and ascii_lower are able to reuse the original offsets buffer,
- // so don't preallocate them in the output.
- MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry, &ascii_upper_doc,
- MemAllocation::NO_PREALLOCATE);
- MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry, &ascii_lower_doc,
- MemAllocation::NO_PREALLOCATE);
- MakeUnaryStringBatchKernel<AsciiTrimWhitespace>("ascii_trim_whitespace", registry,
- &ascii_trim_whitespace_doc);
- MakeUnaryStringBatchKernel<AsciiLTrimWhitespace>("ascii_ltrim_whitespace", registry,
- &ascii_ltrim_whitespace_doc);
- MakeUnaryStringBatchKernel<AsciiRTrimWhitespace>("ascii_rtrim_whitespace", registry,
- &ascii_rtrim_whitespace_doc);
- MakeUnaryStringBatchKernel<AsciiReverse>("ascii_reverse", registry, &ascii_reverse_doc);
- MakeUnaryStringBatchKernel<Utf8Reverse>("utf8_reverse", registry, &utf8_reverse_doc);
-
- MakeUnaryStringBatchKernelWithState<AsciiCenter>("ascii_center", registry,
- &ascii_center_doc);
- MakeUnaryStringBatchKernelWithState<AsciiLPad>("ascii_lpad", registry, &ascii_lpad_doc);
- MakeUnaryStringBatchKernelWithState<AsciiRPad>("ascii_rpad", registry, &ascii_rpad_doc);
- MakeUnaryStringBatchKernelWithState<Utf8Center>("utf8_center", registry,
- &utf8_center_doc);
- MakeUnaryStringBatchKernelWithState<Utf8LPad>("utf8_lpad", registry, &utf8_lpad_doc);
- MakeUnaryStringBatchKernelWithState<Utf8RPad>("utf8_rpad", registry, &utf8_rpad_doc);
-
- MakeUnaryStringBatchKernelWithState<AsciiTrim>("ascii_trim", registry, &ascii_trim_doc);
- MakeUnaryStringBatchKernelWithState<AsciiLTrim>("ascii_ltrim", registry,
- &ascii_ltrim_doc);
- MakeUnaryStringBatchKernelWithState<AsciiRTrim>("ascii_rtrim", registry,
- &ascii_rtrim_doc);
-
- AddUnaryStringPredicate<IsAscii>("string_is_ascii", registry, &string_is_ascii_doc);
-
- AddUnaryStringPredicate<IsAlphaNumericAscii>("ascii_is_alnum", registry,
- &ascii_is_alnum_doc);
- AddUnaryStringPredicate<IsAlphaAscii>("ascii_is_alpha", registry, &ascii_is_alpha_doc);
- AddUnaryStringPredicate<IsDecimalAscii>("ascii_is_decimal", registry,
- &ascii_is_decimal_doc);
+ // ascii_upper and ascii_lower are able to reuse the original offsets buffer,
+ // so don't preallocate them in the output.
+ MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry, &ascii_upper_doc,
+ MemAllocation::NO_PREALLOCATE);
+ MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry, &ascii_lower_doc,
+ MemAllocation::NO_PREALLOCATE);
+ MakeUnaryStringBatchKernel<AsciiTrimWhitespace>("ascii_trim_whitespace", registry,
+ &ascii_trim_whitespace_doc);
+ MakeUnaryStringBatchKernel<AsciiLTrimWhitespace>("ascii_ltrim_whitespace", registry,
+ &ascii_ltrim_whitespace_doc);
+ MakeUnaryStringBatchKernel<AsciiRTrimWhitespace>("ascii_rtrim_whitespace", registry,
+ &ascii_rtrim_whitespace_doc);
+ MakeUnaryStringBatchKernel<AsciiReverse>("ascii_reverse", registry, &ascii_reverse_doc);
+ MakeUnaryStringBatchKernel<Utf8Reverse>("utf8_reverse", registry, &utf8_reverse_doc);
+
+ MakeUnaryStringBatchKernelWithState<AsciiCenter>("ascii_center", registry,
+ &ascii_center_doc);
+ MakeUnaryStringBatchKernelWithState<AsciiLPad>("ascii_lpad", registry, &ascii_lpad_doc);
+ MakeUnaryStringBatchKernelWithState<AsciiRPad>("ascii_rpad", registry, &ascii_rpad_doc);
+ MakeUnaryStringBatchKernelWithState<Utf8Center>("utf8_center", registry,
+ &utf8_center_doc);
+ MakeUnaryStringBatchKernelWithState<Utf8LPad>("utf8_lpad", registry, &utf8_lpad_doc);
+ MakeUnaryStringBatchKernelWithState<Utf8RPad>("utf8_rpad", registry, &utf8_rpad_doc);
+
+ MakeUnaryStringBatchKernelWithState<AsciiTrim>("ascii_trim", registry, &ascii_trim_doc);
+ MakeUnaryStringBatchKernelWithState<AsciiLTrim>("ascii_ltrim", registry,
+ &ascii_ltrim_doc);
+ MakeUnaryStringBatchKernelWithState<AsciiRTrim>("ascii_rtrim", registry,
+ &ascii_rtrim_doc);
+
+ AddUnaryStringPredicate<IsAscii>("string_is_ascii", registry, &string_is_ascii_doc);
+
+ AddUnaryStringPredicate<IsAlphaNumericAscii>("ascii_is_alnum", registry,
+ &ascii_is_alnum_doc);
+ AddUnaryStringPredicate<IsAlphaAscii>("ascii_is_alpha", registry, &ascii_is_alpha_doc);
+ AddUnaryStringPredicate<IsDecimalAscii>("ascii_is_decimal", registry,
+ &ascii_is_decimal_doc);
// no is_digit for ascii, since it is the same as is_decimal
- AddUnaryStringPredicate<IsLowerAscii>("ascii_is_lower", registry, &ascii_is_lower_doc);
+ AddUnaryStringPredicate<IsLowerAscii>("ascii_is_lower", registry, &ascii_is_lower_doc);
// no is_numeric for ascii, since it is the same as is_decimal
- AddUnaryStringPredicate<IsPrintableAscii>("ascii_is_printable", registry,
- &ascii_is_printable_doc);
- AddUnaryStringPredicate<IsSpaceAscii>("ascii_is_space", registry, &ascii_is_space_doc);
- AddUnaryStringPredicate<IsTitleAscii>("ascii_is_title", registry, &ascii_is_title_doc);
- AddUnaryStringPredicate<IsUpperAscii>("ascii_is_upper", registry, &ascii_is_upper_doc);
+ AddUnaryStringPredicate<IsPrintableAscii>("ascii_is_printable", registry,
+ &ascii_is_printable_doc);
+ AddUnaryStringPredicate<IsSpaceAscii>("ascii_is_space", registry, &ascii_is_space_doc);
+ AddUnaryStringPredicate<IsTitleAscii>("ascii_is_title", registry, &ascii_is_title_doc);
+ AddUnaryStringPredicate<IsUpperAscii>("ascii_is_upper", registry, &ascii_is_upper_doc);
#ifdef ARROW_WITH_UTF8PROC
- MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry, &utf8_upper_doc);
- MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry, &utf8_lower_doc);
- MakeUnaryStringBatchKernel<UTF8TrimWhitespace>("utf8_trim_whitespace", registry,
- &utf8_trim_whitespace_doc);
- MakeUnaryStringBatchKernel<UTF8LTrimWhitespace>("utf8_ltrim_whitespace", registry,
- &utf8_ltrim_whitespace_doc);
- MakeUnaryStringBatchKernel<UTF8RTrimWhitespace>("utf8_rtrim_whitespace", registry,
- &utf8_rtrim_whitespace_doc);
- MakeUnaryStringBatchKernelWithState<UTF8Trim>("utf8_trim", registry, &utf8_trim_doc);
- MakeUnaryStringBatchKernelWithState<UTF8LTrim>("utf8_ltrim", registry, &utf8_ltrim_doc);
- MakeUnaryStringBatchKernelWithState<UTF8RTrim>("utf8_rtrim", registry, &utf8_rtrim_doc);
-
- AddUnaryStringPredicate<IsAlphaNumericUnicode>("utf8_is_alnum", registry,
- &utf8_is_alnum_doc);
- AddUnaryStringPredicate<IsAlphaUnicode>("utf8_is_alpha", registry, &utf8_is_alpha_doc);
- AddUnaryStringPredicate<IsDecimalUnicode>("utf8_is_decimal", registry,
- &utf8_is_decimal_doc);
- AddUnaryStringPredicate<IsDigitUnicode>("utf8_is_digit", registry, &utf8_is_digit_doc);
- AddUnaryStringPredicate<IsLowerUnicode>("utf8_is_lower", registry, &utf8_is_lower_doc);
- AddUnaryStringPredicate<IsNumericUnicode>("utf8_is_numeric", registry,
- &utf8_is_numeric_doc);
- AddUnaryStringPredicate<IsPrintableUnicode>("utf8_is_printable", registry,
- &utf8_is_printable_doc);
- AddUnaryStringPredicate<IsSpaceUnicode>("utf8_is_space", registry, &utf8_is_space_doc);
- AddUnaryStringPredicate<IsTitleUnicode>("utf8_is_title", registry, &utf8_is_title_doc);
- AddUnaryStringPredicate<IsUpperUnicode>("utf8_is_upper", registry, &utf8_is_upper_doc);
+ MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry, &utf8_upper_doc);
+ MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry, &utf8_lower_doc);
+ MakeUnaryStringBatchKernel<UTF8TrimWhitespace>("utf8_trim_whitespace", registry,
+ &utf8_trim_whitespace_doc);
+ MakeUnaryStringBatchKernel<UTF8LTrimWhitespace>("utf8_ltrim_whitespace", registry,
+ &utf8_ltrim_whitespace_doc);
+ MakeUnaryStringBatchKernel<UTF8RTrimWhitespace>("utf8_rtrim_whitespace", registry,
+ &utf8_rtrim_whitespace_doc);
+ MakeUnaryStringBatchKernelWithState<UTF8Trim>("utf8_trim", registry, &utf8_trim_doc);
+ MakeUnaryStringBatchKernelWithState<UTF8LTrim>("utf8_ltrim", registry, &utf8_ltrim_doc);
+ MakeUnaryStringBatchKernelWithState<UTF8RTrim>("utf8_rtrim", registry, &utf8_rtrim_doc);
+
+ AddUnaryStringPredicate<IsAlphaNumericUnicode>("utf8_is_alnum", registry,
+ &utf8_is_alnum_doc);
+ AddUnaryStringPredicate<IsAlphaUnicode>("utf8_is_alpha", registry, &utf8_is_alpha_doc);
+ AddUnaryStringPredicate<IsDecimalUnicode>("utf8_is_decimal", registry,
+ &utf8_is_decimal_doc);
+ AddUnaryStringPredicate<IsDigitUnicode>("utf8_is_digit", registry, &utf8_is_digit_doc);
+ AddUnaryStringPredicate<IsLowerUnicode>("utf8_is_lower", registry, &utf8_is_lower_doc);
+ AddUnaryStringPredicate<IsNumericUnicode>("utf8_is_numeric", registry,
+ &utf8_is_numeric_doc);
+ AddUnaryStringPredicate<IsPrintableUnicode>("utf8_is_printable", registry,
+ &utf8_is_printable_doc);
+ AddUnaryStringPredicate<IsSpaceUnicode>("utf8_is_space", registry, &utf8_is_space_doc);
+ AddUnaryStringPredicate<IsTitleUnicode>("utf8_is_title", registry, &utf8_is_title_doc);
+ AddUnaryStringPredicate<IsUpperUnicode>("utf8_is_upper", registry, &utf8_is_upper_doc);
#endif
AddBinaryLength(registry);
- AddUtf8Length(registry);
+ AddUtf8Length(registry);
AddMatchSubstring(registry);
- AddFindSubstring(registry);
- AddCountSubstring(registry);
- MakeUnaryStringBatchKernelWithState<ReplaceSubStringPlain>(
- "replace_substring", registry, &replace_substring_doc,
- MemAllocation::NO_PREALLOCATE);
-#ifdef ARROW_WITH_RE2
- MakeUnaryStringBatchKernelWithState<ReplaceSubStringRegex>(
- "replace_substring_regex", registry, &replace_substring_regex_doc,
- MemAllocation::NO_PREALLOCATE);
- AddExtractRegex(registry);
-#endif
- AddReplaceSlice(registry);
- AddSlice(registry);
- AddSplit(registry);
+ AddFindSubstring(registry);
+ AddCountSubstring(registry);
+ MakeUnaryStringBatchKernelWithState<ReplaceSubStringPlain>(
+ "replace_substring", registry, &replace_substring_doc,
+ MemAllocation::NO_PREALLOCATE);
+#ifdef ARROW_WITH_RE2
+ MakeUnaryStringBatchKernelWithState<ReplaceSubStringRegex>(
+ "replace_substring_regex", registry, &replace_substring_regex_doc,
+ MemAllocation::NO_PREALLOCATE);
+ AddExtractRegex(registry);
+#endif
+ AddReplaceSlice(registry);
+ AddSlice(registry);
+ AddSplit(registry);
AddStrptime(registry);
- AddBinaryJoin(registry);
+ AddBinaryJoin(registry);
}
} // namespace internal
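// ---- Editorial sketch (not part of this diff) ------------------------------
// End-to-end use of two kernels registered above, assuming the default
// registry and the public compute API. Dispatch picks the utf8 or large_utf8
// kernel from the input type automatically.
arrow::Result<arrow::Datum> UpperThenClassify(const arrow::Datum& strings) {
  ARROW_ASSIGN_OR_RAISE(auto upper,
                        arrow::compute::CallFunction("ascii_upper", {strings}));
  return arrow::compute::CallFunction("string_is_ascii", {upper});
}
// ----------------------------------------------------------------------------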
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc
index e9375664a90..f0257772d4a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_temporal.cc
@@ -1,663 +1,663 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/builder.h"
-#include "arrow/compute/api_scalar.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/time.h"
-#include "arrow/vendored/datetime.h"
-
-namespace arrow {
-
-using internal::checked_cast;
-using internal::checked_pointer_cast;
-
-namespace compute {
-namespace internal {
-
-namespace {
-
-using arrow_vendored::date::days;
-using arrow_vendored::date::floor;
-using arrow_vendored::date::hh_mm_ss;
-using arrow_vendored::date::sys_time;
-using arrow_vendored::date::trunc;
-using arrow_vendored::date::weekday;
-using arrow_vendored::date::weeks;
-using arrow_vendored::date::year_month_day;
-using arrow_vendored::date::years;
-using arrow_vendored::date::literals::dec;
-using arrow_vendored::date::literals::jan;
-using arrow_vendored::date::literals::last;
-using arrow_vendored::date::literals::mon;
-using arrow_vendored::date::literals::thu;
-using internal::applicator::ScalarUnaryNotNull;
-using internal::applicator::SimpleUnary;
-
-using DayOfWeekState = OptionsWrapper<DayOfWeekOptions>;
-
-const std::string& GetInputTimezone(const Datum& datum) {
- return checked_cast<const TimestampType&>(*datum.type()).timezone();
-}
-
-const std::string& GetInputTimezone(const Scalar& scalar) {
- return checked_cast<const TimestampType&>(*scalar.type).timezone();
-}
-
-const std::string& GetInputTimezone(const ArrayData& array) {
- return checked_cast<const TimestampType&>(*array.type).timezone();
-}
-
-template <typename T>
-Status TemporalComponentExtractCheckTimezone(const T& input) {
- const auto& timezone = GetInputTimezone(input);
- if (!timezone.empty()) {
- return Status::NotImplemented(
- "Cannot extract components from timestamp with specific timezone: ", timezone);
- }
- return Status::OK();
-}
-
-template <typename Op, typename OutType>
-struct TemporalComponentExtract {
- using OutValue = typename internal::GetOutputType<OutType>::T;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(batch.values[0]));
- return ScalarUnaryNotNull<OutType, TimestampType, Op>::Exec(ctx, batch, out);
- }
-};
-
-template <typename Op, typename OutType>
-struct DayOfWeekExec {
- using OutValue = typename internal::GetOutputType<OutType>::T;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const DayOfWeekOptions& options = DayOfWeekState::Get(ctx);
- if (options.week_start < 1 || 7 < options.week_start) {
- return Status::Invalid(
- "week_start must follow ISO convention (Monday=1, Sunday=7). Got week_start=",
- options.week_start);
- }
-
- RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(batch.values[0]));
- applicator::ScalarUnaryNotNullStateful<OutType, TimestampType, Op> kernel{
- Op(options)};
- return kernel.Exec(ctx, batch, out);
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract year from timestamp
-
-template <typename Duration>
-struct Year {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- return static_cast<T>(static_cast<const int32_t>(
- year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).year()));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract month from timestamp
-
-template <typename Duration>
-struct Month {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- return static_cast<T>(static_cast<const uint32_t>(
- year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).month()));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract day from timestamp
-
-template <typename Duration>
-struct Day {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- return static_cast<T>(static_cast<const uint32_t>(
- year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).day()));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract day of week from timestamp
-//
-// By default week starts on Monday represented by 0 and ends on Sunday represented
-// by 6. Start day of the week (Monday=1, Sunday=7) and numbering start (0 or 1) can be
-// set using DayOfWeekOptions
-
-template <typename Duration>
-struct DayOfWeek {
- explicit DayOfWeek(const DayOfWeekOptions& options) {
- for (int i = 0; i < 7; i++) {
- lookup_table[i] = i + 8 - options.week_start;
- lookup_table[i] = (lookup_table[i] > 6) ? lookup_table[i] - 7 : lookup_table[i];
- lookup_table[i] += options.one_based_numbering;
- }
- }
-
- template <typename T, typename Arg0>
- T Call(KernelContext*, Arg0 arg, Status*) const {
- const auto wd = arrow_vendored::date::year_month_weekday(
- floor<days>(sys_time<Duration>(Duration{arg})))
- .weekday()
- .iso_encoding();
- return lookup_table[wd - 1];
- }
- std::array<int64_t, 7> lookup_table;
-};
-
-// ----------------------------------------------------------------------
-// Extract day of year from timestamp
-
-template <typename Duration>
-struct DayOfYear {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
- return static_cast<T>(
- (t - sys_time<days>(year_month_day(t).year() / jan / 0)).count());
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract ISO Year values from timestamp
-//
-// First week of an ISO year has the majority (4 or more) of its days in January.
-// Last week of an ISO year has the year's last Thursday in it.
-
-template <typename Duration>
-struct ISOYear {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
- auto y = year_month_day{t + days{3}}.year();
- auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
- if (t < start) {
- --y;
- }
- return static_cast<T>(static_cast<int32_t>(y));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract ISO week from timestamp
-//
-// First week of an ISO year has the majority (4 or more) of its days in January.
-// Last week of an ISO year has the year's last Thursday in it.
-// Based on
-// https://github.com/HowardHinnant/date/blob/6e921e1b1d21e84a5c82416ba7ecd98e33a436d0/include/date/iso_week.h#L1503
-template <typename Duration>
-struct ISOWeek {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
- auto y = year_month_day{t + days{3}}.year();
- auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
- if (t < start) {
- --y;
- start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
- }
- return static_cast<T>(trunc<weeks>(t - start).count() + 1);
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract quarter from timestamp
-
-template <typename Duration>
-struct Quarter {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- const auto ymd = year_month_day(floor<days>(sys_time<Duration>(Duration{arg})));
- return static_cast<T>((static_cast<const uint32_t>(ymd.month()) - 1) / 3 + 1);
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract hour from timestamp
-
-template <typename Duration>
-struct Hour {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>((t - floor<days>(t)) / std::chrono::hours(1));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract minute from timestamp
-
-template <typename Duration>
-struct Minute {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>((t - floor<std::chrono::hours>(t)) / std::chrono::minutes(1));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract second from timestamp
-
-template <typename Duration>
-struct Second {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>((t - floor<std::chrono::minutes>(t)) / std::chrono::seconds(1));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract subsecond from timestamp
-
-template <typename Duration>
-struct Subsecond {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>(
- (std::chrono::duration<double>(t - floor<std::chrono::seconds>(t)).count()));
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract milliseconds from timestamp
-
-template <typename Duration>
-struct Millisecond {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>(
- ((t - floor<std::chrono::seconds>(t)) / std::chrono::milliseconds(1)) % 1000);
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract microseconds from timestamp
-
-template <typename Duration>
-struct Microsecond {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>(
- ((t - floor<std::chrono::seconds>(t)) / std::chrono::microseconds(1)) % 1000);
- }
-};
-
-// ----------------------------------------------------------------------
-// Extract nanoseconds from timestamp
-
-template <typename Duration>
-struct Nanosecond {
- template <typename T, typename Arg0>
- static T Call(KernelContext*, Arg0 arg, Status*) {
- Duration t = Duration{arg};
- return static_cast<T>(
- ((t - floor<std::chrono::seconds>(t)) / std::chrono::nanoseconds(1)) % 1000);
- }
-};
-
-template <typename Duration>
-inline std::vector<int64_t> get_iso_calendar(int64_t arg) {
- const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
- const auto ymd = year_month_day(t);
- auto y = year_month_day{t + days{3}}.year();
- auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
- if (t < start) {
- --y;
- start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
- }
- return {static_cast<int64_t>(static_cast<int32_t>(y)),
- static_cast<int64_t>(trunc<weeks>(t - start).count() + 1),
- static_cast<int64_t>(weekday(ymd).iso_encoding())};
-}
-
-// ----------------------------------------------------------------------
-// Extract ISO calendar values from timestamp
-
-template <typename Duration>
-struct ISOCalendar {
- static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
- RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in));
- if (in.is_valid) {
- const std::shared_ptr<DataType> iso_calendar_type =
- struct_({field("iso_year", int64()), field("iso_week", int64()),
- field("iso_day_of_week", int64())});
- const auto& in_val = internal::UnboxScalar<const TimestampType>::Unbox(in);
- const auto iso_calendar = get_iso_calendar<Duration>(in_val);
-
- std::vector<std::shared_ptr<Scalar>> values = {
- std::make_shared<Int64Scalar>(iso_calendar[0]),
- std::make_shared<Int64Scalar>(iso_calendar[1]),
- std::make_shared<Int64Scalar>(iso_calendar[2])};
- *checked_cast<StructScalar*>(out) = StructScalar(values, iso_calendar_type);
- } else {
- out->is_valid = false;
- }
- return Status::OK();
- }
-
- static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
- using BuilderType = typename TypeTraits<Int64Type>::BuilderType;
-
- RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in));
- const std::shared_ptr<DataType> iso_calendar_type =
- struct_({field("iso_year", int64()), field("iso_week", int64()),
- field("iso_day_of_week", int64())});
-
- std::unique_ptr<ArrayBuilder> array_builder;
- RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), iso_calendar_type, &array_builder));
- StructBuilder* struct_builder = checked_cast<StructBuilder*>(array_builder.get());
- RETURN_NOT_OK(struct_builder->Reserve(in.length));
-
- std::vector<BuilderType*> field_builders;
- field_builders.reserve(3);
- for (int i = 0; i < 3; i++) {
- field_builders.push_back(
- checked_cast<BuilderType*>(struct_builder->field_builder(i)));
- RETURN_NOT_OK(field_builders[i]->Reserve(1));
- }
- auto visit_null = [&]() { return struct_builder->AppendNull(); };
- auto visit_value = [&](int64_t arg) {
- const auto iso_calendar = get_iso_calendar<Duration>(arg);
- field_builders[0]->UnsafeAppend(iso_calendar[0]);
- field_builders[1]->UnsafeAppend(iso_calendar[1]);
- field_builders[2]->UnsafeAppend(iso_calendar[2]);
- return struct_builder->Append();
- };
- RETURN_NOT_OK(VisitArrayDataInline<Int64Type>(in, visit_value, visit_null));
-
- std::shared_ptr<Array> out_array;
- RETURN_NOT_OK(struct_builder->Finish(&out_array));
- *out = *std::move(out_array->data());
-
- return Status::OK();
- }
-};
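// ---- Editorial sketch (not part of this diff) ------------------------------
// Invoking the struct-returning kernel above; `timestamps` stands for any
// timestamp array without a timezone. The result is a
// struct<iso_year: int64, iso_week: int64, iso_day_of_week: int64>.
arrow::Result<arrow::Datum> IsoCalendarOf(const arrow::Datum& timestamps) {
  return arrow::compute::CallFunction("iso_calendar", {timestamps});
}
// ----------------------------------------------------------------------------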
-
-template <template <typename...> class Op, typename OutType>
-std::shared_ptr<ScalarFunction> MakeTemporal(std::string name, const FunctionDoc* doc) {
- const auto& out_type = TypeTraits<OutType>::type_singleton();
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
-
- for (auto unit : internal::AllTimeUnits()) {
- InputType in_type{match::TimestampTypeUnit(unit)};
- switch (unit) {
- case TimeUnit::SECOND: {
- auto exec = TemporalComponentExtract<Op<std::chrono::seconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- case TimeUnit::MILLI: {
- auto exec =
- TemporalComponentExtract<Op<std::chrono::milliseconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- case TimeUnit::MICRO: {
- auto exec =
- TemporalComponentExtract<Op<std::chrono::microseconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- case TimeUnit::NANO: {
- auto exec = TemporalComponentExtract<Op<std::chrono::nanoseconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- }
- }
- return func;
-}
-
-template <template <typename...> class Op, typename OutType>
-std::shared_ptr<ScalarFunction> MakeTemporalWithOptions(
- std::string name, const FunctionDoc* doc, const DayOfWeekOptions& default_options,
- KernelInit init) {
- const auto& out_type = TypeTraits<OutType>::type_singleton();
- auto func =
- std::make_shared<ScalarFunction>(name, Arity::Unary(), doc, &default_options);
-
- for (auto unit : internal::AllTimeUnits()) {
- InputType in_type{match::TimestampTypeUnit(unit)};
- switch (unit) {
- case TimeUnit::SECOND: {
- auto exec = DayOfWeekExec<Op<std::chrono::seconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
- break;
- }
- case TimeUnit::MILLI: {
- auto exec = DayOfWeekExec<Op<std::chrono::milliseconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
- break;
- }
- case TimeUnit::MICRO: {
- auto exec = DayOfWeekExec<Op<std::chrono::microseconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
- break;
- }
- case TimeUnit::NANO: {
- auto exec = DayOfWeekExec<Op<std::chrono::nanoseconds>, OutType>::Exec;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
- break;
- }
- }
- }
- return func;
-}
-
-template <template <typename...> class Op>
-std::shared_ptr<ScalarFunction> MakeStructTemporal(std::string name,
- const FunctionDoc* doc) {
- const auto& out_type = struct_({field("iso_year", int64()), field("iso_week", int64()),
- field("iso_day_of_week", int64())});
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
-
- for (auto unit : internal::AllTimeUnits()) {
- InputType in_type{match::TimestampTypeUnit(unit)};
- switch (unit) {
- case TimeUnit::SECOND: {
- auto exec = SimpleUnary<Op<std::chrono::seconds>>;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- case TimeUnit::MILLI: {
- auto exec = SimpleUnary<Op<std::chrono::milliseconds>>;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- case TimeUnit::MICRO: {
- auto exec = SimpleUnary<Op<std::chrono::microseconds>>;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- case TimeUnit::NANO: {
- auto exec = SimpleUnary<Op<std::chrono::nanoseconds>>;
- DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
- break;
- }
- }
- }
- return func;
-}
-
-const FunctionDoc year_doc{
- "Extract year from timestamp",
- "Returns an error if timestamp has a defined timezone. Null values return null.",
- {"values"}};
-
-const FunctionDoc month_doc{
- "Extract month number",
- ("Month is encoded as January=1, December=12.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc day_doc{
- "Extract day number",
- "Returns an error if timestamp has a defined timezone. Null values return null.",
- {"values"}};
-
-const FunctionDoc day_of_week_doc{
- "Extract day of the week number",
- ("By default, the week starts on Monday represented by 0 and ends on Sunday "
- "represented by 6.\n"
- "DayOfWeekOptions.week_start can be used to set another starting day using ISO "
- "convention (Monday=1, Sunday=7). Day numbering can start with 0 or 1 using "
- "DayOfWeekOptions.one_based_numbering parameter.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"},
- "DayOfWeekOptions"};
-
-const FunctionDoc day_of_year_doc{
- "Extract number of day of year",
- ("January 1st maps to day number 1, February 1st to 32, etc.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc iso_year_doc{
- "Extract ISO year number",
- ("First week of an ISO year has the majority (4 or more) of its days in January."
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc iso_week_doc{
- "Extract ISO week of year number",
- ("First ISO week has the majority (4 or more) of its days in January.\n"
- "Week of the year starts with 1 and can run up to 53.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc iso_calendar_doc{
- "Extract (ISO year, ISO week, ISO day of week) struct",
- ("ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc quarter_doc{
- "Extract quarter of year number",
- ("First quarter maps to 1 and forth quarter maps to 4.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc hour_doc{
- "Extract hour value",
- "Returns an error if timestamp has a defined timezone. Null values return null.",
- {"values"}};
-
-const FunctionDoc minute_doc{
- "Extract minute values",
- "Returns an error if timestamp has a defined timezone. Null values return null.",
- {"values"}};
-
-const FunctionDoc second_doc{
- "Extract second values",
- "Returns an error if timestamp has a defined timezone. Null values return null.",
- {"values"}};
-
-const FunctionDoc millisecond_doc{
- "Extract millisecond values",
- ("Millisecond returns number of milliseconds since the last full second.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc microsecond_doc{
- "Extract microsecond values",
- ("Millisecond returns number of microseconds since the last full millisecond.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc nanosecond_doc{
- "Extract nanosecond values",
- ("Nanosecond returns number of nanoseconds since the last full microsecond.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-const FunctionDoc subsecond_doc{
- "Extract subsecond values",
- ("Subsecond returns the fraction of a second since the last full second.\n"
- "Returns an error if timestamp has a defined timezone. Null values return null."),
- {"values"}};
-
-} // namespace
-
-void RegisterScalarTemporal(FunctionRegistry* registry) {
- auto year = MakeTemporal<Year, Int64Type>("year", &year_doc);
- DCHECK_OK(registry->AddFunction(std::move(year)));
-
-  auto month = MakeTemporal<Month, Int64Type>("month", &month_doc);
- DCHECK_OK(registry->AddFunction(std::move(month)));
-
-  auto day = MakeTemporal<Day, Int64Type>("day", &day_doc);
- DCHECK_OK(registry->AddFunction(std::move(day)));
-
- static auto default_day_of_week_options = DayOfWeekOptions::Defaults();
- auto day_of_week = MakeTemporalWithOptions<DayOfWeek, Int64Type>(
- "day_of_week", &day_of_week_doc, default_day_of_week_options, DayOfWeekState::Init);
- DCHECK_OK(registry->AddFunction(std::move(day_of_week)));
-
- auto day_of_year = MakeTemporal<DayOfYear, Int64Type>("day_of_year", &day_of_year_doc);
- DCHECK_OK(registry->AddFunction(std::move(day_of_year)));
-
- auto iso_year = MakeTemporal<ISOYear, Int64Type>("iso_year", &iso_year_doc);
- DCHECK_OK(registry->AddFunction(std::move(iso_year)));
-
- auto iso_week = MakeTemporal<ISOWeek, Int64Type>("iso_week", &iso_week_doc);
- DCHECK_OK(registry->AddFunction(std::move(iso_week)));
-
- auto iso_calendar = MakeStructTemporal<ISOCalendar>("iso_calendar", &iso_calendar_doc);
- DCHECK_OK(registry->AddFunction(std::move(iso_calendar)));
-
- auto quarter = MakeTemporal<Quarter, Int64Type>("quarter", &quarter_doc);
- DCHECK_OK(registry->AddFunction(std::move(quarter)));
-
- auto hour = MakeTemporal<Hour, Int64Type>("hour", &hour_doc);
- DCHECK_OK(registry->AddFunction(std::move(hour)));
-
- auto minute = MakeTemporal<Minute, Int64Type>("minute", &minute_doc);
- DCHECK_OK(registry->AddFunction(std::move(minute)));
-
- auto second = MakeTemporal<Second, Int64Type>("second", &second_doc);
- DCHECK_OK(registry->AddFunction(std::move(second)));
-
- auto millisecond =
- MakeTemporal<Millisecond, Int64Type>("millisecond", &millisecond_doc);
- DCHECK_OK(registry->AddFunction(std::move(millisecond)));
-
- auto microsecond =
- MakeTemporal<Microsecond, Int64Type>("microsecond", &microsecond_doc);
- DCHECK_OK(registry->AddFunction(std::move(microsecond)));
-
- auto nanosecond = MakeTemporal<Nanosecond, Int64Type>("nanosecond", &nanosecond_doc);
- DCHECK_OK(registry->AddFunction(std::move(nanosecond)));
-
- auto subsecond = MakeTemporal<Subsecond, DoubleType>("subsecond", &subsecond_doc);
- DCHECK_OK(registry->AddFunction(std::move(subsecond)));
-}
-
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/builder.h"
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/time.h"
+#include "arrow/vendored/datetime.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+namespace compute {
+namespace internal {
+
+namespace {
+
+using arrow_vendored::date::days;
+using arrow_vendored::date::floor;
+using arrow_vendored::date::hh_mm_ss;
+using arrow_vendored::date::sys_time;
+using arrow_vendored::date::trunc;
+using arrow_vendored::date::weekday;
+using arrow_vendored::date::weeks;
+using arrow_vendored::date::year_month_day;
+using arrow_vendored::date::years;
+using arrow_vendored::date::literals::dec;
+using arrow_vendored::date::literals::jan;
+using arrow_vendored::date::literals::last;
+using arrow_vendored::date::literals::mon;
+using arrow_vendored::date::literals::thu;
+using internal::applicator::ScalarUnaryNotNull;
+using internal::applicator::SimpleUnary;
+
+using DayOfWeekState = OptionsWrapper<DayOfWeekOptions>;
+
+const std::string& GetInputTimezone(const Datum& datum) {
+ return checked_cast<const TimestampType&>(*datum.type()).timezone();
+}
+
+const std::string& GetInputTimezone(const Scalar& scalar) {
+ return checked_cast<const TimestampType&>(*scalar.type).timezone();
+}
+
+const std::string& GetInputTimezone(const ArrayData& array) {
+ return checked_cast<const TimestampType&>(*array.type).timezone();
+}
+
+template <typename T>
+Status TemporalComponentExtractCheckTimezone(const T& input) {
+ const auto& timezone = GetInputTimezone(input);
+ if (!timezone.empty()) {
+ return Status::NotImplemented(
+ "Cannot extract components from timestamp with specific timezone: ", timezone);
+ }
+ return Status::OK();
+}
+
+template <typename Op, typename OutType>
+struct TemporalComponentExtract {
+ using OutValue = typename internal::GetOutputType<OutType>::T;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(batch.values[0]));
+ return ScalarUnaryNotNull<OutType, TimestampType, Op>::Exec(ctx, batch, out);
+ }
+};
+
+template <typename Op, typename OutType>
+struct DayOfWeekExec {
+ using OutValue = typename internal::GetOutputType<OutType>::T;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const DayOfWeekOptions& options = DayOfWeekState::Get(ctx);
+ if (options.week_start < 1 || 7 < options.week_start) {
+ return Status::Invalid(
+ "week_start must follow ISO convention (Monday=1, Sunday=7). Got week_start=",
+ options.week_start);
+ }
+
+ RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(batch.values[0]));
+ applicator::ScalarUnaryNotNullStateful<OutType, TimestampType, Op> kernel{
+ Op(options)};
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract year from timestamp
+
+template <typename Duration>
+struct Year {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ return static_cast<T>(static_cast<const int32_t>(
+ year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).year()));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract month from timestamp
+
+template <typename Duration>
+struct Month {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ return static_cast<T>(static_cast<const uint32_t>(
+ year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).month()));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract day from timestamp
+
+template <typename Duration>
+struct Day {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ return static_cast<T>(static_cast<const uint32_t>(
+ year_month_day(floor<days>(sys_time<Duration>(Duration{arg}))).day()));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract day of week from timestamp
+//
+// By default week starts on Monday represented by 0 and ends on Sunday represented
+// by 6. Start day of the week (Monday=1, Sunday=7) and numbering start (0 or 1) can be
+// set using DayOfWeekOptions
+
+template <typename Duration>
+struct DayOfWeek {
+ explicit DayOfWeek(const DayOfWeekOptions& options) {
+ for (int i = 0; i < 7; i++) {
+ lookup_table[i] = i + 8 - options.week_start;
+ lookup_table[i] = (lookup_table[i] > 6) ? lookup_table[i] - 7 : lookup_table[i];
+ lookup_table[i] += options.one_based_numbering;
+ }
+ }
+
+ template <typename T, typename Arg0>
+ T Call(KernelContext*, Arg0 arg, Status*) const {
+ const auto wd = arrow_vendored::date::year_month_weekday(
+ floor<days>(sys_time<Duration>(Duration{arg})))
+ .weekday()
+ .iso_encoding();
+ return lookup_table[wd - 1];
+ }
+ std::array<int64_t, 7> lookup_table;
+};
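+
+// For illustration (values follow from the constructor above): with the
+// defaults week_start=1 (Monday) and one_based_numbering=false the table is
+// {0, 1, 2, 3, 4, 5, 6}, i.e. Monday=0 .. Sunday=6; with week_start=7
+// (Sunday) and one_based_numbering=true it becomes {2, 3, 4, 5, 6, 7, 1},
+// i.e. Sunday=1, Monday=2 .. Saturday=7.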
+
+// ----------------------------------------------------------------------
+// Extract day of year from timestamp
+
+template <typename Duration>
+struct DayOfYear {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
+ return static_cast<T>(
+ (t - sys_time<days>(year_month_day(t).year() / jan / 0)).count());
+ }
+};
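+
+// Note that `year / jan / 0` above denotes the day before January 1st, so
+// January 1st yields 1, February 1st yields 32, and December 31st yields 365
+// (366 in leap years).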
+
+// ----------------------------------------------------------------------
+// Extract ISO year values from timestamp
+//
+// The first week of an ISO year has the majority (4 or more) of its days in
+// January. The last week of an ISO year contains the year's last Thursday.
+
+template <typename Duration>
+struct ISOYear {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
+ auto y = year_month_day{t + days{3}}.year();
+ auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ if (t < start) {
+ --y;
+ }
+ return static_cast<T>(static_cast<int32_t>(y));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract ISO week from timestamp
+//
+// The first week of an ISO year has the majority (4 or more) of its days in
+// January. The last week of an ISO year contains the year's last Thursday.
+// Based on
+// https://github.com/HowardHinnant/date/blob/6e921e1b1d21e84a5c82416ba7ecd98e33a436d0/include/date/iso_week.h#L1503
+template <typename Duration>
+struct ISOWeek {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
+ auto y = year_month_day{t + days{3}}.year();
+ auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ if (t < start) {
+ --y;
+ start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ }
+ return static_cast<T>(trunc<weeks>(t - start).count() + 1);
+ }
+};
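+
+// Worked example (a sketch, not exercised here): for 2021-01-01, the Monday
+// derived from the last Thursday of December 2020 is 2021-01-04, so the date
+// falls before the start of ISO year 2021. ISOYear therefore yields 2020 and
+// ISOWeek yields 53 (ISO week 53 of 2020 runs 2020-12-28 .. 2021-01-03).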
+
+// ----------------------------------------------------------------------
+// Extract quarter from timestamp
+
+template <typename Duration>
+struct Quarter {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ const auto ymd = year_month_day(floor<days>(sys_time<Duration>(Duration{arg})));
+ return static_cast<T>((static_cast<const uint32_t>(ymd.month()) - 1) / 3 + 1);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract hour from timestamp
+
+template <typename Duration>
+struct Hour {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>((t - floor<days>(t)) / std::chrono::hours(1));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract minute from timestamp
+
+template <typename Duration>
+struct Minute {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>((t - floor<std::chrono::hours>(t)) / std::chrono::minutes(1));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract second from timestamp
+
+template <typename Duration>
+struct Second {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>((t - floor<std::chrono::minutes>(t)) / std::chrono::seconds(1));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract subsecond from timestamp
+
+template <typename Duration>
+struct Subsecond {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>(
+ (std::chrono::duration<double>(t - floor<std::chrono::seconds>(t)).count()));
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract milliseconds from timestamp
+
+template <typename Duration>
+struct Millisecond {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>(
+ ((t - floor<std::chrono::seconds>(t)) / std::chrono::milliseconds(1)) % 1000);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract microseconds from timestamp
+
+template <typename Duration>
+struct Microsecond {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>(
+ ((t - floor<std::chrono::seconds>(t)) / std::chrono::microseconds(1)) % 1000);
+ }
+};
+
+// ----------------------------------------------------------------------
+// Extract nanoseconds from timestamp
+
+template <typename Duration>
+struct Nanosecond {
+ template <typename T, typename Arg0>
+ static T Call(KernelContext*, Arg0 arg, Status*) {
+ Duration t = Duration{arg};
+ return static_cast<T>(
+ ((t - floor<std::chrono::seconds>(t)) / std::chrono::nanoseconds(1)) % 1000);
+ }
+};
+
+template <typename Duration>
+inline std::vector<int64_t> get_iso_calendar(int64_t arg) {
+ const auto t = floor<days>(sys_time<Duration>(Duration{arg}));
+ const auto ymd = year_month_day(t);
+ auto y = year_month_day{t + days{3}}.year();
+ auto start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ if (t < start) {
+ --y;
+ start = sys_time<days>((y - years{1}) / dec / thu[last]) + (mon - thu);
+ }
+ return {static_cast<int64_t>(static_cast<int32_t>(y)),
+ static_cast<int64_t>(trunc<weeks>(t - start).count() + 1),
+ static_cast<int64_t>(weekday(ymd).iso_encoding())};
+}
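+
+// For example (illustrative only):
+//   get_iso_calendar<std::chrono::seconds>(1609459200)  // 2021-01-01T00:00:00Z
+// returns {2020, 53, 5}: ISO year 2020, ISO week 53, ISO weekday 5 (Friday).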
+
+// ----------------------------------------------------------------------
+// Extract ISO calendar values from timestamp
+
+template <typename Duration>
+struct ISOCalendar {
+ static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+ RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in));
+ if (in.is_valid) {
+ const std::shared_ptr<DataType> iso_calendar_type =
+ struct_({field("iso_year", int64()), field("iso_week", int64()),
+ field("iso_day_of_week", int64())});
+ const auto& in_val = internal::UnboxScalar<const TimestampType>::Unbox(in);
+ const auto iso_calendar = get_iso_calendar<Duration>(in_val);
+
+ std::vector<std::shared_ptr<Scalar>> values = {
+ std::make_shared<Int64Scalar>(iso_calendar[0]),
+ std::make_shared<Int64Scalar>(iso_calendar[1]),
+ std::make_shared<Int64Scalar>(iso_calendar[2])};
+ *checked_cast<StructScalar*>(out) = StructScalar(values, iso_calendar_type);
+ } else {
+ out->is_valid = false;
+ }
+ return Status::OK();
+ }
+
+ static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
+ using BuilderType = typename TypeTraits<Int64Type>::BuilderType;
+
+ RETURN_NOT_OK(TemporalComponentExtractCheckTimezone(in));
+ const std::shared_ptr<DataType> iso_calendar_type =
+ struct_({field("iso_year", int64()), field("iso_week", int64()),
+ field("iso_day_of_week", int64())});
+
+ std::unique_ptr<ArrayBuilder> array_builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), iso_calendar_type, &array_builder));
+ StructBuilder* struct_builder = checked_cast<StructBuilder*>(array_builder.get());
+ RETURN_NOT_OK(struct_builder->Reserve(in.length));
+
+ std::vector<BuilderType*> field_builders;
+ field_builders.reserve(3);
+ for (int i = 0; i < 3; i++) {
+ field_builders.push_back(
+ checked_cast<BuilderType*>(struct_builder->field_builder(i)));
+ RETURN_NOT_OK(field_builders[i]->Reserve(1));
+ }
+ auto visit_null = [&]() { return struct_builder->AppendNull(); };
+ auto visit_value = [&](int64_t arg) {
+ const auto iso_calendar = get_iso_calendar<Duration>(arg);
+ field_builders[0]->UnsafeAppend(iso_calendar[0]);
+ field_builders[1]->UnsafeAppend(iso_calendar[1]);
+ field_builders[2]->UnsafeAppend(iso_calendar[2]);
+ return struct_builder->Append();
+ };
+ RETURN_NOT_OK(VisitArrayDataInline<Int64Type>(in, visit_value, visit_null));
+
+ std::shared_ptr<Array> out_array;
+ RETURN_NOT_OK(struct_builder->Finish(&out_array));
+ *out = *std::move(out_array->data());
+
+ return Status::OK();
+ }
+};
+
+template <template <typename...> class Op, typename OutType>
+std::shared_ptr<ScalarFunction> MakeTemporal(std::string name, const FunctionDoc* doc) {
+ const auto& out_type = TypeTraits<OutType>::type_singleton();
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ for (auto unit : internal::AllTimeUnits()) {
+ InputType in_type{match::TimestampTypeUnit(unit)};
+ switch (unit) {
+ case TimeUnit::SECOND: {
+ auto exec = TemporalComponentExtract<Op<std::chrono::seconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::MILLI: {
+ auto exec =
+ TemporalComponentExtract<Op<std::chrono::milliseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::MICRO: {
+ auto exec =
+ TemporalComponentExtract<Op<std::chrono::microseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::NANO: {
+ auto exec = TemporalComponentExtract<Op<std::chrono::nanoseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ }
+ }
+ return func;
+}
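+
+// In effect, a call such as MakeTemporal<Year, Int64Type>("year", &year_doc)
+// registers one function with four kernels, one per time unit: for instance,
+// timestamp(s) -> int64 backed by Year<std::chrono::seconds> and
+// timestamp(ms) -> int64 backed by Year<std::chrono::milliseconds>.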
+
+template <template <typename...> class Op, typename OutType>
+std::shared_ptr<ScalarFunction> MakeTemporalWithOptions(
+ std::string name, const FunctionDoc* doc, const DayOfWeekOptions& default_options,
+ KernelInit init) {
+ const auto& out_type = TypeTraits<OutType>::type_singleton();
+ auto func =
+ std::make_shared<ScalarFunction>(name, Arity::Unary(), doc, &default_options);
+
+ for (auto unit : internal::AllTimeUnits()) {
+ InputType in_type{match::TimestampTypeUnit(unit)};
+ switch (unit) {
+ case TimeUnit::SECOND: {
+ auto exec = DayOfWeekExec<Op<std::chrono::seconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
+ break;
+ }
+ case TimeUnit::MILLI: {
+ auto exec = DayOfWeekExec<Op<std::chrono::milliseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
+ break;
+ }
+ case TimeUnit::MICRO: {
+ auto exec = DayOfWeekExec<Op<std::chrono::microseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
+ break;
+ }
+ case TimeUnit::NANO: {
+ auto exec = DayOfWeekExec<Op<std::chrono::nanoseconds>, OutType>::Exec;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec), init));
+ break;
+ }
+ }
+ }
+ return func;
+}
+
+template <template <typename...> class Op>
+std::shared_ptr<ScalarFunction> MakeStructTemporal(std::string name,
+ const FunctionDoc* doc) {
+ const auto& out_type = struct_({field("iso_year", int64()), field("iso_week", int64()),
+ field("iso_day_of_week", int64())});
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ for (auto unit : internal::AllTimeUnits()) {
+ InputType in_type{match::TimestampTypeUnit(unit)};
+ switch (unit) {
+ case TimeUnit::SECOND: {
+ auto exec = SimpleUnary<Op<std::chrono::seconds>>;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::MILLI: {
+ auto exec = SimpleUnary<Op<std::chrono::milliseconds>>;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::MICRO: {
+ auto exec = SimpleUnary<Op<std::chrono::microseconds>>;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ case TimeUnit::NANO: {
+ auto exec = SimpleUnary<Op<std::chrono::nanoseconds>>;
+ DCHECK_OK(func->AddKernel({in_type}, out_type, std::move(exec)));
+ break;
+ }
+ }
+ }
+ return func;
+}
+
+const FunctionDoc year_doc{
+ "Extract year from timestamp",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc month_doc{
+ "Extract month number",
+ ("Month is encoded as January=1, December=12.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc day_doc{
+ "Extract day number",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc day_of_week_doc{
+ "Extract day of the week number",
+ ("By default, the week starts on Monday represented by 0 and ends on Sunday "
+ "represented by 6.\n"
+ "DayOfWeekOptions.week_start can be used to set another starting day using the ISO "
+ "convention (Monday=1, Sunday=7). Day numbering can start with 0 or 1 using "
+ "the DayOfWeekOptions.one_based_numbering parameter.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"},
+ "DayOfWeekOptions"};
+
+const FunctionDoc day_of_year_doc{
+ "Extract day of year number",
+ ("January 1st maps to day number 1, February 1st to 32, etc.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc iso_year_doc{
+ "Extract ISO year number",
+ ("First week of an ISO year has the majority (4 or more) of its days in January.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc iso_week_doc{
+ "Extract ISO week of year number",
+ ("First ISO week has the majority (4 or more) of its days in January.\n"
+ "Week of the year starts with 1 and can run up to 53.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc iso_calendar_doc{
+ "Extract (ISO year, ISO week, ISO day of week) struct",
+ ("ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc quarter_doc{
+ "Extract quarter of year number",
+ ("The first quarter maps to 1 and the fourth quarter maps to 4.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc hour_doc{
+ "Extract hour value",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc minute_doc{
+ "Extract minute values",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc second_doc{
+ "Extract second values",
+ "Returns an error if timestamp has a defined timezone. Null values return null.",
+ {"values"}};
+
+const FunctionDoc millisecond_doc{
+ "Extract millisecond values",
+ ("Millisecond returns the number of milliseconds since the last full second.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc microsecond_doc{
+ "Extract microsecond values",
+ ("Microsecond returns the number of microseconds since the last full millisecond.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc nanosecond_doc{
+ "Extract nanosecond values",
+ ("Nanosecond returns the number of nanoseconds since the last full microsecond.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+const FunctionDoc subsecond_doc{
+ "Extract subsecond values",
+ ("Subsecond returns the fraction of a second since the last full second.\n"
+ "Returns an error if timestamp has a defined timezone. Null values return null."),
+ {"values"}};
+
+} // namespace
+
+void RegisterScalarTemporal(FunctionRegistry* registry) {
+ auto year = MakeTemporal<Year, Int64Type>("year", &year_doc);
+ DCHECK_OK(registry->AddFunction(std::move(year)));
+
+ auto month = MakeTemporal<Month, Int64Type>("month", &month_doc);
+ DCHECK_OK(registry->AddFunction(std::move(month)));
+
+ auto day = MakeTemporal<Day, Int64Type>("day", &day_doc);
+ DCHECK_OK(registry->AddFunction(std::move(day)));
+
+ static auto default_day_of_week_options = DayOfWeekOptions::Defaults();
+ auto day_of_week = MakeTemporalWithOptions<DayOfWeek, Int64Type>(
+ "day_of_week", &day_of_week_doc, default_day_of_week_options, DayOfWeekState::Init);
+ DCHECK_OK(registry->AddFunction(std::move(day_of_week)));
+
+ auto day_of_year = MakeTemporal<DayOfYear, Int64Type>("day_of_year", &day_of_year_doc);
+ DCHECK_OK(registry->AddFunction(std::move(day_of_year)));
+
+ auto iso_year = MakeTemporal<ISOYear, Int64Type>("iso_year", &iso_year_doc);
+ DCHECK_OK(registry->AddFunction(std::move(iso_year)));
+
+ auto iso_week = MakeTemporal<ISOWeek, Int64Type>("iso_week", &iso_week_doc);
+ DCHECK_OK(registry->AddFunction(std::move(iso_week)));
+
+ auto iso_calendar = MakeStructTemporal<ISOCalendar>("iso_calendar", &iso_calendar_doc);
+ DCHECK_OK(registry->AddFunction(std::move(iso_calendar)));
+
+ auto quarter = MakeTemporal<Quarter, Int64Type>("quarter", &quarter_doc);
+ DCHECK_OK(registry->AddFunction(std::move(quarter)));
+
+ auto hour = MakeTemporal<Hour, Int64Type>("hour", &hour_doc);
+ DCHECK_OK(registry->AddFunction(std::move(hour)));
+
+ auto minute = MakeTemporal<Minute, Int64Type>("minute", &minute_doc);
+ DCHECK_OK(registry->AddFunction(std::move(minute)));
+
+ auto second = MakeTemporal<Second, Int64Type>("second", &second_doc);
+ DCHECK_OK(registry->AddFunction(std::move(second)));
+
+ auto millisecond =
+ MakeTemporal<Millisecond, Int64Type>("millisecond", &millisecond_doc);
+ DCHECK_OK(registry->AddFunction(std::move(millisecond)));
+
+ auto microsecond =
+ MakeTemporal<Microsecond, Int64Type>("microsecond", &microsecond_doc);
+ DCHECK_OK(registry->AddFunction(std::move(microsecond)));
+
+ auto nanosecond = MakeTemporal<Nanosecond, Int64Type>("nanosecond", &nanosecond_doc);
+ DCHECK_OK(registry->AddFunction(std::move(nanosecond)));
+
+ auto subsecond = MakeTemporal<Subsecond, DoubleType>("subsecond", &subsecond_doc);
+ DCHECK_OK(registry->AddFunction(std::move(subsecond)));
+}
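+
+// A hedged usage sketch (via the generic compute entry point, which is not
+// part of this file): once registered, the kernels are reachable as
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       Datum out, arrow::compute::CallFunction("iso_calendar", {timestamps}));
+//
+// where `timestamps` is a Datum wrapping a timezone-less timestamp array.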
+
+} // namespace internal
+} // namespace compute
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc
index dc63edab12c..ead88abc0f2 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/scalar_validity.cc
@@ -15,8 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-#include <cmath>
-
+#include <cmath>
+
#include "arrow/compute/kernels/common.h"
#include "arrow/util/bit_util.h"
@@ -32,12 +32,12 @@ namespace internal {
namespace {
struct IsValidOperator {
- static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+ static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
checked_cast<BooleanScalar*>(out)->value = in.is_valid;
- return Status::OK();
+ return Status::OK();
}
- static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
+ static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
DCHECK_EQ(out->offset, 0);
DCHECK_LE(out->length, arr.length);
if (arr.MayHaveNulls()) {
@@ -49,64 +49,64 @@ struct IsValidOperator {
arr.offset == 0 ? arr.buffers[0]
: SliceBuffer(arr.buffers[0], arr.offset / 8,
BitUtil::BytesForBits(out->length + out->offset));
- return Status::OK();
+ return Status::OK();
}
// Input has no nulls => output is entirely true.
- ARROW_ASSIGN_OR_RAISE(out->buffers[1],
- ctx->AllocateBitmap(out->length + out->offset));
+ ARROW_ASSIGN_OR_RAISE(out->buffers[1],
+ ctx->AllocateBitmap(out->length + out->offset));
BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length, true);
- return Status::OK();
+ return Status::OK();
+ }
+};
+
+struct IsFiniteOperator {
+ template <typename OutType, typename InType>
+ static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
+ return std::isfinite(value);
+ }
+};
+
+struct IsInfOperator {
+ template <typename OutType, typename InType>
+ static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
+ return std::isinf(value);
}
};
-struct IsFiniteOperator {
- template <typename OutType, typename InType>
- static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
- return std::isfinite(value);
- }
-};
-
-struct IsInfOperator {
- template <typename OutType, typename InType>
- static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
- return std::isinf(value);
- }
-};
-
struct IsNullOperator {
- static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+ static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
checked_cast<BooleanScalar*>(out)->value = !in.is_valid;
- return Status::OK();
+ return Status::OK();
}
- static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
+ static Status Call(KernelContext* ctx, const ArrayData& arr, ArrayData* out) {
if (arr.MayHaveNulls()) {
// Input has nulls => output is the inverted null (validity) bitmap.
InvertBitmap(arr.buffers[0]->data(), arr.offset, arr.length,
out->buffers[1]->mutable_data(), out->offset);
- } else {
- // Input has no nulls => output is entirely false.
- BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length,
- false);
+ } else {
+ // Input has no nulls => output is entirely false.
+ BitUtil::SetBitsTo(out->buffers[1]->mutable_data(), out->offset, out->length,
+ false);
}
- return Status::OK();
- }
-};
-
-struct IsNanOperator {
- template <typename OutType, typename InType>
- static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
- return std::isnan(value);
+ return Status::OK();
}
};
-void MakeFunction(std::string name, const FunctionDoc* doc,
- std::vector<InputType> in_types, OutputType out_type,
+struct IsNanOperator {
+ template <typename OutType, typename InType>
+ static constexpr OutType Call(KernelContext*, const InType& value, Status*) {
+ return std::isnan(value);
+ }
+};
+
+void MakeFunction(std::string name, const FunctionDoc* doc,
+ std::vector<InputType> in_types, OutputType out_type,
ArrayKernelExec exec, FunctionRegistry* registry,
MemAllocation::type mem_allocation, bool can_write_into_slices) {
Arity arity{static_cast<int>(in_types.size())};
- auto func = std::make_shared<ScalarFunction>(name, arity, doc);
+ auto func = std::make_shared<ScalarFunction>(name, arity, doc);
ScalarKernel kernel(std::move(in_types), out_type, exec);
kernel.null_handling = NullHandling::OUTPUT_NOT_NULL;
@@ -117,112 +117,112 @@ void MakeFunction(std::string name, const FunctionDoc* doc,
DCHECK_OK(registry->AddFunction(std::move(func)));
}
-template <typename InType, typename Op>
-void AddFloatValidityKernel(const std::shared_ptr<DataType>& ty, ScalarFunction* func) {
- DCHECK_OK(func->AddKernel({ty}, boolean(),
- applicator::ScalarUnary<BooleanType, InType, Op>::Exec));
-}
-
-std::shared_ptr<ScalarFunction> MakeIsFiniteFunction(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
-
- AddFloatValidityKernel<FloatType, IsFiniteOperator>(float32(), func.get());
- AddFloatValidityKernel<DoubleType, IsFiniteOperator>(float64(), func.get());
-
- return func;
-}
-
-std::shared_ptr<ScalarFunction> MakeIsInfFunction(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
-
- AddFloatValidityKernel<FloatType, IsInfOperator>(float32(), func.get());
- AddFloatValidityKernel<DoubleType, IsInfOperator>(float64(), func.get());
-
- return func;
-}
-
-std::shared_ptr<ScalarFunction> MakeIsNanFunction(std::string name,
- const FunctionDoc* doc) {
- auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
-
- AddFloatValidityKernel<FloatType, IsNanOperator>(float32(), func.get());
- AddFloatValidityKernel<DoubleType, IsNanOperator>(float64(), func.get());
-
- return func;
-}
-
-Status IsValidExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+template <typename InType, typename Op>
+void AddFloatValidityKernel(const std::shared_ptr<DataType>& ty, ScalarFunction* func) {
+ DCHECK_OK(func->AddKernel({ty}, boolean(),
+ applicator::ScalarUnary<BooleanType, InType, Op>::Exec));
+}
+
+std::shared_ptr<ScalarFunction> MakeIsFiniteFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ AddFloatValidityKernel<FloatType, IsFiniteOperator>(float32(), func.get());
+ AddFloatValidityKernel<DoubleType, IsFiniteOperator>(float64(), func.get());
+
+ return func;
+}
+
+std::shared_ptr<ScalarFunction> MakeIsInfFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ AddFloatValidityKernel<FloatType, IsInfOperator>(float32(), func.get());
+ AddFloatValidityKernel<DoubleType, IsInfOperator>(float64(), func.get());
+
+ return func;
+}
+
+std::shared_ptr<ScalarFunction> MakeIsNanFunction(std::string name,
+ const FunctionDoc* doc) {
+ auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
+
+ AddFloatValidityKernel<FloatType, IsNanOperator>(float32(), func.get());
+ AddFloatValidityKernel<DoubleType, IsNanOperator>(float64(), func.get());
+
+ return func;
+}
+
+Status IsValidExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const Datum& arg0 = batch[0];
if (arg0.type()->id() == Type::NA) {
auto false_value = std::make_shared<BooleanScalar>(false);
if (arg0.kind() == Datum::SCALAR) {
- out->value = false_value;
+ out->value = false_value;
} else {
std::shared_ptr<Array> false_values;
- RETURN_NOT_OK(MakeArrayFromScalar(*false_value, out->length(), ctx->memory_pool())
- .Value(&false_values));
+ RETURN_NOT_OK(MakeArrayFromScalar(*false_value, out->length(), ctx->memory_pool())
+ .Value(&false_values));
out->value = false_values->data();
}
- return Status::OK();
+ return Status::OK();
} else {
- return applicator::SimpleUnary<IsValidOperator>(ctx, batch, out);
+ return applicator::SimpleUnary<IsValidOperator>(ctx, batch, out);
}
}
-Status IsNullExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status IsNullExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const Datum& arg0 = batch[0];
if (arg0.type()->id() == Type::NA) {
if (arg0.kind() == Datum::SCALAR) {
- out->value = std::make_shared<BooleanScalar>(true);
+ out->value = std::make_shared<BooleanScalar>(true);
} else {
// Data is preallocated
ArrayData* out_arr = out->mutable_array();
BitUtil::SetBitsTo(out_arr->buffers[1]->mutable_data(), out_arr->offset,
out_arr->length, true);
}
- return Status::OK();
+ return Status::OK();
} else {
- return applicator::SimpleUnary<IsNullOperator>(ctx, batch, out);
+ return applicator::SimpleUnary<IsNullOperator>(ctx, batch, out);
}
}
-const FunctionDoc is_valid_doc(
- "Return true if non-null",
- ("For each input value, emit true iff the value is valid (non-null)."), {"values"});
-
-const FunctionDoc is_finite_doc(
- "Return true if value is finite",
- ("For each input value, emit true iff the value is finite (not NaN, inf, or -inf)."),
- {"values"});
-
-const FunctionDoc is_inf_doc(
- "Return true if infinity",
- ("For each input value, emit true iff the value is infinite (inf or -inf)."),
- {"values"});
-
-const FunctionDoc is_null_doc("Return true if null",
- ("For each input value, emit true iff the value is null."),
- {"values"});
-
-const FunctionDoc is_nan_doc("Return true if NaN",
- ("For each input value, emit true iff the value is NaN."),
- {"values"});
-
+const FunctionDoc is_valid_doc(
+ "Return true if non-null",
+ ("For each input value, emit true iff the value is valid (non-null)."), {"values"});
+
+const FunctionDoc is_finite_doc(
+ "Return true if value is finite",
+ ("For each input value, emit true iff the value is finite (not NaN, inf, or -inf)."),
+ {"values"});
+
+const FunctionDoc is_inf_doc(
+ "Return true if infinity",
+ ("For each input value, emit true iff the value is infinite (inf or -inf)."),
+ {"values"});
+
+const FunctionDoc is_null_doc("Return true if null",
+ ("For each input value, emit true iff the value is null."),
+ {"values"});
+
+const FunctionDoc is_nan_doc("Return true if NaN",
+ ("For each input value, emit true iff the value is NaN."),
+ {"values"});
+
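+// A hedged usage sketch (assuming the standard compute API): once registered,
+// these functions are reachable as, e.g.,
+//   arrow::compute::CallFunction("is_nan", {float64_array})
+// which returns a boolean Datum of the same length as the input.
+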
} // namespace
void RegisterScalarValidity(FunctionRegistry* registry) {
- MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec,
- registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false);
+ MakeFunction("is_valid", &is_valid_doc, {ValueDescr::ANY}, boolean(), IsValidExec,
+ registry, MemAllocation::NO_PREALLOCATE, /*can_write_into_slices=*/false);
- MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec,
- registry, MemAllocation::PREALLOCATE,
+ MakeFunction("is_null", &is_null_doc, {ValueDescr::ANY}, boolean(), IsNullExec,
+ registry, MemAllocation::PREALLOCATE,
/*can_write_into_slices=*/true);
-
- DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc)));
- DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc)));
- DCHECK_OK(registry->AddFunction(MakeIsNanFunction("is_nan", &is_nan_doc)));
+
+ DCHECK_OK(registry->AddFunction(MakeIsFiniteFunction("is_finite", &is_finite_doc)));
+ DCHECK_OK(registry->AddFunction(MakeIsInfFunction("is_inf", &is_inf_doc)));
+ DCHECK_OK(registry->AddFunction(MakeIsNanFunction("is_nan", &is_nan_doc)));
}
} // namespace internal
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc
index 0ef0ea6c753..846fa26baf2 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.cc
@@ -53,30 +53,30 @@ PrimitiveArg GetPrimitiveArg(const ArrayData& arr) {
arg.data += arr.offset * arg.bit_width / 8;
}
// This may be kUnknownNullCount
- arg.null_count = (arg.is_valid != nullptr) ? arr.null_count.load() : 0;
+ arg.null_count = (arg.is_valid != nullptr) ? arr.null_count.load() : 0;
return arg;
}
-ArrayKernelExec TrivialScalarUnaryAsArraysExec(ArrayKernelExec exec,
- NullHandling::type null_handling) {
- return [=](KernelContext* ctx, const ExecBatch& batch, Datum* out) -> Status {
- if (out->is_array()) {
- return exec(ctx, batch, out);
- }
-
- if (null_handling == NullHandling::INTERSECTION && !batch[0].scalar()->is_valid) {
- out->scalar()->is_valid = false;
- return Status::OK();
- }
-
- ARROW_ASSIGN_OR_RAISE(Datum array_in, MakeArrayFromScalar(*batch[0].scalar(), 1));
- ARROW_ASSIGN_OR_RAISE(Datum array_out, MakeArrayFromScalar(*out->scalar(), 1));
- RETURN_NOT_OK(exec(ctx, ExecBatch{{std::move(array_in)}, 1}, &array_out));
- ARROW_ASSIGN_OR_RAISE(*out, array_out.make_array()->GetScalar(0));
- return Status::OK();
- };
-}
-
+ArrayKernelExec TrivialScalarUnaryAsArraysExec(ArrayKernelExec exec,
+ NullHandling::type null_handling) {
+ return [=](KernelContext* ctx, const ExecBatch& batch, Datum* out) -> Status {
+ if (out->is_array()) {
+ return exec(ctx, batch, out);
+ }
+
+ if (null_handling == NullHandling::INTERSECTION && !batch[0].scalar()->is_valid) {
+ out->scalar()->is_valid = false;
+ return Status::OK();
+ }
+
+ ARROW_ASSIGN_OR_RAISE(Datum array_in, MakeArrayFromScalar(*batch[0].scalar(), 1));
+ ARROW_ASSIGN_OR_RAISE(Datum array_out, MakeArrayFromScalar(*out->scalar(), 1));
+ RETURN_NOT_OK(exec(ctx, ExecBatch{{std::move(array_in)}, 1}, &array_out));
+ ARROW_ASSIGN_OR_RAISE(*out, array_out.make_array()->GetScalar(0));
+ return Status::OK();
+ };
+}
+
} // namespace internal
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h
index 8ce321f6b4f..394e08da581 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/util_internal.h
@@ -18,30 +18,30 @@
#pragma once
#include <cstdint>
-#include <utility>
+#include <utility>
-#include "arrow/array/util.h"
+#include "arrow/array/util.h"
#include "arrow/buffer.h"
-#include "arrow/compute/kernels/codegen_internal.h"
-#include "arrow/compute/type_fwd.h"
-#include "arrow/util/bit_run_reader.h"
+#include "arrow/compute/kernels/codegen_internal.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/util/bit_run_reader.h"
namespace arrow {
namespace compute {
namespace internal {
-// Used in some kernels and testing - not provided by default in MSVC
-// and _USE_MATH_DEFINES is not reliable with unity builds
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-#ifndef M_PI_2
-#define M_PI_2 1.57079632679489661923
-#endif
-#ifndef M_PI_4
-#define M_PI_4 0.785398163397448309616
-#endif
-
+// Used in some kernels and testing - not provided by default in MSVC
+// and _USE_MATH_DEFINES is not reliable with unity builds
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+#ifndef M_PI_2
+#define M_PI_2 1.57079632679489661923
+#endif
+#ifndef M_PI_4
+#define M_PI_4 0.785398163397448309616
+#endif
+
// An internal data structure for unpacking a primitive argument to pass to a
// kernel implementation
struct PrimitiveArg {
@@ -67,100 +67,100 @@ int GetBitWidth(const DataType& type);
// rather than duplicating compiled code to do all these in each kernel.
PrimitiveArg GetPrimitiveArg(const ArrayData& arr);
-// Augment a unary ArrayKernelExec which supports only array-like inputs with support for
-// scalar inputs. Scalars will be transformed to 1-long arrays with the scalar's value (or
-// null if the scalar is null) as its only element. This 1-long array will be passed to
-// the original exec, then the only element of the resulting array will be extracted as
-// the output scalar. This could be far more efficient, but instead of optimizing this
-// it'd be better to support scalar inputs "upstream" in original exec.
-ArrayKernelExec TrivialScalarUnaryAsArraysExec(
- ArrayKernelExec exec, NullHandling::type null_handling = NullHandling::INTERSECTION);
-
-// Return (min, max) of a numerical array, ignore nulls.
-// For empty array, return the maximal number limit as 'min', and minimal limit as 'max'.
-template <typename T>
-ARROW_NOINLINE std::pair<T, T> GetMinMax(const ArrayData& data) {
- T min = std::numeric_limits<T>::max();
- T max = std::numeric_limits<T>::lowest();
-
- const T* values = data.GetValues<T>(1);
- arrow::internal::VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
- [&](int64_t pos, int64_t len) {
- for (int64_t i = 0; i < len; ++i) {
- min = std::min(min, values[pos + i]);
- max = std::max(max, values[pos + i]);
- }
- });
-
- return std::make_pair(min, max);
-}
-
-template <typename T>
-std::pair<T, T> GetMinMax(const Datum& datum) {
- T min = std::numeric_limits<T>::max();
- T max = std::numeric_limits<T>::lowest();
-
- for (const auto& array : datum.chunks()) {
- T local_min, local_max;
- std::tie(local_min, local_max) = GetMinMax<T>(*array->data());
- min = std::min(min, local_min);
- max = std::max(max, local_max);
- }
-
- return std::make_pair(min, max);
-}
-
-// Count value occurrences of an array, ignore nulls.
-// 'counts' must be zeroed and with enough size.
-template <typename T>
-ARROW_NOINLINE int64_t CountValues(uint64_t* counts, const ArrayData& data, T min) {
- const int64_t n = data.length - data.GetNullCount();
- if (n > 0) {
- const T* values = data.GetValues<T>(1);
- arrow::internal::VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
- [&](int64_t pos, int64_t len) {
- for (int64_t i = 0; i < len; ++i) {
- ++counts[values[pos + i] - min];
- }
- });
- }
- return n;
-}
-
-template <typename T>
-int64_t CountValues(uint64_t* counts, const Datum& datum, T min) {
- int64_t n = 0;
- for (const auto& array : datum.chunks()) {
- n += CountValues<T>(counts, *array->data(), min);
- }
- return n;
-}
-
-// Copy numerical array values to a buffer, ignore nulls.
-template <typename T>
-ARROW_NOINLINE int64_t CopyNonNullValues(const ArrayData& data, T* out) {
- const int64_t n = data.length - data.GetNullCount();
- if (n > 0) {
- int64_t index = 0;
- const T* values = data.GetValues<T>(1);
- arrow::internal::VisitSetBitRunsVoid(
- data.buffers[0], data.offset, data.length, [&](int64_t pos, int64_t len) {
- memcpy(out + index, values + pos, len * sizeof(T));
- index += len;
- });
- }
- return n;
-}
-
-template <typename T>
-int64_t CopyNonNullValues(const Datum& datum, T* out) {
- int64_t n = 0;
- for (const auto& array : datum.chunks()) {
- n += CopyNonNullValues(*array->data(), out + n);
- }
- return n;
-}
-
+// Augment a unary ArrayKernelExec which supports only array-like inputs with support for
+// scalar inputs. A scalar is transformed into a 1-element array holding the scalar's
+// value (or null if the scalar is null). That array is passed to the original exec, and
+// the single element of the resulting array is extracted as the output scalar. This
+// could be far more efficient, but rather than optimizing this wrapper, it would be
+// better to support scalar inputs "upstream" in the original exec.
+ArrayKernelExec TrivialScalarUnaryAsArraysExec(
+ ArrayKernelExec exec, NullHandling::type null_handling = NullHandling::INTERSECTION);
+
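+// A sketch of the intended use (names as in this header): given an exec that
+// only handles array inputs,
+//   ArrayKernelExec wrapped = TrivialScalarUnaryAsArraysExec(array_only_exec);
+// the wrapped exec additionally accepts scalar inputs by round-tripping them
+// through 1-element arrays.
+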
+// Return (min, max) of a numerical array, ignoring nulls.
+// For an empty array, return the maximum numeric limit as 'min' and the minimum limit
+// as 'max'.
+template <typename T>
+ARROW_NOINLINE std::pair<T, T> GetMinMax(const ArrayData& data) {
+ T min = std::numeric_limits<T>::max();
+ T max = std::numeric_limits<T>::lowest();
+
+ const T* values = data.GetValues<T>(1);
+ arrow::internal::VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ min = std::min(min, values[pos + i]);
+ max = std::max(max, values[pos + i]);
+ }
+ });
+
+ return std::make_pair(min, max);
+}
+
+template <typename T>
+std::pair<T, T> GetMinMax(const Datum& datum) {
+ T min = std::numeric_limits<T>::max();
+ T max = std::numeric_limits<T>::lowest();
+
+ for (const auto& array : datum.chunks()) {
+ T local_min, local_max;
+ std::tie(local_min, local_max) = GetMinMax<T>(*array->data());
+ min = std::min(min, local_min);
+ max = std::max(max, local_max);
+ }
+
+ return std::make_pair(min, max);
+}
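+
+// For example (illustrative only): for an int32 array holding {3, null, 7},
+// GetMinMax<int32_t>(*array->data()) returns {3, 7}; for an empty or all-null
+// array it returns {INT32_MAX, INT32_MIN}.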
+
+// Count value occurrences in an array, ignoring nulls.
+// 'counts' must be zeroed and large enough to hold all observed values.
+template <typename T>
+ARROW_NOINLINE int64_t CountValues(uint64_t* counts, const ArrayData& data, T min) {
+ const int64_t n = data.length - data.GetNullCount();
+ if (n > 0) {
+ const T* values = data.GetValues<T>(1);
+ arrow::internal::VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+ [&](int64_t pos, int64_t len) {
+ for (int64_t i = 0; i < len; ++i) {
+ ++counts[values[pos + i] - min];
+ }
+ });
+ }
+ return n;
+}
+
+template <typename T>
+int64_t CountValues(uint64_t* counts, const Datum& datum, T min) {
+ int64_t n = 0;
+ for (const auto& array : datum.chunks()) {
+ n += CountValues<T>(counts, *array->data(), min);
+ }
+ return n;
+}
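+
+// For example (illustrative only): for non-null values {2, 3, 3, 5} with
+// min=2 and a zeroed `counts` buffer of size 4, CountValues leaves counts at
+// {1, 2, 0, 1} and returns 4 (the number of non-null values).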
+
+// Copy numerical array values to a buffer, ignoring nulls.
+template <typename T>
+ARROW_NOINLINE int64_t CopyNonNullValues(const ArrayData& data, T* out) {
+ const int64_t n = data.length - data.GetNullCount();
+ if (n > 0) {
+ int64_t index = 0;
+ const T* values = data.GetValues<T>(1);
+ arrow::internal::VisitSetBitRunsVoid(
+ data.buffers[0], data.offset, data.length, [&](int64_t pos, int64_t len) {
+ memcpy(out + index, values + pos, len * sizeof(T));
+ index += len;
+ });
+ }
+ return n;
+}
+
+template <typename T>
+int64_t CopyNonNullValues(const Datum& datum, T* out) {
+ int64_t n = 0;
+ for (const auto& array : datum.chunks()) {
+ n += CopyNonNullValues(*array->data(), out + n);
+ }
+ return n;
+}
+
} // namespace internal
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc
index 224916f5980..a68e78130f2 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_hash.cc
@@ -22,7 +22,7 @@
#include "arrow/array/array_dict.h"
#include "arrow/array/array_nested.h"
#include "arrow/array/builder_primitive.h"
-#include "arrow/array/concatenate.h"
+#include "arrow/array/concatenate.h"
#include "arrow/array/dict_internal.h"
#include "arrow/array/util.h"
#include "arrow/compute/api_vector.h"
@@ -60,10 +60,10 @@ class UniqueAction final : public ActionBase {
static constexpr bool with_error_status = false;
- UniqueAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
- MemoryPool* pool)
- : ActionBase(type, pool) {}
-
+ UniqueAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
+ : ActionBase(type, pool) {}
+
Status Reset() { return Status::OK(); }
Status Reserve(const int64_t length) { return Status::OK(); }
@@ -80,8 +80,8 @@ class UniqueAction final : public ActionBase {
template <class Index>
void ObserveNotFound(Index index) {}
- bool ShouldEncodeNulls() { return true; }
-
+ bool ShouldEncodeNulls() { return true; }
+
Status Flush(Datum* out) { return Status::OK(); }
Status FlushFinal(Datum* out) { return Status::OK(); }
@@ -96,8 +96,8 @@ class ValueCountsAction final : ActionBase {
static constexpr bool with_error_status = true;
- ValueCountsAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
- MemoryPool* pool)
+ ValueCountsAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
: ActionBase(type, pool), count_builder_(pool) {}
Status Reserve(const int64_t length) {
@@ -153,8 +153,8 @@ class ValueCountsAction final : ActionBase {
}
}
- bool ShouldEncodeNulls() const { return true; }
-
+ bool ShouldEncodeNulls() const { return true; }
+
private:
Int64Builder count_builder_;
};
@@ -168,13 +168,13 @@ class DictEncodeAction final : public ActionBase {
static constexpr bool with_error_status = false;
- DictEncodeAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
- MemoryPool* pool)
- : ActionBase(type, pool), indices_builder_(pool) {
- if (auto options_ptr = static_cast<const DictionaryEncodeOptions*>(options)) {
- encode_options_ = *options_ptr;
- }
- }
+ DictEncodeAction(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
+ : ActionBase(type, pool), indices_builder_(pool) {
+ if (auto options_ptr = static_cast<const DictionaryEncodeOptions*>(options)) {
+ encode_options_ = *options_ptr;
+ }
+ }
Status Reset() {
indices_builder_.Reset();
@@ -185,16 +185,16 @@ class DictEncodeAction final : public ActionBase {
template <class Index>
void ObserveNullFound(Index index) {
- if (encode_options_.null_encoding_behavior == DictionaryEncodeOptions::MASK) {
- indices_builder_.UnsafeAppendNull();
- } else {
- indices_builder_.UnsafeAppend(index);
- }
+ if (encode_options_.null_encoding_behavior == DictionaryEncodeOptions::MASK) {
+ indices_builder_.UnsafeAppendNull();
+ } else {
+ indices_builder_.UnsafeAppend(index);
+ }
}
template <class Index>
void ObserveNullNotFound(Index index) {
- ObserveNullFound(index);
+ ObserveNullFound(index);
}
template <class Index>
@@ -207,10 +207,10 @@ class DictEncodeAction final : public ActionBase {
ObserveFound(index);
}
- bool ShouldEncodeNulls() {
- return encode_options_.null_encoding_behavior == DictionaryEncodeOptions::ENCODE;
- }
-
+ bool ShouldEncodeNulls() {
+ return encode_options_.null_encoding_behavior == DictionaryEncodeOptions::ENCODE;
+ }
+
Status Flush(Datum* out) {
std::shared_ptr<ArrayData> result;
RETURN_NOT_OK(indices_builder_.FinishInternal(&result));
@@ -222,14 +222,14 @@ class DictEncodeAction final : public ActionBase {
private:
Int32Builder indices_builder_;
- DictionaryEncodeOptions encode_options_;
+ DictionaryEncodeOptions encode_options_;
};
class HashKernel : public KernelState {
public:
- HashKernel() : options_(nullptr) {}
- explicit HashKernel(const FunctionOptions* options) : options_(options) {}
-
+ HashKernel() : options_(nullptr) {}
+ explicit HashKernel(const FunctionOptions* options) : options_(options) {}
+
// Reset for another run.
virtual Status Reset() = 0;
@@ -253,7 +253,7 @@ class HashKernel : public KernelState {
virtual Status Append(const ArrayData& arr) = 0;
protected:
- const FunctionOptions* options_;
+ const FunctionOptions* options_;
std::mutex lock_;
};
@@ -262,12 +262,12 @@ class HashKernel : public KernelState {
// (NullType has a separate implementation)
template <typename Type, typename Scalar, typename Action,
- bool with_error_status = Action::with_error_status>
+ bool with_error_status = Action::with_error_status>
class RegularHashKernel : public HashKernel {
public:
- RegularHashKernel(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
- MemoryPool* pool)
- : HashKernel(options), pool_(pool), type_(type), action_(type, options, pool) {}
+ RegularHashKernel(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
+ : HashKernel(options), pool_(pool), type_(type), action_(type, options, pool) {}
Status Reset() override {
memo_table_.reset(new MemoTable(pool_, 0));
@@ -307,7 +307,7 @@ class RegularHashKernel : public HashKernel {
&unused_memo_index);
},
[this]() {
- if (action_.ShouldEncodeNulls()) {
+ if (action_.ShouldEncodeNulls()) {
auto on_found = [this](int32_t memo_index) {
action_.ObserveNullFound(memo_index);
};
@@ -343,13 +343,13 @@ class RegularHashKernel : public HashKernel {
[this]() {
// Null
Status s = Status::OK();
- auto on_found = [this](int32_t memo_index) {
- action_.ObserveNullFound(memo_index);
- };
- auto on_not_found = [this, &s](int32_t memo_index) {
- action_.ObserveNullNotFound(memo_index, &s);
- };
- if (action_.ShouldEncodeNulls()) {
+ auto on_found = [this](int32_t memo_index) {
+ action_.ObserveNullFound(memo_index);
+ };
+ auto on_not_found = [this, &s](int32_t memo_index) {
+ action_.ObserveNullNotFound(memo_index, &s);
+ };
+ if (action_.ShouldEncodeNulls()) {
memo_table_->GetOrInsertNull(std::move(on_found), std::move(on_not_found));
}
return s;
@@ -368,23 +368,23 @@ class RegularHashKernel : public HashKernel {
// ----------------------------------------------------------------------
// Hash kernel implementation for nulls
-template <typename Action, bool with_error_status = Action::with_error_status>
+template <typename Action, bool with_error_status = Action::with_error_status>
class NullHashKernel : public HashKernel {
public:
- NullHashKernel(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
- MemoryPool* pool)
- : pool_(pool), type_(type), action_(type, options, pool) {}
+ NullHashKernel(const std::shared_ptr<DataType>& type, const FunctionOptions* options,
+ MemoryPool* pool)
+ : pool_(pool), type_(type), action_(type, options, pool) {}
Status Reset() override { return action_.Reset(); }
- Status Append(const ArrayData& arr) override { return DoAppend(arr); }
-
- template <bool HasError = with_error_status>
- enable_if_t<!HasError, Status> DoAppend(const ArrayData& arr) {
+ Status Append(const ArrayData& arr) override { return DoAppend(arr); }
+
+ template <bool HasError = with_error_status>
+ enable_if_t<!HasError, Status> DoAppend(const ArrayData& arr) {
RETURN_NOT_OK(action_.Reserve(arr.length));
for (int64_t i = 0; i < arr.length; ++i) {
if (i == 0) {
- seen_null_ = true;
+ seen_null_ = true;
action_.ObserveNullNotFound(0);
} else {
action_.ObserveNullFound(0);
@@ -393,31 +393,31 @@ class NullHashKernel : public HashKernel {
return Status::OK();
}
- template <bool HasError = with_error_status>
- enable_if_t<HasError, Status> DoAppend(const ArrayData& arr) {
- Status s = Status::OK();
- RETURN_NOT_OK(action_.Reserve(arr.length));
- for (int64_t i = 0; i < arr.length; ++i) {
- if (seen_null_ == false && i == 0) {
- seen_null_ = true;
- action_.ObserveNullNotFound(0, &s);
- } else {
- action_.ObserveNullFound(0);
- }
- }
- return s;
- }
-
+ template <bool HasError = with_error_status>
+ enable_if_t<HasError, Status> DoAppend(const ArrayData& arr) {
+ Status s = Status::OK();
+ RETURN_NOT_OK(action_.Reserve(arr.length));
+ for (int64_t i = 0; i < arr.length; ++i) {
+ if (seen_null_ == false && i == 0) {
+ seen_null_ = true;
+ action_.ObserveNullNotFound(0, &s);
+ } else {
+ action_.ObserveNullFound(0);
+ }
+ }
+ return s;
+ }
+
Status Flush(Datum* out) override { return action_.Flush(out); }
Status FlushFinal(Datum* out) override { return action_.FlushFinal(out); }
Status GetDictionary(std::shared_ptr<ArrayData>* out) override {
- std::shared_ptr<NullArray> null_array;
- if (seen_null_) {
- null_array = std::make_shared<NullArray>(1);
- } else {
- null_array = std::make_shared<NullArray>(0);
- }
+ std::shared_ptr<NullArray> null_array;
+ if (seen_null_) {
+ null_array = std::make_shared<NullArray>(1);
+ } else {
+ null_array = std::make_shared<NullArray>(0);
+ }
*out = null_array->data();
return Status::OK();
}
@@ -427,7 +427,7 @@ class NullHashKernel : public HashKernel {
protected:
MemoryPool* pool_;
std::shared_ptr<DataType> type_;
- bool seen_null_ = false;
+ bool seen_null_ = false;
Action action_;
};
@@ -441,33 +441,33 @@ class DictionaryHashKernel : public HashKernel {
Status Reset() override { return indices_kernel_->Reset(); }
- Status Append(const ArrayData& arr) override {
+ Status Append(const ArrayData& arr) override {
if (!dictionary_) {
- dictionary_ = arr.dictionary;
- } else if (!MakeArray(dictionary_)->Equals(*MakeArray(arr.dictionary))) {
- // NOTE: This approach computes a new dictionary unification per chunk.
- // This is in effect O(n*k) where n is the total chunked array length and
- // k is the number of chunks (therefore O(n**2) if chunks have a fixed size).
- //
- // A better approach may be to run the kernel over each individual chunk,
- // and then hash-aggregate all results (for example sum-group-by for
- // the "value_counts" kernel).
- auto out_dict_type = dictionary_->type;
- std::shared_ptr<Buffer> transpose_map;
- std::shared_ptr<Array> out_dict;
- ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(out_dict_type));
-
- ARROW_CHECK_OK(unifier->Unify(*MakeArray(dictionary_)));
- ARROW_CHECK_OK(unifier->Unify(*MakeArray(arr.dictionary), &transpose_map));
- ARROW_CHECK_OK(unifier->GetResult(&out_dict_type, &out_dict));
-
- this->dictionary_ = out_dict->data();
- auto transpose = reinterpret_cast<const int32_t*>(transpose_map->data());
- auto in_dict_array = MakeArray(std::make_shared<ArrayData>(arr));
- ARROW_ASSIGN_OR_RAISE(
- auto tmp, arrow::internal::checked_cast<const DictionaryArray&>(*in_dict_array)
- .Transpose(arr.type, out_dict, transpose));
- return indices_kernel_->Append(*tmp->data());
+ dictionary_ = arr.dictionary;
+ } else if (!MakeArray(dictionary_)->Equals(*MakeArray(arr.dictionary))) {
+ // NOTE: This approach computes a new dictionary unification per chunk.
+ // This is in effect O(n*k) where n is the total chunked array length and
+ // k is the number of chunks (therefore O(n**2) if chunks have a fixed size).
+ //
+ // A better approach may be to run the kernel over each individual chunk,
+ // and then hash-aggregate all results (for example sum-group-by for
+ // the "value_counts" kernel).
+ auto out_dict_type = dictionary_->type;
+ std::shared_ptr<Buffer> transpose_map;
+ std::shared_ptr<Array> out_dict;
+ ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(out_dict_type));
+
+ ARROW_CHECK_OK(unifier->Unify(*MakeArray(dictionary_)));
+ ARROW_CHECK_OK(unifier->Unify(*MakeArray(arr.dictionary), &transpose_map));
+ ARROW_CHECK_OK(unifier->GetResult(&out_dict_type, &out_dict));
+
+ this->dictionary_ = out_dict->data();
+ auto transpose = reinterpret_cast<const int32_t*>(transpose_map->data());
+ auto in_dict_array = MakeArray(std::make_shared<ArrayData>(arr));
+ ARROW_ASSIGN_OR_RAISE(
+ auto tmp, arrow::internal::checked_cast<const DictionaryArray&>(*in_dict_array)
+ .Transpose(arr.type, out_dict, transpose));
+ return indices_kernel_->Append(*tmp->data());
}
return indices_kernel_->Append(arr);
@@ -513,19 +513,19 @@ struct HashKernelTraits<Type, Action, enable_if_has_string_view<Type>> {
};
template <typename Type, typename Action>
-Result<std::unique_ptr<HashKernel>> HashInitImpl(KernelContext* ctx,
- const KernelInitArgs& args) {
+Result<std::unique_ptr<HashKernel>> HashInitImpl(KernelContext* ctx,
+ const KernelInitArgs& args) {
using HashKernelType = typename HashKernelTraits<Type, Action>::HashKernel;
- auto result = ::arrow::internal::make_unique<HashKernelType>(
- args.inputs[0].type, args.options, ctx->memory_pool());
- RETURN_NOT_OK(result->Reset());
+ auto result = ::arrow::internal::make_unique<HashKernelType>(
+ args.inputs[0].type, args.options, ctx->memory_pool());
+ RETURN_NOT_OK(result->Reset());
return std::move(result);
}
template <typename Type, typename Action>
-Result<std::unique_ptr<KernelState>> HashInit(KernelContext* ctx,
- const KernelInitArgs& args) {
- return HashInitImpl<Type, Action>(ctx, args);
+Result<std::unique_ptr<KernelState>> HashInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
+ return HashInitImpl<Type, Action>(ctx, args);
}
template <typename Action>
@@ -564,8 +564,8 @@ KernelInit GetHashInit(Type::type type_id) {
case Type::LARGE_STRING:
return HashInit<LargeBinaryType, Action>;
case Type::FIXED_SIZE_BINARY:
- case Type::DECIMAL128:
- case Type::DECIMAL256:
+ case Type::DECIMAL128:
+ case Type::DECIMAL256:
return HashInit<FixedSizeBinaryType, Action>;
default:
DCHECK(false);
@@ -573,13 +573,13 @@ KernelInit GetHashInit(Type::type type_id) {
}
}
-using DictionaryEncodeState = OptionsWrapper<DictionaryEncodeOptions>;
-
+using DictionaryEncodeState = OptionsWrapper<DictionaryEncodeOptions>;
+
template <typename Action>
-Result<std::unique_ptr<KernelState>> DictionaryHashInit(KernelContext* ctx,
- const KernelInitArgs& args) {
+Result<std::unique_ptr<KernelState>> DictionaryHashInit(KernelContext* ctx,
+ const KernelInitArgs& args) {
const auto& dict_type = checked_cast<const DictionaryType&>(*args.inputs[0].type);
- Result<std::unique_ptr<HashKernel>> indices_hasher;
+ Result<std::unique_ptr<HashKernel>> indices_hasher;
switch (dict_type.index_type()->id()) {
case Type::INT8:
indices_hasher = HashInitImpl<UInt8Type, Action>(ctx, args);
@@ -597,37 +597,37 @@ Result<std::unique_ptr<KernelState>> DictionaryHashInit(KernelContext* ctx,
DCHECK(false) << "Unsupported dictionary index type";
break;
}
- RETURN_NOT_OK(indices_hasher);
- return ::arrow::internal::make_unique<DictionaryHashKernel>(
- std::move(indices_hasher.ValueOrDie()));
+ RETURN_NOT_OK(indices_hasher);
+ return ::arrow::internal::make_unique<DictionaryHashKernel>(
+ std::move(indices_hasher.ValueOrDie()));
}
-Status HashExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status HashExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
auto hash_impl = checked_cast<HashKernel*>(ctx->state());
- RETURN_NOT_OK(hash_impl->Append(ctx, *batch[0].array()));
- RETURN_NOT_OK(hash_impl->Flush(out));
- return Status::OK();
+ RETURN_NOT_OK(hash_impl->Append(ctx, *batch[0].array()));
+ RETURN_NOT_OK(hash_impl->Flush(out));
+ return Status::OK();
}
-Status UniqueFinalize(KernelContext* ctx, std::vector<Datum>* out) {
+Status UniqueFinalize(KernelContext* ctx, std::vector<Datum>* out) {
auto hash_impl = checked_cast<HashKernel*>(ctx->state());
std::shared_ptr<ArrayData> uniques;
- RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
+ RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
*out = {Datum(uniques)};
- return Status::OK();
+ return Status::OK();
}
-Status DictEncodeFinalize(KernelContext* ctx, std::vector<Datum>* out) {
+Status DictEncodeFinalize(KernelContext* ctx, std::vector<Datum>* out) {
auto hash_impl = checked_cast<HashKernel*>(ctx->state());
std::shared_ptr<ArrayData> uniques;
- RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
+ RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
auto dict_type = dictionary(int32(), uniques->type);
auto dict = MakeArray(uniques);
for (size_t i = 0; i < out->size(); ++i) {
(*out)[i] =
std::make_shared<DictionaryArray>(dict_type, (*out)[i].make_array(), dict);
}
- return Status::OK();
+ return Status::OK();
}
std::shared_ptr<ArrayData> BoxValueCounts(const std::shared_ptr<ArrayData>& uniques,
@@ -638,33 +638,33 @@ std::shared_ptr<ArrayData> BoxValueCounts(const std::shared_ptr<ArrayData>& uniq
return std::make_shared<StructArray>(data_type, uniques->length, children)->data();
}
-Status ValueCountsFinalize(KernelContext* ctx, std::vector<Datum>* out) {
+Status ValueCountsFinalize(KernelContext* ctx, std::vector<Datum>* out) {
auto hash_impl = checked_cast<HashKernel*>(ctx->state());
std::shared_ptr<ArrayData> uniques;
Datum value_counts;
- RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
- RETURN_NOT_OK(hash_impl->FlushFinal(&value_counts));
+ RETURN_NOT_OK(hash_impl->GetDictionary(&uniques));
+ RETURN_NOT_OK(hash_impl->FlushFinal(&value_counts));
*out = {Datum(BoxValueCounts(uniques, value_counts.array()))};
- return Status::OK();
+ return Status::OK();
}
-Status UniqueFinalizeDictionary(KernelContext* ctx, std::vector<Datum>* out) {
- RETURN_NOT_OK(UniqueFinalize(ctx, out));
+Status UniqueFinalizeDictionary(KernelContext* ctx, std::vector<Datum>* out) {
+ RETURN_NOT_OK(UniqueFinalize(ctx, out));
auto hash = checked_cast<DictionaryHashKernel*>(ctx->state());
(*out)[0].mutable_array()->dictionary = hash->dictionary();
- return Status::OK();
+ return Status::OK();
}
-Status ValueCountsFinalizeDictionary(KernelContext* ctx, std::vector<Datum>* out) {
+Status ValueCountsFinalizeDictionary(KernelContext* ctx, std::vector<Datum>* out) {
auto hash = checked_cast<DictionaryHashKernel*>(ctx->state());
std::shared_ptr<ArrayData> uniques;
Datum value_counts;
- RETURN_NOT_OK(hash->GetDictionary(&uniques));
- RETURN_NOT_OK(hash->FlushFinal(&value_counts));
+ RETURN_NOT_OK(hash->GetDictionary(&uniques));
+ RETURN_NOT_OK(hash->FlushFinal(&value_counts));
uniques->dictionary = hash->dictionary();
*out = {Datum(BoxValueCounts(uniques, value_counts.array()))};
- return Status::OK();
+ return Status::OK();
}
ValueDescr DictEncodeOutput(KernelContext*, const std::vector<ValueDescr>& descrs) {
@@ -693,31 +693,31 @@ void AddHashKernels(VectorFunction* func, VectorKernel base, OutputType out_ty)
DCHECK_OK(func->AddKernel(base));
}
- for (auto t : {Type::DECIMAL128, Type::DECIMAL256}) {
- base.init = GetHashInit<Action>(t);
- base.signature = KernelSignature::Make({InputType::Array(t)}, out_ty);
- DCHECK_OK(func->AddKernel(base));
- }
+ for (auto t : {Type::DECIMAL128, Type::DECIMAL256}) {
+ base.init = GetHashInit<Action>(t);
+ base.signature = KernelSignature::Make({InputType::Array(t)}, out_ty);
+ DCHECK_OK(func->AddKernel(base));
+ }
}
-const FunctionDoc unique_doc(
- "Compute unique elements",
- ("Return an array with distinct values. Nulls in the input are ignored."),
- {"array"});
-
-const FunctionDoc value_counts_doc(
- "Compute counts of unique elements",
- ("For each distinct value, compute the number of times it occurs in the array.\n"
- "The result is returned as an array of `struct<input type, int64>`.\n"
- "Nulls in the input are ignored."),
- {"array"});
-
-const auto kDefaultDictionaryEncodeOptions = DictionaryEncodeOptions::Defaults();
-const FunctionDoc dictionary_encode_doc(
- "Dictionary-encode array",
- ("Return a dictionary-encoded version of the input array."), {"array"},
- "DictionaryEncodeOptions");
-
+const FunctionDoc unique_doc(
+ "Compute unique elements",
+ ("Return an array with distinct values. Nulls in the input are ignored."),
+ {"array"});
+
+const FunctionDoc value_counts_doc(
+ "Compute counts of unique elements",
+ ("For each distinct value, compute the number of times it occurs in the array.\n"
+ "The result is returned as an array of `struct<input type, int64>`.\n"
+ "Nulls in the input are ignored."),
+ {"array"});
+
+const auto kDefaultDictionaryEncodeOptions = DictionaryEncodeOptions::Defaults();
+const FunctionDoc dictionary_encode_doc(
+ "Dictionary-encode array",
+ ("Return a dictionary-encoded version of the input array."), {"array"},
+ "DictionaryEncodeOptions");
+
} // namespace
void RegisterVectorHash(FunctionRegistry* registry) {
@@ -729,7 +729,7 @@ void RegisterVectorHash(FunctionRegistry* registry) {
base.finalize = UniqueFinalize;
base.output_chunked = false;
- auto unique = std::make_shared<VectorFunction>("unique", Arity::Unary(), &unique_doc);
+ auto unique = std::make_shared<VectorFunction>("unique", Arity::Unary(), &unique_doc);
AddHashKernels<UniqueAction>(unique.get(), base, OutputType(FirstType));
// Dictionary unique
@@ -745,8 +745,8 @@ void RegisterVectorHash(FunctionRegistry* registry) {
// value_counts
base.finalize = ValueCountsFinalize;
- auto value_counts =
- std::make_shared<VectorFunction>("value_counts", Arity::Unary(), &value_counts_doc);
+ auto value_counts =
+ std::make_shared<VectorFunction>("value_counts", Arity::Unary(), &value_counts_doc);
AddHashKernels<ValueCountsAction>(value_counts.get(), base,
OutputType(ValueCountsOutput));
@@ -765,9 +765,9 @@ void RegisterVectorHash(FunctionRegistry* registry) {
base.finalize = DictEncodeFinalize;
  // Unlike Unique and ValueCounts, dictionary_encode outputs chunked arrays
base.output_chunked = true;
- auto dict_encode = std::make_shared<VectorFunction>("dictionary_encode", Arity::Unary(),
- &dictionary_encode_doc,
- &kDefaultDictionaryEncodeOptions);
+ auto dict_encode = std::make_shared<VectorFunction>("dictionary_encode", Arity::Unary(),
+ &dictionary_encode_doc,
+ &kDefaultDictionaryEncodeOptions);
AddHashKernels<DictEncodeAction>(dict_encode.get(), base, OutputType(DictEncodeOutput));
// Calling dictionary_encode on dictionary input not supported, but if it
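For reference, the three kernels registered above ("unique", "value_counts", "dictionary_encode") are reachable through the generic compute entry point. A minimal caller-side sketch, assuming the Arrow 4.x C++ API vendored in this tree; the Demo function name and the sample input are illustrative, not part of this diff:

#include <arrow/api.h>
#include <arrow/compute/api.h>

arrow::Status Demo() {
  // Sample input: [1, 2, 2, 3, null]
  arrow::Int64Builder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 2, 3}));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  std::shared_ptr<arrow::Array> values;
  ARROW_RETURN_NOT_OK(builder.Finish(&values));

  // "unique" ignores the null and yields the distinct values [1, 2, 3].
  ARROW_ASSIGN_OR_RAISE(arrow::Datum uniques,
                        arrow::compute::CallFunction("unique", {values}));

  // "value_counts" yields a struct<values, counts> array; here
  // 1 -> 1, 2 -> 2, 3 -> 1, with the null ignored.
  ARROW_ASSIGN_OR_RAISE(arrow::Datum counts,
                        arrow::compute::CallFunction("value_counts", {values}));

  // "dictionary_encode" yields a dictionary-encoded copy of the input:
  // indices into the array of distinct values.
  ARROW_ASSIGN_OR_RAISE(arrow::Datum encoded,
                        arrow::compute::CallFunction("dictionary_encode", {values}));
  return arrow::Status::OK();
}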
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc
index 68db6ae04cc..b84640854ed 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_nested.cc
@@ -27,15 +27,15 @@ namespace internal {
namespace {
template <typename Type>
-Status ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status ListFlatten(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
typename TypeTraits<Type>::ArrayType list_array(batch[0].array());
- ARROW_ASSIGN_OR_RAISE(auto result, list_array.Flatten(ctx->memory_pool()));
- out->value = result->data();
- return Status::OK();
+ ARROW_ASSIGN_OR_RAISE(auto result, list_array.Flatten(ctx->memory_pool()));
+ out->value = result->data();
+ return Status::OK();
}
template <typename Type, typename offset_type = typename Type::offset_type>
-Status ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
typename TypeTraits<Type>::ArrayType list(batch[0].array());
ArrayData* out_arr = out->mutable_array();
@@ -44,8 +44,8 @@ Status ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out)
out_arr->length = values_length;
out_arr->null_count = 0;
- ARROW_ASSIGN_OR_RAISE(out_arr->buffers[1],
- ctx->Allocate(values_length * sizeof(offset_type)));
+ ARROW_ASSIGN_OR_RAISE(out_arr->buffers[1],
+ ctx->Allocate(values_length * sizeof(offset_type)));
auto out_indices = reinterpret_cast<offset_type*>(out_arr->buffers[1]->mutable_data());
for (int64_t i = 0; i < list.length(); ++i) {
// Note: In most cases, null slots are empty, but when they are non-empty
@@ -55,7 +55,7 @@ Status ListParentIndices(KernelContext* ctx, const ExecBatch& batch, Datum* out)
*out_indices++ = static_cast<offset_type>(i);
}
}
- return Status::OK();
+ return Status::OK();
}
Result<ValueDescr> ValuesType(KernelContext*, const std::vector<ValueDescr>& args) {
@@ -63,33 +63,33 @@ Result<ValueDescr> ValuesType(KernelContext*, const std::vector<ValueDescr>& arg
return ValueDescr::Array(list_type.value_type());
}
-const FunctionDoc list_flatten_doc(
- "Flatten list values",
- ("`lists` must have a list-like type.\n"
- "Return an array with the top list level flattened.\n"
- "Top-level null values in `lists` do not emit anything in the input."),
- {"lists"});
-
-const FunctionDoc list_parent_indices_doc(
- "Compute parent indices of nested list values",
- ("`lists` must have a list-like type.\n"
- "For each value in each list of `lists`, the top-level list index\n"
- "is emitted."),
- {"lists"});
-
+const FunctionDoc list_flatten_doc(
+ "Flatten list values",
+ ("`lists` must have a list-like type.\n"
+ "Return an array with the top list level flattened.\n"
+ "Top-level null values in `lists` do not emit anything in the input."),
+ {"lists"});
+
+const FunctionDoc list_parent_indices_doc(
+ "Compute parent indices of nested list values",
+ ("`lists` must have a list-like type.\n"
+ "For each value in each list of `lists`, the top-level list index\n"
+ "is emitted."),
+ {"lists"});
+
} // namespace
void RegisterVectorNested(FunctionRegistry* registry) {
- auto flatten =
- std::make_shared<VectorFunction>("list_flatten", Arity::Unary(), &list_flatten_doc);
+ auto flatten =
+ std::make_shared<VectorFunction>("list_flatten", Arity::Unary(), &list_flatten_doc);
DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LIST)}, OutputType(ValuesType),
ListFlatten<ListType>));
DCHECK_OK(flatten->AddKernel({InputType::Array(Type::LARGE_LIST)},
OutputType(ValuesType), ListFlatten<LargeListType>));
DCHECK_OK(registry->AddFunction(std::move(flatten)));
- auto list_parent_indices = std::make_shared<VectorFunction>(
- "list_parent_indices", Arity::Unary(), &list_parent_indices_doc);
+ auto list_parent_indices = std::make_shared<VectorFunction>(
+ "list_parent_indices", Arity::Unary(), &list_parent_indices_doc);
DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LIST)}, int32(),
ListParentIndices<ListType>));
DCHECK_OK(list_parent_indices->AddKernel({InputType::Array(Type::LARGE_LIST)}, int64(),
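The same caller-side view for the two list kernels registered above, under the same assumptions (vendored Arrow 4.x C++ API; Demo and the sample list are illustrative):

#include <arrow/api.h>
#include <arrow/compute/api.h>

arrow::Status Demo() {
  // Build the list array [[1, 2], null, [3]].
  auto pool = arrow::default_memory_pool();
  auto value_builder = std::make_shared<arrow::Int64Builder>(pool);
  arrow::ListBuilder list_builder(pool, value_builder);
  ARROW_RETURN_NOT_OK(list_builder.Append());
  ARROW_RETURN_NOT_OK(value_builder->AppendValues({1, 2}));
  ARROW_RETURN_NOT_OK(list_builder.AppendNull());
  ARROW_RETURN_NOT_OK(list_builder.Append());
  ARROW_RETURN_NOT_OK(value_builder->Append(3));
  std::shared_ptr<arrow::Array> lists;
  ARROW_RETURN_NOT_OK(list_builder.Finish(&lists));

  // "list_flatten" drops the null slot and yields [1, 2, 3].
  ARROW_ASSIGN_OR_RAISE(arrow::Datum flat,
                        arrow::compute::CallFunction("list_flatten", {lists}));

  // "list_parent_indices" yields [0, 0, 2]: one entry per child value,
  // holding the index of the list it came from.
  ARROW_ASSIGN_OR_RAISE(arrow::Datum parents,
                        arrow::compute::CallFunction("list_parent_indices", {lists}));
  return arrow::Status::OK();
}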
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc
index d89f7a6bb40..644aec2a4e9 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_replace.cc
@@ -1,540 +1,540 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/compute/api_scalar.h"
-#include "arrow/compute/kernels/common.h"
-#include "arrow/util/bitmap_ops.h"
-
-namespace arrow {
-namespace compute {
-namespace internal {
-
-namespace {
-
-Status ReplacementArrayTooShort(int64_t expected, int64_t actual) {
- return Status::Invalid("Replacement array must be of appropriate length (expected ",
- expected, " items but got ", actual, " items)");
-}
-
-// Helper to implement replace_with kernel with scalar mask for fixed-width types,
-// using callbacks to handle both bool and byte-sized types
-template <typename Functor>
-Status ReplaceWithScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- Datum source = array;
- if (!mask.is_valid) {
- // Output = null
- source = MakeNullScalar(output->type);
- } else if (mask.value) {
- // Output = replacement
- source = replacements;
- }
- uint8_t* out_bitmap = output->buffers[0]->mutable_data();
- uint8_t* out_values = output->buffers[1]->mutable_data();
- const int64_t out_offset = output->offset;
- if (source.is_array()) {
- const ArrayData& in_data = *source.array();
- if (in_data.length < array.length) {
- return ReplacementArrayTooShort(array.length, in_data.length);
- }
- Functor::CopyData(*array.type, out_values, out_offset, in_data, /*in_offset=*/0,
- array.length);
- if (in_data.MayHaveNulls()) {
- arrow::internal::CopyBitmap(in_data.buffers[0]->data(), in_data.offset,
- array.length, out_bitmap, out_offset);
- } else {
- BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, true);
- }
- } else {
- const Scalar& in_data = *source.scalar();
- Functor::CopyData(*array.type, out_values, out_offset, in_data, /*in_offset=*/0,
- array.length);
- BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, in_data.is_valid);
- }
- return Status::OK();
-}
-
-struct CopyArrayBitmap {
- const uint8_t* in_bitmap;
- int64_t in_offset;
-
- void CopyBitmap(uint8_t* out_bitmap, int64_t out_offset, int64_t offset,
- int64_t length) const {
- arrow::internal::CopyBitmap(in_bitmap, in_offset + offset, length, out_bitmap,
- out_offset);
- }
-
- void SetBit(uint8_t* out_bitmap, int64_t out_offset, int64_t offset) const {
- BitUtil::SetBitTo(out_bitmap, out_offset,
- BitUtil::GetBit(in_bitmap, in_offset + offset));
- }
-};
-
-struct CopyScalarBitmap {
- const bool is_valid;
-
- void CopyBitmap(uint8_t* out_bitmap, int64_t out_offset, int64_t offset,
- int64_t length) const {
- BitUtil::SetBitsTo(out_bitmap, out_offset, length, is_valid);
- }
-
- void SetBit(uint8_t* out_bitmap, int64_t out_offset, int64_t offset) const {
- BitUtil::SetBitTo(out_bitmap, out_offset, is_valid);
- }
-};
-
-// Helper to implement replace_with kernel with array mask for fixed-width types,
-// using callbacks to handle both bool and byte-sized types and to handle
-// scalar and array replacements
-template <typename Functor, typename Data, typename CopyBitmap>
-void ReplaceWithArrayMaskImpl(const ArrayData& array, const ArrayData& mask,
- const Data& replacements, bool replacements_bitmap,
- const CopyBitmap& copy_bitmap, const uint8_t* mask_bitmap,
- const uint8_t* mask_values, uint8_t* out_bitmap,
- uint8_t* out_values, const int64_t out_offset) {
- Functor::CopyData(*array.type, out_values, /*out_offset=*/0, array, /*in_offset=*/0,
- array.length);
- arrow::internal::OptionalBinaryBitBlockCounter counter(
- mask_values, mask.offset, mask_bitmap, mask.offset, mask.length);
- int64_t write_offset = 0;
- int64_t replacements_offset = 0;
- while (write_offset < array.length) {
- BitBlockCount block = counter.NextAndBlock();
- if (block.AllSet()) {
- // Copy from replacement array
- Functor::CopyData(*array.type, out_values, out_offset + write_offset, replacements,
- replacements_offset, block.length);
- if (replacements_bitmap) {
- copy_bitmap.CopyBitmap(out_bitmap, out_offset + write_offset, replacements_offset,
- block.length);
- } else if (!replacements_bitmap && out_bitmap) {
- BitUtil::SetBitsTo(out_bitmap, out_offset + write_offset, block.length, true);
- }
- replacements_offset += block.length;
- } else if (block.popcount) {
- for (int64_t i = 0; i < block.length; ++i) {
- if (BitUtil::GetBit(mask_values, write_offset + mask.offset + i) &&
- (!mask_bitmap ||
- BitUtil::GetBit(mask_bitmap, write_offset + mask.offset + i))) {
- Functor::CopyData(*array.type, out_values, out_offset + write_offset + i,
- replacements, replacements_offset, /*length=*/1);
- if (replacements_bitmap) {
- copy_bitmap.SetBit(out_bitmap, out_offset + write_offset + i,
- replacements_offset);
- }
- replacements_offset++;
- }
- }
- }
- write_offset += block.length;
- }
-}
-
-template <typename Functor>
-Status ReplaceWithArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- const int64_t out_offset = output->offset;
- uint8_t* out_bitmap = nullptr;
- uint8_t* out_values = output->buffers[1]->mutable_data();
- const uint8_t* mask_bitmap = mask.MayHaveNulls() ? mask.buffers[0]->data() : nullptr;
- const uint8_t* mask_values = mask.buffers[1]->data();
- const bool replacements_bitmap = replacements.is_array()
- ? replacements.array()->MayHaveNulls()
- : !replacements.scalar()->is_valid;
- if (replacements.is_array()) {
- // Check that we have enough replacement values
- const int64_t replacements_length = replacements.array()->length;
-
- BooleanArray mask_arr(mask.length, mask.buffers[1], mask.buffers[0], mask.null_count,
- mask.offset);
- const int64_t count = mask_arr.true_count();
- if (count > replacements_length) {
- return ReplacementArrayTooShort(count, replacements_length);
- }
- }
- if (array.MayHaveNulls() || mask.MayHaveNulls() || replacements_bitmap) {
- out_bitmap = output->buffers[0]->mutable_data();
- output->null_count = -1;
- if (array.MayHaveNulls()) {
- // Copy array's bitmap
- arrow::internal::CopyBitmap(array.buffers[0]->data(), array.offset, array.length,
- out_bitmap, out_offset);
- } else {
- // Array has no bitmap but mask/replacements do, generate an all-valid bitmap
- BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, true);
- }
- } else {
- BitUtil::SetBitsTo(output->buffers[0]->mutable_data(), out_offset, array.length,
- true);
- output->null_count = 0;
- }
-
- if (replacements.is_array()) {
- const ArrayData& array_repl = *replacements.array();
- ReplaceWithArrayMaskImpl<Functor>(
- array, mask, array_repl, replacements_bitmap,
- CopyArrayBitmap{replacements_bitmap ? array_repl.buffers[0]->data() : nullptr,
- array_repl.offset},
- mask_bitmap, mask_values, out_bitmap, out_values, out_offset);
- } else {
- const Scalar& scalar_repl = *replacements.scalar();
- ReplaceWithArrayMaskImpl<Functor>(array, mask, scalar_repl, replacements_bitmap,
- CopyScalarBitmap{scalar_repl.is_valid}, mask_bitmap,
- mask_values, out_bitmap, out_values, out_offset);
- }
-
- if (mask.MayHaveNulls()) {
- arrow::internal::BitmapAnd(out_bitmap, out_offset, mask.buffers[0]->data(),
- mask.offset, array.length, out_offset, out_bitmap);
- }
- return Status::OK();
-}
-
-template <typename Type, typename Enable = void>
-struct ReplaceWithMask {};
-
-template <typename Type>
-struct ReplaceWithMask<Type, enable_if_number<Type>> {
- using T = typename TypeTraits<Type>::CType;
-
- static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
- const ArrayData& in, const int64_t in_offset,
- const int64_t length) {
- const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * sizeof(T));
- std::memcpy(out + (out_offset * sizeof(T)), in_arr, length * sizeof(T));
- }
-
- static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
- const Scalar& in, const int64_t in_offset, const int64_t length) {
- T* begin = reinterpret_cast<T*>(out + (out_offset * sizeof(T)));
- T* end = begin + length;
- std::fill(begin, end, UnboxScalar<Type>::Unbox(in));
- }
-
- static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-
- static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-};
-
-template <typename Type>
-struct ReplaceWithMask<Type, enable_if_boolean<Type>> {
- static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
- const ArrayData& in, const int64_t in_offset,
- const int64_t length) {
- const auto in_arr = in.GetValues<uint8_t>(1, /*absolute_offset=*/0);
- arrow::internal::CopyBitmap(in_arr, in_offset + in.offset, length, out, out_offset);
- }
- static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
- const Scalar& in, const int64_t in_offset, const int64_t length) {
- BitUtil::SetBitsTo(out, out_offset, length, in.is_valid);
- }
-
- static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
- static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-};
-
-template <typename Type>
-struct ReplaceWithMask<Type, enable_if_same<Type, FixedSizeBinaryType>> {
- static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
- const ArrayData& in, const int64_t in_offset,
- const int64_t length) {
- const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
- uint8_t* begin = out + (out_offset * width);
- const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * width);
- std::memcpy(begin, in_arr, length * width);
- }
- static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
- const Scalar& in, const int64_t in_offset, const int64_t length) {
- const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
- uint8_t* begin = out + (out_offset * width);
- const auto& scalar = checked_cast<const FixedSizeBinaryScalar&>(in);
- // Null scalar may have null value buffer
- if (!scalar.value) return;
- const Buffer& buffer = *scalar.value;
- const uint8_t* value = buffer.data();
- DCHECK_GE(buffer.size(), width);
- for (int i = 0; i < length; i++) {
- std::memcpy(begin, value, width);
- begin += width;
- }
- }
-
- static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-
- static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-};
-
-template <typename Type>
-struct ReplaceWithMask<Type, enable_if_decimal<Type>> {
- using ScalarType = typename TypeTraits<Type>::ScalarType;
- static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
- const ArrayData& in, const int64_t in_offset,
- const int64_t length) {
- const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
- uint8_t* begin = out + (out_offset * width);
- const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * width);
- std::memcpy(begin, in_arr, length * width);
- }
- static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
- const Scalar& in, const int64_t in_offset, const int64_t length) {
- const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
- uint8_t* begin = out + (out_offset * width);
- const auto& scalar = checked_cast<const ScalarType&>(in);
- const auto value = scalar.value.ToBytes();
- for (int i = 0; i < length; i++) {
- std::memcpy(begin, value.data(), width);
- begin += width;
- }
- }
-
- static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-
- static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
- output);
- }
-};
-
-template <typename Type>
-struct ReplaceWithMask<Type, enable_if_null<Type>> {
- static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- *output = array;
- return Status::OK();
- }
- static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- *output = array;
- return Status::OK();
- }
-};
-
-template <typename Type>
-struct ReplaceWithMask<Type, enable_if_base_binary<Type>> {
- using offset_type = typename Type::offset_type;
- using BuilderType = typename TypeTraits<Type>::BuilderType;
-
- static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
- const BooleanScalar& mask, const Datum& replacements,
- ArrayData* output) {
- if (!mask.is_valid) {
- // Output = null
- ARROW_ASSIGN_OR_RAISE(
- auto replacement_array,
- MakeArrayOfNull(array.type, array.length, ctx->memory_pool()));
- *output = *replacement_array->data();
- } else if (mask.value) {
- // Output = replacement
- if (replacements.is_scalar()) {
- ARROW_ASSIGN_OR_RAISE(auto replacement_array,
- MakeArrayFromScalar(*replacements.scalar(), array.length,
- ctx->memory_pool()));
- *output = *replacement_array->data();
- } else {
- const ArrayData& replacement_array = *replacements.array();
- if (replacement_array.length < array.length) {
- return ReplacementArrayTooShort(array.length, replacement_array.length);
- }
- *output = replacement_array;
- output->length = array.length;
- }
- } else {
- // Output = input
- *output = array;
- }
- return Status::OK();
- }
- static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
- const ArrayData& mask, const Datum& replacements,
- ArrayData* output) {
- BuilderType builder(array.type, ctx->memory_pool());
- RETURN_NOT_OK(builder.Reserve(array.length));
- RETURN_NOT_OK(builder.ReserveData(array.buffers[2]->size()));
- int64_t source_offset = 0;
- int64_t replacements_offset = 0;
- RETURN_NOT_OK(VisitArrayDataInline<BooleanType>(
- mask,
- [&](bool replace) {
- if (replace && replacements.is_scalar()) {
- const Scalar& scalar = *replacements.scalar();
- if (scalar.is_valid) {
- RETURN_NOT_OK(builder.Append(UnboxScalar<Type>::Unbox(scalar)));
- } else {
- RETURN_NOT_OK(builder.AppendNull());
- }
- } else {
- const ArrayData& source = replace ? *replacements.array() : array;
- const int64_t offset = replace ? replacements_offset++ : source_offset;
- if (!source.MayHaveNulls() ||
- BitUtil::GetBit(source.buffers[0]->data(), source.offset + offset)) {
- const uint8_t* data = source.buffers[2]->data();
- const offset_type* offsets = source.GetValues<offset_type>(1);
- const offset_type offset0 = offsets[offset];
- const offset_type offset1 = offsets[offset + 1];
- RETURN_NOT_OK(builder.Append(data + offset0, offset1 - offset0));
- } else {
- RETURN_NOT_OK(builder.AppendNull());
- }
- }
- source_offset++;
- return Status::OK();
- },
- [&]() {
- RETURN_NOT_OK(builder.AppendNull());
- source_offset++;
- return Status::OK();
- }));
- std::shared_ptr<Array> temp_output;
- RETURN_NOT_OK(builder.Finish(&temp_output));
- *output = *temp_output->data();
- // Builder type != logical type due to GenerateTypeAgnosticVarBinaryBase
- output->type = array.type;
- return Status::OK();
- }
-};
-
-template <typename Type>
-struct ReplaceWithMaskFunctor {
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const ArrayData& array = *batch[0].array();
- const Datum& replacements = batch[2];
- ArrayData* output = out->array().get();
- output->length = array.length;
-
- // Needed for FixedSizeBinary/parameterized types
- if (!array.type->Equals(*replacements.type(), /*check_metadata=*/false)) {
- return Status::Invalid("Replacements must be of same type (expected ",
- array.type->ToString(), " but got ",
- replacements.type()->ToString(), ")");
- }
-
- if (!replacements.is_array() && !replacements.is_scalar()) {
- return Status::Invalid("Replacements must be array or scalar");
- }
-
- if (batch[1].is_scalar()) {
- return ReplaceWithMask<Type>::ExecScalarMask(
- ctx, array, batch[1].scalar_as<BooleanScalar>(), replacements, output);
- }
- const ArrayData& mask = *batch[1].array();
- if (array.length != mask.length) {
- return Status::Invalid("Mask must be of same length as array (expected ",
- array.length, " items but got ", mask.length, " items)");
- }
- return ReplaceWithMask<Type>::ExecArrayMask(ctx, array, mask, replacements, output);
- }
-};
-
-} // namespace
-
-const FunctionDoc replace_with_mask_doc(
- "Replace items using a mask and replacement values",
- ("Given an array and a Boolean mask (either scalar or of equal length), "
- "along with replacement values (either scalar or array), "
- "each element of the array for which the corresponding mask element is "
- "true will be replaced by the next value from the replacements, "
- "or with null if the mask is null. "
- "Hence, for replacement arrays, len(replacements) == sum(mask == true)."),
- {"values", "mask", "replacements"});
-
-void RegisterVectorReplace(FunctionRegistry* registry) {
- auto func = std::make_shared<VectorFunction>("replace_with_mask", Arity::Ternary(),
- &replace_with_mask_doc);
- auto add_kernel = [&](detail::GetTypeId get_id, ArrayKernelExec exec) {
- VectorKernel kernel;
- kernel.can_execute_chunkwise = false;
- if (is_fixed_width(get_id.id)) {
- kernel.null_handling = NullHandling::type::COMPUTED_PREALLOCATE;
- } else {
- kernel.can_write_into_slices = false;
- kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
- }
- kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
- kernel.signature = KernelSignature::Make(
- {InputType::Array(get_id.id), InputType(boolean()), InputType(get_id.id)},
- OutputType(FirstType));
- kernel.exec = std::move(exec);
- DCHECK_OK(func->AddKernel(std::move(kernel)));
- };
- auto add_primitive_kernel = [&](detail::GetTypeId get_id) {
- add_kernel(get_id, GenerateTypeAgnosticPrimitive<ReplaceWithMaskFunctor>(get_id));
- };
- for (const auto& ty : NumericTypes()) {
- add_primitive_kernel(ty);
- }
- for (const auto& ty : TemporalTypes()) {
- add_primitive_kernel(ty);
- }
- add_primitive_kernel(null());
- add_primitive_kernel(boolean());
- add_primitive_kernel(day_time_interval());
- add_primitive_kernel(month_interval());
- add_kernel(Type::FIXED_SIZE_BINARY, ReplaceWithMaskFunctor<FixedSizeBinaryType>::Exec);
- add_kernel(Type::DECIMAL128, ReplaceWithMaskFunctor<Decimal128Type>::Exec);
- add_kernel(Type::DECIMAL256, ReplaceWithMaskFunctor<Decimal256Type>::Exec);
- for (const auto& ty : BaseBinaryTypes()) {
- add_kernel(ty->id(), GenerateTypeAgnosticVarBinaryBase<ReplaceWithMaskFunctor>(*ty));
- }
- // TODO: list types
- DCHECK_OK(registry->AddFunction(std::move(func)));
-
- // TODO(ARROW-9431): "replace_with_indices"
-}
-} // namespace internal
-} // namespace compute
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/kernels/common.h"
+#include "arrow/util/bitmap_ops.h"
+
+namespace arrow {
+namespace compute {
+namespace internal {
+
+namespace {
+
+Status ReplacementArrayTooShort(int64_t expected, int64_t actual) {
+ return Status::Invalid("Replacement array must be of appropriate length (expected ",
+ expected, " items but got ", actual, " items)");
+}
+
+// Helper to implement replace_with kernel with scalar mask for fixed-width types,
+// using callbacks to handle both bool and byte-sized types
+template <typename Functor>
+Status ReplaceWithScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ Datum source = array;
+ if (!mask.is_valid) {
+ // Output = null
+ source = MakeNullScalar(output->type);
+ } else if (mask.value) {
+ // Output = replacement
+ source = replacements;
+ }
+ uint8_t* out_bitmap = output->buffers[0]->mutable_data();
+ uint8_t* out_values = output->buffers[1]->mutable_data();
+ const int64_t out_offset = output->offset;
+ if (source.is_array()) {
+ const ArrayData& in_data = *source.array();
+ if (in_data.length < array.length) {
+ return ReplacementArrayTooShort(array.length, in_data.length);
+ }
+ Functor::CopyData(*array.type, out_values, out_offset, in_data, /*in_offset=*/0,
+ array.length);
+ if (in_data.MayHaveNulls()) {
+ arrow::internal::CopyBitmap(in_data.buffers[0]->data(), in_data.offset,
+ array.length, out_bitmap, out_offset);
+ } else {
+ BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, true);
+ }
+ } else {
+ const Scalar& in_data = *source.scalar();
+ Functor::CopyData(*array.type, out_values, out_offset, in_data, /*in_offset=*/0,
+ array.length);
+ BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, in_data.is_valid);
+ }
+ return Status::OK();
+}
+
+struct CopyArrayBitmap {
+ const uint8_t* in_bitmap;
+ int64_t in_offset;
+
+ void CopyBitmap(uint8_t* out_bitmap, int64_t out_offset, int64_t offset,
+ int64_t length) const {
+ arrow::internal::CopyBitmap(in_bitmap, in_offset + offset, length, out_bitmap,
+ out_offset);
+ }
+
+ void SetBit(uint8_t* out_bitmap, int64_t out_offset, int64_t offset) const {
+ BitUtil::SetBitTo(out_bitmap, out_offset,
+ BitUtil::GetBit(in_bitmap, in_offset + offset));
+ }
+};
+
+struct CopyScalarBitmap {
+ const bool is_valid;
+
+ void CopyBitmap(uint8_t* out_bitmap, int64_t out_offset, int64_t offset,
+ int64_t length) const {
+ BitUtil::SetBitsTo(out_bitmap, out_offset, length, is_valid);
+ }
+
+ void SetBit(uint8_t* out_bitmap, int64_t out_offset, int64_t offset) const {
+ BitUtil::SetBitTo(out_bitmap, out_offset, is_valid);
+ }
+};
+
+// Helper to implement replace_with kernel with array mask for fixed-width types,
+// using callbacks to handle both bool and byte-sized types and to handle
+// scalar and array replacements
+template <typename Functor, typename Data, typename CopyBitmap>
+void ReplaceWithArrayMaskImpl(const ArrayData& array, const ArrayData& mask,
+ const Data& replacements, bool replacements_bitmap,
+ const CopyBitmap& copy_bitmap, const uint8_t* mask_bitmap,
+ const uint8_t* mask_values, uint8_t* out_bitmap,
+ uint8_t* out_values, const int64_t out_offset) {
+ Functor::CopyData(*array.type, out_values, /*out_offset=*/0, array, /*in_offset=*/0,
+ array.length);
+ arrow::internal::OptionalBinaryBitBlockCounter counter(
+ mask_values, mask.offset, mask_bitmap, mask.offset, mask.length);
+ int64_t write_offset = 0;
+ int64_t replacements_offset = 0;
+ while (write_offset < array.length) {
+ BitBlockCount block = counter.NextAndBlock();
+ if (block.AllSet()) {
+ // Copy from replacement array
+ Functor::CopyData(*array.type, out_values, out_offset + write_offset, replacements,
+ replacements_offset, block.length);
+ if (replacements_bitmap) {
+ copy_bitmap.CopyBitmap(out_bitmap, out_offset + write_offset, replacements_offset,
+ block.length);
+ } else if (!replacements_bitmap && out_bitmap) {
+ BitUtil::SetBitsTo(out_bitmap, out_offset + write_offset, block.length, true);
+ }
+ replacements_offset += block.length;
+ } else if (block.popcount) {
+ for (int64_t i = 0; i < block.length; ++i) {
+ if (BitUtil::GetBit(mask_values, write_offset + mask.offset + i) &&
+ (!mask_bitmap ||
+ BitUtil::GetBit(mask_bitmap, write_offset + mask.offset + i))) {
+ Functor::CopyData(*array.type, out_values, out_offset + write_offset + i,
+ replacements, replacements_offset, /*length=*/1);
+ if (replacements_bitmap) {
+ copy_bitmap.SetBit(out_bitmap, out_offset + write_offset + i,
+ replacements_offset);
+ }
+ replacements_offset++;
+ }
+ }
+ }
+ write_offset += block.length;
+ }
+}
+
+template <typename Functor>
+Status ReplaceWithArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ const int64_t out_offset = output->offset;
+ uint8_t* out_bitmap = nullptr;
+ uint8_t* out_values = output->buffers[1]->mutable_data();
+ const uint8_t* mask_bitmap = mask.MayHaveNulls() ? mask.buffers[0]->data() : nullptr;
+ const uint8_t* mask_values = mask.buffers[1]->data();
+ const bool replacements_bitmap = replacements.is_array()
+ ? replacements.array()->MayHaveNulls()
+ : !replacements.scalar()->is_valid;
+ if (replacements.is_array()) {
+ // Check that we have enough replacement values
+ const int64_t replacements_length = replacements.array()->length;
+
+ BooleanArray mask_arr(mask.length, mask.buffers[1], mask.buffers[0], mask.null_count,
+ mask.offset);
+ const int64_t count = mask_arr.true_count();
+ if (count > replacements_length) {
+ return ReplacementArrayTooShort(count, replacements_length);
+ }
+ }
+ if (array.MayHaveNulls() || mask.MayHaveNulls() || replacements_bitmap) {
+ out_bitmap = output->buffers[0]->mutable_data();
+ output->null_count = -1;
+ if (array.MayHaveNulls()) {
+ // Copy array's bitmap
+ arrow::internal::CopyBitmap(array.buffers[0]->data(), array.offset, array.length,
+ out_bitmap, out_offset);
+ } else {
+ // Array has no bitmap but mask/replacements do, generate an all-valid bitmap
+ BitUtil::SetBitsTo(out_bitmap, out_offset, array.length, true);
+ }
+ } else {
+ BitUtil::SetBitsTo(output->buffers[0]->mutable_data(), out_offset, array.length,
+ true);
+ output->null_count = 0;
+ }
+
+ if (replacements.is_array()) {
+ const ArrayData& array_repl = *replacements.array();
+ ReplaceWithArrayMaskImpl<Functor>(
+ array, mask, array_repl, replacements_bitmap,
+ CopyArrayBitmap{replacements_bitmap ? array_repl.buffers[0]->data() : nullptr,
+ array_repl.offset},
+ mask_bitmap, mask_values, out_bitmap, out_values, out_offset);
+ } else {
+ const Scalar& scalar_repl = *replacements.scalar();
+ ReplaceWithArrayMaskImpl<Functor>(array, mask, scalar_repl, replacements_bitmap,
+ CopyScalarBitmap{scalar_repl.is_valid}, mask_bitmap,
+ mask_values, out_bitmap, out_values, out_offset);
+ }
+
+ if (mask.MayHaveNulls()) {
+ arrow::internal::BitmapAnd(out_bitmap, out_offset, mask.buffers[0]->data(),
+ mask.offset, array.length, out_offset, out_bitmap);
+ }
+ return Status::OK();
+}
+
+template <typename Type, typename Enable = void>
+struct ReplaceWithMask {};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_number<Type>> {
+ using T = typename TypeTraits<Type>::CType;
+
+ static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
+ const ArrayData& in, const int64_t in_offset,
+ const int64_t length) {
+ const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * sizeof(T));
+ std::memcpy(out + (out_offset * sizeof(T)), in_arr, length * sizeof(T));
+ }
+
+ static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
+ const Scalar& in, const int64_t in_offset, const int64_t length) {
+ T* begin = reinterpret_cast<T*>(out + (out_offset * sizeof(T)));
+ T* end = begin + length;
+ std::fill(begin, end, UnboxScalar<Type>::Unbox(in));
+ }
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_boolean<Type>> {
+ static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
+ const ArrayData& in, const int64_t in_offset,
+ const int64_t length) {
+ const auto in_arr = in.GetValues<uint8_t>(1, /*absolute_offset=*/0);
+ arrow::internal::CopyBitmap(in_arr, in_offset + in.offset, length, out, out_offset);
+ }
+ static void CopyData(const DataType&, uint8_t* out, const int64_t out_offset,
+ const Scalar& in, const int64_t in_offset, const int64_t length) {
+ BitUtil::SetBitsTo(out, out_offset, length, in.is_valid);
+ }
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_same<Type, FixedSizeBinaryType>> {
+ static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
+ const ArrayData& in, const int64_t in_offset,
+ const int64_t length) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
+ uint8_t* begin = out + (out_offset * width);
+ const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * width);
+ std::memcpy(begin, in_arr, length * width);
+ }
+ static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
+ const Scalar& in, const int64_t in_offset, const int64_t length) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
+ uint8_t* begin = out + (out_offset * width);
+ const auto& scalar = checked_cast<const FixedSizeBinaryScalar&>(in);
+ // Null scalar may have null value buffer
+ if (!scalar.value) return;
+ const Buffer& buffer = *scalar.value;
+ const uint8_t* value = buffer.data();
+ DCHECK_GE(buffer.size(), width);
+ for (int i = 0; i < length; i++) {
+ std::memcpy(begin, value, width);
+ begin += width;
+ }
+ }
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_decimal<Type>> {
+ using ScalarType = typename TypeTraits<Type>::ScalarType;
+ static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
+ const ArrayData& in, const int64_t in_offset,
+ const int64_t length) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
+ uint8_t* begin = out + (out_offset * width);
+ const auto in_arr = in.GetValues<uint8_t>(1, (in_offset + in.offset) * width);
+ std::memcpy(begin, in_arr, length * width);
+ }
+ static void CopyData(const DataType& ty, uint8_t* out, const int64_t out_offset,
+ const Scalar& in, const int64_t in_offset, const int64_t length) {
+ const int32_t width = checked_cast<const FixedSizeBinaryType&>(ty).byte_width();
+ uint8_t* begin = out + (out_offset * width);
+ const auto& scalar = checked_cast<const ScalarType&>(in);
+ const auto value = scalar.value.ToBytes();
+ for (int i = 0; i < length; i++) {
+ std::memcpy(begin, value.data(), width);
+ begin += width;
+ }
+ }
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithScalarMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ return ReplaceWithArrayMask<ReplaceWithMask<Type>>(ctx, array, mask, replacements,
+ output);
+ }
+};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_null<Type>> {
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ *output = array;
+ return Status::OK();
+ }
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ *output = array;
+ return Status::OK();
+ }
+};
+
+template <typename Type>
+struct ReplaceWithMask<Type, enable_if_base_binary<Type>> {
+ using offset_type = typename Type::offset_type;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+ static Status ExecScalarMask(KernelContext* ctx, const ArrayData& array,
+ const BooleanScalar& mask, const Datum& replacements,
+ ArrayData* output) {
+ if (!mask.is_valid) {
+ // Output = null
+ ARROW_ASSIGN_OR_RAISE(
+ auto replacement_array,
+ MakeArrayOfNull(array.type, array.length, ctx->memory_pool()));
+ *output = *replacement_array->data();
+ } else if (mask.value) {
+ // Output = replacement
+ if (replacements.is_scalar()) {
+ ARROW_ASSIGN_OR_RAISE(auto replacement_array,
+ MakeArrayFromScalar(*replacements.scalar(), array.length,
+ ctx->memory_pool()));
+ *output = *replacement_array->data();
+ } else {
+ const ArrayData& replacement_array = *replacements.array();
+ if (replacement_array.length < array.length) {
+ return ReplacementArrayTooShort(array.length, replacement_array.length);
+ }
+ *output = replacement_array;
+ output->length = array.length;
+ }
+ } else {
+ // Output = input
+ *output = array;
+ }
+ return Status::OK();
+ }
+ static Status ExecArrayMask(KernelContext* ctx, const ArrayData& array,
+ const ArrayData& mask, const Datum& replacements,
+ ArrayData* output) {
+ BuilderType builder(array.type, ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(array.length));
+ RETURN_NOT_OK(builder.ReserveData(array.buffers[2]->size()));
+ int64_t source_offset = 0;
+ int64_t replacements_offset = 0;
+ RETURN_NOT_OK(VisitArrayDataInline<BooleanType>(
+ mask,
+ [&](bool replace) {
+ if (replace && replacements.is_scalar()) {
+ const Scalar& scalar = *replacements.scalar();
+ if (scalar.is_valid) {
+ RETURN_NOT_OK(builder.Append(UnboxScalar<Type>::Unbox(scalar)));
+ } else {
+ RETURN_NOT_OK(builder.AppendNull());
+ }
+ } else {
+ const ArrayData& source = replace ? *replacements.array() : array;
+ const int64_t offset = replace ? replacements_offset++ : source_offset;
+ if (!source.MayHaveNulls() ||
+ BitUtil::GetBit(source.buffers[0]->data(), source.offset + offset)) {
+ const uint8_t* data = source.buffers[2]->data();
+ const offset_type* offsets = source.GetValues<offset_type>(1);
+ const offset_type offset0 = offsets[offset];
+ const offset_type offset1 = offsets[offset + 1];
+ RETURN_NOT_OK(builder.Append(data + offset0, offset1 - offset0));
+ } else {
+ RETURN_NOT_OK(builder.AppendNull());
+ }
+ }
+ source_offset++;
+ return Status::OK();
+ },
+ [&]() {
+ RETURN_NOT_OK(builder.AppendNull());
+ source_offset++;
+ return Status::OK();
+ }));
+ std::shared_ptr<Array> temp_output;
+ RETURN_NOT_OK(builder.Finish(&temp_output));
+ *output = *temp_output->data();
+ // Builder type != logical type due to GenerateTypeAgnosticVarBinaryBase
+ output->type = array.type;
+ return Status::OK();
+ }
+};
+
+template <typename Type>
+struct ReplaceWithMaskFunctor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ArrayData& array = *batch[0].array();
+ const Datum& replacements = batch[2];
+ ArrayData* output = out->array().get();
+ output->length = array.length;
+
+ // Needed for FixedSizeBinary/parameterized types
+ if (!array.type->Equals(*replacements.type(), /*check_metadata=*/false)) {
+ return Status::Invalid("Replacements must be of same type (expected ",
+ array.type->ToString(), " but got ",
+ replacements.type()->ToString(), ")");
+ }
+
+ if (!replacements.is_array() && !replacements.is_scalar()) {
+ return Status::Invalid("Replacements must be array or scalar");
+ }
+
+ if (batch[1].is_scalar()) {
+ return ReplaceWithMask<Type>::ExecScalarMask(
+ ctx, array, batch[1].scalar_as<BooleanScalar>(), replacements, output);
+ }
+ const ArrayData& mask = *batch[1].array();
+ if (array.length != mask.length) {
+ return Status::Invalid("Mask must be of same length as array (expected ",
+ array.length, " items but got ", mask.length, " items)");
+ }
+ return ReplaceWithMask<Type>::ExecArrayMask(ctx, array, mask, replacements, output);
+ }
+};
+
+} // namespace
+
+const FunctionDoc replace_with_mask_doc(
+ "Replace items using a mask and replacement values",
+ ("Given an array and a Boolean mask (either scalar or of equal length), "
+ "along with replacement values (either scalar or array), "
+ "each element of the array for which the corresponding mask element is "
+ "true will be replaced by the next value from the replacements, "
+ "or with null if the mask is null. "
+ "Hence, for replacement arrays, len(replacements) == sum(mask == true)."),
+ {"values", "mask", "replacements"});
+
+void RegisterVectorReplace(FunctionRegistry* registry) {
+ auto func = std::make_shared<VectorFunction>("replace_with_mask", Arity::Ternary(),
+ &replace_with_mask_doc);
+ auto add_kernel = [&](detail::GetTypeId get_id, ArrayKernelExec exec) {
+ VectorKernel kernel;
+ kernel.can_execute_chunkwise = false;
+ if (is_fixed_width(get_id.id)) {
+ kernel.null_handling = NullHandling::type::COMPUTED_PREALLOCATE;
+ } else {
+ kernel.can_write_into_slices = false;
+ kernel.null_handling = NullHandling::type::COMPUTED_NO_PREALLOCATE;
+ }
+ kernel.mem_allocation = MemAllocation::type::PREALLOCATE;
+ kernel.signature = KernelSignature::Make(
+ {InputType::Array(get_id.id), InputType(boolean()), InputType(get_id.id)},
+ OutputType(FirstType));
+ kernel.exec = std::move(exec);
+ DCHECK_OK(func->AddKernel(std::move(kernel)));
+ };
+ auto add_primitive_kernel = [&](detail::GetTypeId get_id) {
+ add_kernel(get_id, GenerateTypeAgnosticPrimitive<ReplaceWithMaskFunctor>(get_id));
+ };
+ for (const auto& ty : NumericTypes()) {
+ add_primitive_kernel(ty);
+ }
+ for (const auto& ty : TemporalTypes()) {
+ add_primitive_kernel(ty);
+ }
+ add_primitive_kernel(null());
+ add_primitive_kernel(boolean());
+ add_primitive_kernel(day_time_interval());
+ add_primitive_kernel(month_interval());
+ add_kernel(Type::FIXED_SIZE_BINARY, ReplaceWithMaskFunctor<FixedSizeBinaryType>::Exec);
+ add_kernel(Type::DECIMAL128, ReplaceWithMaskFunctor<Decimal128Type>::Exec);
+ add_kernel(Type::DECIMAL256, ReplaceWithMaskFunctor<Decimal256Type>::Exec);
+ for (const auto& ty : BaseBinaryTypes()) {
+ add_kernel(ty->id(), GenerateTypeAgnosticVarBinaryBase<ReplaceWithMaskFunctor>(*ty));
+ }
+ // TODO: list types
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+
+ // TODO(ARROW-9431): "replace_with_indices"
+}
+} // namespace internal
+} // namespace compute
+} // namespace arrow
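And a usage sketch for replace_with_mask itself, matching the contract spelled out in replace_with_mask_doc above (same assumptions: vendored Arrow 4.x C++ API, illustrative names and inputs):

#include <arrow/api.h>
#include <arrow/compute/api.h>

arrow::Status Demo() {
  // values = [1, 2, 3, 4]
  arrow::Int64Builder values_builder;
  ARROW_RETURN_NOT_OK(values_builder.AppendValues({1, 2, 3, 4}));
  std::shared_ptr<arrow::Array> values;
  ARROW_RETURN_NOT_OK(values_builder.Finish(&values));

  // mask = [true, false, null, true]; a null mask slot nulls out the output.
  arrow::BooleanBuilder mask_builder;
  ARROW_RETURN_NOT_OK(mask_builder.AppendValues(std::vector<bool>{true, false}));
  ARROW_RETURN_NOT_OK(mask_builder.AppendNull());
  ARROW_RETURN_NOT_OK(mask_builder.Append(true));
  std::shared_ptr<arrow::Array> mask;
  ARROW_RETURN_NOT_OK(mask_builder.Finish(&mask));

  // Two true slots in the mask consume the two replacement values, in order.
  arrow::Int64Builder repl_builder;
  ARROW_RETURN_NOT_OK(repl_builder.AppendValues({10, 11}));
  std::shared_ptr<arrow::Array> replacements;
  ARROW_RETURN_NOT_OK(repl_builder.Finish(&replacements));

  // Result: [10, 2, null, 11].
  ARROW_ASSIGN_OR_RAISE(
      arrow::Datum out,
      arrow::compute::CallFunction("replace_with_mask",
                                   {values, mask, replacements}));
  return arrow::Status::OK();
}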
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc
index b70dadbd146..5845a7ee2d0 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc
@@ -36,7 +36,7 @@
#include "arrow/table.h"
#include "arrow/type.h"
#include "arrow/util/bit_block_counter.h"
-#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/bitmap_reader.h"
@@ -87,8 +87,8 @@ int64_t GetFilterOutputSize(const ArrayData& filter,
return output_size;
}
-namespace {
-
+namespace {
+
template <typename IndexType>
Result<std::shared_ptr<ArrayData>> GetTakeIndicesImpl(
const ArrayData& filter, FilterOptions::NullSelectionBehavior null_selection,
@@ -96,130 +96,130 @@ Result<std::shared_ptr<ArrayData>> GetTakeIndicesImpl(
using T = typename IndexType::c_type;
const uint8_t* filter_data = filter.buffers[1]->data();
- const bool have_filter_nulls = filter.MayHaveNulls();
- const uint8_t* filter_is_valid =
- have_filter_nulls ? filter.buffers[0]->data() : nullptr;
-
- if (have_filter_nulls && null_selection == FilterOptions::EMIT_NULL) {
- // Most complex case: the filter may have nulls and we don't drop them.
- // The logic is ternary:
- // - filter is null: emit null
- // - filter is valid and true: emit index
- // - filter is valid and false: don't emit anything
-
- typename TypeTraits<IndexType>::BuilderType builder(memory_pool);
-
- // The position relative to the start of the filter
- T position = 0;
- // The current position taking the filter offset into account
- int64_t position_with_offset = filter.offset;
-
- // To count blocks where filter_data[i] || !filter_is_valid[i]
+ const bool have_filter_nulls = filter.MayHaveNulls();
+ const uint8_t* filter_is_valid =
+ have_filter_nulls ? filter.buffers[0]->data() : nullptr;
+
+ if (have_filter_nulls && null_selection == FilterOptions::EMIT_NULL) {
+ // Most complex case: the filter may have nulls and we don't drop them.
+ // The logic is ternary:
+ // - filter is null: emit null
+ // - filter is valid and true: emit index
+ // - filter is valid and false: don't emit anything
+
+ typename TypeTraits<IndexType>::BuilderType builder(memory_pool);
+
+ // The position relative to the start of the filter
+ T position = 0;
+ // The current position taking the filter offset into account
+ int64_t position_with_offset = filter.offset;
+
+ // To count blocks where filter_data[i] || !filter_is_valid[i]
BinaryBitBlockCounter filter_counter(filter_data, filter.offset, filter_is_valid,
filter.offset, filter.length);
- BitBlockCounter is_valid_counter(filter_is_valid, filter.offset, filter.length);
- while (position < filter.length) {
- // true OR NOT valid
- BitBlockCount selected_or_null_block = filter_counter.NextOrNotWord();
- if (selected_or_null_block.NoneSet()) {
- position += selected_or_null_block.length;
- position_with_offset += selected_or_null_block.length;
- continue;
+ BitBlockCounter is_valid_counter(filter_is_valid, filter.offset, filter.length);
+ while (position < filter.length) {
+ // true OR NOT valid
+ BitBlockCount selected_or_null_block = filter_counter.NextOrNotWord();
+ if (selected_or_null_block.NoneSet()) {
+ position += selected_or_null_block.length;
+ position_with_offset += selected_or_null_block.length;
+ continue;
}
- RETURN_NOT_OK(builder.Reserve(selected_or_null_block.popcount));
-
- // If the values are all valid and the selected_or_null_block is full,
- // then we can infer that all the values are true and skip the bit checking
- BitBlockCount is_valid_block = is_valid_counter.NextWord();
-
- if (selected_or_null_block.AllSet() && is_valid_block.AllSet()) {
- // All the values are selected and non-null
- for (int64_t i = 0; i < selected_or_null_block.length; ++i) {
- builder.UnsafeAppend(position++);
- }
- position_with_offset += selected_or_null_block.length;
- } else {
- // Some of the values are false or null
- for (int64_t i = 0; i < selected_or_null_block.length; ++i) {
- if (BitUtil::GetBit(filter_is_valid, position_with_offset)) {
- if (BitUtil::GetBit(filter_data, position_with_offset)) {
- builder.UnsafeAppend(position);
+ RETURN_NOT_OK(builder.Reserve(selected_or_null_block.popcount));
+
+ // If the values are all valid and the selected_or_null_block is full,
+ // then we can infer that all the values are true and skip the bit checking
+ BitBlockCount is_valid_block = is_valid_counter.NextWord();
+
+ if (selected_or_null_block.AllSet() && is_valid_block.AllSet()) {
+ // All the values are selected and non-null
+ for (int64_t i = 0; i < selected_or_null_block.length; ++i) {
+ builder.UnsafeAppend(position++);
+ }
+ position_with_offset += selected_or_null_block.length;
+ } else {
+ // Some of the values are false or null
+ for (int64_t i = 0; i < selected_or_null_block.length; ++i) {
+ if (BitUtil::GetBit(filter_is_valid, position_with_offset)) {
+ if (BitUtil::GetBit(filter_data, position_with_offset)) {
+ builder.UnsafeAppend(position);
}
- } else {
- // Null slot, so append a null
- builder.UnsafeAppendNull();
+ } else {
+ // Null slot, so append a null
+ builder.UnsafeAppendNull();
}
- ++position;
- ++position_with_offset;
+ ++position;
+ ++position_with_offset;
}
}
}
- std::shared_ptr<ArrayData> result;
- RETURN_NOT_OK(builder.FinishInternal(&result));
- return result;
- }
-
- // Other cases don't emit nulls and are therefore simpler.
- TypedBufferBuilder<T> builder(memory_pool);
-
- if (have_filter_nulls) {
- // The filter may have nulls, so we scan the validity bitmap and the filter
- // data bitmap together.
- DCHECK_EQ(null_selection, FilterOptions::DROP);
-
- // The position relative to the start of the filter
- T position = 0;
- // The current position taking the filter offset into account
- int64_t position_with_offset = filter.offset;
-
- BinaryBitBlockCounter filter_counter(filter_data, filter.offset, filter_is_valid,
- filter.offset, filter.length);
- while (position < filter.length) {
- BitBlockCount and_block = filter_counter.NextAndWord();
- RETURN_NOT_OK(builder.Reserve(and_block.popcount));
- if (and_block.AllSet()) {
- // All the values are selected and non-null
- for (int64_t i = 0; i < and_block.length; ++i) {
+ std::shared_ptr<ArrayData> result;
+ RETURN_NOT_OK(builder.FinishInternal(&result));
+ return result;
+ }
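A minimal scalar sketch of the EMIT_NULL rule above, with the block-wise fast paths stripped away (illustrative standalone code, not the Arrow internals):

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// For each filter slot: null filter -> emit null; valid && true -> emit the
// index; valid && false -> emit nothing. Mirrors the EMIT_NULL branch above.
std::vector<std::optional<int64_t>> EmitNullIndices(
    const std::vector<bool>& is_valid, const std::vector<bool>& data) {
  std::vector<std::optional<int64_t>> out;
  for (int64_t i = 0; i < static_cast<int64_t>(data.size()); ++i) {
    if (!is_valid[i]) {
      out.push_back(std::nullopt);  // null filter slot -> null index
    } else if (data[i]) {
      out.push_back(i);             // selected -> emit position
    }                               // valid && false -> skip
  }
  return out;
}

int main() {
  auto out = EmitNullIndices({true, false, true, true}, {true, true, false, true});
  for (const auto& v : out) {
    if (v) std::cout << *v << ' '; else std::cout << "null ";
  }
  std::cout << '\n';  // prints: 0 null 3
}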
+
+ // Other cases don't emit nulls and are therefore simpler.
+ TypedBufferBuilder<T> builder(memory_pool);
+
+ if (have_filter_nulls) {
+ // The filter may have nulls, so we scan the validity bitmap and the filter
+ // data bitmap together.
+ DCHECK_EQ(null_selection, FilterOptions::DROP);
+
+ // The position relative to the start of the filter
+ T position = 0;
+ // The current position taking the filter offset into account
+ int64_t position_with_offset = filter.offset;
+
+ BinaryBitBlockCounter filter_counter(filter_data, filter.offset, filter_is_valid,
+ filter.offset, filter.length);
+ while (position < filter.length) {
+ BitBlockCount and_block = filter_counter.NextAndWord();
+ RETURN_NOT_OK(builder.Reserve(and_block.popcount));
+ if (and_block.AllSet()) {
+ // All the values are selected and non-null
+ for (int64_t i = 0; i < and_block.length; ++i) {
builder.UnsafeAppend(position++);
}
- position_with_offset += and_block.length;
- } else if (!and_block.NoneSet()) {
- // Some of the values are false or null
- for (int64_t i = 0; i < and_block.length; ++i) {
- if (BitUtil::GetBit(filter_is_valid, position_with_offset) &&
- BitUtil::GetBit(filter_data, position_with_offset)) {
+ position_with_offset += and_block.length;
+ } else if (!and_block.NoneSet()) {
+ // Some of the values are false or null
+ for (int64_t i = 0; i < and_block.length; ++i) {
+ if (BitUtil::GetBit(filter_is_valid, position_with_offset) &&
+ BitUtil::GetBit(filter_data, position_with_offset)) {
builder.UnsafeAppend(position);
}
++position;
++position_with_offset;
}
} else {
- position += and_block.length;
- position_with_offset += and_block.length;
+ position += and_block.length;
+ position_with_offset += and_block.length;
}
}
- } else {
- // The filter has no nulls, so we need only look for true values
- RETURN_NOT_OK(::arrow::internal::VisitSetBitRuns(
- filter_data, filter.offset, filter.length, [&](int64_t offset, int64_t length) {
- // Append the consecutive run of indices
- RETURN_NOT_OK(builder.Reserve(length));
- for (int64_t i = 0; i < length; ++i) {
- builder.UnsafeAppend(static_cast<T>(offset + i));
- }
- return Status::OK();
- }));
+ } else {
+ // The filter has no nulls, so we need only look for true values
+ RETURN_NOT_OK(::arrow::internal::VisitSetBitRuns(
+ filter_data, filter.offset, filter.length, [&](int64_t offset, int64_t length) {
+ // Append the consecutive run of indices
+ RETURN_NOT_OK(builder.Reserve(length));
+ for (int64_t i = 0; i < length; ++i) {
+ builder.UnsafeAppend(static_cast<T>(offset + i));
+ }
+ return Status::OK();
+ }));
}
-
- const int64_t length = builder.length();
- std::shared_ptr<Buffer> out_buffer;
- RETURN_NOT_OK(builder.Finish(&out_buffer));
- return std::make_shared<ArrayData>(TypeTraits<IndexType>::type_singleton(), length,
- BufferVector{nullptr, out_buffer}, /*null_count=*/0);
+
+ const int64_t length = builder.length();
+ std::shared_ptr<Buffer> out_buffer;
+ RETURN_NOT_OK(builder.Finish(&out_buffer));
+ return std::make_shared<ArrayData>(TypeTraits<IndexType>::type_singleton(), length,
+ BufferVector{nullptr, out_buffer}, /*null_count=*/0);
}
-} // namespace
-
+} // namespace
+
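For contrast, the DROP path above reduces to a plain conjunction of the filter's validity and data bits; a standalone sketch under that assumption (illustrative names, not the Arrow API):

#include <cstdint>
#include <iostream>
#include <vector>

// DROP semantics: an index is emitted only when the filter slot is both
// valid and true, i.e. filter_is_valid[i] && filter_data[i].
std::vector<int64_t> DropNullIndices(const std::vector<bool>& is_valid,
                                     const std::vector<bool>& data) {
  std::vector<int64_t> out;
  for (int64_t i = 0; i < static_cast<int64_t>(data.size()); ++i) {
    if (is_valid[i] && data[i]) out.push_back(i);
  }
  return out;
}

int main() {
  for (int64_t i : DropNullIndices({true, false, true, true}, {true, true, false, true}))
    std::cout << i << ' ';  // prints: 0 3
  std::cout << '\n';
}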
Result<std::shared_ptr<ArrayData>> GetTakeIndices(
const ArrayData& filter, FilterOptions::NullSelectionBehavior null_selection,
MemoryPool* memory_pool) {
@@ -490,9 +490,9 @@ void TakeIndexDispatch(const PrimitiveArg& values, const PrimitiveArg& indices,
}
}
-Status PrimitiveTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status PrimitiveTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (TakeState::Get(ctx).boundscheck) {
- RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
+ RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
}
PrimitiveArg values = GetPrimitiveArg(*batch[0].array());
@@ -504,29 +504,29 @@ Status PrimitiveTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// allocating the validity bitmap altogether and save time and space. A
// streamlined PrimitiveTakeImpl would need to be written that skips all
// interactions with the output validity bitmap, though.
- RETURN_NOT_OK(PreallocateData(ctx, indices.length, values.bit_width,
- /*allocate_validity=*/true, out_arr));
+ RETURN_NOT_OK(PreallocateData(ctx, indices.length, values.bit_width,
+ /*allocate_validity=*/true, out_arr));
switch (values.bit_width) {
case 1:
- TakeIndexDispatch<BooleanTakeImpl>(values, indices, out_arr);
- break;
+ TakeIndexDispatch<BooleanTakeImpl>(values, indices, out_arr);
+ break;
case 8:
- TakeIndexDispatch<PrimitiveTakeImpl, int8_t>(values, indices, out_arr);
- break;
+ TakeIndexDispatch<PrimitiveTakeImpl, int8_t>(values, indices, out_arr);
+ break;
case 16:
- TakeIndexDispatch<PrimitiveTakeImpl, int16_t>(values, indices, out_arr);
- break;
+ TakeIndexDispatch<PrimitiveTakeImpl, int16_t>(values, indices, out_arr);
+ break;
case 32:
- TakeIndexDispatch<PrimitiveTakeImpl, int32_t>(values, indices, out_arr);
- break;
+ TakeIndexDispatch<PrimitiveTakeImpl, int32_t>(values, indices, out_arr);
+ break;
case 64:
- TakeIndexDispatch<PrimitiveTakeImpl, int64_t>(values, indices, out_arr);
- break;
+ TakeIndexDispatch<PrimitiveTakeImpl, int64_t>(values, indices, out_arr);
+ break;
default:
DCHECK(false) << "Invalid values byte width";
break;
}
- return Status::OK();
+ return Status::OK();
}
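The switch above dispatches on physical bit width rather than logical type, so a single instantiation serves every 32-bit type. A standalone sketch of the same trick (hypothetical helper names, not the Arrow code):

#include <cstdint>
#include <iostream>
#include <vector>

// Copy values[indices[i]] into out, treating the data as opaque CType words.
// One instantiation per width serves all logical types of that width.
template <typename CType>
void TakeImpl(const void* values, const std::vector<int64_t>& indices, void* out) {
  const CType* in = static_cast<const CType*>(values);
  CType* dst = static_cast<CType*>(out);
  for (size_t i = 0; i < indices.size(); ++i) dst[i] = in[indices[i]];
}

void TakeDispatch(int bit_width, const void* values,
                  const std::vector<int64_t>& indices, void* out) {
  switch (bit_width) {
    case 8:  TakeImpl<uint8_t>(values, indices, out); break;
    case 16: TakeImpl<uint16_t>(values, indices, out); break;
    case 32: TakeImpl<uint32_t>(values, indices, out); break;
    case 64: TakeImpl<uint64_t>(values, indices, out); break;
    default: break;  // bit width 1 (boolean) needs bit-level handling
  }
}

int main() {
  std::vector<int32_t> values{10, 20, 30, 40};
  std::vector<int64_t> indices{3, 0, 2};
  std::vector<int32_t> out(indices.size());
  TakeDispatch(32, values.data(), indices, out.data());
  for (int32_t v : out) std::cout << v << ' ';  // prints: 40 10 30
  std::cout << '\n';
}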
// ----------------------------------------------------------------------
@@ -597,9 +597,9 @@ class PrimitiveFilterImpl {
void ExecNonNull() {
// Fast filter when values and filter are not null
- ::arrow::internal::VisitSetBitRunsVoid(
- filter_data_, filter_offset_, values_length_,
- [&](int64_t position, int64_t length) { WriteValueSegment(position, length); });
+ ::arrow::internal::VisitSetBitRunsVoid(
+ filter_data_, filter_offset_, values_length_,
+ [&](int64_t position, int64_t length) { WriteValueSegment(position, length); });
}
void Exec() {
@@ -783,7 +783,7 @@ inline void PrimitiveFilterImpl<BooleanType>::WriteNull() {
BitUtil::ClearBit(out_data_, out_offset_ + out_position_++);
}
-Status PrimitiveFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status PrimitiveFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
PrimitiveArg values = GetPrimitiveArg(*batch[0].array());
PrimitiveArg filter = GetPrimitiveArg(*batch[1].array());
FilterOptions::NullSelectionBehavior null_selection =
@@ -808,30 +808,30 @@ Status PrimitiveFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// validity bitmap.
bool allocate_validity = values.null_count != 0 || filter.null_count != 0;
- RETURN_NOT_OK(
- PreallocateData(ctx, output_length, values.bit_width, allocate_validity, out_arr));
+ RETURN_NOT_OK(
+ PreallocateData(ctx, output_length, values.bit_width, allocate_validity, out_arr));
switch (values.bit_width) {
case 1:
- PrimitiveFilterImpl<BooleanType>(values, filter, null_selection, out_arr).Exec();
- break;
+ PrimitiveFilterImpl<BooleanType>(values, filter, null_selection, out_arr).Exec();
+ break;
case 8:
- PrimitiveFilterImpl<UInt8Type>(values, filter, null_selection, out_arr).Exec();
- break;
+ PrimitiveFilterImpl<UInt8Type>(values, filter, null_selection, out_arr).Exec();
+ break;
case 16:
- PrimitiveFilterImpl<UInt16Type>(values, filter, null_selection, out_arr).Exec();
- break;
+ PrimitiveFilterImpl<UInt16Type>(values, filter, null_selection, out_arr).Exec();
+ break;
case 32:
- PrimitiveFilterImpl<UInt32Type>(values, filter, null_selection, out_arr).Exec();
- break;
+ PrimitiveFilterImpl<UInt32Type>(values, filter, null_selection, out_arr).Exec();
+ break;
case 64:
- PrimitiveFilterImpl<UInt64Type>(values, filter, null_selection, out_arr).Exec();
- break;
+ PrimitiveFilterImpl<UInt64Type>(values, filter, null_selection, out_arr).Exec();
+ break;
default:
DCHECK(false) << "Invalid values bit width";
break;
}
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
@@ -880,25 +880,25 @@ Status BinaryFilterNonNullImpl(KernelContext* ctx, const ArrayData& values,
ArrayData* out) {
using offset_type = typename Type::offset_type;
const auto filter_data = filter.buffers[1]->data();
-
+
BINARY_FILTER_SETUP_COMMON();
- RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
- filter_data, filter.offset, filter.length, [&](int64_t position, int64_t length) {
+ RETURN_NOT_OK(arrow::internal::VisitSetBitRuns(
+ filter_data, filter.offset, filter.length, [&](int64_t position, int64_t length) {
// Bulk-append raw data
- const offset_type run_data_bytes =
- (raw_offsets[position + length] - raw_offsets[position]);
- APPEND_RAW_DATA(raw_data + raw_offsets[position], run_data_bytes);
+ const offset_type run_data_bytes =
+ (raw_offsets[position + length] - raw_offsets[position]);
+ APPEND_RAW_DATA(raw_data + raw_offsets[position], run_data_bytes);
// Append offsets
- offset_type cur_offset = raw_offsets[position];
- for (int64_t i = 0; i < length; ++i) {
+ offset_type cur_offset = raw_offsets[position];
+ for (int64_t i = 0; i < length; ++i) {
offset_builder.UnsafeAppend(offset);
- offset += raw_offsets[i + position + 1] - cur_offset;
- cur_offset = raw_offsets[i + position + 1];
+ offset += raw_offsets[i + position + 1] - cur_offset;
+ cur_offset = raw_offsets[i + position + 1];
}
- return Status::OK();
- }));
-
+ return Status::OK();
+ }));
+
offset_builder.UnsafeAppend(offset);
out->length = output_length;
RETURN_NOT_OK(offset_builder.Finish(&out->buffers[1]));
@@ -936,8 +936,8 @@ Status BinaryFilterImpl(KernelContext* ctx, const ArrayData& values,
BINARY_FILTER_SETUP_COMMON();
- int64_t in_position = 0;
- int64_t out_position = 0;
+ int64_t in_position = 0;
+ int64_t out_position = 0;
while (in_position < filter.length) {
BitBlockCount filter_valid_block = filter_valid_counter.NextWord();
BitBlockCount values_valid_block = values_valid_counter.NextWord();
@@ -1079,7 +1079,7 @@ Status BinaryFilterImpl(KernelContext* ctx, const ArrayData& values,
#undef APPEND_RAW_DATA
#undef APPEND_SINGLE_VALUE
-Status BinaryFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status BinaryFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
FilterOptions::NullSelectionBehavior null_selection =
FilterState::Get(ctx).null_selection_behavior;
@@ -1101,100 +1101,100 @@ Status BinaryFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (values.null_count == 0 && filter.null_count == 0) {
// Faster no-nulls case
if (is_binary_like(type_id)) {
- RETURN_NOT_OK(BinaryFilterNonNullImpl<BinaryType>(
- ctx, values, filter, output_length, null_selection, out_arr));
+ RETURN_NOT_OK(BinaryFilterNonNullImpl<BinaryType>(
+ ctx, values, filter, output_length, null_selection, out_arr));
} else if (is_large_binary_like(type_id)) {
- RETURN_NOT_OK(BinaryFilterNonNullImpl<LargeBinaryType>(
- ctx, values, filter, output_length, null_selection, out_arr));
+ RETURN_NOT_OK(BinaryFilterNonNullImpl<LargeBinaryType>(
+ ctx, values, filter, output_length, null_selection, out_arr));
} else {
DCHECK(false);
}
} else {
// Output may have nulls
- RETURN_NOT_OK(ctx->AllocateBitmap(output_length).Value(&out_arr->buffers[0]));
+ RETURN_NOT_OK(ctx->AllocateBitmap(output_length).Value(&out_arr->buffers[0]));
if (is_binary_like(type_id)) {
- RETURN_NOT_OK(BinaryFilterImpl<BinaryType>(ctx, values, filter, output_length,
- null_selection, out_arr));
+ RETURN_NOT_OK(BinaryFilterImpl<BinaryType>(ctx, values, filter, output_length,
+ null_selection, out_arr));
} else if (is_large_binary_like(type_id)) {
- RETURN_NOT_OK(BinaryFilterImpl<LargeBinaryType>(ctx, values, filter, output_length,
- null_selection, out_arr));
+ RETURN_NOT_OK(BinaryFilterImpl<LargeBinaryType>(ctx, values, filter, output_length,
+ null_selection, out_arr));
} else {
DCHECK(false);
}
}
-
- return Status::OK();
+
+ return Status::OK();
}
// ----------------------------------------------------------------------
// Null take and filter
-Status NullTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status NullTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (TakeState::Get(ctx).boundscheck) {
- RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
+ RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
}
// batch.length doesn't take into account the take indices
auto new_length = batch[1].array()->length;
out->value = std::make_shared<NullArray>(new_length)->data();
- return Status::OK();
+ return Status::OK();
}
-Status NullFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status NullFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
int64_t output_length = GetFilterOutputSize(
*batch[1].array(), FilterState::Get(ctx).null_selection_behavior);
out->value = std::make_shared<NullArray>(output_length)->data();
- return Status::OK();
+ return Status::OK();
}
// ----------------------------------------------------------------------
// Dictionary take and filter
-Status DictionaryTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status DictionaryTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DictionaryArray values(batch[0].array());
Datum result;
- RETURN_NOT_OK(
- Take(Datum(values.indices()), batch[1], TakeState::Get(ctx), ctx->exec_context())
- .Value(&result));
+ RETURN_NOT_OK(
+ Take(Datum(values.indices()), batch[1], TakeState::Get(ctx), ctx->exec_context())
+ .Value(&result));
DictionaryArray taken_values(values.type(), result.make_array(), values.dictionary());
out->value = taken_values.data();
- return Status::OK();
+ return Status::OK();
}
-Status DictionaryFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status DictionaryFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
DictionaryArray dict_values(batch[0].array());
Datum result;
- RETURN_NOT_OK(Filter(Datum(dict_values.indices()), batch[1].array(),
- FilterState::Get(ctx), ctx->exec_context())
- .Value(&result));
+ RETURN_NOT_OK(Filter(Datum(dict_values.indices()), batch[1].array(),
+ FilterState::Get(ctx), ctx->exec_context())
+ .Value(&result));
DictionaryArray filtered_values(dict_values.type(), result.make_array(),
dict_values.dictionary());
out->value = filtered_values.data();
- return Status::OK();
+ return Status::OK();
}
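Both dictionary kernels rely on the same invariant: selection touches only the index buffer, while the dictionary itself is shared unchanged. A toy sketch of that idea with plain vectors (not the Arrow types):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct ToyDictArray {
  std::vector<std::string> dictionary;  // shared, never copied by selection
  std::vector<int32_t> indices;         // the only buffer a filter touches
};

// Filtering keeps the dictionary and filters just the index buffer.
ToyDictArray FilterDict(const ToyDictArray& arr, const std::vector<bool>& mask) {
  ToyDictArray out{arr.dictionary, {}};
  for (size_t i = 0; i < mask.size(); ++i)
    if (mask[i]) out.indices.push_back(arr.indices[i]);
  return out;
}

int main() {
  ToyDictArray arr{{"a", "b", "c"}, {0, 2, 1, 2}};
  ToyDictArray out = FilterDict(arr, {true, false, true, true});
  for (int32_t i : out.indices) std::cout << out.dictionary[i] << ' ';  // a b c
  std::cout << '\n';
}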
// ----------------------------------------------------------------------
// Extension take and filter
-Status ExtensionTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status ExtensionTake(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
ExtensionArray values(batch[0].array());
Datum result;
- RETURN_NOT_OK(
- Take(Datum(values.storage()), batch[1], TakeState::Get(ctx), ctx->exec_context())
- .Value(&result));
+ RETURN_NOT_OK(
+ Take(Datum(values.storage()), batch[1], TakeState::Get(ctx), ctx->exec_context())
+ .Value(&result));
ExtensionArray taken_values(values.type(), result.make_array());
out->value = taken_values.data();
- return Status::OK();
+ return Status::OK();
}
-Status ExtensionFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status ExtensionFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
ExtensionArray ext_values(batch[0].array());
Datum result;
- RETURN_NOT_OK(Filter(Datum(ext_values.storage()), batch[1].array(),
- FilterState::Get(ctx), ctx->exec_context())
- .Value(&result));
+ RETURN_NOT_OK(Filter(Datum(ext_values.storage()), batch[1].array(),
+ FilterState::Get(ctx), ctx->exec_context())
+ .Value(&result));
ExtensionArray filtered_values(ext_values.type(), result.make_array());
out->value = filtered_values.data();
- return Status::OK();
+ return Status::OK();
}
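The extension kernels follow the same unwrap/select/rewrap shape, but over the storage array rather than dictionary indices. A toy sketch under that reading (illustrative types only):

#include <iostream>
#include <string>
#include <vector>

// An extension array is a logical wrapper over a physical storage array.
struct ToyExtensionArray {
  std::string type_name;        // carried through unchanged
  std::vector<double> storage;  // selection operates on this
};

ToyExtensionArray FilterExtension(const ToyExtensionArray& arr,
                                  const std::vector<bool>& mask) {
  ToyExtensionArray out{arr.type_name, {}};
  for (size_t i = 0; i < mask.size(); ++i)
    if (mask[i]) out.storage.push_back(arr.storage[i]);  // select storage
  return out;                                            // rewrap with same type
}

int main() {
  ToyExtensionArray arr{"uuid", {1.5, 2.5, 3.5}};
  auto out = FilterExtension(arr, {true, false, true});
  for (double v : out.storage) std::cout << v << ' ';  // prints: 1.5 3.5
  std::cout << '\n';
}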
// ----------------------------------------------------------------------
@@ -1668,81 +1668,81 @@ struct ListImpl : public Selection<ListImpl<Type>, Type> {
}
};
-struct DenseUnionImpl : public Selection<DenseUnionImpl, DenseUnionType> {
- using Base = Selection<DenseUnionImpl, DenseUnionType>;
- LIFT_BASE_MEMBERS();
-
- TypedBufferBuilder<int32_t> value_offset_buffer_builder_;
- TypedBufferBuilder<int8_t> child_id_buffer_builder_;
- std::vector<int8_t> type_codes_;
- std::vector<Int32Builder> child_indices_builders_;
-
- DenseUnionImpl(KernelContext* ctx, const ExecBatch& batch, int64_t output_length,
- Datum* out)
- : Base(ctx, batch, output_length, out),
- value_offset_buffer_builder_(ctx->memory_pool()),
- child_id_buffer_builder_(ctx->memory_pool()),
- type_codes_(checked_cast<const UnionType&>(*this->values->type).type_codes()),
- child_indices_builders_(type_codes_.size()) {
- for (auto& child_indices_builder : child_indices_builders_) {
- child_indices_builder = Int32Builder(ctx->memory_pool());
- }
- }
-
- template <typename Adapter>
- Status GenerateOutput() {
- DenseUnionArray typed_values(this->values);
- Adapter adapter(this);
- RETURN_NOT_OK(adapter.Generate(
- [&](int64_t index) {
- int8_t child_id = typed_values.child_id(index);
- child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]);
- int32_t value_offset = typed_values.value_offset(index);
- value_offset_buffer_builder_.UnsafeAppend(
- static_cast<int32_t>(child_indices_builders_[child_id].length()));
- RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1));
- child_indices_builders_[child_id].UnsafeAppend(value_offset);
- return Status::OK();
- },
- [&]() {
- int8_t child_id = 0;
- child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]);
- value_offset_buffer_builder_.UnsafeAppend(
- static_cast<int32_t>(child_indices_builders_[child_id].length()));
- RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1));
- child_indices_builders_[child_id].UnsafeAppendNull();
- return Status::OK();
- }));
- return Status::OK();
- }
-
- Status Init() override {
- RETURN_NOT_OK(child_id_buffer_builder_.Reserve(output_length));
- RETURN_NOT_OK(value_offset_buffer_builder_.Reserve(output_length));
- return Status::OK();
- }
-
- Status Finish() override {
- ARROW_ASSIGN_OR_RAISE(auto child_ids_buffer, child_id_buffer_builder_.Finish());
- ARROW_ASSIGN_OR_RAISE(auto value_offsets_buffer,
- value_offset_buffer_builder_.Finish());
- DenseUnionArray typed_values(this->values);
- auto num_fields = typed_values.num_fields();
- auto num_rows = child_ids_buffer->size();
- BufferVector buffers{nullptr, std::move(child_ids_buffer),
- std::move(value_offsets_buffer)};
- *out = ArrayData(typed_values.type(), num_rows, std::move(buffers), 0);
- for (auto i = 0; i < num_fields; i++) {
- ARROW_ASSIGN_OR_RAISE(auto child_indices_array,
- child_indices_builders_[i].Finish());
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> child_array,
- Take(*typed_values.field(i), *child_indices_array));
- out->child_data.push_back(child_array->data());
- }
- return Status::OK();
- }
-};
-
+struct DenseUnionImpl : public Selection<DenseUnionImpl, DenseUnionType> {
+ using Base = Selection<DenseUnionImpl, DenseUnionType>;
+ LIFT_BASE_MEMBERS();
+
+ TypedBufferBuilder<int32_t> value_offset_buffer_builder_;
+ TypedBufferBuilder<int8_t> child_id_buffer_builder_;
+ std::vector<int8_t> type_codes_;
+ std::vector<Int32Builder> child_indices_builders_;
+
+ DenseUnionImpl(KernelContext* ctx, const ExecBatch& batch, int64_t output_length,
+ Datum* out)
+ : Base(ctx, batch, output_length, out),
+ value_offset_buffer_builder_(ctx->memory_pool()),
+ child_id_buffer_builder_(ctx->memory_pool()),
+ type_codes_(checked_cast<const UnionType&>(*this->values->type).type_codes()),
+ child_indices_builders_(type_codes_.size()) {
+ for (auto& child_indices_builder : child_indices_builders_) {
+ child_indices_builder = Int32Builder(ctx->memory_pool());
+ }
+ }
+
+ template <typename Adapter>
+ Status GenerateOutput() {
+ DenseUnionArray typed_values(this->values);
+ Adapter adapter(this);
+ RETURN_NOT_OK(adapter.Generate(
+ [&](int64_t index) {
+ int8_t child_id = typed_values.child_id(index);
+ child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]);
+ int32_t value_offset = typed_values.value_offset(index);
+ value_offset_buffer_builder_.UnsafeAppend(
+ static_cast<int32_t>(child_indices_builders_[child_id].length()));
+ RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1));
+ child_indices_builders_[child_id].UnsafeAppend(value_offset);
+ return Status::OK();
+ },
+ [&]() {
+ int8_t child_id = 0;
+ child_id_buffer_builder_.UnsafeAppend(type_codes_[child_id]);
+ value_offset_buffer_builder_.UnsafeAppend(
+ static_cast<int32_t>(child_indices_builders_[child_id].length()));
+ RETURN_NOT_OK(child_indices_builders_[child_id].Reserve(1));
+ child_indices_builders_[child_id].UnsafeAppendNull();
+ return Status::OK();
+ }));
+ return Status::OK();
+ }
+
+ Status Init() override {
+ RETURN_NOT_OK(child_id_buffer_builder_.Reserve(output_length));
+ RETURN_NOT_OK(value_offset_buffer_builder_.Reserve(output_length));
+ return Status::OK();
+ }
+
+ Status Finish() override {
+ ARROW_ASSIGN_OR_RAISE(auto child_ids_buffer, child_id_buffer_builder_.Finish());
+ ARROW_ASSIGN_OR_RAISE(auto value_offsets_buffer,
+ value_offset_buffer_builder_.Finish());
+ DenseUnionArray typed_values(this->values);
+ auto num_fields = typed_values.num_fields();
+ auto num_rows = child_ids_buffer->size();
+ BufferVector buffers{nullptr, std::move(child_ids_buffer),
+ std::move(value_offsets_buffer)};
+ *out = ArrayData(typed_values.type(), num_rows, std::move(buffers), 0);
+ for (auto i = 0; i < num_fields; i++) {
+ ARROW_ASSIGN_OR_RAISE(auto child_indices_array,
+ child_indices_builders_[i].Finish());
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> child_array,
+ Take(*typed_values.field(i), *child_indices_array));
+ out->child_data.push_back(child_array->data());
+ }
+ return Status::OK();
+ }
+};
+
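A dense union row is a (type code, offset-into-child) pair, so taking rows means routing each row's old child offset into a per-child gather list and renumbering offsets densely, as GenerateOutput does above. A compact standalone sketch of the regrouping (toy types; the real kernel also handles nulls and builder reservation):

#include <cstdint>
#include <iostream>
#include <vector>

struct ToyDenseUnion {
  std::vector<int8_t> child_ids;       // which child each row lives in
  std::vector<int32_t> value_offsets;  // row's offset inside that child
};

// Selecting rows: keep the child id, but the new value_offset is the number
// of rows already routed to that child; the old offset goes into a per-child
// gather list so each child array can be Take()-n afterwards.
ToyDenseUnion TakeUnion(const ToyDenseUnion& u, const std::vector<int64_t>& rows,
                        std::vector<std::vector<int32_t>>* child_take_indices) {
  ToyDenseUnion out;
  for (int64_t row : rows) {
    int8_t child = u.child_ids[row];
    out.child_ids.push_back(child);
    auto& per_child = (*child_take_indices)[child];
    out.value_offsets.push_back(static_cast<int32_t>(per_child.size()));
    per_child.push_back(u.value_offsets[row]);
  }
  return out;
}

int main() {
  ToyDenseUnion u{{0, 1, 0, 1}, {0, 0, 1, 1}};
  std::vector<std::vector<int32_t>> gather(2);
  ToyDenseUnion out = TakeUnion(u, {3, 0}, &gather);
  for (size_t i = 0; i < out.child_ids.size(); ++i)
    std::cout << static_cast<int>(out.child_ids[i]) << ':' << out.value_offsets[i] << ' ';
  std::cout << '\n';  // prints: 1:0 0:0
}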
struct FSLImpl : public Selection<FSLImpl, FixedSizeListType> {
Int64Builder child_index_builder;
@@ -1827,20 +1827,20 @@ struct StructImpl : public Selection<StructImpl, StructType> {
}
};
-Status StructFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status StructFilter(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// Transform filter to selection indices and then use Take.
std::shared_ptr<ArrayData> indices;
- RETURN_NOT_OK(GetTakeIndices(*batch[1].array(),
- FilterState::Get(ctx).null_selection_behavior,
- ctx->memory_pool())
- .Value(&indices));
+ RETURN_NOT_OK(GetTakeIndices(*batch[1].array(),
+ FilterState::Get(ctx).null_selection_behavior,
+ ctx->memory_pool())
+ .Value(&indices));
Datum result;
- RETURN_NOT_OK(
- Take(batch[0], Datum(indices), TakeOptions::NoBoundsCheck(), ctx->exec_context())
- .Value(&result));
+ RETURN_NOT_OK(
+ Take(batch[0], Datum(indices), TakeOptions::NoBoundsCheck(), ctx->exec_context())
+ .Value(&result));
out->value = result.array();
- return Status::OK();
+ return Status::OK();
}
#undef LIFT_BASE_MEMBERS
@@ -1860,15 +1860,15 @@ Result<std::shared_ptr<RecordBatch>> FilterRecordBatch(const RecordBatch& batch,
const auto& filter_opts = *static_cast<const FilterOptions*>(options);
ARROW_ASSIGN_OR_RAISE(
std::shared_ptr<ArrayData> indices,
- GetTakeIndices(*filter.array(), filter_opts.null_selection_behavior,
- ctx->memory_pool()));
+ GetTakeIndices(*filter.array(), filter_opts.null_selection_behavior,
+ ctx->memory_pool()));
std::vector<std::shared_ptr<Array>> columns(batch.num_columns());
for (int i = 0; i < batch.num_columns(); ++i) {
ARROW_ASSIGN_OR_RAISE(Datum out, Take(batch.column(i)->data(), Datum(indices),
TakeOptions::NoBoundsCheck(), ctx));
columns[i] = out.make_array();
}
- return RecordBatch::Make(batch.schema(), indices->length, std::move(columns));
+ return RecordBatch::Make(batch.schema(), indices->length, std::move(columns));
}
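FilterRecordBatch converts the boolean filter into take indices once and reuses them for every column, so the filter bitmap is scanned a single time regardless of column count. The same pattern in miniature (plain vectors standing in for columns):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<std::vector<int>> columns{{1, 2, 3, 4}, {10, 20, 30, 40}};
  std::vector<bool> filter{true, false, false, true};

  // Step 1: materialize the selection once, as indices.
  std::vector<int64_t> indices;
  for (int64_t i = 0; i < static_cast<int64_t>(filter.size()); ++i)
    if (filter[i]) indices.push_back(i);

  // Step 2: gather every column through the same indices (the cheap part).
  for (auto& col : columns) {
    std::vector<int> taken;
    for (int64_t i : indices) taken.push_back(col[i]);
    col = std::move(taken);
  }
  std::cout << columns[0][1] << ' ' << columns[1][1] << '\n';  // prints: 4 40
}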
Result<std::shared_ptr<Table>> FilterTable(const Table& table, const Datum& filter,
@@ -1877,82 +1877,82 @@ Result<std::shared_ptr<Table>> FilterTable(const Table& table, const Datum& filt
if (table.num_rows() != filter.length()) {
return Status::Invalid("Filter inputs must all be the same length");
}
- if (table.num_rows() == 0) {
- return Table::Make(table.schema(), table.columns(), 0);
- }
-
- // Last input element will be the filter array
- const int num_columns = table.num_columns();
- std::vector<ArrayVector> inputs(num_columns + 1);
-
- // Fetch table columns
- for (int i = 0; i < num_columns; ++i) {
- inputs[i] = table.column(i)->chunks();
- }
- // Fetch filter
- const auto& filter_opts = *static_cast<const FilterOptions*>(options);
- switch (filter.kind()) {
- case Datum::ARRAY:
- inputs.back().push_back(filter.make_array());
- break;
- case Datum::CHUNKED_ARRAY:
- inputs.back() = filter.chunked_array()->chunks();
- break;
- default:
- return Status::NotImplemented("Filter should be array-like");
- }
-
- // Rechunk inputs to allow consistent iteration over their respective chunks
- inputs = arrow::internal::RechunkArraysConsistently(inputs);
-
- // Instead of filtering each column with the boolean filter
- // (which would be slow if the table has a large number of columns: ARROW-10569),
- // convert each filter chunk to indices, and take() the column.
- const int64_t num_chunks = static_cast<int64_t>(inputs.back().size());
- std::vector<ArrayVector> out_columns(num_columns);
- int64_t out_num_rows = 0;
-
- for (int64_t i = 0; i < num_chunks; ++i) {
- const ArrayData& filter_chunk = *inputs.back()[i]->data();
+ if (table.num_rows() == 0) {
+ return Table::Make(table.schema(), table.columns(), 0);
+ }
+
+ // Last input element will be the filter array
+ const int num_columns = table.num_columns();
+ std::vector<ArrayVector> inputs(num_columns + 1);
+
+ // Fetch table columns
+ for (int i = 0; i < num_columns; ++i) {
+ inputs[i] = table.column(i)->chunks();
+ }
+ // Fetch filter
+ const auto& filter_opts = *static_cast<const FilterOptions*>(options);
+ switch (filter.kind()) {
+ case Datum::ARRAY:
+ inputs.back().push_back(filter.make_array());
+ break;
+ case Datum::CHUNKED_ARRAY:
+ inputs.back() = filter.chunked_array()->chunks();
+ break;
+ default:
+ return Status::NotImplemented("Filter should be array-like");
+ }
+
+ // Rechunk inputs to allow consistent iteration over their respective chunks
+ inputs = arrow::internal::RechunkArraysConsistently(inputs);
+
+ // Instead of filtering each column with the boolean filter
+ // (which would be slow if the table has a large number of columns: ARROW-10569),
+ // convert each filter chunk to indices, and take() the column.
+ const int64_t num_chunks = static_cast<int64_t>(inputs.back().size());
+ std::vector<ArrayVector> out_columns(num_columns);
+ int64_t out_num_rows = 0;
+
+ for (int64_t i = 0; i < num_chunks; ++i) {
+ const ArrayData& filter_chunk = *inputs.back()[i]->data();
ARROW_ASSIGN_OR_RAISE(
- const auto indices,
- GetTakeIndices(filter_chunk, filter_opts.null_selection_behavior,
- ctx->memory_pool()));
-
- if (indices->length > 0) {
- // Take from all input columns
- Datum indices_datum{std::move(indices)};
- for (int col = 0; col < num_columns; ++col) {
- const auto& column_chunk = inputs[col][i];
- ARROW_ASSIGN_OR_RAISE(Datum out, Take(column_chunk, indices_datum,
- TakeOptions::NoBoundsCheck(), ctx));
- out_columns[col].push_back(std::move(out).make_array());
- }
- out_num_rows += indices->length;
- }
+ const auto indices,
+ GetTakeIndices(filter_chunk, filter_opts.null_selection_behavior,
+ ctx->memory_pool()));
+
+ if (indices->length > 0) {
+ // Take from all input columns
+ Datum indices_datum{std::move(indices)};
+ for (int col = 0; col < num_columns; ++col) {
+ const auto& column_chunk = inputs[col][i];
+ ARROW_ASSIGN_OR_RAISE(Datum out, Take(column_chunk, indices_datum,
+ TakeOptions::NoBoundsCheck(), ctx));
+ out_columns[col].push_back(std::move(out).make_array());
+ }
+ out_num_rows += indices->length;
+ }
}
-
- ChunkedArrayVector out_chunks(num_columns);
- for (int i = 0; i < num_columns; ++i) {
- out_chunks[i] = std::make_shared<ChunkedArray>(std::move(out_columns[i]),
- table.column(i)->type());
- }
- return Table::Make(table.schema(), std::move(out_chunks), out_num_rows);
+
+ ChunkedArrayVector out_chunks(num_columns);
+ for (int i = 0; i < num_columns; ++i) {
+ out_chunks[i] = std::make_shared<ChunkedArray>(std::move(out_columns[i]),
+ table.column(i)->type());
+ }
+ return Table::Make(table.schema(), std::move(out_chunks), out_num_rows);
}
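FilterTable first rechunks the columns and the filter to share chunk boundaries, reducing the problem to the record-batch case applied chunk by chunk. Assuming the chunks are already aligned, the loop is essentially (toy sketch):

#include <cstdint>
#include <iostream>
#include <vector>

using Chunk = std::vector<int>;

int main() {
  // Two columns, each split into the same two chunks; filter chunked identically.
  std::vector<std::vector<Chunk>> columns{{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}};
  std::vector<std::vector<bool>> filter_chunks{{true, false}, {false, true}};

  int64_t out_rows = 0;
  for (size_t c = 0; c < filter_chunks.size(); ++c) {
    // Per-chunk indices, computed once and shared by all columns (ARROW-10569).
    std::vector<int64_t> indices;
    for (int64_t i = 0; i < static_cast<int64_t>(filter_chunks[c].size()); ++i)
      if (filter_chunks[c][i]) indices.push_back(i);
    for (auto& column : columns) {
      Chunk taken;
      for (int64_t i : indices) taken.push_back(column[c][i]);
      column[c] = std::move(taken);
    }
    out_rows += static_cast<int64_t>(indices.size());
  }
  std::cout << out_rows << '\n';  // prints: 2
}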
static auto kDefaultFilterOptions = FilterOptions::Defaults();
-const FunctionDoc filter_doc(
- "Filter with a boolean selection filter",
- ("The output is populated with values from the input at positions\n"
- "where the selection filter is non-zero. Nulls in the selection filter\n"
- "are handled based on FilterOptions."),
- {"input", "selection_filter"}, "FilterOptions");
-
+const FunctionDoc filter_doc(
+ "Filter with a boolean selection filter",
+ ("The output is populated with values from the input at positions\n"
+ "where the selection filter is non-zero. Nulls in the selection filter\n"
+ "are handled based on FilterOptions."),
+ {"input", "selection_filter"}, "FilterOptions");
+
class FilterMetaFunction : public MetaFunction {
public:
FilterMetaFunction()
- : MetaFunction("filter", Arity::Binary(), &filter_doc, &kDefaultFilterOptions) {}
+ : MetaFunction("filter", Arity::Binary(), &filter_doc, &kDefaultFilterOptions) {}
Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
const FunctionOptions* options,
@@ -2061,7 +2061,7 @@ Result<std::shared_ptr<RecordBatch>> TakeRA(const RecordBatch& batch,
for (int j = 0; j < ncols; j++) {
ARROW_ASSIGN_OR_RAISE(columns[j], TakeAA(*batch.column(j), indices, options, ctx));
}
- return RecordBatch::Make(batch.schema(), nrows, std::move(columns));
+ return RecordBatch::Make(batch.schema(), nrows, std::move(columns));
}
Result<std::shared_ptr<Table>> TakeTA(const Table& table, const Array& indices,
@@ -2072,7 +2072,7 @@ Result<std::shared_ptr<Table>> TakeTA(const Table& table, const Array& indices,
for (int j = 0; j < ncols; j++) {
ARROW_ASSIGN_OR_RAISE(columns[j], TakeCA(*table.column(j), indices, options, ctx));
}
- return Table::Make(table.schema(), std::move(columns));
+ return Table::Make(table.schema(), std::move(columns));
}
Result<std::shared_ptr<Table>> TakeTC(const Table& table, const ChunkedArray& indices,
@@ -2082,17 +2082,17 @@ Result<std::shared_ptr<Table>> TakeTC(const Table& table, const ChunkedArray& in
for (int j = 0; j < ncols; j++) {
ARROW_ASSIGN_OR_RAISE(columns[j], TakeCC(*table.column(j), indices, options, ctx));
}
- return Table::Make(table.schema(), std::move(columns));
+ return Table::Make(table.schema(), std::move(columns));
}
static auto kDefaultTakeOptions = TakeOptions::Defaults();
-const FunctionDoc take_doc(
- "Select values from an input based on indices from another array",
- ("The output is populated with values from the input at positions\n"
- "given by `indices`. Nulls in `indices` emit null in the output."),
- {"input", "indices"}, "TakeOptions");
-
+const FunctionDoc take_doc(
+ "Select values from an input based on indices from another array",
+ ("The output is populated with values from the input at positions\n"
+ "given by `indices`. Nulls in `indices` emit null in the output."),
+ {"input", "indices"}, "TakeOptions");
+
// Metafunction for dispatching to different Take implementations other than
// Array-Array.
//
@@ -2100,8 +2100,8 @@ const FunctionDoc take_doc(
// overly complex dispatching, there is no parallelization.
class TakeMetaFunction : public MetaFunction {
public:
- TakeMetaFunction()
- : MetaFunction("take", Arity::Binary(), &take_doc, &kDefaultTakeOptions) {}
+ TakeMetaFunction()
+ : MetaFunction("take", Arity::Binary(), &take_doc, &kDefaultTakeOptions) {}
Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
const FunctionOptions* options,
@@ -2149,21 +2149,21 @@ class TakeMetaFunction : public MetaFunction {
// ----------------------------------------------------------------------
template <typename Impl>
-Status FilterExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status FilterExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
  // TODO: where is the values/filter length equality checked?
int64_t output_length = GetFilterOutputSize(
*batch[1].array(), FilterState::Get(ctx).null_selection_behavior);
Impl kernel(ctx, batch, output_length, out);
- return kernel.ExecFilter();
+ return kernel.ExecFilter();
}
template <typename Impl>
-Status TakeExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+Status TakeExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
if (TakeState::Get(ctx).boundscheck) {
- RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
+ RETURN_NOT_OK(CheckIndexBounds(*batch[1].array(), batch[0].length()));
}
Impl kernel(ctx, batch, /*output_length=*/batch[1].length(), out);
- return kernel.ExecTake();
+ return kernel.ExecTake();
}
struct SelectionKernelDescr {
@@ -2171,13 +2171,13 @@ struct SelectionKernelDescr {
ArrayKernelExec exec;
};
-void RegisterSelectionFunction(const std::string& name, const FunctionDoc* doc,
- VectorKernel base_kernel, InputType selection_type,
+void RegisterSelectionFunction(const std::string& name, const FunctionDoc* doc,
+ VectorKernel base_kernel, InputType selection_type,
const std::vector<SelectionKernelDescr>& descrs,
const FunctionOptions* default_options,
FunctionRegistry* registry) {
- auto func =
- std::make_shared<VectorFunction>(name, Arity::Binary(), doc, default_options);
+ auto func =
+ std::make_shared<VectorFunction>(name, Arity::Binary(), doc, default_options);
for (auto& descr : descrs) {
base_kernel.signature = KernelSignature::Make(
{std::move(descr.input), selection_type}, OutputType(FirstType));
@@ -2187,19 +2187,19 @@ void RegisterSelectionFunction(const std::string& name, const FunctionDoc* doc,
DCHECK_OK(registry->AddFunction(std::move(func)));
}
-const FunctionDoc array_filter_doc(
- "Filter with a boolean selection filter",
- ("The output is populated with values from the input `array` at positions\n"
- "where the selection filter is non-zero. Nulls in the selection filter\n"
- "are handled based on FilterOptions."),
- {"array", "selection_filter"}, "FilterOptions");
-
-const FunctionDoc array_take_doc(
- "Select values from an array based on indices from another array",
- ("The output is populated with values from the input array at positions\n"
- "given by `indices`. Nulls in `indices` emit null in the output."),
- {"array", "indices"}, "TakeOptions");
-
+const FunctionDoc array_filter_doc(
+ "Filter with a boolean selection filter",
+ ("The output is populated with values from the input `array` at positions\n"
+ "where the selection filter is non-zero. Nulls in the selection filter\n"
+ "are handled based on FilterOptions."),
+ {"array", "selection_filter"}, "FilterOptions");
+
+const FunctionDoc array_take_doc(
+ "Select values from an array based on indices from another array",
+ ("The output is populated with values from the input array at positions\n"
+ "given by `indices`. Nulls in `indices` emit null in the output."),
+ {"array", "indices"}, "TakeOptions");
+
} // namespace
void RegisterVectorSelection(FunctionRegistry* registry) {
@@ -2216,7 +2216,7 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
{InputType::Array(Type::LIST), FilterExec<ListImpl<ListType>>},
{InputType::Array(Type::LARGE_LIST), FilterExec<ListImpl<LargeListType>>},
{InputType::Array(Type::FIXED_SIZE_LIST), FilterExec<FSLImpl>},
- {InputType::Array(Type::DENSE_UNION), FilterExec<DenseUnionImpl>},
+ {InputType::Array(Type::DENSE_UNION), FilterExec<DenseUnionImpl>},
{InputType::Array(Type::STRUCT), StructFilter},
// TODO: Reuse ListType kernel for MAP
{InputType::Array(Type::MAP), FilterExec<ListImpl<MapType>>},
@@ -2224,7 +2224,7 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
VectorKernel filter_base;
filter_base.init = FilterState::Init;
- RegisterSelectionFunction("array_filter", &array_filter_doc, filter_base,
+ RegisterSelectionFunction("array_filter", &array_filter_doc, filter_base,
/*selection_type=*/InputType::Array(boolean()),
filter_kernel_descrs, &kDefaultFilterOptions, registry);
@@ -2239,14 +2239,14 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
TakeExec<VarBinaryImpl<LargeBinaryType>>},
{InputType::Array(Type::FIXED_SIZE_BINARY), TakeExec<FSBImpl>},
{InputType::Array(null()), NullTake},
- {InputType::Array(Type::DECIMAL128), TakeExec<FSBImpl>},
- {InputType::Array(Type::DECIMAL256), TakeExec<FSBImpl>},
+ {InputType::Array(Type::DECIMAL128), TakeExec<FSBImpl>},
+ {InputType::Array(Type::DECIMAL256), TakeExec<FSBImpl>},
{InputType::Array(Type::DICTIONARY), DictionaryTake},
{InputType::Array(Type::EXTENSION), ExtensionTake},
{InputType::Array(Type::LIST), TakeExec<ListImpl<ListType>>},
{InputType::Array(Type::LARGE_LIST), TakeExec<ListImpl<LargeListType>>},
{InputType::Array(Type::FIXED_SIZE_LIST), TakeExec<FSLImpl>},
- {InputType::Array(Type::DENSE_UNION), TakeExec<DenseUnionImpl>},
+ {InputType::Array(Type::DENSE_UNION), TakeExec<DenseUnionImpl>},
{InputType::Array(Type::STRUCT), TakeExec<StructImpl>},
// TODO: Reuse ListType kernel for MAP
{InputType::Array(Type::MAP), TakeExec<ListImpl<MapType>>},
@@ -2256,7 +2256,7 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
take_base.init = TakeState::Init;
take_base.can_execute_chunkwise = false;
RegisterSelectionFunction(
- "array_take", &array_take_doc, take_base,
+ "array_take", &array_take_doc, take_base,
/*selection_type=*/InputType(match::Integer(), ValueDescr::ARRAY),
take_kernel_descrs, &kDefaultTakeOptions, registry);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc
index b42e9d536f1..7fa43e715d8 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/kernels/vector_sort.cc
@@ -16,305 +16,305 @@
// under the License.
#include <algorithm>
-#include <cmath>
+#include <cmath>
#include <limits>
#include <numeric>
-#include <type_traits>
-#include <utility>
+#include <type_traits>
+#include <utility>
#include "arrow/array/data.h"
#include "arrow/compute/api_vector.h"
#include "arrow/compute/kernels/common.h"
-#include "arrow/compute/kernels/util_internal.h"
-#include "arrow/table.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit_block_counter.h"
-#include "arrow/util/bitmap.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/checked_cast.h"
+#include "arrow/compute/kernels/util_internal.h"
+#include "arrow/table.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bitmap.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
#include "arrow/util/optional.h"
-#include "arrow/visitor_inline.h"
+#include "arrow/visitor_inline.h"
namespace arrow {
-
-using internal::checked_cast;
-
+
+using internal::checked_cast;
+
namespace compute {
-namespace internal {
-
-// Visit all physical types for which sorting is implemented.
-#define VISIT_PHYSICAL_TYPES(VISIT) \
- VISIT(BooleanType) \
- VISIT(Int8Type) \
- VISIT(Int16Type) \
- VISIT(Int32Type) \
- VISIT(Int64Type) \
- VISIT(UInt8Type) \
- VISIT(UInt16Type) \
- VISIT(UInt32Type) \
- VISIT(UInt64Type) \
- VISIT(FloatType) \
- VISIT(DoubleType) \
- VISIT(BinaryType) \
- VISIT(LargeBinaryType) \
- VISIT(FixedSizeBinaryType) \
- VISIT(Decimal128Type) \
- VISIT(Decimal256Type)
-
+namespace internal {
+
+// Visit all physical types for which sorting is implemented.
+#define VISIT_PHYSICAL_TYPES(VISIT) \
+ VISIT(BooleanType) \
+ VISIT(Int8Type) \
+ VISIT(Int16Type) \
+ VISIT(Int32Type) \
+ VISIT(Int64Type) \
+ VISIT(UInt8Type) \
+ VISIT(UInt16Type) \
+ VISIT(UInt32Type) \
+ VISIT(UInt64Type) \
+ VISIT(FloatType) \
+ VISIT(DoubleType) \
+ VISIT(BinaryType) \
+ VISIT(LargeBinaryType) \
+ VISIT(FixedSizeBinaryType) \
+ VISIT(Decimal128Type) \
+ VISIT(Decimal256Type)
+
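VISIT_PHYSICAL_TYPES is an X-macro: each call site supplies the VISIT action, which is expanded once per supported physical type. A tiny self-contained illustration of the pattern (not the Arrow macro itself):

#include <iostream>

#define VISIT_WIDTHS(VISIT) \
  VISIT(8)                  \
  VISIT(16)                 \
  VISIT(32)

int main() {
  // Expand a print statement once per listed width.
#define PRINT_WIDTH(W) std::cout << "width " << W << '\n';
  VISIT_WIDTHS(PRINT_WIDTH)
#undef PRINT_WIDTH
}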
namespace {
-// The target chunk in a chunked array.
-template <typename ArrayType>
-struct ResolvedChunk {
- using V = GetViewType<typename ArrayType::TypeClass>;
- using LogicalValueType = typename V::T;
-
-  // The target array in a chunked array.
- const ArrayType* array;
- // The index in the target array.
- const int64_t index;
-
- ResolvedChunk(const ArrayType* array, int64_t index) : array(array), index(index) {}
-
- bool IsNull() const { return array->IsNull(index); }
-
- LogicalValueType Value() const { return V::LogicalValue(array->GetView(index)); }
-};
-
-// ResolvedChunk specialization for untyped arrays when all that is needed is null lookup
-template <>
-struct ResolvedChunk<Array> {
-  // The target array in a chunked array.
- const Array* array;
- // The index in the target array.
- const int64_t index;
-
- ResolvedChunk(const Array* array, int64_t index) : array(array), index(index) {}
-
- bool IsNull() const { return array->IsNull(index); }
-};
-
-// An object that resolves an array chunk depending on the index.
-struct ChunkedArrayResolver {
- explicit ChunkedArrayResolver(const std::vector<const Array*>& chunks)
- : num_chunks_(static_cast<int64_t>(chunks.size())),
- chunks_(chunks.data()),
- offsets_(MakeEndOffsets(chunks)),
- cached_chunk_(0) {}
-
- template <typename ArrayType>
- ResolvedChunk<ArrayType> Resolve(int64_t index) const {
- // It is common for the algorithms below to make consecutive accesses at
- // a relatively small distance from each other, hence often falling in
- // the same chunk.
- // This is trivial when merging (assuming each side of the merge uses
- // its own resolver), but also in the inner recursive invocations of
- // partitioning.
- const bool cache_hit =
- (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 1]);
- if (ARROW_PREDICT_TRUE(cache_hit)) {
- return ResolvedChunk<ArrayType>(
- checked_cast<const ArrayType*>(chunks_[cached_chunk_]),
- index - offsets_[cached_chunk_]);
- } else {
- return ResolveMissBisect<ArrayType>(index);
- }
- }
-
- private:
- template <typename ArrayType>
- ResolvedChunk<ArrayType> ResolveMissBisect(int64_t index) const {
- // Like std::upper_bound(), but hand-written as it can help the compiler.
- const int64_t* raw_offsets = offsets_.data();
- // Search [lo, lo + n)
- int64_t lo = 0, n = num_chunks_;
- while (n > 1) {
- int64_t m = n >> 1;
- int64_t mid = lo + m;
- if (index >= raw_offsets[mid]) {
- lo = mid;
- n -= m;
- } else {
- n = m;
- }
- }
- cached_chunk_ = lo;
- return ResolvedChunk<ArrayType>(checked_cast<const ArrayType*>(chunks_[lo]),
- index - offsets_[lo]);
- }
-
- static std::vector<int64_t> MakeEndOffsets(const std::vector<const Array*>& chunks) {
- std::vector<int64_t> end_offsets(chunks.size() + 1);
- int64_t offset = 0;
- end_offsets[0] = 0;
- std::transform(chunks.begin(), chunks.end(), end_offsets.begin() + 1,
- [&](const Array* chunk) {
- offset += chunk->length();
- return offset;
- });
- return end_offsets;
- }
-
- int64_t num_chunks_;
- const Array* const* chunks_;
- std::vector<int64_t> offsets_;
-
- mutable int64_t cached_chunk_;
-};
-
-// We could try to reproduce the concrete Array classes' facilities
-// (such as cached raw values pointer) in a separate hierarchy of
-// physical accessors, but doing so ends up too cumbersome.
-// Instead, we simply create the desired concrete Array objects.
-std::shared_ptr<Array> GetPhysicalArray(const Array& array,
- const std::shared_ptr<DataType>& physical_type) {
- auto new_data = array.data()->Copy();
- new_data->type = physical_type;
- return MakeArray(std::move(new_data));
-}
-
-ArrayVector GetPhysicalChunks(const ChunkedArray& chunked_array,
- const std::shared_ptr<DataType>& physical_type) {
- const auto& chunks = chunked_array.chunks();
- ArrayVector physical(chunks.size());
- std::transform(chunks.begin(), chunks.end(), physical.begin(),
- [&](const std::shared_ptr<Array>& array) {
- return GetPhysicalArray(*array, physical_type);
- });
- return physical;
-}
-
-std::vector<const Array*> GetArrayPointers(const ArrayVector& arrays) {
- std::vector<const Array*> pointers(arrays.size());
- std::transform(arrays.begin(), arrays.end(), pointers.begin(),
- [&](const std::shared_ptr<Array>& array) { return array.get(); });
- return pointers;
-}
-
-// NOTE: std::partition is usually faster than std::stable_partition.
-
-struct NonStablePartitioner {
- template <typename Predicate>
- uint64_t* operator()(uint64_t* indices_begin, uint64_t* indices_end, Predicate&& pred) {
- return std::partition(indices_begin, indices_end, std::forward<Predicate>(pred));
- }
-};
-
-struct StablePartitioner {
- template <typename Predicate>
- uint64_t* operator()(uint64_t* indices_begin, uint64_t* indices_end, Predicate&& pred) {
- return std::stable_partition(indices_begin, indices_end,
- std::forward<Predicate>(pred));
- }
-};
-
-// TODO factor out value comparison and NaN checking?
-
-template <typename TypeClass, typename Enable = void>
-struct NullTraits {
- static constexpr bool has_null_like_values = false;
-};
-
-template <typename TypeClass>
-struct NullTraits<TypeClass, enable_if_floating_point<TypeClass>> {
- static constexpr bool has_null_like_values = true;
-};
-
-// Move nulls (not null-like values) to end of array. Return where null starts.
-//
-// `offset` is used when this is called on a chunk of a chunked array
-template <typename Partitioner>
-uint64_t* PartitionNullsOnly(uint64_t* indices_begin, uint64_t* indices_end,
- const Array& values, int64_t offset) {
- if (values.null_count() == 0) {
- return indices_end;
- }
- Partitioner partitioner;
- return partitioner(indices_begin, indices_end, [&values, &offset](uint64_t ind) {
- return !values.IsNull(ind - offset);
- });
-}
-
-// For chunked array.
-template <typename Partitioner>
-uint64_t* PartitionNullsOnly(uint64_t* indices_begin, uint64_t* indices_end,
- const std::vector<const Array*>& arrays,
- int64_t null_count) {
- if (null_count == 0) {
- return indices_end;
- }
- ChunkedArrayResolver resolver(arrays);
- Partitioner partitioner;
- return partitioner(indices_begin, indices_end, [&](uint64_t ind) {
- const auto chunk = resolver.Resolve<Array>(ind);
- return !chunk.IsNull();
- });
-}
-
-// Move non-null null-like values to end of array. Return where null-like starts.
-//
-// `offset` is used when this is called on a chunk of a chunked array
-template <typename ArrayType, typename Partitioner>
-enable_if_t<!is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
-PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
- const ArrayType& values, int64_t offset) {
- return indices_end;
-}
-
-// For chunked array.
-template <typename ArrayType, typename Partitioner>
-enable_if_t<!is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
-PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
- const std::vector<const Array*>& arrays, int64_t null_count) {
- return indices_end;
-}
-
-template <typename ArrayType, typename Partitioner>
-enable_if_t<is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
-PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
- const ArrayType& values, int64_t offset) {
- Partitioner partitioner;
- return partitioner(indices_begin, indices_end, [&values, &offset](uint64_t ind) {
- return !std::isnan(values.GetView(ind - offset));
- });
-}
-
-template <typename ArrayType, typename Partitioner>
-enable_if_t<is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
-PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
- const std::vector<const Array*>& arrays, int64_t null_count) {
- Partitioner partitioner;
- ChunkedArrayResolver resolver(arrays);
- return partitioner(indices_begin, indices_end, [&](uint64_t ind) {
- const auto chunk = resolver.Resolve<ArrayType>(ind);
- return !std::isnan(chunk.Value());
- });
-}
-
-// Move nulls to end of array. Return where null starts.
-//
-// `offset` is used when this is called on a chunk of a chunked array
-template <typename ArrayType, typename Partitioner>
-uint64_t* PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end,
- const ArrayType& values, int64_t offset) {
- // Partition nulls at end, and null-like values just before
- uint64_t* nulls_begin =
- PartitionNullsOnly<Partitioner>(indices_begin, indices_end, values, offset);
- return PartitionNullLikes<ArrayType, Partitioner>(indices_begin, nulls_begin, values,
- offset);
-}
-
-// For chunked array.
-template <typename ArrayType, typename Partitioner>
-uint64_t* PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end,
- const std::vector<const Array*>& arrays, int64_t null_count) {
- // Partition nulls at end, and null-like values just before
- uint64_t* nulls_begin =
- PartitionNullsOnly<Partitioner>(indices_begin, indices_end, arrays, null_count);
- return PartitionNullLikes<ArrayType, Partitioner>(indices_begin, nulls_begin, arrays,
- null_count);
-}
-
+// The target chunk in a chunked array.
+template <typename ArrayType>
+struct ResolvedChunk {
+ using V = GetViewType<typename ArrayType::TypeClass>;
+ using LogicalValueType = typename V::T;
+
+  // The target array in a chunked array.
+ const ArrayType* array;
+ // The index in the target array.
+ const int64_t index;
+
+ ResolvedChunk(const ArrayType* array, int64_t index) : array(array), index(index) {}
+
+ bool IsNull() const { return array->IsNull(index); }
+
+ LogicalValueType Value() const { return V::LogicalValue(array->GetView(index)); }
+};
+
+// ResolvedChunk specialization for untyped arrays when all that is needed is null lookup
+template <>
+struct ResolvedChunk<Array> {
+  // The target array in a chunked array.
+ const Array* array;
+ // The index in the target array.
+ const int64_t index;
+
+ ResolvedChunk(const Array* array, int64_t index) : array(array), index(index) {}
+
+ bool IsNull() const { return array->IsNull(index); }
+};
+
+// An object that resolves an array chunk depending on the index.
+struct ChunkedArrayResolver {
+ explicit ChunkedArrayResolver(const std::vector<const Array*>& chunks)
+ : num_chunks_(static_cast<int64_t>(chunks.size())),
+ chunks_(chunks.data()),
+ offsets_(MakeEndOffsets(chunks)),
+ cached_chunk_(0) {}
+
+ template <typename ArrayType>
+ ResolvedChunk<ArrayType> Resolve(int64_t index) const {
+ // It is common for the algorithms below to make consecutive accesses at
+ // a relatively small distance from each other, hence often falling in
+ // the same chunk.
+ // This is trivial when merging (assuming each side of the merge uses
+ // its own resolver), but also in the inner recursive invocations of
+ // partitioning.
+ const bool cache_hit =
+ (index >= offsets_[cached_chunk_] && index < offsets_[cached_chunk_ + 1]);
+ if (ARROW_PREDICT_TRUE(cache_hit)) {
+ return ResolvedChunk<ArrayType>(
+ checked_cast<const ArrayType*>(chunks_[cached_chunk_]),
+ index - offsets_[cached_chunk_]);
+ } else {
+ return ResolveMissBisect<ArrayType>(index);
+ }
+ }
+
+ private:
+ template <typename ArrayType>
+ ResolvedChunk<ArrayType> ResolveMissBisect(int64_t index) const {
+ // Like std::upper_bound(), but hand-written as it can help the compiler.
+ const int64_t* raw_offsets = offsets_.data();
+ // Search [lo, lo + n)
+ int64_t lo = 0, n = num_chunks_;
+ while (n > 1) {
+ int64_t m = n >> 1;
+ int64_t mid = lo + m;
+ if (index >= raw_offsets[mid]) {
+ lo = mid;
+ n -= m;
+ } else {
+ n = m;
+ }
+ }
+ cached_chunk_ = lo;
+ return ResolvedChunk<ArrayType>(checked_cast<const ArrayType*>(chunks_[lo]),
+ index - offsets_[lo]);
+ }
+
+ static std::vector<int64_t> MakeEndOffsets(const std::vector<const Array*>& chunks) {
+ std::vector<int64_t> end_offsets(chunks.size() + 1);
+ int64_t offset = 0;
+ end_offsets[0] = 0;
+ std::transform(chunks.begin(), chunks.end(), end_offsets.begin() + 1,
+ [&](const Array* chunk) {
+ offset += chunk->length();
+ return offset;
+ });
+ return end_offsets;
+ }
+
+ int64_t num_chunks_;
+ const Array* const* chunks_;
+ std::vector<int64_t> offsets_;
+
+ mutable int64_t cached_chunk_;
+};
+
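The resolver amortizes chunk lookup: accesses that stay in the cached chunk resolve in O(1), and a miss falls back to binary search over the cumulative end offsets. A minimal standalone version of the same scheme:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

class ToyResolver {
 public:
  explicit ToyResolver(std::vector<int64_t> chunk_lengths) : offsets_{0} {
    for (int64_t len : chunk_lengths) offsets_.push_back(offsets_.back() + len);
  }
  // Returns {chunk, index-in-chunk} for a global index.
  std::pair<int64_t, int64_t> Resolve(int64_t index) {
    if (index < offsets_[cached_] || index >= offsets_[cached_ + 1]) {
      // Cache miss: bisect over end offsets (the hand-written loop above
      // plays the role of std::upper_bound here).
      cached_ = std::upper_bound(offsets_.begin(), offsets_.end(), index) -
                offsets_.begin() - 1;
    }
    return {cached_, index - offsets_[cached_]};
  }

 private:
  std::vector<int64_t> offsets_;
  int64_t cached_ = 0;
};

int main() {
  ToyResolver r({3, 2, 4});                // chunks cover [0,3), [3,5), [5,9)
  auto [chunk, i] = r.Resolve(4);
  std::cout << chunk << ' ' << i << '\n';  // prints: 1 1
}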
+// We could try to reproduce the concrete Array classes' facilities
+// (such as cached raw values pointer) in a separate hierarchy of
+// physical accessors, but doing so ends up too cumbersome.
+// Instead, we simply create the desired concrete Array objects.
+std::shared_ptr<Array> GetPhysicalArray(const Array& array,
+ const std::shared_ptr<DataType>& physical_type) {
+ auto new_data = array.data()->Copy();
+ new_data->type = physical_type;
+ return MakeArray(std::move(new_data));
+}
+
+ArrayVector GetPhysicalChunks(const ChunkedArray& chunked_array,
+ const std::shared_ptr<DataType>& physical_type) {
+ const auto& chunks = chunked_array.chunks();
+ ArrayVector physical(chunks.size());
+ std::transform(chunks.begin(), chunks.end(), physical.begin(),
+ [&](const std::shared_ptr<Array>& array) {
+ return GetPhysicalArray(*array, physical_type);
+ });
+ return physical;
+}
+
+std::vector<const Array*> GetArrayPointers(const ArrayVector& arrays) {
+ std::vector<const Array*> pointers(arrays.size());
+ std::transform(arrays.begin(), arrays.end(), pointers.begin(),
+ [&](const std::shared_ptr<Array>& array) { return array.get(); });
+ return pointers;
+}
+
+// NOTE: std::partition is usually faster than std::stable_partition.
+
+struct NonStablePartitioner {
+ template <typename Predicate>
+ uint64_t* operator()(uint64_t* indices_begin, uint64_t* indices_end, Predicate&& pred) {
+ return std::partition(indices_begin, indices_end, std::forward<Predicate>(pred));
+ }
+};
+
+struct StablePartitioner {
+ template <typename Predicate>
+ uint64_t* operator()(uint64_t* indices_begin, uint64_t* indices_end, Predicate&& pred) {
+ return std::stable_partition(indices_begin, indices_end,
+ std::forward<Predicate>(pred));
+ }
+};
+
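The two wrappers let callers choose the guarantee they need: std::partition is typically faster, while std::stable_partition preserves the relative order within each group, which stable sorts require. A quick standalone demonstration:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<uint64_t> a{0, 1, 2, 3, 4, 5}, b = a;
  auto is_even = [](uint64_t i) { return i % 2 == 0; };

  // Stable: evens keep their original relative order: 0 2 4 | 1 3 5.
  std::stable_partition(b.begin(), b.end(), is_even);
  // Non-stable: same partition point, but order within groups is unspecified.
  auto mid = std::partition(a.begin(), a.end(), is_even);

  std::cout << (mid - a.begin()) << '\n';  // prints: 3 (evens before, odds after)
  for (uint64_t v : b) std::cout << v << ' ';
  std::cout << '\n';                       // prints: 0 2 4 1 3 5
}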
+// TODO factor out value comparison and NaN checking?
+
+template <typename TypeClass, typename Enable = void>
+struct NullTraits {
+ static constexpr bool has_null_like_values = false;
+};
+
+template <typename TypeClass>
+struct NullTraits<TypeClass, enable_if_floating_point<TypeClass>> {
+ static constexpr bool has_null_like_values = true;
+};
+
+// Move nulls (not null-like values) to end of array. Return where null starts.
+//
+// `offset` is used when this is called on a chunk of a chunked array
+template <typename Partitioner>
+uint64_t* PartitionNullsOnly(uint64_t* indices_begin, uint64_t* indices_end,
+ const Array& values, int64_t offset) {
+ if (values.null_count() == 0) {
+ return indices_end;
+ }
+ Partitioner partitioner;
+ return partitioner(indices_begin, indices_end, [&values, &offset](uint64_t ind) {
+ return !values.IsNull(ind - offset);
+ });
+}
+
+// For chunked array.
+template <typename Partitioner>
+uint64_t* PartitionNullsOnly(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays,
+ int64_t null_count) {
+ if (null_count == 0) {
+ return indices_end;
+ }
+ ChunkedArrayResolver resolver(arrays);
+ Partitioner partitioner;
+ return partitioner(indices_begin, indices_end, [&](uint64_t ind) {
+ const auto chunk = resolver.Resolve<Array>(ind);
+ return !chunk.IsNull();
+ });
+}
+
+// Move non-null null-like values to end of array. Return where null-like starts.
+//
+// `offset` is used when this is called on a chunk of a chunked array
+template <typename ArrayType, typename Partitioner>
+enable_if_t<!is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
+PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
+ const ArrayType& values, int64_t offset) {
+ return indices_end;
+}
+
+// For chunked array.
+template <typename ArrayType, typename Partitioner>
+enable_if_t<!is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
+PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays, int64_t null_count) {
+ return indices_end;
+}
+
+template <typename ArrayType, typename Partitioner>
+enable_if_t<is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
+PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
+ const ArrayType& values, int64_t offset) {
+ Partitioner partitioner;
+ return partitioner(indices_begin, indices_end, [&values, &offset](uint64_t ind) {
+ return !std::isnan(values.GetView(ind - offset));
+ });
+}
+
+template <typename ArrayType, typename Partitioner>
+enable_if_t<is_floating_type<typename ArrayType::TypeClass>::value, uint64_t*>
+PartitionNullLikes(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays, int64_t null_count) {
+ Partitioner partitioner;
+ ChunkedArrayResolver resolver(arrays);
+ return partitioner(indices_begin, indices_end, [&](uint64_t ind) {
+ const auto chunk = resolver.Resolve<ArrayType>(ind);
+ return !std::isnan(chunk.Value());
+ });
+}
+
+// Move nulls to end of array. Return where null starts.
+//
+// `offset` is used when this is called on a chunk of a chunked array
+template <typename ArrayType, typename Partitioner>
+uint64_t* PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end,
+ const ArrayType& values, int64_t offset) {
+ // Partition nulls at end, and null-like values just before
+ uint64_t* nulls_begin =
+ PartitionNullsOnly<Partitioner>(indices_begin, indices_end, values, offset);
+ return PartitionNullLikes<ArrayType, Partitioner>(indices_begin, nulls_begin, values,
+ offset);
+}
+
+// For chunked array.
+template <typename ArrayType, typename Partitioner>
+uint64_t* PartitionNulls(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays, int64_t null_count) {
+ // Partition nulls at end, and null-like values just before
+ uint64_t* nulls_begin =
+ PartitionNullsOnly<Partitioner>(indices_begin, indices_end, arrays, null_count);
+ return PartitionNullLikes<ArrayType, Partitioner>(indices_begin, nulls_begin, arrays,
+ null_count);
+}
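
// Illustrative sketch (plain STL, not Arrow API) of the index layout the
// helpers above produce for a floating-point column: ordinary values first,
// then NaNs ("null-like" values), then nulls at the very end.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <optional>
#include <vector>

inline void PartitionNullsExample() {
  const std::vector<std::optional<double>> values = {1.0, std::nullopt, NAN, 0.5};
  std::vector<uint64_t> indices = {0, 1, 2, 3};
  // Nulls to the end (as in PartitionNullsOnly)...
  auto nulls_begin =
      std::stable_partition(indices.begin(), indices.end(),
                            [&](uint64_t i) { return values[i].has_value(); });
  // ...then NaNs just before the nulls (as in PartitionNullLikes).
  std::stable_partition(indices.begin(), nulls_begin,
                        [&](uint64_t i) { return !std::isnan(*values[i]); });
  // indices is now {0, 3, 2, 1}: plain values, NaN, null.
}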
+
// ----------------------------------------------------------------------
// partition_nth_indices implementation
@@ -324,116 +324,116 @@ using PartitionNthToIndicesState = internal::OptionsWrapper<PartitionNthOptions>
template <typename OutType, typename InType>
struct PartitionNthToIndices {
using ArrayType = typename TypeTraits<InType>::ArrayType;
-
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- using GetView = GetViewType<InType>;
-
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ using GetView = GetViewType<InType>;
+
if (ctx->state() == nullptr) {
- return Status::Invalid("NthToIndices requires PartitionNthOptions");
+ return Status::Invalid("NthToIndices requires PartitionNthOptions");
}
- ArrayType arr(batch[0].array());
+ ArrayType arr(batch[0].array());
int64_t pivot = PartitionNthToIndicesState::Get(ctx).pivot;
if (pivot > arr.length()) {
- return Status::IndexError("NthToIndices index out of bound");
+ return Status::IndexError("NthToIndices index out of bound");
}
ArrayData* out_arr = out->mutable_array();
uint64_t* out_begin = out_arr->GetMutableValues<uint64_t>(1);
uint64_t* out_end = out_begin + arr.length();
std::iota(out_begin, out_end, 0);
if (pivot == arr.length()) {
- return Status::OK();
+ return Status::OK();
}
- auto nulls_begin =
- PartitionNulls<ArrayType, NonStablePartitioner>(out_begin, out_end, arr, 0);
+ auto nulls_begin =
+ PartitionNulls<ArrayType, NonStablePartitioner>(out_begin, out_end, arr, 0);
auto nth_begin = out_begin + pivot;
if (nth_begin < nulls_begin) {
std::nth_element(out_begin, nth_begin, nulls_begin,
[&arr](uint64_t left, uint64_t right) {
- const auto lval = GetView::LogicalValue(arr.GetView(left));
- const auto rval = GetView::LogicalValue(arr.GetView(right));
- return lval < rval;
+ const auto lval = GetView::LogicalValue(arr.GetView(left));
+ const auto rval = GetView::LogicalValue(arr.GetView(right));
+ return lval < rval;
});
}
- return Status::OK();
+ return Status::OK();
}
};
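
// Usage sketch for the kernel above through Arrow's public compute API
// (illustrative; assumes the arrow::compute::NthToIndices helper from
// api_vector.h and elides error handling).
#include <arrow/array.h>
#include <arrow/compute/api_vector.h>
#include <arrow/result.h>
#include <memory>

inline arrow::Result<std::shared_ptr<arrow::Array>> PivotIndices(
    const arrow::Array& values) {
  // Indices before position 5 point at values no greater than the pivot,
  // indices after it at values no smaller; neither side is fully sorted.
  return arrow::compute::NthToIndices(values, /*n=*/5);
}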
-// ----------------------------------------------------------------------
-// Array sorting implementations
-
+// ----------------------------------------------------------------------
+// Array sorting implementations
+
template <typename ArrayType, typename VisitorNotNull, typename VisitorNull>
inline void VisitRawValuesInline(const ArrayType& values,
VisitorNotNull&& visitor_not_null,
VisitorNull&& visitor_null) {
const auto data = values.raw_values();
- VisitBitBlocksVoid(
- values.null_bitmap(), values.offset(), values.length(),
- [&](int64_t i) { visitor_not_null(data[i]); }, [&]() { visitor_null(); });
-}
-
-template <typename VisitorNotNull, typename VisitorNull>
-inline void VisitRawValuesInline(const BooleanArray& values,
- VisitorNotNull&& visitor_not_null,
- VisitorNull&& visitor_null) {
- if (values.null_count() != 0) {
- const uint8_t* data = values.data()->GetValues<uint8_t>(1, 0);
- VisitBitBlocksVoid(
- values.null_bitmap(), values.offset(), values.length(),
- [&](int64_t i) { visitor_not_null(BitUtil::GetBit(data, values.offset() + i)); },
- [&]() { visitor_null(); });
+ VisitBitBlocksVoid(
+ values.null_bitmap(), values.offset(), values.length(),
+ [&](int64_t i) { visitor_not_null(data[i]); }, [&]() { visitor_null(); });
+}
+
+template <typename VisitorNotNull, typename VisitorNull>
+inline void VisitRawValuesInline(const BooleanArray& values,
+ VisitorNotNull&& visitor_not_null,
+ VisitorNull&& visitor_null) {
+ if (values.null_count() != 0) {
+ const uint8_t* data = values.data()->GetValues<uint8_t>(1, 0);
+ VisitBitBlocksVoid(
+ values.null_bitmap(), values.offset(), values.length(),
+ [&](int64_t i) { visitor_not_null(BitUtil::GetBit(data, values.offset() + i)); },
+ [&]() { visitor_null(); });
} else {
- // Can avoid GetBit() overhead in the no-nulls case
- VisitBitBlocksVoid(
- values.data()->buffers[1], values.offset(), values.length(),
- [&](int64_t i) { visitor_not_null(true); }, [&]() { visitor_not_null(false); });
+ // Can avoid GetBit() overhead in the no-nulls case
+ VisitBitBlocksVoid(
+ values.data()->buffers[1], values.offset(), values.length(),
+ [&](int64_t i) { visitor_not_null(true); }, [&]() { visitor_not_null(false); });
}
}
template <typename ArrowType>
-class ArrayCompareSorter {
+class ArrayCompareSorter {
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
- using GetView = GetViewType<ArrowType>;
+ using GetView = GetViewType<ArrowType>;
public:
- // Returns where null starts.
- //
- // `offset` is used when this is called on a chunk of a chunked array
- uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
- int64_t offset, const ArraySortOptions& options) {
- auto nulls_begin = PartitionNulls<ArrayType, StablePartitioner>(
- indices_begin, indices_end, values, offset);
- if (options.order == SortOrder::Ascending) {
- std::stable_sort(
- indices_begin, nulls_begin, [&values, &offset](uint64_t left, uint64_t right) {
- const auto lhs = GetView::LogicalValue(values.GetView(left - offset));
- const auto rhs = GetView::LogicalValue(values.GetView(right - offset));
- return lhs < rhs;
- });
- } else {
- std::stable_sort(
- indices_begin, nulls_begin, [&values, &offset](uint64_t left, uint64_t right) {
- const auto lhs = GetView::LogicalValue(values.GetView(left - offset));
- const auto rhs = GetView::LogicalValue(values.GetView(right - offset));
-            // We use 'right < left' instead of 'left > right' so that only
-            // operator '<' is required.
- return rhs < lhs;
- });
+ // Returns where null starts.
+ //
+ // `offset` is used when this is called on a chunk of a chunked array
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
+ int64_t offset, const ArraySortOptions& options) {
+ auto nulls_begin = PartitionNulls<ArrayType, StablePartitioner>(
+ indices_begin, indices_end, values, offset);
+ if (options.order == SortOrder::Ascending) {
+ std::stable_sort(
+ indices_begin, nulls_begin, [&values, &offset](uint64_t left, uint64_t right) {
+ const auto lhs = GetView::LogicalValue(values.GetView(left - offset));
+ const auto rhs = GetView::LogicalValue(values.GetView(right - offset));
+ return lhs < rhs;
+ });
+ } else {
+ std::stable_sort(
+ indices_begin, nulls_begin, [&values, &offset](uint64_t left, uint64_t right) {
+ const auto lhs = GetView::LogicalValue(values.GetView(left - offset));
+ const auto rhs = GetView::LogicalValue(values.GetView(right - offset));
+            // We use 'right < left' instead of 'left > right' so that only
+            // operator '<' is required.
+ return rhs < lhs;
+ });
}
- return nulls_begin;
+ return nulls_begin;
}
};
template <typename ArrowType>
-class ArrayCountSorter {
+class ArrayCountSorter {
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
using c_type = typename ArrowType::c_type;
public:
- ArrayCountSorter() = default;
+ ArrayCountSorter() = default;
- explicit ArrayCountSorter(c_type min, c_type max) { SetMinMax(min, max); }
+ explicit ArrayCountSorter(c_type min, c_type max) { SetMinMax(min, max); }
// Assume: max >= min && (max - min) < 4Gi
void SetMinMax(c_type min, c_type max) {
@@ -441,14 +441,14 @@ class ArrayCountSorter {
value_range_ = static_cast<uint32_t>(max - min) + 1;
}
- // Returns where null starts.
- uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
- int64_t offset, const ArraySortOptions& options) {
+ // Returns where null starts.
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
+ int64_t offset, const ArraySortOptions& options) {
    // A 32-bit counter performs much better than a 64-bit one
if (values.length() < (1LL << 32)) {
- return SortInternal<uint32_t>(indices_begin, indices_end, values, offset, options);
+ return SortInternal<uint32_t>(indices_begin, indices_end, values, offset, options);
} else {
- return SortInternal<uint64_t>(indices_begin, indices_end, values, offset, options);
+ return SortInternal<uint64_t>(indices_begin, indices_end, values, offset, options);
}
}
@@ -456,81 +456,81 @@ class ArrayCountSorter {
c_type min_{0};
uint32_t value_range_{0};
- // Returns where null starts.
- //
- // `offset` is used when this is called on a chunk of a chunked array
+ // Returns where null starts.
+ //
+ // `offset` is used when this is called on a chunk of a chunked array
template <typename CounterType>
- uint64_t* SortInternal(uint64_t* indices_begin, uint64_t* indices_end,
- const ArrayType& values, int64_t offset,
- const ArraySortOptions& options) {
+ uint64_t* SortInternal(uint64_t* indices_begin, uint64_t* indices_end,
+ const ArrayType& values, int64_t offset,
+ const ArraySortOptions& options) {
const uint32_t value_range = value_range_;
// first slot reserved for prefix sum
std::vector<CounterType> counts(1 + value_range);
- if (options.order == SortOrder::Ascending) {
- VisitRawValuesInline(
- values, [&](c_type v) { ++counts[v - min_ + 1]; }, []() {});
- for (uint32_t i = 1; i <= value_range; ++i) {
- counts[i] += counts[i - 1];
- }
- auto null_position = counts[value_range];
- auto nulls_begin = indices_begin + null_position;
- int64_t index = offset;
- VisitRawValuesInline(
- values, [&](c_type v) { indices_begin[counts[v - min_]++] = index++; },
- [&]() { indices_begin[null_position++] = index++; });
- return nulls_begin;
- } else {
- VisitRawValuesInline(
- values, [&](c_type v) { ++counts[v - min_]; }, []() {});
- for (uint32_t i = value_range; i >= 1; --i) {
- counts[i - 1] += counts[i];
- }
- auto null_position = counts[0];
- auto nulls_begin = indices_begin + null_position;
- int64_t index = offset;
- VisitRawValuesInline(
- values, [&](c_type v) { indices_begin[counts[v - min_ + 1]++] = index++; },
- [&]() { indices_begin[null_position++] = index++; });
- return nulls_begin;
- }
- }
-};
-
-using ::arrow::internal::Bitmap;
-
-template <>
-class ArrayCountSorter<BooleanType> {
- public:
- ArrayCountSorter() = default;
-
- // Returns where null starts.
- // `offset` is used when this is called on a chunk of a chunked array
- uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end,
- const BooleanArray& values, int64_t offset,
- const ArraySortOptions& options) {
- std::array<int64_t, 2> counts{0, 0};
-
- const int64_t nulls = values.null_count();
- const int64_t ones = values.true_count();
- const int64_t zeros = values.length() - ones - nulls;
-
- int64_t null_position = values.length() - nulls;
- int64_t index = offset;
- const auto nulls_begin = indices_begin + null_position;
-
- if (options.order == SortOrder::Ascending) {
- // ones start after zeros
- counts[1] = zeros;
- } else {
- // zeros start after ones
- counts[0] = ones;
+ if (options.order == SortOrder::Ascending) {
+ VisitRawValuesInline(
+ values, [&](c_type v) { ++counts[v - min_ + 1]; }, []() {});
+ for (uint32_t i = 1; i <= value_range; ++i) {
+ counts[i] += counts[i - 1];
+ }
+ auto null_position = counts[value_range];
+ auto nulls_begin = indices_begin + null_position;
+ int64_t index = offset;
+ VisitRawValuesInline(
+ values, [&](c_type v) { indices_begin[counts[v - min_]++] = index++; },
+ [&]() { indices_begin[null_position++] = index++; });
+ return nulls_begin;
+ } else {
+ VisitRawValuesInline(
+ values, [&](c_type v) { ++counts[v - min_]; }, []() {});
+ for (uint32_t i = value_range; i >= 1; --i) {
+ counts[i - 1] += counts[i];
+ }
+ auto null_position = counts[0];
+ auto nulls_begin = indices_begin + null_position;
+ int64_t index = offset;
+ VisitRawValuesInline(
+ values, [&](c_type v) { indices_begin[counts[v - min_ + 1]++] = index++; },
+ [&]() { indices_begin[null_position++] = index++; });
+ return nulls_begin;
+ }
+ }
+};
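
// Minimal counting-sort-to-indices sketch (plain C++, not part of the patch),
// mirroring the ascending branch above for uint8 values; illustrative only.
#include <cstdint>
#include <vector>

inline std::vector<uint64_t> CountSortIndices(const std::vector<uint8_t>& values) {
  std::vector<uint32_t> counts(256 + 1, 0);  // first slot reserved for prefix sum
  for (uint8_t v : values) ++counts[v + 1];
  for (int i = 1; i <= 256; ++i) counts[i] += counts[i - 1];
  std::vector<uint64_t> indices(values.size());
  for (uint64_t i = 0; i < values.size(); ++i) {
    indices[counts[values[i]]++] = i;  // stable: equal values keep input order
  }
  return indices;
}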
+
+using ::arrow::internal::Bitmap;
+
+template <>
+class ArrayCountSorter<BooleanType> {
+ public:
+ ArrayCountSorter() = default;
+
+ // Returns where null starts.
+ // `offset` is used when this is called on a chunk of a chunked array
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end,
+ const BooleanArray& values, int64_t offset,
+ const ArraySortOptions& options) {
+ std::array<int64_t, 2> counts{0, 0};
+
+ const int64_t nulls = values.null_count();
+ const int64_t ones = values.true_count();
+ const int64_t zeros = values.length() - ones - nulls;
+
+ int64_t null_position = values.length() - nulls;
+ int64_t index = offset;
+ const auto nulls_begin = indices_begin + null_position;
+
+ if (options.order == SortOrder::Ascending) {
+ // ones start after zeros
+ counts[1] = zeros;
+ } else {
+ // zeros start after ones
+ counts[0] = ones;
}
VisitRawValuesInline(
- values, [&](bool v) { indices_begin[counts[v]++] = index++; },
- [&]() { indices_begin[null_position++] = index++; });
- return nulls_begin;
+ values, [&](bool v) { indices_begin[counts[v]++] = index++; },
+ [&]() { indices_begin[null_position++] = index++; });
+ return nulls_begin;
}
};
@@ -538,35 +538,35 @@ class ArrayCountSorter<BooleanType> {
// - Use O(n) counting sort if values are in a small range
// - Use O(nlogn) std::stable_sort otherwise
template <typename ArrowType>
-class ArrayCountOrCompareSorter {
+class ArrayCountOrCompareSorter {
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
using c_type = typename ArrowType::c_type;
public:
- // Returns where null starts.
- //
- // `offset` is used when this is called on a chunk of a chunked array
- uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
- int64_t offset, const ArraySortOptions& options) {
+ // Returns where null starts.
+ //
+ // `offset` is used when this is called on a chunk of a chunked array
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end, const ArrayType& values,
+ int64_t offset, const ArraySortOptions& options) {
if (values.length() >= countsort_min_len_ && values.length() > values.null_count()) {
- c_type min, max;
- std::tie(min, max) = GetMinMax<c_type>(*values.data());
+ c_type min, max;
+ std::tie(min, max) = GetMinMax<c_type>(*values.data());
// For signed int32/64, (max - min) may overflow and trigger UBSAN.
    // Cast to the largest unsigned type (uint64_t) before subtracting.
if (static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <=
countsort_max_range_) {
count_sorter_.SetMinMax(min, max);
- return count_sorter_.Sort(indices_begin, indices_end, values, offset, options);
+ return count_sorter_.Sort(indices_begin, indices_end, values, offset, options);
}
}
- return compare_sorter_.Sort(indices_begin, indices_end, values, offset, options);
+ return compare_sorter_.Sort(indices_begin, indices_end, values, offset, options);
}
private:
- ArrayCompareSorter<ArrowType> compare_sorter_;
- ArrayCountSorter<ArrowType> count_sorter_;
+ ArrayCompareSorter<ArrowType> compare_sorter_;
+ ArrayCountSorter<ArrowType> count_sorter_;
  // Crossover point to prefer counting sort over std::stable_sort (merge sort)
  // - the array to be sorted is longer than "countsort_min_len_"
@@ -582,1257 +582,1257 @@ class ArrayCountOrCompareSorter {
};
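
// Sketch of the overflow-safe range check used above (illustrative, not part
// of the patch): (max - min) on signed operands can overflow, so the
// subtraction is done after casting both sides to uint64_t, where wraparound
// is well defined.
#include <cstdint>

inline bool RangeFitsCountingSort(int64_t min, int64_t max, uint64_t limit) {
  return static_cast<uint64_t>(max) - static_cast<uint64_t>(min) <= limit;
}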
template <typename Type, typename Enable = void>
-struct ArraySorter;
+struct ArraySorter;
+
+template <>
+struct ArraySorter<BooleanType> {
+ ArrayCountSorter<BooleanType> impl;
+};
template <>
-struct ArraySorter<BooleanType> {
- ArrayCountSorter<BooleanType> impl;
+struct ArraySorter<UInt8Type> {
+ ArrayCountSorter<UInt8Type> impl;
+ ArraySorter() : impl(0, 255) {}
};
template <>
-struct ArraySorter<UInt8Type> {
- ArrayCountSorter<UInt8Type> impl;
- ArraySorter() : impl(0, 255) {}
+struct ArraySorter<Int8Type> {
+ ArrayCountSorter<Int8Type> impl;
+ ArraySorter() : impl(-128, 127) {}
};
-template <>
-struct ArraySorter<Int8Type> {
- ArrayCountSorter<Int8Type> impl;
- ArraySorter() : impl(-128, 127) {}
-};
-
template <typename Type>
-struct ArraySorter<Type, enable_if_t<(is_integer_type<Type>::value &&
- (sizeof(typename Type::c_type) > 1)) ||
- is_temporal_type<Type>::value>> {
- ArrayCountOrCompareSorter<Type> impl;
+struct ArraySorter<Type, enable_if_t<(is_integer_type<Type>::value &&
+ (sizeof(typename Type::c_type) > 1)) ||
+ is_temporal_type<Type>::value>> {
+ ArrayCountOrCompareSorter<Type> impl;
};
template <typename Type>
-struct ArraySorter<
- Type, enable_if_t<is_floating_type<Type>::value || is_base_binary_type<Type>::value ||
- is_fixed_size_binary_type<Type>::value>> {
- ArrayCompareSorter<Type> impl;
+struct ArraySorter<
+ Type, enable_if_t<is_floating_type<Type>::value || is_base_binary_type<Type>::value ||
+ is_fixed_size_binary_type<Type>::value>> {
+ ArrayCompareSorter<Type> impl;
};
-using ArraySortIndicesState = internal::OptionsWrapper<ArraySortOptions>;
-
+using ArraySortIndicesState = internal::OptionsWrapper<ArraySortOptions>;
+
template <typename OutType, typename InType>
-struct ArraySortIndices {
+struct ArraySortIndices {
using ArrayType = typename TypeTraits<InType>::ArrayType;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- const auto& options = ArraySortIndicesState::Get(ctx);
-
- ArrayType arr(batch[0].array());
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const auto& options = ArraySortIndicesState::Get(ctx);
+
+ ArrayType arr(batch[0].array());
ArrayData* out_arr = out->mutable_array();
uint64_t* out_begin = out_arr->GetMutableValues<uint64_t>(1);
uint64_t* out_end = out_begin + arr.length();
- std::iota(out_begin, out_end, 0);
+ std::iota(out_begin, out_end, 0);
+
+ ArraySorter<InType> sorter;
+ sorter.impl.Sort(out_begin, out_end, arr, 0, options);
- ArraySorter<InType> sorter;
- sorter.impl.Sort(out_begin, out_end, arr, 0, options);
-
- return Status::OK();
+ return Status::OK();
}
};
// Sort indices kernels implemented for
//
-// * Boolean type
+// * Boolean type
// * Number types
// * Base binary types
template <template <typename...> class ExecTemplate>
void AddSortingKernels(VectorKernel base, VectorFunction* func) {
- // bool type
- base.signature = KernelSignature::Make({InputType::Array(boolean())}, uint64());
- base.exec = ExecTemplate<UInt64Type, BooleanType>::Exec;
- DCHECK_OK(func->AddKernel(base));
-
+ // bool type
+ base.signature = KernelSignature::Make({InputType::Array(boolean())}, uint64());
+ base.exec = ExecTemplate<UInt64Type, BooleanType>::Exec;
+ DCHECK_OK(func->AddKernel(base));
+
for (const auto& ty : NumericTypes()) {
- auto physical_type = GetPhysicalType(ty);
+ auto physical_type = GetPhysicalType(ty);
+ base.signature = KernelSignature::Make({InputType::Array(ty)}, uint64());
+ base.exec = GenerateNumeric<ExecTemplate, UInt64Type>(*physical_type);
+ DCHECK_OK(func->AddKernel(base));
+ }
+ for (const auto& ty : TemporalTypes()) {
+ auto physical_type = GetPhysicalType(ty);
base.signature = KernelSignature::Make({InputType::Array(ty)}, uint64());
- base.exec = GenerateNumeric<ExecTemplate, UInt64Type>(*physical_type);
+ base.exec = GenerateNumeric<ExecTemplate, UInt64Type>(*physical_type);
+ DCHECK_OK(func->AddKernel(base));
+ }
+ for (const auto id : DecimalTypeIds()) {
+ base.signature = KernelSignature::Make({InputType::Array(id)}, uint64());
+ base.exec = GenerateDecimal<ExecTemplate, UInt64Type>(id);
DCHECK_OK(func->AddKernel(base));
}
- for (const auto& ty : TemporalTypes()) {
- auto physical_type = GetPhysicalType(ty);
- base.signature = KernelSignature::Make({InputType::Array(ty)}, uint64());
- base.exec = GenerateNumeric<ExecTemplate, UInt64Type>(*physical_type);
- DCHECK_OK(func->AddKernel(base));
- }
- for (const auto id : DecimalTypeIds()) {
- base.signature = KernelSignature::Make({InputType::Array(id)}, uint64());
- base.exec = GenerateDecimal<ExecTemplate, UInt64Type>(id);
- DCHECK_OK(func->AddKernel(base));
- }
for (const auto& ty : BaseBinaryTypes()) {
- auto physical_type = GetPhysicalType(ty);
+ auto physical_type = GetPhysicalType(ty);
base.signature = KernelSignature::Make({InputType::Array(ty)}, uint64());
- base.exec = GenerateVarBinaryBase<ExecTemplate, UInt64Type>(*physical_type);
+ base.exec = GenerateVarBinaryBase<ExecTemplate, UInt64Type>(*physical_type);
DCHECK_OK(func->AddKernel(base));
}
- base.signature =
- KernelSignature::Make({InputType::Array(Type::FIXED_SIZE_BINARY)}, uint64());
- base.exec = ExecTemplate<UInt64Type, FixedSizeBinaryType>::Exec;
- DCHECK_OK(func->AddKernel(base));
+ base.signature =
+ KernelSignature::Make({InputType::Array(Type::FIXED_SIZE_BINARY)}, uint64());
+ base.exec = ExecTemplate<UInt64Type, FixedSizeBinaryType>::Exec;
+ DCHECK_OK(func->AddKernel(base));
}
-// ----------------------------------------------------------------------
-// ChunkedArray sorting implementations
-
-// Sort a chunked array directly without sorting each array in the
-// chunked array. This is used for processing the second and following
-// sort keys in TableRadixSorter.
-//
-// This uses the same algorithm as ArrayCompareSorter.
-template <typename Type>
-class ChunkedArrayCompareSorter {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
-
- public:
- // Returns where null starts.
- uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end,
- const std::vector<const Array*>& arrays, int64_t null_count,
- const ArraySortOptions& options) {
- auto nulls_begin = PartitionNulls<ArrayType, StablePartitioner>(
- indices_begin, indices_end, arrays, null_count);
- ChunkedArrayResolver resolver(arrays);
- if (options.order == SortOrder::Ascending) {
- std::stable_sort(indices_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
- const auto chunk_left = resolver.Resolve<ArrayType>(left);
- const auto chunk_right = resolver.Resolve<ArrayType>(right);
- return chunk_left.Value() < chunk_right.Value();
- });
- } else {
- std::stable_sort(indices_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
- const auto chunk_left = resolver.Resolve<ArrayType>(left);
- const auto chunk_right = resolver.Resolve<ArrayType>(right);
-        // We use 'right < left' instead of 'left > right' so that only
-        // operator '<' is required.
- return chunk_right.Value() < chunk_left.Value();
- });
- }
- return nulls_begin;
- }
-};
-
-// Sort a chunked array by sorting each array in the chunked array.
-//
-// TODO: This is a naive implementation. Performance can be improved,
-// for example by sorting each array on a separate thread.
-class ChunkedArraySorter : public TypeVisitor {
- public:
- ChunkedArraySorter(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
- const ChunkedArray& chunked_array, const SortOrder order,
- bool can_use_array_sorter = true)
- : TypeVisitor(),
- indices_begin_(indices_begin),
- indices_end_(indices_end),
- chunked_array_(chunked_array),
- physical_type_(GetPhysicalType(chunked_array.type())),
- physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
- order_(order),
- can_use_array_sorter_(can_use_array_sorter),
- ctx_(ctx) {}
-
- Status Sort() { return physical_type_->Accept(this); }
-
-#define VISIT(TYPE) \
- Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
-
- VISIT_PHYSICAL_TYPES(VISIT)
-
-#undef VISIT
-
- private:
- template <typename Type>
- Status SortInternal() {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- ArraySortOptions options(order_);
- const auto num_chunks = chunked_array_.num_chunks();
- if (num_chunks == 0) {
- return Status::OK();
- }
- const auto arrays = GetArrayPointers(physical_chunks_);
- if (can_use_array_sorter_) {
- // Sort each chunk independently and merge to sorted indices.
- // This is a serial implementation.
- ArraySorter<Type> sorter;
- struct SortedChunk {
- int64_t begin_offset;
- int64_t end_offset;
- int64_t nulls_offset;
- };
- std::vector<SortedChunk> sorted(num_chunks);
-
- // First sort all individual chunks
- int64_t begin_offset = 0;
- int64_t end_offset = 0;
- int64_t null_count = 0;
- for (int i = 0; i < num_chunks; ++i) {
- const auto array = checked_cast<const ArrayType*>(arrays[i]);
- end_offset += array->length();
- null_count += array->null_count();
- uint64_t* nulls_begin =
- sorter.impl.Sort(indices_begin_ + begin_offset, indices_begin_ + end_offset,
- *array, begin_offset, options);
- sorted[i] = {begin_offset, end_offset, nulls_begin - indices_begin_};
- begin_offset = end_offset;
- }
- DCHECK_EQ(end_offset, indices_end_ - indices_begin_);
-
- std::unique_ptr<Buffer> temp_buffer;
- uint64_t* temp_indices = nullptr;
- if (sorted.size() > 1) {
- ARROW_ASSIGN_OR_RAISE(
- temp_buffer,
- AllocateBuffer(sizeof(int64_t) * (indices_end_ - indices_begin_ - null_count),
- ctx_->memory_pool()));
- temp_indices = reinterpret_cast<uint64_t*>(temp_buffer->mutable_data());
- }
-
- // Then merge them by pairs, recursively
- while (sorted.size() > 1) {
- auto out_it = sorted.begin();
- auto it = sorted.begin();
- while (it < sorted.end() - 1) {
- const auto& left = *it++;
- const auto& right = *it++;
- DCHECK_EQ(left.end_offset, right.begin_offset);
- DCHECK_GE(left.nulls_offset, left.begin_offset);
- DCHECK_LE(left.nulls_offset, left.end_offset);
- DCHECK_GE(right.nulls_offset, right.begin_offset);
- DCHECK_LE(right.nulls_offset, right.end_offset);
- uint64_t* nulls_begin = Merge<ArrayType>(
- indices_begin_ + left.begin_offset, indices_begin_ + left.end_offset,
- indices_begin_ + right.end_offset, indices_begin_ + left.nulls_offset,
- indices_begin_ + right.nulls_offset, arrays, null_count, order_,
- temp_indices);
- *out_it++ = {left.begin_offset, right.end_offset, nulls_begin - indices_begin_};
- }
- if (it < sorted.end()) {
- *out_it++ = *it++;
- }
- sorted.erase(out_it, sorted.end());
- }
- DCHECK_EQ(sorted.size(), 1);
- DCHECK_EQ(sorted[0].begin_offset, 0);
- DCHECK_EQ(sorted[0].end_offset, chunked_array_.length());
- // Note that "nulls" can also include NaNs, hence the >= check
- DCHECK_GE(chunked_array_.length() - sorted[0].nulls_offset, null_count);
- } else {
-      // Sort the chunked array directly.
- ChunkedArrayCompareSorter<Type> sorter;
- sorter.Sort(indices_begin_, indices_end_, arrays, chunked_array_.null_count(),
- options);
- }
- return Status::OK();
- }
-
-  // Merges two sorted runs of indices and returns where the nulls start.
-  // That null position is used by the next merge to locate the
-  // already-sorted non-null regions.
- template <typename ArrayType>
- uint64_t* Merge(uint64_t* indices_begin, uint64_t* indices_middle,
- uint64_t* indices_end, uint64_t* left_nulls_begin,
- uint64_t* right_nulls_begin, const std::vector<const Array*>& arrays,
- int64_t null_count, const SortOrder order, uint64_t* temp_indices) {
- // Input layout:
- // [left non-nulls .... left nulls .... right non-nulls .... right nulls]
- // ^ ^ ^ ^
- // | | | |
- // indices_begin left_nulls_begin indices_middle right_nulls_begin
- auto left_num_non_nulls = left_nulls_begin - indices_begin;
- auto right_num_non_nulls = right_nulls_begin - indices_middle;
-
- // Mutate the input, stably, to obtain the following layout:
- // [left non-nulls .... right non-nulls .... left nulls .... right nulls]
- // ^ ^ ^ ^
- // | | | |
- // indices_begin indices_middle nulls_begin right_nulls_begin
- std::rotate(left_nulls_begin, indices_middle, right_nulls_begin);
- auto nulls_begin = indices_begin + left_num_non_nulls + right_num_non_nulls;
- // If the type has null-like values (such as NaN), ensure those plus regular
- // nulls are partitioned in the right order. Note this assumes that all
- // null-like values (e.g. NaN) are ordered equally.
- if (NullTraits<typename ArrayType::TypeClass>::has_null_like_values) {
- PartitionNullsOnly<StablePartitioner>(nulls_begin, indices_end, arrays, null_count);
- }
-
- // Merge the non-null values into temp area
- indices_middle = indices_begin + left_num_non_nulls;
- indices_end = indices_middle + right_num_non_nulls;
- const ChunkedArrayResolver left_resolver(arrays);
- const ChunkedArrayResolver right_resolver(arrays);
- if (order == SortOrder::Ascending) {
- std::merge(indices_begin, indices_middle, indices_middle, indices_end, temp_indices,
- [&](uint64_t left, uint64_t right) {
- const auto chunk_left = left_resolver.Resolve<ArrayType>(left);
- const auto chunk_right = right_resolver.Resolve<ArrayType>(right);
- return chunk_left.Value() < chunk_right.Value();
- });
- } else {
- std::merge(indices_begin, indices_middle, indices_middle, indices_end, temp_indices,
- [&](uint64_t left, uint64_t right) {
- const auto chunk_left = left_resolver.Resolve<ArrayType>(left);
- const auto chunk_right = right_resolver.Resolve<ArrayType>(right);
-                   // We use 'right < left' instead of 'left > right' so
-                   // that only operator '<' is required.
- return chunk_right.Value() < chunk_left.Value();
- });
- }
- // Copy back temp area into main buffer
- std::copy(temp_indices, temp_indices + (nulls_begin - indices_begin), indices_begin);
- return nulls_begin;
- }
-
- uint64_t* indices_begin_;
- uint64_t* indices_end_;
- const ChunkedArray& chunked_array_;
- const std::shared_ptr<DataType> physical_type_;
- const ArrayVector physical_chunks_;
- const SortOrder order_;
- const bool can_use_array_sorter_;
- ExecContext* ctx_;
-};
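
// Illustrative sketch (plain STL, not part of the patch) of the std::rotate
// step in Merge() above: two adjacent runs laid out as
// [left values][left nulls][right values][right nulls] become
// [left values][right values][left nulls][right nulls].
#include <algorithm>
#include <cstdint>
#include <vector>

inline void RotateMergeExample() {
  // 0..2 left non-nulls, 3 left null, 4..5 right non-nulls, 6 right null.
  std::vector<uint64_t> idx = {0, 1, 2, 3, 4, 5, 6};
  uint64_t* left_nulls_begin = idx.data() + 3;
  uint64_t* indices_middle = idx.data() + 4;  // start of the right run
  uint64_t* right_nulls_begin = idx.data() + 6;
  std::rotate(left_nulls_begin, indices_middle, right_nulls_begin);
  // idx is now {0, 1, 2, 4, 5, 3, 6}; the non-null runs are adjacent and can
  // be merged with std::merge into a temporary buffer.
}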
-
-// ----------------------------------------------------------------------
-// Record batch sorting implementation(s)
-
-// Visit contiguous ranges of equal values. All entries are assumed
-// to be non-null.
-template <typename ArrayType, typename Visitor>
-void VisitConstantRanges(const ArrayType& array, uint64_t* indices_begin,
- uint64_t* indices_end, Visitor&& visit) {
- using GetView = GetViewType<typename ArrayType::TypeClass>;
-
- if (indices_begin == indices_end) {
- return;
- }
- auto range_start = indices_begin;
- auto range_cur = range_start;
- auto last_value = GetView::LogicalValue(array.GetView(*range_cur));
- while (++range_cur != indices_end) {
- auto v = GetView::LogicalValue(array.GetView(*range_cur));
- if (v != last_value) {
- visit(range_start, range_cur);
- range_start = range_cur;
- last_value = v;
- }
- }
- if (range_start != range_cur) {
- visit(range_start, range_cur);
- }
-}
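
// Sketch of the run-visiting idea above (plain C++, illustrative only):
// invoke a callback once per maximal run of equal values.
#include <cstddef>
#include <vector>

template <typename Visitor>
inline void VisitRuns(const std::vector<int>& sorted, Visitor&& visit) {
  size_t start = 0;
  for (size_t cur = 1; cur <= sorted.size(); ++cur) {
    if (cur == sorted.size() || sorted[cur] != sorted[start]) {
      visit(start, cur);  // half-open range [start, cur) of equal values
      start = cur;
    }
  }
}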
-
-// A sorter for a single column of a RecordBatch, deferring to the next column
-// for ranges of equal values.
-class RecordBatchColumnSorter {
- public:
- explicit RecordBatchColumnSorter(RecordBatchColumnSorter* next_column = nullptr)
- : next_column_(next_column) {}
- virtual ~RecordBatchColumnSorter() {}
-
- virtual void SortRange(uint64_t* indices_begin, uint64_t* indices_end) = 0;
-
- protected:
- RecordBatchColumnSorter* next_column_;
-};
-
-template <typename Type>
-class ConcreteRecordBatchColumnSorter : public RecordBatchColumnSorter {
- public:
- using ArrayType = typename TypeTraits<Type>::ArrayType;
-
- ConcreteRecordBatchColumnSorter(std::shared_ptr<Array> array, SortOrder order,
- RecordBatchColumnSorter* next_column = nullptr)
- : RecordBatchColumnSorter(next_column),
- owned_array_(std::move(array)),
- array_(checked_cast<const ArrayType&>(*owned_array_)),
- order_(order),
- null_count_(array_.null_count()) {}
-
- void SortRange(uint64_t* indices_begin, uint64_t* indices_end) {
- using GetView = GetViewType<Type>;
-
- constexpr int64_t offset = 0;
- uint64_t* nulls_begin;
- if (null_count_ == 0) {
- nulls_begin = indices_end;
- } else {
- // NOTE that null_count_ is merely an upper bound on the number of nulls
- // in this particular range.
- nulls_begin = PartitionNullsOnly<StablePartitioner>(indices_begin, indices_end,
- array_, offset);
- DCHECK_LE(indices_end - nulls_begin, null_count_);
- }
- uint64_t* null_likes_begin = PartitionNullLikes<ArrayType, StablePartitioner>(
- indices_begin, nulls_begin, array_, offset);
-
- // TODO This is roughly the same as ArrayCompareSorter.
- // Also, we would like to use a counting sort if possible. This requires
- // a counting sort compatible with indirect indexing.
- if (order_ == SortOrder::Ascending) {
- std::stable_sort(
- indices_begin, null_likes_begin, [&](uint64_t left, uint64_t right) {
- const auto lhs = GetView::LogicalValue(array_.GetView(left - offset));
- const auto rhs = GetView::LogicalValue(array_.GetView(right - offset));
- return lhs < rhs;
- });
- } else {
- std::stable_sort(
- indices_begin, null_likes_begin, [&](uint64_t left, uint64_t right) {
-          // We use 'rhs < lhs' instead of 'lhs > rhs' so that only
-          // operator '<' is required.
-          const auto lhs = GetView::LogicalValue(array_.GetView(left - offset));
-          const auto rhs = GetView::LogicalValue(array_.GetView(right - offset));
-          return rhs < lhs;
- });
- }
-
- if (next_column_ != nullptr) {
- // Visit all ranges of equal values in this column and sort them on
- // the next column.
- SortNextColumn(null_likes_begin, nulls_begin);
- SortNextColumn(nulls_begin, indices_end);
- VisitConstantRanges(array_, indices_begin, null_likes_begin,
- [&](uint64_t* range_start, uint64_t* range_end) {
- SortNextColumn(range_start, range_end);
- });
- }
- }
-
- void SortNextColumn(uint64_t* indices_begin, uint64_t* indices_end) {
- // Avoid the cost of a virtual method call in trivial cases
- if (indices_end - indices_begin > 1) {
- next_column_->SortRange(indices_begin, indices_end);
- }
- }
-
- protected:
- const std::shared_ptr<Array> owned_array_;
- const ArrayType& array_;
- const SortOrder order_;
- const int64_t null_count_;
-};
-
-// Sort a batch using a single-pass left-to-right radix sort.
-class RadixRecordBatchSorter {
- public:
- RadixRecordBatchSorter(uint64_t* indices_begin, uint64_t* indices_end,
- const RecordBatch& batch, const SortOptions& options)
- : batch_(batch),
- options_(options),
- indices_begin_(indices_begin),
- indices_end_(indices_end) {}
-
- Status Sort() {
- ARROW_ASSIGN_OR_RAISE(const auto sort_keys,
- ResolveSortKeys(batch_, options_.sort_keys));
-
- // Create column sorters from right to left
- std::vector<std::unique_ptr<RecordBatchColumnSorter>> column_sorts(sort_keys.size());
- RecordBatchColumnSorter* next_column = nullptr;
- for (int64_t i = static_cast<int64_t>(sort_keys.size() - 1); i >= 0; --i) {
- ColumnSortFactory factory(sort_keys[i], next_column);
- ARROW_ASSIGN_OR_RAISE(column_sorts[i], factory.MakeColumnSort());
- next_column = column_sorts[i].get();
- }
-
- // Sort from left to right
- column_sorts.front()->SortRange(indices_begin_, indices_end_);
- return Status::OK();
- }
-
- protected:
- struct ResolvedSortKey {
- std::shared_ptr<Array> array;
- SortOrder order;
- };
-
- struct ColumnSortFactory {
- ColumnSortFactory(const ResolvedSortKey& sort_key,
- RecordBatchColumnSorter* next_column)
- : physical_type(GetPhysicalType(sort_key.array->type())),
- array(GetPhysicalArray(*sort_key.array, physical_type)),
- order(sort_key.order),
- next_column(next_column) {}
-
- Result<std::unique_ptr<RecordBatchColumnSorter>> MakeColumnSort() {
- RETURN_NOT_OK(VisitTypeInline(*physical_type, this));
- DCHECK_NE(result, nullptr);
- return std::move(result);
- }
-
-#define VISIT(TYPE) \
- Status Visit(const TYPE& type) { return VisitGeneric(type); }
-
- VISIT_PHYSICAL_TYPES(VISIT)
-
-#undef VISIT
-
- Status Visit(const DataType& type) {
- return Status::TypeError("Unsupported type for RecordBatch sorting: ",
- type.ToString());
- }
-
- template <typename Type>
- Status VisitGeneric(const Type&) {
- result.reset(new ConcreteRecordBatchColumnSorter<Type>(array, order, next_column));
- return Status::OK();
- }
-
- std::shared_ptr<DataType> physical_type;
- std::shared_ptr<Array> array;
- SortOrder order;
- RecordBatchColumnSorter* next_column;
- std::unique_ptr<RecordBatchColumnSorter> result;
- };
-
- static Result<std::vector<ResolvedSortKey>> ResolveSortKeys(
- const RecordBatch& batch, const std::vector<SortKey>& sort_keys) {
- std::vector<ResolvedSortKey> resolved;
- resolved.reserve(sort_keys.size());
- for (const auto& sort_key : sort_keys) {
- auto array = batch.GetColumnByName(sort_key.name);
- if (!array) {
- return Status::Invalid("Nonexistent sort key column: ", sort_key.name);
- }
- resolved.push_back({std::move(array), sort_key.order});
- }
- return resolved;
- }
-
- const RecordBatch& batch_;
- const SortOptions& options_;
- uint64_t* indices_begin_;
- uint64_t* indices_end_;
-};
-
-// Compare two records in the same RecordBatch or Table
-// (indexing is handled through ResolvedSortKey)
-template <typename ResolvedSortKey>
-class MultipleKeyComparator {
- public:
- explicit MultipleKeyComparator(const std::vector<ResolvedSortKey>& sort_keys)
- : sort_keys_(sort_keys) {}
-
- Status status() const { return status_; }
-
- // Returns true if the left-th value should be ordered before the
- // right-th value, false otherwise. The start_sort_key_index-th
- // sort key and subsequent sort keys are used for comparison.
- bool Compare(uint64_t left, uint64_t right, size_t start_sort_key_index) {
- current_left_ = left;
- current_right_ = right;
- current_compared_ = 0;
- auto num_sort_keys = sort_keys_.size();
- for (size_t i = start_sort_key_index; i < num_sort_keys; ++i) {
- current_sort_key_index_ = i;
- status_ = VisitTypeInline(*sort_keys_[i].type, this);
-      // If the left value equals the right value, continue
-      // comparing with the next sort key.
- if (current_compared_ != 0) {
- break;
- }
- }
- return current_compared_ < 0;
- }
-
-#define VISIT(TYPE) \
- Status Visit(const TYPE& type) { \
- current_compared_ = CompareType<TYPE>(); \
- return Status::OK(); \
- }
-
- VISIT_PHYSICAL_TYPES(VISIT)
-
-#undef VISIT
-
- Status Visit(const DataType& type) {
- return Status::TypeError("Unsupported type for RecordBatch sorting: ",
- type.ToString());
- }
-
- private:
- // Compares two records in the same table and returns -1, 0 or 1.
- //
- // -1: The left is less than the right.
-  //  0: The left equals the right.
- // 1: The left is greater than the right.
- //
-  // This supports null and NaN. Null is handled here and NaN is
-  // handled in CompareTypeValue().
- template <typename Type>
- int32_t CompareType() {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- const auto& sort_key = sort_keys_[current_sort_key_index_];
- auto order = sort_key.order;
- const auto chunk_left = sort_key.template GetChunk<ArrayType>(current_left_);
- const auto chunk_right = sort_key.template GetChunk<ArrayType>(current_right_);
- if (sort_key.null_count > 0) {
- auto is_null_left = chunk_left.IsNull();
- auto is_null_right = chunk_right.IsNull();
- if (is_null_left && is_null_right) {
- return 0;
- } else if (is_null_left) {
- return 1;
- } else if (is_null_right) {
- return -1;
- }
- }
- return CompareTypeValue<Type>(chunk_left, chunk_right, order);
- }
-
- // For non-float types. Value is never NaN.
- template <typename Type>
- enable_if_t<!is_floating_type<Type>::value, int32_t> CompareTypeValue(
- const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_left,
- const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_right,
- const SortOrder order) {
- const auto left = chunk_left.Value();
- const auto right = chunk_right.Value();
- int32_t compared;
- if (left == right) {
- compared = 0;
- } else if (left > right) {
- compared = 1;
- } else {
- compared = -1;
- }
- if (order == SortOrder::Descending) {
- compared = -compared;
- }
- return compared;
- }
-
- // For float types. Value may be NaN.
- template <typename Type>
- enable_if_t<is_floating_type<Type>::value, int32_t> CompareTypeValue(
- const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_left,
- const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_right,
- const SortOrder order) {
- const auto left = chunk_left.Value();
- const auto right = chunk_right.Value();
- auto is_nan_left = std::isnan(left);
- auto is_nan_right = std::isnan(right);
- if (is_nan_left && is_nan_right) {
- return 0;
- } else if (is_nan_left) {
- return 1;
- } else if (is_nan_right) {
- return -1;
- }
- int32_t compared;
- if (left == right) {
- compared = 0;
- } else if (left > right) {
- compared = 1;
- } else {
- compared = -1;
- }
- if (order == SortOrder::Descending) {
- compared = -compared;
- }
- return compared;
- }
-
- const std::vector<ResolvedSortKey>& sort_keys_;
- Status status_;
- int64_t current_left_;
- int64_t current_right_;
- size_t current_sort_key_index_;
- int32_t current_compared_;
-};
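
// Sketch of the three-way floating-point comparison CompareTypeValue()
// implements (plain C++, illustrative only): NaN orders after every other
// non-null value, and only the value comparison is flipped for descending.
#include <cmath>
#include <cstdint>

inline int32_t CompareFloat(double left, double right, bool descending) {
  const bool nan_left = std::isnan(left);
  const bool nan_right = std::isnan(right);
  if (nan_left && nan_right) return 0;
  if (nan_left) return 1;  // NaN is "greater", regardless of sort order
  if (nan_right) return -1;
  const int32_t compared = (left == right) ? 0 : (left > right ? 1 : -1);
  return descending ? -compared : compared;
}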
-
-// Sort a batch using a single sort and multiple-key comparisons.
-class MultipleKeyRecordBatchSorter : public TypeVisitor {
- private:
- // Preprocessed sort key.
- struct ResolvedSortKey {
- ResolvedSortKey(const std::shared_ptr<Array>& array, const SortOrder order)
- : type(GetPhysicalType(array->type())),
- owned_array(GetPhysicalArray(*array, type)),
- array(*owned_array),
- order(order),
- null_count(array->null_count()) {}
-
- template <typename ArrayType>
- ResolvedChunk<ArrayType> GetChunk(int64_t index) const {
- return {&checked_cast<const ArrayType&>(array), index};
- }
-
- const std::shared_ptr<DataType> type;
- std::shared_ptr<Array> owned_array;
- const Array& array;
- SortOrder order;
- int64_t null_count;
- };
-
- using Comparator = MultipleKeyComparator<ResolvedSortKey>;
-
- public:
- MultipleKeyRecordBatchSorter(uint64_t* indices_begin, uint64_t* indices_end,
- const RecordBatch& batch, const SortOptions& options)
- : indices_begin_(indices_begin),
- indices_end_(indices_end),
- sort_keys_(ResolveSortKeys(batch, options.sort_keys, &status_)),
- comparator_(sort_keys_) {}
-
-  // This is optimized for the first sort key: the first sort key is
-  // sorted by this class, while the second and following sort keys are
-  // handled by Comparator.
- Status Sort() {
- RETURN_NOT_OK(status_);
- return sort_keys_[0].type->Accept(this);
- }
-
-#define VISIT(TYPE) \
- Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
-
- VISIT_PHYSICAL_TYPES(VISIT)
-
-#undef VISIT
-
- private:
- static std::vector<ResolvedSortKey> ResolveSortKeys(
- const RecordBatch& batch, const std::vector<SortKey>& sort_keys, Status* status) {
- std::vector<ResolvedSortKey> resolved;
- for (const auto& sort_key : sort_keys) {
- auto array = batch.GetColumnByName(sort_key.name);
- if (!array) {
- *status = Status::Invalid("Nonexistent sort key column: ", sort_key.name);
- break;
- }
- resolved.emplace_back(array, sort_key.order);
- }
- return resolved;
- }
-
- template <typename Type>
- Status SortInternal() {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
-
- auto& comparator = comparator_;
- const auto& first_sort_key = sort_keys_[0];
- const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
- auto nulls_begin = indices_end_;
- nulls_begin = PartitionNullsInternal<Type>(first_sort_key);
- // Sort first-key non-nulls
- std::stable_sort(indices_begin_, nulls_begin, [&](uint64_t left, uint64_t right) {
- // Both values are never null nor NaN
- // (otherwise they've been partitioned away above).
- const auto value_left = array.GetView(left);
- const auto value_right = array.GetView(right);
- if (value_left != value_right) {
- bool compared = value_left < value_right;
- if (first_sort_key.order == SortOrder::Ascending) {
- return compared;
- } else {
- return !compared;
- }
- }
-      // If the left value equals the right value,
-      // we need to compare the second and following
-      // sort keys.
- return comparator.Compare(left, right, 1);
- });
- return comparator_.status();
- }
-
-  // Behaves like PartitionNulls() but supports multiple sort keys.
- //
- // For non-float types.
- template <typename Type>
- enable_if_t<!is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
- const ResolvedSortKey& first_sort_key) {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- if (first_sort_key.null_count == 0) {
- return indices_end_;
- }
- const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
- StablePartitioner partitioner;
- auto nulls_begin = partitioner(indices_begin_, indices_end_,
- [&](uint64_t index) { return !array.IsNull(index); });
- // Sort all nulls by second and following sort keys
- // TODO: could we instead run an independent sort from the second key on
- // this slice?
- if (nulls_begin != indices_end_) {
- auto& comparator = comparator_;
- std::stable_sort(nulls_begin, indices_end_,
- [&comparator](uint64_t left, uint64_t right) {
- return comparator.Compare(left, right, 1);
- });
- }
- return nulls_begin;
- }
-
-  // Behaves like PartitionNulls() but supports multiple sort keys.
- //
- // For float types.
- template <typename Type>
- enable_if_t<is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
- const ResolvedSortKey& first_sort_key) {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
- StablePartitioner partitioner;
- uint64_t* nulls_begin;
- if (first_sort_key.null_count == 0) {
- nulls_begin = indices_end_;
- } else {
- nulls_begin = partitioner(indices_begin_, indices_end_,
- [&](uint64_t index) { return !array.IsNull(index); });
- }
- uint64_t* nans_and_nulls_begin =
- partitioner(indices_begin_, nulls_begin,
- [&](uint64_t index) { return !std::isnan(array.GetView(index)); });
- auto& comparator = comparator_;
- if (nans_and_nulls_begin != nulls_begin) {
- // Sort all NaNs by the second and following sort keys.
- // TODO: could we instead run an independent sort from the second key on
- // this slice?
- std::stable_sort(nans_and_nulls_begin, nulls_begin,
- [&comparator](uint64_t left, uint64_t right) {
- return comparator.Compare(left, right, 1);
- });
- }
- if (nulls_begin != indices_end_) {
- // Sort all nulls by the second and following sort keys.
- // TODO: could we instead run an independent sort from the second key on
- // this slice?
- std::stable_sort(nulls_begin, indices_end_,
- [&comparator](uint64_t left, uint64_t right) {
- return comparator.Compare(left, right, 1);
- });
- }
- return nans_and_nulls_begin;
- }
-
- uint64_t* indices_begin_;
- uint64_t* indices_end_;
- Status status_;
- std::vector<ResolvedSortKey> sort_keys_;
- Comparator comparator_;
-};
-
-// ----------------------------------------------------------------------
-// Table sorting implementations
-
-// Sort a table using a radix sort-like algorithm.
-// A distinct stable sort is called for each sort key, from the last key to the first.
-class TableRadixSorter {
- public:
- Status Sort(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
- const Table& table, const SortOptions& options) {
- for (auto i = options.sort_keys.size(); i > 0; --i) {
- const auto& sort_key = options.sort_keys[i - 1];
- const auto& chunked_array = table.GetColumnByName(sort_key.name);
- if (!chunked_array) {
- return Status::Invalid("Nonexistent sort key column: ", sort_key.name);
- }
- // We can use ArraySorter only for the sort key that is
- // processed first because ArraySorter doesn't care about
- // existing indices.
-      const auto can_use_array_sorter = (i == options.sort_keys.size());
- ChunkedArraySorter sorter(ctx, indices_begin, indices_end, *chunked_array.get(),
- sort_key.order, can_use_array_sorter);
- ARROW_RETURN_NOT_OK(sorter.Sort());
- }
- return Status::OK();
- }
-};
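
// Sketch of the last-key-to-first-key idea above (plain STL, not part of the
// patch): one stable sort per key, applied from the least significant key to
// the most significant one, yields the full lexicographic order.
#include <algorithm>
#include <string>
#include <utility>
#include <vector>

inline void RadixLikeSortExample() {
  std::vector<std::pair<std::string, int>> rows = {
      {"b", 1}, {"a", 2}, {"b", 0}, {"a", 1}};
  // Second (last) key first...
  std::stable_sort(rows.begin(), rows.end(),
                   [](const auto& l, const auto& r) { return l.second < r.second; });
  // ...then the first key; stability keeps the second-key order within
  // equal first keys: (a,1) (a,2) (b,0) (b,1).
  std::stable_sort(rows.begin(), rows.end(),
                   [](const auto& l, const auto& r) { return l.first < r.first; });
}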
-
-// Sort a table using a single sort and multiple-key comparisons.
-class MultipleKeyTableSorter : public TypeVisitor {
- private:
- // TODO instead of resolving chunks for each column independently, we could
- // split the table into RecordBatches and pay the cost of chunked indexing
- // at the first column only.
-
- // Preprocessed sort key.
- struct ResolvedSortKey {
- ResolvedSortKey(const ChunkedArray& chunked_array, const SortOrder order)
- : order(order),
- type(GetPhysicalType(chunked_array.type())),
- chunks(GetPhysicalChunks(chunked_array, type)),
- chunk_pointers(GetArrayPointers(chunks)),
- null_count(chunked_array.null_count()),
- num_chunks(chunked_array.num_chunks()),
- resolver(chunk_pointers) {}
-
- // Finds the target chunk and index in the target chunk from an
- // index in chunked array.
- template <typename ArrayType>
- ResolvedChunk<ArrayType> GetChunk(int64_t index) const {
- return resolver.Resolve<ArrayType>(index);
- }
-
- const SortOrder order;
- const std::shared_ptr<DataType> type;
- const ArrayVector chunks;
- const std::vector<const Array*> chunk_pointers;
- const int64_t null_count;
- const int num_chunks;
- const ChunkedArrayResolver resolver;
- };
-
- using Comparator = MultipleKeyComparator<ResolvedSortKey>;
-
- public:
- MultipleKeyTableSorter(uint64_t* indices_begin, uint64_t* indices_end,
- const Table& table, const SortOptions& options)
- : indices_begin_(indices_begin),
- indices_end_(indices_end),
- sort_keys_(ResolveSortKeys(table, options.sort_keys, &status_)),
- comparator_(sort_keys_) {}
-
-  // This is optimized for the first sort key: the first sort key is
-  // sorted by this class, while the second and following sort keys are
-  // handled by Comparator.
- Status Sort() {
- ARROW_RETURN_NOT_OK(status_);
- return sort_keys_[0].type->Accept(this);
- }
-
-#define VISIT(TYPE) \
- Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
-
- VISIT_PHYSICAL_TYPES(VISIT)
-
-#undef VISIT
-
- private:
- static std::vector<ResolvedSortKey> ResolveSortKeys(
- const Table& table, const std::vector<SortKey>& sort_keys, Status* status) {
- std::vector<ResolvedSortKey> resolved;
- resolved.reserve(sort_keys.size());
- for (const auto& sort_key : sort_keys) {
- const auto& chunked_array = table.GetColumnByName(sort_key.name);
- if (!chunked_array) {
- *status = Status::Invalid("Nonexistent sort key column: ", sort_key.name);
- break;
- }
- resolved.emplace_back(*chunked_array, sort_key.order);
- }
- return resolved;
- }
-
- template <typename Type>
- Status SortInternal() {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
-
- auto& comparator = comparator_;
- const auto& first_sort_key = sort_keys_[0];
- auto nulls_begin = indices_end_;
- nulls_begin = PartitionNullsInternal<Type>(first_sort_key);
- std::stable_sort(indices_begin_, nulls_begin, [&](uint64_t left, uint64_t right) {
- // Both values are never null nor NaN.
- auto chunk_left = first_sort_key.GetChunk<ArrayType>(left);
- auto chunk_right = first_sort_key.GetChunk<ArrayType>(right);
- auto value_left = chunk_left.Value();
- auto value_right = chunk_right.Value();
- if (value_left == value_right) {
-          // If the left value equals the right value,
-          // we need to compare the second and following
-          // sort keys.
- return comparator.Compare(left, right, 1);
- } else {
- auto compared = value_left < value_right;
- if (first_sort_key.order == SortOrder::Ascending) {
- return compared;
- } else {
- return !compared;
- }
- }
- });
- return comparator_.status();
- }
-
-  // Behaves like PartitionNulls() but supports multiple sort keys.
- //
- // For non-float types.
- template <typename Type>
- enable_if_t<!is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
- const ResolvedSortKey& first_sort_key) {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- if (first_sort_key.null_count == 0) {
- return indices_end_;
- }
- StablePartitioner partitioner;
- auto nulls_begin =
- partitioner(indices_begin_, indices_end_, [&first_sort_key](uint64_t index) {
- const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
- return !chunk.IsNull();
- });
- DCHECK_EQ(indices_end_ - nulls_begin, first_sort_key.null_count);
- auto& comparator = comparator_;
- std::stable_sort(nulls_begin, indices_end_, [&](uint64_t left, uint64_t right) {
- return comparator.Compare(left, right, 1);
- });
- return nulls_begin;
- }
-
-  // Behaves like PartitionNulls() but supports multiple sort keys.
- //
- // For float types.
- template <typename Type>
- enable_if_t<is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
- const ResolvedSortKey& first_sort_key) {
- using ArrayType = typename TypeTraits<Type>::ArrayType;
- StablePartitioner partitioner;
- uint64_t* nulls_begin;
- if (first_sort_key.null_count == 0) {
- nulls_begin = indices_end_;
- } else {
- nulls_begin = partitioner(indices_begin_, indices_end_, [&](uint64_t index) {
- const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
- return !chunk.IsNull();
- });
- }
- DCHECK_EQ(indices_end_ - nulls_begin, first_sort_key.null_count);
- uint64_t* nans_begin = partitioner(indices_begin_, nulls_begin, [&](uint64_t index) {
- const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
- return !std::isnan(chunk.Value());
- });
- auto& comparator = comparator_;
- // Sort all NaNs by the second and following sort keys.
- std::stable_sort(nans_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
- return comparator.Compare(left, right, 1);
- });
- // Sort all nulls by the second and following sort keys.
- std::stable_sort(nulls_begin, indices_end_, [&](uint64_t left, uint64_t right) {
- return comparator.Compare(left, right, 1);
- });
- return nans_begin;
- }
-
- uint64_t* indices_begin_;
- uint64_t* indices_end_;
- Status status_;
- std::vector<ResolvedSortKey> sort_keys_;
- Comparator comparator_;
-};
-
-// ----------------------------------------------------------------------
-// Top-level sort functions
-
-const auto kDefaultSortOptions = SortOptions::Defaults();
-
-const FunctionDoc sort_indices_doc(
- "Return the indices that would sort an array, record batch or table",
- ("This function computes an array of indices that define a stable sort\n"
- "of the input array, record batch or table. Null values are considered\n"
- "greater than any other value and are therefore sorted at the end of the\n"
- "input. For floating-point types, NaNs are considered greater than any\n"
- "other non-null value, but smaller than null values."),
- {"input"}, "SortOptions");
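
// Usage sketch for the function documented above via Arrow's public compute
// API (illustrative; assumes arrow::compute::SortIndices from api_vector.h).
#include <arrow/array.h>
#include <arrow/compute/api_vector.h>
#include <arrow/result.h>
#include <memory>

inline arrow::Result<std::shared_ptr<arrow::Array>> SortIndicesExample(
    const arrow::Array& values) {
  // Nulls (and, for floating point, NaNs) come last, as described above.
  return arrow::compute::SortIndices(values, arrow::compute::SortOrder::Ascending);
}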
-
-class SortIndicesMetaFunction : public MetaFunction {
- public:
- SortIndicesMetaFunction()
- : MetaFunction("sort_indices", Arity::Unary(), &sort_indices_doc,
- &kDefaultSortOptions) {}
-
- Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
- const FunctionOptions* options,
- ExecContext* ctx) const override {
- const SortOptions& sort_options = static_cast<const SortOptions&>(*options);
- switch (args[0].kind()) {
- case Datum::ARRAY:
- return SortIndices(*args[0].make_array(), sort_options, ctx);
- break;
- case Datum::CHUNKED_ARRAY:
- return SortIndices(*args[0].chunked_array(), sort_options, ctx);
- break;
- case Datum::RECORD_BATCH: {
- return SortIndices(*args[0].record_batch(), sort_options, ctx);
- } break;
- case Datum::TABLE:
- return SortIndices(*args[0].table(), sort_options, ctx);
- break;
- default:
- break;
- }
- return Status::NotImplemented(
- "Unsupported types for sort_indices operation: "
- "values=",
- args[0].ToString());
- }
-
- private:
- Result<Datum> SortIndices(const Array& values, const SortOptions& options,
- ExecContext* ctx) const {
- SortOrder order = SortOrder::Ascending;
- if (!options.sort_keys.empty()) {
- order = options.sort_keys[0].order;
- }
- ArraySortOptions array_options(order);
- return CallFunction("array_sort_indices", {values}, &array_options, ctx);
- }
-
- Result<Datum> SortIndices(const ChunkedArray& chunked_array, const SortOptions& options,
- ExecContext* ctx) const {
- SortOrder order = SortOrder::Ascending;
- if (!options.sort_keys.empty()) {
- order = options.sort_keys[0].order;
- }
-
- auto out_type = uint64();
- auto length = chunked_array.length();
- auto buffer_size = BitUtil::BytesForBits(
- length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
- std::vector<std::shared_ptr<Buffer>> buffers(2);
- ARROW_ASSIGN_OR_RAISE(buffers[1],
- AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
- auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
- auto out_begin = out->GetMutableValues<uint64_t>(1);
- auto out_end = out_begin + length;
- std::iota(out_begin, out_end, 0);
-
- ChunkedArraySorter sorter(ctx, out_begin, out_end, chunked_array, order);
- ARROW_RETURN_NOT_OK(sorter.Sort());
- return Datum(out);
- }
-
- Result<Datum> SortIndices(const RecordBatch& batch, const SortOptions& options,
- ExecContext* ctx) const {
- auto n_sort_keys = options.sort_keys.size();
- if (n_sort_keys == 0) {
- return Status::Invalid("Must specify one or more sort keys");
- }
- if (n_sort_keys == 1) {
- auto array = batch.GetColumnByName(options.sort_keys[0].name);
- if (!array) {
- return Status::Invalid("Nonexistent sort key column: ",
- options.sort_keys[0].name);
- }
- return SortIndices(*array, options, ctx);
- }
-
- auto out_type = uint64();
- auto length = batch.num_rows();
- auto buffer_size = BitUtil::BytesForBits(
- length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
- BufferVector buffers(2);
- ARROW_ASSIGN_OR_RAISE(buffers[1],
- AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
- auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
- auto out_begin = out->GetMutableValues<uint64_t>(1);
- auto out_end = out_begin + length;
- std::iota(out_begin, out_end, 0);
-
- // Radix sorting is consistently faster except when there is a large number
- // of sort keys, in which case it can end up degrading catastrophically.
- // Cut off above 8 sort keys.
- if (n_sort_keys <= 8) {
- RadixRecordBatchSorter sorter(out_begin, out_end, batch, options);
- ARROW_RETURN_NOT_OK(sorter.Sort());
- } else {
- MultipleKeyRecordBatchSorter sorter(out_begin, out_end, batch, options);
- ARROW_RETURN_NOT_OK(sorter.Sort());
- }
- return Datum(out);
- }
-
- Result<Datum> SortIndices(const Table& table, const SortOptions& options,
- ExecContext* ctx) const {
- auto n_sort_keys = options.sort_keys.size();
- if (n_sort_keys == 0) {
- return Status::Invalid("Must specify one or more sort keys");
- }
- if (n_sort_keys == 1) {
- auto chunked_array = table.GetColumnByName(options.sort_keys[0].name);
- if (!chunked_array) {
- return Status::Invalid("Nonexistent sort key column: ",
- options.sort_keys[0].name);
- }
- return SortIndices(*chunked_array, options, ctx);
- }
-
- auto out_type = uint64();
- auto length = table.num_rows();
- auto buffer_size = BitUtil::BytesForBits(
- length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
- std::vector<std::shared_ptr<Buffer>> buffers(2);
- ARROW_ASSIGN_OR_RAISE(buffers[1],
- AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
- auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
- auto out_begin = out->GetMutableValues<uint64_t>(1);
- auto out_end = out_begin + length;
- std::iota(out_begin, out_end, 0);
-
- // TODO: We should choose a suitable sort implementation
- // automatically. The current TableRadixSorter implementation is
- // faster than MultipleKeyTableSorter only when the number of
- // sort keys is 2 and counting sort is used. So we always use
- // MultipleKeyTableSorter for now.
- //
- // TableRadixSorter sorter;
- // ARROW_RETURN_NOT_OK(sorter.Sort(ctx, out_begin, out_end, table, options));
- MultipleKeyTableSorter sorter(out_begin, out_end, table, options);
- ARROW_RETURN_NOT_OK(sorter.Sort());
- return Datum(out);
- }
-};
-
-const auto kDefaultArraySortOptions = ArraySortOptions::Defaults();
-
-const FunctionDoc array_sort_indices_doc(
- "Return the indices that would sort an array",
- ("This function computes an array of indices that define a stable sort\n"
- "of the input array. Null values are considered greater than any\n"
- "other value and are therefore sorted at the end of the array.\n"
- "For floating-point types, NaNs are considered greater than any\n"
- "other non-null value, but smaller than null values."),
- {"array"}, "ArraySortOptions");
-
-const FunctionDoc partition_nth_indices_doc(
- "Return the indices that would partition an array around a pivot",
- ("This functions computes an array of indices that define a non-stable\n"
- "partial sort of the input array.\n"
- "\n"
- "The output is such that the `N`'th index points to the `N`'th element\n"
- "of the input in sorted order, and all indices before the `N`'th point\n"
- "to elements in the input less or equal to elements at or after the `N`'th.\n"
- "\n"
- "Null values are considered greater than any other value and are\n"
- "therefore partitioned towards the end of the array.\n"
- "For floating-point types, NaNs are considered greater than any\n"
- "other non-null value, but smaller than null values.\n"
- "\n"
- "The pivot index `N` must be given in PartitionNthOptions."),
- {"array"}, "PartitionNthOptions");
-
-} // namespace
-
+// ----------------------------------------------------------------------
+// ChunkedArray sorting implementations
+
+// Sort a chunked array directly without sorting each array in the
+// chunked array. This is used for processing the second and following
+// sort keys in TableRadixSorter.
+//
+// This uses the same algorithm as ArrayCompareSorter.
+template <typename Type>
+class ChunkedArrayCompareSorter {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+ public:
+ // Returns where nulls start.
+ uint64_t* Sort(uint64_t* indices_begin, uint64_t* indices_end,
+ const std::vector<const Array*>& arrays, int64_t null_count,
+ const ArraySortOptions& options) {
+ auto nulls_begin = PartitionNulls<ArrayType, StablePartitioner>(
+ indices_begin, indices_end, arrays, null_count);
+ ChunkedArrayResolver resolver(arrays);
+ if (options.order == SortOrder::Ascending) {
+ std::stable_sort(indices_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
+ const auto chunk_left = resolver.Resolve<ArrayType>(left);
+ const auto chunk_right = resolver.Resolve<ArrayType>(right);
+ return chunk_left.Value() < chunk_right.Value();
+ });
+ } else {
+ std::stable_sort(indices_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
+ const auto chunk_left = resolver.Resolve<ArrayType>(left);
+ const auto chunk_right = resolver.Resolve<ArrayType>(right);
+ // We use 'chunk_right < chunk_left' rather than 'left > right' so
+ // that the value type only needs to provide operator<.
+ return chunk_right.Value() < chunk_left.Value();
+ });
+ }
+ return nulls_begin;
+ }
+};
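
The ChunkedArrayResolver used above maps a logical row index across chunk boundaries to a (chunk, offset) pair before any value comparison. The following is a minimal, self-contained sketch of that resolution step over plain chunk lengths instead of Arrow arrays; all names here are illustrative, not the actual resolver API:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Illustrative stand-in for chunk resolution: binary-search the
    // cumulative chunk offsets for the last offset <= index.
    struct ChunkLocation {
      int chunk_index;
      int64_t index_in_chunk;
    };

    ChunkLocation Resolve(const std::vector<int64_t>& chunk_lengths, int64_t index) {
      std::vector<int64_t> offsets{0};  // offsets[i] = first logical index of chunk i
      for (int64_t len : chunk_lengths) offsets.push_back(offsets.back() + len);
      auto it = std::upper_bound(offsets.begin(), offsets.end(), index) - 1;
      return {static_cast<int>(it - offsets.begin()), index - *it};
    }
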
+
+// Sort a chunked array by sorting each array in the chunked array.
+//
+// TODO: This is a naive implementation. Performance can be improved,
+// for example by sorting the individual arrays on multiple threads.
+class ChunkedArraySorter : public TypeVisitor {
+ public:
+ ChunkedArraySorter(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+ const ChunkedArray& chunked_array, const SortOrder order,
+ bool can_use_array_sorter = true)
+ : TypeVisitor(),
+ indices_begin_(indices_begin),
+ indices_end_(indices_end),
+ chunked_array_(chunked_array),
+ physical_type_(GetPhysicalType(chunked_array.type())),
+ physical_chunks_(GetPhysicalChunks(chunked_array_, physical_type_)),
+ order_(order),
+ can_use_array_sorter_(can_use_array_sorter),
+ ctx_(ctx) {}
+
+ Status Sort() { return physical_type_->Accept(this); }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ private:
+ template <typename Type>
+ Status SortInternal() {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ ArraySortOptions options(order_);
+ const auto num_chunks = chunked_array_.num_chunks();
+ if (num_chunks == 0) {
+ return Status::OK();
+ }
+ const auto arrays = GetArrayPointers(physical_chunks_);
+ if (can_use_array_sorter_) {
+ // Sort each chunk independently and merge to sorted indices.
+ // This is a serial implementation.
+ ArraySorter<Type> sorter;
+ struct SortedChunk {
+ int64_t begin_offset;
+ int64_t end_offset;
+ int64_t nulls_offset;
+ };
+ std::vector<SortedChunk> sorted(num_chunks);
+
+ // First sort all individual chunks
+ int64_t begin_offset = 0;
+ int64_t end_offset = 0;
+ int64_t null_count = 0;
+ for (int i = 0; i < num_chunks; ++i) {
+ const auto array = checked_cast<const ArrayType*>(arrays[i]);
+ end_offset += array->length();
+ null_count += array->null_count();
+ uint64_t* nulls_begin =
+ sorter.impl.Sort(indices_begin_ + begin_offset, indices_begin_ + end_offset,
+ *array, begin_offset, options);
+ sorted[i] = {begin_offset, end_offset, nulls_begin - indices_begin_};
+ begin_offset = end_offset;
+ }
+ DCHECK_EQ(end_offset, indices_end_ - indices_begin_);
+
+ std::unique_ptr<Buffer> temp_buffer;
+ uint64_t* temp_indices = nullptr;
+ if (sorted.size() > 1) {
+ ARROW_ASSIGN_OR_RAISE(
+ temp_buffer,
+ AllocateBuffer(sizeof(int64_t) * (indices_end_ - indices_begin_ - null_count),
+ ctx_->memory_pool()));
+ temp_indices = reinterpret_cast<uint64_t*>(temp_buffer->mutable_data());
+ }
+
+ // Then merge them by pairs, recursively
+ while (sorted.size() > 1) {
+ auto out_it = sorted.begin();
+ auto it = sorted.begin();
+ while (it < sorted.end() - 1) {
+ const auto& left = *it++;
+ const auto& right = *it++;
+ DCHECK_EQ(left.end_offset, right.begin_offset);
+ DCHECK_GE(left.nulls_offset, left.begin_offset);
+ DCHECK_LE(left.nulls_offset, left.end_offset);
+ DCHECK_GE(right.nulls_offset, right.begin_offset);
+ DCHECK_LE(right.nulls_offset, right.end_offset);
+ uint64_t* nulls_begin = Merge<ArrayType>(
+ indices_begin_ + left.begin_offset, indices_begin_ + left.end_offset,
+ indices_begin_ + right.end_offset, indices_begin_ + left.nulls_offset,
+ indices_begin_ + right.nulls_offset, arrays, null_count, order_,
+ temp_indices);
+ *out_it++ = {left.begin_offset, right.end_offset, nulls_begin - indices_begin_};
+ }
+ if (it < sorted.end()) {
+ *out_it++ = *it++;
+ }
+ sorted.erase(out_it, sorted.end());
+ }
+ DCHECK_EQ(sorted.size(), 1);
+ DCHECK_EQ(sorted[0].begin_offset, 0);
+ DCHECK_EQ(sorted[0].end_offset, chunked_array_.length());
+ // Note that "nulls" can also include NaNs, hence the >= check
+ DCHECK_GE(chunked_array_.length() - sorted[0].nulls_offset, null_count);
+ } else {
+ // Sort the chunked array directly.
+ ChunkedArrayCompareSorter<Type> sorter;
+ sorter.Sort(indices_begin_, indices_end_, arrays, chunked_array_.null_count(),
+ options);
+ }
+ return Status::OK();
+ }
+
+ // Merges two sorted runs of indices and returns where nulls start.
+ // The returned position lets the next merge locate the already-sorted
+ // non-null and null regions.
+ template <typename ArrayType>
+ uint64_t* Merge(uint64_t* indices_begin, uint64_t* indices_middle,
+ uint64_t* indices_end, uint64_t* left_nulls_begin,
+ uint64_t* right_nulls_begin, const std::vector<const Array*>& arrays,
+ int64_t null_count, const SortOrder order, uint64_t* temp_indices) {
+ // Input layout:
+ // [left non-nulls .... left nulls .... right non-nulls .... right nulls]
+ // ^ ^ ^ ^
+ // | | | |
+ // indices_begin left_nulls_begin indices_middle right_nulls_begin
+ auto left_num_non_nulls = left_nulls_begin - indices_begin;
+ auto right_num_non_nulls = right_nulls_begin - indices_middle;
+
+ // Mutate the input, stably, to obtain the following layout:
+ // [left non-nulls .... right non-nulls .... left nulls .... right nulls]
+ // ^ ^ ^ ^
+ // | | | |
+ // indices_begin indices_middle nulls_begin right_nulls_begin
+ std::rotate(left_nulls_begin, indices_middle, right_nulls_begin);
+ auto nulls_begin = indices_begin + left_num_non_nulls + right_num_non_nulls;
+ // If the type has null-like values (such as NaN), ensure those plus regular
+ // nulls are partitioned in the right order. Note this assumes that all
+ // null-like values (e.g. NaN) are ordered equally.
+ if (NullTraits<typename ArrayType::TypeClass>::has_null_like_values) {
+ PartitionNullsOnly<StablePartitioner>(nulls_begin, indices_end, arrays, null_count);
+ }
+
+ // Merge the non-null values into temp area
+ indices_middle = indices_begin + left_num_non_nulls;
+ indices_end = indices_middle + right_num_non_nulls;
+ const ChunkedArrayResolver left_resolver(arrays);
+ const ChunkedArrayResolver right_resolver(arrays);
+ if (order == SortOrder::Ascending) {
+ std::merge(indices_begin, indices_middle, indices_middle, indices_end, temp_indices,
+ [&](uint64_t left, uint64_t right) {
+ const auto chunk_left = left_resolver.Resolve<ArrayType>(left);
+ const auto chunk_right = right_resolver.Resolve<ArrayType>(right);
+ return chunk_left.Value() < chunk_right.Value();
+ });
+ } else {
+ std::merge(indices_begin, indices_middle, indices_middle, indices_end, temp_indices,
+ [&](uint64_t left, uint64_t right) {
+ const auto chunk_left = left_resolver.Resolve<ArrayType>(left);
+ const auto chunk_right = right_resolver.Resolve<ArrayType>(right);
+ // We use 'chunk_right < chunk_left' rather than
+ // 'left > right' so that the value type only needs
+ // to provide operator<.
+ return chunk_right.Value() < chunk_left.Value();
+ });
+ }
+ // Copy back temp area into main buffer
+ std::copy(temp_indices, temp_indices + (nulls_begin - indices_begin), indices_begin);
+ return nulls_begin;
+ }
+
+ uint64_t* indices_begin_;
+ uint64_t* indices_end_;
+ const ChunkedArray& chunked_array_;
+ const std::shared_ptr<DataType> physical_type_;
+ const ArrayVector physical_chunks_;
+ const SortOrder order_;
+ const bool can_use_array_sorter_;
+ ExecContext* ctx_;
+};
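
The pairwise merge above hinges on a std::rotate/std::merge pair: rotating the left run's nulls past the right run's non-nulls restores a [non-nulls | nulls] layout, after which only the non-null prefixes need merging through a temporary buffer. A sketch on plain integers used as their own sort keys (the real Merge() compares resolved array values):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Merge two adjacent runs of indices, each laid out as [non-nulls... nulls...].
    // Positions are offsets into idx; temp must hold at least the merged
    // non-null count.
    void MergeRuns(std::vector<uint64_t>& idx, int64_t begin, int64_t left_nulls,
                   int64_t middle, int64_t right_nulls, std::vector<uint64_t>& temp) {
      // [L values | L nulls | R values | R nulls] -> [L values | R values | nulls]
      std::rotate(idx.begin() + left_nulls, idx.begin() + middle,
                  idx.begin() + right_nulls);
      const int64_t left_n = left_nulls - begin;
      const int64_t right_n = right_nulls - middle;
      std::merge(idx.begin() + begin, idx.begin() + begin + left_n,
                 idx.begin() + begin + left_n,
                 idx.begin() + begin + left_n + right_n, temp.begin());
      std::copy(temp.begin(), temp.begin() + left_n + right_n, idx.begin() + begin);
    }
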
+
+// ----------------------------------------------------------------------
+// Record batch sorting implementation(s)
+
+// Visit contiguous ranges of equal values. All entries are assumed
+// to be non-null.
+template <typename ArrayType, typename Visitor>
+void VisitConstantRanges(const ArrayType& array, uint64_t* indices_begin,
+ uint64_t* indices_end, Visitor&& visit) {
+ using GetView = GetViewType<typename ArrayType::TypeClass>;
+
+ if (indices_begin == indices_end) {
+ return;
+ }
+ auto range_start = indices_begin;
+ auto range_cur = range_start;
+ auto last_value = GetView::LogicalValue(array.GetView(*range_cur));
+ while (++range_cur != indices_end) {
+ auto v = GetView::LogicalValue(array.GetView(*range_cur));
+ if (v != last_value) {
+ visit(range_start, range_cur);
+ range_start = range_cur;
+ last_value = v;
+ }
+ }
+ if (range_start != range_cur) {
+ visit(range_start, range_cur);
+ }
+}
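
A self-contained analogue of VisitConstantRanges() on a plain vector makes the visiting pattern explicit: each callback receives a half-open [start, end) run of equal adjacent values.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Visit [start, end) runs of equal adjacent values.
    template <typename T, typename Visitor>
    void VisitRuns(const std::vector<T>& values, Visitor&& visit) {
      size_t start = 0;
      for (size_t i = 1; i <= values.size(); ++i) {
        if (i == values.size() || values[i] != values[start]) {
          visit(start, i);
          start = i;
        }
      }
    }

    int main() {
      std::vector<int> v{1, 1, 2, 3, 3, 3};
      VisitRuns(v, [](size_t b, size_t e) { std::printf("[%zu, %zu)\n", b, e); });
      return 0;  // prints [0, 2) [2, 3) [3, 6)
    }
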
+
+// A sorter for a single column of a RecordBatch, deferring to the next column
+// for ranges of equal values.
+class RecordBatchColumnSorter {
+ public:
+ explicit RecordBatchColumnSorter(RecordBatchColumnSorter* next_column = nullptr)
+ : next_column_(next_column) {}
+ virtual ~RecordBatchColumnSorter() {}
+
+ virtual void SortRange(uint64_t* indices_begin, uint64_t* indices_end) = 0;
+
+ protected:
+ RecordBatchColumnSorter* next_column_;
+};
+
+template <typename Type>
+class ConcreteRecordBatchColumnSorter : public RecordBatchColumnSorter {
+ public:
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+ ConcreteRecordBatchColumnSorter(std::shared_ptr<Array> array, SortOrder order,
+ RecordBatchColumnSorter* next_column = nullptr)
+ : RecordBatchColumnSorter(next_column),
+ owned_array_(std::move(array)),
+ array_(checked_cast<const ArrayType&>(*owned_array_)),
+ order_(order),
+ null_count_(array_.null_count()) {}
+
+ void SortRange(uint64_t* indices_begin, uint64_t* indices_end) {
+ using GetView = GetViewType<Type>;
+
+ constexpr int64_t offset = 0;
+ uint64_t* nulls_begin;
+ if (null_count_ == 0) {
+ nulls_begin = indices_end;
+ } else {
+ // NOTE that null_count_ is merely an upper bound on the number of nulls
+ // in this particular range.
+ nulls_begin = PartitionNullsOnly<StablePartitioner>(indices_begin, indices_end,
+ array_, offset);
+ DCHECK_LE(indices_end - nulls_begin, null_count_);
+ }
+ uint64_t* null_likes_begin = PartitionNullLikes<ArrayType, StablePartitioner>(
+ indices_begin, nulls_begin, array_, offset);
+
+ // TODO This is roughly the same as ArrayCompareSorter.
+ // Also, we would like to use a counting sort if possible. This requires
+ // a counting sort compatible with indirect indexing.
+ if (order_ == SortOrder::Ascending) {
+ std::stable_sort(
+ indices_begin, null_likes_begin, [&](uint64_t left, uint64_t right) {
+ const auto lhs = GetView::LogicalValue(array_.GetView(left - offset));
+ const auto rhs = GetView::LogicalValue(array_.GetView(right - offset));
+ return lhs < rhs;
+ });
+ } else {
+ std::stable_sort(
+ indices_begin, null_likes_begin, [&](uint64_t left, uint64_t right) {
+ // Descending order: compare in reverse (note this comparison
+ // does use '>' on the logical values).
+ const auto lhs = GetView::LogicalValue(array_.GetView(left - offset));
+ const auto rhs = GetView::LogicalValue(array_.GetView(right - offset));
+ return lhs > rhs;
+ });
+ }
+
+ if (next_column_ != nullptr) {
+ // Visit all ranges of equal values in this column and sort them on
+ // the next column.
+ SortNextColumn(null_likes_begin, nulls_begin);
+ SortNextColumn(nulls_begin, indices_end);
+ VisitConstantRanges(array_, indices_begin, null_likes_begin,
+ [&](uint64_t* range_start, uint64_t* range_end) {
+ SortNextColumn(range_start, range_end);
+ });
+ }
+ }
+
+ void SortNextColumn(uint64_t* indices_begin, uint64_t* indices_end) {
+ // Avoid the cost of a virtual method call in trivial cases
+ if (indices_end - indices_begin > 1) {
+ next_column_->SortRange(indices_begin, indices_end);
+ }
+ }
+
+ protected:
+ const std::shared_ptr<Array> owned_array_;
+ const ArrayType& array_;
+ const SortOrder order_;
+ const int64_t null_count_;
+};
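
The recursion implemented by SortRange()/SortNextColumn() above, stable-sort one column and then hand each run of equal values to the next column's sorter, reads in miniature as follows; columns are plain int vectors rather than Arrow arrays, purely for illustration:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Left-to-right radix scheme: sort indices by column `col`, then recurse
    // into each run of equal values for column `col + 1`.
    void RadixSort(const std::vector<std::vector<int>>& columns, size_t col,
                   uint64_t* begin, uint64_t* end) {
      if (col == columns.size() || end - begin <= 1) return;
      const auto& values = columns[col];
      std::stable_sort(begin, end,
                       [&](uint64_t l, uint64_t r) { return values[l] < values[r]; });
      uint64_t* run = begin;
      for (uint64_t* it = begin + 1; it <= end; ++it) {
        if (it == end || values[*it] != values[*run]) {
          RadixSort(columns, col + 1, run, it);
          run = it;
        }
      }
    }
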
+
+// Sort a batch using a single-pass left-to-right radix sort.
+class RadixRecordBatchSorter {
+ public:
+ RadixRecordBatchSorter(uint64_t* indices_begin, uint64_t* indices_end,
+ const RecordBatch& batch, const SortOptions& options)
+ : batch_(batch),
+ options_(options),
+ indices_begin_(indices_begin),
+ indices_end_(indices_end) {}
+
+ Status Sort() {
+ ARROW_ASSIGN_OR_RAISE(const auto sort_keys,
+ ResolveSortKeys(batch_, options_.sort_keys));
+
+ // Create column sorters from right to left
+ std::vector<std::unique_ptr<RecordBatchColumnSorter>> column_sorts(sort_keys.size());
+ RecordBatchColumnSorter* next_column = nullptr;
+ for (int64_t i = static_cast<int64_t>(sort_keys.size() - 1); i >= 0; --i) {
+ ColumnSortFactory factory(sort_keys[i], next_column);
+ ARROW_ASSIGN_OR_RAISE(column_sorts[i], factory.MakeColumnSort());
+ next_column = column_sorts[i].get();
+ }
+
+ // Sort from left to right
+ column_sorts.front()->SortRange(indices_begin_, indices_end_);
+ return Status::OK();
+ }
+
+ protected:
+ struct ResolvedSortKey {
+ std::shared_ptr<Array> array;
+ SortOrder order;
+ };
+
+ struct ColumnSortFactory {
+ ColumnSortFactory(const ResolvedSortKey& sort_key,
+ RecordBatchColumnSorter* next_column)
+ : physical_type(GetPhysicalType(sort_key.array->type())),
+ array(GetPhysicalArray(*sort_key.array, physical_type)),
+ order(sort_key.order),
+ next_column(next_column) {}
+
+ Result<std::unique_ptr<RecordBatchColumnSorter>> MakeColumnSort() {
+ RETURN_NOT_OK(VisitTypeInline(*physical_type, this));
+ DCHECK_NE(result, nullptr);
+ return std::move(result);
+ }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) { return VisitGeneric(type); }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ Status Visit(const DataType& type) {
+ return Status::TypeError("Unsupported type for RecordBatch sorting: ",
+ type.ToString());
+ }
+
+ template <typename Type>
+ Status VisitGeneric(const Type&) {
+ result.reset(new ConcreteRecordBatchColumnSorter<Type>(array, order, next_column));
+ return Status::OK();
+ }
+
+ std::shared_ptr<DataType> physical_type;
+ std::shared_ptr<Array> array;
+ SortOrder order;
+ RecordBatchColumnSorter* next_column;
+ std::unique_ptr<RecordBatchColumnSorter> result;
+ };
+
+ static Result<std::vector<ResolvedSortKey>> ResolveSortKeys(
+ const RecordBatch& batch, const std::vector<SortKey>& sort_keys) {
+ std::vector<ResolvedSortKey> resolved;
+ resolved.reserve(sort_keys.size());
+ for (const auto& sort_key : sort_keys) {
+ auto array = batch.GetColumnByName(sort_key.name);
+ if (!array) {
+ return Status::Invalid("Nonexistent sort key column: ", sort_key.name);
+ }
+ resolved.push_back({std::move(array), sort_key.order});
+ }
+ return resolved;
+ }
+
+ const RecordBatch& batch_;
+ const SortOptions& options_;
+ uint64_t* indices_begin_;
+ uint64_t* indices_end_;
+};
+
+// Compare two records in the same RecordBatch or Table
+// (indexing is handled through ResolvedSortKey)
+template <typename ResolvedSortKey>
+class MultipleKeyComparator {
+ public:
+ explicit MultipleKeyComparator(const std::vector<ResolvedSortKey>& sort_keys)
+ : sort_keys_(sort_keys) {}
+
+ Status status() const { return status_; }
+
+ // Returns true if the left-th value should be ordered before the
+ // right-th value, false otherwise. The start_sort_key_index-th
+ // sort key and subsequent sort keys are used for comparison.
+ bool Compare(uint64_t left, uint64_t right, size_t start_sort_key_index) {
+ current_left_ = left;
+ current_right_ = right;
+ current_compared_ = 0;
+ auto num_sort_keys = sort_keys_.size();
+ for (size_t i = start_sort_key_index; i < num_sort_keys; ++i) {
+ current_sort_key_index_ = i;
+ status_ = VisitTypeInline(*sort_keys_[i].type, this);
+ // If the left value equals the right value, continue
+ // comparing with the next sort key.
+ if (current_compared_ != 0) {
+ break;
+ }
+ }
+ return current_compared_ < 0;
+ }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) { \
+ current_compared_ = CompareType<TYPE>(); \
+ return Status::OK(); \
+ }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ Status Visit(const DataType& type) {
+ return Status::TypeError("Unsupported type for RecordBatch sorting: ",
+ type.ToString());
+ }
+
+ private:
+ // Compares two records in the same table and returns -1, 0 or 1.
+ //
+ // -1: The left is less than the right.
+ // 0: The left equals the right.
+ // 1: The left is greater than the right.
+ //
+ // This supports null and NaN. Null is handled here and NaN
+ // is handled in CompareTypeValue().
+ template <typename Type>
+ int32_t CompareType() {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ const auto& sort_key = sort_keys_[current_sort_key_index_];
+ auto order = sort_key.order;
+ const auto chunk_left = sort_key.template GetChunk<ArrayType>(current_left_);
+ const auto chunk_right = sort_key.template GetChunk<ArrayType>(current_right_);
+ if (sort_key.null_count > 0) {
+ auto is_null_left = chunk_left.IsNull();
+ auto is_null_right = chunk_right.IsNull();
+ if (is_null_left && is_null_right) {
+ return 0;
+ } else if (is_null_left) {
+ return 1;
+ } else if (is_null_right) {
+ return -1;
+ }
+ }
+ return CompareTypeValue<Type>(chunk_left, chunk_right, order);
+ }
+
+ // For non-float types. Value is never NaN.
+ template <typename Type>
+ enable_if_t<!is_floating_type<Type>::value, int32_t> CompareTypeValue(
+ const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_left,
+ const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_right,
+ const SortOrder order) {
+ const auto left = chunk_left.Value();
+ const auto right = chunk_right.Value();
+ int32_t compared;
+ if (left == right) {
+ compared = 0;
+ } else if (left > right) {
+ compared = 1;
+ } else {
+ compared = -1;
+ }
+ if (order == SortOrder::Descending) {
+ compared = -compared;
+ }
+ return compared;
+ }
+
+ // For float types. Value may be NaN.
+ template <typename Type>
+ enable_if_t<is_floating_type<Type>::value, int32_t> CompareTypeValue(
+ const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_left,
+ const ResolvedChunk<typename TypeTraits<Type>::ArrayType>& chunk_right,
+ const SortOrder order) {
+ const auto left = chunk_left.Value();
+ const auto right = chunk_right.Value();
+ auto is_nan_left = std::isnan(left);
+ auto is_nan_right = std::isnan(right);
+ if (is_nan_left && is_nan_right) {
+ return 0;
+ } else if (is_nan_left) {
+ return 1;
+ } else if (is_nan_right) {
+ return -1;
+ }
+ int32_t compared;
+ if (left == right) {
+ compared = 0;
+ } else if (left > right) {
+ compared = 1;
+ } else {
+ compared = -1;
+ }
+ if (order == SortOrder::Descending) {
+ compared = -compared;
+ }
+ return compared;
+ }
+
+ const std::vector<ResolvedSortKey>& sort_keys_;
+ Status status_;
+ int64_t current_left_;
+ int64_t current_right_;
+ size_t current_sort_key_index_;
+ int32_t current_compared_;
+};
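
Stripped of type dispatch and null/NaN handling, Compare() above implements the standard lexicographic rule: compare key by key and fall through only on ties. A minimal sketch over plain int key columns (an assumption for brevity):

    #include <cstdint>
    #include <vector>

    // Lexicographic "less than" over multiple key columns.
    bool LexLess(const std::vector<std::vector<int>>& keys, uint64_t left,
                 uint64_t right) {
      for (const auto& key : keys) {
        if (key[left] < key[right]) return true;
        if (key[right] < key[left]) return false;
        // Equal on this key: continue with the next one.
      }
      return false;  // All keys equal: report "not less" to preserve stability.
    }
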
+
+// Sort a batch using a single sort and multiple-key comparisons.
+class MultipleKeyRecordBatchSorter : public TypeVisitor {
+ private:
+ // Preprocessed sort key.
+ struct ResolvedSortKey {
+ ResolvedSortKey(const std::shared_ptr<Array>& array, const SortOrder order)
+ : type(GetPhysicalType(array->type())),
+ owned_array(GetPhysicalArray(*array, type)),
+ array(*owned_array),
+ order(order),
+ null_count(array->null_count()) {}
+
+ template <typename ArrayType>
+ ResolvedChunk<ArrayType> GetChunk(int64_t index) const {
+ return {&checked_cast<const ArrayType&>(array), index};
+ }
+
+ const std::shared_ptr<DataType> type;
+ std::shared_ptr<Array> owned_array;
+ const Array& array;
+ SortOrder order;
+ int64_t null_count;
+ };
+
+ using Comparator = MultipleKeyComparator<ResolvedSortKey>;
+
+ public:
+ MultipleKeyRecordBatchSorter(uint64_t* indices_begin, uint64_t* indices_end,
+ const RecordBatch& batch, const SortOptions& options)
+ : indices_begin_(indices_begin),
+ indices_end_(indices_end),
+ sort_keys_(ResolveSortKeys(batch, options.sort_keys, &status_)),
+ comparator_(sort_keys_) {}
+
+ // This is optimized for the first sort key. The sort on the first
+ // key is handled by this class; the second and following sort keys
+ // are handled by Comparator.
+ Status Sort() {
+ RETURN_NOT_OK(status_);
+ return sort_keys_[0].type->Accept(this);
+ }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ private:
+ static std::vector<ResolvedSortKey> ResolveSortKeys(
+ const RecordBatch& batch, const std::vector<SortKey>& sort_keys, Status* status) {
+ std::vector<ResolvedSortKey> resolved;
+ for (const auto& sort_key : sort_keys) {
+ auto array = batch.GetColumnByName(sort_key.name);
+ if (!array) {
+ *status = Status::Invalid("Nonexistent sort key column: ", sort_key.name);
+ break;
+ }
+ resolved.emplace_back(array, sort_key.order);
+ }
+ return resolved;
+ }
+
+ template <typename Type>
+ Status SortInternal() {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+ auto& comparator = comparator_;
+ const auto& first_sort_key = sort_keys_[0];
+ const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
+ auto nulls_begin = PartitionNullsInternal<Type>(first_sort_key);
+ // Sort first-key non-nulls
+ std::stable_sort(indices_begin_, nulls_begin, [&](uint64_t left, uint64_t right) {
+ // Both values are never null nor NaN
+ // (otherwise they've been partitioned away above).
+ const auto value_left = array.GetView(left);
+ const auto value_right = array.GetView(right);
+ if (value_left != value_right) {
+ bool compared = value_left < value_right;
+ if (first_sort_key.order == SortOrder::Ascending) {
+ return compared;
+ } else {
+ return !compared;
+ }
+ }
+ // If the left value equals the right value,
+ // we need to compare the second and following
+ // sort keys.
+ return comparator.Compare(left, right, 1);
+ });
+ return comparator_.status();
+ }
+
+ // Behaves like PartitionNulls() but supports multiple sort keys.
+ //
+ // For non-float types.
+ template <typename Type>
+ enable_if_t<!is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
+ const ResolvedSortKey& first_sort_key) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ if (first_sort_key.null_count == 0) {
+ return indices_end_;
+ }
+ const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
+ StablePartitioner partitioner;
+ auto nulls_begin = partitioner(indices_begin_, indices_end_,
+ [&](uint64_t index) { return !array.IsNull(index); });
+ // Sort all nulls by second and following sort keys
+ // TODO: could we instead run an independent sort from the second key on
+ // this slice?
+ if (nulls_begin != indices_end_) {
+ auto& comparator = comparator_;
+ std::stable_sort(nulls_begin, indices_end_,
+ [&comparator](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ }
+ return nulls_begin;
+ }
+
+ // Behaves like PartitionNulls() but supports multiple sort keys.
+ //
+ // For float types.
+ template <typename Type>
+ enable_if_t<is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
+ const ResolvedSortKey& first_sort_key) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ const ArrayType& array = checked_cast<const ArrayType&>(first_sort_key.array);
+ StablePartitioner partitioner;
+ uint64_t* nulls_begin;
+ if (first_sort_key.null_count == 0) {
+ nulls_begin = indices_end_;
+ } else {
+ nulls_begin = partitioner(indices_begin_, indices_end_,
+ [&](uint64_t index) { return !array.IsNull(index); });
+ }
+ uint64_t* nans_and_nulls_begin =
+ partitioner(indices_begin_, nulls_begin,
+ [&](uint64_t index) { return !std::isnan(array.GetView(index)); });
+ auto& comparator = comparator_;
+ if (nans_and_nulls_begin != nulls_begin) {
+ // Sort all NaNs by the second and following sort keys.
+ // TODO: could we instead run an independent sort from the second key on
+ // this slice?
+ std::stable_sort(nans_and_nulls_begin, nulls_begin,
+ [&comparator](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ }
+ if (nulls_begin != indices_end_) {
+ // Sort all nulls by the second and following sort keys.
+ // TODO: could we instead run an independent sort from the second key on
+ // this slice?
+ std::stable_sort(nulls_begin, indices_end_,
+ [&comparator](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ }
+ return nans_and_nulls_begin;
+ }
+
+ uint64_t* indices_begin_;
+ uint64_t* indices_end_;
+ Status status_;
+ std::vector<ResolvedSortKey> sort_keys_;
+ Comparator comparator_;
+};
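
The float-type partitioning above produces the layout [ordinary values | NaNs | nulls] with two stable partitions. A sketch on raw doubles, with nulls modeled by a separate mask since plain doubles have no notion of null; StablePartitioner corresponds to std::stable_partition here:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    // Partition indices into [values | NaNs | nulls] and return where the
    // NaN region starts; [begin, result) then holds comparable values.
    uint64_t* PartitionNansAndNulls(const std::vector<double>& values,
                                    const std::vector<bool>& is_null,
                                    uint64_t* begin, uint64_t* end) {
      auto nulls_begin = std::stable_partition(
          begin, end, [&](uint64_t i) { return !is_null[i]; });
      auto nans_begin = std::stable_partition(
          begin, nulls_begin, [&](uint64_t i) { return !std::isnan(values[i]); });
      return nans_begin;
    }
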
+
+// ----------------------------------------------------------------------
+// Table sorting implementations
+
+// Sort a table using a radix sort-like algorithm.
+// A distinct stable sort is called for each sort key, from the last key to the first.
+class TableRadixSorter {
+ public:
+ Status Sort(ExecContext* ctx, uint64_t* indices_begin, uint64_t* indices_end,
+ const Table& table, const SortOptions& options) {
+ for (auto i = options.sort_keys.size(); i > 0; --i) {
+ const auto& sort_key = options.sort_keys[i - 1];
+ const auto& chunked_array = table.GetColumnByName(sort_key.name);
+ if (!chunked_array) {
+ return Status::Invalid("Nonexistent sort key column: ", sort_key.name);
+ }
+ // We can use ArraySorter only for the sort key that is
+ // processed first because ArraySorter doesn't care about
+ // existing indices.
+ const auto can_use_array_sorter = (i == options.sort_keys.size());
+ ChunkedArraySorter sorter(ctx, indices_begin, indices_end, *chunked_array.get(),
+ sort_key.order, can_use_array_sorter);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ }
+ return Status::OK();
+ }
+};
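
The property TableRadixSorter depends on is that repeated stable sorts, applied from the least significant key to the most significant key, yield a lexicographic order: each later pass preserves the tie-breaking established by the earlier passes. A self-contained demonstration:

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    int main() {
      std::vector<int> key0{2, 1, 2, 1};  // primary key
      std::vector<int> key1{9, 8, 7, 6};  // secondary key
      std::vector<uint64_t> idx(4);
      std::iota(idx.begin(), idx.end(), 0);
      // Sort by the last key first, then stably by the first key.
      for (const auto* key : {&key1, &key0}) {
        std::stable_sort(idx.begin(), idx.end(), [&](uint64_t l, uint64_t r) {
          return (*key)[l] < (*key)[r];
        });
      }
      // idx is now {3, 1, 2, 0}: sorted by key0, ties broken by key1.
      return 0;
    }
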
+
+// Sort a table using a single sort and multiple-key comparisons.
+class MultipleKeyTableSorter : public TypeVisitor {
+ private:
+ // TODO instead of resolving chunks for each column independently, we could
+ // split the table into RecordBatches and pay the cost of chunked indexing
+ // at the first column only.
+
+ // Preprocessed sort key.
+ struct ResolvedSortKey {
+ ResolvedSortKey(const ChunkedArray& chunked_array, const SortOrder order)
+ : order(order),
+ type(GetPhysicalType(chunked_array.type())),
+ chunks(GetPhysicalChunks(chunked_array, type)),
+ chunk_pointers(GetArrayPointers(chunks)),
+ null_count(chunked_array.null_count()),
+ num_chunks(chunked_array.num_chunks()),
+ resolver(chunk_pointers) {}
+
+ // Finds the target chunk and the index within that chunk from an
+ // index into the chunked array.
+ template <typename ArrayType>
+ ResolvedChunk<ArrayType> GetChunk(int64_t index) const {
+ return resolver.Resolve<ArrayType>(index);
+ }
+
+ const SortOrder order;
+ const std::shared_ptr<DataType> type;
+ const ArrayVector chunks;
+ const std::vector<const Array*> chunk_pointers;
+ const int64_t null_count;
+ const int num_chunks;
+ const ChunkedArrayResolver resolver;
+ };
+
+ using Comparator = MultipleKeyComparator<ResolvedSortKey>;
+
+ public:
+ MultipleKeyTableSorter(uint64_t* indices_begin, uint64_t* indices_end,
+ const Table& table, const SortOptions& options)
+ : indices_begin_(indices_begin),
+ indices_end_(indices_end),
+ sort_keys_(ResolveSortKeys(table, options.sort_keys, &status_)),
+ comparator_(sort_keys_) {}
+
+ // This is optimized for the first sort key. The sort on the first
+ // key is handled by this class; the second and following sort keys
+ // are handled by Comparator.
+ Status Sort() {
+ ARROW_RETURN_NOT_OK(status_);
+ return sort_keys_[0].type->Accept(this);
+ }
+
+#define VISIT(TYPE) \
+ Status Visit(const TYPE& type) override { return SortInternal<TYPE>(); }
+
+ VISIT_PHYSICAL_TYPES(VISIT)
+
+#undef VISIT
+
+ private:
+ static std::vector<ResolvedSortKey> ResolveSortKeys(
+ const Table& table, const std::vector<SortKey>& sort_keys, Status* status) {
+ std::vector<ResolvedSortKey> resolved;
+ resolved.reserve(sort_keys.size());
+ for (const auto& sort_key : sort_keys) {
+ const auto& chunked_array = table.GetColumnByName(sort_key.name);
+ if (!chunked_array) {
+ *status = Status::Invalid("Nonexistent sort key column: ", sort_key.name);
+ break;
+ }
+ resolved.emplace_back(*chunked_array, sort_key.order);
+ }
+ return resolved;
+ }
+
+ template <typename Type>
+ Status SortInternal() {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+ auto& comparator = comparator_;
+ const auto& first_sort_key = sort_keys_[0];
+ auto nulls_begin = PartitionNullsInternal<Type>(first_sort_key);
+ std::stable_sort(indices_begin_, nulls_begin, [&](uint64_t left, uint64_t right) {
+ // Both values are never null nor NaN.
+ auto chunk_left = first_sort_key.GetChunk<ArrayType>(left);
+ auto chunk_right = first_sort_key.GetChunk<ArrayType>(right);
+ auto value_left = chunk_left.Value();
+ auto value_right = chunk_right.Value();
+ if (value_left == value_right) {
+ // If the left value equals the right value,
+ // we need to compare the second and following
+ // sort keys.
+ return comparator.Compare(left, right, 1);
+ } else {
+ auto compared = value_left < value_right;
+ if (first_sort_key.order == SortOrder::Ascending) {
+ return compared;
+ } else {
+ return !compared;
+ }
+ }
+ });
+ return comparator_.status();
+ }
+
+ // Behaves like PartitionNulls() but supports multiple sort keys.
+ //
+ // For non-float types.
+ template <typename Type>
+ enable_if_t<!is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
+ const ResolvedSortKey& first_sort_key) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ if (first_sort_key.null_count == 0) {
+ return indices_end_;
+ }
+ StablePartitioner partitioner;
+ auto nulls_begin =
+ partitioner(indices_begin_, indices_end_, [&first_sort_key](uint64_t index) {
+ const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
+ return !chunk.IsNull();
+ });
+ DCHECK_EQ(indices_end_ - nulls_begin, first_sort_key.null_count);
+ auto& comparator = comparator_;
+ std::stable_sort(nulls_begin, indices_end_, [&](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ return nulls_begin;
+ }
+
+ // Behaves like PartitionNulls() but supports multiple sort keys.
+ //
+ // For float types.
+ template <typename Type>
+ enable_if_t<is_floating_type<Type>::value, uint64_t*> PartitionNullsInternal(
+ const ResolvedSortKey& first_sort_key) {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ StablePartitioner partitioner;
+ uint64_t* nulls_begin;
+ if (first_sort_key.null_count == 0) {
+ nulls_begin = indices_end_;
+ } else {
+ nulls_begin = partitioner(indices_begin_, indices_end_, [&](uint64_t index) {
+ const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
+ return !chunk.IsNull();
+ });
+ }
+ DCHECK_EQ(indices_end_ - nulls_begin, first_sort_key.null_count);
+ uint64_t* nans_begin = partitioner(indices_begin_, nulls_begin, [&](uint64_t index) {
+ const auto chunk = first_sort_key.GetChunk<ArrayType>(index);
+ return !std::isnan(chunk.Value());
+ });
+ auto& comparator = comparator_;
+ // Sort all NaNs by the second and following sort keys.
+ std::stable_sort(nans_begin, nulls_begin, [&](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ // Sort all nulls by the second and following sort keys.
+ std::stable_sort(nulls_begin, indices_end_, [&](uint64_t left, uint64_t right) {
+ return comparator.Compare(left, right, 1);
+ });
+ return nans_begin;
+ }
+
+ uint64_t* indices_begin_;
+ uint64_t* indices_end_;
+ Status status_;
+ std::vector<ResolvedSortKey> sort_keys_;
+ Comparator comparator_;
+};
+
+// ----------------------------------------------------------------------
+// Top-level sort functions
+
+const auto kDefaultSortOptions = SortOptions::Defaults();
+
+const FunctionDoc sort_indices_doc(
+ "Return the indices that would sort an array, record batch or table",
+ ("This function computes an array of indices that define a stable sort\n"
+ "of the input array, record batch or table. Null values are considered\n"
+ "greater than any other value and are therefore sorted at the end of the\n"
+ "input. For floating-point types, NaNs are considered greater than any\n"
+ "other non-null value, but smaller than null values."),
+ {"input"}, "SortOptions");
+
+class SortIndicesMetaFunction : public MetaFunction {
+ public:
+ SortIndicesMetaFunction()
+ : MetaFunction("sort_indices", Arity::Unary(), &sort_indices_doc,
+ &kDefaultSortOptions) {}
+
+ Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+ const FunctionOptions* options,
+ ExecContext* ctx) const override {
+ const SortOptions& sort_options = static_cast<const SortOptions&>(*options);
+ switch (args[0].kind()) {
+ case Datum::ARRAY:
+ return SortIndices(*args[0].make_array(), sort_options, ctx);
+ case Datum::CHUNKED_ARRAY:
+ return SortIndices(*args[0].chunked_array(), sort_options, ctx);
+ case Datum::RECORD_BATCH:
+ return SortIndices(*args[0].record_batch(), sort_options, ctx);
+ case Datum::TABLE:
+ return SortIndices(*args[0].table(), sort_options, ctx);
+ default:
+ break;
+ }
+ return Status::NotImplemented(
+ "Unsupported types for sort_indices operation: "
+ "values=",
+ args[0].ToString());
+ }
+
+ private:
+ Result<Datum> SortIndices(const Array& values, const SortOptions& options,
+ ExecContext* ctx) const {
+ SortOrder order = SortOrder::Ascending;
+ if (!options.sort_keys.empty()) {
+ order = options.sort_keys[0].order;
+ }
+ ArraySortOptions array_options(order);
+ return CallFunction("array_sort_indices", {values}, &array_options, ctx);
+ }
+
+ Result<Datum> SortIndices(const ChunkedArray& chunked_array, const SortOptions& options,
+ ExecContext* ctx) const {
+ SortOrder order = SortOrder::Ascending;
+ if (!options.sort_keys.empty()) {
+ order = options.sort_keys[0].order;
+ }
+
+ auto out_type = uint64();
+ auto length = chunked_array.length();
+ auto buffer_size = BitUtil::BytesForBits(
+ length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
+ std::vector<std::shared_ptr<Buffer>> buffers(2);
+ ARROW_ASSIGN_OR_RAISE(buffers[1],
+ AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
+ auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
+ auto out_begin = out->GetMutableValues<uint64_t>(1);
+ auto out_end = out_begin + length;
+ std::iota(out_begin, out_end, 0);
+
+ ChunkedArraySorter sorter(ctx, out_begin, out_end, chunked_array, order);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ return Datum(out);
+ }
+
+ Result<Datum> SortIndices(const RecordBatch& batch, const SortOptions& options,
+ ExecContext* ctx) const {
+ auto n_sort_keys = options.sort_keys.size();
+ if (n_sort_keys == 0) {
+ return Status::Invalid("Must specify one or more sort keys");
+ }
+ if (n_sort_keys == 1) {
+ auto array = batch.GetColumnByName(options.sort_keys[0].name);
+ if (!array) {
+ return Status::Invalid("Nonexistent sort key column: ",
+ options.sort_keys[0].name);
+ }
+ return SortIndices(*array, options, ctx);
+ }
+
+ auto out_type = uint64();
+ auto length = batch.num_rows();
+ auto buffer_size = BitUtil::BytesForBits(
+ length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
+ BufferVector buffers(2);
+ ARROW_ASSIGN_OR_RAISE(buffers[1],
+ AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
+ auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
+ auto out_begin = out->GetMutableValues<uint64_t>(1);
+ auto out_end = out_begin + length;
+ std::iota(out_begin, out_end, 0);
+
+ // Radix sorting is consistently faster except when there is a large number
+ // of sort keys, in which case it can end up degrading catastrophically.
+ // Cut off above 8 sort keys.
+ if (n_sort_keys <= 8) {
+ RadixRecordBatchSorter sorter(out_begin, out_end, batch, options);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ } else {
+ MultipleKeyRecordBatchSorter sorter(out_begin, out_end, batch, options);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ }
+ return Datum(out);
+ }
+
+ Result<Datum> SortIndices(const Table& table, const SortOptions& options,
+ ExecContext* ctx) const {
+ auto n_sort_keys = options.sort_keys.size();
+ if (n_sort_keys == 0) {
+ return Status::Invalid("Must specify one or more sort keys");
+ }
+ if (n_sort_keys == 1) {
+ auto chunked_array = table.GetColumnByName(options.sort_keys[0].name);
+ if (!chunked_array) {
+ return Status::Invalid("Nonexistent sort key column: ",
+ options.sort_keys[0].name);
+ }
+ return SortIndices(*chunked_array, options, ctx);
+ }
+
+ auto out_type = uint64();
+ auto length = table.num_rows();
+ auto buffer_size = BitUtil::BytesForBits(
+ length * std::static_pointer_cast<UInt64Type>(out_type)->bit_width());
+ std::vector<std::shared_ptr<Buffer>> buffers(2);
+ ARROW_ASSIGN_OR_RAISE(buffers[1],
+ AllocateResizableBuffer(buffer_size, ctx->memory_pool()));
+ auto out = std::make_shared<ArrayData>(out_type, length, buffers, 0);
+ auto out_begin = out->GetMutableValues<uint64_t>(1);
+ auto out_end = out_begin + length;
+ std::iota(out_begin, out_end, 0);
+
+ // TODO: We should choose a suitable sort implementation
+ // automatically. The current TableRadixSorter implementation is
+ // faster than MultipleKeyTableSorter only when the number of
+ // sort keys is 2 and counting sort is used. So we always use
+ // MultipleKeyTableSorter for now.
+ //
+ // TableRadixSorter sorter;
+ // ARROW_RETURN_NOT_OK(sorter.Sort(ctx, out_begin, out_end, table, options));
+ MultipleKeyTableSorter sorter(out_begin, out_end, table, options);
+ ARROW_RETURN_NOT_OK(sorter.Sort());
+ return Datum(out);
+ }
+};
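
A hypothetical call site for the meta function registered above, sorting a table by two keys through the generic compute entry point; the column names "a" and "b" are assumptions for illustration:

    #include <memory>

    #include "arrow/api.h"
    #include "arrow/compute/api.h"

    // Returns a uint64 index array that stably sorts `table` by
    // ("a" descending, "b" ascending).
    arrow::Result<arrow::Datum> SortMyTable(
        const std::shared_ptr<arrow::Table>& table) {
      arrow::compute::SortOptions options(
          {arrow::compute::SortKey("a", arrow::compute::SortOrder::Descending),
           arrow::compute::SortKey("b", arrow::compute::SortOrder::Ascending)});
      return arrow::compute::CallFunction("sort_indices", {arrow::Datum(table)},
                                          &options);
    }
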
+
+const auto kDefaultArraySortOptions = ArraySortOptions::Defaults();
+
+const FunctionDoc array_sort_indices_doc(
+ "Return the indices that would sort an array",
+ ("This function computes an array of indices that define a stable sort\n"
+ "of the input array. Null values are considered greater than any\n"
+ "other value and are therefore sorted at the end of the array.\n"
+ "For floating-point types, NaNs are considered greater than any\n"
+ "other non-null value, but smaller than null values."),
+ {"array"}, "ArraySortOptions");
+
+const FunctionDoc partition_nth_indices_doc(
+ "Return the indices that would partition an array around a pivot",
+ ("This functions computes an array of indices that define a non-stable\n"
+ "partial sort of the input array.\n"
+ "\n"
+ "The output is such that the `N`'th index points to the `N`'th element\n"
+ "of the input in sorted order, and all indices before the `N`'th point\n"
+ "to elements in the input less or equal to elements at or after the `N`'th.\n"
+ "\n"
+ "Null values are considered greater than any other value and are\n"
+ "therefore partitioned towards the end of the array.\n"
+ "For floating-point types, NaNs are considered greater than any\n"
+ "other non-null value, but smaller than null values.\n"
+ "\n"
+ "The pivot index `N` must be given in PartitionNthOptions."),
+ {"array"}, "PartitionNthOptions");
+
+} // namespace
+
void RegisterVectorSort(FunctionRegistry* registry) {
// The kernel outputs into preallocated memory and is never null
VectorKernel base;
base.mem_allocation = MemAllocation::PREALLOCATE;
base.null_handling = NullHandling::OUTPUT_NOT_NULL;
- auto array_sort_indices = std::make_shared<VectorFunction>(
- "array_sort_indices", Arity::Unary(), &array_sort_indices_doc,
- &kDefaultArraySortOptions);
- base.init = ArraySortIndicesState::Init;
- AddSortingKernels<ArraySortIndices>(base, array_sort_indices.get());
- DCHECK_OK(registry->AddFunction(std::move(array_sort_indices)));
+ auto array_sort_indices = std::make_shared<VectorFunction>(
+ "array_sort_indices", Arity::Unary(), &array_sort_indices_doc,
+ &kDefaultArraySortOptions);
+ base.init = ArraySortIndicesState::Init;
+ AddSortingKernels<ArraySortIndices>(base, array_sort_indices.get());
+ DCHECK_OK(registry->AddFunction(std::move(array_sort_indices)));
+
+ DCHECK_OK(registry->AddFunction(std::make_shared<SortIndicesMetaFunction>()));
- DCHECK_OK(registry->AddFunction(std::make_shared<SortIndicesMetaFunction>()));
-
// partition_nth_indices has a parameter so needs its init function
- auto part_indices = std::make_shared<VectorFunction>(
- "partition_nth_indices", Arity::Unary(), &partition_nth_indices_doc);
+ auto part_indices = std::make_shared<VectorFunction>(
+ "partition_nth_indices", Arity::Unary(), &partition_nth_indices_doc);
base.init = PartitionNthToIndicesState::Init;
AddSortingKernels<PartitionNthToIndices>(base, part_indices.get());
DCHECK_OK(registry->AddFunction(std::move(part_indices)));
}
-#undef VISIT_PHYSICAL_TYPES
-
+#undef VISIT_PHYSICAL_TYPES
+
} // namespace internal
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc
index 9f24f7a7008..ca7b6137306 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.cc
@@ -24,10 +24,10 @@
#include <utility>
#include "arrow/compute/function.h"
-#include "arrow/compute/function_internal.h"
+#include "arrow/compute/function_internal.h"
#include "arrow/compute/registry_internal.h"
#include "arrow/status.h"
-#include "arrow/util/logging.h"
+#include "arrow/util/logging.h"
namespace arrow {
namespace compute {
@@ -35,8 +35,8 @@ namespace compute {
class FunctionRegistry::FunctionRegistryImpl {
public:
Status AddFunction(std::shared_ptr<Function> function, bool allow_overwrite) {
- RETURN_NOT_OK(function->Validate());
-
+ RETURN_NOT_OK(function->Validate());
+
std::lock_guard<std::mutex> mutation_guard(lock_);
const std::string& name = function->name();
@@ -59,20 +59,20 @@ class FunctionRegistry::FunctionRegistryImpl {
return Status::OK();
}
- Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
- bool allow_overwrite = false) {
- std::lock_guard<std::mutex> mutation_guard(lock_);
-
- const std::string name = options_type->type_name();
- auto it = name_to_options_type_.find(name);
- if (it != name_to_options_type_.end() && !allow_overwrite) {
- return Status::KeyError(
- "Already have a function options type registered with name: ", name);
- }
- name_to_options_type_[name] = options_type;
- return Status::OK();
- }
-
+ Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
+ bool allow_overwrite = false) {
+ std::lock_guard<std::mutex> mutation_guard(lock_);
+
+ const std::string name = options_type->type_name();
+ auto it = name_to_options_type_.find(name);
+ if (it != name_to_options_type_.end() && !allow_overwrite) {
+ return Status::KeyError(
+ "Already have a function options type registered with name: ", name);
+ }
+ name_to_options_type_[name] = options_type;
+ return Status::OK();
+ }
+
Result<std::shared_ptr<Function>> GetFunction(const std::string& name) const {
auto it = name_to_function_.find(name);
if (it == name_to_function_.end()) {
@@ -90,21 +90,21 @@ class FunctionRegistry::FunctionRegistryImpl {
return results;
}
- Result<const FunctionOptionsType*> GetFunctionOptionsType(
- const std::string& name) const {
- auto it = name_to_options_type_.find(name);
- if (it == name_to_options_type_.end()) {
- return Status::KeyError("No function options type registered with name: ", name);
- }
- return it->second;
- }
-
+ Result<const FunctionOptionsType*> GetFunctionOptionsType(
+ const std::string& name) const {
+ auto it = name_to_options_type_.find(name);
+ if (it == name_to_options_type_.end()) {
+ return Status::KeyError("No function options type registered with name: ", name);
+ }
+ return it->second;
+ }
+
int num_functions() const { return static_cast<int>(name_to_function_.size()); }
private:
std::mutex lock_;
std::unordered_map<std::string, std::shared_ptr<Function>> name_to_function_;
- std::unordered_map<std::string, const FunctionOptionsType*> name_to_options_type_;
+ std::unordered_map<std::string, const FunctionOptionsType*> name_to_options_type_;
};
std::unique_ptr<FunctionRegistry> FunctionRegistry::Make() {
@@ -125,11 +125,11 @@ Status FunctionRegistry::AddAlias(const std::string& target_name,
return impl_->AddAlias(target_name, source_name);
}
-Status FunctionRegistry::AddFunctionOptionsType(const FunctionOptionsType* options_type,
- bool allow_overwrite) {
- return impl_->AddFunctionOptionsType(options_type, allow_overwrite);
-}
-
+Status FunctionRegistry::AddFunctionOptionsType(const FunctionOptionsType* options_type,
+ bool allow_overwrite) {
+ return impl_->AddFunctionOptionsType(options_type, allow_overwrite);
+}
+
Result<std::shared_ptr<Function>> FunctionRegistry::GetFunction(
const std::string& name) const {
return impl_->GetFunction(name);
@@ -139,11 +139,11 @@ std::vector<std::string> FunctionRegistry::GetFunctionNames() const {
return impl_->GetFunctionNames();
}
-Result<const FunctionOptionsType*> FunctionRegistry::GetFunctionOptionsType(
- const std::string& name) const {
- return impl_->GetFunctionOptionsType(name);
-}
-
+Result<const FunctionOptionsType*> FunctionRegistry::GetFunctionOptionsType(
+ const std::string& name) const {
+ return impl_->GetFunctionOptionsType(name);
+}
+
int FunctionRegistry::num_functions() const { return impl_->num_functions(); }
namespace internal {
@@ -161,30 +161,30 @@ static std::unique_ptr<FunctionRegistry> CreateBuiltInRegistry() {
RegisterScalarStringAscii(registry.get());
RegisterScalarValidity(registry.get());
RegisterScalarFillNull(registry.get());
- RegisterScalarIfElse(registry.get());
- RegisterScalarTemporal(registry.get());
+ RegisterScalarIfElse(registry.get());
+ RegisterScalarTemporal(registry.get());
- RegisterScalarOptions(registry.get());
+ RegisterScalarOptions(registry.get());
// Vector functions
RegisterVectorHash(registry.get());
- RegisterVectorReplace(registry.get());
+ RegisterVectorReplace(registry.get());
RegisterVectorSelection(registry.get());
RegisterVectorNested(registry.get());
RegisterVectorSort(registry.get());
- RegisterVectorOptions(registry.get());
-
- // Aggregate functions
- RegisterScalarAggregateBasic(registry.get());
- RegisterScalarAggregateMode(registry.get());
- RegisterScalarAggregateQuantile(registry.get());
- RegisterScalarAggregateTDigest(registry.get());
- RegisterScalarAggregateVariance(registry.get());
- RegisterHashAggregateBasic(registry.get());
-
- RegisterAggregateOptions(registry.get());
-
+ RegisterVectorOptions(registry.get());
+
+ // Aggregate functions
+ RegisterScalarAggregateBasic(registry.get());
+ RegisterScalarAggregateMode(registry.get());
+ RegisterScalarAggregateQuantile(registry.get());
+ RegisterScalarAggregateTDigest(registry.get());
+ RegisterScalarAggregateVariance(registry.get());
+ RegisterHashAggregateBasic(registry.get());
+
+ RegisterAggregateOptions(registry.get());
+
return registry;
}
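
For reference, the registry API touched by this hunk is consumed roughly as follows; GetFunctionRegistry() returns the process-wide registry populated by CreateBuiltInRegistry(), and "sort_indices" is just an example lookup (a sketch, not part of this change):

    #include "arrow/compute/function.h"
    #include "arrow/compute/registry.h"
    #include "arrow/result.h"
    #include "arrow/status.h"

    arrow::Status Demo() {
      auto* registry = arrow::compute::GetFunctionRegistry();
      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::compute::Function> func,
                            registry->GetFunction("sort_indices"));
      // num_functions() counts every registered function, including aliases.
      const int n = registry->num_functions();
      (void)func;
      (void)n;
      return arrow::Status::OK();
    }
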
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h
index 796eba2fb12..e83036db6ac 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry.h
@@ -32,7 +32,7 @@ namespace arrow {
namespace compute {
class Function;
-class FunctionOptionsType;
+class FunctionOptionsType;
/// \brief A mutable central function registry for built-in functions as well
/// as user-defined functions. Functions are implementations of
@@ -59,11 +59,11 @@ class ARROW_EXPORT FunctionRegistry {
/// function with the given name is not registered
Status AddAlias(const std::string& target_name, const std::string& source_name);
- /// \brief Add a new function options type to the registry. Returns Status::KeyError if
- /// a function options type with the same name is already registered
- Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
- bool allow_overwrite = false);
-
+ /// \brief Add a new function options type to the registry. Returns Status::KeyError if
+ /// a function options type with the same name is already registered
+ Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
+ bool allow_overwrite = false);
+
/// \brief Retrieve a function by name from the registry
Result<std::shared_ptr<Function>> GetFunction(const std::string& name) const;
@@ -71,10 +71,10 @@ class ARROW_EXPORT FunctionRegistry {
/// displaying a manifest of available functions
std::vector<std::string> GetFunctionNames() const;
- /// \brief Retrieve a function options type by name from the registry
- Result<const FunctionOptionsType*> GetFunctionOptionsType(
- const std::string& name) const;
-
+ /// \brief Retrieve a function options type by name from the registry
+ Result<const FunctionOptionsType*> GetFunctionOptionsType(
+ const std::string& name) const;
+
/// \brief The number of currently registered functions
int num_functions() const;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h
index bc5a2d734f4..892b54341da 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/registry_internal.h
@@ -34,30 +34,30 @@ void RegisterScalarSetLookup(FunctionRegistry* registry);
void RegisterScalarStringAscii(FunctionRegistry* registry);
void RegisterScalarValidity(FunctionRegistry* registry);
void RegisterScalarFillNull(FunctionRegistry* registry);
-void RegisterScalarIfElse(FunctionRegistry* registry);
-void RegisterScalarTemporal(FunctionRegistry* registry);
+void RegisterScalarIfElse(FunctionRegistry* registry);
+void RegisterScalarTemporal(FunctionRegistry* registry);
+
+void RegisterScalarOptions(FunctionRegistry* registry);
-void RegisterScalarOptions(FunctionRegistry* registry);
-
// Vector functions
void RegisterVectorHash(FunctionRegistry* registry);
-void RegisterVectorReplace(FunctionRegistry* registry);
+void RegisterVectorReplace(FunctionRegistry* registry);
void RegisterVectorSelection(FunctionRegistry* registry);
void RegisterVectorNested(FunctionRegistry* registry);
void RegisterVectorSort(FunctionRegistry* registry);
-void RegisterVectorOptions(FunctionRegistry* registry);
-
+void RegisterVectorOptions(FunctionRegistry* registry);
+
// Aggregate functions
void RegisterScalarAggregateBasic(FunctionRegistry* registry);
-void RegisterScalarAggregateMode(FunctionRegistry* registry);
-void RegisterScalarAggregateQuantile(FunctionRegistry* registry);
-void RegisterScalarAggregateTDigest(FunctionRegistry* registry);
-void RegisterScalarAggregateVariance(FunctionRegistry* registry);
-void RegisterHashAggregateBasic(FunctionRegistry* registry);
-
-void RegisterAggregateOptions(FunctionRegistry* registry);
-
+void RegisterScalarAggregateMode(FunctionRegistry* registry);
+void RegisterScalarAggregateQuantile(FunctionRegistry* registry);
+void RegisterScalarAggregateTDigest(FunctionRegistry* registry);
+void RegisterScalarAggregateVariance(FunctionRegistry* registry);
+void RegisterHashAggregateBasic(FunctionRegistry* registry);
+
+void RegisterAggregateOptions(FunctionRegistry* registry);
+
} // namespace internal
} // namespace compute
} // namespace arrow
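
Every hook in this header follows the same shape: a free function taking the FunctionRegistry to mutate. A user-side registration hook written in the same style might look like the sketch below; RegisterMyAliases is hypothetical, while AddAlias is the registry method shown earlier in this diff.

    // Hypothetical registration hook mirroring the internal Register* convention.
    #include "arrow/compute/registry.h"

    namespace myproject {

    arrow::Status RegisterMyAliases(arrow::compute::FunctionRegistry* registry) {
      // AddAlias fails with Status::KeyError unless "add" is already
      // registered, so call this after Arrow's built-ins are in place.
      return registry->AddAlias("my_add", "add");
    }

    }  // namespace myproject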
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h
index 3a3d2ac4b7d..eebc8c1b678 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/compute/type_fwd.h
@@ -20,16 +20,16 @@
namespace arrow {
struct Datum;
-struct ValueDescr;
+struct ValueDescr;
namespace compute {
-class Function;
-class FunctionOptions;
-
-class CastOptions;
-
-struct ExecBatch;
+class Function;
+class FunctionOptions;
+
+class CastOptions;
+
+struct ExecBatch;
class ExecContext;
class KernelContext;
@@ -38,11 +38,11 @@ struct ScalarKernel;
struct ScalarAggregateKernel;
struct VectorKernel;
-struct KernelState;
-
-class Expression;
-class ExecNode;
-class ExecPlan;
-
+struct KernelState;
+
+class Expression;
+class ExecNode;
+class ExecPlan;
+
} // namespace compute
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/config.cc b/contrib/libs/apache/arrow/cpp/src/arrow/config.cc
index 7d68f638b6c..b93f207161d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/config.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/config.cc
@@ -16,19 +16,19 @@
// under the License.
#include "arrow/config.h"
-
-#include <cstdint>
-
+
+#include <cstdint>
+
#include "arrow/util/config.h"
-#include "arrow/util/cpu_info.h"
+#include "arrow/util/cpu_info.h"
namespace arrow {
-using internal::CpuInfo;
-
-namespace {
-
-const BuildInfo kBuildInfo = {
+using internal::CpuInfo;
+
+namespace {
+
+const BuildInfo kBuildInfo = {
// clang-format off
ARROW_VERSION,
ARROW_VERSION_MAJOR,
@@ -46,33 +46,33 @@ const BuildInfo kBuildInfo = {
// clang-format on
};
-template <typename QueryFlagFunction>
-std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) {
- if (query_flag(CpuInfo::AVX512)) {
- return "avx512";
- } else if (query_flag(CpuInfo::AVX2)) {
- return "avx2";
- } else if (query_flag(CpuInfo::AVX)) {
- return "avx";
- } else if (query_flag(CpuInfo::SSE4_2)) {
- return "sse4_2";
- } else {
- return "none";
- }
-}
-
-}; // namespace
-
+template <typename QueryFlagFunction>
+std::string MakeSimdLevelString(QueryFlagFunction&& query_flag) {
+ if (query_flag(CpuInfo::AVX512)) {
+ return "avx512";
+ } else if (query_flag(CpuInfo::AVX2)) {
+ return "avx2";
+ } else if (query_flag(CpuInfo::AVX)) {
+ return "avx";
+ } else if (query_flag(CpuInfo::SSE4_2)) {
+ return "sse4_2";
+ } else {
+ return "none";
+ }
+}
+
+}; // namespace
+
const BuildInfo& GetBuildInfo() { return kBuildInfo; }
-RuntimeInfo GetRuntimeInfo() {
- RuntimeInfo info;
- auto cpu_info = CpuInfo::GetInstance();
- info.simd_level =
- MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsSupported(flags); });
- info.detected_simd_level =
- MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsDetected(flags); });
- return info;
-}
-
+RuntimeInfo GetRuntimeInfo() {
+ RuntimeInfo info;
+ auto cpu_info = CpuInfo::GetInstance();
+ info.simd_level =
+ MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsSupported(flags); });
+ info.detected_simd_level =
+ MakeSimdLevelString([&](int64_t flags) { return cpu_info->IsDetected(flags); });
+ return info;
+}
+
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/config.h b/contrib/libs/apache/arrow/cpp/src/arrow/config.h
index a1abc997984..5ae7e223164 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/config.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/config.h
@@ -45,17 +45,17 @@ struct BuildInfo {
std::string package_kind;
};
-struct RuntimeInfo {
- /// The enabled SIMD level
- ///
- /// This can be less than `detected_simd_level` if the ARROW_USER_SIMD_LEVEL
- /// environment variable is set to another value.
- std::string simd_level;
-
- /// The SIMD level available on the OS and CPU
- std::string detected_simd_level;
-};
-
+struct RuntimeInfo {
+ /// The enabled SIMD level
+ ///
+ /// This can be less than `detected_simd_level` if the ARROW_USER_SIMD_LEVEL
+ /// environment variable is set to another value.
+ std::string simd_level;
+
+ /// The SIMD level available on the OS and CPU
+ std::string detected_simd_level;
+};
+
/// \brief Get runtime build info.
///
/// The returned values correspond to the exact loaded version of the Arrow library,
@@ -64,9 +64,9 @@ struct RuntimeInfo {
ARROW_EXPORT
const BuildInfo& GetBuildInfo();
-/// \brief Get runtime info.
-///
-ARROW_EXPORT
-RuntimeInfo GetRuntimeInfo();
-
+/// \brief Get runtime info.
+///
+ARROW_EXPORT
+RuntimeInfo GetRuntimeInfo();
+
} // namespace arrow
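
Together, GetBuildInfo() and GetRuntimeInfo() let an application report both the compiled and the effective SIMD configuration. A minimal sketch, assuming the BuildInfo::version_string field from the same header:

    // Sketch: printing build and runtime information.
    #include <iostream>
    #include "arrow/config.h"

    void PrintArrowInfo() {
      const arrow::BuildInfo& build = arrow::GetBuildInfo();
      arrow::RuntimeInfo runtime = arrow::GetRuntimeInfo();
      std::cout << "Arrow " << build.version_string << "\n";
      // simd_level can be lower than detected_simd_level when the
      // ARROW_USER_SIMD_LEVEL environment variable caps it.
      std::cout << "SIMD enabled:  " << runtime.simd_level << "\n";
      std::cout << "SIMD detected: " << runtime.detected_simd_level << "\n";
    }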
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/datum.cc b/contrib/libs/apache/arrow/cpp/src/arrow/datum.cc
index 5be26f62d6e..dd10fce3e4d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/datum.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/datum.cc
@@ -57,20 +57,20 @@ Datum::Datum(std::shared_ptr<RecordBatch> value) : value(std::move(value)) {}
Datum::Datum(std::shared_ptr<Table> value) : value(std::move(value)) {}
Datum::Datum(std::vector<Datum> value) : value(std::move(value)) {}
-Datum::Datum(bool value) : value(std::make_shared<BooleanScalar>(value)) {}
-Datum::Datum(int8_t value) : value(std::make_shared<Int8Scalar>(value)) {}
-Datum::Datum(uint8_t value) : value(std::make_shared<UInt8Scalar>(value)) {}
-Datum::Datum(int16_t value) : value(std::make_shared<Int16Scalar>(value)) {}
-Datum::Datum(uint16_t value) : value(std::make_shared<UInt16Scalar>(value)) {}
-Datum::Datum(int32_t value) : value(std::make_shared<Int32Scalar>(value)) {}
-Datum::Datum(uint32_t value) : value(std::make_shared<UInt32Scalar>(value)) {}
-Datum::Datum(int64_t value) : value(std::make_shared<Int64Scalar>(value)) {}
-Datum::Datum(uint64_t value) : value(std::make_shared<UInt64Scalar>(value)) {}
-Datum::Datum(float value) : value(std::make_shared<FloatScalar>(value)) {}
-Datum::Datum(double value) : value(std::make_shared<DoubleScalar>(value)) {}
-Datum::Datum(std::string value)
- : value(std::make_shared<StringScalar>(std::move(value))) {}
-Datum::Datum(const char* value) : value(std::make_shared<StringScalar>(value)) {}
+Datum::Datum(bool value) : value(std::make_shared<BooleanScalar>(value)) {}
+Datum::Datum(int8_t value) : value(std::make_shared<Int8Scalar>(value)) {}
+Datum::Datum(uint8_t value) : value(std::make_shared<UInt8Scalar>(value)) {}
+Datum::Datum(int16_t value) : value(std::make_shared<Int16Scalar>(value)) {}
+Datum::Datum(uint16_t value) : value(std::make_shared<UInt16Scalar>(value)) {}
+Datum::Datum(int32_t value) : value(std::make_shared<Int32Scalar>(value)) {}
+Datum::Datum(uint32_t value) : value(std::make_shared<UInt32Scalar>(value)) {}
+Datum::Datum(int64_t value) : value(std::make_shared<Int64Scalar>(value)) {}
+Datum::Datum(uint64_t value) : value(std::make_shared<UInt64Scalar>(value)) {}
+Datum::Datum(float value) : value(std::make_shared<FloatScalar>(value)) {}
+Datum::Datum(double value) : value(std::make_shared<DoubleScalar>(value)) {}
+Datum::Datum(std::string value)
+ : value(std::make_shared<StringScalar>(std::move(value))) {}
+Datum::Datum(const char* value) : value(std::make_shared<StringScalar>(value)) {}
Datum::Datum(const ChunkedArray& value)
: value(std::make_shared<ChunkedArray>(value.chunks(), value.type())) {}
@@ -89,26 +89,26 @@ std::shared_ptr<Array> Datum::make_array() const {
std::shared_ptr<DataType> Datum::type() const {
if (this->kind() == Datum::ARRAY) {
return util::get<std::shared_ptr<ArrayData>>(this->value)->type;
- }
- if (this->kind() == Datum::CHUNKED_ARRAY) {
+ }
+ if (this->kind() == Datum::CHUNKED_ARRAY) {
return util::get<std::shared_ptr<ChunkedArray>>(this->value)->type();
- }
- if (this->kind() == Datum::SCALAR) {
+ }
+ if (this->kind() == Datum::SCALAR) {
return util::get<std::shared_ptr<Scalar>>(this->value)->type;
}
- return nullptr;
+ return nullptr;
+}
+
+std::shared_ptr<Schema> Datum::schema() const {
+ if (this->kind() == Datum::RECORD_BATCH) {
+ return util::get<std::shared_ptr<RecordBatch>>(this->value)->schema();
+ }
+ if (this->kind() == Datum::TABLE) {
+ return util::get<std::shared_ptr<Table>>(this->value)->schema();
+ }
+ return nullptr;
}
-std::shared_ptr<Schema> Datum::schema() const {
- if (this->kind() == Datum::RECORD_BATCH) {
- return util::get<std::shared_ptr<RecordBatch>>(this->value)->schema();
- }
- if (this->kind() == Datum::TABLE) {
- return util::get<std::shared_ptr<Table>>(this->value)->schema();
- }
- return nullptr;
-}
-
int64_t Datum::length() const {
if (this->kind() == Datum::ARRAY) {
return util::get<std::shared_ptr<ArrayData>>(this->value)->length;
@@ -211,21 +211,21 @@ static std::string FormatValueDescr(const ValueDescr& descr) {
std::string ValueDescr::ToString() const { return FormatValueDescr(*this); }
-std::string ValueDescr::ToString(const std::vector<ValueDescr>& descrs) {
- std::stringstream ss;
- ss << "(";
- for (size_t i = 0; i < descrs.size(); ++i) {
- if (i > 0) {
- ss << ", ";
- }
- ss << descrs[i].ToString();
- }
- ss << ")";
- return ss.str();
-}
-
-void PrintTo(const ValueDescr& descr, std::ostream* os) { *os << descr.ToString(); }
-
+std::string ValueDescr::ToString(const std::vector<ValueDescr>& descrs) {
+ std::stringstream ss;
+ ss << "(";
+ for (size_t i = 0; i < descrs.size(); ++i) {
+ if (i > 0) {
+ ss << ", ";
+ }
+ ss << descrs[i].ToString();
+ }
+ ss << ")";
+ return ss.str();
+}
+
+void PrintTo(const ValueDescr& descr, std::ostream* os) { *os << descr.ToString(); }
+
std::string Datum::ToString() const {
switch (this->kind()) {
case Datum::NONE:
@@ -250,7 +250,7 @@ std::string Datum::ToString() const {
}
ss << values[i].ToString();
}
- ss << ')';
+ ss << ')';
return ss.str();
}
default:
@@ -262,23 +262,23 @@ std::string Datum::ToString() const {
ValueDescr::Shape GetBroadcastShape(const std::vector<ValueDescr>& args) {
for (const auto& descr : args) {
if (descr.shape == ValueDescr::ARRAY) {
- return ValueDescr::ARRAY;
+ return ValueDescr::ARRAY;
}
}
- return ValueDescr::SCALAR;
+ return ValueDescr::SCALAR;
+}
+
+void PrintTo(const Datum& datum, std::ostream* os) {
+ switch (datum.kind()) {
+ case Datum::SCALAR:
+ *os << datum.scalar()->ToString();
+ break;
+ case Datum::ARRAY:
+ *os << datum.make_array()->ToString();
+ break;
+ default:
+ *os << datum.ToString();
+ }
}
-void PrintTo(const Datum& datum, std::ostream* os) {
- switch (datum.kind()) {
- case Datum::SCALAR:
- *os << datum.scalar()->ToString();
- break;
- case Datum::ARRAY:
- *os << datum.make_array()->ToString();
- break;
- default:
- *os << datum.ToString();
- }
-}
-
} // namespace arrow
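
The accessors restored above have a simple contract: type() is defined for array-like and scalar variants, schema() only for record batches and tables, and each returns nullptr otherwise. A small sketch:

    // Sketch: Datum kind/type/schema behavior.
    #include <cassert>
    #include "arrow/datum.h"

    void DatumBasics() {
      arrow::Datum d(int64_t{42});  // wraps an Int64Scalar
      assert(d.kind() == arrow::Datum::SCALAR);
      assert(d.type()->id() == arrow::Type::INT64);
      // schema() applies to RECORD_BATCH / TABLE only; for a scalar it
      // returns nullptr, matching the implementation above.
      assert(d.schema() == nullptr);
    }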
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/datum.h b/contrib/libs/apache/arrow/cpp/src/arrow/datum.h
index d7f487c273c..6ba6af7f79e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/datum.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/datum.h
@@ -81,17 +81,17 @@ struct ARROW_EXPORT ValueDescr {
}
bool operator==(const ValueDescr& other) const {
- if (shape != other.shape) return false;
- if (type == other.type) return true;
- return type && type->Equals(other.type);
+ if (shape != other.shape) return false;
+ if (type == other.type) return true;
+ return type && type->Equals(other.type);
}
bool operator!=(const ValueDescr& other) const { return !(*this == other); }
std::string ToString() const;
- static std::string ToString(const std::vector<ValueDescr>&);
-
- ARROW_EXPORT friend void PrintTo(const ValueDescr&, std::ostream*);
+ static std::string ToString(const std::vector<ValueDescr>&);
+
+ ARROW_EXPORT friend void PrintTo(const ValueDescr&, std::ostream*);
};
/// \brief For use with scalar functions, returns the broadcasted Value::Shape
@@ -105,25 +105,25 @@ ValueDescr::Shape GetBroadcastShape(const std::vector<ValueDescr>& args);
struct ARROW_EXPORT Datum {
enum Kind { NONE, SCALAR, ARRAY, CHUNKED_ARRAY, RECORD_BATCH, TABLE, COLLECTION };
- struct Empty {};
-
+ struct Empty {};
+
  // Datum variants may have a length. This special value indicates that the
  // current variant does not have a length.
static constexpr int64_t kUnknownLength = -1;
- util::Variant<Empty, std::shared_ptr<Scalar>, std::shared_ptr<ArrayData>,
+ util::Variant<Empty, std::shared_ptr<Scalar>, std::shared_ptr<ArrayData>,
std::shared_ptr<ChunkedArray>, std::shared_ptr<RecordBatch>,
std::shared_ptr<Table>, std::vector<Datum>>
value;
/// \brief Empty datum, to be populated elsewhere
- Datum() = default;
+ Datum() = default;
+
+ Datum(const Datum& other) = default;
+ Datum& operator=(const Datum& other) = default;
+ Datum(Datum&& other) = default;
+ Datum& operator=(Datum&& other) = default;
- Datum(const Datum& other) = default;
- Datum& operator=(const Datum& other) = default;
- Datum(Datum&& other) = default;
- Datum& operator=(Datum&& other) = default;
-
Datum(std::shared_ptr<Scalar> value) // NOLINT implicit conversion
: value(std::move(value)) {}
@@ -163,8 +163,8 @@ struct ARROW_EXPORT Datum {
explicit Datum(uint64_t value);
explicit Datum(float value);
explicit Datum(double value);
- explicit Datum(std::string value);
- explicit Datum(const char* value);
+ explicit Datum(std::string value);
+ explicit Datum(const char* value);
Datum::Kind kind() const {
switch (this->value.index()) {
@@ -216,11 +216,11 @@ struct ARROW_EXPORT Datum {
}
template <typename ExactType>
- std::shared_ptr<ExactType> array_as() const {
- return internal::checked_pointer_cast<ExactType>(this->make_array());
- }
-
- template <typename ExactType>
+ std::shared_ptr<ExactType> array_as() const {
+ return internal::checked_pointer_cast<ExactType>(this->make_array());
+ }
+
+ template <typename ExactType>
const ExactType& scalar_as() const {
return internal::checked_cast<const ExactType&>(*this->scalar());
}
@@ -253,11 +253,11 @@ struct ARROW_EXPORT Datum {
/// \return nullptr if no type
std::shared_ptr<DataType> type() const;
- /// \brief The schema of the variant, if any
- ///
- /// \return nullptr if no schema
- std::shared_ptr<Schema> schema() const;
-
+ /// \brief The schema of the variant, if any
+ ///
+ /// \return nullptr if no schema
+ std::shared_ptr<Schema> schema() const;
+
/// \brief The value length of the variant, if any
///
/// \return kUnknownLength if no type
@@ -274,8 +274,8 @@ struct ARROW_EXPORT Datum {
bool operator!=(const Datum& other) const { return !Equals(other); }
std::string ToString() const;
-
- ARROW_EXPORT friend void PrintTo(const Datum&, std::ostream*);
+
+ ARROW_EXPORT friend void PrintTo(const Datum&, std::ostream*);
};
} // namespace arrow
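
GetBroadcastShape(), declared above, reduces a set of argument descriptors to a single shape: ARRAY if any argument is array-shaped, SCALAR otherwise. A sketch using the ValueDescr factory helpers from this header:

    // Sketch: broadcasting ValueDescr shapes.
    #include <vector>
    #include "arrow/datum.h"
    #include "arrow/type_fwd.h"

    void BroadcastExample() {
      std::vector<arrow::ValueDescr> args = {
          arrow::ValueDescr::Scalar(arrow::int32()),
          arrow::ValueDescr::Array(arrow::int32()),
      };
      // ARRAY, because at least one argument is array-shaped.
      arrow::ValueDescr::Shape shape = arrow::GetBroadcastShape(args);
      (void)shape;
    }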
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc
index 86893cb5837..7804c130ca1 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.cc
@@ -476,14 +476,14 @@ Result<std::shared_ptr<Buffer>> BufferedInputStream::DoRead(int64_t nbytes) {
return impl_->Read(nbytes);
}
-Result<std::shared_ptr<const KeyValueMetadata>> BufferedInputStream::ReadMetadata() {
- return impl_->raw()->ReadMetadata();
-}
-
-Future<std::shared_ptr<const KeyValueMetadata>> BufferedInputStream::ReadMetadataAsync(
- const IOContext& io_context) {
- return impl_->raw()->ReadMetadataAsync(io_context);
-}
-
+Result<std::shared_ptr<const KeyValueMetadata>> BufferedInputStream::ReadMetadata() {
+ return impl_->raw()->ReadMetadata();
+}
+
+Future<std::shared_ptr<const KeyValueMetadata>> BufferedInputStream::ReadMetadataAsync(
+ const IOContext& io_context) {
+ return impl_->raw()->ReadMetadataAsync(io_context);
+}
+
} // namespace io
} // namespace arrow
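
Both metadata entry points above are pure delegations to the wrapped stream. A hedged usage sketch, assuming Arrow's BufferedInputStream::Create factory taking a buffer size, memory pool, and raw stream:

    // Sketch: metadata calls pass through to the underlying stream.
    #include <string>
    #include "arrow/io/buffered.h"
    #include "arrow/io/file.h"
    #include "arrow/memory_pool.h"
    #include "arrow/result.h"

    arrow::Status ReadWithBuffer(const std::string& path) {
      ARROW_ASSIGN_OR_RAISE(auto raw, arrow::io::ReadableFile::Open(path));
      ARROW_ASSIGN_OR_RAISE(auto buffered,
                            arrow::io::BufferedInputStream::Create(
                                /*buffer_size=*/1 << 16,
                                arrow::default_memory_pool(), raw));
      // Delegates to raw->ReadMetadata(); plain files typically return null.
      ARROW_ASSIGN_OR_RAISE(auto metadata, buffered->ReadMetadata());
      return arrow::Status::OK();
    }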
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h
index 3bcc3a82c1c..8116613fa4e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/buffered.h
@@ -132,9 +132,9 @@ class ARROW_EXPORT BufferedInputStream
// InputStream APIs
bool closed() const override;
- Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
- Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
- const IOContext& io_context) override;
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
+ Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+ const IOContext& io_context) override;
private:
friend InputStreamConcurrencyWrapper<BufferedInputStream>;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc
index 8031d897ba5..722026ccd9b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.cc
@@ -16,11 +16,11 @@
// under the License.
#include <algorithm>
-#include <atomic>
+#include <atomic>
#include <cmath>
-#include <mutex>
+#include <mutex>
#include <utility>
-#include <vector>
+#include <vector>
#include "arrow/buffer.h"
#include "arrow/io/caching.h"
@@ -34,16 +34,16 @@ namespace io {
CacheOptions CacheOptions::Defaults() {
return CacheOptions{internal::ReadRangeCache::kDefaultHoleSizeLimit,
- internal::ReadRangeCache::kDefaultRangeSizeLimit,
- /*lazy=*/false};
+ internal::ReadRangeCache::kDefaultRangeSizeLimit,
+ /*lazy=*/false};
+}
+
+CacheOptions CacheOptions::LazyDefaults() {
+ return CacheOptions{internal::ReadRangeCache::kDefaultHoleSizeLimit,
+ internal::ReadRangeCache::kDefaultRangeSizeLimit,
+ /*lazy=*/true};
}
-CacheOptions CacheOptions::LazyDefaults() {
- return CacheOptions{internal::ReadRangeCache::kDefaultHoleSizeLimit,
- internal::ReadRangeCache::kDefaultRangeSizeLimit,
- /*lazy=*/true};
-}
-
CacheOptions CacheOptions::MakeFromNetworkMetrics(int64_t time_to_first_byte_millis,
int64_t transfer_bandwidth_mib_per_sec,
double ideal_bandwidth_utilization_frac,
@@ -125,7 +125,7 @@ CacheOptions CacheOptions::MakeFromNetworkMetrics(int64_t time_to_first_byte_mil
(1 - ideal_bandwidth_utilization_frac))));
DCHECK_GT(range_size_limit, 0) << "Computed range_size_limit must be > 0";
- return {hole_size_limit, range_size_limit, false};
+ return {hole_size_limit, range_size_limit, false};
}
namespace internal {
@@ -134,10 +134,10 @@ struct RangeCacheEntry {
ReadRange range;
Future<std::shared_ptr<Buffer>> future;
- RangeCacheEntry() = default;
- RangeCacheEntry(const ReadRange& range_, Future<std::shared_ptr<Buffer>> future_)
- : range(range_), future(std::move(future_)) {}
-
+ RangeCacheEntry() = default;
+ RangeCacheEntry(const ReadRange& range_, Future<std::shared_ptr<Buffer>> future_)
+ : range(range_), future(std::move(future_)) {}
+
friend bool operator<(const RangeCacheEntry& left, const RangeCacheEntry& right) {
return left.range.offset < right.range.offset;
}
@@ -145,36 +145,36 @@ struct RangeCacheEntry {
struct ReadRangeCache::Impl {
std::shared_ptr<RandomAccessFile> file;
- IOContext ctx;
+ IOContext ctx;
CacheOptions options;
// Ordered by offset (so as to find a matching region by binary search)
std::vector<RangeCacheEntry> entries;
- virtual ~Impl() = default;
-
- // Get the future corresponding to a range
- virtual Future<std::shared_ptr<Buffer>> MaybeRead(RangeCacheEntry* entry) {
- return entry->future;
- }
-
- // Make cache entries for ranges
- virtual std::vector<RangeCacheEntry> MakeCacheEntries(
- const std::vector<ReadRange>& ranges) {
- std::vector<RangeCacheEntry> new_entries;
- new_entries.reserve(ranges.size());
- for (const auto& range : ranges) {
- new_entries.emplace_back(range, file->ReadAsync(ctx, range.offset, range.length));
- }
- return new_entries;
- }
-
- // Add the given ranges to the cache, coalescing them where possible
- virtual Status Cache(std::vector<ReadRange> ranges) {
- ranges = internal::CoalesceReadRanges(std::move(ranges), options.hole_size_limit,
- options.range_size_limit);
- std::vector<RangeCacheEntry> new_entries = MakeCacheEntries(ranges);
- // Add new entries, themselves ordered by offset
+ virtual ~Impl() = default;
+
+ // Get the future corresponding to a range
+ virtual Future<std::shared_ptr<Buffer>> MaybeRead(RangeCacheEntry* entry) {
+ return entry->future;
+ }
+
+ // Make cache entries for ranges
+ virtual std::vector<RangeCacheEntry> MakeCacheEntries(
+ const std::vector<ReadRange>& ranges) {
+ std::vector<RangeCacheEntry> new_entries;
+ new_entries.reserve(ranges.size());
+ for (const auto& range : ranges) {
+ new_entries.emplace_back(range, file->ReadAsync(ctx, range.offset, range.length));
+ }
+ return new_entries;
+ }
+
+ // Add the given ranges to the cache, coalescing them where possible
+ virtual Status Cache(std::vector<ReadRange> ranges) {
+ ranges = internal::CoalesceReadRanges(std::move(ranges), options.hole_size_limit,
+ options.range_size_limit);
+ std::vector<RangeCacheEntry> new_entries = MakeCacheEntries(ranges);
+ // Add new entries, themselves ordered by offset
if (entries.size() > 0) {
std::vector<RangeCacheEntry> merged(entries.size() + new_entries.size());
std::merge(entries.begin(), entries.end(), new_entries.begin(), new_entries.end(),
@@ -183,134 +183,134 @@ struct ReadRangeCache::Impl {
} else {
entries = std::move(new_entries);
}
- // Prefetch immediately, regardless of executor availability, if possible
- return file->WillNeed(ranges);
+ // Prefetch immediately, regardless of executor availability, if possible
+ return file->WillNeed(ranges);
+ }
+
+ // Read the given range from the cache, blocking if needed. Cannot read a range
+ // that spans cache entries.
+ virtual Result<std::shared_ptr<Buffer>> Read(ReadRange range) {
+ if (range.length == 0) {
+ static const uint8_t byte = 0;
+ return std::make_shared<Buffer>(&byte, 0);
+ }
+
+ const auto it = std::lower_bound(
+ entries.begin(), entries.end(), range,
+ [](const RangeCacheEntry& entry, const ReadRange& range) {
+ return entry.range.offset + entry.range.length < range.offset + range.length;
+ });
+ if (it != entries.end() && it->range.Contains(range)) {
+ auto fut = MaybeRead(&*it);
+ ARROW_ASSIGN_OR_RAISE(auto buf, fut.result());
+ return SliceBuffer(std::move(buf), range.offset - it->range.offset, range.length);
+ }
+ return Status::Invalid("ReadRangeCache did not find matching cache entry");
+ }
+
+ virtual Future<> Wait() {
+ std::vector<Future<>> futures;
+ for (auto& entry : entries) {
+ futures.emplace_back(MaybeRead(&entry));
+ }
+ return AllComplete(futures);
+ }
+
+ // Return a Future that completes when the given ranges have been read.
+ virtual Future<> WaitFor(std::vector<ReadRange> ranges) {
+ auto end = std::remove_if(ranges.begin(), ranges.end(),
+ [](const ReadRange& range) { return range.length == 0; });
+ ranges.resize(end - ranges.begin());
+ std::vector<Future<>> futures;
+ futures.reserve(ranges.size());
+ for (auto& range : ranges) {
+ const auto it = std::lower_bound(
+ entries.begin(), entries.end(), range,
+ [](const RangeCacheEntry& entry, const ReadRange& range) {
+ return entry.range.offset + entry.range.length < range.offset + range.length;
+ });
+ if (it != entries.end() && it->range.Contains(range)) {
+ futures.push_back(Future<>(MaybeRead(&*it)));
+ } else {
+ return Status::Invalid("Range was not requested for caching: offset=",
+ range.offset, " length=", range.length);
+ }
+ }
+ return AllComplete(futures);
+ }
+};
+
+// Don't read ranges when they're first added. Instead, wait until they're requested
+// (either through Read or WaitFor).
+struct ReadRangeCache::LazyImpl : public ReadRangeCache::Impl {
+ // Protect against concurrent modification of entries[i]->future
+ std::mutex entry_mutex;
+
+ virtual ~LazyImpl() = default;
+
+ Future<std::shared_ptr<Buffer>> MaybeRead(RangeCacheEntry* entry) override {
+ // Called by superclass Read()/WaitFor() so we have the lock
+ if (!entry->future.is_valid()) {
+ entry->future = file->ReadAsync(ctx, entry->range.offset, entry->range.length);
+ }
+ return entry->future;
+ }
+
+ std::vector<RangeCacheEntry> MakeCacheEntries(
+ const std::vector<ReadRange>& ranges) override {
+ std::vector<RangeCacheEntry> new_entries;
+ new_entries.reserve(ranges.size());
+ for (const auto& range : ranges) {
+ // In the lazy variant, don't read data here - later, a call to Read or WaitFor
+ // will call back to MaybeRead (under the lock) which will fill the future.
+ new_entries.emplace_back(range, Future<std::shared_ptr<Buffer>>());
+ }
+ return new_entries;
+ }
+
+ Status Cache(std::vector<ReadRange> ranges) override {
+ std::unique_lock<std::mutex> guard(entry_mutex);
+ return ReadRangeCache::Impl::Cache(std::move(ranges));
+ }
+
+ Result<std::shared_ptr<Buffer>> Read(ReadRange range) override {
+ std::unique_lock<std::mutex> guard(entry_mutex);
+ return ReadRangeCache::Impl::Read(range);
+ }
+
+ Future<> Wait() override {
+ std::unique_lock<std::mutex> guard(entry_mutex);
+ return ReadRangeCache::Impl::Wait();
+ }
+
+ Future<> WaitFor(std::vector<ReadRange> ranges) override {
+ std::unique_lock<std::mutex> guard(entry_mutex);
+ return ReadRangeCache::Impl::WaitFor(std::move(ranges));
}
-
- // Read the given range from the cache, blocking if needed. Cannot read a range
- // that spans cache entries.
- virtual Result<std::shared_ptr<Buffer>> Read(ReadRange range) {
- if (range.length == 0) {
- static const uint8_t byte = 0;
- return std::make_shared<Buffer>(&byte, 0);
- }
-
- const auto it = std::lower_bound(
- entries.begin(), entries.end(), range,
- [](const RangeCacheEntry& entry, const ReadRange& range) {
- return entry.range.offset + entry.range.length < range.offset + range.length;
- });
- if (it != entries.end() && it->range.Contains(range)) {
- auto fut = MaybeRead(&*it);
- ARROW_ASSIGN_OR_RAISE(auto buf, fut.result());
- return SliceBuffer(std::move(buf), range.offset - it->range.offset, range.length);
- }
- return Status::Invalid("ReadRangeCache did not find matching cache entry");
- }
-
- virtual Future<> Wait() {
- std::vector<Future<>> futures;
- for (auto& entry : entries) {
- futures.emplace_back(MaybeRead(&entry));
- }
- return AllComplete(futures);
- }
-
- // Return a Future that completes when the given ranges have been read.
- virtual Future<> WaitFor(std::vector<ReadRange> ranges) {
- auto end = std::remove_if(ranges.begin(), ranges.end(),
- [](const ReadRange& range) { return range.length == 0; });
- ranges.resize(end - ranges.begin());
- std::vector<Future<>> futures;
- futures.reserve(ranges.size());
- for (auto& range : ranges) {
- const auto it = std::lower_bound(
- entries.begin(), entries.end(), range,
- [](const RangeCacheEntry& entry, const ReadRange& range) {
- return entry.range.offset + entry.range.length < range.offset + range.length;
- });
- if (it != entries.end() && it->range.Contains(range)) {
- futures.push_back(Future<>(MaybeRead(&*it)));
- } else {
- return Status::Invalid("Range was not requested for caching: offset=",
- range.offset, " length=", range.length);
- }
- }
- return AllComplete(futures);
- }
};
-// Don't read ranges when they're first added. Instead, wait until they're requested
-// (either through Read or WaitFor).
-struct ReadRangeCache::LazyImpl : public ReadRangeCache::Impl {
- // Protect against concurrent modification of entries[i]->future
- std::mutex entry_mutex;
-
- virtual ~LazyImpl() = default;
-
- Future<std::shared_ptr<Buffer>> MaybeRead(RangeCacheEntry* entry) override {
- // Called by superclass Read()/WaitFor() so we have the lock
- if (!entry->future.is_valid()) {
- entry->future = file->ReadAsync(ctx, entry->range.offset, entry->range.length);
- }
- return entry->future;
- }
-
- std::vector<RangeCacheEntry> MakeCacheEntries(
- const std::vector<ReadRange>& ranges) override {
- std::vector<RangeCacheEntry> new_entries;
- new_entries.reserve(ranges.size());
- for (const auto& range : ranges) {
- // In the lazy variant, don't read data here - later, a call to Read or WaitFor
- // will call back to MaybeRead (under the lock) which will fill the future.
- new_entries.emplace_back(range, Future<std::shared_ptr<Buffer>>());
- }
- return new_entries;
- }
-
- Status Cache(std::vector<ReadRange> ranges) override {
- std::unique_lock<std::mutex> guard(entry_mutex);
- return ReadRangeCache::Impl::Cache(std::move(ranges));
- }
-
- Result<std::shared_ptr<Buffer>> Read(ReadRange range) override {
- std::unique_lock<std::mutex> guard(entry_mutex);
- return ReadRangeCache::Impl::Read(range);
- }
-
- Future<> Wait() override {
- std::unique_lock<std::mutex> guard(entry_mutex);
- return ReadRangeCache::Impl::Wait();
- }
-
- Future<> WaitFor(std::vector<ReadRange> ranges) override {
- std::unique_lock<std::mutex> guard(entry_mutex);
- return ReadRangeCache::Impl::WaitFor(std::move(ranges));
- }
-};
-
-ReadRangeCache::ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
+ReadRangeCache::ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
CacheOptions options)
- : impl_(options.lazy ? new LazyImpl() : new Impl()) {
+ : impl_(options.lazy ? new LazyImpl() : new Impl()) {
impl_->file = std::move(file);
impl_->ctx = std::move(ctx);
impl_->options = options;
}
-ReadRangeCache::~ReadRangeCache() = default;
+ReadRangeCache::~ReadRangeCache() = default;
Status ReadRangeCache::Cache(std::vector<ReadRange> ranges) {
- return impl_->Cache(std::move(ranges));
+ return impl_->Cache(std::move(ranges));
}
Result<std::shared_ptr<Buffer>> ReadRangeCache::Read(ReadRange range) {
- return impl_->Read(range);
-}
+ return impl_->Read(range);
+}
+
+Future<> ReadRangeCache::Wait() { return impl_->Wait(); }
-Future<> ReadRangeCache::Wait() { return impl_->Wait(); }
-
-Future<> ReadRangeCache::WaitFor(std::vector<ReadRange> ranges) {
- return impl_->WaitFor(std::move(ranges));
+Future<> ReadRangeCache::WaitFor(std::vector<ReadRange> ranges) {
+ return impl_->WaitFor(std::move(ranges));
}
} // namespace internal
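
The two presets defined above share the default coalescing limits and differ only in the lazy flag, which is also why operator== treats them as distinct. A sketch:

    // Sketch: eager vs. lazy cache presets.
    #include "arrow/io/caching.h"

    void CompareCacheOptions() {
      auto eager = arrow::io::CacheOptions::Defaults();     // lazy == false
      auto lazy = arrow::io::CacheOptions::LazyDefaults();  // lazy == true
      bool same_limits = eager.hole_size_limit == lazy.hole_size_limit &&
                         eager.range_size_limit == lazy.range_size_limit;
      (void)same_limits;  // true: only the lazy flag differs
    }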
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h
index 833b36e31a0..59a9b60e82f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/caching.h
@@ -24,7 +24,7 @@
#include <vector>
#include "arrow/io/interfaces.h"
-#include "arrow/util/type_fwd.h"
+#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -34,19 +34,19 @@ struct ARROW_EXPORT CacheOptions {
static constexpr double kDefaultIdealBandwidthUtilizationFrac = 0.9;
static constexpr int64_t kDefaultMaxIdealRequestSizeMib = 64;
- /// \brief The maximum distance in bytes between two consecutive
+ /// \brief The maximum distance in bytes between two consecutive
/// ranges; beyond this value, ranges are not combined
int64_t hole_size_limit;
- /// \brief The maximum size in bytes of a combined range; if
+ /// \brief The maximum size in bytes of a combined range; if
/// combining two consecutive ranges would produce a range of a
/// size greater than this, they are not combined
int64_t range_size_limit;
- /// \brief A lazy cache does not perform any I/O until requested.
- bool lazy;
+ /// \brief A lazy cache does not perform any I/O until requested.
+ bool lazy;
bool operator==(const CacheOptions& other) const {
return hole_size_limit == other.hole_size_limit &&
- range_size_limit == other.range_size_limit && lazy == other.lazy;
+ range_size_limit == other.range_size_limit && lazy == other.lazy;
}
/// \brief Construct CacheOptions from network storage metrics (e.g. S3).
@@ -69,45 +69,45 @@ struct ARROW_EXPORT CacheOptions {
int64_t max_ideal_request_size_mib = kDefaultMaxIdealRequestSizeMib);
static CacheOptions Defaults();
- static CacheOptions LazyDefaults();
+ static CacheOptions LazyDefaults();
};
namespace internal {
/// \brief A read cache designed to hide IO latencies when reading.
///
-/// This class takes multiple byte ranges that an application expects to read, and
-/// coalesces them into fewer, larger read requests, which benefits performance on some
-/// filesystems, particularly remote ones like Amazon S3. By default, it also issues
-/// these read requests in parallel up front.
-///
-/// To use:
-/// 1. Cache() the ranges you expect to read in the future. Ideally, these ranges have
-/// the exact offset and length that will later be read. The cache will combine those
-/// ranges according to parameters (see constructor).
-///
-/// By default, the cache will also start fetching the combined ranges in parallel in
-/// the background, unless CacheOptions.lazy is set.
-///
-/// 2. Call WaitFor() to be notified when the given ranges have been read. If
-/// CacheOptions.lazy is set, I/O will be triggered in the background here instead.
-/// This can be done in parallel (e.g. if parsing a file, call WaitFor() for each
-/// chunk of the file that can be parsed in parallel).
-///
-/// 3. Call Read() to retrieve the actual data for the given ranges.
-/// A synchronous application may skip WaitFor() and just call Read() - it will still
-/// benefit from coalescing and parallel fetching.
+/// This class takes multiple byte ranges that an application expects to read, and
+/// coalesces them into fewer, larger read requests, which benefits performance on some
+/// filesystems, particularly remote ones like Amazon S3. By default, it also issues
+/// these read requests in parallel up front.
+///
+/// To use:
+/// 1. Cache() the ranges you expect to read in the future. Ideally, these ranges have
+/// the exact offset and length that will later be read. The cache will combine those
+/// ranges according to parameters (see constructor).
+///
+/// By default, the cache will also start fetching the combined ranges in parallel in
+/// the background, unless CacheOptions.lazy is set.
+///
+/// 2. Call WaitFor() to be notified when the given ranges have been read. If
+/// CacheOptions.lazy is set, I/O will be triggered in the background here instead.
+/// This can be done in parallel (e.g. if parsing a file, call WaitFor() for each
+/// chunk of the file that can be parsed in parallel).
+///
+/// 3. Call Read() to retrieve the actual data for the given ranges.
+/// A synchronous application may skip WaitFor() and just call Read() - it will still
+/// benefit from coalescing and parallel fetching.
class ARROW_EXPORT ReadRangeCache {
public:
static constexpr int64_t kDefaultHoleSizeLimit = 8192;
static constexpr int64_t kDefaultRangeSizeLimit = 32 * 1024 * 1024;
/// Construct a read cache with default options
- explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx)
+ explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx)
: ReadRangeCache(file, std::move(ctx), CacheOptions::Defaults()) {}
/// Construct a read cache with given options
- explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
+ explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
CacheOptions options);
~ReadRangeCache();
@@ -120,16 +120,16 @@ class ARROW_EXPORT ReadRangeCache {
/// \brief Read a range previously given to Cache().
Result<std::shared_ptr<Buffer>> Read(ReadRange range);
- /// \brief Wait until all ranges added so far have been cached.
- Future<> Wait();
-
- /// \brief Wait until all given ranges have been cached.
- Future<> WaitFor(std::vector<ReadRange> ranges);
-
+ /// \brief Wait until all ranges added so far have been cached.
+ Future<> Wait();
+
+ /// \brief Wait until all given ranges have been cached.
+ Future<> WaitFor(std::vector<ReadRange> ranges);
+
protected:
struct Impl;
- struct LazyImpl;
-
+ struct LazyImpl;
+
std::unique_ptr<Impl> impl_;
};
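
The three-step protocol documented above (Cache, then WaitFor, then Read) looks roughly as follows when driving the internal ReadRangeCache directly; normally Arrow's file readers do this on the application's behalf.

    // Minimal sketch of the Cache/WaitFor/Read protocol.
    #include <memory>
    #include <vector>
    #include "arrow/io/caching.h"
    #include "arrow/io/interfaces.h"
    #include "arrow/result.h"
    #include "arrow/status.h"

    arrow::Status CachedReads(std::shared_ptr<arrow::io::RandomAccessFile> file) {
      arrow::io::internal::ReadRangeCache cache(
          file, arrow::io::IOContext(), arrow::io::CacheOptions::Defaults());
      std::vector<arrow::io::ReadRange> ranges = {{0, 100}, {4096, 256}};
      ARROW_RETURN_NOT_OK(cache.Cache(ranges));             // 1. declare + coalesce
      ARROW_RETURN_NOT_OK(cache.WaitFor(ranges).status());  // 2. wait (or trigger, if lazy)
      ARROW_ASSIGN_OR_RAISE(auto buf, cache.Read({0, 100}));  // 3. read back
      return arrow::Status::OK();
    }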
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc
index 0e6f4dc339a..72977f0f297 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.cc
@@ -342,7 +342,7 @@ class CompressedInputStream::Impl {
RETURN_NOT_OK(EnsureCompressedData());
if (compressed_pos_ == compressed_->size()) {
// No more data to decompress
- if (!fresh_decompressor_ && !decompressor_->IsFinished()) {
+ if (!fresh_decompressor_ && !decompressor_->IsFinished()) {
return Status::IOError("Truncated compressed stream");
}
*has_data = false;
@@ -437,14 +437,14 @@ Result<std::shared_ptr<Buffer>> CompressedInputStream::DoRead(int64_t nbytes) {
std::shared_ptr<InputStream> CompressedInputStream::raw() const { return impl_->raw(); }
-Result<std::shared_ptr<const KeyValueMetadata>> CompressedInputStream::ReadMetadata() {
- return impl_->raw()->ReadMetadata();
-}
-
-Future<std::shared_ptr<const KeyValueMetadata>> CompressedInputStream::ReadMetadataAsync(
- const IOContext& io_context) {
- return impl_->raw()->ReadMetadataAsync(io_context);
-}
-
+Result<std::shared_ptr<const KeyValueMetadata>> CompressedInputStream::ReadMetadata() {
+ return impl_->raw()->ReadMetadata();
+}
+
+Future<std::shared_ptr<const KeyValueMetadata>> CompressedInputStream::ReadMetadataAsync(
+ const IOContext& io_context) {
+ return impl_->raw()->ReadMetadataAsync(io_context);
+}
+
} // namespace io
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h
index 9eb5e44139f..cd1a7f673ce 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/compressed.h
@@ -89,9 +89,9 @@ class ARROW_EXPORT CompressedInputStream
// InputStream interface
bool closed() const override;
- Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
- Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
- const IOContext& io_context) override;
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
+ Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+ const IOContext& io_context) override;
/// \brief Return the underlying raw input stream.
std::shared_ptr<InputStream> raw() const;
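
CompressedInputStream mirrors BufferedInputStream here: metadata is read from raw(). A hedged sketch of constructing one, assuming the Make factory taking a Codec and a raw stream:

    // Sketch: reading through a decompressing stream.
    #include <memory>
    #include "arrow/io/compressed.h"
    #include "arrow/result.h"
    #include "arrow/util/compression.h"

    arrow::Status ReadGzipped(std::shared_ptr<arrow::io::InputStream> raw) {
      ARROW_ASSIGN_OR_RAISE(auto codec,
                            arrow::util::Codec::Create(arrow::Compression::GZIP));
      ARROW_ASSIGN_OR_RAISE(auto stream, arrow::io::CompressedInputStream::Make(
                                             codec.get(), raw));
      ARROW_ASSIGN_OR_RAISE(auto chunk, stream->Read(4096));  // decompressed bytes
      return arrow::Status::OK();
    }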
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc
index 25308240653..70e15335af2 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/file.cc
@@ -390,11 +390,11 @@ class MemoryMappedFile::MemoryMap
// An object representing the entire memory-mapped region.
// It can be sliced in order to return individual subregions, which
// will then keep the original region alive as long as necessary.
- class Region : public Buffer {
+ class Region : public Buffer {
public:
Region(std::shared_ptr<MemoryMappedFile::MemoryMap> memory_map, uint8_t* data,
int64_t size)
- : Buffer(data, size) {
+ : Buffer(data, size) {
is_mutable_ = memory_map->writable();
}
@@ -539,8 +539,8 @@ class MemoryMappedFile::MemoryMap
void advance(int64_t nbytes) { position_ = position_ + nbytes; }
- uint8_t* data() { return region_ ? region_->data() : nullptr; }
-
+ uint8_t* data() { return region_ ? region_->data() : nullptr; }
+
uint8_t* head() { return data() + position_; }
bool writable() { return file_->mode() != FileMode::READ; }
@@ -696,7 +696,7 @@ Result<std::shared_ptr<Buffer>> MemoryMappedFile::Read(int64_t nbytes) {
return buffer;
}
-Future<std::shared_ptr<Buffer>> MemoryMappedFile::ReadAsync(const IOContext&,
+Future<std::shared_ptr<Buffer>> MemoryMappedFile::ReadAsync(const IOContext&,
int64_t position,
int64_t nbytes) {
return Future<std::shared_ptr<Buffer>>::MakeFinished(ReadAt(position, nbytes));
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/file.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/file.h
index 4447f82174f..50d4f2c4dfc 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/file.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/file.h
@@ -185,7 +185,7 @@ class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface {
Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override;
// Synchronous ReadAsync override
- Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
+ Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
int64_t nbytes) override;
Status WillNeed(const std::vector<ReadRange>& ranges) override;
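
Because MemoryMappedFile::ReadAsync simply wraps a synchronous ReadAt in MakeFinished (see file.cc above), the returned future never blocks. A sketch:

    // Sketch: "async" reads on a memory-mapped file complete immediately.
    #include <string>
    #include "arrow/io/file.h"
    #include "arrow/result.h"

    arrow::Status MappedRead(const std::string& path) {
      ARROW_ASSIGN_OR_RAISE(auto mmap, arrow::io::MemoryMappedFile::Open(
                                           path, arrow::io::FileMode::READ));
      auto fut = mmap->ReadAsync(arrow::io::IOContext(), /*position=*/0,
                                 /*nbytes=*/64);
      // The future is already finished, so result() does not wait.
      ARROW_ASSIGN_OR_RAISE(auto buffer, fut.result());
      return arrow::Status::OK();
    }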
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc
index cf5d71f2cc8..954c0f37b2d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.cc
@@ -29,11 +29,11 @@
#include "arrow/buffer.h"
#include "arrow/io/concurrency.h"
-#include "arrow/io/type_fwd.h"
+#include "arrow/io/type_fwd.h"
#include "arrow/io/util_internal.h"
#include "arrow/result.h"
#include "arrow/status.h"
-#include "arrow/util/checked_cast.h"
+#include "arrow/util/checked_cast.h"
#include "arrow/util/future.h"
#include "arrow/util/iterator.h"
#include "arrow/util/logging.h"
@@ -42,36 +42,36 @@
namespace arrow {
-using internal::checked_pointer_cast;
+using internal::checked_pointer_cast;
using internal::Executor;
using internal::TaskHints;
using internal::ThreadPool;
namespace io {
-static IOContext g_default_io_context{};
+static IOContext g_default_io_context{};
-IOContext::IOContext(MemoryPool* pool, StopToken stop_token)
- : IOContext(pool, internal::GetIOThreadPool(), std::move(stop_token)) {}
+IOContext::IOContext(MemoryPool* pool, StopToken stop_token)
+ : IOContext(pool, internal::GetIOThreadPool(), std::move(stop_token)) {}
+
+const IOContext& default_io_context() { return g_default_io_context; }
+
+int GetIOThreadPoolCapacity() { return internal::GetIOThreadPool()->GetCapacity(); }
+
+Status SetIOThreadPoolCapacity(int threads) {
+ return internal::GetIOThreadPool()->SetCapacity(threads);
+}
-const IOContext& default_io_context() { return g_default_io_context; }
-
-int GetIOThreadPoolCapacity() { return internal::GetIOThreadPool()->GetCapacity(); }
-
-Status SetIOThreadPoolCapacity(int threads) {
- return internal::GetIOThreadPool()->SetCapacity(threads);
-}
-
FileInterface::~FileInterface() = default;
Status FileInterface::Abort() { return Close(); }
-namespace {
-
+namespace {
+
class InputStreamBlockIterator {
public:
InputStreamBlockIterator(std::shared_ptr<InputStream> stream, int64_t block_size)
- : stream_(std::move(stream)), block_size_(block_size) {}
+ : stream_(std::move(stream)), block_size_(block_size) {}
Result<std::shared_ptr<Buffer>> Next() {
if (done_) {
@@ -95,10 +95,10 @@ class InputStreamBlockIterator {
bool done_ = false;
};
-} // namespace
-
-const IOContext& Readable::io_context() const { return g_default_io_context; }
-
+} // namespace
+
+const IOContext& Readable::io_context() const { return g_default_io_context; }
+
Status InputStream::Advance(int64_t nbytes) { return Read(nbytes).status(); }
Result<util::string_view> InputStream::Peek(int64_t ARROW_ARG_UNUSED(nbytes)) {
@@ -107,22 +107,22 @@ Result<util::string_view> InputStream::Peek(int64_t ARROW_ARG_UNUSED(nbytes)) {
bool InputStream::supports_zero_copy() const { return false; }
-Result<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadata() {
- return std::shared_ptr<const KeyValueMetadata>{};
-}
-
-// Default ReadMetadataAsync() implementation: simply issue the read on the context's
-// executor
-Future<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadataAsync(
- const IOContext& ctx) {
- auto self = shared_from_this();
- return DeferNotOk(internal::SubmitIO(ctx, [self] { return self->ReadMetadata(); }));
-}
-
-Future<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadataAsync() {
- return ReadMetadataAsync(io_context());
-}
-
+Result<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadata() {
+ return std::shared_ptr<const KeyValueMetadata>{};
+}
+
+// Default ReadMetadataAsync() implementation: simply issue the read on the context's
+// executor
+Future<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadataAsync(
+ const IOContext& ctx) {
+ auto self = shared_from_this();
+ return DeferNotOk(internal::SubmitIO(ctx, [self] { return self->ReadMetadata(); }));
+}
+
+Future<std::shared_ptr<const KeyValueMetadata>> InputStream::ReadMetadataAsync() {
+ return ReadMetadataAsync(io_context());
+}
+
Result<Iterator<std::shared_ptr<Buffer>>> MakeInputStreamIterator(
std::shared_ptr<InputStream> stream, int64_t block_size) {
if (stream->closed()) {
@@ -132,13 +132,13 @@ Result<Iterator<std::shared_ptr<Buffer>>> MakeInputStreamIterator(
return Iterator<std::shared_ptr<Buffer>>(InputStreamBlockIterator(stream, block_size));
}
-struct RandomAccessFile::Impl {
+struct RandomAccessFile::Impl {
std::mutex lock_;
};
RandomAccessFile::~RandomAccessFile() = default;
-RandomAccessFile::RandomAccessFile() : interface_impl_(new Impl()) {}
+RandomAccessFile::RandomAccessFile() : interface_impl_(new Impl()) {}
Result<int64_t> RandomAccessFile::ReadAt(int64_t position, int64_t nbytes, void* out) {
std::lock_guard<std::mutex> lock(interface_impl_->lock_);
@@ -154,26 +154,26 @@ Result<std::shared_ptr<Buffer>> RandomAccessFile::ReadAt(int64_t position,
}
// Default ReadAsync() implementation: simply issue the read on the context's executor
-Future<std::shared_ptr<Buffer>> RandomAccessFile::ReadAsync(const IOContext& ctx,
+Future<std::shared_ptr<Buffer>> RandomAccessFile::ReadAsync(const IOContext& ctx,
int64_t position,
int64_t nbytes) {
- auto self = checked_pointer_cast<RandomAccessFile>(shared_from_this());
- return DeferNotOk(internal::SubmitIO(
- ctx, [self, position, nbytes] { return self->ReadAt(position, nbytes); }));
+ auto self = checked_pointer_cast<RandomAccessFile>(shared_from_this());
+ return DeferNotOk(internal::SubmitIO(
+ ctx, [self, position, nbytes] { return self->ReadAt(position, nbytes); }));
+}
+
+Future<std::shared_ptr<Buffer>> RandomAccessFile::ReadAsync(int64_t position,
+ int64_t nbytes) {
+ return ReadAsync(io_context(), position, nbytes);
}
-Future<std::shared_ptr<Buffer>> RandomAccessFile::ReadAsync(int64_t position,
- int64_t nbytes) {
- return ReadAsync(io_context(), position, nbytes);
-}
-
// Default WillNeed() implementation: no-op
Status RandomAccessFile::WillNeed(const std::vector<ReadRange>& ranges) {
return Status::OK();
}
-Status Writable::Write(util::string_view data) {
- return Write(data.data(), static_cast<int64_t>(data.size()));
+Status Writable::Write(util::string_view data) {
+ return Write(data.data(), static_cast<int64_t>(data.size()));
}
Status Writable::Write(const std::shared_ptr<Buffer>& data) {
@@ -380,15 +380,15 @@ struct ReadRangeCombiner {
auto end = std::remove_if(ranges.begin(), ranges.end(),
[](const ReadRange& range) { return range.length == 0; });
// Sort in position order
- std::sort(ranges.begin(), end,
+ std::sort(ranges.begin(), end,
[](const ReadRange& a, const ReadRange& b) { return a.offset < b.offset; });
- // Remove ranges that overlap 100%
- end = std::unique(ranges.begin(), end,
- [](const ReadRange& left, const ReadRange& right) {
- return right.offset >= left.offset &&
- right.offset + right.length <= left.offset + left.length;
- });
- ranges.resize(end - ranges.begin());
+ // Remove ranges that overlap 100%
+ end = std::unique(ranges.begin(), end,
+ [](const ReadRange& left, const ReadRange& right) {
+ return right.offset >= left.offset &&
+ right.offset + right.length <= left.offset + left.length;
+ });
+ ranges.resize(end - ranges.begin());
// Skip further processing if ranges is empty after removing zero-sized ranges.
if (ranges.empty()) {
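
The thread-pool helpers defined near the top of this file are the supported way to size the global IO pool. A short sketch, assuming their declarations live in arrow/io/interfaces.h:

    // Sketch: resizing the global IO thread pool.
    #include "arrow/io/interfaces.h"

    arrow::Status GrowIOPool() {
      int current = arrow::io::GetIOThreadPoolCapacity();
      // Forwards to the pool's SetCapacity(), per the definition above.
      return arrow::io::SetIOThreadPoolCapacity(current * 2);
    }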
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h
index 1459b173d89..e524afa99a3 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/interfaces.h
@@ -24,7 +24,7 @@
#include "arrow/io/type_fwd.h"
#include "arrow/type_fwd.h"
-#include "arrow/util/cancel.h"
+#include "arrow/util/cancel.h"
#include "arrow/util/macros.h"
#include "arrow/util/string_view.h"
#include "arrow/util/type_fwd.h"
@@ -49,57 +49,57 @@ struct ReadRange {
}
};
-/// EXPERIMENTAL: options provider for IO tasks
-///
-/// Includes an Executor (which will be used to execute asynchronous reads),
-/// a MemoryPool (which will be used to allocate buffers when zero copy reads
-/// are not possible), and an external id (in case the executor receives tasks from
-/// multiple sources and must distinguish tasks associated with this IOContext).
-struct ARROW_EXPORT IOContext {
- // No specified executor: will use a global IO thread pool
- IOContext() : IOContext(default_memory_pool(), StopToken::Unstoppable()) {}
-
- explicit IOContext(StopToken stop_token)
- : IOContext(default_memory_pool(), std::move(stop_token)) {}
-
- explicit IOContext(MemoryPool* pool, StopToken stop_token = StopToken::Unstoppable());
-
- explicit IOContext(MemoryPool* pool, ::arrow::internal::Executor* executor,
- StopToken stop_token = StopToken::Unstoppable(),
- int64_t external_id = -1)
- : pool_(pool),
- executor_(executor),
- external_id_(external_id),
- stop_token_(std::move(stop_token)) {}
-
- explicit IOContext(::arrow::internal::Executor* executor,
- StopToken stop_token = StopToken::Unstoppable(),
- int64_t external_id = -1)
- : pool_(default_memory_pool()),
- executor_(executor),
- external_id_(external_id),
- stop_token_(std::move(stop_token)) {}
-
- MemoryPool* pool() const { return pool_; }
-
- ::arrow::internal::Executor* executor() const { return executor_; }
-
+/// EXPERIMENTAL: options provider for IO tasks
+///
+/// Includes an Executor (which will be used to execute asynchronous reads),
+/// a MemoryPool (which will be used to allocate buffers when zero copy reads
+/// are not possible), and an external id (in case the executor receives tasks from
+/// multiple sources and must distinguish tasks associated with this IOContext).
+struct ARROW_EXPORT IOContext {
+ // No specified executor: will use a global IO thread pool
+ IOContext() : IOContext(default_memory_pool(), StopToken::Unstoppable()) {}
+
+ explicit IOContext(StopToken stop_token)
+ : IOContext(default_memory_pool(), std::move(stop_token)) {}
+
+ explicit IOContext(MemoryPool* pool, StopToken stop_token = StopToken::Unstoppable());
+
+ explicit IOContext(MemoryPool* pool, ::arrow::internal::Executor* executor,
+ StopToken stop_token = StopToken::Unstoppable(),
+ int64_t external_id = -1)
+ : pool_(pool),
+ executor_(executor),
+ external_id_(external_id),
+ stop_token_(std::move(stop_token)) {}
+
+ explicit IOContext(::arrow::internal::Executor* executor,
+ StopToken stop_token = StopToken::Unstoppable(),
+ int64_t external_id = -1)
+ : pool_(default_memory_pool()),
+ executor_(executor),
+ external_id_(external_id),
+ stop_token_(std::move(stop_token)) {}
+
+ MemoryPool* pool() const { return pool_; }
+
+ ::arrow::internal::Executor* executor() const { return executor_; }
+
// An application-specific ID, forwarded to executor task submissions
- int64_t external_id() const { return external_id_; }
-
- StopToken stop_token() const { return stop_token_; }
-
- private:
- MemoryPool* pool_;
- ::arrow::internal::Executor* executor_;
- int64_t external_id_;
- StopToken stop_token_;
+ int64_t external_id() const { return external_id_; }
+
+ StopToken stop_token() const { return stop_token_; }
+
+ private:
+ MemoryPool* pool_;
+ ::arrow::internal::Executor* executor_;
+ int64_t external_id_;
+ StopToken stop_token_;
+};
+
+struct ARROW_DEPRECATED("renamed to IOContext in 4.0.0") AsyncContext : public IOContext {
+ using IOContext::IOContext;
};
-struct ARROW_DEPRECATED("renamed to IOContext in 4.0.0") AsyncContext : public IOContext {
- using IOContext::IOContext;
-};
-
class ARROW_EXPORT FileInterface {
public:
virtual ~FileInterface() = 0;
@@ -168,7 +168,7 @@ class ARROW_EXPORT Writable {
/// \brief Flush buffered bytes, if any
virtual Status Flush();
- Status Write(util::string_view data);
+ Status Write(util::string_view data);
};
class ARROW_EXPORT Readable {
@@ -189,12 +189,12 @@ class ARROW_EXPORT Readable {
/// In some cases (e.g. a memory-mapped file), this method may avoid a
/// memory copy.
virtual Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) = 0;
-
- /// EXPERIMENTAL: The IOContext associated with this file.
- ///
- /// By default, this is the same as default_io_context(), but it may be
- /// overridden by subclasses.
- virtual const IOContext& io_context() const;
+
+ /// EXPERIMENTAL: The IOContext associated with this file.
+ ///
+ /// By default, this is the same as default_io_context(), but it may be
+ /// overridden by subclasses.
+ virtual const IOContext& io_context() const;
};
class ARROW_EXPORT OutputStream : virtual public FileInterface, public Writable {
@@ -202,9 +202,9 @@ class ARROW_EXPORT OutputStream : virtual public FileInterface, public Writable
OutputStream() = default;
};
-class ARROW_EXPORT InputStream : virtual public FileInterface,
- virtual public Readable,
- public std::enable_shared_from_this<InputStream> {
+class ARROW_EXPORT InputStream : virtual public FileInterface,
+ virtual public Readable,
+ public std::enable_shared_from_this<InputStream> {
public:
  /// \brief Advance or skip the stream by the indicated number of bytes
  /// \param[in] nbytes the number of bytes to move forward
@@ -227,23 +227,23 @@ class ARROW_EXPORT InputStream : virtual public FileInterface,
/// Zero copy reads imply the use of Buffer-returning Read() overloads.
virtual bool supports_zero_copy() const;
- /// \brief Read and return stream metadata
- ///
- /// If the stream implementation doesn't support metadata, empty metadata
- /// is returned. Note that it is allowed to return a null pointer rather
- /// than an allocated empty metadata.
- virtual Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
-
- /// \brief Read stream metadata asynchronously
- virtual Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
- const IOContext& io_context);
- Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync();
-
+ /// \brief Read and return stream metadata
+ ///
+ /// If the stream implementation doesn't support metadata, empty metadata
+ /// is returned. Note that it is allowed to return a null pointer rather
+ /// than an allocated empty metadata.
+ virtual Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
+
+ /// \brief Read stream metadata asynchronously
+ virtual Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+ const IOContext& io_context);
+ Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync();
+
protected:
InputStream() = default;
};
-class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable {
+class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable {
public:
/// Necessary because we hold a std::unique_ptr
~RandomAccessFile() override;
@@ -292,12 +292,12 @@ class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable {
virtual Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes);
/// EXPERIMENTAL: Read data asynchronously.
- virtual Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
+ virtual Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
int64_t nbytes);
- /// EXPERIMENTAL: Read data asynchronously, using the file's IOContext.
- Future<std::shared_ptr<Buffer>> ReadAsync(int64_t position, int64_t nbytes);
-
+ /// EXPERIMENTAL: Read data asynchronously, using the file's IOContext.
+ Future<std::shared_ptr<Buffer>> ReadAsync(int64_t position, int64_t nbytes);
+
/// EXPERIMENTAL: Inform that the given ranges may be read soon.
///
/// Some implementations might arrange to prefetch some of the data.
@@ -309,8 +309,8 @@ class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable {
RandomAccessFile();
private:
- struct ARROW_NO_EXPORT Impl;
- std::unique_ptr<Impl> interface_impl_;
+ struct ARROW_NO_EXPORT Impl;
+ std::unique_ptr<Impl> interface_impl_;
};
class ARROW_EXPORT WritableFile : public OutputStream, public Seekable {
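
The IOContext constructors declared above compose a memory pool, an executor, a stop token, and an external id. A hedged sketch wiring a custom thread pool and stop source into an async read:

    // Sketch: an IOContext with a custom executor and stop token.
    #include <memory>
    #include "arrow/io/interfaces.h"
    #include "arrow/memory_pool.h"
    #include "arrow/result.h"
    #include "arrow/util/cancel.h"
    #include "arrow/util/thread_pool.h"

    arrow::Status AsyncReadWithContext(
        std::shared_ptr<arrow::io::RandomAccessFile> file) {
      ARROW_ASSIGN_OR_RAISE(auto pool, arrow::internal::ThreadPool::Make(4));
      arrow::StopSource stop_source;
      arrow::io::IOContext ctx(arrow::default_memory_pool(), pool.get(),
                               stop_source.token(), /*external_id=*/42);
      auto fut = file->ReadAsync(ctx, /*position=*/0, /*nbytes=*/128);
      ARROW_ASSIGN_OR_RAISE(auto buffer, fut.result());  // blocks until done
      return arrow::Status::OK();
    }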
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc
index b52c456fd89..6495242e63b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.cc
@@ -261,10 +261,10 @@ void FixedSizeBufferWriter::set_memcopy_threshold(int64_t threshold) {
// ----------------------------------------------------------------------
// In-memory buffer reader
-BufferReader::BufferReader(std::shared_ptr<Buffer> buffer)
- : buffer_(std::move(buffer)),
- data_(buffer_ ? buffer_->data() : reinterpret_cast<const uint8_t*>("")),
- size_(buffer_ ? buffer_->size() : 0),
+BufferReader::BufferReader(std::shared_ptr<Buffer> buffer)
+ : buffer_(std::move(buffer)),
+ data_(buffer_ ? buffer_->data() : reinterpret_cast<const uint8_t*>("")),
+ size_(buffer_ ? buffer_->size() : 0),
position_(0),
is_open_(true) {}
@@ -320,7 +320,7 @@ Status BufferReader::WillNeed(const std::vector<ReadRange>& ranges) {
return st;
}
-Future<std::shared_ptr<Buffer>> BufferReader::ReadAsync(const IOContext&,
+Future<std::shared_ptr<Buffer>> BufferReader::ReadAsync(const IOContext&,
int64_t position,
int64_t nbytes) {
return Future<std::shared_ptr<Buffer>>::MakeFinished(DoReadAt(position, nbytes));
@@ -344,8 +344,8 @@ Result<std::shared_ptr<Buffer>> BufferReader::DoReadAt(int64_t position, int64_t
DCHECK_GE(nbytes, 0);
// Arrange for data to be paged in
- // RETURN_NOT_OK(::arrow::internal::MemoryAdviseWillNeed(
- // {{const_cast<uint8_t*>(data_ + position), static_cast<size_t>(nbytes)}}));
+ // RETURN_NOT_OK(::arrow::internal::MemoryAdviseWillNeed(
+ // {{const_cast<uint8_t*>(data_ + position), static_cast<size_t>(nbytes)}}));
if (nbytes > 0 && buffer_ != nullptr) {
return SliceBuffer(buffer_, position, nbytes);
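BufferReader::ReadAsync above wraps the synchronous DoReadAt() result in an already-finished future, so awaiting it never blocks on real I/O. A hedged sketch of what that means for a caller (assumed usage, not taken from this diff):

    #include "arrow/buffer.h"
    #include "arrow/io/memory.h"

    arrow::Result<std::shared_ptr<arrow::Buffer>> SliceViaAsync(
        const std::shared_ptr<arrow::Buffer>& buf) {
      arrow::io::BufferReader reader(buf);
      // The future is created with MakeFinished, so result() returns at once,
      // yielding a zero-copy slice of the underlying buffer.
      auto fut = reader.ReadAsync(/*position=*/0, /*nbytes=*/buf->size());
      return fut.result();
    }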
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h
index ff9e179d862..8213439ef74 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/memory.h
@@ -88,7 +88,7 @@ class ARROW_EXPORT BufferOutputStream : public OutputStream {
uint8_t* mutable_data_;
};
-/// \brief A helper class to track the size of allocations
+/// \brief A helper class to track the size of allocations
///
/// Writes to this stream do not copy or retain any data, they just bump
/// a size counter that can be later used to know exactly which data size
@@ -145,7 +145,7 @@ class ARROW_EXPORT FixedSizeBufferWriter : public WritableFile {
class ARROW_EXPORT BufferReader
: public internal::RandomAccessFileConcurrencyWrapper<BufferReader> {
public:
- explicit BufferReader(std::shared_ptr<Buffer> buffer);
+ explicit BufferReader(std::shared_ptr<Buffer> buffer);
explicit BufferReader(const Buffer& buffer);
BufferReader(const uint8_t* data, int64_t size);
@@ -160,7 +160,7 @@ class ARROW_EXPORT BufferReader
std::shared_ptr<Buffer> buffer() const { return buffer_; }
// Synchronous ReadAsync override
- Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
+ Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
int64_t nbytes) override;
Status WillNeed(const std::vector<ReadRange>& ranges) override;
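The std::shared_ptr constructor declared above keeps the buffer alive for the reader's lifetime, unlike the const Buffer& and raw-pointer overloads. A small usage sketch under that assumption:

    #include "arrow/buffer.h"
    #include "arrow/io/memory.h"
    #include "arrow/result.h"

    arrow::Status ReadBack() {
      std::shared_ptr<arrow::Buffer> buf = arrow::Buffer::FromString("hello world");
      arrow::io::BufferReader reader(buf);  // shared_ptr overload: owns a reference
      ARROW_ASSIGN_OR_RAISE(auto head, reader.Read(5));  // zero-copy slice "hello"
      return reader.Close();
    }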
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc
index 48ac06de186..7ef4843a224 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.cc
@@ -1,95 +1,95 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/io/stdio.h"
-
-#include <iostream>
-
-#include "arrow/buffer.h"
-#include "arrow/result.h"
-
-namespace arrow {
-namespace io {
-
-//
-// StdoutStream implementation
-//
-
-StdoutStream::StdoutStream() : pos_(0) { set_mode(FileMode::WRITE); }
-
-Status StdoutStream::Close() { return Status::OK(); }
-
-bool StdoutStream::closed() const { return false; }
-
-Result<int64_t> StdoutStream::Tell() const { return pos_; }
-
-Status StdoutStream::Write(const void* data, int64_t nbytes) {
- pos_ += nbytes;
- std::cout.write(reinterpret_cast<const char*>(data), nbytes);
- return Status::OK();
-}
-
-//
-// StderrStream implementation
-//
-
-StderrStream::StderrStream() : pos_(0) { set_mode(FileMode::WRITE); }
-
-Status StderrStream::Close() { return Status::OK(); }
-
-bool StderrStream::closed() const { return false; }
-
-Result<int64_t> StderrStream::Tell() const { return pos_; }
-
-Status StderrStream::Write(const void* data, int64_t nbytes) {
- pos_ += nbytes;
- std::cerr.write(reinterpret_cast<const char*>(data), nbytes);
- return Status::OK();
-}
-
-//
-// StdinStream implementation
-//
-
-StdinStream::StdinStream() : pos_(0) { set_mode(FileMode::READ); }
-
-Status StdinStream::Close() { return Status::OK(); }
-
-bool StdinStream::closed() const { return false; }
-
-Result<int64_t> StdinStream::Tell() const { return pos_; }
-
-Result<int64_t> StdinStream::Read(int64_t nbytes, void* out) {
- std::cin.read(reinterpret_cast<char*>(out), nbytes);
- if (std::cin) {
- pos_ += nbytes;
- return nbytes;
- } else {
- return 0;
- }
-}
-
-Result<std::shared_ptr<Buffer>> StdinStream::Read(int64_t nbytes) {
- ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes));
- ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buffer->mutable_data()));
- ARROW_RETURN_NOT_OK(buffer->Resize(bytes_read, false));
- buffer->ZeroPadding();
- return std::move(buffer);
-}
-
-} // namespace io
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/io/stdio.h"
+
+#include <iostream>
+
+#include "arrow/buffer.h"
+#include "arrow/result.h"
+
+namespace arrow {
+namespace io {
+
+//
+// StdoutStream implementation
+//
+
+StdoutStream::StdoutStream() : pos_(0) { set_mode(FileMode::WRITE); }
+
+Status StdoutStream::Close() { return Status::OK(); }
+
+bool StdoutStream::closed() const { return false; }
+
+Result<int64_t> StdoutStream::Tell() const { return pos_; }
+
+Status StdoutStream::Write(const void* data, int64_t nbytes) {
+ pos_ += nbytes;
+ std::cout.write(reinterpret_cast<const char*>(data), nbytes);
+ return Status::OK();
+}
+
+//
+// StderrStream implementation
+//
+
+StderrStream::StderrStream() : pos_(0) { set_mode(FileMode::WRITE); }
+
+Status StderrStream::Close() { return Status::OK(); }
+
+bool StderrStream::closed() const { return false; }
+
+Result<int64_t> StderrStream::Tell() const { return pos_; }
+
+Status StderrStream::Write(const void* data, int64_t nbytes) {
+ pos_ += nbytes;
+ std::cerr.write(reinterpret_cast<const char*>(data), nbytes);
+ return Status::OK();
+}
+
+//
+// StdinStream implementation
+//
+
+StdinStream::StdinStream() : pos_(0) { set_mode(FileMode::READ); }
+
+Status StdinStream::Close() { return Status::OK(); }
+
+bool StdinStream::closed() const { return false; }
+
+Result<int64_t> StdinStream::Tell() const { return pos_; }
+
+Result<int64_t> StdinStream::Read(int64_t nbytes, void* out) {
+ std::cin.read(reinterpret_cast<char*>(out), nbytes);
+ if (std::cin) {
+ pos_ += nbytes;
+ return nbytes;
+ } else {
+ return 0;
+ }
+}
+
+Result<std::shared_ptr<Buffer>> StdinStream::Read(int64_t nbytes) {
+ ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes));
+ ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, Read(nbytes, buffer->mutable_data()));
+ ARROW_RETURN_NOT_OK(buffer->Resize(bytes_read, false));
+ buffer->ZeroPadding();
+ return std::move(buffer);
+}
+
+} // namespace io
+} // namespace arrow
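StdinStream::Read above reports 0 bytes once std::cin can no longer satisfy a full read, which gives a natural loop exit. A sketch of piping stdin to stdout with these classes (assumed usage; CopyStdinToStdout is a hypothetical helper):

    #include "arrow/buffer.h"
    #include "arrow/io/stdio.h"
    #include "arrow/result.h"
    #include "arrow/status.h"

    arrow::Status CopyStdinToStdout() {
      arrow::io::StdinStream in;
      arrow::io::StdoutStream out;
      while (true) {
        ARROW_ASSIGN_OR_RAISE(auto chunk, in.Read(4096));
        if (chunk->size() == 0) break;  // 0 once std::cin stops being readable
        ARROW_RETURN_NOT_OK(out.Write(chunk->data(), chunk->size()));
      }
      return arrow::Status::OK();
    }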
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h
index 6df07d670af..9484ac77124 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/stdio.h
@@ -1,82 +1,82 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-
-#include "arrow/io/interfaces.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-namespace io {
-
-// Output stream that just writes to stdout.
-class ARROW_EXPORT StdoutStream : public OutputStream {
- public:
- StdoutStream();
- ~StdoutStream() override {}
-
- Status Close() override;
- bool closed() const override;
-
- Result<int64_t> Tell() const override;
-
- Status Write(const void* data, int64_t nbytes) override;
-
- private:
- int64_t pos_;
-};
-
-// Output stream that just writes to stderr.
-class ARROW_EXPORT StderrStream : public OutputStream {
- public:
- StderrStream();
- ~StderrStream() override {}
-
- Status Close() override;
- bool closed() const override;
-
- Result<int64_t> Tell() const override;
-
- Status Write(const void* data, int64_t nbytes) override;
-
- private:
- int64_t pos_;
-};
-
-// Input stream that just reads from stdin.
-class ARROW_EXPORT StdinStream : public InputStream {
- public:
- StdinStream();
- ~StdinStream() override {}
-
- Status Close() override;
- bool closed() const override;
-
- Result<int64_t> Tell() const override;
-
- Result<int64_t> Read(int64_t nbytes, void* out) override;
-
- Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
-
- private:
- int64_t pos_;
-};
-
-} // namespace io
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+
+// Output stream that just writes to stdout.
+class ARROW_EXPORT StdoutStream : public OutputStream {
+ public:
+ StdoutStream();
+ ~StdoutStream() override {}
+
+ Status Close() override;
+ bool closed() const override;
+
+ Result<int64_t> Tell() const override;
+
+ Status Write(const void* data, int64_t nbytes) override;
+
+ private:
+ int64_t pos_;
+};
+
+// Output stream that just writes to stderr.
+class ARROW_EXPORT StderrStream : public OutputStream {
+ public:
+ StderrStream();
+ ~StderrStream() override {}
+
+ Status Close() override;
+ bool closed() const override;
+
+ Result<int64_t> Tell() const override;
+
+ Status Write(const void* data, int64_t nbytes) override;
+
+ private:
+ int64_t pos_;
+};
+
+// Input stream that just reads from stdin.
+class ARROW_EXPORT StdinStream : public InputStream {
+ public:
+ StdinStream();
+ ~StdinStream() override {}
+
+ Status Close() override;
+ bool closed() const override;
+
+ Result<int64_t> Tell() const override;
+
+ Result<int64_t> Read(int64_t nbytes, void* out) override;
+
+ Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+
+ private:
+ int64_t pos_;
+};
+
+} // namespace io
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc b/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc
index 50198ad20ef..3fdf5a7a9ba 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.cc
@@ -145,18 +145,18 @@ Result<int64_t> TransformInputStream::Tell() const {
return impl_->pos_;
}
-Result<std::shared_ptr<const KeyValueMetadata>> TransformInputStream::ReadMetadata() {
- RETURN_NOT_OK(impl_->CheckClosed());
-
- return impl_->wrapped_->ReadMetadata();
-}
-
-Future<std::shared_ptr<const KeyValueMetadata>> TransformInputStream::ReadMetadataAsync(
- const IOContext& io_context) {
- RETURN_NOT_OK(impl_->CheckClosed());
-
- return impl_->wrapped_->ReadMetadataAsync(io_context);
-}
-
+Result<std::shared_ptr<const KeyValueMetadata>> TransformInputStream::ReadMetadata() {
+ RETURN_NOT_OK(impl_->CheckClosed());
+
+ return impl_->wrapped_->ReadMetadata();
+}
+
+Future<std::shared_ptr<const KeyValueMetadata>> TransformInputStream::ReadMetadataAsync(
+ const IOContext& io_context) {
+ RETURN_NOT_OK(impl_->CheckClosed());
+
+ return impl_->wrapped_->ReadMetadataAsync(io_context);
+}
+
} // namespace io
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h
index 6ecaa6d6101..c117f275929 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/transform.h
@@ -45,10 +45,10 @@ class ARROW_EXPORT TransformInputStream : public InputStream {
Result<int64_t> Read(int64_t nbytes, void* out) override;
Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
- Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
- Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
- const IOContext& io_context) override;
-
+ Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
+ Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+ const IOContext& io_context) override;
+
Result<int64_t> Tell() const override;
protected:
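As the ReadMetadata()/ReadMetadataAsync() overrides above show, TransformInputStream transforms the data path but forwards metadata calls to the wrapped stream. A hedged sketch with an identity transform, assuming the class's documented constructor shape (wrapped stream plus transform function):

    #include <memory>
    #include "arrow/buffer.h"
    #include "arrow/io/transform.h"
    #include "arrow/result.h"

    std::shared_ptr<arrow::io::InputStream> WrapIdentity(
        std::shared_ptr<arrow::io::InputStream> wrapped) {
      auto identity = [](const std::shared_ptr<arrow::Buffer>& buf)
          -> arrow::Result<std::shared_ptr<arrow::Buffer>> { return buf; };
      // Metadata calls on the wrapper delegate to `wrapped`; only the data
      // path goes through the transform.
      return std::make_shared<arrow::io::TransformInputStream>(std::move(wrapped),
                                                               identity);
    }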
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h
index 632616de1fe..a2fd33bf360 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/type_fwd.h
@@ -17,9 +17,9 @@
#pragma once
-#include "arrow/type_fwd.h"
-#include "arrow/util/visibility.h"
-
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
namespace arrow {
namespace io {
@@ -27,30 +27,30 @@ struct FileMode {
enum type { READ, WRITE, READWRITE };
};
-struct IOContext;
-struct CacheOptions;
-
-/// EXPERIMENTAL: convenience global singleton for default IOContext settings
-ARROW_EXPORT
-const IOContext& default_io_context();
-
-/// \brief Get the capacity of the global I/O thread pool
-///
-/// Return the number of worker threads in the thread pool to which
-/// Arrow dispatches various I/O-bound tasks. This is an ideal number,
-/// not necessarily the exact number of threads at a given point in time.
-///
-/// You can change this number using SetIOThreadPoolCapacity().
-ARROW_EXPORT int GetIOThreadPoolCapacity();
-
-/// \brief Set the capacity of the global I/O thread pool
-///
-/// Set the number of worker threads in the thread pool to which
-/// Arrow dispatches various I/O-bound tasks.
-///
-/// The current number is returned by GetIOThreadPoolCapacity().
-ARROW_EXPORT Status SetIOThreadPoolCapacity(int threads);
-
+struct IOContext;
+struct CacheOptions;
+
+/// EXPERIMENTAL: convenience global singleton for default IOContext settings
+ARROW_EXPORT
+const IOContext& default_io_context();
+
+/// \brief Get the capacity of the global I/O thread pool
+///
+/// Return the number of worker threads in the thread pool to which
+/// Arrow dispatches various I/O-bound tasks. This is an ideal number,
+/// not necessarily the exact number of threads at a given point in time.
+///
+/// You can change this number using SetIOThreadPoolCapacity().
+ARROW_EXPORT int GetIOThreadPoolCapacity();
+
+/// \brief Set the capacity of the global I/O thread pool
+///
+/// Set the number of worker threads in the thread pool to which
+/// Arrow dispatches various I/O-bound tasks.
+///
+/// The current number is returned by GetIOThreadPoolCapacity().
+ARROW_EXPORT Status SetIOThreadPoolCapacity(int threads);
+
class FileInterface;
class Seekable;
class Writable;
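GetIOThreadPoolCapacity() and SetIOThreadPoolCapacity() declared above are the knobs for the global I/O pool; a short usage sketch (UseWiderIOPool is a hypothetical helper):

    #include "arrow/io/type_fwd.h"
    #include "arrow/status.h"

    arrow::Status UseWiderIOPool() {
      // The getter reports the ideal capacity, not the instantaneous
      // number of live threads.
      ARROW_RETURN_NOT_OK(arrow::io::SetIOThreadPoolCapacity(16));
      int capacity = arrow::io::GetIOThreadPoolCapacity();
      return capacity == 16 ? arrow::Status::OK()
                            : arrow::Status::UnknownError("capacity not applied");
    }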
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h
index dc9d6781ada..b1d75d1d0bd 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/io/util_internal.h
@@ -18,11 +18,11 @@
#pragma once
#include <memory>
-#include <utility>
+#include <utility>
#include <vector>
#include "arrow/io/interfaces.h"
-#include "arrow/util/thread_pool.h"
+#include "arrow/util/thread_pool.h"
#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
@@ -52,15 +52,15 @@ std::vector<ReadRange> CoalesceReadRanges(std::vector<ReadRange> ranges,
ARROW_EXPORT
::arrow::internal::ThreadPool* GetIOThreadPool();
-template <typename... SubmitArgs>
-auto SubmitIO(IOContext io_context, SubmitArgs&&... submit_args)
- -> decltype(std::declval<::arrow::internal::Executor*>()->Submit(submit_args...)) {
- ::arrow::internal::TaskHints hints;
- hints.external_id = io_context.external_id();
- return io_context.executor()->Submit(hints, io_context.stop_token(),
- std::forward<SubmitArgs>(submit_args)...);
-}
-
+template <typename... SubmitArgs>
+auto SubmitIO(IOContext io_context, SubmitArgs&&... submit_args)
+ -> decltype(std::declval<::arrow::internal::Executor*>()->Submit(submit_args...)) {
+ ::arrow::internal::TaskHints hints;
+ hints.external_id = io_context.external_id();
+ return io_context.executor()->Submit(hints, io_context.stop_token(),
+ std::forward<SubmitArgs>(submit_args)...);
+}
+
} // namespace internal
} // namespace io
} // namespace arrow
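SubmitIO above attaches the context's external id and stop token before handing the task to the context's executor. A hedged sketch of calling it (this is an internal API; Submit() returning Result<Future<T>> is assumed from the executor interface):

    #include <memory>
    #include "arrow/io/interfaces.h"
    #include "arrow/io/util_internal.h"
    #include "arrow/result.h"

    arrow::Future<int64_t> FileSizeAsync(
        std::shared_ptr<arrow::io::RandomAccessFile> file,
        const arrow::io::IOContext& ctx) {
      auto maybe_fut = arrow::io::internal::SubmitIO(
          ctx, [file]() -> arrow::Result<int64_t> { return file->GetSize(); });
      if (!maybe_fut.ok()) {
        // Surface a submission failure as a finished, failed future.
        return arrow::Future<int64_t>::MakeFinished(maybe_fut.status());
      }
      return *std::move(maybe_fut);
    }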
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc
index 13b1424ee5e..3ab2c8b3847 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.cc
@@ -20,14 +20,14 @@
#include <algorithm>
#include <cstdint>
#include <memory>
-#include <set>
+#include <set>
#include <unordered_map>
#include <utility>
#include <vector>
#include "arrow/array.h"
#include "arrow/array/concatenate.h"
-#include "arrow/array/validate.h"
+#include "arrow/array/validate.h"
#include "arrow/extension_type.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
@@ -88,16 +88,16 @@ struct DictionaryFieldMapper::Impl {
int num_fields() const { return static_cast<int>(field_path_to_id.size()); }
- int num_dicts() const {
- std::set<int64_t> uniqueIds;
-
- for (auto& kv : field_path_to_id) {
- uniqueIds.insert(kv.second);
- }
-
- return static_cast<int>(uniqueIds.size());
- }
-
+ int num_dicts() const {
+ std::set<int64_t> uniqueIds;
+
+ for (auto& kv : field_path_to_id) {
+ uniqueIds.insert(kv.second);
+ }
+
+ return static_cast<int>(uniqueIds.size());
+ }
+
private:
void ImportFields(const FieldPosition& pos,
const std::vector<std::shared_ptr<Field>>& fields) {
@@ -151,32 +151,32 @@ Result<int64_t> DictionaryFieldMapper::GetFieldId(std::vector<int> field_path) c
int DictionaryFieldMapper::num_fields() const { return impl_->num_fields(); }
-int DictionaryFieldMapper::num_dicts() const { return impl_->num_dicts(); }
-
+int DictionaryFieldMapper::num_dicts() const { return impl_->num_dicts(); }
+
// ----------------------------------------------------------------------
// DictionaryMemo implementation
-namespace {
-
-bool HasUnresolvedNestedDict(const ArrayData& data) {
- if (data.type->id() == Type::DICTIONARY) {
- if (data.dictionary == nullptr) {
- return true;
- }
- if (HasUnresolvedNestedDict(*data.dictionary)) {
- return true;
- }
- }
- for (const auto& child : data.child_data) {
- if (HasUnresolvedNestedDict(*child)) {
- return true;
- }
- }
- return false;
-}
-
-} // namespace
-
+namespace {
+
+bool HasUnresolvedNestedDict(const ArrayData& data) {
+ if (data.type->id() == Type::DICTIONARY) {
+ if (data.dictionary == nullptr) {
+ return true;
+ }
+ if (HasUnresolvedNestedDict(*data.dictionary)) {
+ return true;
+ }
+ }
+ for (const auto& child : data.child_data) {
+ if (HasUnresolvedNestedDict(*child)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+} // namespace
+
struct DictionaryMemo::Impl {
// Map of dictionary id to dictionary array(s) (several in case of deltas)
std::unordered_map<int64_t, ArrayDataVector> id_to_dictionary_;
@@ -205,12 +205,12 @@ struct DictionaryMemo::Impl {
// corrupted data. Full validation is necessary for certain types
// (for example nested dictionaries).
for (const auto& data : *data_vector) {
- if (HasUnresolvedNestedDict(*data)) {
- return Status::NotImplemented(
- "Encountered delta dictionary with an unresolved nested dictionary");
- }
- RETURN_NOT_OK(::arrow::internal::ValidateArray(*data));
- RETURN_NOT_OK(::arrow::internal::ValidateArrayFull(*data));
+ if (HasUnresolvedNestedDict(*data)) {
+ return Status::NotImplemented(
+ "Encountered delta dictionary with an unresolved nested dictionary");
+ }
+ RETURN_NOT_OK(::arrow::internal::ValidateArray(*data));
+ RETURN_NOT_OK(::arrow::internal::ValidateArrayFull(*data));
to_combine.push_back(MakeArray(data));
}
ARROW_ASSIGN_OR_RAISE(auto combined_dict, Concatenate(to_combine, pool));
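The num_dicts() logic above counts distinct dictionary ids because several field paths may share one dictionary. The same counting trick restated standalone (a std::map keyed by the field path stands in for the mapper's internal container):

    #include <cstdint>
    #include <map>
    #include <set>
    #include <vector>

    int CountUniqueDictionaries(
        const std::map<std::vector<int>, int64_t>& field_path_to_id) {
      std::set<int64_t> unique_ids;
      for (const auto& kv : field_path_to_id) {
        unique_ids.insert(kv.second);  // an id may be shared by many paths
      }
      return static_cast<int>(unique_ids.size());
    }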
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h
index 25fa70f0dfb..e4287cb1974 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/dictionary.h
@@ -80,10 +80,10 @@ class ARROW_EXPORT DictionaryFieldMapper {
int num_fields() const;
- /// \brief Returns number of unique dictionaries, taking into
- /// account that different fields can share the same dictionary.
- int num_dicts() const;
-
+ /// \brief Returns number of unique dictionaries, taking into
+ /// account that different fields can share the same dictionary.
+ int num_dicts() const;
+
private:
struct Impl;
std::unique_ptr<Impl> impl_;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc
index 3354ee930ed..b1c30eec0b3 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.cc
@@ -61,15 +61,15 @@ class ExtensionType;
namespace ipc {
namespace feather {
-namespace {
+namespace {
-using FBB = flatbuffers::FlatBufferBuilder;
+using FBB = flatbuffers::FlatBufferBuilder;
-constexpr const char* kFeatherV1MagicBytes = "FEA1";
-constexpr const int kFeatherDefaultAlignment = 8;
-const uint8_t kPaddingBytes[kFeatherDefaultAlignment] = {0};
-
-inline int64_t PaddedLength(int64_t nbytes) {
+constexpr const char* kFeatherV1MagicBytes = "FEA1";
+constexpr const int kFeatherDefaultAlignment = 8;
+const uint8_t kPaddingBytes[kFeatherDefaultAlignment] = {0};
+
+inline int64_t PaddedLength(int64_t nbytes) {
static const int64_t alignment = kFeatherDefaultAlignment;
return ((nbytes + alignment - 1) / alignment) * alignment;
}
@@ -120,14 +120,14 @@ struct ColumnType {
enum type { PRIMITIVE, CATEGORY, TIMESTAMP, DATE, TIME };
};
-inline TimeUnit::type FromFlatbufferEnum(fbs::TimeUnit unit) {
+inline TimeUnit::type FromFlatbufferEnum(fbs::TimeUnit unit) {
return static_cast<TimeUnit::type>(static_cast<int>(unit));
}
/// For compatibility, we sometimes need to write blank data just to keep producing
/// files that can be read by an older reader.
-Status WritePaddedBlank(io::OutputStream* stream, int64_t length,
- int64_t* bytes_written) {
+Status WritePaddedBlank(io::OutputStream* stream, int64_t length,
+ int64_t* bytes_written) {
const uint8_t null = 0;
for (int64_t i = 0; i < length; i++) {
RETURN_NOT_OK(stream->Write(&null, 1));
@@ -180,7 +180,7 @@ class ReaderV1 : public Reader {
GetDataType(col->values(), col->metadata_type(), col->metadata(), &type));
fields.push_back(::arrow::field(col->name()->str(), type));
}
- schema_ = ::arrow::schema(std::move(fields));
+ schema_ = ::arrow::schema(std::move(fields));
return Status::OK();
}
@@ -343,7 +343,7 @@ class ReaderV1 : public Reader {
columns.emplace_back();
RETURN_NOT_OK(GetColumn(i, &columns.back()));
}
- *out = Table::Make(this->schema(), std::move(columns), this->num_rows());
+ *out = Table::Make(this->schema(), std::move(columns), this->num_rows());
return Status::OK();
}
@@ -360,8 +360,8 @@ class ReaderV1 : public Reader {
RETURN_NOT_OK(GetColumn(field_index, &columns.back()));
fields.push_back(my_schema->field(field_index));
}
- *out = Table::Make(::arrow::schema(std::move(fields)), std::move(columns),
- this->num_rows());
+ *out = Table::Make(::arrow::schema(std::move(fields)), std::move(columns),
+ this->num_rows());
return Status::OK();
}
@@ -380,8 +380,8 @@ class ReaderV1 : public Reader {
RETURN_NOT_OK(GetColumn(field_index, &columns.back()));
fields.push_back(sch->field(field_index));
}
- *out = Table::Make(::arrow::schema(std::move(fields)), std::move(columns),
- this->num_rows());
+ *out = Table::Make(::arrow::schema(std::move(fields)), std::move(columns),
+ this->num_rows());
return Status::OK();
}
@@ -440,14 +440,14 @@ Result<fbs::Type> ToFlatbufferType(const DataType& type) {
}
}
-inline flatbuffers::Offset<fbs::PrimitiveArray> GetPrimitiveArray(
+inline flatbuffers::Offset<fbs::PrimitiveArray> GetPrimitiveArray(
FBB& fbb, const ArrayMetadata& array) {
return fbs::CreatePrimitiveArray(fbb, array.type, fbs::Encoding::PLAIN, array.offset,
array.length, array.null_count, array.total_bytes);
}
// Convert Feather enums to Flatbuffer enums
-inline fbs::TimeUnit ToFlatbufferEnum(TimeUnit::type unit) {
+inline fbs::TimeUnit ToFlatbufferEnum(TimeUnit::type unit) {
return static_cast<fbs::TimeUnit>(static_cast<int>(unit));
}
@@ -459,7 +459,7 @@ const fbs::TypeMetadata COLUMN_TYPE_ENUM_MAPPING[] = {
fbs::TypeMetadata::TimeMetadata // TIME
};
-inline fbs::TypeMetadata ToFlatbufferEnum(ColumnType::type column_type) {
+inline fbs::TypeMetadata ToFlatbufferEnum(ColumnType::type column_type) {
return COLUMN_TYPE_ENUM_MAPPING[column_type];
}
@@ -755,8 +755,8 @@ class ReaderV2 : public Reader {
std::shared_ptr<Schema> schema_;
};
-} // namespace
-
+} // namespace
+
Result<std::shared_ptr<Reader>> Reader::Open(
const std::shared_ptr<io::RandomAccessFile>& source) {
// Pathological issue where the file is smaller than header and footer
@@ -801,8 +801,8 @@ Status WriteTable(const Table& table, io::OutputStream* dst,
return WriteFeatherV1(table, dst);
} else {
IpcWriteOptions ipc_options = IpcWriteOptions::Defaults();
- ipc_options.unify_dictionaries = true;
- ipc_options.allow_64bit = true;
+ ipc_options.unify_dictionaries = true;
+ ipc_options.allow_64bit = true;
ARROW_ASSIGN_OR_RAISE(
ipc_options.codec,
util::Codec::Create(properties.compression, properties.compression_level));
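PaddedLength() above rounds a byte count up to the 8-byte Feather alignment with pure integer arithmetic; the same computation, extracted for clarity:

    #include <cstdint>

    inline int64_t PaddedTo8(int64_t nbytes) {
      constexpr int64_t alignment = 8;  // kFeatherDefaultAlignment
      return ((nbytes + alignment - 1) / alignment) * alignment;
    }

    // PaddedTo8(0) == 0, PaddedTo8(1) == 8, PaddedTo8(8) == 8, PaddedTo8(9) == 16.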
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h
index 3c43cf7cff7..a32ff6d0a5a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/feather.h
@@ -25,7 +25,7 @@
#include <string>
#include <vector>
-#include "arrow/type_fwd.h"
+#include "arrow/type_fwd.h"
#include "arrow/util/compression.h"
#include "arrow/util/visibility.h"
@@ -128,7 +128,7 @@ struct ARROW_EXPORT WriteProperties {
Compression::type compression = Compression::UNCOMPRESSED;
/// Compressor-specific compression level
- int compression_level = ::arrow::util::kUseDefaultCompressionLevel;
+ int compression_level = ::arrow::util::kUseDefaultCompressionLevel;
};
ARROW_EXPORT
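WriteTable in feather.cc enables dictionary unification and 64-bit lengths for the V2 path before applying the compression settings shown here. Configuring those settings could look like the following sketch (WriteProperties::Defaults() is assumed to exist alongside the fields in this header):

    #include "arrow/ipc/feather.h"
    #include "arrow/util/compression.h"

    arrow::ipc::feather::WriteProperties MakeZstdProperties() {
      auto props = arrow::ipc::feather::WriteProperties::Defaults();
      props.compression = arrow::Compression::ZSTD;
      props.compression_level = 3;  // codec-specific; sentinel default otherwise
      return props;
    }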
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h
index 805a0c44354..4dd3a664aa6 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/json_simple.h
@@ -1,61 +1,61 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Implement a simple JSON representation format for arrays
-
-#pragma once
-
-#include <memory>
-#include <string>
-
-#include "arrow/status.h"
-#include "arrow/util/string_view.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-class Array;
-class DataType;
-
-namespace ipc {
-namespace internal {
-namespace json {
-
-ARROW_EXPORT
-Status ArrayFromJSON(const std::shared_ptr<DataType>&, const std::string& json,
- std::shared_ptr<Array>* out);
-
-ARROW_EXPORT
-Status ArrayFromJSON(const std::shared_ptr<DataType>&, util::string_view json,
- std::shared_ptr<Array>* out);
-
-ARROW_EXPORT
-Status ArrayFromJSON(const std::shared_ptr<DataType>&, const char* json,
- std::shared_ptr<Array>* out);
-
-ARROW_EXPORT
-Status DictArrayFromJSON(const std::shared_ptr<DataType>&, util::string_view indices_json,
- util::string_view dictionary_json, std::shared_ptr<Array>* out);
-
-ARROW_EXPORT
-Status ScalarFromJSON(const std::shared_ptr<DataType>&, util::string_view json,
- std::shared_ptr<Scalar>* out);
-
-} // namespace json
-} // namespace internal
-} // namespace ipc
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Implement a simple JSON representation format for arrays
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/status.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class DataType;
+
+namespace ipc {
+namespace internal {
+namespace json {
+
+ARROW_EXPORT
+Status ArrayFromJSON(const std::shared_ptr<DataType>&, const std::string& json,
+ std::shared_ptr<Array>* out);
+
+ARROW_EXPORT
+Status ArrayFromJSON(const std::shared_ptr<DataType>&, util::string_view json,
+ std::shared_ptr<Array>* out);
+
+ARROW_EXPORT
+Status ArrayFromJSON(const std::shared_ptr<DataType>&, const char* json,
+ std::shared_ptr<Array>* out);
+
+ARROW_EXPORT
+Status DictArrayFromJSON(const std::shared_ptr<DataType>&, util::string_view indices_json,
+ util::string_view dictionary_json, std::shared_ptr<Array>* out);
+
+ARROW_EXPORT
+Status ScalarFromJSON(const std::shared_ptr<DataType>&, util::string_view json,
+ std::shared_ptr<Scalar>* out);
+
+} // namespace json
+} // namespace internal
+} // namespace ipc
+} // namespace arrow
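The json_simple helpers above parse a JSON literal straight into an Arrow array, which is mostly useful in tests. For instance, with the const char* overload declared above:

    #include <memory>
    #include "arrow/array.h"
    #include "arrow/ipc/json_simple.h"
    #include "arrow/type.h"

    arrow::Status BuildInt32Array(std::shared_ptr<arrow::Array>* out) {
      // Nulls in the JSON literal become null slots in the resulting array.
      return arrow::ipc::internal::json::ArrayFromJSON(arrow::int32(), "[1, 2, null]",
                                                       out);
    }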
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc
index e047e29c201..197556efcea 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.cc
@@ -32,8 +32,8 @@
#include "arrow/ipc/options.h"
#include "arrow/ipc/util.h"
#include "arrow/status.h"
-#include "arrow/util/endian.h"
-#include "arrow/util/future.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/future.h"
#include "arrow/util/logging.h"
#include "arrow/util/ubsan.h"
@@ -269,10 +269,10 @@ std::string FormatMessageType(MessageType type) {
return "record batch";
case MessageType::DICTIONARY_BATCH:
return "dictionary";
- case MessageType::TENSOR:
- return "tensor";
- case MessageType::SPARSE_TENSOR:
- return "sparse tensor";
+ case MessageType::TENSOR:
+ return "tensor";
+ case MessageType::SPARSE_TENSOR:
+ return "sparse tensor";
default:
break;
}
@@ -325,60 +325,60 @@ Result<std::unique_ptr<Message>> ReadMessage(int64_t offset, int32_t metadata_le
}
}
-Future<std::shared_ptr<Message>> ReadMessageAsync(int64_t offset, int32_t metadata_length,
- int64_t body_length,
- io::RandomAccessFile* file,
- const io::IOContext& context) {
- struct State {
- std::unique_ptr<Message> result;
- std::shared_ptr<MessageDecoderListener> listener;
- std::shared_ptr<MessageDecoder> decoder;
- };
- auto state = std::make_shared<State>();
- state->listener = std::make_shared<AssignMessageDecoderListener>(&state->result);
- state->decoder = std::make_shared<MessageDecoder>(state->listener);
-
- if (metadata_length < state->decoder->next_required_size()) {
- return Status::Invalid("metadata_length should be at least ",
- state->decoder->next_required_size());
- }
- return file->ReadAsync(context, offset, metadata_length + body_length)
- .Then([=](std::shared_ptr<Buffer> metadata) -> Result<std::shared_ptr<Message>> {
- if (metadata->size() < metadata_length) {
- return Status::Invalid("Expected to read ", metadata_length,
- " metadata bytes but got ", metadata->size());
- }
- ARROW_RETURN_NOT_OK(
- state->decoder->Consume(SliceBuffer(metadata, 0, metadata_length)));
- switch (state->decoder->state()) {
- case MessageDecoder::State::INITIAL:
- return std::move(state->result);
- case MessageDecoder::State::METADATA_LENGTH:
- return Status::Invalid("metadata length is missing. File offset: ", offset,
- ", metadata length: ", metadata_length);
- case MessageDecoder::State::METADATA:
- return Status::Invalid("flatbuffer size ",
- state->decoder->next_required_size(),
- " invalid. File offset: ", offset,
- ", metadata length: ", metadata_length);
- case MessageDecoder::State::BODY: {
- auto body = SliceBuffer(metadata, metadata_length, body_length);
- if (body->size() < state->decoder->next_required_size()) {
- return Status::IOError("Expected to be able to read ",
- state->decoder->next_required_size(),
- " bytes for message body, got ", body->size());
- }
- RETURN_NOT_OK(state->decoder->Consume(body));
- return std::move(state->result);
- }
- case MessageDecoder::State::EOS:
- return Status::Invalid("Unexpected empty message in IPC file format");
- default:
- return Status::Invalid("Unexpected state: ", state->decoder->state());
- }
- });
-}
-
+Future<std::shared_ptr<Message>> ReadMessageAsync(int64_t offset, int32_t metadata_length,
+ int64_t body_length,
+ io::RandomAccessFile* file,
+ const io::IOContext& context) {
+ struct State {
+ std::unique_ptr<Message> result;
+ std::shared_ptr<MessageDecoderListener> listener;
+ std::shared_ptr<MessageDecoder> decoder;
+ };
+ auto state = std::make_shared<State>();
+ state->listener = std::make_shared<AssignMessageDecoderListener>(&state->result);
+ state->decoder = std::make_shared<MessageDecoder>(state->listener);
+
+ if (metadata_length < state->decoder->next_required_size()) {
+ return Status::Invalid("metadata_length should be at least ",
+ state->decoder->next_required_size());
+ }
+ return file->ReadAsync(context, offset, metadata_length + body_length)
+ .Then([=](std::shared_ptr<Buffer> metadata) -> Result<std::shared_ptr<Message>> {
+ if (metadata->size() < metadata_length) {
+ return Status::Invalid("Expected to read ", metadata_length,
+ " metadata bytes but got ", metadata->size());
+ }
+ ARROW_RETURN_NOT_OK(
+ state->decoder->Consume(SliceBuffer(metadata, 0, metadata_length)));
+ switch (state->decoder->state()) {
+ case MessageDecoder::State::INITIAL:
+ return std::move(state->result);
+ case MessageDecoder::State::METADATA_LENGTH:
+ return Status::Invalid("metadata length is missing. File offset: ", offset,
+ ", metadata length: ", metadata_length);
+ case MessageDecoder::State::METADATA:
+ return Status::Invalid("flatbuffer size ",
+ state->decoder->next_required_size(),
+ " invalid. File offset: ", offset,
+ ", metadata length: ", metadata_length);
+ case MessageDecoder::State::BODY: {
+ auto body = SliceBuffer(metadata, metadata_length, body_length);
+ if (body->size() < state->decoder->next_required_size()) {
+ return Status::IOError("Expected to be able to read ",
+ state->decoder->next_required_size(),
+ " bytes for message body, got ", body->size());
+ }
+ RETURN_NOT_OK(state->decoder->Consume(body));
+ return std::move(state->result);
+ }
+ case MessageDecoder::State::EOS:
+ return Status::Invalid("Unexpected empty message in IPC file format");
+ default:
+ return Status::Invalid("Unexpected state: ", state->decoder->state());
+ }
+ });
+}
+
Status AlignStream(io::InputStream* stream, int32_t alignment) {
ARROW_ASSIGN_OR_RAISE(int64_t position, stream->Tell());
return stream->Advance(PaddedLength(position, alignment) - position);
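ReadMessageAsync above issues a single ReadAsync covering metadata plus body and decodes both in the continuation. A blocking caller could look like this sketch (the offset and length arguments would normally come from the file's footer):

    #include <memory>
    #include "arrow/io/interfaces.h"
    #include "arrow/ipc/message.h"
    #include "arrow/result.h"

    arrow::Result<std::shared_ptr<arrow::ipc::Message>> ReadOneMessage(
        arrow::io::RandomAccessFile* file, int64_t offset, int32_t metadata_length,
        int64_t body_length) {
      auto fut = arrow::ipc::ReadMessageAsync(offset, metadata_length, body_length,
                                              file);  // default IOContext
      return fut.result();  // blocks until the read and decode complete
    }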
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h
index d437bdfe773..b2683259cb4 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/message.h
@@ -365,7 +365,7 @@ class ARROW_EXPORT MessageDecoder {
/// memcpy(buffer->mutable_data() + current_buffer_size,
/// small_chunk,
/// small_chunk_size);
- /// if (buffer->size() < decoder.next_required_size()) {
+ /// if (buffer->size() < decoder.next_required_size()) {
/// continue;
/// }
/// std::shared_ptr<arrow::Buffer> chunk(buffer.release());
@@ -459,11 +459,11 @@ Result<std::unique_ptr<Message>> ReadMessage(const int64_t offset,
const int32_t metadata_length,
io::RandomAccessFile* file);
-ARROW_EXPORT
-Future<std::shared_ptr<Message>> ReadMessageAsync(
- const int64_t offset, const int32_t metadata_length, const int64_t body_length,
- io::RandomAccessFile* file, const io::IOContext& context = io::default_io_context());
-
+ARROW_EXPORT
+Future<std::shared_ptr<Message>> ReadMessageAsync(
+ const int64_t offset, const int32_t metadata_length, const int64_t body_length,
+ io::RandomAccessFile* file, const io::IOContext& context = io::default_io_context());
+
/// \brief Advance stream to an 8-byte offset if its position is not a multiple
/// of 8 already
/// \param[in] stream an input stream
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc
index 9d0db6a0d8b..4b332bd9e1e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.cc
@@ -271,12 +271,12 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data,
return Status::OK();
case flatbuf::Type::Decimal: {
auto dec_type = static_cast<const flatbuf::Decimal*>(type_data);
- if (dec_type->bitWidth() == 128) {
- return Decimal128Type::Make(dec_type->precision(), dec_type->scale()).Value(out);
- } else if (dec_type->bitWidth() == 256) {
- return Decimal256Type::Make(dec_type->precision(), dec_type->scale()).Value(out);
- } else {
- return Status::Invalid("Library only supports 128-bit or 256-bit decimal values");
+ if (dec_type->bitWidth() == 128) {
+ return Decimal128Type::Make(dec_type->precision(), dec_type->scale()).Value(out);
+ } else if (dec_type->bitWidth() == 256) {
+ return Decimal256Type::Make(dec_type->precision(), dec_type->scale()).Value(out);
+ } else {
+ return Status::Invalid("Library only supports 128-bit or 256-bit decimal values");
}
}
case flatbuf::Type::Date: {
@@ -428,7 +428,7 @@ static Status GetDictionaryEncoding(FBB& fbb, const std::shared_ptr<Field>& fiel
const DictionaryType& type, int64_t dictionary_id,
DictionaryOffset* out) {
// We assume that the dictionary index type (as an integer) has already been
- // validated elsewhere, so the checked cast below is safe
+ // validated elsewhere, so the checked cast below is safe
const auto& index_type = checked_cast<const IntegerType&>(*type.index_type());
auto index_type_offset =
@@ -594,24 +594,24 @@ class FieldToFlatbufferVisitor {
return Status::OK();
}
- Status Visit(const Decimal128Type& type) {
+ Status Visit(const Decimal128Type& type) {
const auto& dec_type = checked_cast<const Decimal128Type&>(type);
fb_type_ = flatbuf::Type::Decimal;
- type_offset_ = flatbuf::CreateDecimal(fbb_, dec_type.precision(), dec_type.scale(),
- /*bitWidth=*/128)
- .Union();
+ type_offset_ = flatbuf::CreateDecimal(fbb_, dec_type.precision(), dec_type.scale(),
+ /*bitWidth=*/128)
+ .Union();
+ return Status::OK();
+ }
+
+ Status Visit(const Decimal256Type& type) {
+ const auto& dec_type = checked_cast<const Decimal256Type&>(type);
+ fb_type_ = flatbuf::Type::Decimal;
+ type_offset_ = flatbuf::CreateDecimal(fbb_, dec_type.precision(), dec_type.scale(),
+                                          /*bitWidth=*/256)
+ .Union();
return Status::OK();
}
- Status Visit(const Decimal256Type& type) {
- const auto& dec_type = checked_cast<const Decimal256Type&>(type);
- fb_type_ = flatbuf::Type::Decimal;
- type_offset_ = flatbuf::CreateDecimal(fbb_, dec_type.precision(), dec_type.scale(),
-                                           /*bitWidth=*/256)
- .Union();
- return Status::OK();
- }
-
Status Visit(const ListType& type) {
fb_type_ = flatbuf::Type::List;
RETURN_NOT_OK(VisitChildFields(type));
@@ -753,15 +753,15 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, FieldPosition field_pos,
// Reconstruct the data type
// 1. Data type children
- FieldVector child_fields;
+ FieldVector child_fields;
const auto& children = field->children();
- // As a tolerance, allow for a null children field meaning "no children" (ARROW-12100)
- if (children != nullptr) {
- child_fields.resize(children->size());
- for (int i = 0; i < static_cast<int>(children->size()); ++i) {
- RETURN_NOT_OK(FieldFromFlatbuffer(children->Get(i), field_pos.child(i),
- dictionary_memo, &child_fields[i]));
- }
+ // As a tolerance, allow for a null children field meaning "no children" (ARROW-12100)
+ if (children != nullptr) {
+ child_fields.resize(children->size());
+ for (int i = 0; i < static_cast<int>(children->size()); ++i) {
+ RETURN_NOT_OK(FieldFromFlatbuffer(children->Get(i), field_pos.child(i),
+ dictionary_memo, &child_fields[i]));
+ }
}
// 2. Top-level concrete data type
@@ -871,12 +871,12 @@ Status SchemaToFlatbuffer(FBB& fbb, const Schema& schema,
Result<std::shared_ptr<Buffer>> WriteFBMessage(
FBB& fbb, flatbuf::MessageHeader header_type, flatbuffers::Offset<void> header,
int64_t body_length, MetadataVersion version,
- const std::shared_ptr<const KeyValueMetadata>& custom_metadata, MemoryPool* pool) {
+ const std::shared_ptr<const KeyValueMetadata>& custom_metadata, MemoryPool* pool) {
auto message = flatbuf::CreateMessage(fbb, MetadataVersionToFlatbuffer(version),
header_type, header, body_length,
SerializeCustomMetadata(fbb, custom_metadata));
fbb.Finish(message);
- return WriteFlatbufferBuilder(fbb, pool);
+ return WriteFlatbufferBuilder(fbb, pool);
}
using FieldNodeVector =
@@ -1183,8 +1183,8 @@ Status WriteSchemaMessage(const Schema& schema, const DictionaryFieldMapper& map
flatbuffers::Offset<flatbuf::Schema> fb_schema;
RETURN_NOT_OK(SchemaToFlatbuffer(fbb, schema, mapper, &fb_schema));
return WriteFBMessage(fbb, flatbuf::MessageHeader::Schema, fb_schema.Union(),
- /*body_length=*/0, options.metadata_version,
- /*custom_metadata=*/nullptr, options.memory_pool)
+ /*body_length=*/0, options.metadata_version,
+ /*custom_metadata=*/nullptr, options.memory_pool)
.Value(out);
}
@@ -1198,8 +1198,8 @@ Status WriteRecordBatchMessage(
RETURN_NOT_OK(
MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch));
return WriteFBMessage(fbb, flatbuf::MessageHeader::RecordBatch, record_batch.Union(),
- body_length, options.metadata_version, custom_metadata,
- options.memory_pool)
+ body_length, options.metadata_version, custom_metadata,
+ options.memory_pool)
.Value(out);
}
@@ -1233,8 +1233,8 @@ Result<std::shared_ptr<Buffer>> WriteTensorMessage(const Tensor& tensor,
flatbuf::CreateTensor(fbb, fb_type_type, fb_type, fb_shape, fb_strides, &buffer);
return WriteFBMessage(fbb, flatbuf::MessageHeader::Tensor, fb_tensor.Union(),
- body_length, options.metadata_version,
- /*custom_metadata=*/nullptr, options.memory_pool);
+ body_length, options.metadata_version,
+ /*custom_metadata=*/nullptr, options.memory_pool);
}
Result<std::shared_ptr<Buffer>> WriteSparseTensorMessage(
@@ -1245,8 +1245,8 @@ Result<std::shared_ptr<Buffer>> WriteSparseTensorMessage(
RETURN_NOT_OK(
MakeSparseTensor(fbb, sparse_tensor, body_length, buffers, &fb_sparse_tensor));
return WriteFBMessage(fbb, flatbuf::MessageHeader::SparseTensor,
- fb_sparse_tensor.Union(), body_length, options.metadata_version,
- /*custom_metadata=*/nullptr, options.memory_pool);
+ fb_sparse_tensor.Union(), body_length, options.metadata_version,
+ /*custom_metadata=*/nullptr, options.memory_pool);
}
Status WriteDictionaryMessage(
@@ -1261,8 +1261,8 @@ Status WriteDictionaryMessage(
auto dictionary_batch =
flatbuf::CreateDictionaryBatch(fbb, id, record_batch, is_delta).Union();
return WriteFBMessage(fbb, flatbuf::MessageHeader::DictionaryBatch, dictionary_batch,
- body_length, options.metadata_version, custom_metadata,
- options.memory_pool)
+ body_length, options.metadata_version, custom_metadata,
+ options.memory_pool)
.Value(out);
}
@@ -1338,11 +1338,11 @@ Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo,
std::shared_ptr<KeyValueMetadata> metadata;
RETURN_NOT_OK(internal::GetKeyValueMetadata(schema->custom_metadata(), &metadata));
- // set endianness using the value in flatbuf schema
- auto endianness = schema->endianness() == flatbuf::Endianness::Little
- ? Endianness::Little
- : Endianness::Big;
- *out = ::arrow::schema(std::move(fields), endianness, metadata);
+ // set endianness using the value in flatbuf schema
+ auto endianness = schema->endianness() == flatbuf::Endianness::Little
+ ? Endianness::Little
+ : Endianness::Big;
+ *out = ::arrow::schema(std::move(fields), endianness, metadata);
return Status::OK();
}
@@ -1356,9 +1356,9 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type
return Status::IOError("Header-type of flatbuffer-encoded Message is not Tensor.");
}
- flatbuffers::uoffset_t ndim = tensor->shape()->size();
+ flatbuffers::uoffset_t ndim = tensor->shape()->size();
- for (flatbuffers::uoffset_t i = 0; i < ndim; ++i) {
+ for (flatbuffers::uoffset_t i = 0; i < ndim; ++i) {
auto dim = tensor->shape()->Get(i);
shape->push_back(dim->size());
@@ -1366,12 +1366,12 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type
}
if (tensor->strides() && tensor->strides()->size() > 0) {
- if (tensor->strides()->size() != ndim) {
- return Status::IOError(
- "The sizes of shape and strides in a tensor are mismatched.");
- }
-
- for (decltype(ndim) i = 0; i < ndim; ++i) {
+ if (tensor->strides()->size() != ndim) {
+ return Status::IOError(
+ "The sizes of shape and strides in a tensor are mismatched.");
+ }
+
+ for (decltype(ndim) i = 0; i < ndim; ++i) {
strides->push_back(tensor->strides()->Get(i));
}
}
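The Decimal branch in ConcreteTypeFromFlatbuffer admits only 128- and 256-bit widths. The same dispatch, extracted as a standalone helper:

    #include <memory>
    #include "arrow/status.h"
    #include "arrow/type.h"

    arrow::Result<std::shared_ptr<arrow::DataType>> DecimalFromBitWidth(
        int32_t bit_width, int32_t precision, int32_t scale) {
      if (bit_width == 128) return arrow::Decimal128Type::Make(precision, scale);
      if (bit_width == 256) return arrow::Decimal256Type::Make(precision, scale);
      return arrow::Status::Invalid(
          "Library only supports 128-bit or 256-bit decimal values");
    }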
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h
index d47b244d324..9cf489dd668 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/metadata_internal.h
@@ -156,22 +156,22 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>
Status GetKeyValueMetadata(const KVVector* fb_metadata,
std::shared_ptr<KeyValueMetadata>* out);
-template <typename RootType>
-bool VerifyFlatbuffers(const uint8_t* data, int64_t size) {
- // Heuristic: tables in an Arrow flatbuffers buffer must take at least 1 bit
- // each on average (ARROW-11559).
- // In particular, the only recursive table (the `Field` table in Schema.fbs)
- // must have a non-empty `type` member.
- flatbuffers::Verifier verifier(
- data, static_cast<size_t>(size),
- /*max_depth=*/128,
- /*max_tables=*/static_cast<flatbuffers::uoffset_t>(8 * size));
- return verifier.VerifyBuffer<RootType>(nullptr);
-}
-
+template <typename RootType>
+bool VerifyFlatbuffers(const uint8_t* data, int64_t size) {
+ // Heuristic: tables in an Arrow flatbuffers buffer must take at least 1 bit
+ // each on average (ARROW-11559).
+ // In particular, the only recursive table (the `Field` table in Schema.fbs)
+ // must have a non-empty `type` member.
+ flatbuffers::Verifier verifier(
+ data, static_cast<size_t>(size),
+ /*max_depth=*/128,
+ /*max_tables=*/static_cast<flatbuffers::uoffset_t>(8 * size));
+ return verifier.VerifyBuffer<RootType>(nullptr);
+}
+
static inline Status VerifyMessage(const uint8_t* data, int64_t size,
const flatbuf::Message** out) {
- if (!VerifyFlatbuffers<flatbuf::Message>(data, size)) {
+ if (!VerifyFlatbuffers<flatbuf::Message>(data, size)) {
return Status::IOError("Invalid flatbuffers message.");
}
*out = flatbuf::GetMessage(data);
@@ -211,11 +211,11 @@ Status WriteDictionaryMessage(
const IpcWriteOptions& options, std::shared_ptr<Buffer>* out);
static inline Result<std::shared_ptr<Buffer>> WriteFlatbufferBuilder(
- flatbuffers::FlatBufferBuilder& fbb, // NOLINT non-const reference
- MemoryPool* pool = default_memory_pool()) {
+ flatbuffers::FlatBufferBuilder& fbb, // NOLINT non-const reference
+ MemoryPool* pool = default_memory_pool()) {
int32_t size = fbb.GetSize();
- ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(size, pool));
+ ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(size, pool));
uint8_t* dst = result->mutable_data();
memcpy(dst, fbb.GetBufferPointer(), size);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h
index 2845a61523a..2e0f800b5ad 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/options.h
@@ -39,26 +39,26 @@ constexpr int kMaxNestingDepth = 64;
/// \brief Options for writing Arrow IPC messages
struct ARROW_EXPORT IpcWriteOptions {
- /// \brief If true, allow field lengths that don't fit in a signed 32-bit int.
- ///
- /// Some implementations may not be able to parse streams created with this option.
+ /// \brief If true, allow field lengths that don't fit in a signed 32-bit int.
+ ///
+ /// Some implementations may not be able to parse streams created with this option.
bool allow_64bit = false;
-
- /// \brief The maximum permitted schema nesting depth.
+
+ /// \brief The maximum permitted schema nesting depth.
int max_recursion_depth = kMaxNestingDepth;
- /// \brief Write padding after memory buffers up to this multiple of bytes.
+ /// \brief Write padding after memory buffers up to this multiple of bytes.
int32_t alignment = 8;
- /// \brief Write the pre-0.15.0 IPC message format
- ///
- /// This legacy format consists of a 4-byte prefix instead of 8-byte.
+ /// \brief Write the pre-0.15.0 IPC message format
+ ///
+ /// This legacy format consists of a 4-byte prefix instead of 8-byte.
bool write_legacy_ipc_format = false;
/// \brief The memory pool to use for allocations made during IPC writing
- ///
- /// While Arrow IPC is predominantly zero-copy, it may have to allocate
- /// memory in some cases (for example if compression is enabled).
+ ///
+ /// While Arrow IPC is predominantly zero-copy, it may have to allocate
+ /// memory in some cases (for example if compression is enabled).
MemoryPool* memory_pool = default_memory_pool();
/// \brief Compression codec to use for record batch body buffers
@@ -70,38 +70,38 @@ struct ARROW_EXPORT IpcWriteOptions {
/// like compression
bool use_threads = true;
- /// \brief Whether to emit dictionary deltas
- ///
- /// If false, a changed dictionary for a given field will emit a full
- /// dictionary replacement.
- /// If true, a changed dictionary will be compared against the previous
- /// version. If possible, a dictionary delta will be emitted, otherwise
- /// a full dictionary replacement.
- ///
- /// Default is false to maximize stream compatibility.
- ///
- /// Also, note that if a changed dictionary is a nested dictionary,
- /// then a delta is never emitted, for compatibility with the read path.
- bool emit_dictionary_deltas = false;
-
- /// \brief Whether to unify dictionaries for the IPC file format
- ///
- /// The IPC file format doesn't support dictionary replacements or deltas.
- /// Therefore, chunks of a column with a dictionary type must have the same
- /// dictionary in each record batch.
- ///
- /// If this option is true, RecordBatchWriter::WriteTable will attempt
- /// to unify dictionaries across each table column. If this option is
- /// false, unequal dictionaries across a table column will simply raise
- /// an error.
- ///
- /// Note that enabling this option has a runtime cost. Also, not all types
- /// currently support dictionary unification.
- ///
- /// This option is ignored for IPC streams, which support dictionary replacement
- /// and deltas.
- bool unify_dictionaries = false;
-
+ /// \brief Whether to emit dictionary deltas
+ ///
+ /// If false, a changed dictionary for a given field will emit a full
+ /// dictionary replacement.
+ /// If true, a changed dictionary will be compared against the previous
+ /// version. If possible, a dictionary delta will be emitted, otherwise
+ /// a full dictionary replacement.
+ ///
+ /// Default is false to maximize stream compatibility.
+ ///
+ /// Also, note that if a changed dictionary is a nested dictionary,
+ /// then a delta is never emitted, for compatibility with the read path.
+ bool emit_dictionary_deltas = false;
+
+ /// \brief Whether to unify dictionaries for the IPC file format
+ ///
+ /// The IPC file format doesn't support dictionary replacements or deltas.
+ /// Therefore, chunks of a column with a dictionary type must have the same
+ /// dictionary in each record batch.
+ ///
+ /// If this option is true, RecordBatchWriter::WriteTable will attempt
+ /// to unify dictionaries across each table column. If this option is
+ /// false, unequal dictionaries across a table column will simply raise
+ /// an error.
+ ///
+ /// Note that enabling this option has a runtime cost. Also, not all types
+ /// currently support dictionary unification.
+ ///
+ /// This option is ignored for IPC streams, which support dictionary replacement
+ /// and deltas.
+ bool unify_dictionaries = false;
+
/// \brief Format version to use for IPC messages and their metadata.
///
/// Presently using V5 version (readable by 1.0.0 and later).
@@ -115,40 +115,40 @@ struct ARROW_EXPORT IpcWriteOptions {
using IpcOptions = IpcWriteOptions;
#endif
-/// \brief Options for reading Arrow IPC messages
+/// \brief Options for reading Arrow IPC messages
struct ARROW_EXPORT IpcReadOptions {
- /// \brief The maximum permitted schema nesting depth.
+ /// \brief The maximum permitted schema nesting depth.
int max_recursion_depth = kMaxNestingDepth;
- /// \brief The memory pool to use for allocations made during IPC reading
- ///
- /// While Arrow IPC is predominantly zero-copy, it may have to allocate
- /// memory in some cases (for example if compression is enabled).
+ /// \brief The memory pool to use for allocations made during IPC reading
+ ///
+ /// While Arrow IPC is predominantly zero-copy, it may have to allocate
+ /// memory in some cases (for example if compression is enabled).
MemoryPool* memory_pool = default_memory_pool();
/// \brief EXPERIMENTAL: Top-level schema fields to include when
- /// deserializing RecordBatch.
- ///
- /// If empty (the default), return all deserialized fields.
- /// If non-empty, the values are the indices of fields in the top-level schema.
+ /// deserializing RecordBatch.
+ ///
+ /// If empty (the default), return all deserialized fields.
+ /// If non-empty, the values are the indices of fields in the top-level schema.
std::vector<int> included_fields;
/// \brief Use global CPU thread pool to parallelize any computational tasks
/// like decompression
bool use_threads = true;
- /// \brief EXPERIMENTAL: Convert incoming data to platform-native endianness
- ///
- /// If the endianness of the received schema is not equal to platform-native
- /// endianness, then all buffers with endian-sensitive data will be byte-swapped.
- /// This includes the value buffers of numeric types, temporal types, decimal
- /// types, as well as the offset buffers of variable-sized binary and list-like
- /// types.
- ///
- /// Endianness conversion is achieved by the RecordBatchFileReader,
- /// RecordBatchStreamReader and StreamDecoder classes.
- bool ensure_native_endian = true;
-
+ /// \brief EXPERIMENTAL: Convert incoming data to platform-native endianness
+ ///
+ /// If the endianness of the received schema is not equal to platform-native
+ /// endianness, then all buffers with endian-sensitive data will be byte-swapped.
+ /// This includes the value buffers of numeric types, temporal types, decimal
+ /// types, as well as the offset buffers of variable-sized binary and list-like
+ /// types.
+ ///
+ /// Endianness conversion is achieved by the RecordBatchFileReader,
+ /// RecordBatchStreamReader and StreamDecoder classes.
+ bool ensure_native_endian = true;
+
static IpcReadOptions Defaults();
};
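A corresponding read-side sketch, assuming an open arrow::io::InputStream named stream; it selects two top-level fields and requests native-endian buffers:

auto read_options = arrow::ipc::IpcReadOptions::Defaults();
read_options.included_fields = {0, 2};     // indices into the top-level schema
read_options.ensure_native_endian = true;  // byte-swap foreign-endian data
ARROW_ASSIGN_OR_RAISE(
    auto reader, arrow::ipc::RecordBatchStreamReader::Open(stream, read_options));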
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc
index 5e90be7d4e6..a3c345cc440 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.cc
@@ -31,7 +31,7 @@
#include "arrow/array.h"
#include "arrow/buffer.h"
#include "arrow/extension_type.h"
-#include "arrow/io/caching.h"
+#include "arrow/io/caching.h"
#include "arrow/io/interfaces.h"
#include "arrow/io/memory.h"
#include "arrow/ipc/message.h"
@@ -47,14 +47,14 @@
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/compression.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/parallel.h"
-#include "arrow/util/string.h"
-#include "arrow/util/thread_pool.h"
+#include "arrow/util/string.h"
+#include "arrow/util/thread_pool.h"
#include "arrow/util/ubsan.h"
-#include "arrow/util/vector.h"
+#include "arrow/util/vector.h"
#include "arrow/visitor_inline.h"
#include "generated/File_generated.h" // IWYU pragma: export
@@ -112,30 +112,30 @@ Status InvalidMessageType(MessageType expected, MessageType actual) {
// ----------------------------------------------------------------------
// Record batch read path
-/// \brief Structure to keep common arguments to be passed
-struct IpcReadContext {
- IpcReadContext(DictionaryMemo* memo, const IpcReadOptions& option, bool swap,
- MetadataVersion version = MetadataVersion::V5,
- Compression::type kind = Compression::UNCOMPRESSED)
- : dictionary_memo(memo),
- options(option),
- metadata_version(version),
- compression(kind),
- swap_endian(swap) {}
-
- DictionaryMemo* dictionary_memo;
-
- const IpcReadOptions& options;
-
- MetadataVersion metadata_version;
-
- Compression::type compression;
-
- /// \brief LoadRecordBatch() or LoadRecordBatchSubset() swaps endianness of elements
- /// if this flag is true
- const bool swap_endian;
-};
-
+/// \brief Structure to keep common arguments to be passed
+struct IpcReadContext {
+ IpcReadContext(DictionaryMemo* memo, const IpcReadOptions& option, bool swap,
+ MetadataVersion version = MetadataVersion::V5,
+ Compression::type kind = Compression::UNCOMPRESSED)
+ : dictionary_memo(memo),
+ options(option),
+ metadata_version(version),
+ compression(kind),
+ swap_endian(swap) {}
+
+ DictionaryMemo* dictionary_memo;
+
+ const IpcReadOptions& options;
+
+ MetadataVersion metadata_version;
+
+ Compression::type compression;
+
+ /// \brief LoadRecordBatch() or LoadRecordBatchSubset() swaps endianness of elements
+ /// if this flag is true
+ const bool swap_endian;
+};
+
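As a reading aid: the call sites below construct this context in one line, e.g. with no endian swap (as in the synchronous ReadRecordBatch path):

IpcReadContext context(&dictionary_memo, options, /*swap=*/false);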
/// The field_index and buffer_index are incremented based on how much of the
/// batch is "consumed" (through nested data reconstruction, for example)
class ArrayLoader {
@@ -467,9 +467,9 @@ Status DecompressBuffers(Compression::type compression, const IpcReadOptions& op
Result<std::shared_ptr<RecordBatch>> LoadRecordBatchSubset(
const flatbuf::RecordBatch* metadata, const std::shared_ptr<Schema>& schema,
- const std::vector<bool>* inclusion_mask, const IpcReadContext& context,
- io::RandomAccessFile* file) {
- ArrayLoader loader(metadata, context.metadata_version, context.options, file);
+ const std::vector<bool>* inclusion_mask, const IpcReadContext& context,
+ io::RandomAccessFile* file) {
+ ArrayLoader loader(metadata, context.metadata_version, context.options, file);
ArrayDataVector columns(schema->num_fields());
ArrayDataVector filtered_columns;
@@ -499,8 +499,8 @@ Result<std::shared_ptr<RecordBatch>> LoadRecordBatchSubset(
// Dictionary resolution needs to happen on the unfiltered columns,
// because fields are mapped structurally (by path in the original schema).
- RETURN_NOT_OK(ResolveDictionaries(columns, *context.dictionary_memo,
- context.options.memory_pool));
+ RETURN_NOT_OK(ResolveDictionaries(columns, *context.dictionary_memo,
+ context.options.memory_pool));
if (inclusion_mask) {
filtered_schema = ::arrow::schema(std::move(filtered_fields), schema->metadata());
@@ -509,30 +509,30 @@ Result<std::shared_ptr<RecordBatch>> LoadRecordBatchSubset(
filtered_schema = schema;
filtered_columns = std::move(columns);
}
- if (context.compression != Compression::UNCOMPRESSED) {
- RETURN_NOT_OK(
- DecompressBuffers(context.compression, context.options, &filtered_columns));
+ if (context.compression != Compression::UNCOMPRESSED) {
+ RETURN_NOT_OK(
+ DecompressBuffers(context.compression, context.options, &filtered_columns));
}
- // swap endian in a set of ArrayData if necessary (swap_endian == true)
- if (context.swap_endian) {
- for (int i = 0; i < static_cast<int>(filtered_columns.size()); ++i) {
- ARROW_ASSIGN_OR_RAISE(filtered_columns[i],
- arrow::internal::SwapEndianArrayData(filtered_columns[i]));
- }
- }
- return RecordBatch::Make(std::move(filtered_schema), metadata->length(),
+ // swap endian in a set of ArrayData if necessary (swap_endian == true)
+ if (context.swap_endian) {
+ for (int i = 0; i < static_cast<int>(filtered_columns.size()); ++i) {
+ ARROW_ASSIGN_OR_RAISE(filtered_columns[i],
+ arrow::internal::SwapEndianArrayData(filtered_columns[i]));
+ }
+ }
+ return RecordBatch::Make(std::move(filtered_schema), metadata->length(),
std::move(filtered_columns));
}
Result<std::shared_ptr<RecordBatch>> LoadRecordBatch(
const flatbuf::RecordBatch* metadata, const std::shared_ptr<Schema>& schema,
- const std::vector<bool>& inclusion_mask, const IpcReadContext& context,
- io::RandomAccessFile* file) {
+ const std::vector<bool>& inclusion_mask, const IpcReadContext& context,
+ io::RandomAccessFile* file) {
if (inclusion_mask.size() > 0) {
- return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, context, file);
+ return LoadRecordBatchSubset(metadata, schema, &inclusion_mask, context, file);
} else {
- return LoadRecordBatchSubset(metadata, schema, /*inclusion_mask=*/nullptr, context, file);
+ return LoadRecordBatchSubset(metadata, schema, /*inclusion_mask=*/nullptr, context, file);
}
}
@@ -569,9 +569,9 @@ Status GetCompressionExperimental(const flatbuf::Message* message,
RETURN_NOT_OK(internal::GetKeyValueMetadata(message->custom_metadata(), &metadata));
int index = metadata->FindKey("ARROW:experimental_compression");
if (index != -1) {
- // Arrow 0.17 stored the string in upper case; internal utils now require lower case
- auto name = arrow::internal::AsciiToLower(metadata->value(index));
- ARROW_ASSIGN_OR_RAISE(*out, util::Codec::GetCompressionType(name));
+ // Arrow 0.17 stored the string in upper case; internal utils now require lower case
+ auto name = arrow::internal::AsciiToLower(metadata->value(index));
+ ARROW_ASSIGN_OR_RAISE(*out, util::Codec::GetCompressionType(name));
}
return internal::CheckCompressionSupported(*out);
}
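The lower-casing above matters because the codec registry matches lower-case names only; a one-line sketch of the lookup:

// "zstd" succeeds; the upper-case "ZSTD" written by Arrow 0.17 would not.
ARROW_ASSIGN_OR_RAISE(arrow::Compression::type codec,
                      arrow::util::Codec::GetCompressionType("zstd"));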
@@ -610,8 +610,8 @@ Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
Result<std::shared_ptr<RecordBatch>> ReadRecordBatchInternal(
const Buffer& metadata, const std::shared_ptr<Schema>& schema,
- const std::vector<bool>& inclusion_mask, IpcReadContext& context,
- io::RandomAccessFile* file) {
+ const std::vector<bool>& inclusion_mask, IpcReadContext& context,
+ io::RandomAccessFile* file) {
const flatbuf::Message* message = nullptr;
RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message));
auto batch = message->header_as_RecordBatch();
@@ -622,15 +622,15 @@ Result<std::shared_ptr<RecordBatch>> ReadRecordBatchInternal(
Compression::type compression;
RETURN_NOT_OK(GetCompression(batch, &compression));
- if (context.compression == Compression::UNCOMPRESSED &&
+ if (context.compression == Compression::UNCOMPRESSED &&
message->version() == flatbuf::MetadataVersion::V4) {
// Possibly obtain codec information from experimental serialization format
// in 0.17.x
RETURN_NOT_OK(GetCompressionExperimental(message, &compression));
}
- context.compression = compression;
- context.metadata_version = internal::GetMetadataVersion(message->version());
- return LoadRecordBatch(batch, schema, inclusion_mask, context, file);
+ context.compression = compression;
+ context.metadata_version = internal::GetMetadataVersion(message->version());
+ return LoadRecordBatch(batch, schema, inclusion_mask, context, file);
}
// If we are selecting only certain fields, populate an inclusion mask for fast lookups.
@@ -663,8 +663,8 @@ Status GetInclusionMaskAndOutSchema(const std::shared_ptr<Schema>& full_schema,
included_fields.push_back(full_schema->field(i));
}
- *out_schema = schema(std::move(included_fields), full_schema->endianness(),
- full_schema->metadata());
+ *out_schema = schema(std::move(included_fields), full_schema->endianness(),
+ full_schema->metadata());
return Status::OK();
}
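A sketch of the mask this helper produces, with full_schema and included_fields standing in for its arguments; the real code validates each index before use:

std::vector<bool> mask(full_schema->num_fields(), false);
for (int index : included_fields) {
  mask[index] = true;  // field survives into the out-schema
}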
@@ -672,32 +672,32 @@ Status UnpackSchemaMessage(const void* opaque_schema, const IpcReadOptions& opti
DictionaryMemo* dictionary_memo,
std::shared_ptr<Schema>* schema,
std::shared_ptr<Schema>* out_schema,
- std::vector<bool>* field_inclusion_mask, bool* swap_endian) {
+ std::vector<bool>* field_inclusion_mask, bool* swap_endian) {
RETURN_NOT_OK(internal::GetSchema(opaque_schema, dictionary_memo, schema));
// If we are selecting only certain fields, populate the inclusion mask now
// for fast lookups
- RETURN_NOT_OK(GetInclusionMaskAndOutSchema(*schema, options.included_fields,
- field_inclusion_mask, out_schema));
- *swap_endian = options.ensure_native_endian && !out_schema->get()->is_native_endian();
- if (*swap_endian) {
- // create a new schema with native endianness before swapping endian in ArrayData
- *schema = schema->get()->WithEndianness(Endianness::Native);
- *out_schema = out_schema->get()->WithEndianness(Endianness::Native);
- }
- return Status::OK();
+ RETURN_NOT_OK(GetInclusionMaskAndOutSchema(*schema, options.included_fields,
+ field_inclusion_mask, out_schema));
+ *swap_endian = options.ensure_native_endian && !out_schema->get()->is_native_endian();
+ if (*swap_endian) {
+ // create a new schema with native endianness before swapping endian in ArrayData
+ *schema = schema->get()->WithEndianness(Endianness::Native);
+ *out_schema = out_schema->get()->WithEndianness(Endianness::Native);
+ }
+ return Status::OK();
}
Status UnpackSchemaMessage(const Message& message, const IpcReadOptions& options,
DictionaryMemo* dictionary_memo,
std::shared_ptr<Schema>* schema,
std::shared_ptr<Schema>* out_schema,
- std::vector<bool>* field_inclusion_mask, bool* swap_endian) {
+ std::vector<bool>* field_inclusion_mask, bool* swap_endian) {
CHECK_MESSAGE_TYPE(MessageType::SCHEMA, message.type());
CHECK_HAS_NO_BODY(message);
return UnpackSchemaMessage(message.header(), options, dictionary_memo, schema,
- out_schema, field_inclusion_mask, swap_endian);
+ out_schema, field_inclusion_mask, swap_endian);
}
Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
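The swap decision above reduces to a single predicate; restated with out_schema as the selected-fields schema:

bool swap = options.ensure_native_endian && !out_schema->is_native_endian();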
@@ -707,14 +707,14 @@ Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
std::shared_ptr<Schema> out_schema;
// Empty means do not use
std::vector<bool> inclusion_mask;
- IpcReadContext context(const_cast<DictionaryMemo*>(dictionary_memo), options, false);
- RETURN_NOT_OK(GetInclusionMaskAndOutSchema(schema, context.options.included_fields,
+ IpcReadContext context(const_cast<DictionaryMemo*>(dictionary_memo), options, false);
+ RETURN_NOT_OK(GetInclusionMaskAndOutSchema(schema, context.options.included_fields,
&inclusion_mask, &out_schema));
- return ReadRecordBatchInternal(metadata, schema, inclusion_mask, context, file);
+ return ReadRecordBatchInternal(metadata, schema, inclusion_mask, context, file);
}
-Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context,
- DictionaryKind* kind, io::RandomAccessFile* file) {
+Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context,
+ DictionaryKind* kind, io::RandomAccessFile* file) {
const flatbuf::Message* message = nullptr;
RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message));
const auto dictionary_batch = message->header_as_DictionaryBatch();
@@ -741,46 +741,46 @@ Status ReadDictionary(const Buffer& metadata, const IpcReadContext& context,
// Look up the dictionary value type, which must have been added to the
// DictionaryMemo already prior to invoking this function
- ARROW_ASSIGN_OR_RAISE(auto value_type, context.dictionary_memo->GetDictionaryType(id));
+ ARROW_ASSIGN_OR_RAISE(auto value_type, context.dictionary_memo->GetDictionaryType(id));
// Load the dictionary data from the dictionary batch
ArrayLoader loader(batch_meta, internal::GetMetadataVersion(message->version()),
- context.options, file);
- auto dict_data = std::make_shared<ArrayData>();
+ context.options, file);
+ auto dict_data = std::make_shared<ArrayData>();
const Field dummy_field("", value_type);
RETURN_NOT_OK(loader.Load(&dummy_field, dict_data.get()));
if (compression != Compression::UNCOMPRESSED) {
ArrayDataVector dict_fields{dict_data};
- RETURN_NOT_OK(DecompressBuffers(compression, context.options, &dict_fields));
+ RETURN_NOT_OK(DecompressBuffers(compression, context.options, &dict_fields));
+ }
+
+ // swap endian in dict_data if necessary (swap_endian == true)
+ if (context.swap_endian) {
+ ARROW_ASSIGN_OR_RAISE(dict_data, ::arrow::internal::SwapEndianArrayData(dict_data));
}
- // swap endian in dict_data if necessary (swap_endian == true)
- if (context.swap_endian) {
- ARROW_ASSIGN_OR_RAISE(dict_data, ::arrow::internal::SwapEndianArrayData(dict_data));
- }
-
if (dictionary_batch->isDelta()) {
if (kind != nullptr) {
*kind = DictionaryKind::Delta;
}
- return context.dictionary_memo->AddDictionaryDelta(id, dict_data);
+ return context.dictionary_memo->AddDictionaryDelta(id, dict_data);
}
ARROW_ASSIGN_OR_RAISE(bool inserted,
- context.dictionary_memo->AddOrReplaceDictionary(id, dict_data));
+ context.dictionary_memo->AddOrReplaceDictionary(id, dict_data));
if (kind != nullptr) {
*kind = inserted ? DictionaryKind::New : DictionaryKind::Replacement;
}
return Status::OK();
}
-Status ReadDictionary(const Message& message, const IpcReadContext& context,
- DictionaryKind* kind) {
+Status ReadDictionary(const Message& message, const IpcReadContext& context,
+ DictionaryKind* kind) {
// Only invoke this method if we already know we have a dictionary message
DCHECK_EQ(message.type(), MessageType::DICTIONARY_BATCH);
CHECK_HAS_BODY(message);
ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body()));
- return ReadDictionary(*message.metadata(), context, kind, reader.get());
+ return ReadDictionary(*message.metadata(), context, kind, reader.get());
}
// ----------------------------------------------------------------------
@@ -799,10 +799,10 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader {
return Status::Invalid("Tried reading schema message, was null or length 0");
}
- RETURN_NOT_OK(UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_,
- &out_schema_, &field_inclusion_mask_,
- &swap_endian_));
- return Status::OK();
+ RETURN_NOT_OK(UnpackSchemaMessage(*message, options, &dictionary_memo_, &schema_,
+ &out_schema_, &field_inclusion_mask_,
+ &swap_endian_));
+ return Status::OK();
}
Status ReadNext(std::shared_ptr<RecordBatch>* batch) override {
@@ -834,9 +834,9 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader {
CHECK_HAS_BODY(*message);
ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
return ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_,
- context, reader.get())
+ context, reader.get())
.Value(batch);
}
@@ -866,8 +866,8 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader {
// Read dictionary from dictionary batch
Status ReadDictionary(const Message& message) {
DictionaryKind kind;
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
- RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind));
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind));
switch (kind) {
case DictionaryKind::New:
break;
@@ -888,7 +888,7 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader {
// TODO(wesm): In future, we may want to reconcile the ids in the stream with
// those found in the schema
- const auto num_dicts = dictionary_memo_.fields().num_dicts();
+ const auto num_dicts = dictionary_memo_.fields().num_dicts();
for (int i = 0; i < num_dicts; ++i) {
ARROW_ASSIGN_OR_RAISE(message, ReadNextMessage());
if (!message) {
@@ -933,8 +933,8 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader {
DictionaryMemo dictionary_memo_;
std::shared_ptr<Schema> schema_, out_schema_;
-
- bool swap_endian_;
+
+ bool swap_endian_;
};
// ----------------------------------------------------------------------
@@ -961,94 +961,94 @@ Result<std::shared_ptr<RecordBatchStreamReader>> RecordBatchStreamReader::Open(
// ----------------------------------------------------------------------
// Reader implementation
-// Common functions used in both the random-access file reader and the
-// asynchronous generator
+// Common functions used in both the random-access file reader and the
+// asynchronous generator
static inline FileBlock FileBlockFromFlatbuffer(const flatbuf::Block* block) {
return FileBlock{block->offset(), block->metaDataLength(), block->bodyLength()};
}
-static Result<std::unique_ptr<Message>> ReadMessageFromBlock(const FileBlock& block,
- io::RandomAccessFile* file) {
- if (!BitUtil::IsMultipleOf8(block.offset) ||
- !BitUtil::IsMultipleOf8(block.metadata_length) ||
- !BitUtil::IsMultipleOf8(block.body_length)) {
- return Status::Invalid("Unaligned block in IPC file");
- }
-
- // TODO(wesm): this breaks integration tests, see ARROW-3256
- // DCHECK_EQ((*out)->body_length(), block.body_length);
-
- ARROW_ASSIGN_OR_RAISE(auto message,
- ReadMessage(block.offset, block.metadata_length, file));
- return std::move(message);
-}
-
-static Future<std::shared_ptr<Message>> ReadMessageFromBlockAsync(
- const FileBlock& block, io::RandomAccessFile* file, const io::IOContext& io_context) {
- if (!BitUtil::IsMultipleOf8(block.offset) ||
- !BitUtil::IsMultipleOf8(block.metadata_length) ||
- !BitUtil::IsMultipleOf8(block.body_length)) {
- return Status::Invalid("Unaligned block in IPC file");
- }
-
- // TODO(wesm): this breaks integration tests, see ARROW-3256
- // DCHECK_EQ((*out)->body_length(), block.body_length);
-
- return ReadMessageAsync(block.offset, block.metadata_length, block.body_length, file,
- io_context);
-}
-
-static Status ReadOneDictionary(Message* message, const IpcReadContext& context) {
- CHECK_HAS_BODY(*message);
- ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
- DictionaryKind kind;
- RETURN_NOT_OK(ReadDictionary(*message->metadata(), context, &kind, reader.get()));
- if (kind != DictionaryKind::New) {
- return Status::Invalid(
- "Unsupported dictionary replacement or "
- "dictionary delta in IPC file");
- }
- return Status::OK();
-}
-
-class RecordBatchFileReaderImpl;
-
-/// A generator of record batches.
-///
-/// All batches are yielded in order.
-class ARROW_EXPORT IpcFileRecordBatchGenerator {
- public:
- using Item = std::shared_ptr<RecordBatch>;
-
- explicit IpcFileRecordBatchGenerator(
- std::shared_ptr<RecordBatchFileReaderImpl> state,
- std::shared_ptr<io::internal::ReadRangeCache> cached_source,
- const io::IOContext& io_context, arrow::internal::Executor* executor)
- : state_(std::move(state)),
- cached_source_(std::move(cached_source)),
- io_context_(io_context),
- executor_(executor),
- index_(0) {}
-
- Future<Item> operator()();
- Future<std::shared_ptr<Message>> ReadBlock(const FileBlock& block);
-
- static Status ReadDictionaries(
- RecordBatchFileReaderImpl* state,
- std::vector<std::shared_ptr<Message>> dictionary_messages);
- static Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
- RecordBatchFileReaderImpl* state, Message* message);
-
- private:
- std::shared_ptr<RecordBatchFileReaderImpl> state_;
- std::shared_ptr<io::internal::ReadRangeCache> cached_source_;
- io::IOContext io_context_;
- arrow::internal::Executor* executor_;
- int index_;
- // Odd Future type, but this lets us use All() easily
- Future<> read_dictionaries_;
-};
-
+static Result<std::unique_ptr<Message>> ReadMessageFromBlock(const FileBlock& block,
+ io::RandomAccessFile* file) {
+ if (!BitUtil::IsMultipleOf8(block.offset) ||
+ !BitUtil::IsMultipleOf8(block.metadata_length) ||
+ !BitUtil::IsMultipleOf8(block.body_length)) {
+ return Status::Invalid("Unaligned block in IPC file");
+ }
+
+ // TODO(wesm): this breaks integration tests, see ARROW-3256
+ // DCHECK_EQ((*out)->body_length(), block.body_length);
+
+ ARROW_ASSIGN_OR_RAISE(auto message,
+ ReadMessage(block.offset, block.metadata_length, file));
+ return std::move(message);
+}
+
+static Future<std::shared_ptr<Message>> ReadMessageFromBlockAsync(
+ const FileBlock& block, io::RandomAccessFile* file, const io::IOContext& io_context) {
+ if (!BitUtil::IsMultipleOf8(block.offset) ||
+ !BitUtil::IsMultipleOf8(block.metadata_length) ||
+ !BitUtil::IsMultipleOf8(block.body_length)) {
+ return Status::Invalid("Unaligned block in IPC file");
+ }
+
+ // TODO(wesm): this breaks integration tests, see ARROW-3256
+ // DCHECK_EQ((*out)->body_length(), block.body_length);
+
+ return ReadMessageAsync(block.offset, block.metadata_length, block.body_length, file,
+ io_context);
+}
+
+static Status ReadOneDictionary(Message* message, const IpcReadContext& context) {
+ CHECK_HAS_BODY(*message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
+ DictionaryKind kind;
+ RETURN_NOT_OK(ReadDictionary(*message->metadata(), context, &kind, reader.get()));
+ if (kind != DictionaryKind::New) {
+ return Status::Invalid(
+ "Unsupported dictionary replacement or "
+ "dictionary delta in IPC file");
+ }
+ return Status::OK();
+}
+
+class RecordBatchFileReaderImpl;
+
+/// A generator of record batches.
+///
+/// All batches are yielded in order.
+class ARROW_EXPORT IpcFileRecordBatchGenerator {
+ public:
+ using Item = std::shared_ptr<RecordBatch>;
+
+ explicit IpcFileRecordBatchGenerator(
+ std::shared_ptr<RecordBatchFileReaderImpl> state,
+ std::shared_ptr<io::internal::ReadRangeCache> cached_source,
+ const io::IOContext& io_context, arrow::internal::Executor* executor)
+ : state_(std::move(state)),
+ cached_source_(std::move(cached_source)),
+ io_context_(io_context),
+ executor_(executor),
+ index_(0) {}
+
+ Future<Item> operator()();
+ Future<std::shared_ptr<Message>> ReadBlock(const FileBlock& block);
+
+ static Status ReadDictionaries(
+ RecordBatchFileReaderImpl* state,
+ std::vector<std::shared_ptr<Message>> dictionary_messages);
+ static Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
+ RecordBatchFileReaderImpl* state, Message* message);
+
+ private:
+ std::shared_ptr<RecordBatchFileReaderImpl> state_;
+ std::shared_ptr<io::internal::ReadRangeCache> cached_source_;
+ io::IOContext io_context_;
+ arrow::internal::Executor* executor_;
+ int index_;
+ // Odd Future type, but this lets us use All() easily
+ Future<> read_dictionaries_;
+};
+
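Both block readers above reject unaligned blocks; the precondition is simply 8-byte alignment of all three block fields:

bool aligned = arrow::BitUtil::IsMultipleOf8(block.offset) &&
               arrow::BitUtil::IsMultipleOf8(block.metadata_length) &&
               arrow::BitUtil::IsMultipleOf8(block.body_length);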
class RecordBatchFileReaderImpl : public RecordBatchFileReader {
public:
RecordBatchFileReaderImpl() : file_(NULLPTR), footer_offset_(0), footer_(NULLPTR) {}
@@ -1074,33 +1074,33 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader {
CHECK_HAS_BODY(*message);
ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
- ARROW_ASSIGN_OR_RAISE(auto batch, ReadRecordBatchInternal(
- *message->metadata(), schema_,
- field_inclusion_mask_, context, reader.get()));
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ ARROW_ASSIGN_OR_RAISE(auto batch, ReadRecordBatchInternal(
+ *message->metadata(), schema_,
+ field_inclusion_mask_, context, reader.get()));
++stats_.num_record_batches;
return batch;
}
- Result<int64_t> CountRows() override {
- int64_t total = 0;
- for (int i = 0; i < num_record_batches(); i++) {
- ARROW_ASSIGN_OR_RAISE(auto outer_message,
- ReadMessageFromBlock(GetRecordBatchBlock(i)));
- auto metadata = outer_message->metadata();
- const flatbuf::Message* message = nullptr;
- RETURN_NOT_OK(
- internal::VerifyMessage(metadata->data(), metadata->size(), &message));
- auto batch = message->header_as_RecordBatch();
- if (batch == nullptr) {
- return Status::IOError(
- "Header-type of flatbuffer-encoded Message is not RecordBatch.");
- }
- total += batch->length();
- }
- return total;
- }
-
+ Result<int64_t> CountRows() override {
+ int64_t total = 0;
+ for (int i = 0; i < num_record_batches(); i++) {
+ ARROW_ASSIGN_OR_RAISE(auto outer_message,
+ ReadMessageFromBlock(GetRecordBatchBlock(i)));
+ auto metadata = outer_message->metadata();
+ const flatbuf::Message* message = nullptr;
+ RETURN_NOT_OK(
+ internal::VerifyMessage(metadata->data(), metadata->size(), &message));
+ auto batch = message->header_as_RecordBatch();
+ if (batch == nullptr) {
+ return Status::IOError(
+ "Header-type of flatbuffer-encoded Message is not RecordBatch.");
+ }
+ total += batch->length();
+ }
+ return total;
+ }
+
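CountRows decodes only record-batch headers, never buffer data, so it is cheap relative to reading the batches; a caller-side sketch with an assumed open RandomAccessFile named file:

ARROW_ASSIGN_OR_RAISE(auto reader, arrow::ipc::RecordBatchFileReader::Open(file));
ARROW_ASSIGN_OR_RAISE(int64_t total_rows, reader->CountRows());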
Status Open(const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
const IpcReadOptions& options) {
owned_file_ = file;
@@ -1116,75 +1116,75 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader {
// Get the schema and record any observed dictionaries
RETURN_NOT_OK(UnpackSchemaMessage(footer_->schema(), options, &dictionary_memo_,
- &schema_, &out_schema_, &field_inclusion_mask_,
- &swap_endian_));
+ &schema_, &out_schema_, &field_inclusion_mask_,
+ &swap_endian_));
++stats_.num_messages;
return Status::OK();
}
- Future<> OpenAsync(const std::shared_ptr<io::RandomAccessFile>& file,
- int64_t footer_offset, const IpcReadOptions& options) {
- owned_file_ = file;
- return OpenAsync(file.get(), footer_offset, options);
- }
-
- Future<> OpenAsync(io::RandomAccessFile* file, int64_t footer_offset,
- const IpcReadOptions& options) {
- file_ = file;
- options_ = options;
- footer_offset_ = footer_offset;
- auto cpu_executor = ::arrow::internal::GetCpuThreadPool();
- auto self = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
- return ReadFooterAsync(cpu_executor).Then([self, options]() -> Status {
- // Get the schema and record any observed dictionaries
- RETURN_NOT_OK(UnpackSchemaMessage(
- self->footer_->schema(), options, &self->dictionary_memo_, &self->schema_,
- &self->out_schema_, &self->field_inclusion_mask_, &self->swap_endian_));
- ++self->stats_.num_messages;
- return Status::OK();
- });
- }
-
+ Future<> OpenAsync(const std::shared_ptr<io::RandomAccessFile>& file,
+ int64_t footer_offset, const IpcReadOptions& options) {
+ owned_file_ = file;
+ return OpenAsync(file.get(), footer_offset, options);
+ }
+
+ Future<> OpenAsync(io::RandomAccessFile* file, int64_t footer_offset,
+ const IpcReadOptions& options) {
+ file_ = file;
+ options_ = options;
+ footer_offset_ = footer_offset;
+ auto cpu_executor = ::arrow::internal::GetCpuThreadPool();
+ auto self = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
+ return ReadFooterAsync(cpu_executor).Then([self, options]() -> Status {
+ // Get the schema and record any observed dictionaries
+ RETURN_NOT_OK(UnpackSchemaMessage(
+ self->footer_->schema(), options, &self->dictionary_memo_, &self->schema_,
+ &self->out_schema_, &self->field_inclusion_mask_, &self->swap_endian_));
+ ++self->stats_.num_messages;
+ return Status::OK();
+ });
+ }
+
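A hedged sketch of the asynchronous open path; Future<T>::result() blocks and is used here only to keep the example short:

auto reader_future = arrow::ipc::RecordBatchFileReader::OpenAsync(file, options);
ARROW_ASSIGN_OR_RAISE(auto reader, reader_future.result());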
std::shared_ptr<Schema> schema() const override { return out_schema_; }
std::shared_ptr<const KeyValueMetadata> metadata() const override { return metadata_; }
ReadStats stats() const override { return stats_; }
- Result<AsyncGenerator<std::shared_ptr<RecordBatch>>> GetRecordBatchGenerator(
- const bool coalesce, const io::IOContext& io_context,
- const io::CacheOptions cache_options,
- arrow::internal::Executor* executor) override {
- auto state = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
- std::shared_ptr<io::internal::ReadRangeCache> cached_source;
- if (coalesce) {
- if (!owned_file_) return Status::Invalid("Cannot coalesce without an owned file");
- cached_source = std::make_shared<io::internal::ReadRangeCache>(
- owned_file_, io_context, cache_options);
- auto num_dictionaries = this->num_dictionaries();
- auto num_record_batches = this->num_record_batches();
- std::vector<io::ReadRange> ranges(num_dictionaries + num_record_batches);
- for (int i = 0; i < num_dictionaries; i++) {
- auto block = FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i));
- ranges[i].offset = block.offset;
- ranges[i].length = block.metadata_length + block.body_length;
- }
- for (int i = 0; i < num_record_batches; i++) {
- auto block = FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i));
- ranges[num_dictionaries + i].offset = block.offset;
- ranges[num_dictionaries + i].length = block.metadata_length + block.body_length;
- }
- RETURN_NOT_OK(cached_source->Cache(std::move(ranges)));
- }
- return IpcFileRecordBatchGenerator(std::move(state), std::move(cached_source),
- io_context, executor);
- }
-
+ Result<AsyncGenerator<std::shared_ptr<RecordBatch>>> GetRecordBatchGenerator(
+ const bool coalesce, const io::IOContext& io_context,
+ const io::CacheOptions cache_options,
+ arrow::internal::Executor* executor) override {
+ auto state = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
+ std::shared_ptr<io::internal::ReadRangeCache> cached_source;
+ if (coalesce) {
+ if (!owned_file_) return Status::Invalid("Cannot coalesce without an owned file");
+ cached_source = std::make_shared<io::internal::ReadRangeCache>(
+ owned_file_, io_context, cache_options);
+ auto num_dictionaries = this->num_dictionaries();
+ auto num_record_batches = this->num_record_batches();
+ std::vector<io::ReadRange> ranges(num_dictionaries + num_record_batches);
+ for (int i = 0; i < num_dictionaries; i++) {
+ auto block = FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i));
+ ranges[i].offset = block.offset;
+ ranges[i].length = block.metadata_length + block.body_length;
+ }
+ for (int i = 0; i < num_record_batches; i++) {
+ auto block = FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i));
+ ranges[num_dictionaries + i].offset = block.offset;
+ ranges[num_dictionaries + i].length = block.metadata_length + block.body_length;
+ }
+ RETURN_NOT_OK(cached_source->Cache(std::move(ranges)));
+ }
+ return IpcFileRecordBatchGenerator(std::move(state), std::move(cached_source),
+ io_context, executor);
+ }
+
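Consuming the generator, sketched under the assumption that a null RecordBatch pointer (IterationTraits<Item>::End() below) marks the end of iteration:

ARROW_ASSIGN_OR_RAISE(auto generator,
                      reader->GetRecordBatchGenerator(/*coalesce=*/true));
ARROW_ASSIGN_OR_RAISE(auto batch, generator().result());  // blocking for brevity
if (batch == nullptr) { /* no more record batches */ }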
private:
- friend AsyncGenerator<std::shared_ptr<Message>> MakeMessageGenerator(
- std::shared_ptr<RecordBatchFileReaderImpl>, const io::IOContext&);
- friend class IpcFileRecordBatchGenerator;
-
+ friend AsyncGenerator<std::shared_ptr<Message>> MakeMessageGenerator(
+ std::shared_ptr<RecordBatchFileReaderImpl>, const io::IOContext&);
+ friend class IpcFileRecordBatchGenerator;
+
FileBlock GetRecordBatchBlock(int i) const {
return FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i));
}
@@ -1194,28 +1194,28 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader {
}
Result<std::unique_ptr<Message>> ReadMessageFromBlock(const FileBlock& block) {
- ARROW_ASSIGN_OR_RAISE(auto message, arrow::ipc::ReadMessageFromBlock(block, file_));
+ ARROW_ASSIGN_OR_RAISE(auto message, arrow::ipc::ReadMessageFromBlock(block, file_));
++stats_.num_messages;
return std::move(message);
}
Status ReadDictionaries() {
// Read all the dictionaries
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
for (int i = 0; i < num_dictionaries(); ++i) {
ARROW_ASSIGN_OR_RAISE(auto message, ReadMessageFromBlock(GetDictionaryBlock(i)));
- RETURN_NOT_OK(ReadOneDictionary(message.get(), context));
+ RETURN_NOT_OK(ReadOneDictionary(message.get(), context));
++stats_.num_dictionary_batches;
}
return Status::OK();
}
Status ReadFooter() {
- auto fut = ReadFooterAsync(/*executor=*/nullptr);
- return fut.status();
- }
-
- Future<> ReadFooterAsync(arrow::internal::Executor* executor) {
+ auto fut = ReadFooterAsync(/*executor=*/nullptr);
+ return fut.status();
+ }
+
+ Future<> ReadFooterAsync(arrow::internal::Executor* executor) {
const int32_t magic_size = static_cast<int>(strlen(kArrowMagicBytes));
if (footer_offset_ <= magic_size * 2 + 4) {
@@ -1223,53 +1223,53 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader {
}
int file_end_size = static_cast<int>(magic_size + sizeof(int32_t));
- auto self = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
- auto read_magic = file_->ReadAsync(footer_offset_ - file_end_size, file_end_size);
- if (executor) read_magic = executor->Transfer(std::move(read_magic));
- return read_magic
- .Then([=](const std::shared_ptr<Buffer>& buffer)
- -> Future<std::shared_ptr<Buffer>> {
- const int64_t expected_footer_size = magic_size + sizeof(int32_t);
- if (buffer->size() < expected_footer_size) {
- return Status::Invalid("Unable to read ", expected_footer_size,
- "from end of file");
- }
-
- if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, magic_size)) {
- return Status::Invalid("Not an Arrow file");
- }
-
- int32_t footer_length = BitUtil::FromLittleEndian(
- *reinterpret_cast<const int32_t*>(buffer->data()));
-
- if (footer_length <= 0 ||
- footer_length > self->footer_offset_ - magic_size * 2 - 4) {
- return Status::Invalid("File is smaller than indicated metadata size");
- }
-
- // Now read the footer
- auto read_footer = self->file_->ReadAsync(
- self->footer_offset_ - footer_length - file_end_size, footer_length);
- if (executor) read_footer = executor->Transfer(std::move(read_footer));
- return read_footer;
- })
- .Then([=](const std::shared_ptr<Buffer>& buffer) -> Status {
- self->footer_buffer_ = buffer;
- const auto data = self->footer_buffer_->data();
- const auto size = self->footer_buffer_->size();
- if (!internal::VerifyFlatbuffers<flatbuf::Footer>(data, size)) {
- return Status::IOError("Verification of flatbuffer-encoded Footer failed.");
- }
- self->footer_ = flatbuf::GetFooter(data);
-
- auto fb_metadata = self->footer_->custom_metadata();
- if (fb_metadata != nullptr) {
- std::shared_ptr<KeyValueMetadata> md;
- RETURN_NOT_OK(internal::GetKeyValueMetadata(fb_metadata, &md));
- self->metadata_ = std::move(md); // const-ify
- }
- return Status::OK();
- });
+ auto self = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this());
+ auto read_magic = file_->ReadAsync(footer_offset_ - file_end_size, file_end_size);
+ if (executor) read_magic = executor->Transfer(std::move(read_magic));
+ return read_magic
+ .Then([=](const std::shared_ptr<Buffer>& buffer)
+ -> Future<std::shared_ptr<Buffer>> {
+ const int64_t expected_footer_size = magic_size + sizeof(int32_t);
+ if (buffer->size() < expected_footer_size) {
+ return Status::Invalid("Unable to read ", expected_footer_size,
+ "from end of file");
+ }
+
+ if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, magic_size)) {
+ return Status::Invalid("Not an Arrow file");
+ }
+
+ int32_t footer_length = BitUtil::FromLittleEndian(
+ *reinterpret_cast<const int32_t*>(buffer->data()));
+
+ if (footer_length <= 0 ||
+ footer_length > self->footer_offset_ - magic_size * 2 - 4) {
+ return Status::Invalid("File is smaller than indicated metadata size");
+ }
+
+ // Now read the footer
+ auto read_footer = self->file_->ReadAsync(
+ self->footer_offset_ - footer_length - file_end_size, footer_length);
+ if (executor) read_footer = executor->Transfer(std::move(read_footer));
+ return read_footer;
+ })
+ .Then([=](const std::shared_ptr<Buffer>& buffer) -> Status {
+ self->footer_buffer_ = buffer;
+ const auto data = self->footer_buffer_->data();
+ const auto size = self->footer_buffer_->size();
+ if (!internal::VerifyFlatbuffers<flatbuf::Footer>(data, size)) {
+ return Status::IOError("Verification of flatbuffer-encoded Footer failed.");
+ }
+ self->footer_ = flatbuf::GetFooter(data);
+
+ auto fb_metadata = self->footer_->custom_metadata();
+ if (fb_metadata != nullptr) {
+ std::shared_ptr<KeyValueMetadata> md;
+ RETURN_NOT_OK(internal::GetKeyValueMetadata(fb_metadata, &md));
+ self->metadata_ = std::move(md); // const-ify
+ }
+ return Status::OK();
+ });
}
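The tail layout this footer logic assumes is: <footer flatbuffer> <little-endian int32 footer length> <6-byte magic "ARROW1">, so the footer's start offset is computed as:

const int64_t kFileEndSize = 6 + sizeof(int32_t);  // strlen(kArrowMagicBytes) + length field
int64_t footer_start = footer_offset - footer_length - kFileEndSize;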
int num_dictionaries() const {
@@ -1300,8 +1300,8 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader {
std::shared_ptr<Schema> out_schema_;
ReadStats stats_;
-
- bool swap_endian_;
+
+ bool swap_endian_;
};
Result<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::Open(
@@ -1331,109 +1331,109 @@ Result<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::Open(
return result;
}
-Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
- const std::shared_ptr<io::RandomAccessFile>& file, const IpcReadOptions& options) {
- ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize());
- return OpenAsync(std::move(file), footer_offset, options);
-}
-
-Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
- io::RandomAccessFile* file, const IpcReadOptions& options) {
- ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize());
- return OpenAsync(file, footer_offset, options);
-}
-
-Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
- const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
- const IpcReadOptions& options) {
- auto result = std::make_shared<RecordBatchFileReaderImpl>();
- return result->OpenAsync(file, footer_offset, options)
- .Then([=]() -> Result<std::shared_ptr<RecordBatchFileReader>> { return result; });
-}
-
-Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
- io::RandomAccessFile* file, int64_t footer_offset, const IpcReadOptions& options) {
- auto result = std::make_shared<RecordBatchFileReaderImpl>();
- return result->OpenAsync(file, footer_offset, options)
- .Then([=]() -> Result<std::shared_ptr<RecordBatchFileReader>> { return result; });
-}
-
-Future<IpcFileRecordBatchGenerator::Item> IpcFileRecordBatchGenerator::operator()() {
- auto state = state_;
- if (!read_dictionaries_.is_valid()) {
- std::vector<Future<std::shared_ptr<Message>>> messages(state->num_dictionaries());
- for (int i = 0; i < state->num_dictionaries(); i++) {
- auto block = FileBlockFromFlatbuffer(state->footer_->dictionaries()->Get(i));
- messages[i] = ReadBlock(block);
- }
- auto read_messages = All(std::move(messages));
- if (executor_) read_messages = executor_->Transfer(read_messages);
- read_dictionaries_ = read_messages.Then(
- [=](const std::vector<Result<std::shared_ptr<Message>>>& maybe_messages)
- -> Status {
- ARROW_ASSIGN_OR_RAISE(auto messages,
- arrow::internal::UnwrapOrRaise(maybe_messages));
- return ReadDictionaries(state.get(), std::move(messages));
- });
- }
- if (index_ >= state_->num_record_batches()) {
- return Future<Item>::MakeFinished(IterationTraits<Item>::End());
- }
- auto block = FileBlockFromFlatbuffer(state->footer_->recordBatches()->Get(index_++));
- auto read_message = ReadBlock(block);
- auto read_messages = read_dictionaries_.Then([read_message]() { return read_message; });
- // Force transfer. This may be wasteful in some cases, but ensures we get off the
- // I/O threads as soon as possible, and ensures we don't decode record batches
- // synchronously in the case that the message read has already finished.
- if (executor_) {
- auto executor = executor_;
- return read_messages.Then(
- [=](const std::shared_ptr<Message>& message) -> Future<Item> {
- return DeferNotOk(executor->Submit(
- [=]() { return ReadRecordBatch(state.get(), message.get()); }));
- });
- }
- return read_messages.Then([=](const std::shared_ptr<Message>& message) -> Result<Item> {
- return ReadRecordBatch(state.get(), message.get());
- });
-}
-
-Future<std::shared_ptr<Message>> IpcFileRecordBatchGenerator::ReadBlock(
- const FileBlock& block) {
- if (cached_source_) {
- auto cached_source = cached_source_;
- io::ReadRange range{block.offset, block.metadata_length + block.body_length};
- auto pool = state_->options_.memory_pool;
- return cached_source->WaitFor({range}).Then(
- [cached_source, pool, range]() -> Result<std::shared_ptr<Message>> {
- ARROW_ASSIGN_OR_RAISE(auto buffer, cached_source->Read(range));
- io::BufferReader stream(std::move(buffer));
- return ReadMessage(&stream, pool);
- });
- } else {
- return ReadMessageFromBlockAsync(block, state_->file_, io_context_);
- }
-}
-
-Status IpcFileRecordBatchGenerator::ReadDictionaries(
- RecordBatchFileReaderImpl* state,
- std::vector<std::shared_ptr<Message>> dictionary_messages) {
- IpcReadContext context(&state->dictionary_memo_, state->options_, state->swap_endian_);
- for (const auto& message : dictionary_messages) {
- RETURN_NOT_OK(ReadOneDictionary(message.get(), context));
- }
- return Status::OK();
-}
-
-Result<std::shared_ptr<RecordBatch>> IpcFileRecordBatchGenerator::ReadRecordBatch(
- RecordBatchFileReaderImpl* state, Message* message) {
- CHECK_HAS_BODY(*message);
- ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
- IpcReadContext context(&state->dictionary_memo_, state->options_, state->swap_endian_);
- return ReadRecordBatchInternal(*message->metadata(), state->schema_,
- state->field_inclusion_mask_, context, reader.get());
-}
-
+Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
+ const std::shared_ptr<io::RandomAccessFile>& file, const IpcReadOptions& options) {
+ ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize());
+ return OpenAsync(std::move(file), footer_offset, options);
+}
+
+Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
+ io::RandomAccessFile* file, const IpcReadOptions& options) {
+ ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize());
+ return OpenAsync(file, footer_offset, options);
+}
+
+Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
+ const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
+ const IpcReadOptions& options) {
+ auto result = std::make_shared<RecordBatchFileReaderImpl>();
+ return result->OpenAsync(file, footer_offset, options)
+ .Then([=]() -> Result<std::shared_ptr<RecordBatchFileReader>> { return result; });
+}
+
+Future<std::shared_ptr<RecordBatchFileReader>> RecordBatchFileReader::OpenAsync(
+ io::RandomAccessFile* file, int64_t footer_offset, const IpcReadOptions& options) {
+ auto result = std::make_shared<RecordBatchFileReaderImpl>();
+ return result->OpenAsync(file, footer_offset, options)
+ .Then([=]() -> Result<std::shared_ptr<RecordBatchFileReader>> { return result; });
+}
+
+Future<IpcFileRecordBatchGenerator::Item> IpcFileRecordBatchGenerator::operator()() {
+ auto state = state_;
+ if (!read_dictionaries_.is_valid()) {
+ std::vector<Future<std::shared_ptr<Message>>> messages(state->num_dictionaries());
+ for (int i = 0; i < state->num_dictionaries(); i++) {
+ auto block = FileBlockFromFlatbuffer(state->footer_->dictionaries()->Get(i));
+ messages[i] = ReadBlock(block);
+ }
+ auto read_messages = All(std::move(messages));
+ if (executor_) read_messages = executor_->Transfer(read_messages);
+ read_dictionaries_ = read_messages.Then(
+ [=](const std::vector<Result<std::shared_ptr<Message>>>& maybe_messages)
+ -> Status {
+ ARROW_ASSIGN_OR_RAISE(auto messages,
+ arrow::internal::UnwrapOrRaise(maybe_messages));
+ return ReadDictionaries(state.get(), std::move(messages));
+ });
+ }
+ if (index_ >= state_->num_record_batches()) {
+ return Future<Item>::MakeFinished(IterationTraits<Item>::End());
+ }
+ auto block = FileBlockFromFlatbuffer(state->footer_->recordBatches()->Get(index_++));
+ auto read_message = ReadBlock(block);
+ auto read_messages = read_dictionaries_.Then([read_message]() { return read_message; });
+ // Force transfer. This may be wasteful in some cases, but ensures we get off the
+ // I/O threads as soon as possible, and ensures we don't decode record batches
+ // synchronously in the case that the message read has already finished.
+ if (executor_) {
+ auto executor = executor_;
+ return read_messages.Then(
+ [=](const std::shared_ptr<Message>& message) -> Future<Item> {
+ return DeferNotOk(executor->Submit(
+ [=]() { return ReadRecordBatch(state.get(), message.get()); }));
+ });
+ }
+ return read_messages.Then([=](const std::shared_ptr<Message>& message) -> Result<Item> {
+ return ReadRecordBatch(state.get(), message.get());
+ });
+}
+
+Future<std::shared_ptr<Message>> IpcFileRecordBatchGenerator::ReadBlock(
+ const FileBlock& block) {
+ if (cached_source_) {
+ auto cached_source = cached_source_;
+ io::ReadRange range{block.offset, block.metadata_length + block.body_length};
+ auto pool = state_->options_.memory_pool;
+ return cached_source->WaitFor({range}).Then(
+ [cached_source, pool, range]() -> Result<std::shared_ptr<Message>> {
+ ARROW_ASSIGN_OR_RAISE(auto buffer, cached_source->Read(range));
+ io::BufferReader stream(std::move(buffer));
+ return ReadMessage(&stream, pool);
+ });
+ } else {
+ return ReadMessageFromBlockAsync(block, state_->file_, io_context_);
+ }
+}
+
+Status IpcFileRecordBatchGenerator::ReadDictionaries(
+ RecordBatchFileReaderImpl* state,
+ std::vector<std::shared_ptr<Message>> dictionary_messages) {
+ IpcReadContext context(&state->dictionary_memo_, state->options_, state->swap_endian_);
+ for (const auto& message : dictionary_messages) {
+ RETURN_NOT_OK(ReadOneDictionary(message.get(), context));
+ }
+ return Status::OK();
+}
+
+Result<std::shared_ptr<RecordBatch>> IpcFileRecordBatchGenerator::ReadRecordBatch(
+ RecordBatchFileReaderImpl* state, Message* message) {
+ CHECK_HAS_BODY(*message);
+ ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
+ IpcReadContext context(&state->dictionary_memo_, state->options_, state->swap_endian_);
+ return ReadRecordBatchInternal(*message->metadata(), state->schema_,
+ state->field_inclusion_mask_, context, reader.get());
+}
+
Status Listener::OnEOS() { return Status::OK(); }
Status Listener::OnSchemaDecoded(std::shared_ptr<Schema> schema) { return Status::OK(); }
@@ -1452,16 +1452,16 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener {
};
public:
- explicit StreamDecoderImpl(std::shared_ptr<Listener> listener, IpcReadOptions options)
- : listener_(std::move(listener)),
- options_(std::move(options)),
+ explicit StreamDecoderImpl(std::shared_ptr<Listener> listener, IpcReadOptions options)
+ : listener_(std::move(listener)),
+ options_(std::move(options)),
state_(State::SCHEMA),
message_decoder_(std::shared_ptr<StreamDecoderImpl>(this, [](void*) {}),
options_.memory_pool),
- n_required_dictionaries_(0) {}
+ n_required_dictionaries_(0) {}
Status OnMessageDecoded(std::unique_ptr<Message> message) override {
- ++stats_.num_messages;
+ ++stats_.num_messages;
switch (state_) {
case State::SCHEMA:
ARROW_RETURN_NOT_OK(OnSchemaMessageDecoded(std::move(message)));
@@ -1495,13 +1495,13 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener {
int64_t next_required_size() const { return message_decoder_.next_required_size(); }
- ReadStats stats() const { return stats_; }
-
+ ReadStats stats() const { return stats_; }
+
private:
Status OnSchemaMessageDecoded(std::unique_ptr<Message> message) {
RETURN_NOT_OK(UnpackSchemaMessage(*message, options_, &dictionary_memo_, &schema_,
- &out_schema_, &field_inclusion_mask_,
- &swap_endian_));
+ &out_schema_, &field_inclusion_mask_,
+ &swap_endian_));
n_required_dictionaries_ = dictionary_memo_.fields().num_fields();
if (n_required_dictionaries_ == 0) {
@@ -1529,54 +1529,54 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener {
}
Status OnRecordBatchMessageDecoded(std::unique_ptr<Message> message) {
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
if (message->type() == MessageType::DICTIONARY_BATCH) {
return ReadDictionary(*message);
} else {
CHECK_HAS_BODY(*message);
ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body()));
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
ARROW_ASSIGN_OR_RAISE(
auto batch,
ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_,
- context, reader.get()));
- ++stats_.num_record_batches;
+ context, reader.get()));
+ ++stats_.num_record_batches;
return listener_->OnRecordBatchDecoded(std::move(batch));
}
}
// Read dictionary from dictionary batch
Status ReadDictionary(const Message& message) {
- DictionaryKind kind;
- IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
- RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind));
- ++stats_.num_dictionary_batches;
- switch (kind) {
- case DictionaryKind::New:
- break;
- case DictionaryKind::Delta:
- ++stats_.num_dictionary_deltas;
- break;
- case DictionaryKind::Replacement:
- ++stats_.num_replaced_dictionaries;
- break;
- }
- return Status::OK();
+ DictionaryKind kind;
+ IpcReadContext context(&dictionary_memo_, options_, swap_endian_);
+ RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind));
+ ++stats_.num_dictionary_batches;
+ switch (kind) {
+ case DictionaryKind::New:
+ break;
+ case DictionaryKind::Delta:
+ ++stats_.num_dictionary_deltas;
+ break;
+ case DictionaryKind::Replacement:
+ ++stats_.num_replaced_dictionaries;
+ break;
+ }
+ return Status::OK();
}
std::shared_ptr<Listener> listener_;
- const IpcReadOptions options_;
+ const IpcReadOptions options_;
State state_;
MessageDecoder message_decoder_;
std::vector<bool> field_inclusion_mask_;
int n_required_dictionaries_;
DictionaryMemo dictionary_memo_;
std::shared_ptr<Schema> schema_, out_schema_;
- ReadStats stats_;
- bool swap_endian_;
+ ReadStats stats_;
+ bool swap_endian_;
};
-StreamDecoder::StreamDecoder(std::shared_ptr<Listener> listener, IpcReadOptions options) {
+StreamDecoder::StreamDecoder(std::shared_ptr<Listener> listener, IpcReadOptions options) {
impl_.reset(new StreamDecoderImpl(std::move(listener), options));
}
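A push-style usage sketch; CollectingListener is a hypothetical subclass written for this example, while Listener::OnRecordBatchDecoded and StreamDecoder::Consume are the real extension points:

class CollectingListener : public arrow::ipc::Listener {
 public:
  arrow::Status OnRecordBatchDecoded(
      std::shared_ptr<arrow::RecordBatch> batch) override {
    batches.push_back(std::move(batch));
    return arrow::Status::OK();
  }
  std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
};

arrow::ipc::StreamDecoder decoder(std::make_shared<CollectingListener>());
ARROW_RETURN_NOT_OK(decoder.Consume(buffer));  // buffer: std::shared_ptr<arrow::Buffer>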
@@ -1593,8 +1593,8 @@ std::shared_ptr<Schema> StreamDecoder::schema() const { return impl_->schema();
int64_t StreamDecoder::next_required_size() const { return impl_->next_required_size(); }
-ReadStats StreamDecoder::stats() const { return impl_->stats(); }
-
+ReadStats StreamDecoder::stats() const { return impl_->stats(); }
+
Result<std::shared_ptr<Schema>> ReadSchema(io::InputStream* stream,
DictionaryMemo* dictionary_memo) {
std::unique_ptr<MessageReader> reader = MessageReader::Open(stream);
@@ -2059,23 +2059,23 @@ Status FuzzIpcFile(const uint8_t* data, int64_t size) {
return Status::OK();
}
-Status FuzzIpcTensorStream(const uint8_t* data, int64_t size) {
- auto buffer = std::make_shared<Buffer>(data, size);
- io::BufferReader buffer_reader(buffer);
-
- std::shared_ptr<Tensor> tensor;
-
- while (true) {
- ARROW_ASSIGN_OR_RAISE(tensor, ReadTensor(&buffer_reader));
- if (tensor == nullptr) {
- break;
- }
- RETURN_NOT_OK(tensor->Validate());
- }
-
- return Status::OK();
-}
-
+Status FuzzIpcTensorStream(const uint8_t* data, int64_t size) {
+ auto buffer = std::make_shared<Buffer>(data, size);
+ io::BufferReader buffer_reader(buffer);
+
+ std::shared_ptr<Tensor> tensor;
+
+ while (true) {
+ ARROW_ASSIGN_OR_RAISE(tensor, ReadTensor(&buffer_reader));
+ if (tensor == nullptr) {
+ break;
+ }
+ RETURN_NOT_OK(tensor->Validate());
+ }
+
+ return Status::OK();
+}
+
} // namespace internal
} // namespace ipc
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h
index 60db2837a68..6f2157557f3 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/reader.h
@@ -25,14 +25,14 @@
#include <utility>
#include <vector>
-#include "arrow/io/caching.h"
-#include "arrow/io/type_fwd.h"
+#include "arrow/io/caching.h"
+#include "arrow/io/type_fwd.h"
#include "arrow/ipc/message.h"
#include "arrow/ipc/options.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/async_generator.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/async_generator.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
@@ -101,8 +101,8 @@ class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader {
};
/// \brief Reads the record batch file format
-class ARROW_EXPORT RecordBatchFileReader
- : public std::enable_shared_from_this<RecordBatchFileReader> {
+class ARROW_EXPORT RecordBatchFileReader
+ : public std::enable_shared_from_this<RecordBatchFileReader> {
public:
virtual ~RecordBatchFileReader() = default;
@@ -150,26 +150,26 @@ class ARROW_EXPORT RecordBatchFileReader
const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
const IpcReadOptions& options = IpcReadOptions::Defaults());
- /// \brief Open a file asynchronously (owns the file).
- static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
- const std::shared_ptr<io::RandomAccessFile>& file,
- const IpcReadOptions& options = IpcReadOptions::Defaults());
-
- /// \brief Open a file asynchronously (borrows the file).
- static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
- io::RandomAccessFile* file,
- const IpcReadOptions& options = IpcReadOptions::Defaults());
-
- /// \brief Open a file asynchronously (owns the file).
- static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
- const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
- const IpcReadOptions& options = IpcReadOptions::Defaults());
-
- /// \brief Open a file asynchronously (borrows the file).
- static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
- io::RandomAccessFile* file, int64_t footer_offset,
- const IpcReadOptions& options = IpcReadOptions::Defaults());
-
+ /// \brief Open a file asynchronously (owns the file).
+ static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+ const std::shared_ptr<io::RandomAccessFile>& file,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Open a file asynchronously (borrows the file).
+ static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+ io::RandomAccessFile* file,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Open a file asynchronously (owns the file).
+ static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+ const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+ /// \brief Open a file asynchronously (borrows the file).
+ static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+ io::RandomAccessFile* file, int64_t footer_offset,
+ const IpcReadOptions& options = IpcReadOptions::Defaults());
+
/// \brief The schema read from the file
virtual std::shared_ptr<Schema> schema() const = 0;
@@ -192,24 +192,24 @@ class ARROW_EXPORT RecordBatchFileReader
/// \brief Return current read statistics
virtual ReadStats stats() const = 0;
-
- /// \brief Computes the total number of rows in the file.
- virtual Result<int64_t> CountRows() = 0;
-
- /// \brief Get a reentrant generator of record batches.
- ///
- /// \param[in] coalesce If true, enable I/O coalescing.
- /// \param[in] io_context The IOContext to use (controls which thread pool
- /// is used for I/O).
- /// \param[in] cache_options Options for coalescing (if enabled).
- /// \param[in] executor Optionally, an executor to use for decoding record
- /// batches. This is generally only a benefit for very wide and/or
- /// compressed batches.
- virtual Result<AsyncGenerator<std::shared_ptr<RecordBatch>>> GetRecordBatchGenerator(
- const bool coalesce = false,
- const io::IOContext& io_context = io::default_io_context(),
- const io::CacheOptions cache_options = io::CacheOptions::LazyDefaults(),
- arrow::internal::Executor* executor = NULLPTR) = 0;
+
+ /// \brief Computes the total number of rows in the file.
+ virtual Result<int64_t> CountRows() = 0;
+
+ /// \brief Get a reentrant generator of record batches.
+ ///
+ /// \param[in] coalesce If true, enable I/O coalescing.
+ /// \param[in] io_context The IOContext to use (controls which thread pool
+ /// is used for I/O).
+ /// \param[in] cache_options Options for coalescing (if enabled).
+ /// \param[in] executor Optionally, an executor to use for decoding record
+ /// batches. This is generally only a benefit for very wide and/or
+ /// compressed batches.
+ virtual Result<AsyncGenerator<std::shared_ptr<RecordBatch>>> GetRecordBatchGenerator(
+ const bool coalesce = false,
+ const io::IOContext& io_context = io::default_io_context(),
+ const io::CacheOptions cache_options = io::CacheOptions::LazyDefaults(),
+ arrow::internal::Executor* executor = NULLPTR) = 0;
};
/// \brief A general listener class to receive events.
@@ -304,7 +304,7 @@ class ARROW_EXPORT StreamDecoder {
/// Listener::OnRecordBatchDecoded() to receive decoded record batches
/// \param[in] options any IPC reading options (optional)
StreamDecoder(std::shared_ptr<Listener> listener,
- IpcReadOptions options = IpcReadOptions::Defaults());
+ IpcReadOptions options = IpcReadOptions::Defaults());
virtual ~StreamDecoder();
@@ -380,7 +380,7 @@ class ARROW_EXPORT StreamDecoder {
/// memcpy(buffer->mutable_data() + current_buffer_size,
/// small_chunk,
/// small_chunk_size);
- /// if (buffer->size() < decoder.next_required_size()) {
+ /// if (buffer->size() < decoder.next_required_size()) {
/// continue;
/// }
/// std::shared_ptr<arrow::Buffer> chunk(buffer.release());
@@ -397,9 +397,9 @@ class ARROW_EXPORT StreamDecoder {
/// decoder
int64_t next_required_size() const;
- /// \brief Return current read statistics
- ReadStats stats() const;
-
+ /// \brief Return current read statistics
+ ReadStats stats() const;
+
private:
class StreamDecoderImpl;
std::unique_ptr<StreamDecoderImpl> impl_;
@@ -526,8 +526,8 @@ Result<std::shared_ptr<SparseTensor>> ReadSparseTensorPayload(const IpcPayload&
ARROW_EXPORT
Status FuzzIpcStream(const uint8_t* data, int64_t size);
ARROW_EXPORT
-Status FuzzIpcTensorStream(const uint8_t* data, int64_t size);
-ARROW_EXPORT
+Status FuzzIpcTensorStream(const uint8_t* data, int64_t size);
+ARROW_EXPORT
Status FuzzIpcFile(const uint8_t* data, int64_t size);
} // namespace internal
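
Note: the four OpenAsync() overloads restored above mirror the synchronous
Open() variants: the shared_ptr overloads keep the file alive, the raw-pointer
overloads merely borrow it, and the footer_offset variants handle IPC files
embedded in a larger stream. A hedged usage sketch (the file path is
illustrative):

#include "arrow/io/file.h"
#include "arrow/ipc/reader.h"
#include "arrow/util/macros.h"

arrow::Status ReadFooterAsync() {
  ARROW_ASSIGN_OR_RAISE(auto file,
                        arrow::io::ReadableFile::Open("/tmp/batches.arrow"));
  // Owning overload: the returned reader keeps `file` alive.
  auto future = arrow::ipc::RecordBatchFileReader::OpenAsync(file);
  ARROW_ASSIGN_OR_RAISE(auto reader, future.result());  // blocks until ready
  // CountRows() reads batch metadata without decoding the batch bodies.
  ARROW_ASSIGN_OR_RAISE(int64_t num_rows, reader->CountRows());
  ARROW_UNUSED(num_rows);
  return arrow::Status::OK();
}
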
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h
index abb1dbc2dd6..3493c4f1409 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/type_fwd.h
@@ -47,9 +47,9 @@ enum class MessageType {
SPARSE_TENSOR
};
-struct IpcReadOptions;
-struct IpcWriteOptions;
-
+struct IpcReadOptions;
+struct IpcWriteOptions;
+
class MessageReader;
class RecordBatchStreamReader;
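
Note: type_fwd.h exists so that other headers can name IPC types without
including their full definitions; the hunk above keeps the options structs in
that list. An illustrative consumer (the function is hypothetical, shown only
for the include pattern):

#include "arrow/ipc/type_fwd.h"

// A reference to the options struct needs only the forward declaration; the
// full arrow/ipc/options.h include can stay in the implementation file.
void ConfigureReader(const arrow::ipc::IpcReadOptions& options);
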
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc
index 7bb86316497..7b9254b7e59 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.cc
@@ -49,7 +49,7 @@
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/compression.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/util/make_unique.h"
@@ -70,18 +70,18 @@ using internal::kArrowMagicBytes;
namespace {
-bool HasNestedDict(const ArrayData& data) {
- if (data.type->id() == Type::DICTIONARY) {
- return true;
- }
- for (const auto& child : data.child_data) {
- if (HasNestedDict(*child)) {
- return true;
- }
- }
- return false;
-}
-
+bool HasNestedDict(const ArrayData& data) {
+ if (data.type->id() == Type::DICTIONARY) {
+ return true;
+ }
+ for (const auto& child : data.child_data) {
+ if (HasNestedDict(*child)) {
+ return true;
+ }
+ }
+ return false;
+}
+
Status GetTruncatedBitmap(int64_t offset, int64_t length,
const std::shared_ptr<Buffer> input, MemoryPool* pool,
std::shared_ptr<Buffer>* buffer) {
@@ -557,7 +557,7 @@ class DictionarySerializer : public RecordBatchSerializer {
Status Assemble(const std::shared_ptr<Array>& dictionary) {
// Make a dummy record batch. A bit tedious as we have to make a schema
auto schema = arrow::schema({arrow::field("dictionary", dictionary->type())});
- auto batch = RecordBatch::Make(std::move(schema), dictionary->length(), {dictionary});
+ auto batch = RecordBatch::Make(std::move(schema), dictionary->length(), {dictionary});
return RecordBatchSerializer::Assemble(*batch);
}
@@ -997,21 +997,21 @@ class ARROW_EXPORT IpcFormatWriter : public RecordBatchWriter {
IpcPayload payload;
RETURN_NOT_OK(GetRecordBatchPayload(batch, options_, &payload));
- RETURN_NOT_OK(WritePayload(payload));
- ++stats_.num_record_batches;
- return Status::OK();
+ RETURN_NOT_OK(WritePayload(payload));
+ ++stats_.num_record_batches;
+ return Status::OK();
+ }
+
+ Status WriteTable(const Table& table, int64_t max_chunksize) override {
+ if (is_file_format_ && options_.unify_dictionaries) {
+ ARROW_ASSIGN_OR_RAISE(auto unified_table,
+ DictionaryUnifier::UnifyTable(table, options_.memory_pool));
+ return RecordBatchWriter::WriteTable(*unified_table, max_chunksize);
+ } else {
+ return RecordBatchWriter::WriteTable(table, max_chunksize);
+ }
}
- Status WriteTable(const Table& table, int64_t max_chunksize) override {
- if (is_file_format_ && options_.unify_dictionaries) {
- ARROW_ASSIGN_OR_RAISE(auto unified_table,
- DictionaryUnifier::UnifyTable(table, options_.memory_pool));
- return RecordBatchWriter::WriteTable(*unified_table, max_chunksize);
- } else {
- return RecordBatchWriter::WriteTable(table, max_chunksize);
- }
- }
-
Status Close() override {
RETURN_NOT_OK(CheckStarted());
return payload_writer_->Close();
@@ -1023,11 +1023,11 @@ class ARROW_EXPORT IpcFormatWriter : public RecordBatchWriter {
IpcPayload payload;
RETURN_NOT_OK(GetSchemaPayload(schema_, options_, mapper_, &payload));
- return WritePayload(payload);
+ return WritePayload(payload);
}
- WriteStats stats() const override { return stats_; }
-
+ WriteStats stats() const override { return stats_; }
+
protected:
Status CheckStarted() {
if (!started_) {
@@ -1038,7 +1038,7 @@ class ARROW_EXPORT IpcFormatWriter : public RecordBatchWriter {
Status WriteDictionaries(const RecordBatch& batch) {
ARROW_ASSIGN_OR_RAISE(const auto dictionaries, CollectDictionaries(batch, mapper_));
- const auto equal_options = EqualOptions().nans_equal(true);
+ const auto equal_options = EqualOptions().nans_equal(true);
for (const auto& pair : dictionaries) {
int64_t dictionary_id = pair.first;
@@ -1047,57 +1047,57 @@ class ARROW_EXPORT IpcFormatWriter : public RecordBatchWriter {
// If a dictionary with this id was already emitted, check if it was the same.
auto* last_dictionary = &last_dictionaries_[dictionary_id];
const bool dictionary_exists = (*last_dictionary != nullptr);
- int64_t delta_start = 0;
+ int64_t delta_start = 0;
if (dictionary_exists) {
if ((*last_dictionary)->data() == dictionary->data()) {
// Fast shortcut for a common case.
// Same dictionary data by pointer => no need to emit it again
continue;
}
- const int64_t last_length = (*last_dictionary)->length();
- const int64_t new_length = dictionary->length();
- if (new_length == last_length &&
- ((*last_dictionary)->Equals(dictionary, equal_options))) {
+ const int64_t last_length = (*last_dictionary)->length();
+ const int64_t new_length = dictionary->length();
+ if (new_length == last_length &&
+ ((*last_dictionary)->Equals(dictionary, equal_options))) {
// Same dictionary by value => no need to emit it again
// (while this can have a CPU cost, this code path is required
// for the IPC file format)
continue;
}
- if (is_file_format_) {
- return Status::Invalid(
- "Dictionary replacement detected when writing IPC file format. "
- "Arrow IPC files only support a single dictionary for a given field "
- "across all batches.");
- }
-
- // (the read path doesn't support outer dictionary deltas, don't emit them)
- if (new_length > last_length && options_.emit_dictionary_deltas &&
- !HasNestedDict(*dictionary->data()) &&
- ((*last_dictionary)
- ->RangeEquals(dictionary, 0, last_length, 0, equal_options))) {
- // New dictionary starts with the current dictionary
- delta_start = last_length;
- }
+ if (is_file_format_) {
+ return Status::Invalid(
+ "Dictionary replacement detected when writing IPC file format. "
+ "Arrow IPC files only support a single dictionary for a given field "
+ "across all batches.");
+ }
+
+      // (the read path doesn't support outer dictionary deltas, so don't emit them)
+ if (new_length > last_length && options_.emit_dictionary_deltas &&
+ !HasNestedDict(*dictionary->data()) &&
+ ((*last_dictionary)
+ ->RangeEquals(dictionary, 0, last_length, 0, equal_options))) {
+ // New dictionary starts with the current dictionary
+ delta_start = last_length;
+ }
}
- IpcPayload payload;
- if (delta_start) {
- RETURN_NOT_OK(GetDictionaryPayload(dictionary_id, /*is_delta=*/true,
- dictionary->Slice(delta_start), options_,
- &payload));
- } else {
- RETURN_NOT_OK(
- GetDictionaryPayload(dictionary_id, dictionary, options_, &payload));
+ IpcPayload payload;
+ if (delta_start) {
+ RETURN_NOT_OK(GetDictionaryPayload(dictionary_id, /*is_delta=*/true,
+ dictionary->Slice(delta_start), options_,
+ &payload));
+ } else {
+ RETURN_NOT_OK(
+ GetDictionaryPayload(dictionary_id, dictionary, options_, &payload));
+ }
+ RETURN_NOT_OK(WritePayload(payload));
+ ++stats_.num_dictionary_batches;
+ if (dictionary_exists) {
+ if (delta_start) {
+ ++stats_.num_dictionary_deltas;
+ } else {
+ ++stats_.num_replaced_dictionaries;
+ }
}
- RETURN_NOT_OK(WritePayload(payload));
- ++stats_.num_dictionary_batches;
- if (dictionary_exists) {
- if (delta_start) {
- ++stats_.num_dictionary_deltas;
- } else {
- ++stats_.num_replaced_dictionaries;
- }
- }
// Remember dictionary for next batches
*last_dictionary = dictionary;
@@ -1105,12 +1105,12 @@ class ARROW_EXPORT IpcFormatWriter : public RecordBatchWriter {
return Status::OK();
}
- Status WritePayload(const IpcPayload& payload) {
- RETURN_NOT_OK(payload_writer_->WritePayload(payload));
- ++stats_.num_messages;
- return Status::OK();
- }
-
+ Status WritePayload(const IpcPayload& payload) {
+ RETURN_NOT_OK(payload_writer_->WritePayload(payload));
+ ++stats_.num_messages;
+ return Status::OK();
+ }
+
std::unique_ptr<IpcPayloadWriter> payload_writer_;
std::shared_ptr<Schema> shared_schema_;
const Schema& schema_;
@@ -1126,7 +1126,7 @@ class ARROW_EXPORT IpcFormatWriter : public RecordBatchWriter {
bool started_ = false;
IpcWriteOptions options_;
- WriteStats stats_;
+ WriteStats stats_;
};
class StreamBookKeeper {
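
Note: per the WriteDictionaries() hunks above, a delta is emitted only when
emit_dictionary_deltas is set, the dictionary contains no nested dictionary,
and the new dictionary prefix-extends the previous one (RangeEquals over the
old length); anything else is a replacement, which is an error in the file
format. A hedged sketch of observing this through WriteStats, assuming
batch2's dictionary extends batch1's:

#include "arrow/io/memory.h"
#include "arrow/ipc/writer.h"
#include "arrow/record_batch.h"

arrow::Status WriteWithDeltas(const std::shared_ptr<arrow::RecordBatch>& batch1,
                              const std::shared_ptr<arrow::RecordBatch>& batch2) {
  ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::BufferOutputStream::Create());
  auto options = arrow::ipc::IpcWriteOptions::Defaults();
  options.emit_dictionary_deltas = true;  // honored by the stream format only
  ARROW_ASSIGN_OR_RAISE(
      auto writer,
      arrow::ipc::MakeStreamWriter(sink, batch1->schema(), options));
  ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*batch1));
  ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*batch2));
  ARROW_RETURN_NOT_OK(writer->Close());
  // Expect exactly one delta if batch2's dictionary prefix-extended batch1's.
  return writer->stats().num_dictionary_deltas == 1
             ? arrow::Status::OK()
             : arrow::Status::Invalid("expected a dictionary delta");
}
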
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h
index 05d62d1bcad..0ea83d7630a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/ipc/writer.h
@@ -60,23 +60,23 @@ struct IpcPayload {
int64_t body_length = 0;
};
-struct WriteStats {
- /// Number of IPC messages written.
- int64_t num_messages = 0;
- /// Number of record batches written.
- int64_t num_record_batches = 0;
- /// Number of dictionary batches written.
- ///
- /// Note: num_dictionary_batches >= num_dictionary_deltas + num_replaced_dictionaries
- int64_t num_dictionary_batches = 0;
-
- /// Number of dictionary deltas written.
- int64_t num_dictionary_deltas = 0;
- /// Number of replaced dictionaries (i.e. where a dictionary batch replaces
- /// an existing dictionary with an unrelated new dictionary).
- int64_t num_replaced_dictionaries = 0;
-};
-
+struct WriteStats {
+ /// Number of IPC messages written.
+ int64_t num_messages = 0;
+ /// Number of record batches written.
+ int64_t num_record_batches = 0;
+ /// Number of dictionary batches written.
+ ///
+ /// Note: num_dictionary_batches >= num_dictionary_deltas + num_replaced_dictionaries
+ int64_t num_dictionary_batches = 0;
+
+ /// Number of dictionary deltas written.
+ int64_t num_dictionary_deltas = 0;
+ /// Number of replaced dictionaries (i.e. where a dictionary batch replaces
+ /// an existing dictionary with an unrelated new dictionary).
+ int64_t num_replaced_dictionaries = 0;
+};
+
/// \class RecordBatchWriter
/// \brief Abstract interface for writing a stream of record batches
class ARROW_EXPORT RecordBatchWriter {
@@ -96,25 +96,25 @@ class ARROW_EXPORT RecordBatchWriter {
/// \brief Write Table with a particular chunksize
/// \param[in] table table to write
- /// \param[in] max_chunksize maximum length of table chunks. To indicate
- /// that no maximum should be enforced, pass -1.
+ /// \param[in] max_chunksize maximum length of table chunks. To indicate
+ /// that no maximum should be enforced, pass -1.
/// \return Status
- virtual Status WriteTable(const Table& table, int64_t max_chunksize);
+ virtual Status WriteTable(const Table& table, int64_t max_chunksize);
/// \brief Perform any logic necessary to finish the stream
///
/// \return Status
virtual Status Close() = 0;
-
- /// \brief Return current write statistics
- virtual WriteStats stats() const = 0;
+
+ /// \brief Return current write statistics
+ virtual WriteStats stats() const = 0;
};
-/// \defgroup record-batch-writer-factories Functions for creating RecordBatchWriter
-/// instances
-///
-/// @{
-
+/// \defgroup record-batch-writer-factories Functions for creating RecordBatchWriter
+/// instances
+///
+/// @{
+
/// Create a new IPC stream writer from stream sink and schema. User is
/// responsible for closing the actual OutputStream.
///
@@ -165,14 +165,14 @@ Result<std::shared_ptr<RecordBatchWriter>> MakeFileWriter(
const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
-/// @}
-
-ARROW_DEPRECATED("Use MakeStreamWriter")
-ARROW_EXPORT
-Result<std::shared_ptr<RecordBatchWriter>> NewStreamWriter(
- io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
- const IpcWriteOptions& options = IpcWriteOptions::Defaults());
-
+/// @}
+
+ARROW_DEPRECATED("Use MakeStreamWriter")
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchWriter>> NewStreamWriter(
+ io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+ const IpcWriteOptions& options = IpcWriteOptions::Defaults());
+
ARROW_DEPRECATED("Use MakeFileWriter")
ARROW_EXPORT
Result<std::shared_ptr<RecordBatchWriter>> NewFileWriter(
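
Note: per the doc comment above, WriteTable() splits a Table into record
batches of at most max_chunksize rows, and -1 disables the limit. A minimal
sketch, assuming a populated table:

#include "arrow/io/memory.h"
#include "arrow/ipc/writer.h"
#include "arrow/table.h"

arrow::Status WriteChunked(const std::shared_ptr<arrow::Table>& table) {
  ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::BufferOutputStream::Create());
  ARROW_ASSIGN_OR_RAISE(auto writer,
                        arrow::ipc::MakeStreamWriter(sink, table->schema()));
  // Emit batches of at most 1024 rows; pass -1 to keep chunks as they are.
  ARROW_RETURN_NOT_OK(writer->WriteTable(*table, /*max_chunksize=*/1024));
  return writer->Close();
}
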
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc b/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc
index eb5c3643dd4..2d6f3176224 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.cc
@@ -18,32 +18,32 @@
#include "arrow/memory_pool.h"
#include <algorithm> // IWYU pragma: keep
-#include <atomic>
-#include <cstdlib> // IWYU pragma: keep
-#include <cstring> // IWYU pragma: keep
-#include <iostream> // IWYU pragma: keep
+#include <atomic>
+#include <cstdlib> // IWYU pragma: keep
+#include <cstring> // IWYU pragma: keep
+#include <iostream> // IWYU pragma: keep
#include <limits>
#include <memory>
-#if defined(sun) || defined(__sun)
-#include <stdlib.h>
-#endif
-
-#include "arrow/buffer.h"
-#include "arrow/io/util_internal.h"
-#include "arrow/result.h"
+#if defined(sun) || defined(__sun)
+#include <stdlib.h>
+#endif
+
+#include "arrow/buffer.h"
+#include "arrow/io/util_internal.h"
+#include "arrow/result.h"
#include "arrow/status.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/io_util.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/io_util.h"
#include "arrow/util/logging.h" // IWYU pragma: keep
-#include "arrow/util/optional.h"
-#include "arrow/util/string.h"
-#include "arrow/util/thread_pool.h"
-
-#ifdef __GLIBC__
-#include <malloc.h>
-#endif
-
+#include "arrow/util/optional.h"
+#include "arrow/util/string.h"
+#include "arrow/util/thread_pool.h"
+
+#ifdef __GLIBC__
+#include <malloc.h>
+#endif
+
#ifdef ARROW_JEMALLOC
// Needed to support jemalloc 3 and 4
#define JEMALLOC_MANGLE
@@ -101,88 +101,88 @@ const char* je_arrow_malloc_conf =
namespace arrow {
-namespace {
-
+namespace {
+
constexpr size_t kAlignment = 64;
-constexpr char kDefaultBackendEnvVar[] = "ARROW_DEFAULT_MEMORY_POOL";
-
-enum class MemoryPoolBackend : uint8_t { System, Jemalloc, Mimalloc };
-
-struct SupportedBackend {
- const char* name;
- MemoryPoolBackend backend;
-};
-
-// See ARROW-12248 for why we use static in-function singletons rather than
-// global constants below (in SupportedBackends() and UserSelectedBackend()).
-// In some contexts (especially R bindings) `default_memory_pool()` may be
-// called before all globals are initialized, and then the ARROW_DEFAULT_MEMORY_POOL
-// environment variable would be ignored.
-
-const std::vector<SupportedBackend>& SupportedBackends() {
- static std::vector<SupportedBackend> backends = {
- // ARROW-12316: Apple => mimalloc first, then jemalloc
- // non-Apple => jemalloc first, then mimalloc
-#if defined(ARROW_JEMALLOC) && !defined(__APPLE__)
- {"jemalloc", MemoryPoolBackend::Jemalloc},
-#endif
-#ifdef ARROW_MIMALLOC
- {"mimalloc", MemoryPoolBackend::Mimalloc},
-#endif
-#if defined(ARROW_JEMALLOC) && defined(__APPLE__)
- {"jemalloc", MemoryPoolBackend::Jemalloc},
-#endif
- {"system", MemoryPoolBackend::System}
- };
- return backends;
-}
-
-// Return the MemoryPoolBackend selected by the user through the
-// ARROW_DEFAULT_MEMORY_POOL environment variable, if any.
-util::optional<MemoryPoolBackend> UserSelectedBackend() {
- static auto user_selected_backend = []() -> util::optional<MemoryPoolBackend> {
- auto unsupported_backend = [](const std::string& name) {
- std::vector<std::string> supported;
- for (const auto backend : SupportedBackends()) {
- supported.push_back(std::string("'") + backend.name + "'");
- }
- ARROW_LOG(WARNING) << "Unsupported backend '" << name << "' specified in "
- << kDefaultBackendEnvVar << " (supported backends are "
- << internal::JoinStrings(supported, ", ") << ")";
- };
-
- auto maybe_name = internal::GetEnvVar(kDefaultBackendEnvVar);
- if (!maybe_name.ok()) {
- return {};
- }
- const auto name = *std::move(maybe_name);
- if (name.empty()) {
- // An empty environment variable is considered missing
- return {};
- }
- const auto found = std::find_if(
- SupportedBackends().begin(), SupportedBackends().end(),
- [&](const SupportedBackend& backend) { return name == backend.name; });
- if (found != SupportedBackends().end()) {
- return found->backend;
- }
- unsupported_backend(name);
- return {};
- }();
-
- return user_selected_backend;
-}
-
-MemoryPoolBackend DefaultBackend() {
- auto backend = UserSelectedBackend();
- if (backend.has_value()) {
- return backend.value();
- }
- struct SupportedBackend default_backend = SupportedBackends().front();
- return default_backend.backend;
-}
-
+constexpr char kDefaultBackendEnvVar[] = "ARROW_DEFAULT_MEMORY_POOL";
+
+enum class MemoryPoolBackend : uint8_t { System, Jemalloc, Mimalloc };
+
+struct SupportedBackend {
+ const char* name;
+ MemoryPoolBackend backend;
+};
+
+// See ARROW-12248 for why we use static in-function singletons rather than
+// global constants below (in SupportedBackends() and UserSelectedBackend()).
+// In some contexts (especially R bindings) `default_memory_pool()` may be
+// called before all globals are initialized, and then the ARROW_DEFAULT_MEMORY_POOL
+// environment variable would be ignored.
+
+const std::vector<SupportedBackend>& SupportedBackends() {
+ static std::vector<SupportedBackend> backends = {
+ // ARROW-12316: Apple => mimalloc first, then jemalloc
+ // non-Apple => jemalloc first, then mimalloc
+#if defined(ARROW_JEMALLOC) && !defined(__APPLE__)
+ {"jemalloc", MemoryPoolBackend::Jemalloc},
+#endif
+#ifdef ARROW_MIMALLOC
+ {"mimalloc", MemoryPoolBackend::Mimalloc},
+#endif
+#if defined(ARROW_JEMALLOC) && defined(__APPLE__)
+ {"jemalloc", MemoryPoolBackend::Jemalloc},
+#endif
+ {"system", MemoryPoolBackend::System}
+ };
+ return backends;
+}
+
+// Return the MemoryPoolBackend selected by the user through the
+// ARROW_DEFAULT_MEMORY_POOL environment variable, if any.
+util::optional<MemoryPoolBackend> UserSelectedBackend() {
+ static auto user_selected_backend = []() -> util::optional<MemoryPoolBackend> {
+ auto unsupported_backend = [](const std::string& name) {
+ std::vector<std::string> supported;
+ for (const auto backend : SupportedBackends()) {
+ supported.push_back(std::string("'") + backend.name + "'");
+ }
+ ARROW_LOG(WARNING) << "Unsupported backend '" << name << "' specified in "
+ << kDefaultBackendEnvVar << " (supported backends are "
+ << internal::JoinStrings(supported, ", ") << ")";
+ };
+
+ auto maybe_name = internal::GetEnvVar(kDefaultBackendEnvVar);
+ if (!maybe_name.ok()) {
+ return {};
+ }
+ const auto name = *std::move(maybe_name);
+ if (name.empty()) {
+ // An empty environment variable is considered missing
+ return {};
+ }
+ const auto found = std::find_if(
+ SupportedBackends().begin(), SupportedBackends().end(),
+ [&](const SupportedBackend& backend) { return name == backend.name; });
+ if (found != SupportedBackends().end()) {
+ return found->backend;
+ }
+ unsupported_backend(name);
+ return {};
+ }();
+
+ return user_selected_backend;
+}
+
+MemoryPoolBackend DefaultBackend() {
+ auto backend = UserSelectedBackend();
+ if (backend.has_value()) {
+ return backend.value();
+ }
+ struct SupportedBackend default_backend = SupportedBackends().front();
+ return default_backend.backend;
+}
+
// A static piece of memory for 0-size allocations, so as to return
// an aligned non-null pointer.
alignas(kAlignment) static uint8_t zero_size_area[1];
@@ -204,11 +204,11 @@ class SystemAllocator {
if (!*out) {
return Status::OutOfMemory("malloc of size ", size, " failed");
}
-#elif defined(sun) || defined(__sun)
- *out = reinterpret_cast<uint8_t*>(memalign(kAlignment, static_cast<size_t>(size)));
- if (!*out) {
- return Status::OutOfMemory("malloc of size ", size, " failed");
- }
+#elif defined(sun) || defined(__sun)
+ *out = reinterpret_cast<uint8_t*>(memalign(kAlignment, static_cast<size_t>(size)));
+ if (!*out) {
+ return Status::OutOfMemory("malloc of size ", size, " failed");
+ }
#else
const int result = posix_memalign(reinterpret_cast<void**>(out), kAlignment,
static_cast<size_t>(size));
@@ -262,14 +262,14 @@ class SystemAllocator {
#endif
}
}
-
- static void ReleaseUnused() {
-#ifdef __GLIBC__
- // The return value of malloc_trim is not an error but to inform
- // you if memory was actually released or not, which we do not care about here
- ARROW_UNUSED(malloc_trim(0));
-#endif
- }
+
+ static void ReleaseUnused() {
+#ifdef __GLIBC__
+    // malloc_trim's return value only indicates whether memory was actually
+    // released, which we do not care about here
+ ARROW_UNUSED(malloc_trim(0));
+#endif
+ }
};
#ifdef ARROW_JEMALLOC
@@ -317,10 +317,10 @@ class JemallocAllocator {
dallocx(ptr, MALLOCX_ALIGN(kAlignment));
}
}
-
- static void ReleaseUnused() {
- mallctl("arena." ARROW_STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, 0);
- }
+
+ static void ReleaseUnused() {
+ mallctl("arena." ARROW_STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", NULL, NULL, NULL, 0);
+ }
};
#endif // defined(ARROW_JEMALLOC)
@@ -343,8 +343,8 @@ class MimallocAllocator {
return Status::OK();
}
- static void ReleaseUnused() { mi_collect(true); }
-
+ static void ReleaseUnused() { mi_collect(true); }
+
static Status ReallocateAligned(int64_t old_size, int64_t new_size, uint8_t** ptr) {
uint8_t* previous_ptr = *ptr;
if (previous_ptr == zero_size_area) {
@@ -451,8 +451,8 @@ class BaseMemoryPoolImpl : public MemoryPool {
stats_.UpdateAllocatedBytes(-size);
}
- void ReleaseUnused() override { Allocator::ReleaseUnused(); }
-
+ void ReleaseUnused() override { Allocator::ReleaseUnused(); }
+
int64_t bytes_allocated() const override { return stats_.bytes_allocated(); }
int64_t max_memory() const override { return stats_.max_memory(); }
@@ -480,46 +480,46 @@ class MimallocMemoryPool : public BaseMemoryPoolImpl<MimallocAllocator> {
};
#endif
-std::unique_ptr<MemoryPool> MemoryPool::CreateDefault() {
- auto backend = DefaultBackend();
- switch (backend) {
- case MemoryPoolBackend::System:
- return std::unique_ptr<MemoryPool>(new SystemMemoryPool);
+std::unique_ptr<MemoryPool> MemoryPool::CreateDefault() {
+ auto backend = DefaultBackend();
+ switch (backend) {
+ case MemoryPoolBackend::System:
+ return std::unique_ptr<MemoryPool>(new SystemMemoryPool);
#ifdef ARROW_JEMALLOC
- case MemoryPoolBackend::Jemalloc:
- return std::unique_ptr<MemoryPool>(new JemallocMemoryPool);
+ case MemoryPoolBackend::Jemalloc:
+ return std::unique_ptr<MemoryPool>(new JemallocMemoryPool);
+#endif
+#ifdef ARROW_MIMALLOC
+ case MemoryPoolBackend::Mimalloc:
+ return std::unique_ptr<MemoryPool>(new MimallocMemoryPool);
#endif
-#ifdef ARROW_MIMALLOC
- case MemoryPoolBackend::Mimalloc:
- return std::unique_ptr<MemoryPool>(new MimallocMemoryPool);
-#endif
- default:
- ARROW_LOG(FATAL) << "Internal error: cannot create default memory pool";
- return nullptr;
- }
+ default:
+ ARROW_LOG(FATAL) << "Internal error: cannot create default memory pool";
+ return nullptr;
+ }
}
-static struct GlobalState {
- ~GlobalState() { finalizing.store(true, std::memory_order_relaxed); }
-
- bool is_finalizing() const { return finalizing.load(std::memory_order_relaxed); }
-
- std::atomic<bool> finalizing{false}; // constructed first, destroyed last
-
- SystemMemoryPool system_pool;
+static struct GlobalState {
+ ~GlobalState() { finalizing.store(true, std::memory_order_relaxed); }
+
+ bool is_finalizing() const { return finalizing.load(std::memory_order_relaxed); }
+
+ std::atomic<bool> finalizing{false}; // constructed first, destroyed last
+
+ SystemMemoryPool system_pool;
#ifdef ARROW_JEMALLOC
- JemallocMemoryPool jemalloc_pool;
+ JemallocMemoryPool jemalloc_pool;
#endif
#ifdef ARROW_MIMALLOC
- MimallocMemoryPool mimalloc_pool;
+ MimallocMemoryPool mimalloc_pool;
#endif
-} global_state;
+} global_state;
-MemoryPool* system_memory_pool() { return &global_state.system_pool; }
+MemoryPool* system_memory_pool() { return &global_state.system_pool; }
Status jemalloc_memory_pool(MemoryPool** out) {
#ifdef ARROW_JEMALLOC
- *out = &global_state.jemalloc_pool;
+ *out = &global_state.jemalloc_pool;
return Status::OK();
#else
return Status::NotImplemented("This Arrow build does not enable jemalloc");
@@ -528,7 +528,7 @@ Status jemalloc_memory_pool(MemoryPool** out) {
Status mimalloc_memory_pool(MemoryPool** out) {
#ifdef ARROW_MIMALLOC
- *out = &global_state.mimalloc_pool;
+ *out = &global_state.mimalloc_pool;
return Status::OK();
#else
return Status::NotImplemented("This Arrow build does not enable mimalloc");
@@ -536,22 +536,22 @@ Status mimalloc_memory_pool(MemoryPool** out) {
}
MemoryPool* default_memory_pool() {
- auto backend = DefaultBackend();
- switch (backend) {
- case MemoryPoolBackend::System:
- return &global_state.system_pool;
+ auto backend = DefaultBackend();
+ switch (backend) {
+ case MemoryPoolBackend::System:
+ return &global_state.system_pool;
#ifdef ARROW_JEMALLOC
- case MemoryPoolBackend::Jemalloc:
- return &global_state.jemalloc_pool;
+ case MemoryPoolBackend::Jemalloc:
+ return &global_state.jemalloc_pool;
+#endif
+#ifdef ARROW_MIMALLOC
+ case MemoryPoolBackend::Mimalloc:
+ return &global_state.mimalloc_pool;
#endif
-#ifdef ARROW_MIMALLOC
- case MemoryPoolBackend::Mimalloc:
- return &global_state.mimalloc_pool;
-#endif
- default:
- ARROW_LOG(FATAL) << "Internal error: cannot create default memory pool";
- return nullptr;
- }
+ default:
+ ARROW_LOG(FATAL) << "Internal error: cannot create default memory pool";
+ return nullptr;
+ }
}
#define RETURN_IF_JEMALLOC_ERROR(ERR) \
@@ -674,124 +674,124 @@ int64_t ProxyMemoryPool::max_memory() const { return impl_->max_memory(); }
std::string ProxyMemoryPool::backend_name() const { return impl_->backend_name(); }
-std::vector<std::string> SupportedMemoryBackendNames() {
- std::vector<std::string> supported;
- for (const auto backend : SupportedBackends()) {
- supported.push_back(backend.name);
- }
- return supported;
-}
-
-// -----------------------------------------------------------------------
-// Pool buffer and allocation
-
-/// A Buffer whose lifetime is tied to a particular MemoryPool
-class PoolBuffer final : public ResizableBuffer {
- public:
- explicit PoolBuffer(std::shared_ptr<MemoryManager> mm, MemoryPool* pool)
- : ResizableBuffer(nullptr, 0, std::move(mm)), pool_(pool) {}
-
- ~PoolBuffer() override {
- // Avoid calling pool_->Free if the global pools are destroyed
- // (XXX this will not work with user-defined pools)
-
- // This can happen if a Future is destructing on one thread while or
- // after memory pools are destructed on the main thread (as there is
- // no guarantee of destructor order between thread/memory pools)
- uint8_t* ptr = mutable_data();
- if (ptr && !global_state.is_finalizing()) {
- pool_->Free(ptr, capacity_);
- }
- }
-
- Status Reserve(const int64_t capacity) override {
- if (capacity < 0) {
- return Status::Invalid("Negative buffer capacity: ", capacity);
- }
- uint8_t* ptr = mutable_data();
- if (!ptr || capacity > capacity_) {
- int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(capacity);
- if (ptr) {
- RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &ptr));
- } else {
- RETURN_NOT_OK(pool_->Allocate(new_capacity, &ptr));
- }
- data_ = ptr;
- capacity_ = new_capacity;
- }
- return Status::OK();
- }
-
- Status Resize(const int64_t new_size, bool shrink_to_fit = true) override {
- if (ARROW_PREDICT_FALSE(new_size < 0)) {
- return Status::Invalid("Negative buffer resize: ", new_size);
- }
- uint8_t* ptr = mutable_data();
- if (ptr && shrink_to_fit && new_size <= size_) {
- // Buffer is non-null and is not growing, so shrink to the requested size without
- // excess space.
- int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(new_size);
- if (capacity_ != new_capacity) {
- // Buffer hasn't got yet the requested size.
- RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &ptr));
- data_ = ptr;
- capacity_ = new_capacity;
- }
- } else {
- RETURN_NOT_OK(Reserve(new_size));
- }
- size_ = new_size;
-
- return Status::OK();
- }
-
- static std::shared_ptr<PoolBuffer> MakeShared(MemoryPool* pool) {
- std::shared_ptr<MemoryManager> mm;
- if (pool == nullptr) {
- pool = default_memory_pool();
- mm = default_cpu_memory_manager();
- } else {
- mm = CPUDevice::memory_manager(pool);
- }
- return std::make_shared<PoolBuffer>(std::move(mm), pool);
- }
-
- static std::unique_ptr<PoolBuffer> MakeUnique(MemoryPool* pool) {
- std::shared_ptr<MemoryManager> mm;
- if (pool == nullptr) {
- pool = default_memory_pool();
- mm = default_cpu_memory_manager();
- } else {
- mm = CPUDevice::memory_manager(pool);
- }
- return std::unique_ptr<PoolBuffer>(new PoolBuffer(std::move(mm), pool));
- }
-
- private:
- MemoryPool* pool_;
-};
-
-namespace {
-// A utility that does most of the work of the `AllocateBuffer` and
-// `AllocateResizableBuffer` methods. The argument `buffer` should be a smart pointer to
-// a PoolBuffer.
-template <typename BufferPtr, typename PoolBufferPtr>
-inline Result<BufferPtr> ResizePoolBuffer(PoolBufferPtr&& buffer, const int64_t size) {
- RETURN_NOT_OK(buffer->Resize(size));
- buffer->ZeroPadding();
- return std::move(buffer);
-}
-
-} // namespace
-
-Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size, MemoryPool* pool) {
- return ResizePoolBuffer<std::unique_ptr<Buffer>>(PoolBuffer::MakeUnique(pool), size);
-}
-
-Result<std::unique_ptr<ResizableBuffer>> AllocateResizableBuffer(const int64_t size,
- MemoryPool* pool) {
- return ResizePoolBuffer<std::unique_ptr<ResizableBuffer>>(PoolBuffer::MakeUnique(pool),
- size);
-}
-
+std::vector<std::string> SupportedMemoryBackendNames() {
+ std::vector<std::string> supported;
+ for (const auto backend : SupportedBackends()) {
+ supported.push_back(backend.name);
+ }
+ return supported;
+}
+
+// -----------------------------------------------------------------------
+// Pool buffer and allocation
+
+/// A Buffer whose lifetime is tied to a particular MemoryPool
+class PoolBuffer final : public ResizableBuffer {
+ public:
+ explicit PoolBuffer(std::shared_ptr<MemoryManager> mm, MemoryPool* pool)
+ : ResizableBuffer(nullptr, 0, std::move(mm)), pool_(pool) {}
+
+ ~PoolBuffer() override {
+ // Avoid calling pool_->Free if the global pools are destroyed
+ // (XXX this will not work with user-defined pools)
+
+    // This can happen if a Future is destructing on one thread while, or
+    // after, memory pools are destroyed on the main thread (there is no
+    // guarantee of destruction order between threads and memory pools)
+ uint8_t* ptr = mutable_data();
+ if (ptr && !global_state.is_finalizing()) {
+ pool_->Free(ptr, capacity_);
+ }
+ }
+
+ Status Reserve(const int64_t capacity) override {
+ if (capacity < 0) {
+ return Status::Invalid("Negative buffer capacity: ", capacity);
+ }
+ uint8_t* ptr = mutable_data();
+ if (!ptr || capacity > capacity_) {
+ int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(capacity);
+ if (ptr) {
+ RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &ptr));
+ } else {
+ RETURN_NOT_OK(pool_->Allocate(new_capacity, &ptr));
+ }
+ data_ = ptr;
+ capacity_ = new_capacity;
+ }
+ return Status::OK();
+ }
+
+ Status Resize(const int64_t new_size, bool shrink_to_fit = true) override {
+ if (ARROW_PREDICT_FALSE(new_size < 0)) {
+ return Status::Invalid("Negative buffer resize: ", new_size);
+ }
+ uint8_t* ptr = mutable_data();
+ if (ptr && shrink_to_fit && new_size <= size_) {
+ // Buffer is non-null and is not growing, so shrink to the requested size without
+ // excess space.
+ int64_t new_capacity = BitUtil::RoundUpToMultipleOf64(new_size);
+ if (capacity_ != new_capacity) {
+        // Buffer capacity does not yet match the requested size.
+ RETURN_NOT_OK(pool_->Reallocate(capacity_, new_capacity, &ptr));
+ data_ = ptr;
+ capacity_ = new_capacity;
+ }
+ } else {
+ RETURN_NOT_OK(Reserve(new_size));
+ }
+ size_ = new_size;
+
+ return Status::OK();
+ }
+
+ static std::shared_ptr<PoolBuffer> MakeShared(MemoryPool* pool) {
+ std::shared_ptr<MemoryManager> mm;
+ if (pool == nullptr) {
+ pool = default_memory_pool();
+ mm = default_cpu_memory_manager();
+ } else {
+ mm = CPUDevice::memory_manager(pool);
+ }
+ return std::make_shared<PoolBuffer>(std::move(mm), pool);
+ }
+
+ static std::unique_ptr<PoolBuffer> MakeUnique(MemoryPool* pool) {
+ std::shared_ptr<MemoryManager> mm;
+ if (pool == nullptr) {
+ pool = default_memory_pool();
+ mm = default_cpu_memory_manager();
+ } else {
+ mm = CPUDevice::memory_manager(pool);
+ }
+ return std::unique_ptr<PoolBuffer>(new PoolBuffer(std::move(mm), pool));
+ }
+
+ private:
+ MemoryPool* pool_;
+};
+
+namespace {
+// A utility that does most of the work of the `AllocateBuffer` and
+// `AllocateResizableBuffer` methods. The argument `buffer` should be a smart pointer to
+// a PoolBuffer.
+template <typename BufferPtr, typename PoolBufferPtr>
+inline Result<BufferPtr> ResizePoolBuffer(PoolBufferPtr&& buffer, const int64_t size) {
+ RETURN_NOT_OK(buffer->Resize(size));
+ buffer->ZeroPadding();
+ return std::move(buffer);
+}
+
+} // namespace
+
+Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size, MemoryPool* pool) {
+ return ResizePoolBuffer<std::unique_ptr<Buffer>>(PoolBuffer::MakeUnique(pool), size);
+}
+
+Result<std::unique_ptr<ResizableBuffer>> AllocateResizableBuffer(const int64_t size,
+ MemoryPool* pool) {
+ return ResizePoolBuffer<std::unique_ptr<ResizableBuffer>>(PoolBuffer::MakeUnique(pool),
+ size);
+}
+
} // namespace arrow
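
Note: DefaultBackend() above consults ARROW_DEFAULT_MEMORY_POOL exactly once
through a function-local static (see the ARROW-12248 comment), warns on an
unrecognized name, and otherwise falls back to the first compiled-in backend.
The selection can be observed at run time:

#include <iostream>
#include <string>

#include "arrow/memory_pool.h"

int main() {
  // e.g. launch with: ARROW_DEFAULT_MEMORY_POOL=system ./pool_info
  arrow::MemoryPool* pool = arrow::default_memory_pool();
  std::cout << "backend: " << pool->backend_name() << "\n";
  for (const auto& name : arrow::SupportedMemoryBackendNames()) {
    std::cout << "supported: " << name << "\n";
  }
  return 0;
}
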
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h b/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h
index 45c49ff5cc8..81b1b112dc7 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/memory_pool.h
@@ -63,7 +63,7 @@ class MemoryPoolStats {
/// take care of the required 64-byte alignment.
class ARROW_EXPORT MemoryPool {
public:
- virtual ~MemoryPool() = default;
+ virtual ~MemoryPool() = default;
/// \brief EXPERIMENTAL. Create a new instance of the default MemoryPool
static std::unique_ptr<MemoryPool> CreateDefault();
@@ -87,13 +87,13 @@ class ARROW_EXPORT MemoryPool {
/// faster deallocation if supported by its backend.
virtual void Free(uint8_t* buffer, int64_t size) = 0;
- /// Return unused memory to the OS
- ///
- /// Only applies to allocators that hold onto unused memory. This will be
- /// best effort, a memory pool may not implement this feature or may be
- /// unable to fulfill the request due to fragmentation.
- virtual void ReleaseUnused() {}
-
+ /// Return unused memory to the OS
+ ///
+  /// Only applies to allocators that hold onto unused memory. This is
+  /// best-effort: a memory pool may not implement this feature, or may be
+  /// unable to fulfill the request due to fragmentation.
+ virtual void ReleaseUnused() {}
+
/// The number of bytes that were allocated and not yet free'd through
/// this allocator.
virtual int64_t bytes_allocated() const = 0;
@@ -104,11 +104,11 @@ class ARROW_EXPORT MemoryPool {
/// returns -1
virtual int64_t max_memory() const;
- /// The name of the backend used by this MemoryPool (e.g. "system" or "jemalloc").
+ /// The name of the backend used by this MemoryPool (e.g. "system" or "jemalloc").
virtual std::string backend_name() const = 0;
protected:
- MemoryPool() = default;
+ MemoryPool() = default;
};
class ARROW_EXPORT LoggingMemoryPool : public MemoryPool {
@@ -156,10 +156,10 @@ class ARROW_EXPORT ProxyMemoryPool : public MemoryPool {
std::unique_ptr<ProxyMemoryPoolImpl> impl_;
};
-/// \brief Return a process-wide memory pool based on the system allocator.
+/// \brief Return a process-wide memory pool based on the system allocator.
ARROW_EXPORT MemoryPool* system_memory_pool();
-/// \brief Return a process-wide memory pool based on jemalloc.
+/// \brief Return a process-wide memory pool based on jemalloc.
///
/// May return NotImplemented if jemalloc is not available.
ARROW_EXPORT Status jemalloc_memory_pool(MemoryPool** out);
@@ -175,11 +175,11 @@ ARROW_EXPORT Status jemalloc_memory_pool(MemoryPool** out);
ARROW_EXPORT
Status jemalloc_set_decay_ms(int ms);
-/// \brief Return a process-wide memory pool based on mimalloc.
+/// \brief Return a process-wide memory pool based on mimalloc.
///
/// May return NotImplemented if mimalloc is not available.
ARROW_EXPORT Status mimalloc_memory_pool(MemoryPool** out);
-ARROW_EXPORT std::vector<std::string> SupportedMemoryBackendNames();
-
+ARROW_EXPORT std::vector<std::string> SupportedMemoryBackendNames();
+
} // namespace arrow
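
Note: ReleaseUnused() above is best-effort: per the memory_pool.cc hunks
earlier in this diff it maps to malloc_trim(0) under glibc, an arena purge
under jemalloc, and mi_collect(true) under mimalloc, and may otherwise be a
no-op. A small sketch exercising it after a transient allocation:

#include "arrow/buffer.h"
#include "arrow/memory_pool.h"
#include "arrow/result.h"

arrow::Status Churn() {
  arrow::MemoryPool* pool = arrow::default_memory_pool();
  ARROW_ASSIGN_OR_RAISE(auto buffer, arrow::AllocateBuffer(1 << 20, pool));
  buffer.reset();         // hand the megabyte back to the pool
  pool->ReleaseUnused();  // then ask the pool to return memory to the OS
  return arrow::Status::OK();
}
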
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc b/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc
index 8187af43345..8d1c16e0ed6 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.cc
@@ -15,8 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-#include "arrow/pretty_print.h"
-
+#include "arrow/pretty_print.h"
+
#include <algorithm>
#include <chrono>
#include <cstddef>
@@ -69,12 +69,12 @@ class PrettyPrinter {
};
void PrettyPrinter::OpenArray(const Array& array) {
- if (!options_.skip_new_lines) {
- Indent();
- }
+ if (!options_.skip_new_lines) {
+ Indent();
+ }
(*sink_) << "[";
if (array.length() > 0) {
- Newline();
+ Newline();
indent_ += options_.indent_size;
}
}
@@ -125,15 +125,15 @@ class ArrayPrinter : public PrettyPrinter {
if (skip_comma) {
skip_comma = false;
} else {
- (*sink_) << ",";
- Newline();
+ (*sink_) << ",";
+ Newline();
+ }
+ if (!options_.skip_new_lines) {
+ Indent();
}
- if (!options_.skip_new_lines) {
- Indent();
- }
if ((i >= options_.window) && (i < (array.length() - options_.window))) {
- (*sink_) << "...";
- Newline();
+ (*sink_) << "...";
+ Newline();
i = array.length() - options_.window - 1;
skip_comma = true;
} else if (array.IsNull(i)) {
@@ -142,7 +142,7 @@ class ArrayPrinter : public PrettyPrinter {
func(i);
}
}
- Newline();
+ Newline();
}
Status WriteDataValues(const BooleanArray& array) {
@@ -232,11 +232,11 @@ class ArrayPrinter : public PrettyPrinter {
return Status::OK();
}
- Status WriteDataValues(const Decimal256Array& array) {
- WriteValues(array, [&](int64_t i) { (*sink_) << array.FormatValue(i); });
- return Status::OK();
- }
-
+ Status WriteDataValues(const Decimal256Array& array) {
+ WriteValues(array, [&](int64_t i) { (*sink_) << array.FormatValue(i); });
+ return Status::OK();
+ }
+
template <typename T>
enable_if_list_like<typename T::TypeClass, Status> WriteDataValues(const T& array) {
bool skip_comma = true;
@@ -244,13 +244,13 @@ class ArrayPrinter : public PrettyPrinter {
if (skip_comma) {
skip_comma = false;
} else {
- (*sink_) << ",";
- Newline();
+ (*sink_) << ",";
+ Newline();
}
if ((i >= options_.window) && (i < (array.length() - options_.window))) {
Indent();
- (*sink_) << "...";
- Newline();
+ (*sink_) << "...";
+ Newline();
i = array.length() - options_.window - 1;
skip_comma = true;
} else if (array.IsNull(i)) {
@@ -259,11 +259,11 @@ class ArrayPrinter : public PrettyPrinter {
} else {
std::shared_ptr<Array> slice =
array.values()->Slice(array.value_offset(i), array.value_length(i));
- RETURN_NOT_OK(
- PrettyPrint(*slice, PrettyPrintOptions{indent_, options_.window}, sink_));
+ RETURN_NOT_OK(
+ PrettyPrint(*slice, PrettyPrintOptions{indent_, options_.window}, sink_));
}
}
- Newline();
+ Newline();
return Status::OK();
}
@@ -273,36 +273,36 @@ class ArrayPrinter : public PrettyPrinter {
if (skip_comma) {
skip_comma = false;
} else {
- (*sink_) << ",";
- Newline();
+ (*sink_) << ",";
+ Newline();
}
-
- if (!options_.skip_new_lines) {
- Indent();
- }
-
+
+ if (!options_.skip_new_lines) {
+ Indent();
+ }
+
if ((i >= options_.window) && (i < (array.length() - options_.window))) {
- (*sink_) << "...";
- Newline();
+ (*sink_) << "...";
+ Newline();
i = array.length() - options_.window - 1;
skip_comma = true;
} else if (array.IsNull(i)) {
(*sink_) << options_.null_rep;
} else {
- (*sink_) << "keys:";
- Newline();
+ (*sink_) << "keys:";
+ Newline();
auto keys_slice =
array.keys()->Slice(array.value_offset(i), array.value_length(i));
- RETURN_NOT_OK(PrettyPrint(*keys_slice,
- PrettyPrintOptions{indent_, options_.window}, sink_));
- Newline();
+ RETURN_NOT_OK(PrettyPrint(*keys_slice,
+ PrettyPrintOptions{indent_, options_.window}, sink_));
+ Newline();
Indent();
- (*sink_) << "values:";
- Newline();
+ (*sink_) << "values:";
+ Newline();
auto values_slice =
array.items()->Slice(array.value_offset(i), array.value_length(i));
- RETURN_NOT_OK(PrettyPrint(*values_slice,
- PrettyPrintOptions{indent_, options_.window}, sink_));
+ RETURN_NOT_OK(PrettyPrint(*values_slice,
+ PrettyPrintOptions{indent_, options_.window}, sink_));
}
}
(*sink_) << "\n";
@@ -341,7 +341,7 @@ class ArrayPrinter : public PrettyPrinter {
int64_t length) {
for (size_t i = 0; i < fields.size(); ++i) {
Newline();
- Indent();
+ Indent();
std::stringstream ss;
ss << "-- child " << i << " type: " << fields[i]->type()->ToString() << "\n";
Write(ss.str());
@@ -369,14 +369,14 @@ class ArrayPrinter : public PrettyPrinter {
RETURN_NOT_OK(WriteValidityBitmap(array));
Newline();
- Indent();
+ Indent();
Write("-- type_ids: ");
UInt8Array type_codes(array.length(), array.type_codes(), nullptr, 0, array.offset());
RETURN_NOT_OK(PrettyPrint(type_codes, indent_ + options_.indent_size, sink_));
if (array.mode() == UnionMode::DENSE) {
Newline();
- Indent();
+ Indent();
Write("-- value_offsets: ");
Int32Array value_offsets(
array.length(), checked_cast<const DenseUnionArray&>(array).value_offsets(),
@@ -395,13 +395,13 @@ class ArrayPrinter : public PrettyPrinter {
Status Visit(const DictionaryArray& array) {
Newline();
- Indent();
+ Indent();
Write("-- dictionary:\n");
RETURN_NOT_OK(
PrettyPrint(*array.dictionary(), indent_ + options_.indent_size, sink_));
Newline();
- Indent();
+ Indent();
Write("-- indices:\n");
return PrettyPrint(*array.indices(), indent_ + options_.indent_size, sink_);
}
@@ -452,7 +452,7 @@ Status ArrayPrinter::WriteValidityBitmap(const Array& array) {
if (array.null_count() > 0) {
Newline();
- Indent();
+ Indent();
BooleanArray is_valid(array.length(), array.null_bitmap(), nullptr, 0,
array.offset());
return PrettyPrint(is_valid, indent_ + options_.indent_size, sink_);
@@ -492,28 +492,28 @@ Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& op
for (int i = 0; i < indent; ++i) {
(*sink) << " ";
}
- (*sink) << "[";
- if (!options.skip_new_lines) {
- *sink << "\n";
- }
+ (*sink) << "[";
+ if (!options.skip_new_lines) {
+ *sink << "\n";
+ }
bool skip_comma = true;
for (int i = 0; i < num_chunks; ++i) {
if (skip_comma) {
skip_comma = false;
} else {
- (*sink) << ",";
- if (!options.skip_new_lines) {
- *sink << "\n";
- }
+ (*sink) << ",";
+ if (!options.skip_new_lines) {
+ *sink << "\n";
+ }
}
if ((i >= window) && (i < (num_chunks - window))) {
for (int i = 0; i < indent; ++i) {
(*sink) << " ";
}
- (*sink) << "...";
- if (!options.skip_new_lines) {
- *sink << "\n";
- }
+ (*sink) << "...";
+ if (!options.skip_new_lines) {
+ *sink << "\n";
+ }
i = num_chunks - window - 1;
skip_comma = true;
} else {
@@ -523,9 +523,9 @@ Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& op
RETURN_NOT_OK(printer.Print(*chunked_arr.chunk(i)));
}
}
- if (!options.skip_new_lines) {
- *sink << "\n";
- }
+ if (!options.skip_new_lines) {
+ *sink << "\n";
+ }
for (int i = 0; i < indent; ++i) {
(*sink) << " ";
@@ -605,7 +605,7 @@ class SchemaPrinter : public PrettyPrinter {
void PrintVerboseMetadata(const KeyValueMetadata& metadata) {
for (int64_t i = 0; i < metadata.size(); ++i) {
Newline();
- Indent();
+ Indent();
Write(metadata.key(i) + ": '" + metadata.value(i) + "'");
}
}
@@ -613,7 +613,7 @@ class SchemaPrinter : public PrettyPrinter {
void PrintTruncatedMetadata(const KeyValueMetadata& metadata) {
for (int64_t i = 0; i < metadata.size(); ++i) {
Newline();
- Indent();
+ Indent();
size_t size = metadata.value(i).size();
size_t truncated_size = std::max<size_t>(10, 70 - metadata.key(i).size() - indent_);
if (size <= truncated_size) {
@@ -629,7 +629,7 @@ class SchemaPrinter : public PrettyPrinter {
void PrintMetadata(const std::string& metadata_type, const KeyValueMetadata& metadata) {
if (metadata.size() > 0) {
Newline();
- Indent();
+ Indent();
Write(metadata_type);
if (options_.truncate_metadata) {
PrintTruncatedMetadata(metadata);
@@ -643,7 +643,7 @@ class SchemaPrinter : public PrettyPrinter {
for (int i = 0; i < schema_.num_fields(); ++i) {
if (i > 0) {
Newline();
- Indent();
+ Indent();
} else {
Indent();
}
@@ -668,7 +668,7 @@ Status SchemaPrinter::PrintType(const DataType& type, bool nullable) {
}
for (int i = 0; i < type.num_fields(); ++i) {
Newline();
- Indent();
+ Indent();
std::stringstream ss;
ss << "child " << i << ", ";
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h b/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h
index d85684cf460..1bc086a6889 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/pretty_print.h
@@ -19,7 +19,7 @@
#include <iosfwd>
#include <string>
-#include <utility>
+#include <utility>
#include "arrow/util/visibility.h"
@@ -35,14 +35,14 @@ class Table;
struct PrettyPrintOptions {
PrettyPrintOptions() = default;
- PrettyPrintOptions(int indent_arg, // NOLINT runtime/explicit
- int window_arg = 10, int indent_size_arg = 2,
+ PrettyPrintOptions(int indent_arg, // NOLINT runtime/explicit
+ int window_arg = 10, int indent_size_arg = 2,
std::string null_rep_arg = "null", bool skip_new_lines_arg = false,
bool truncate_metadata_arg = true)
: indent(indent_arg),
indent_size(indent_size_arg),
window(window_arg),
- null_rep(std::move(null_rep_arg)),
+ null_rep(std::move(null_rep_arg)),
skip_new_lines(skip_new_lines_arg),
truncate_metadata(truncate_metadata_arg) {}
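
Note: PrettyPrintOptions takes null_rep_arg by value and moves it into the
member, so a temporary string is moved rather than copied. Sketch:

#include <string>

#include "arrow/pretty_print.h"

arrow::PrettyPrintOptions MakeOptions() {
  // indent 2, window 5, indent_size 2; the temporary "NA" string is moved.
  return arrow::PrettyPrintOptions(/*indent_arg=*/2, /*window_arg=*/5,
                                   /*indent_size_arg=*/2, std::string("NA"));
}
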
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc b/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc
index 21703f3cf24..66f9e932b58 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.cc
@@ -69,14 +69,14 @@ class SimpleRecordBatch : public RecordBatch {
boxed_columns_.resize(schema_->num_fields());
}
- const std::vector<std::shared_ptr<Array>>& columns() const override {
- for (int i = 0; i < num_columns(); ++i) {
- // Force all columns to be boxed
- column(i);
- }
- return boxed_columns_;
- }
-
+ const std::vector<std::shared_ptr<Array>>& columns() const override {
+ for (int i = 0; i < num_columns(); ++i) {
+ // Force all columns to be boxed
+ column(i);
+ }
+ return boxed_columns_;
+ }
+
std::shared_ptr<Array> column(int i) const override {
std::shared_ptr<Array> result = internal::atomic_load(&boxed_columns_[i]);
if (!result) {
@@ -88,7 +88,7 @@ class SimpleRecordBatch : public RecordBatch {
std::shared_ptr<ArrayData> column_data(int i) const override { return columns_[i]; }
- const ArrayDataVector& column_data() const override { return columns_; }
+ const ArrayDataVector& column_data() const override { return columns_; }
Result<std::shared_ptr<RecordBatch>> AddColumn(
int i, const std::shared_ptr<Field>& field,
@@ -97,9 +97,9 @@ class SimpleRecordBatch : public RecordBatch {
ARROW_CHECK(column != nullptr);
if (!field->type()->Equals(column->type())) {
- return Status::TypeError("Column data type ", field->type()->name(),
- " does not match field data type ",
- column->type()->name());
+ return Status::TypeError("Column data type ", field->type()->name(),
+ " does not match field data type ",
+ column->type()->name());
}
if (column->length() != num_rows_) {
return Status::Invalid(
@@ -108,42 +108,42 @@ class SimpleRecordBatch : public RecordBatch {
}
ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->AddField(i, field));
- return RecordBatch::Make(std::move(new_schema), num_rows_,
+ return RecordBatch::Make(std::move(new_schema), num_rows_,
internal::AddVectorElement(columns_, i, column->data()));
}
- Result<std::shared_ptr<RecordBatch>> SetColumn(
- int i, const std::shared_ptr<Field>& field,
- const std::shared_ptr<Array>& column) const override {
- ARROW_CHECK(field != nullptr);
- ARROW_CHECK(column != nullptr);
-
- if (!field->type()->Equals(column->type())) {
- return Status::TypeError("Column data type ", field->type()->name(),
- " does not match field data type ",
- column->type()->name());
- }
- if (column->length() != num_rows_) {
- return Status::Invalid(
- "Added column's length must match record batch's length. Expected length ",
- num_rows_, " but got length ", column->length());
- }
-
- ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->SetField(i, field));
- return RecordBatch::Make(std::move(new_schema), num_rows_,
- internal::ReplaceVectorElement(columns_, i, column->data()));
- }
-
+ Result<std::shared_ptr<RecordBatch>> SetColumn(
+ int i, const std::shared_ptr<Field>& field,
+ const std::shared_ptr<Array>& column) const override {
+ ARROW_CHECK(field != nullptr);
+ ARROW_CHECK(column != nullptr);
+
+ if (!field->type()->Equals(column->type())) {
+ return Status::TypeError("Column data type ", field->type()->name(),
+ " does not match field data type ",
+ column->type()->name());
+ }
+ if (column->length() != num_rows_) {
+ return Status::Invalid(
+ "Added column's length must match record batch's length. Expected length ",
+ num_rows_, " but got length ", column->length());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->SetField(i, field));
+ return RecordBatch::Make(std::move(new_schema), num_rows_,
+ internal::ReplaceVectorElement(columns_, i, column->data()));
+ }
+
Result<std::shared_ptr<RecordBatch>> RemoveColumn(int i) const override {
ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->RemoveField(i));
- return RecordBatch::Make(std::move(new_schema), num_rows_,
+ return RecordBatch::Make(std::move(new_schema), num_rows_,
internal::DeleteVectorElement(columns_, i));
}
std::shared_ptr<RecordBatch> ReplaceSchemaMetadata(
const std::shared_ptr<const KeyValueMetadata>& metadata) const override {
auto new_schema = schema_->WithMetadata(metadata);
- return RecordBatch::Make(std::move(new_schema), num_rows_, columns_);
+ return RecordBatch::Make(std::move(new_schema), num_rows_, columns_);
}
std::shared_ptr<RecordBatch> Slice(int64_t offset, int64_t length) const override {
@@ -191,8 +191,8 @@ std::shared_ptr<RecordBatch> RecordBatch::Make(
Result<std::shared_ptr<RecordBatch>> RecordBatch::FromStructArray(
const std::shared_ptr<Array>& array) {
if (array->type_id() != Type::STRUCT) {
- return Status::TypeError("Cannot construct record batch from array of type ",
- *array->type());
+ return Status::TypeError("Cannot construct record batch from array of type ",
+ *array->type());
}
if (array->null_count() != 0) {
return Status::Invalid(
@@ -251,27 +251,27 @@ bool RecordBatch::ApproxEquals(const RecordBatch& other) const {
return true;
}
-Result<std::shared_ptr<RecordBatch>> RecordBatch::SelectColumns(
- const std::vector<int>& indices) const {
- int n = static_cast<int>(indices.size());
-
- FieldVector fields(n);
- ArrayVector columns(n);
-
- for (int i = 0; i < n; i++) {
- int pos = indices[i];
- if (pos < 0 || pos > num_columns() - 1) {
- return Status::Invalid("Invalid column index ", pos, " to select columns.");
- }
- fields[i] = schema()->field(pos);
- columns[i] = column(pos);
- }
-
- auto new_schema =
- std::make_shared<arrow::Schema>(std::move(fields), schema()->metadata());
- return RecordBatch::Make(std::move(new_schema), num_rows(), std::move(columns));
-}
-
+Result<std::shared_ptr<RecordBatch>> RecordBatch::SelectColumns(
+ const std::vector<int>& indices) const {
+ int n = static_cast<int>(indices.size());
+
+ FieldVector fields(n);
+ ArrayVector columns(n);
+
+ for (int i = 0; i < n; i++) {
+ int pos = indices[i];
+ if (pos < 0 || pos > num_columns() - 1) {
+ return Status::Invalid("Invalid column index ", pos, " to select columns.");
+ }
+ fields[i] = schema()->field(pos);
+ columns[i] = column(pos);
+ }
+
+ auto new_schema =
+ std::make_shared<arrow::Schema>(std::move(fields), schema()->metadata());
+ return RecordBatch::Make(std::move(new_schema), num_rows(), std::move(columns));
+}
+
std::shared_ptr<RecordBatch> RecordBatch::Slice(int64_t offset) const {
return Slice(offset, this->num_rows() - offset);
}
@@ -304,7 +304,7 @@ Status RecordBatch::ValidateFull() const {
RETURN_NOT_OK(Validate());
for (int i = 0; i < num_columns(); ++i) {
const auto& array = *this->column(i);
- RETURN_NOT_OK(internal::ValidateArrayFull(array));
+ RETURN_NOT_OK(internal::ValidateArrayFull(array));
}
return Status::OK();
}
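
Note: SelectColumns() restored above bounds-checks each index, then builds a
new schema (preserving metadata) and column vector in the given order. A
minimal sketch:

#include "arrow/record_batch.h"

arrow::Result<std::shared_ptr<arrow::RecordBatch>> FirstTwoColumns(
    const std::shared_ptr<arrow::RecordBatch>& batch) {
  // Returns Status::Invalid if an index is out of range; duplicate and
  // reordered indices are allowed.
  return batch->SelectColumns({0, 1});
}
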
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h b/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h
index 735d4f6f06b..3dc1f54a083 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/record_batch.h
@@ -87,10 +87,10 @@ class ARROW_EXPORT RecordBatch {
  /// \return the record batch's schema
- const std::shared_ptr<Schema>& schema() const { return schema_; }
+ const std::shared_ptr<Schema>& schema() const { return schema_; }
/// \brief Retrieve all columns at once
- virtual const std::vector<std::shared_ptr<Array>>& columns() const = 0;
+ virtual const std::vector<std::shared_ptr<Array>>& columns() const = 0;
/// \brief Retrieve an array from the record batch
/// \param[in] i field index, does not boundscheck
@@ -108,7 +108,7 @@ class ARROW_EXPORT RecordBatch {
virtual std::shared_ptr<ArrayData> column_data(int i) const = 0;
/// \brief Retrieve all arrays' internal data from the record batch.
- virtual const ArrayDataVector& column_data() const = 0;
+ virtual const ArrayDataVector& column_data() const = 0;
/// \brief Add column to the record batch, producing a new RecordBatch
///
@@ -130,11 +130,11 @@ class ARROW_EXPORT RecordBatch {
virtual Result<std::shared_ptr<RecordBatch>> AddColumn(
int i, std::string field_name, const std::shared_ptr<Array>& column) const;
- /// \brief Replace a column in the table, producing a new Table
- virtual Result<std::shared_ptr<RecordBatch>> SetColumn(
- int i, const std::shared_ptr<Field>& field,
- const std::shared_ptr<Array>& column) const = 0;
-
+ /// \brief Replace a column in the record batch, producing a new RecordBatch
+ virtual Result<std::shared_ptr<RecordBatch>> SetColumn(
+ int i, const std::shared_ptr<Field>& field,
+ const std::shared_ptr<Array>& column) const = 0;
+
/// \brief Remove column from the record batch, producing a new RecordBatch
///
/// \param[in] i field index, does boundscheck
@@ -166,10 +166,10 @@ class ARROW_EXPORT RecordBatch {
/// \return PrettyPrint representation suitable for debugging
std::string ToString() const;
- /// \brief Return new record batch with specified columns
- Result<std::shared_ptr<RecordBatch>> SelectColumns(
- const std::vector<int>& indices) const;
-
+ /// \brief Return new record batch with specified columns
+ Result<std::shared_ptr<RecordBatch>> SelectColumns(
+ const std::vector<int>& indices) const;
+
/// \brief Perform cheap validation checks to determine obvious inconsistencies
/// within the record batch's schema and internal data.
///
@@ -199,8 +199,8 @@ class ARROW_EXPORT RecordBatch {
/// \brief Abstract interface for reading stream of record batches
class ARROW_EXPORT RecordBatchReader {
public:
- using ValueType = std::shared_ptr<RecordBatch>;
-
+ using ValueType = std::shared_ptr<RecordBatch>;
+
virtual ~RecordBatchReader() = default;
/// \return the shared schema of the record batches in the stream
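
Aside: the ValueType alias added above advertises what a reader yields; a minimal drain loop might look like this (sketch, not part of the diff):

#include "arrow/api.h"

arrow::Status ConsumeAll(arrow::RecordBatchReader* reader) {
  arrow::RecordBatchReader::ValueType batch;  // std::shared_ptr<RecordBatch>
  while (true) {
    ARROW_RETURN_NOT_OK(reader->ReadNext(&batch));
    if (batch == nullptr) break;  // end of stream
    // ... process batch ...
  }
  return arrow::Status::OK();
}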
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/result.h b/contrib/libs/apache/arrow/cpp/src/arrow/result.h
index 21483c89533..cb7437cd242 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/result.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/result.h
@@ -18,7 +18,7 @@
#pragma once
-#include <cstddef>
+#include <cstddef>
#include <new>
#include <string>
#include <type_traits>
@@ -29,9 +29,9 @@
namespace arrow {
-template <typename>
-struct EnsureResult;
-
+template <typename>
+struct EnsureResult;
+
namespace internal {
#if __cplusplus >= 201703L
@@ -317,7 +317,7 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
return ValueUnsafe();
}
const T& operator*() const& { return ValueOrDie(); }
- const T* operator->() const { return &ValueOrDie(); }
+ const T* operator->() const { return &ValueOrDie(); }
/// Gets a mutable reference to the stored `T` value.
///
@@ -332,7 +332,7 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
return ValueUnsafe();
}
T& operator*() & { return ValueOrDie(); }
- T* operator->() { return &ValueOrDie(); }
+ T* operator->() { return &ValueOrDie(); }
/// Moves and returns the internally-stored `T` value.
///
@@ -385,7 +385,7 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
/// Apply a function to the internally stored value to produce a new result or propagate
/// the stored error.
template <typename M>
- typename EnsureResult<typename std::result_of<M && (T)>::type>::type Map(M&& m) && {
+ typename EnsureResult<typename std::result_of<M && (T)>::type>::type Map(M&& m) && {
if (!ok()) {
return status();
}
@@ -395,36 +395,36 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
/// Apply a function to the internally stored value to produce a new result or propagate
/// the stored error.
template <typename M>
- typename EnsureResult<typename std::result_of<M && (const T&)>::type>::type Map(
- M&& m) const& {
+ typename EnsureResult<typename std::result_of<M && (const T&)>::type>::type Map(
+ M&& m) const& {
if (!ok()) {
return status();
}
return std::forward<M>(m)(ValueUnsafe());
}
- /// Cast the internally stored value to produce a new result or propagate the stored
- /// error.
- template <typename U, typename E = typename std::enable_if<
- std::is_constructible<U, T>::value>::type>
- Result<U> As() && {
- if (!ok()) {
- return status();
- }
- return U(MoveValueUnsafe());
- }
-
- /// Cast the internally stored value to produce a new result or propagate the stored
- /// error.
- template <typename U, typename E = typename std::enable_if<
- std::is_constructible<U, const T&>::value>::type>
- Result<U> As() const& {
- if (!ok()) {
- return status();
- }
- return U(ValueUnsafe());
- }
-
+ /// Cast the internally stored value to produce a new result or propagate the stored
+ /// error.
+ template <typename U, typename E = typename std::enable_if<
+ std::is_constructible<U, T>::value>::type>
+ Result<U> As() && {
+ if (!ok()) {
+ return status();
+ }
+ return U(MoveValueUnsafe());
+ }
+
+ /// Cast the internally stored value to produce a new result or propagate the stored
+ /// error.
+ template <typename U, typename E = typename std::enable_if<
+ std::is_constructible<U, const T&>::value>::type>
+ Result<U> As() const& {
+ if (!ok()) {
+ return status();
+ }
+ return U(ValueUnsafe());
+ }
+
const T& ValueUnsafe() const& {
return *internal::launder(reinterpret_cast<const T*>(&data_));
}
@@ -448,16 +448,16 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
void Destroy() {
if (ARROW_PREDICT_TRUE(status_.ok())) {
- static_assert(offsetof(Result<T>, status_) == 0,
- "Status is guaranteed to be at the start of Result<>");
+ static_assert(offsetof(Result<T>, status_) == 0,
+ "Status is guaranteed to be at the start of Result<>");
internal::launder(reinterpret_cast<const T*>(&data_))->~T();
}
}
};
-#define ARROW_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \
- auto&& result_name = (rexpr); \
- ARROW_RETURN_IF_(!(result_name).ok(), (result_name).status(), ARROW_STRINGIFY(rexpr)); \
+#define ARROW_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr) \
+ auto&& result_name = (rexpr); \
+ ARROW_RETURN_IF_(!(result_name).ok(), (result_name).status(), ARROW_STRINGIFY(rexpr)); \
lhs = std::move(result_name).ValueUnsafe();
#define ARROW_ASSIGN_OR_RAISE_NAME(x, y) ARROW_CONCAT(x, y)
@@ -475,14 +475,14 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
/// WARNING: ARROW_ASSIGN_OR_RAISE expands into multiple statements;
/// it cannot be used in a single statement (e.g. as the body of an if
/// statement without {})!
-///
-/// WARNING: ARROW_ASSIGN_OR_RAISE `std::move`s its right operand. If you have
-/// an lvalue Result which you *don't* want to move out of cast appropriately.
-///
-/// WARNING: ARROW_ASSIGN_OR_RAISE is not a single expression; it will not
-/// maintain lifetimes of all temporaries in `rexpr` (e.g.
-/// `ARROW_ASSIGN_OR_RAISE(auto x, MakeTemp().GetResultRef());`
-/// will most likely segfault)!
+///
+/// WARNING: ARROW_ASSIGN_OR_RAISE `std::move`s its right operand. If you have
+/// an lvalue Result which you *don't* want to move out of, cast appropriately.
+///
+/// WARNING: ARROW_ASSIGN_OR_RAISE is not a single expression; it will not
+/// maintain lifetimes of all temporaries in `rexpr` (e.g.
+/// `ARROW_ASSIGN_OR_RAISE(auto x, MakeTemp().GetResultRef());`
+/// will most likely segfault)!
#define ARROW_ASSIGN_OR_RAISE(lhs, rexpr) \
ARROW_ASSIGN_OR_RAISE_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
lhs, rexpr);
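
Aside: a sketch of the pitfalls documented above; MakeBuffer is a hypothetical helper returning arrow::Result<std::shared_ptr<arrow::Buffer>>:

#include "arrow/api.h"

arrow::Result<std::shared_ptr<arrow::Buffer>> MakeBuffer();  // hypothetical

arrow::Status UseBuffer() {
  // OK: the right operand is a prvalue, so moving out of it is safe.
  ARROW_ASSIGN_OR_RAISE(auto buf, MakeBuffer());
  // NOT OK without braces: the macro expands to multiple statements, so
  //   if (condition) ARROW_ASSIGN_OR_RAISE(auto b2, MakeBuffer());
  // would only guard part of the expansion.
  return arrow::Status::OK();
}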
@@ -490,7 +490,7 @@ class ARROW_MUST_USE_TYPE Result : public util::EqualityComparable<Result<T>> {
namespace internal {
template <typename T>
-inline const Status& GenericToStatus(const Result<T>& res) {
+inline const Status& GenericToStatus(const Result<T>& res) {
return res.status();
}
@@ -501,19 +501,19 @@ inline Status GenericToStatus(Result<T>&& res) {
} // namespace internal
-template <typename T, typename R = typename EnsureResult<T>::type>
-R ToResult(T t) {
- return R(std::move(t));
+template <typename T, typename R = typename EnsureResult<T>::type>
+R ToResult(T t) {
+ return R(std::move(t));
}
-template <typename T>
-struct EnsureResult {
- using type = Result<T>;
-};
-
-template <typename T>
-struct EnsureResult<Result<T>> {
- using type = Result<T>;
-};
-
+template <typename T>
+struct EnsureResult {
+ using type = Result<T>;
+};
+
+template <typename T>
+struct EnsureResult<Result<T>> {
+ using type = Result<T>;
+};
+
} // namespace arrow
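
Aside: a sketch of the Map combinator and the ToResult/EnsureResult plumbing defined above; ParsePort is a hypothetical helper:

#include <string>
#include "arrow/result.h"

arrow::Result<int> ParsePort(const std::string& s);  // hypothetical

arrow::Result<std::string> Describe(const std::string& s) {
  // Map applies the callable to the stored value or propagates the error;
  // EnsureResult wraps the callable's return type in Result<> if needed.
  return ParsePort(s).Map([](int port) { return "port " + std::to_string(port); });
}

// ToResult lifts a plain value; thanks to the EnsureResult<Result<T>>
// specialization, applying it to an existing Result<T> does not double-wrap.
auto lifted = arrow::ToResult(42);  // arrow::Result<int>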
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc b/contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc
index 4f9d94a0a38..cb7755ba3f1 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/scalar.cc
@@ -18,7 +18,7 @@
#include "arrow/scalar.h"
#include <memory>
-#include <sstream>
+#include <sstream>
#include <string>
#include <utility>
@@ -45,10 +45,10 @@ bool Scalar::Equals(const Scalar& other, const EqualOptions& options) const {
return ScalarEquals(*this, other, options);
}
-bool Scalar::ApproxEquals(const Scalar& other, const EqualOptions& options) const {
- return ScalarApproxEquals(*this, other, options);
-}
-
+bool Scalar::ApproxEquals(const Scalar& other, const EqualOptions& options) const {
+ return ScalarApproxEquals(*this, other, options);
+}
+
struct ScalarHashImpl {
static std::hash<std::string> string_hash;
@@ -74,14 +74,14 @@ struct ScalarHashImpl {
return StdHash(s.value.low_bits()) & StdHash(s.value.high_bits());
}
- Status Visit(const Decimal256Scalar& s) {
- Status status = Status::OK();
- for (uint64_t elem : s.value.little_endian_array()) {
- status &= StdHash(elem);
- }
- return status;
- }
-
+ Status Visit(const Decimal256Scalar& s) {
+ Status status = Status::OK();
+ for (uint64_t elem : s.value.little_endian_array()) {
+ status &= StdHash(elem);
+ }
+ return status;
+ }
+
Status Visit(const BaseListScalar& s) { return ArrayHash(*s.value); }
Status Visit(const StructScalar& s) {
@@ -91,11 +91,11 @@ struct ScalarHashImpl {
return Status::OK();
}
- Status Visit(const DictionaryScalar& s) {
- AccumulateHashFrom(*s.value.index);
- return Status::OK();
- }
-
+ Status Visit(const DictionaryScalar& s) {
+ AccumulateHashFrom(*s.value.index);
+ return Status::OK();
+ }
+
// TODO(bkietz) implement less wimpy hashing when these have ValueType
Status Visit(const UnionScalar& s) { return Status::OK(); }
Status Visit(const ExtensionScalar& s) { return Status::OK(); }
@@ -132,21 +132,21 @@ struct ScalarHashImpl {
return Status::OK();
}
- explicit ScalarHashImpl(const Scalar& scalar) : hash_(scalar.type->Hash()) {
- if (scalar.is_valid) {
- AccumulateHashFrom(scalar);
- }
- }
+ explicit ScalarHashImpl(const Scalar& scalar) : hash_(scalar.type->Hash()) {
+ if (scalar.is_valid) {
+ AccumulateHashFrom(scalar);
+ }
+ }
void AccumulateHashFrom(const Scalar& scalar) {
DCHECK_OK(StdHash(scalar.type->fingerprint()));
DCHECK_OK(VisitScalarInline(scalar, this));
}
- size_t hash_;
+ size_t hash_;
};
-size_t Scalar::hash() const { return ScalarHashImpl(*this).hash_; }
+size_t Scalar::hash() const { return ScalarHashImpl(*this).hash_; }
StringScalar::StringScalar(std::string s)
: StringScalar(Buffer::FromString(std::move(s))) {}
@@ -193,20 +193,20 @@ FixedSizeListScalar::FixedSizeListScalar(std::shared_ptr<Array> value)
: BaseListScalar(
value, fixed_size_list(value->type(), static_cast<int32_t>(value->length()))) {}
-Result<std::shared_ptr<StructScalar>> StructScalar::Make(
- ScalarVector values, std::vector<std::string> field_names) {
- if (values.size() != field_names.size()) {
- return Status::Invalid("Mismatching number of field names and child scalars");
- }
-
- FieldVector fields(field_names.size());
- for (size_t i = 0; i < fields.size(); ++i) {
- fields[i] = arrow::field(std::move(field_names[i]), values[i]->type);
- }
-
- return std::make_shared<StructScalar>(std::move(values), struct_(std::move(fields)));
-}
-
+Result<std::shared_ptr<StructScalar>> StructScalar::Make(
+ ScalarVector values, std::vector<std::string> field_names) {
+ if (values.size() != field_names.size()) {
+ return Status::Invalid("Mismatching number of field names and child scalars");
+ }
+
+ FieldVector fields(field_names.size());
+ for (size_t i = 0; i < fields.size(); ++i) {
+ fields[i] = arrow::field(std::move(field_names[i]), values[i]->type);
+ }
+
+ return std::make_shared<StructScalar>(std::move(values), struct_(std::move(fields)));
+}
+
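Aside: a sketch of the StructScalar::Make factory added above (MakeScalar is the existing scalar convenience factory; the field layout is illustrative):

#include "arrow/api.h"

arrow::Result<std::shared_ptr<arrow::StructScalar>> MakeRow() {
  // Pairs child scalars with field names; mismatched sizes -> Invalid.
  return arrow::StructScalar::Make(
      {arrow::MakeScalar(int64_t(1)), arrow::MakeScalar(true)},
      {"id", "flag"});  // resulting type: struct<id: int64, flag: bool>
}
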
Result<std::shared_ptr<Scalar>> StructScalar::field(FieldRef ref) const {
ARROW_ASSIGN_OR_RAISE(auto path, ref.FindOne(*type));
if (path.indices().size() != 1) {
@@ -277,13 +277,13 @@ Result<std::shared_ptr<Scalar>> DictionaryScalar::GetEncodedValue() const {
return value.dictionary->GetScalar(index_value);
}
-std::shared_ptr<DictionaryScalar> DictionaryScalar::Make(std::shared_ptr<Scalar> index,
- std::shared_ptr<Array> dict) {
- auto type = dictionary(index->type, dict->type());
- return std::make_shared<DictionaryScalar>(ValueType{std::move(index), std::move(dict)},
- std::move(type));
-}
-
+std::shared_ptr<DictionaryScalar> DictionaryScalar::Make(std::shared_ptr<Scalar> index,
+ std::shared_ptr<Array> dict) {
+ auto type = dictionary(index->type, dict->type());
+ return std::make_shared<DictionaryScalar>(ValueType{std::move(index), std::move(dict)},
+ std::move(type));
+}
+
template <typename T>
using scalar_constructor_has_arrow_type =
std::is_constructible<typename TypeTraits<T>::ScalarType, std::shared_ptr<DataType>>;
@@ -551,31 +551,31 @@ Status CastImpl(const ScalarType& from, StringScalar* to) {
return Status::OK();
}
-Status CastImpl(const Decimal128Scalar& from, StringScalar* to) {
- auto from_type = checked_cast<const Decimal128Type*>(from.type.get());
- to->value = Buffer::FromString(from.value.ToString(from_type->scale()));
- return Status::OK();
-}
-
-Status CastImpl(const Decimal256Scalar& from, StringScalar* to) {
- auto from_type = checked_cast<const Decimal256Type*>(from.type.get());
- to->value = Buffer::FromString(from.value.ToString(from_type->scale()));
- return Status::OK();
-}
-
-Status CastImpl(const StructScalar& from, StringScalar* to) {
- std::stringstream ss;
- ss << '{';
- for (int i = 0; static_cast<size_t>(i) < from.value.size(); i++) {
- if (i > 0) ss << ", ";
- ss << from.type->field(i)->name() << ':' << from.type->field(i)->type()->ToString()
- << " = " << from.value[i]->ToString();
- }
- ss << '}';
- to->value = Buffer::FromString(ss.str());
- return Status::OK();
-}
-
+Status CastImpl(const Decimal128Scalar& from, StringScalar* to) {
+ auto from_type = checked_cast<const Decimal128Type*>(from.type.get());
+ to->value = Buffer::FromString(from.value.ToString(from_type->scale()));
+ return Status::OK();
+}
+
+Status CastImpl(const Decimal256Scalar& from, StringScalar* to) {
+ auto from_type = checked_cast<const Decimal256Type*>(from.type.get());
+ to->value = Buffer::FromString(from.value.ToString(from_type->scale()));
+ return Status::OK();
+}
+
+Status CastImpl(const StructScalar& from, StringScalar* to) {
+ std::stringstream ss;
+ ss << '{';
+ for (int i = 0; static_cast<size_t>(i) < from.value.size(); i++) {
+ if (i > 0) ss << ", ";
+ ss << from.type->field(i)->name() << ':' << from.type->field(i)->type()->ToString()
+ << " = " << from.value[i]->ToString();
+ }
+ ss << '}';
+ to->value = Buffer::FromString(ss.str());
+ return Status::OK();
+}
+
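Aside: these overloads back Scalar::CastTo to utf8 for decimal and struct inputs; a sketch, assuming the decimal128() type factory from this Arrow vintage (value 123 at scale 2 renders as "1.23"):

#include "arrow/api.h"

arrow::Result<std::shared_ptr<arrow::Scalar>> DecimalToString() {
  arrow::Decimal128Scalar s(arrow::Decimal128(123), arrow::decimal128(10, 2));
  return s.CastTo(arrow::utf8());  // a StringScalar holding "1.23"
}
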
struct CastImplVisitor {
Status NotImplemented() {
return Status::NotImplemented("cast to ", *to_type_, " from ", *from_.type);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/scalar.h b/contrib/libs/apache/arrow/cpp/src/arrow/scalar.h
index 1d5e2c93ff4..24744859686 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/scalar.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/scalar.h
@@ -65,19 +65,19 @@ struct ARROW_EXPORT Scalar : public util::EqualityComparable<Scalar> {
bool Equals(const Scalar& other,
const EqualOptions& options = EqualOptions::Defaults()) const;
- bool ApproxEquals(const Scalar& other,
- const EqualOptions& options = EqualOptions::Defaults()) const;
-
+ bool ApproxEquals(const Scalar& other,
+ const EqualOptions& options = EqualOptions::Defaults()) const;
+
struct ARROW_EXPORT Hash {
- size_t operator()(const Scalar& scalar) const { return scalar.hash(); }
+ size_t operator()(const Scalar& scalar) const { return scalar.hash(); }
size_t operator()(const std::shared_ptr<Scalar>& scalar) const {
- return scalar->hash();
+ return scalar->hash();
}
};
- size_t hash() const;
-
+ size_t hash() const;
+
std::string ToString() const;
static Result<std::shared_ptr<Scalar>> Parse(const std::shared_ptr<DataType>& type,
@@ -350,17 +350,17 @@ struct ARROW_EXPORT Decimal128Scalar : public Scalar {
Decimal128 value;
};
-struct ARROW_EXPORT Decimal256Scalar : public Scalar {
- using Scalar::Scalar;
- using TypeClass = Decimal256Type;
- using ValueType = Decimal256;
-
- Decimal256Scalar(Decimal256 value, std::shared_ptr<DataType> type)
- : Scalar(std::move(type), true), value(value) {}
-
- Decimal256 value;
-};
-
+struct ARROW_EXPORT Decimal256Scalar : public Scalar {
+ using Scalar::Scalar;
+ using TypeClass = Decimal256Type;
+ using ValueType = Decimal256;
+
+ Decimal256Scalar(Decimal256 value, std::shared_ptr<DataType> type)
+ : Scalar(std::move(type), true), value(value) {}
+
+ Decimal256 value;
+};
+
struct ARROW_EXPORT BaseListScalar : public Scalar {
using Scalar::Scalar;
using ValueType = std::shared_ptr<Array>;
@@ -411,9 +411,9 @@ struct ARROW_EXPORT StructScalar : public Scalar {
StructScalar(ValueType value, std::shared_ptr<DataType> type)
: Scalar(std::move(type), true), value(std::move(value)) {}
- static Result<std::shared_ptr<StructScalar>> Make(ValueType value,
- std::vector<std::string> field_names);
-
+ static Result<std::shared_ptr<StructScalar>> Make(ValueType value,
+ std::vector<std::string> field_names);
+
explicit StructScalar(std::shared_ptr<DataType> type) : Scalar(std::move(type)) {}
};
@@ -448,9 +448,9 @@ struct ARROW_EXPORT DictionaryScalar : public Scalar {
DictionaryScalar(ValueType value, std::shared_ptr<DataType> type, bool is_valid = true)
: Scalar(std::move(type), is_valid), value(std::move(value)) {}
- static std::shared_ptr<DictionaryScalar> Make(std::shared_ptr<Scalar> index,
- std::shared_ptr<Array> dict);
-
+ static std::shared_ptr<DictionaryScalar> Make(std::shared_ptr<Scalar> index,
+ std::shared_ptr<Array> dict);
+
Result<std::shared_ptr<Scalar>> GetEncodedValue() const;
};
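
Aside: a sketch of DictionaryScalar::Make and GetEncodedValue declared above; `dict` is assumed to be a utf8 Array holding ["a", "b", "c"] (construction elided):

#include "arrow/api.h"

arrow::Result<std::shared_ptr<arrow::Scalar>> LookUp(
    const std::shared_ptr<arrow::Array>& dict) {
  // Make derives dictionary<values=utf8, indices=int32> from the pieces.
  auto ds = arrow::DictionaryScalar::Make(arrow::MakeScalar(int32_t(1)), dict);
  return ds->GetEncodedValue();  // StringScalar "b" for dict ["a", "b", "c"]
}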
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/status.cc b/contrib/libs/apache/arrow/cpp/src/arrow/status.cc
index d6399f8bfce..0f02cb57a23 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/status.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/status.cc
@@ -68,9 +68,9 @@ std::string Status::CodeAsString(StatusCode code) {
case StatusCode::Invalid:
type = "Invalid";
break;
- case StatusCode::Cancelled:
- type = "Cancelled";
- break;
+ case StatusCode::Cancelled:
+ type = "Cancelled";
+ break;
case StatusCode::IOError:
type = "IOError";
break;
@@ -135,7 +135,7 @@ void Status::Abort(const std::string& message) const {
void Status::AddContextLine(const char* filename, int line, const char* expr) {
ARROW_CHECK(!ok()) << "Cannot add context line to ok status";
std::stringstream ss;
- ss << "\n" << filename << ":" << line << " " << expr;
+ ss << "\n" << filename << ":" << line << " " << expr;
state_->msg += ss.str();
}
#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/status.h b/contrib/libs/apache/arrow/cpp/src/arrow/status.h
index 9fbc840a541..056d60d6f32 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/status.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/status.h
@@ -83,7 +83,7 @@ enum class StatusCode : char {
IOError = 5,
CapacityError = 6,
IndexError = 7,
- Cancelled = 8,
+ Cancelled = 8,
UnknownError = 9,
NotImplemented = 10,
SerializationError = 11,
@@ -205,12 +205,12 @@ class ARROW_MUST_USE_TYPE ARROW_EXPORT Status : public util::EqualityComparable<
return Status::FromArgs(StatusCode::Invalid, std::forward<Args>(args)...);
}
- /// Return an error status for cancelled operation
- template <typename... Args>
- static Status Cancelled(Args&&... args) {
- return Status::FromArgs(StatusCode::Cancelled, std::forward<Args>(args)...);
- }
-
+ /// Return an error status for a cancelled operation
+ template <typename... Args>
+ static Status Cancelled(Args&&... args) {
+ return Status::FromArgs(StatusCode::Cancelled, std::forward<Args>(args)...);
+ }
+
/// Return an error status when an index is out of bounds
template <typename... Args>
static Status IndexError(Args&&... args) {
@@ -270,8 +270,8 @@ class ARROW_MUST_USE_TYPE ARROW_EXPORT Status : public util::EqualityComparable<
bool IsKeyError() const { return code() == StatusCode::KeyError; }
/// Return true iff the status indicates invalid data.
bool IsInvalid() const { return code() == StatusCode::Invalid; }
- /// Return true iff the status indicates a cancelled operation.
- bool IsCancelled() const { return code() == StatusCode::Cancelled; }
+ /// Return true iff the status indicates a cancelled operation.
+ bool IsCancelled() const { return code() == StatusCode::Cancelled; }
/// Return true iff the status indicates an IO-related failure.
bool IsIOError() const { return code() == StatusCode::IOError; }
/// Return true iff the status indicates a container reaching capacity limits.
@@ -312,10 +312,10 @@ class ARROW_MUST_USE_TYPE ARROW_EXPORT Status : public util::EqualityComparable<
StatusCode code() const { return ok() ? StatusCode::OK : state_->code; }
/// \brief Return the specific error message attached to this status.
- const std::string& message() const {
- static const std::string no_message = "";
- return ok() ? no_message : state_->msg;
- }
+ const std::string& message() const {
+ static const std::string no_message = "";
+ return ok() ? no_message : state_->msg;
+ }
/// \brief Return the status detail attached to this message.
const std::shared_ptr<StatusDetail>& detail() const {
@@ -443,7 +443,7 @@ namespace internal {
// Extract Status from Status or Result<T>
// Useful for the status check macros such as RETURN_NOT_OK.
-inline const Status& GenericToStatus(const Status& st) { return st; }
+inline const Status& GenericToStatus(const Status& st) { return st; }
inline Status GenericToStatus(Status&& st) { return std::move(st); }
} // namespace internal
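
Aside: the new Cancelled plumbing composes like the other status factories (sketch):

#include "arrow/status.h"

arrow::Status st = arrow::Status::Cancelled("interrupted by user");
// st.IsCancelled() == true; st.CodeAsString() == "Cancelled"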
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h b/contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h
index c996923ca67..6225a89aae4 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/stl_iterator.h
@@ -1,146 +1,146 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstddef>
-#include <iterator>
-#include <utility>
-
-#include "arrow/type_fwd.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/optional.h"
-
-namespace arrow {
-namespace stl {
-
-namespace detail {
-
-template <typename ArrayType>
-struct DefaultValueAccessor {
- using ValueType = decltype(std::declval<ArrayType>().GetView(0));
-
- ValueType operator()(const ArrayType& array, int64_t index) {
- return array.GetView(index);
- }
-};
-
-} // namespace detail
-
-template <typename ArrayType,
- typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
-class ArrayIterator {
- public:
- using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
- using difference_type = int64_t;
- using pointer = value_type*;
- using reference = value_type&;
- using iterator_category = std::random_access_iterator_tag;
-
- // Some algorithms need to default-construct an iterator
- ArrayIterator() : array_(NULLPTR), index_(0) {}
-
- explicit ArrayIterator(const ArrayType& array, int64_t index = 0)
- : array_(&array), index_(index) {}
-
- // Value access
- value_type operator*() const {
- return array_->IsNull(index_) ? value_type{} : array_->GetView(index_);
- }
-
- value_type operator[](difference_type n) const {
- return array_->IsNull(index_ + n) ? value_type{} : array_->GetView(index_ + n);
- }
-
- int64_t index() const { return index_; }
-
- // Forward / backward
- ArrayIterator& operator++() {
- ++index_;
- return *this;
- }
- ArrayIterator& operator--() {
- --index_;
- return *this;
- }
- ArrayIterator operator++(int) {
- ArrayIterator tmp(*this);
- ++index_;
- return tmp;
- }
- ArrayIterator operator--(int) {
- ArrayIterator tmp(*this);
- --index_;
- return tmp;
- }
-
- // Arithmetic
- difference_type operator-(const ArrayIterator& other) const {
- return index_ - other.index_;
- }
- ArrayIterator operator+(difference_type n) const {
- return ArrayIterator(*array_, index_ + n);
- }
- ArrayIterator operator-(difference_type n) const {
- return ArrayIterator(*array_, index_ - n);
- }
- friend inline ArrayIterator operator+(difference_type diff,
- const ArrayIterator& other) {
- return ArrayIterator(*other.array_, diff + other.index_);
- }
- friend inline ArrayIterator operator-(difference_type diff,
- const ArrayIterator& other) {
- return ArrayIterator(*other.array_, diff - other.index_);
- }
- ArrayIterator& operator+=(difference_type n) {
- index_ += n;
- return *this;
- }
- ArrayIterator& operator-=(difference_type n) {
- index_ -= n;
- return *this;
- }
-
- // Comparisons
- bool operator==(const ArrayIterator& other) const { return index_ == other.index_; }
- bool operator!=(const ArrayIterator& other) const { return index_ != other.index_; }
- bool operator<(const ArrayIterator& other) const { return index_ < other.index_; }
- bool operator>(const ArrayIterator& other) const { return index_ > other.index_; }
- bool operator<=(const ArrayIterator& other) const { return index_ <= other.index_; }
- bool operator>=(const ArrayIterator& other) const { return index_ >= other.index_; }
-
- private:
- const ArrayType* array_;
- int64_t index_;
-};
-
-} // namespace stl
-} // namespace arrow
-
-namespace std {
-
-template <typename ArrayType>
-struct iterator_traits<::arrow::stl::ArrayIterator<ArrayType>> {
- using IteratorType = ::arrow::stl::ArrayIterator<ArrayType>;
- using difference_type = typename IteratorType::difference_type;
- using value_type = typename IteratorType::value_type;
- using pointer = typename IteratorType::pointer;
- using reference = typename IteratorType::reference;
- using iterator_category = typename IteratorType::iterator_category;
-};
-
-} // namespace std
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <iterator>
+#include <utility>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/optional.h"
+
+namespace arrow {
+namespace stl {
+
+namespace detail {
+
+template <typename ArrayType>
+struct DefaultValueAccessor {
+ using ValueType = decltype(std::declval<ArrayType>().GetView(0));
+
+ ValueType operator()(const ArrayType& array, int64_t index) {
+ return array.GetView(index);
+ }
+};
+
+} // namespace detail
+
+template <typename ArrayType,
+ typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ArrayIterator {
+ public:
+ using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+ using difference_type = int64_t;
+ using pointer = value_type*;
+ using reference = value_type&;
+ using iterator_category = std::random_access_iterator_tag;
+
+ // Some algorithms need to default-construct an iterator
+ ArrayIterator() : array_(NULLPTR), index_(0) {}
+
+ explicit ArrayIterator(const ArrayType& array, int64_t index = 0)
+ : array_(&array), index_(index) {}
+
+ // Value access
+ value_type operator*() const {
+ return array_->IsNull(index_) ? value_type{} : array_->GetView(index_);
+ }
+
+ value_type operator[](difference_type n) const {
+ return array_->IsNull(index_ + n) ? value_type{} : array_->GetView(index_ + n);
+ }
+
+ int64_t index() const { return index_; }
+
+ // Forward / backward
+ ArrayIterator& operator++() {
+ ++index_;
+ return *this;
+ }
+ ArrayIterator& operator--() {
+ --index_;
+ return *this;
+ }
+ ArrayIterator operator++(int) {
+ ArrayIterator tmp(*this);
+ ++index_;
+ return tmp;
+ }
+ ArrayIterator operator--(int) {
+ ArrayIterator tmp(*this);
+ --index_;
+ return tmp;
+ }
+
+ // Arithmetic
+ difference_type operator-(const ArrayIterator& other) const {
+ return index_ - other.index_;
+ }
+ ArrayIterator operator+(difference_type n) const {
+ return ArrayIterator(*array_, index_ + n);
+ }
+ ArrayIterator operator-(difference_type n) const {
+ return ArrayIterator(*array_, index_ - n);
+ }
+ friend inline ArrayIterator operator+(difference_type diff,
+ const ArrayIterator& other) {
+ return ArrayIterator(*other.array_, diff + other.index_);
+ }
+ friend inline ArrayIterator operator-(difference_type diff,
+ const ArrayIterator& other) {
+ return ArrayIterator(*other.array_, diff - other.index_);
+ }
+ ArrayIterator& operator+=(difference_type n) {
+ index_ += n;
+ return *this;
+ }
+ ArrayIterator& operator-=(difference_type n) {
+ index_ -= n;
+ return *this;
+ }
+
+ // Comparisons
+ bool operator==(const ArrayIterator& other) const { return index_ == other.index_; }
+ bool operator!=(const ArrayIterator& other) const { return index_ != other.index_; }
+ bool operator<(const ArrayIterator& other) const { return index_ < other.index_; }
+ bool operator>(const ArrayIterator& other) const { return index_ > other.index_; }
+ bool operator<=(const ArrayIterator& other) const { return index_ <= other.index_; }
+ bool operator>=(const ArrayIterator& other) const { return index_ >= other.index_; }
+
+ private:
+ const ArrayType* array_;
+ int64_t index_;
+};
+
+} // namespace stl
+} // namespace arrow
+
+namespace std {
+
+template <typename ArrayType>
+struct iterator_traits<::arrow::stl::ArrayIterator<ArrayType>> {
+ using IteratorType = ::arrow::stl::ArrayIterator<ArrayType>;
+ using difference_type = typename IteratorType::difference_type;
+ using value_type = typename IteratorType::value_type;
+ using pointer = typename IteratorType::pointer;
+ using reference = typename IteratorType::reference;
+ using iterator_category = typename IteratorType::iterator_category;
+};
+
+} // namespace std
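
Aside: a sketch of the iterator above driving an STL algorithm; its value_type is util::optional<T>, which is empty for null slots:

#include <algorithm>
#include "arrow/api.h"
#include "arrow/stl_iterator.h"

int64_t CountNulls(const arrow::Int64Array& values) {
  arrow::stl::ArrayIterator<arrow::Int64Array> begin(values);
  arrow::stl::ArrayIterator<arrow::Int64Array> end(values, values.length());
  // Null slots dereference to an empty optional.
  return std::count_if(begin, end,
                       [](const arrow::util::optional<int64_t>& v) { return !v; });
}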
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/table.cc b/contrib/libs/apache/arrow/cpp/src/arrow/table.cc
index 6b5362c873d..d4c7802c834 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/table.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/table.cc
@@ -92,10 +92,10 @@ class SimpleTable : public Table {
std::shared_ptr<ChunkedArray> column(int i) const override { return columns_[i]; }
- const std::vector<std::shared_ptr<ChunkedArray>>& columns() const override {
- return columns_;
- }
-
+ const std::vector<std::shared_ptr<ChunkedArray>>& columns() const override {
+ return columns_;
+ }
+
std::shared_ptr<Table> Slice(int64_t offset, int64_t length) const override {
auto sliced = columns_;
int64_t num_rows = length;
@@ -103,13 +103,13 @@ class SimpleTable : public Table {
column = column->Slice(offset, length);
num_rows = column->length();
}
- return Table::Make(schema_, std::move(sliced), num_rows);
+ return Table::Make(schema_, std::move(sliced), num_rows);
}
Result<std::shared_ptr<Table>> RemoveColumn(int i) const override {
ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->RemoveField(i));
- return Table::Make(std::move(new_schema), internal::DeleteVectorElement(columns_, i),
+ return Table::Make(std::move(new_schema), internal::DeleteVectorElement(columns_, i),
this->num_rows());
}
@@ -129,7 +129,7 @@ class SimpleTable : public Table {
}
ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->AddField(i, field_arg));
- return Table::Make(std::move(new_schema),
+ return Table::Make(std::move(new_schema),
internal::AddVectorElement(columns_, i, std::move(col)));
}
@@ -149,14 +149,14 @@ class SimpleTable : public Table {
}
ARROW_ASSIGN_OR_RAISE(auto new_schema, schema_->SetField(i, field_arg));
- return Table::Make(std::move(new_schema),
+ return Table::Make(std::move(new_schema),
internal::ReplaceVectorElement(columns_, i, std::move(col)));
}
std::shared_ptr<Table> ReplaceSchemaMetadata(
const std::shared_ptr<const KeyValueMetadata>& metadata) const override {
auto new_schema = schema_->WithMetadata(metadata);
- return Table::Make(std::move(new_schema), columns_);
+ return Table::Make(std::move(new_schema), columns_);
}
Result<std::shared_ptr<Table>> Flatten(MemoryPool* pool) const override {
@@ -374,7 +374,7 @@ Result<std::shared_ptr<Table>> Table::SelectColumns(
auto new_schema =
std::make_shared<arrow::Schema>(std::move(fields), schema()->metadata());
- return Table::Make(std::move(new_schema), std::move(columns), num_rows());
+ return Table::Make(std::move(new_schema), std::move(columns), num_rows());
}
std::string Table::ToString() const {
@@ -435,7 +435,7 @@ Result<std::shared_ptr<Table>> ConcatenateTables(
}
columns[i] = std::make_shared<ChunkedArray>(column_arrays, schema->field(i)->type());
}
- return Table::Make(std::move(schema), std::move(columns));
+ return Table::Make(std::move(schema), std::move(columns));
}
Result<std::shared_ptr<Table>> PromoteTableToSchema(const std::shared_ptr<Table>& table,
@@ -564,7 +564,7 @@ Result<std::shared_ptr<Table>> Table::CombineChunks(MemoryPool* pool) const {
compacted_columns[i] = std::make_shared<ChunkedArray>(compacted);
}
}
- return Table::Make(schema(), std::move(compacted_columns), num_rows_);
+ return Table::Make(schema(), std::move(compacted_columns), num_rows_);
}
// ----------------------------------------------------------------------
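
Aside: a sketch of the CombineChunks contract touched above; every column of the result holds at most one chunk:

#include "arrow/api.h"

arrow::Result<std::shared_ptr<arrow::Table>> Compact(
    const std::shared_ptr<arrow::Table>& table) {
  // Concatenates each column's chunks into a single chunk (a per-column
  // no-op when the column already has <= 1 chunk).
  return table->CombineChunks(arrow::default_memory_pool());
}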
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/table.h b/contrib/libs/apache/arrow/cpp/src/arrow/table.h
index 96b50e002f9..f1e5f23eed8 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/table.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/table.h
@@ -98,7 +98,7 @@ class ARROW_EXPORT Table {
virtual std::shared_ptr<ChunkedArray> column(int i) const = 0;
/// \brief Return vector of all columns for table
- virtual const std::vector<std::shared_ptr<ChunkedArray>>& columns() const = 0;
+ virtual const std::vector<std::shared_ptr<ChunkedArray>>& columns() const = 0;
/// Return a column's field by index
std::shared_ptr<Field> field(int i) const { return schema_->field(i); }
@@ -151,7 +151,7 @@ class ARROW_EXPORT Table {
/// \brief Return new table with specified columns
Result<std::shared_ptr<Table>> SelectColumns(const std::vector<int>& indices) const;
- /// \brief Replace schema key-value metadata with new metadata
+ /// \brief Replace schema key-value metadata with new metadata
/// \since 0.5.0
///
/// \param[in] metadata new KeyValueMetadata
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc b/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc
index 170dfc70c3c..c026c355758 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/table_builder.cc
@@ -21,7 +21,7 @@
#include <utility>
#include "arrow/array/array_base.h"
-#include "arrow/array/builder_base.h"
+#include "arrow/array/builder_base.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
#include "arrow/type.h"
@@ -74,9 +74,9 @@ Status RecordBatchBuilder::Flush(bool reset_builders,
}
}
std::shared_ptr<Schema> schema =
- std::make_shared<Schema>(std::move(schema_fields), schema_->metadata());
+ std::make_shared<Schema>(std::move(schema_fields), schema_->metadata());
- *batch = RecordBatch::Make(std::move(schema), length, std::move(fields));
+ *batch = RecordBatch::Make(std::move(schema), length, std::move(fields));
if (reset_builders) {
return InitBuilders();
} else {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc b/contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc
index 91d5975715b..d591bacff02 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor.cc
@@ -31,7 +31,7 @@
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
-#include "arrow/util/int_util_internal.h"
+#include "arrow/util/int_util_internal.h"
#include "arrow/util/logging.h"
#include "arrow/visitor_inline.h"
@@ -41,72 +41,72 @@ using internal::checked_cast;
namespace internal {
-Status ComputeRowMajorStrides(const FixedWidthType& type,
- const std::vector<int64_t>& shape,
- std::vector<int64_t>* strides) {
+Status ComputeRowMajorStrides(const FixedWidthType& type,
+ const std::vector<int64_t>& shape,
+ std::vector<int64_t>* strides) {
const int byte_width = GetByteWidth(type);
- const size_t ndim = shape.size();
-
- int64_t remaining = 0;
- if (!shape.empty() && shape.front() > 0) {
- remaining = byte_width;
- for (size_t i = 1; i < ndim; ++i) {
- if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) {
- return Status::Invalid(
- "Row-major strides computed from shape would not fit in 64-bit integer");
- }
- }
+ const size_t ndim = shape.size();
+
+ int64_t remaining = 0;
+ if (!shape.empty() && shape.front() > 0) {
+ remaining = byte_width;
+ for (size_t i = 1; i < ndim; ++i) {
+ if (internal::MultiplyWithOverflow(remaining, shape[i], &remaining)) {
+ return Status::Invalid(
+ "Row-major strides computed from shape would not fit in 64-bit integer");
+ }
+ }
}
if (remaining == 0) {
strides->assign(shape.size(), byte_width);
- return Status::OK();
+ return Status::OK();
}
- strides->push_back(remaining);
- for (size_t i = 1; i < ndim; ++i) {
- remaining /= shape[i];
+ strides->push_back(remaining);
+ for (size_t i = 1; i < ndim; ++i) {
+ remaining /= shape[i];
strides->push_back(remaining);
}
-
- return Status::OK();
+
+ return Status::OK();
}
-Status ComputeColumnMajorStrides(const FixedWidthType& type,
- const std::vector<int64_t>& shape,
- std::vector<int64_t>* strides) {
- const int byte_width = internal::GetByteWidth(type);
- const size_t ndim = shape.size();
-
- int64_t total = 0;
- if (!shape.empty() && shape.back() > 0) {
- total = byte_width;
- for (size_t i = 0; i < ndim - 1; ++i) {
- if (internal::MultiplyWithOverflow(total, shape[i], &total)) {
- return Status::Invalid(
- "Column-major strides computed from shape would not fit in 64-bit "
- "integer");
- }
+Status ComputeColumnMajorStrides(const FixedWidthType& type,
+ const std::vector<int64_t>& shape,
+ std::vector<int64_t>* strides) {
+ const int byte_width = internal::GetByteWidth(type);
+ const size_t ndim = shape.size();
+
+ int64_t total = 0;
+ if (!shape.empty() && shape.back() > 0) {
+ total = byte_width;
+ for (size_t i = 0; i < ndim - 1; ++i) {
+ if (internal::MultiplyWithOverflow(total, shape[i], &total)) {
+ return Status::Invalid(
+ "Column-major strides computed from shape would not fit in 64-bit "
+ "integer");
+ }
}
}
-
- if (total == 0) {
- strides->assign(shape.size(), byte_width);
- return Status::OK();
- }
-
- total = byte_width;
- for (size_t i = 0; i < ndim - 1; ++i) {
+
+ if (total == 0) {
+ strides->assign(shape.size(), byte_width);
+ return Status::OK();
+ }
+
+ total = byte_width;
+ for (size_t i = 0; i < ndim - 1; ++i) {
strides->push_back(total);
- total *= shape[i];
+ total *= shape[i];
}
- strides->push_back(total);
-
- return Status::OK();
+ strides->push_back(total);
+
+ return Status::OK();
}
-} // namespace internal
-
+} // namespace internal
+
namespace {
inline bool IsTensorStridesRowMajor(const std::shared_ptr<DataType>& type,
@@ -114,11 +114,11 @@ inline bool IsTensorStridesRowMajor(const std::shared_ptr<DataType>& type,
const std::vector<int64_t>& strides) {
std::vector<int64_t> c_strides;
const auto& fw_type = checked_cast<const FixedWidthType&>(*type);
- if (internal::ComputeRowMajorStrides(fw_type, shape, &c_strides).ok()) {
- return strides == c_strides;
- } else {
- return false;
- }
+ if (internal::ComputeRowMajorStrides(fw_type, shape, &c_strides).ok()) {
+ return strides == c_strides;
+ } else {
+ return false;
+ }
}
inline bool IsTensorStridesColumnMajor(const std::shared_ptr<DataType>& type,
@@ -126,11 +126,11 @@ inline bool IsTensorStridesColumnMajor(const std::shared_ptr<DataType>& type,
const std::vector<int64_t>& strides) {
std::vector<int64_t> f_strides;
const auto& fw_type = checked_cast<const FixedWidthType&>(*type);
- if (internal::ComputeColumnMajorStrides(fw_type, shape, &f_strides).ok()) {
- return strides == f_strides;
- } else {
- return false;
- }
+ if (internal::ComputeColumnMajorStrides(fw_type, shape, &f_strides).ok()) {
+ return strides == f_strides;
+ } else {
+ return false;
+ }
}
inline Status CheckTensorValidity(const std::shared_ptr<DataType>& type,
@@ -162,29 +162,29 @@ Status CheckTensorStridesValidity(const std::shared_ptr<Buffer>& data,
return Status::OK();
}
- // Check the largest offset can be computed without overflow
- const size_t ndim = shape.size();
- int64_t largest_offset = 0;
- for (size_t i = 0; i < ndim; ++i) {
- if (shape[i] == 0) continue;
- if (strides[i] < 0) {
- // TODO(mrkn): Support negative strides for sharing views
- return Status::Invalid("negative strides not supported");
- }
-
- int64_t dim_offset;
- if (!internal::MultiplyWithOverflow(shape[i] - 1, strides[i], &dim_offset)) {
- if (!internal::AddWithOverflow(largest_offset, dim_offset, &largest_offset)) {
- continue;
- }
- }
-
- return Status::Invalid(
- "offsets computed from shape and strides would not fit in 64-bit integer");
+ // Check the largest offset can be computed without overflow
+ const size_t ndim = shape.size();
+ int64_t largest_offset = 0;
+ for (size_t i = 0; i < ndim; ++i) {
+ if (shape[i] == 0) continue;
+ if (strides[i] < 0) {
+ // TODO(mrkn): Support negative strides for sharing views
+ return Status::Invalid("negative strides not supported");
+ }
+
+ int64_t dim_offset;
+ if (!internal::MultiplyWithOverflow(shape[i] - 1, strides[i], &dim_offset)) {
+ if (!internal::AddWithOverflow(largest_offset, dim_offset, &largest_offset)) {
+ continue;
+ }
+ }
+
+ return Status::Invalid(
+ "offsets computed from shape and strides would not fit in 64-bit integer");
}
-
+
const int byte_width = internal::GetByteWidth(*type);
- if (largest_offset > data->size() - byte_width) {
+ if (largest_offset > data->size() - byte_width) {
return Status::Invalid("strides must not involve buffer over run");
}
return Status::OK();
@@ -209,10 +209,10 @@ Status ValidateTensorParameters(const std::shared_ptr<DataType>& type,
RETURN_NOT_OK(CheckTensorValidity(type, data, shape));
if (!strides.empty()) {
RETURN_NOT_OK(CheckTensorStridesValidity(data, shape, strides, type));
- } else {
- std::vector<int64_t> tmp_strides;
- RETURN_NOT_OK(ComputeRowMajorStrides(checked_cast<const FixedWidthType&>(*type),
- shape, &tmp_strides));
+ } else {
+ std::vector<int64_t> tmp_strides;
+ RETURN_NOT_OK(ComputeRowMajorStrides(checked_cast<const FixedWidthType&>(*type),
+ shape, &tmp_strides));
}
if (dim_names.size() > shape.size()) {
return Status::Invalid("too many dim_names are supplied");
@@ -229,8 +229,8 @@ Tensor::Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buff
: type_(type), data_(data), shape_(shape), strides_(strides), dim_names_(dim_names) {
ARROW_CHECK(is_tensor_supported(type->id()));
if (shape.size() > 0 && strides.size() == 0) {
- ARROW_CHECK_OK(internal::ComputeRowMajorStrides(
- checked_cast<const FixedWidthType&>(*type_), shape, &strides_));
+ ARROW_CHECK_OK(internal::ComputeRowMajorStrides(
+ checked_cast<const FixedWidthType&>(*type_), shape, &strides_));
}
}
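
Aside: a worked example of the stride computation above. For int32 (4-byte) elements and shape {2, 3, 4}, row-major strides are {48, 16, 4} and column-major strides are {4, 8, 24}:

#include "arrow/api.h"
#include "arrow/util/checked_cast.h"

void StrideExample() {
  std::vector<int64_t> strides;
  ARROW_CHECK_OK(arrow::internal::ComputeRowMajorStrides(
      arrow::internal::checked_cast<const arrow::FixedWidthType&>(*arrow::int32()),
      {2, 3, 4}, &strides));
  // strides == {48, 16, 4}: 4 * (3*4), 4 * 4, and 4 bytes respectively.
}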
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor.h b/contrib/libs/apache/arrow/cpp/src/arrow/tensor.h
index eebb488272e..91e9ad26066 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/tensor.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor.h
@@ -56,16 +56,16 @@ static inline bool is_tensor_supported(Type::type type_id) {
namespace internal {
ARROW_EXPORT
-Status ComputeRowMajorStrides(const FixedWidthType& type,
- const std::vector<int64_t>& shape,
- std::vector<int64_t>* strides);
+Status ComputeRowMajorStrides(const FixedWidthType& type,
+ const std::vector<int64_t>& shape,
+ std::vector<int64_t>* strides);
+
+ARROW_EXPORT
+Status ComputeColumnMajorStrides(const FixedWidthType& type,
+ const std::vector<int64_t>& shape,
+ std::vector<int64_t>* strides);
ARROW_EXPORT
-Status ComputeColumnMajorStrides(const FixedWidthType& type,
- const std::vector<int64_t>& shape,
- std::vector<int64_t>* strides);
-
-ARROW_EXPORT
bool IsTensorStridesContiguous(const std::shared_ptr<DataType>& type,
const std::vector<int64_t>& shape,
const std::vector<int64_t>& strides);
@@ -180,10 +180,10 @@ class ARROW_EXPORT Tensor {
return *ptr;
}
- Status Validate() const {
- return internal::ValidateTensorParameters(type_, data_, shape_, strides_, dim_names_);
- }
-
+ Status Validate() const {
+ return internal::ValidateTensorParameters(type_, data_, shape_, strides_, dim_names_);
+ }
+
protected:
Tensor() {}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc
index d79739240af..2124d0a4e4b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/coo_converter.cc
@@ -213,9 +213,9 @@ class SparseCOOTensorConverter : private SparseTensorConverterMixin {
// make results
const std::vector<int64_t> indices_shape = {nonzero_count, ndim};
std::vector<int64_t> indices_strides;
- RETURN_NOT_OK(internal::ComputeRowMajorStrides(
+ RETURN_NOT_OK(internal::ComputeRowMajorStrides(
checked_cast<const FixedWidthType&>(*index_value_type_), indices_shape,
- &indices_strides));
+ &indices_strides));
auto coords = std::make_shared<Tensor>(index_value_type_, std::move(indices_buffer),
indices_shape, indices_strides);
ARROW_ASSIGN_OR_RAISE(sparse_index, SparseCOOIndex::Make(coords, true));
@@ -305,7 +305,7 @@ Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCOOTensor(
std::fill_n(values, value_elsize * sparse_tensor->size(), 0);
std::vector<int64_t> strides;
- RETURN_NOT_OK(ComputeRowMajorStrides(value_type, sparse_tensor->shape(), &strides));
+ RETURN_NOT_OK(ComputeRowMajorStrides(value_type, sparse_tensor->shape(), &strides));
const auto* raw_data = sparse_tensor->raw_data();
const int ndim = sparse_tensor->ndim();
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc
index 27173dbc697..77a71d8a12e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csf_converter.cc
@@ -211,7 +211,7 @@ class TensorBuilderFromSparseCSFTensor : private SparseTensorConverterMixin {
}
Result<std::shared_ptr<Tensor>> Build() {
- RETURN_NOT_OK(internal::ComputeRowMajorStrides(value_type_, shape_, &strides_));
+ RETURN_NOT_OK(internal::ComputeRowMajorStrides(value_type_, shape_, &strides_));
ARROW_ASSIGN_OR_RAISE(values_buffer_,
AllocateBuffer(value_elsize_ * tensor_size_, pool_));
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc
index 8c71b1efdaf..137b5d3202f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/tensor/csx_converter.cc
@@ -177,7 +177,7 @@ Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSXMatrix(
std::fill_n(values, value_elsize * tensor_size, 0);
std::vector<int64_t> strides;
- RETURN_NOT_OK(ComputeRowMajorStrides(fw_value_type, shape, &strides));
+ RETURN_NOT_OK(ComputeRowMajorStrides(fw_value_type, shape, &strides));
const auto nc = shape[1];
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/type.cc b/contrib/libs/apache/arrow/cpp/src/arrow/type.cc
index 6551b31575d..41914f43663 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/type.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/type.cc
@@ -68,8 +68,8 @@ constexpr Type::type StructType::type_id;
constexpr Type::type Decimal128Type::type_id;
-constexpr Type::type Decimal256Type::type_id;
-
+constexpr Type::type Decimal256Type::type_id;
+
constexpr Type::type SparseUnionType::type_id;
constexpr Type::type DenseUnionType::type_id;
@@ -130,8 +130,8 @@ std::string ToString(Type::type id) {
TO_STRING_CASE(HALF_FLOAT)
TO_STRING_CASE(FLOAT)
TO_STRING_CASE(DOUBLE)
- TO_STRING_CASE(DECIMAL128)
- TO_STRING_CASE(DECIMAL256)
+ TO_STRING_CASE(DECIMAL128)
+ TO_STRING_CASE(DECIMAL256)
TO_STRING_CASE(DATE32)
TO_STRING_CASE(DATE64)
TO_STRING_CASE(TIME32)
@@ -188,32 +188,32 @@ int GetByteWidth(const DataType& type) {
namespace {
-struct PhysicalTypeVisitor {
- const std::shared_ptr<DataType>& real_type;
- std::shared_ptr<DataType> result;
-
- Status Visit(const DataType&) {
- result = real_type;
- return Status::OK();
- }
-
- template <typename Type, typename PhysicalType = typename Type::PhysicalType>
- Status Visit(const Type&) {
- result = TypeTraits<PhysicalType>::type_singleton();
- return Status::OK();
- }
-};
-
-} // namespace
-
-std::shared_ptr<DataType> GetPhysicalType(const std::shared_ptr<DataType>& real_type) {
- PhysicalTypeVisitor visitor{real_type, {}};
- ARROW_CHECK_OK(VisitTypeInline(*real_type, &visitor));
- return std::move(visitor.result);
-}
-
-namespace {
-
+struct PhysicalTypeVisitor {
+ const std::shared_ptr<DataType>& real_type;
+ std::shared_ptr<DataType> result;
+
+ Status Visit(const DataType&) {
+ result = real_type;
+ return Status::OK();
+ }
+
+ template <typename Type, typename PhysicalType = typename Type::PhysicalType>
+ Status Visit(const Type&) {
+ result = TypeTraits<PhysicalType>::type_singleton();
+ return Status::OK();
+ }
+};
+
+} // namespace
+
+std::shared_ptr<DataType> GetPhysicalType(const std::shared_ptr<DataType>& real_type) {
+ PhysicalTypeVisitor visitor{real_type, {}};
+ ARROW_CHECK_OK(VisitTypeInline(*real_type, &visitor));
+ return std::move(visitor.result);
+}
+
+namespace {
+
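Aside: a sketch of what GetPhysicalType (moved above) yields, assuming arrow/api.h pulls in its declaration; logical types map to their storage representation, all others map to themselves:

#include "arrow/api.h"

auto t1 = arrow::GetPhysicalType(arrow::date32());                            // int32
auto t2 = arrow::GetPhysicalType(arrow::timestamp(arrow::TimeUnit::MILLI));   // int64
auto t3 = arrow::GetPhysicalType(arrow::utf8());                              // utf8, unchanged
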
using internal::checked_cast;
// Merges `existing` and `other` if one of them is of NullType, otherwise
@@ -771,44 +771,44 @@ std::vector<std::shared_ptr<Field>> StructType::GetAllFieldsByName(
return result;
}
-Result<std::shared_ptr<DataType>> DecimalType::Make(Type::type type_id, int32_t precision,
- int32_t scale) {
- if (type_id == Type::DECIMAL128) {
- return Decimal128Type::Make(precision, scale);
- } else if (type_id == Type::DECIMAL256) {
- return Decimal256Type::Make(precision, scale);
- } else {
- return Status::Invalid("Not a decimal type_id: ", type_id);
- }
-}
-
-// Taken from the Apache Impala codebase. The comments next
-// to the return values are the maximum value that can be represented in 2's
-// complement with the returned number of bytes.
-int32_t DecimalType::DecimalSize(int32_t precision) {
- DCHECK_GE(precision, 1) << "decimal precision must be greater than or equal to 1, got "
- << precision;
-
- // Generated in python with:
- // >>> decimal_size = lambda prec: int(math.ceil((prec * math.log2(10) + 1) / 8))
- // >>> [-1] + [decimal_size(i) for i in range(1, 77)]
- constexpr int32_t kBytes[] = {
- -1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9,
- 9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 17,
- 17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25,
- 26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32};
-
- if (precision <= 76) {
- return kBytes[precision];
- }
- return static_cast<int32_t>(std::ceil((precision / 8.0) * std::log2(10) + 1));
-}
-
+Result<std::shared_ptr<DataType>> DecimalType::Make(Type::type type_id, int32_t precision,
+ int32_t scale) {
+ if (type_id == Type::DECIMAL128) {
+ return Decimal128Type::Make(precision, scale);
+ } else if (type_id == Type::DECIMAL256) {
+ return Decimal256Type::Make(precision, scale);
+ } else {
+ return Status::Invalid("Not a decimal type_id: ", type_id);
+ }
+}
+
+// Taken from the Apache Impala codebase. The comments next
+// to the return values are the maximum value that can be represented in 2's
+// complement with the returned number of bytes.
+int32_t DecimalType::DecimalSize(int32_t precision) {
+ DCHECK_GE(precision, 1) << "decimal precision must be greater than or equal to 1, got "
+ << precision;
+
+ // Generated in python with:
+ // >>> decimal_size = lambda prec: int(math.ceil((prec * math.log2(10) + 1) / 8))
+ // >>> [-1] + [decimal_size(i) for i in range(1, 77)]
+ constexpr int32_t kBytes[] = {
+ -1, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 9,
+ 9, 9, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 16, 17,
+ 17, 18, 18, 18, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25, 25,
+ 26, 26, 26, 27, 27, 28, 28, 28, 29, 29, 30, 30, 31, 31, 31, 32, 32};
+
+ if (precision <= 76) {
+ return kBytes[precision];
+ }
+ return static_cast<int32_t>(std::ceil((precision / 8.0) * std::log2(10) + 1));
+}
+
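Aside: a quick check of the formula above: precision 38 (the Decimal128 maximum) needs ceil((38 * log2(10) + 1) / 8) = ceil(127.23 / 8) = 16 bytes, matching kBytes[38] == 16, and precision 76 (the Decimal256 maximum) similarly yields 32 bytes, matching the table's last entry.
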
// ----------------------------------------------------------------------
// Decimal128 type
Decimal128Type::Decimal128Type(int32_t precision, int32_t scale)
- : DecimalType(type_id, 16, precision, scale) {
+ : DecimalType(type_id, 16, precision, scale) {
ARROW_CHECK_GE(precision, kMinPrecision);
ARROW_CHECK_LE(precision, kMaxPrecision);
}
@@ -821,22 +821,22 @@ Result<std::shared_ptr<DataType>> Decimal128Type::Make(int32_t precision, int32_
}
// ----------------------------------------------------------------------
-// Decimal256 type
-
-Decimal256Type::Decimal256Type(int32_t precision, int32_t scale)
- : DecimalType(type_id, 32, precision, scale) {
- ARROW_CHECK_GE(precision, kMinPrecision);
- ARROW_CHECK_LE(precision, kMaxPrecision);
-}
-
-Result<std::shared_ptr<DataType>> Decimal256Type::Make(int32_t precision, int32_t scale) {
- if (precision < kMinPrecision || precision > kMaxPrecision) {
- return Status::Invalid("Decimal precision out of range: ", precision);
- }
- return std::make_shared<Decimal256Type>(precision, scale);
-}
-
-// ----------------------------------------------------------------------
+// Decimal256 type
+
+Decimal256Type::Decimal256Type(int32_t precision, int32_t scale)
+ : DecimalType(type_id, 32, precision, scale) {
+ ARROW_CHECK_GE(precision, kMinPrecision);
+ ARROW_CHECK_LE(precision, kMaxPrecision);
+}
+
+Result<std::shared_ptr<DataType>> Decimal256Type::Make(int32_t precision, int32_t scale) {
+ if (precision < kMinPrecision || precision > kMaxPrecision) {
+ return Status::Invalid("Decimal precision out of range: ", precision);
+ }
+ return std::make_shared<Decimal256Type>(precision, scale);
+}
+
+// ----------------------------------------------------------------------
// Dictionary-encoded type
Status DictionaryType::ValidateParameters(const DataType& index_type,
@@ -894,15 +894,15 @@ size_t FieldPath::hash() const {
}
std::string FieldPath::ToString() const {
- if (this->indices().empty()) {
- return "FieldPath(empty)";
- }
-
+ if (this->indices().empty()) {
+ return "FieldPath(empty)";
+ }
+
std::string repr = "FieldPath(";
for (auto index : this->indices()) {
repr += std::to_string(index) + " ";
}
- repr.back() = ')';
+ repr.back() = ')';
return repr;
}
@@ -964,10 +964,10 @@ struct FieldPathGetImpl {
int depth = 0;
const T* out;
for (int index : path->indices()) {
- if (children == nullptr) {
- return Status::NotImplemented("Get child data of non-struct array");
- }
-
+ if (children == nullptr) {
+ return Status::NotImplemented("Get child data of non-struct array");
+ }
+
if (index < 0 || static_cast<size_t>(index) >= children->size()) {
*out_of_range_depth = depth;
return nullptr;
@@ -1005,11 +1005,11 @@ struct FieldPathGetImpl {
const ArrayDataVector& child_data) {
return FieldPathGetImpl::Get(
path, &child_data,
- [](const std::shared_ptr<ArrayData>& data) -> const ArrayDataVector* {
- if (data->type->id() != Type::STRUCT) {
- return nullptr;
+ [](const std::shared_ptr<ArrayData>& data) -> const ArrayDataVector* {
+ if (data->type->id() != Type::STRUCT) {
+ return nullptr;
}
- return &data->child_data;
+ return &data->child_data;
});
}
};
@@ -1032,21 +1032,21 @@ Result<std::shared_ptr<Field>> FieldPath::Get(const FieldVector& fields) const {
Result<std::shared_ptr<Array>> FieldPath::Get(const RecordBatch& batch) const {
ARROW_ASSIGN_OR_RAISE(auto data, FieldPathGetImpl::Get(this, batch.column_data()));
- return MakeArray(std::move(data));
+ return MakeArray(std::move(data));
}
-Result<std::shared_ptr<Array>> FieldPath::Get(const Array& array) const {
- ARROW_ASSIGN_OR_RAISE(auto data, Get(*array.data()));
- return MakeArray(std::move(data));
+Result<std::shared_ptr<Array>> FieldPath::Get(const Array& array) const {
+ ARROW_ASSIGN_OR_RAISE(auto data, Get(*array.data()));
+ return MakeArray(std::move(data));
+}
+
+Result<std::shared_ptr<ArrayData>> FieldPath::Get(const ArrayData& data) const {
+ if (data.type->id() != Type::STRUCT) {
+ return Status::NotImplemented("Get child data of non-struct array");
+ }
+ return FieldPathGetImpl::Get(this, data.child_data);
}
-Result<std::shared_ptr<ArrayData>> FieldPath::Get(const ArrayData& data) const {
- if (data.type->id() != Type::STRUCT) {
- return Status::NotImplemented("Get child data of non-struct array");
- }
- return FieldPathGetImpl::Get(this, data.child_data);
-}
-
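The `FieldPath::Get` overloads above resolve a path of child indices, and the `ArrayData` overload guards against non-struct inputs. A hedged sketch (the `struct_array` variable is assumed for illustration, not part of the patch):

    // Assumes `struct_array` is a std::shared_ptr<arrow::Array> of struct type.
    arrow::FieldPath path({0});  // first child of the struct
    arrow::Result<std::shared_ptr<arrow::Array>> child = path.Get(*struct_array);
    // On a non-struct array this yields Status::NotImplemented, per the
    // ArrayData overload above.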
FieldRef::FieldRef(FieldPath indices) : impl_(std::move(indices)) {
DCHECK_GT(util::get<FieldPath>(impl_).indices().size(), 0);
}
@@ -1054,13 +1054,13 @@ FieldRef::FieldRef(FieldPath indices) : impl_(std::move(indices)) {
void FieldRef::Flatten(std::vector<FieldRef> children) {
// flatten children
struct Visitor {
- void operator()(std::string* name) { *out++ = FieldRef(std::move(*name)); }
+ void operator()(std::string* name) { *out++ = FieldRef(std::move(*name)); }
- void operator()(FieldPath* indices) { *out++ = FieldRef(std::move(*indices)); }
+ void operator()(FieldPath* indices) { *out++ = FieldRef(std::move(*indices)); }
- void operator()(std::vector<FieldRef>* children) {
- for (auto& child : *children) {
- util::visit(*this, &child.impl_);
+ void operator()(std::vector<FieldRef>* children) {
+ for (auto& child : *children) {
+ util::visit(*this, &child.impl_);
}
}
@@ -1069,7 +1069,7 @@ void FieldRef::Flatten(std::vector<FieldRef> children) {
std::vector<FieldRef> out;
Visitor visitor{std::back_inserter(out)};
- visitor(&children);
+ visitor(&children);
DCHECK(!out.empty());
DCHECK(std::none_of(out.begin(), out.end(),
@@ -1195,10 +1195,10 @@ std::string FieldRef::ToString() const {
}
std::vector<FieldPath> FieldRef::FindAll(const Schema& schema) const {
- if (auto name = this->name()) {
- return internal::MapVector([](int i) { return FieldPath{i}; },
- schema.GetAllFieldIndices(*name));
- }
+ if (auto name = this->name()) {
+ return internal::MapVector([](int i) { return FieldPath{i}; },
+ schema.GetAllFieldIndices(*name));
+ }
return FindAll(schema.fields());
}
@@ -1296,11 +1296,11 @@ std::vector<FieldPath> FieldRef::FindAll(const FieldVector& fields) const {
return util::visit(Visitor{fields}, impl_);
}
-std::vector<FieldPath> FieldRef::FindAll(const ArrayData& array) const {
- return FindAll(*array.type);
+std::vector<FieldPath> FieldRef::FindAll(const ArrayData& array) const {
+ return FindAll(*array.type);
}
-std::vector<FieldPath> FieldRef::FindAll(const Array& array) const {
+std::vector<FieldPath> FieldRef::FindAll(const Array& array) const {
return FindAll(*array.type());
}
@@ -1313,56 +1313,56 @@ void PrintTo(const FieldRef& ref, std::ostream* os) { *os << ref.ToString(); }
// ----------------------------------------------------------------------
// Schema implementation
-std::string EndiannessToString(Endianness endianness) {
- switch (endianness) {
- case Endianness::Little:
- return "little";
- case Endianness::Big:
- return "big";
- default:
- DCHECK(false) << "invalid endianness";
- return "???";
- }
-}
-
+std::string EndiannessToString(Endianness endianness) {
+ switch (endianness) {
+ case Endianness::Little:
+ return "little";
+ case Endianness::Big:
+ return "big";
+ default:
+ DCHECK(false) << "invalid endianness";
+ return "???";
+ }
+}
+
class Schema::Impl {
public:
- Impl(std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
+ Impl(std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
std::shared_ptr<const KeyValueMetadata> metadata)
: fields_(std::move(fields)),
- endianness_(endianness),
+ endianness_(endianness),
name_to_index_(CreateNameToIndexMap(fields_)),
metadata_(std::move(metadata)) {}
std::vector<std::shared_ptr<Field>> fields_;
- Endianness endianness_;
+ Endianness endianness_;
std::unordered_multimap<std::string, int> name_to_index_;
std::shared_ptr<const KeyValueMetadata> metadata_;
};
-Schema::Schema(std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
- std::shared_ptr<const KeyValueMetadata> metadata)
- : detail::Fingerprintable(),
- impl_(new Impl(std::move(fields), endianness, std::move(metadata))) {}
-
+Schema::Schema(std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
+ std::shared_ptr<const KeyValueMetadata> metadata)
+ : detail::Fingerprintable(),
+ impl_(new Impl(std::move(fields), endianness, std::move(metadata))) {}
+
Schema::Schema(std::vector<std::shared_ptr<Field>> fields,
std::shared_ptr<const KeyValueMetadata> metadata)
: detail::Fingerprintable(),
- impl_(new Impl(std::move(fields), Endianness::Native, std::move(metadata))) {}
+ impl_(new Impl(std::move(fields), Endianness::Native, std::move(metadata))) {}
Schema::Schema(const Schema& schema)
: detail::Fingerprintable(), impl_(new Impl(*schema.impl_)) {}
-Schema::~Schema() = default;
+Schema::~Schema() = default;
+
+std::shared_ptr<Schema> Schema::WithEndianness(Endianness endianness) const {
+ return std::make_shared<Schema>(impl_->fields_, endianness, impl_->metadata_);
+}
+
+Endianness Schema::endianness() const { return impl_->endianness_; }
+
+bool Schema::is_native_endian() const { return impl_->endianness_ == Endianness::Native; }
-std::shared_ptr<Schema> Schema::WithEndianness(Endianness endianness) const {
- return std::make_shared<Schema>(impl_->fields_, endianness, impl_->metadata_);
-}
-
-Endianness Schema::endianness() const { return impl_->endianness_; }
-
-bool Schema::is_native_endian() const { return impl_->endianness_ == Endianness::Native; }
-
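Together these accessors let callers pin a schema to an explicit byte order without mutating it; `WithEndianness` copies fields and metadata into a fresh `Schema`. A short sketch:

    auto s = arrow::schema({arrow::field("x", arrow::int32())});
    // A freshly constructed schema reports the platform's byte order.
    bool native = s->is_native_endian();  // true
    auto big = s->WithEndianness(arrow::Endianness::Big);
    // big->endianness() == arrow::Endianness::Big; `s` itself is unchanged.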
int Schema::num_fields() const { return static_cast<int>(impl_->fields_.size()); }
const std::shared_ptr<Field>& Schema::field(int i) const {
@@ -1380,11 +1380,11 @@ bool Schema::Equals(const Schema& other, bool check_metadata) const {
return true;
}
- // checks endianness equality
- if (endianness() != other.endianness()) {
- return false;
- }
-
+ // checks endianness equality
+ if (endianness() != other.endianness()) {
+ return false;
+ }
+
// checks field equality
if (num_fields() != other.num_fields()) {
return false;
@@ -1509,7 +1509,7 @@ std::shared_ptr<Schema> Schema::WithMetadata(
return std::make_shared<Schema>(impl_->fields_, metadata);
}
-const std::shared_ptr<const KeyValueMetadata>& Schema::metadata() const {
+const std::shared_ptr<const KeyValueMetadata>& Schema::metadata() const {
return impl_->metadata_;
}
@@ -1529,10 +1529,10 @@ std::string Schema::ToString(bool show_metadata) const {
++i;
}
- if (impl_->endianness_ != Endianness::Native) {
- buffer << "\n-- endianness: " << EndiannessToString(impl_->endianness_) << " --";
- }
-
+ if (impl_->endianness_ != Endianness::Native) {
+ buffer << "\n-- endianness: " << EndiannessToString(impl_->endianness_) << " --";
+ }
+
if (show_metadata && HasMetadata()) {
buffer << impl_->metadata_->ToString();
}
@@ -1712,12 +1712,12 @@ std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>> fields,
return std::make_shared<Schema>(std::move(fields), std::move(metadata));
}
-std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>> fields,
- Endianness endianness,
- std::shared_ptr<const KeyValueMetadata> metadata) {
- return std::make_shared<Schema>(std::move(fields), endianness, std::move(metadata));
-}
-
+std::shared_ptr<Schema> schema(std::vector<std::shared_ptr<Field>> fields,
+ Endianness endianness,
+ std::shared_ptr<const KeyValueMetadata> metadata) {
+ return std::make_shared<Schema>(std::move(fields), endianness, std::move(metadata));
+}
+
Result<std::shared_ptr<Schema>> UnifySchemas(
const std::vector<std::shared_ptr<Schema>>& schemas,
const Field::MergeOptions field_merge_options) {
@@ -1876,7 +1876,7 @@ std::string Schema::ComputeFingerprint() const {
}
ss << field_fingerprint << ";";
}
- ss << (endianness() == Endianness::Little ? "L" : "B");
+ ss << (endianness() == Endianness::Little ? "L" : "B");
ss << "}";
return ss.str();
}
@@ -2248,35 +2248,35 @@ std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type,
std::move(metadata));
}
-std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type,
- std::shared_ptr<const KeyValueMetadata> metadata) {
- return std::make_shared<Field>(std::move(name), std::move(type), /*nullable=*/true,
- std::move(metadata));
-}
-
+std::shared_ptr<Field> field(std::string name, std::shared_ptr<DataType> type,
+ std::shared_ptr<const KeyValueMetadata> metadata) {
+ return std::make_shared<Field>(std::move(name), std::move(type), /*nullable=*/true,
+ std::move(metadata));
+}
+
std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale) {
- return precision <= Decimal128Type::kMaxPrecision ? decimal128(precision, scale)
- : decimal256(precision, scale);
-}
-
-std::shared_ptr<DataType> decimal128(int32_t precision, int32_t scale) {
+ return precision <= Decimal128Type::kMaxPrecision ? decimal128(precision, scale)
+ : decimal256(precision, scale);
+}
+
+std::shared_ptr<DataType> decimal128(int32_t precision, int32_t scale) {
return std::make_shared<Decimal128Type>(precision, scale);
}
-std::shared_ptr<DataType> decimal256(int32_t precision, int32_t scale) {
- return std::make_shared<Decimal256Type>(precision, scale);
-}
-
+std::shared_ptr<DataType> decimal256(int32_t precision, int32_t scale) {
+ return std::make_shared<Decimal256Type>(precision, scale);
+}
+
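The `decimal` convenience factory above picks the narrowest type for the requested precision: up to 38 digits yields `decimal128`, anything larger `decimal256`. For example:

    auto a = arrow::decimal(10, 2);     // decimal128(10, 2)
    auto b = arrow::decimal(45, 2);     // 45 > 38, so decimal256(45, 2)
    auto c = arrow::decimal128(38, 0);  // force the 128-bit variant explicitly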
std::string Decimal128Type::ToString() const {
std::stringstream s;
- s << "decimal128(" << precision_ << ", " << scale_ << ")";
+ s << "decimal128(" << precision_ << ", " << scale_ << ")";
+ return s.str();
+}
+
+std::string Decimal256Type::ToString() const {
+ std::stringstream s;
+ s << "decimal256(" << precision_ << ", " << scale_ << ")";
return s.str();
}
-std::string Decimal256Type::ToString() const {
- std::stringstream s;
- s << "decimal256(" << precision_ << ", " << scale_ << ")";
- return s.str();
-}
-
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/type.h b/contrib/libs/apache/arrow/cpp/src/arrow/type.h
index eb65603e0ea..b933da66089 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/type.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/type.h
@@ -30,7 +30,7 @@
#include "arrow/result.h"
#include "arrow/type_fwd.h" // IWYU pragma: export
#include "arrow/util/checked_cast.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/macros.h"
#include "arrow/util/variant.h"
#include "arrow/util/visibility.h"
@@ -127,7 +127,7 @@ class ARROW_EXPORT DataType : public detail::Fingerprintable {
ARROW_DEPRECATED("Use field(i)")
const std::shared_ptr<Field>& child(int i) const { return field(i); }
- /// Returns the child-field at index i.
+ /// Returns the child-field at index i.
const std::shared_ptr<Field>& field(int i) const { return children_[i]; }
ARROW_DEPRECATED("Use fields()")
@@ -182,18 +182,18 @@ class ARROW_EXPORT DataType : public detail::Fingerprintable {
ARROW_EXPORT
std::ostream& operator<<(std::ostream& os, const DataType& type);
-/// \brief Return the compatible physical data type
-///
-/// Some types may have distinct logical meanings but the exact same physical
-/// representation. For example, TimestampType has Int64Type as a physical
-/// type (defined as TimestampType::PhysicalType).
-///
-/// The return value is as follows:
-/// - if a `PhysicalType` alias exists in the concrete type class, return
-/// an instance of `PhysicalType`.
-/// - otherwise, return the input type itself.
-std::shared_ptr<DataType> GetPhysicalType(const std::shared_ptr<DataType>& type);
-
+/// \brief Return the compatible physical data type
+///
+/// Some types may have distinct logical meanings but the exact same physical
+/// representation. For example, TimestampType has Int64Type as a physical
+/// type (defined as TimestampType::PhysicalType).
+///
+/// The return value is as follows:
+/// - if a `PhysicalType` alias exists in the concrete type class, return
+/// an instance of `PhysicalType`.
+/// - otherwise, return the input type itself.
+std::shared_ptr<DataType> GetPhysicalType(const std::shared_ptr<DataType>& type);
+
/// \brief Base class for all fixed-width data types
class ARROW_EXPORT FixedWidthType : public DataType {
public:
@@ -626,10 +626,10 @@ class ARROW_EXPORT LargeListType : public BaseListType {
/// \brief Concrete type class for map data
///
/// Map data is nested data where each value is a variable number of
-/// key-item pairs. Its physical representation is the same as
-/// a list of `{key, item}` structs.
-///
-/// Maps can be recursively nested, for example map(utf8, map(utf8, int32)).
+/// key-item pairs. Its physical representation is the same as
+/// a list of `{key, item}` structs.
+///
+/// Maps can be recursively nested, for example map(utf8, map(utf8, int32)).
class ARROW_EXPORT MapType : public ListType {
public:
static constexpr Type::type type_id = Type::MAP;
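As the comment notes, a map is physically a list of `{key, item}` structs, and the `arrow::map` factory composes recursively. A sketch:

    // Physically list<struct<key: utf8, value: int32>> with non-nullable keys.
    auto simple = arrow::map(arrow::utf8(), arrow::int32());
    // Recursive nesting, mirroring the comment's map(utf8, map(utf8, int32)):
    auto nested = arrow::map(arrow::utf8(),
                             arrow::map(arrow::utf8(), arrow::int32()));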
@@ -876,22 +876,22 @@ class ARROW_EXPORT StructType : public NestedType {
/// \brief Base type class for (fixed-size) decimal data
class ARROW_EXPORT DecimalType : public FixedSizeBinaryType {
public:
- explicit DecimalType(Type::type type_id, int32_t byte_width, int32_t precision,
- int32_t scale)
- : FixedSizeBinaryType(byte_width, type_id), precision_(precision), scale_(scale) {}
-
- /// Constructs concrete decimal types
- static Result<std::shared_ptr<DataType>> Make(Type::type type_id, int32_t precision,
- int32_t scale);
-
+ explicit DecimalType(Type::type type_id, int32_t byte_width, int32_t precision,
+ int32_t scale)
+ : FixedSizeBinaryType(byte_width, type_id), precision_(precision), scale_(scale) {}
+
+ /// Constructs concrete decimal types
+ static Result<std::shared_ptr<DataType>> Make(Type::type type_id, int32_t precision,
+ int32_t scale);
+
int32_t precision() const { return precision_; }
int32_t scale() const { return scale_; }
- /// \brief Returns the number of bytes needed for precision.
- ///
- /// precision must be >= 1
- static int32_t DecimalSize(int32_t precision);
-
+ /// \brief Returns the number of bytes needed for precision.
+ ///
+ /// precision must be >= 1
+ static int32_t DecimalSize(int32_t precision);
+
protected:
std::string ComputeFingerprint() const override;
@@ -900,24 +900,24 @@ class ARROW_EXPORT DecimalType : public FixedSizeBinaryType {
};
/// \brief Concrete type class for 128-bit decimal data
-///
-/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
-/// integer. The precision is the number of significant digits that the
-/// decimal type can represent; the scale is the number of digits after
-/// the decimal point (note the scale can be negative).
-///
-/// As an example, `Decimal128Type(7, 3)` can exactly represent the numbers
-/// 1234.567 and -1234.567 (encoded internally as the 128-bit integers
-/// 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567.
-///
-/// Decimal128Type has a maximum precision of 38 significant digits
-/// (also available as Decimal128Type::kMaxPrecision).
-/// If higher precision is needed, consider using Decimal256Type.
+///
+/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
+/// integer. The precision is the number of significant digits that the
+/// decimal type can represent; the scale is the number of digits after
+/// the decimal point (note the scale can be negative).
+///
+/// As an example, `Decimal128Type(7, 3)` can exactly represent the numbers
+/// 1234.567 and -1234.567 (encoded internally as the 128-bit integers
+/// 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567.
+///
+/// Decimal128Type has a maximum precision of 38 significant digits
+/// (also available as Decimal128Type::kMaxPrecision).
+/// If higher precision is needed, consider using Decimal256Type.
class ARROW_EXPORT Decimal128Type : public DecimalType {
public:
- static constexpr Type::type type_id = Type::DECIMAL128;
+ static constexpr Type::type type_id = Type::DECIMAL128;
- static constexpr const char* type_name() { return "decimal128"; }
+ static constexpr const char* type_name() { return "decimal128"; }
/// Decimal128Type constructor that aborts on invalid input.
explicit Decimal128Type(int32_t precision, int32_t scale);
@@ -926,47 +926,47 @@ class ARROW_EXPORT Decimal128Type : public DecimalType {
static Result<std::shared_ptr<DataType>> Make(int32_t precision, int32_t scale);
std::string ToString() const override;
- std::string name() const override { return "decimal128"; }
+ std::string name() const override { return "decimal128"; }
static constexpr int32_t kMinPrecision = 1;
static constexpr int32_t kMaxPrecision = 38;
- static constexpr int32_t kByteWidth = 16;
+ static constexpr int32_t kByteWidth = 16;
+};
+
+/// \brief Concrete type class for 256-bit decimal data
+///
+/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
+/// integer. The precision is the number of significant digits that the
+/// decimal type can represent; the scale is the number of digits after
+/// the decimal point (note the scale can be negative).
+///
+/// Decimal256Type has a maximum precision of 76 significant digits
+/// (also available as Decimal256Type::kMaxPrecision).
+///
+/// For most use cases, the maximum precision offered by Decimal128Type
+/// is sufficient, and it will result in a more compact and more efficient
+/// encoding.
+class ARROW_EXPORT Decimal256Type : public DecimalType {
+ public:
+ static constexpr Type::type type_id = Type::DECIMAL256;
+
+ static constexpr const char* type_name() { return "decimal256"; }
+
+ /// Decimal256Type constructor that aborts on invalid input.
+ explicit Decimal256Type(int32_t precision, int32_t scale);
+
+ /// Decimal256Type constructor that returns an error on invalid input.
+ static Result<std::shared_ptr<DataType>> Make(int32_t precision, int32_t scale);
+
+ std::string ToString() const override;
+ std::string name() const override { return "decimal256"; }
+
+ static constexpr int32_t kMinPrecision = 1;
+ static constexpr int32_t kMaxPrecision = 76;
+ static constexpr int32_t kByteWidth = 32;
};
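The doc comment's `Decimal128Type(7, 3)` example is easy to verify directly: values are stored as the scaled integer and re-scaled on formatting. A sketch using `arrow::Decimal128` from arrow/util/decimal.h:

    #include "arrow/util/decimal.h"

    // 1234.567 at scale 3 is stored as the 128-bit integer 1234567.
    arrow::Decimal128 v(1234567);
    std::string text = v.ToString(/*scale=*/3);  // "1234.567"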
-/// \brief Concrete type class for 256-bit decimal data
-///
-/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
-/// integer. The precision is the number of significant digits that the
-/// decimal type can represent; the scale is the number of digits after
-/// the decimal point (note the scale can be negative).
-///
-/// Decimal256Type has a maximum precision of 76 significant digits.
-/// (also available as Decimal256Type::kMaxPrecision).
-///
-/// For most use cases, the maximum precision offered by Decimal128Type
-/// is sufficient, and it will result in a more compact and more efficient
-/// encoding.
-class ARROW_EXPORT Decimal256Type : public DecimalType {
- public:
- static constexpr Type::type type_id = Type::DECIMAL256;
-
- static constexpr const char* type_name() { return "decimal256"; }
-
- /// Decimal256Type constructor that aborts on invalid input.
- explicit Decimal256Type(int32_t precision, int32_t scale);
-
- /// Decimal256Type constructor that returns an error on invalid input.
- static Result<std::shared_ptr<DataType>> Make(int32_t precision, int32_t scale);
-
- std::string ToString() const override;
- std::string name() const override { return "decimal256"; }
-
- static constexpr int32_t kMinPrecision = 1;
- static constexpr int32_t kMaxPrecision = 76;
- static constexpr int32_t kByteWidth = 32;
-};
-
-/// \brief Base type class for union data
+/// \brief Base type class for union data
class ARROW_EXPORT UnionType : public NestedType {
public:
static constexpr int8_t kMaxTypeCode = 127;
@@ -1014,17 +1014,17 @@ class ARROW_EXPORT UnionType : public NestedType {
std::vector<int> child_ids_;
};
-/// \brief Concrete type class for sparse union data
-///
-/// A sparse union is a nested type where each logical value is taken from
-/// a single child. A buffer of 8-bit type ids indicates which child
-/// a given logical value is to be taken from.
-///
-/// In a sparse union, each child array should have the same length as the
-/// union array, regardless of the actual number of union values that
-/// refer to it.
-///
-/// Note that, unlike most other types, unions don't have a top-level validity bitmap.
+/// \brief Concrete type class for sparse union data
+///
+/// A sparse union is a nested type where each logical value is taken from
+/// a single child. A buffer of 8-bit type ids indicates which child
+/// a given logical value is to be taken from.
+///
+/// In a sparse union, each child array should have the same length as the
+/// union array, regardless of the actual number of union values that
+/// refer to it.
+///
+/// Note that, unlike most other types, unions don't have a top-level validity bitmap.
class ARROW_EXPORT SparseUnionType : public UnionType {
public:
static constexpr Type::type type_id = Type::SPARSE_UNION;
@@ -1041,20 +1041,20 @@ class ARROW_EXPORT SparseUnionType : public UnionType {
std::string name() const override { return "sparse_union"; }
};
-/// \brief Concrete type class for dense union data
-///
-/// A dense union is a nested type where each logical value is taken from
-/// a single child, at a specific offset. A buffer of 8-bit type ids
-/// indicates which child a given logical value is to be taken from,
-/// and a buffer of 32-bit offsets indicates at which physical position
-/// in the given child array the logical value is to be taken from.
-///
-/// Unlike a sparse union, a dense union allows encoding only the child array
-/// values which are actually referred to by the union array. This is
-/// counterbalanced by the additional footprint of the offsets buffer, and
-/// the additional indirection cost when looking up values.
-///
-/// Note that, unlike most other types, unions don't have a top-level validity bitmap.
+/// \brief Concrete type class for dense union data
+///
+/// A dense union is a nested type where each logical value is taken from
+/// a single child, at a specific offset. A buffer of 8-bit type ids
+/// indicates which child a given logical value is to be taken from,
+/// and a buffer of 32-bit offsets indicates at which physical position
+/// in the given child array the logical value is to be taken from.
+///
+/// Unlike a sparse union, a dense union allows encoding only the child array
+/// values which are actually referred to by the union array. This is
+/// counterbalanced by the additional footprint of the offsets buffer, and
+/// the additional indirection cost when looking up values.
+///
+/// Note that, unlike most other types, unions don't have a top-level validity bitmap.
class ARROW_EXPORT DenseUnionType : public UnionType {
public:
static constexpr Type::type type_id = Type::DENSE_UNION;
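The sparse/dense distinction described above is purely physical; both variants are built from the same child fields. A hedged sketch using the `sparse_union`/`dense_union` factories (type codes default to 0..n-1 when omitted):

    arrow::FieldVector children = {arrow::field("i", arrow::int32()),
                                   arrow::field("s", arrow::utf8())};
    auto sparse = arrow::sparse_union(children);  // children span the full length
    auto dense = arrow::dense_union(children);    // adds a 32-bit offsets buffer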
@@ -1413,7 +1413,7 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType {
/// FieldPaths provide a number of accessors for drilling down to potentially nested
/// children. They are overloaded for convenience to support Schema (returns a field),
/// DataType (returns a child field), Field (returns a child field of this field's type)
-/// Array (returns a child array), RecordBatch (returns a column).
+/// Array (returns a child array), RecordBatch (returns a column).
class ARROW_EXPORT FieldPath {
public:
FieldPath() = default;
@@ -1427,11 +1427,11 @@ class ARROW_EXPORT FieldPath {
std::string ToString() const;
size_t hash() const;
- struct Hash {
- size_t operator()(const FieldPath& path) const { return path.hash(); }
- };
+ struct Hash {
+ size_t operator()(const FieldPath& path) const { return path.hash(); }
+ };
- bool empty() const { return indices_.empty(); }
+ bool empty() const { return indices_.empty(); }
bool operator==(const FieldPath& other) const { return indices() == other.indices(); }
bool operator!=(const FieldPath& other) const { return indices() != other.indices(); }
@@ -1449,9 +1449,9 @@ class ARROW_EXPORT FieldPath {
/// \brief Retrieve the referenced column from a RecordBatch or Table
Result<std::shared_ptr<Array>> Get(const RecordBatch& batch) const;
- /// \brief Retrieve the referenced child from an Array or ArrayData
+ /// \brief Retrieve the referenced child from an Array or ArrayData
Result<std::shared_ptr<Array>> Get(const Array& array) const;
- Result<std::shared_ptr<ArrayData>> Get(const ArrayData& data) const;
+ Result<std::shared_ptr<ArrayData>> Get(const ArrayData& data) const;
private:
std::vector<int> indices_;
@@ -1543,13 +1543,13 @@ class ARROW_EXPORT FieldRef {
std::string ToString() const;
size_t hash() const;
- struct Hash {
- size_t operator()(const FieldRef& ref) const { return ref.hash(); }
- };
+ struct Hash {
+ size_t operator()(const FieldRef& ref) const { return ref.hash(); }
+ };
+
+ explicit operator bool() const { return Equals(FieldPath{}); }
+ bool operator!() const { return !Equals(FieldPath{}); }
- explicit operator bool() const { return Equals(FieldPath{}); }
- bool operator!() const { return !Equals(FieldPath{}); }
-
bool IsFieldPath() const { return util::holds_alternative<FieldPath>(impl_); }
bool IsName() const { return util::holds_alternative<std::string>(impl_); }
bool IsNested() const {
@@ -1558,13 +1558,13 @@ class ARROW_EXPORT FieldRef {
return true;
}
- const FieldPath* field_path() const {
- return IsFieldPath() ? &util::get<FieldPath>(impl_) : NULLPTR;
- }
- const std::string* name() const {
- return IsName() ? &util::get<std::string>(impl_) : NULLPTR;
- }
-
+ const FieldPath* field_path() const {
+ return IsFieldPath() ? &util::get<FieldPath>(impl_) : NULLPTR;
+ }
+ const std::string* name() const {
+ return IsName() ? &util::get<std::string>(impl_) : NULLPTR;
+ }
+
/// \brief Retrieve FieldPath of every child field which matches this FieldRef.
std::vector<FieldPath> FindAll(const Schema& schema) const;
std::vector<FieldPath> FindAll(const Field& field) const;
@@ -1572,7 +1572,7 @@ class ARROW_EXPORT FieldRef {
std::vector<FieldPath> FindAll(const FieldVector& fields) const;
/// \brief Convenience function which applies FindAll to arg's type or schema.
- std::vector<FieldPath> FindAll(const ArrayData& array) const;
+ std::vector<FieldPath> FindAll(const ArrayData& array) const;
std::vector<FieldPath> FindAll(const Array& array) const;
std::vector<FieldPath> FindAll(const RecordBatch& batch) const;
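Because field names may repeat, `FindAll` returns every matching `FieldPath` rather than a single hit. A sketch against a schema with a duplicated name:

    auto s = arrow::schema({arrow::field("a", arrow::int32()),
                            arrow::field("a", arrow::utf8())});
    std::vector<arrow::FieldPath> hits = arrow::FieldRef("a").FindAll(*s);
    // hits.size() == 2: FieldPath{0} and FieldPath{1}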
@@ -1644,16 +1644,16 @@ class ARROW_EXPORT FieldRef {
template <typename T>
Result<GetType<T>> GetOneOrNone(const T& root) const {
ARROW_ASSIGN_OR_RAISE(auto match, FindOneOrNone(root));
- if (match.empty()) {
- return static_cast<GetType<T>>(NULLPTR);
+ if (match.empty()) {
+ return static_cast<GetType<T>>(NULLPTR);
}
- return match.Get(root).ValueOrDie();
+ return match.Get(root).ValueOrDie();
}
private:
void Flatten(std::vector<FieldRef> children);
- util::Variant<FieldPath, std::string, std::vector<FieldRef>> impl_;
+ util::Variant<FieldPath, std::string, std::vector<FieldRef>> impl_;
ARROW_EXPORT friend void PrintTo(const FieldRef& ref, std::ostream* os);
};
@@ -1661,16 +1661,16 @@ class ARROW_EXPORT FieldRef {
// ----------------------------------------------------------------------
// Schema
-enum class Endianness {
- Little = 0,
- Big = 1,
-#if ARROW_LITTLE_ENDIAN
- Native = Little
-#else
- Native = Big
-#endif
-};
-
+enum class Endianness {
+ Little = 0,
+ Big = 1,
+#if ARROW_LITTLE_ENDIAN
+ Native = Little
+#else
+ Native = Big
+#endif
+};
+
/// \class Schema
/// \brief Sequence of arrow::Field objects describing the columns of a record
/// batch or table data structure
@@ -1678,12 +1678,12 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable,
public util::EqualityComparable<Schema>,
public util::ToStringOstreamable<Schema> {
public:
- explicit Schema(FieldVector fields, Endianness endianness,
+ explicit Schema(FieldVector fields, Endianness endianness,
+ std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+ explicit Schema(FieldVector fields,
std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
- explicit Schema(FieldVector fields,
- std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
-
Schema(const Schema&);
~Schema() override;
@@ -1692,24 +1692,24 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable,
bool Equals(const Schema& other, bool check_metadata = false) const;
bool Equals(const std::shared_ptr<Schema>& other, bool check_metadata = false) const;
- /// \brief Set endianness in the schema
- ///
- /// \return new Schema
- std::shared_ptr<Schema> WithEndianness(Endianness endianness) const;
-
- /// \brief Return endianness in the schema
- Endianness endianness() const;
-
- /// \brief Indicate if endianness is equal to platform-native endianness
- bool is_native_endian() const;
-
+ /// \brief Set endianness in the schema
+ ///
+ /// \return new Schema
+ std::shared_ptr<Schema> WithEndianness(Endianness endianness) const;
+
+ /// \brief Return endianness in the schema
+ Endianness endianness() const;
+
+ /// \brief Indicate if endianness is equal to platform-native endianness
+ bool is_native_endian() const;
+
/// \brief Return the number of fields (columns) in the schema
int num_fields() const;
/// Return the i-th schema element. Does not bounds-check.
const std::shared_ptr<Field>& field(int i) const;
- const FieldVector& fields() const;
+ const FieldVector& fields() const;
std::vector<std::string> field_names() const;
@@ -1717,7 +1717,7 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable,
std::shared_ptr<Field> GetFieldByName(const std::string& name) const;
/// \brief Return the indices of all fields having this name in sorted order
- FieldVector GetAllFieldsByName(const std::string& name) const;
+ FieldVector GetAllFieldsByName(const std::string& name) const;
/// Returns -1 if name not found
int GetFieldIndex(const std::string& name) const;
@@ -1731,7 +1731,7 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable,
/// \brief The custom key-value metadata, if any
///
/// \return metadata may be null
- const std::shared_ptr<const KeyValueMetadata>& metadata() const;
+ const std::shared_ptr<const KeyValueMetadata>& metadata() const;
/// \brief Render a string representation of the schema suitable for debugging
/// \param[in] show_metadata when true, if KeyValueMetadata is non-empty,
@@ -1771,9 +1771,9 @@ class ARROW_EXPORT Schema : public detail::Fingerprintable,
std::unique_ptr<Impl> impl_;
};
-ARROW_EXPORT
-std::string EndiannessToString(Endianness endianness);
-
+ARROW_EXPORT
+std::string EndiannessToString(Endianness endianness);
+
// ----------------------------------------------------------------------
/// \brief Convenience class to incrementally construct/merge schemas.
@@ -1802,18 +1802,18 @@ class ARROW_EXPORT SchemaBuilder {
};
/// \brief Construct an empty SchemaBuilder
- /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
+ /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
SchemaBuilder(
ConflictPolicy conflict_policy = CONFLICT_APPEND,
Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults());
/// \brief Construct a SchemaBuilder from a list of fields
- /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
+ /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
SchemaBuilder(
std::vector<std::shared_ptr<Field>> fields,
ConflictPolicy conflict_policy = CONFLICT_APPEND,
Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults());
/// \brief Construct a SchemaBuilder from a schema, preserving the metadata
- /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
+ /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
SchemaBuilder(
const std::shared_ptr<Schema>& schema,
ConflictPolicy conflict_policy = CONFLICT_APPEND,
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h
index 80b8345b625..7e564106bbe 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/type_fwd.h
@@ -29,20 +29,20 @@ namespace arrow {
template <typename T>
class Iterator;
-template <typename T>
-struct IterationTraits;
+template <typename T>
+struct IterationTraits;
template <typename T>
class Result;
class Status;
-namespace internal {
-struct Empty;
-} // namespace internal
-template <typename T = internal::Empty>
-class Future;
-
+namespace internal {
+struct Empty;
+} // namespace internal
+template <typename T = internal::Empty>
+class Future;
+
namespace util {
class Codec;
} // namespace util
@@ -60,7 +60,7 @@ class DataType;
class Field;
class FieldRef;
class KeyValueMetadata;
-enum class Endianness;
+enum class Endianness;
class Schema;
using DataTypeVector = std::vector<std::shared_ptr<DataType>>;
@@ -80,9 +80,9 @@ class RecordBatch;
class RecordBatchReader;
class Table;
-struct Datum;
-struct ValueDescr;
-
+struct Datum;
+struct ValueDescr;
+
using ChunkedArrayVector = std::vector<std::shared_ptr<ChunkedArray>>;
using RecordBatchVector = std::vector<std::shared_ptr<RecordBatch>>;
using RecordBatchIterator = Iterator<std::shared_ptr<RecordBatch>>;
@@ -154,16 +154,16 @@ class StructBuilder;
struct StructScalar;
class Decimal128;
-class Decimal256;
+class Decimal256;
class DecimalType;
class Decimal128Type;
-class Decimal256Type;
+class Decimal256Type;
class Decimal128Array;
-class Decimal256Array;
+class Decimal256Array;
class Decimal128Builder;
-class Decimal256Builder;
+class Decimal256Builder;
struct Decimal128Scalar;
-struct Decimal256Scalar;
+struct Decimal256Scalar;
struct UnionMode {
enum type { SPARSE, DENSE };
@@ -262,9 +262,9 @@ class ExtensionType;
class ExtensionArray;
struct ExtensionScalar;
-class Tensor;
-class SparseTensor;
-
+class Tensor;
+class SparseTensor;
+
// ----------------------------------------------------------------------
struct Type {
@@ -345,15 +345,15 @@ struct Type {
/// DAY_TIME interval in SQL style
INTERVAL_DAY_TIME,
- /// Precision- and scale-based decimal type with 128 bits.
- DECIMAL128,
+ /// Precision- and scale-based decimal type with 128 bits.
+ DECIMAL128,
+
+ /// Defined for backward-compatibility.
+ DECIMAL = DECIMAL128,
+
+ /// Precision- and scale-based decimal type with 256 bits.
+ DECIMAL256,
- /// Defined for backward-compatibility.
- DECIMAL = DECIMAL128,
-
- /// Precision- and scale-based decimal type with 256 bits.
- DECIMAL256,
-
/// A list of some logical data type
LIST,
@@ -447,21 +447,21 @@ std::shared_ptr<DataType> ARROW_EXPORT date64();
ARROW_EXPORT
std::shared_ptr<DataType> fixed_size_binary(int32_t byte_width);
-/// \brief Create a DecimalType instance depending on the precision
-///
-/// If the precision is greater than 38, a Decimal256Type is returned,
-/// otherwise a Decimal128Type.
+/// \brief Create a DecimalType instance depending on the precision
+///
+/// If the precision is greater than 38, a Decimal256Type is returned,
+/// otherwise a Decimal128Type.
ARROW_EXPORT
std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale);
-/// \brief Create a Decimal128Type instance
-ARROW_EXPORT
-std::shared_ptr<DataType> decimal128(int32_t precision, int32_t scale);
-
-/// \brief Create a Decimal256Type instance
-ARROW_EXPORT
-std::shared_ptr<DataType> decimal256(int32_t precision, int32_t scale);
-
+/// \brief Create a Decimal128Type instance
+ARROW_EXPORT
+std::shared_ptr<DataType> decimal128(int32_t precision, int32_t scale);
+
+/// \brief Create a Decimal256Type instance
+ARROW_EXPORT
+std::shared_ptr<DataType> decimal256(int32_t precision, int32_t scale);
+
/// \brief Create a ListType instance from its child Field type
ARROW_EXPORT
std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_type);
@@ -502,7 +502,7 @@ ARROW_EXPORT
std::shared_ptr<DataType> fixed_size_list(const std::shared_ptr<DataType>& value_type,
int32_t list_size);
/// \brief Return a Duration instance (naming uses _type to avoid a namespace conflict
-/// with built-in time classes).
+/// with built-in time classes).
std::shared_ptr<DataType> ARROW_EXPORT duration(TimeUnit::type unit);
/// \brief Return a DayTimeIntervalType instance
@@ -638,17 +638,17 @@ std::shared_ptr<Field> ARROW_EXPORT
field(std::string name, std::shared_ptr<DataType> type, bool nullable = true,
std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
-/// \brief Create a Field instance with metadata
-///
-/// The field will be assumed to be nullable.
-///
-/// \param name the field name
-/// \param type the field value type
-/// \param metadata any custom key-value metadata
-std::shared_ptr<Field> ARROW_EXPORT
-field(std::string name, std::shared_ptr<DataType> type,
- std::shared_ptr<const KeyValueMetadata> metadata);
-
+/// \brief Create a Field instance with metadata
+///
+/// The field will be assumed to be nullable.
+///
+/// \param name the field name
+/// \param type the field value type
+/// \param metadata any custom key-value metadata
+std::shared_ptr<Field> ARROW_EXPORT
+field(std::string name, std::shared_ptr<DataType> type,
+ std::shared_ptr<const KeyValueMetadata> metadata);
+
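This overload exists so callers can attach metadata without spelling out `nullable`; the field defaults to nullable, as the comment states. A sketch (using the `key_value_metadata` factory):

    auto md = arrow::key_value_metadata({"origin"}, {"sensor-7"});
    auto f = arrow::field("reading", arrow::float64(), md);
    // f->nullable() == true; only name, type, and metadata were supplied.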
/// \brief Create a Schema instance
///
/// \param fields the schema's fields
@@ -659,17 +659,17 @@ std::shared_ptr<Schema> schema(
std::vector<std::shared_ptr<Field>> fields,
std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
-/// \brief Create a Schema instance
-///
-/// \param fields the schema's fields
-/// \param endianness the endianness of the data
-/// \param metadata any custom key-value metadata, default null
-/// \return schema shared_ptr to Schema
-ARROW_EXPORT
-std::shared_ptr<Schema> schema(
- std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
- std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
-
+/// \brief Create a Schema instance
+///
+/// \param fields the schema's fields
+/// \param endianness the endianness of the data
+/// \param metadata any custom key-value metadata, default null
+/// \return schema shared_ptr to Schema
+ARROW_EXPORT
+std::shared_ptr<Schema> schema(
+ std::vector<std::shared_ptr<Field>> fields, Endianness endianness,
+ std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
/// @}
/// Return the process-wide default memory pool.
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h b/contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h
index c9637e09ed5..e4d809967f9 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h
@@ -66,8 +66,8 @@ TYPE_ID_TRAIT(TIMESTAMP, TimestampType)
TYPE_ID_TRAIT(INTERVAL_DAY_TIME, DayTimeIntervalType)
TYPE_ID_TRAIT(INTERVAL_MONTHS, MonthIntervalType)
TYPE_ID_TRAIT(DURATION, DurationType)
-TYPE_ID_TRAIT(DECIMAL128, Decimal128Type)
-TYPE_ID_TRAIT(DECIMAL256, Decimal256Type)
+TYPE_ID_TRAIT(DECIMAL128, Decimal128Type)
+TYPE_ID_TRAIT(DECIMAL256, Decimal256Type)
TYPE_ID_TRAIT(STRUCT, StructType)
TYPE_ID_TRAIT(LIST, ListType)
TYPE_ID_TRAIT(LARGE_LIST, LargeListType)
@@ -233,7 +233,7 @@ struct TypeTraits<MonthIntervalType> {
using ArrayType = MonthIntervalArray;
using BuilderType = MonthIntervalBuilder;
using ScalarType = MonthIntervalScalar;
- using CType = MonthIntervalType::c_type;
+ using CType = MonthIntervalType::c_type;
static constexpr int64_t bytes_required(int64_t elements) {
return elements * static_cast<int64_t>(sizeof(int32_t));
@@ -291,14 +291,14 @@ struct TypeTraits<Decimal128Type> {
};
template <>
-struct TypeTraits<Decimal256Type> {
- using ArrayType = Decimal256Array;
- using BuilderType = Decimal256Builder;
- using ScalarType = Decimal256Scalar;
- constexpr static bool is_parameter_free = false;
-};
-
-template <>
+struct TypeTraits<Decimal256Type> {
+ using ArrayType = Decimal256Array;
+ using BuilderType = Decimal256Builder;
+ using ScalarType = Decimal256Scalar;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
struct TypeTraits<BinaryType> {
using ArrayType = BinaryArray;
using BuilderType = BinaryBuilder;
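The `TypeTraits<Decimal256Type>` specialization above wires the 256-bit decimal into the generic machinery, so templated code can recover the companion array/builder/scalar classes from the type alone. A sketch:

    #include <type_traits>
    #include "arrow/type_traits.h"

    using Builder = arrow::TypeTraits<arrow::Decimal256Type>::BuilderType;
    static_assert(std::is_same<Builder, arrow::Decimal256Builder>::value,
                  "trait resolves to the matching builder");
    static_assert(!arrow::TypeTraits<arrow::Decimal256Type>::is_parameter_free,
                  "decimal types carry precision/scale parameters");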
@@ -587,18 +587,18 @@ using is_decimal_type = std::is_base_of<DecimalType, T>;
template <typename T, typename R = void>
using enable_if_decimal = enable_if_t<is_decimal_type<T>::value, R>;
-template <typename T>
-using is_decimal128_type = std::is_base_of<Decimal128Type, T>;
-
-template <typename T, typename R = void>
-using enable_if_decimal128 = enable_if_t<is_decimal128_type<T>::value, R>;
-
-template <typename T>
-using is_decimal256_type = std::is_base_of<Decimal256Type, T>;
-
-template <typename T, typename R = void>
-using enable_if_decimal256 = enable_if_t<is_decimal256_type<T>::value, R>;
-
+template <typename T>
+using is_decimal128_type = std::is_base_of<Decimal128Type, T>;
+
+template <typename T, typename R = void>
+using enable_if_decimal128 = enable_if_t<is_decimal128_type<T>::value, R>;
+
+template <typename T>
+using is_decimal256_type = std::is_base_of<Decimal256Type, T>;
+
+template <typename T, typename R = void>
+using enable_if_decimal256 = enable_if_t<is_decimal256_type<T>::value, R>;
+
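These aliases gate templates on the decimal family via SFINAE. A hypothetical helper (the name `MaxDecimalPrecision` is illustrative) that only instantiates for 256-bit decimals:

    // Compiles only when T derives from Decimal256Type; other types are
    // removed from overload resolution.
    template <typename T>
    arrow::enable_if_decimal256<T, int32_t> MaxDecimalPrecision() {
      return T::kMaxPrecision;  // 76 for Decimal256Type
    }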
// Nested Types
template <typename T>
@@ -636,7 +636,7 @@ template <typename T>
using is_list_type =
std::integral_constant<bool, std::is_same<T, ListType>::value ||
std::is_same<T, LargeListType>::value ||
- std::is_same<T, FixedSizeListType>::value>;
+ std::is_same<T, FixedSizeListType>::value>;
template <typename T, typename R = void>
using enable_if_list_type = enable_if_t<is_list_type<T>::value, R>;
@@ -846,17 +846,17 @@ static inline bool is_floating(Type::type type_id) {
return false;
}
-static inline bool is_decimal(Type::type type_id) {
- switch (type_id) {
- case Type::DECIMAL128:
- case Type::DECIMAL256:
- return true;
- default:
- break;
- }
- return false;
-}
-
+static inline bool is_decimal(Type::type type_id) {
+ switch (type_id) {
+ case Type::DECIMAL128:
+ case Type::DECIMAL256:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
static inline bool is_primitive(Type::type type_id) {
switch (type_id) {
case Type::BOOL:
@@ -927,8 +927,8 @@ static inline bool is_dictionary(Type::type type_id) {
static inline bool is_fixed_size_binary(Type::type type_id) {
switch (type_id) {
- case Type::DECIMAL128:
- case Type::DECIMAL256:
+ case Type::DECIMAL128:
+ case Type::DECIMAL256:
case Type::FIXED_SIZE_BINARY:
return true;
default:
@@ -941,52 +941,52 @@ static inline bool is_fixed_width(Type::type type_id) {
return is_primitive(type_id) || is_dictionary(type_id) || is_fixed_size_binary(type_id);
}
-static inline int bit_width(Type::type type_id) {
- switch (type_id) {
- case Type::BOOL:
- return 1;
- case Type::UINT8:
- case Type::INT8:
- return 8;
- case Type::UINT16:
- case Type::INT16:
- return 16;
- case Type::UINT32:
- case Type::INT32:
- case Type::DATE32:
- case Type::TIME32:
- return 32;
- case Type::UINT64:
- case Type::INT64:
- case Type::DATE64:
- case Type::TIME64:
- case Type::TIMESTAMP:
- case Type::DURATION:
- return 64;
-
- case Type::HALF_FLOAT:
- return 16;
- case Type::FLOAT:
- return 32;
- case Type::DOUBLE:
- return 64;
-
- case Type::INTERVAL_MONTHS:
- return 32;
- case Type::INTERVAL_DAY_TIME:
- return 64;
-
- case Type::DECIMAL128:
- return 128;
- case Type::DECIMAL256:
- return 256;
-
- default:
- break;
- }
- return 0;
-}
-
+static inline int bit_width(Type::type type_id) {
+ switch (type_id) {
+ case Type::BOOL:
+ return 1;
+ case Type::UINT8:
+ case Type::INT8:
+ return 8;
+ case Type::UINT16:
+ case Type::INT16:
+ return 16;
+ case Type::UINT32:
+ case Type::INT32:
+ case Type::DATE32:
+ case Type::TIME32:
+ return 32;
+ case Type::UINT64:
+ case Type::INT64:
+ case Type::DATE64:
+ case Type::TIME64:
+ case Type::TIMESTAMP:
+ case Type::DURATION:
+ return 64;
+
+ case Type::HALF_FLOAT:
+ return 16;
+ case Type::FLOAT:
+ return 32;
+ case Type::DOUBLE:
+ return 64;
+
+ case Type::INTERVAL_MONTHS:
+ return 32;
+ case Type::INTERVAL_DAY_TIME:
+ return 64;
+
+ case Type::DECIMAL128:
+ return 128;
+ case Type::DECIMAL256:
+ return 256;
+
+ default:
+ break;
+ }
+ return 0;
+}
+
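`is_decimal` and `bit_width` are runtime counterparts to the traits above, switching on the type id. For instance:

    #include <cassert>

    void TypeIdQueries() {
      assert(arrow::is_decimal(arrow::Type::DECIMAL256));
      assert(arrow::bit_width(arrow::Type::DECIMAL128) == 128);
      assert(arrow::bit_width(arrow::Type::STRING) == 0);  // not fixed-width
    }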
static inline bool is_nested(Type::type type_id) {
switch (type_id) {
case Type::LIST:
@@ -1003,22 +1003,22 @@ static inline bool is_nested(Type::type type_id) {
return false;
}
-static inline int offset_bit_width(Type::type type_id) {
- switch (type_id) {
- case Type::STRING:
- case Type::BINARY:
- case Type::LIST:
- case Type::MAP:
- case Type::DENSE_UNION:
- return 32;
- case Type::LARGE_STRING:
- case Type::LARGE_BINARY:
- case Type::LARGE_LIST:
- return 64;
- default:
- break;
- }
- return 0;
-}
-
+static inline int offset_bit_width(Type::type type_id) {
+ switch (type_id) {
+ case Type::STRING:
+ case Type::BINARY:
+ case Type::LIST:
+ case Type::MAP:
+ case Type::DENSE_UNION:
+ return 32;
+ case Type::LARGE_STRING:
+ case Type::LARGE_BINARY:
+ case Type::LARGE_LIST:
+ return 64;
+ default:
+ break;
+ }
+ return 0;
+}
+
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h
index 8f9ae1f7706..2a0e6ba709d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/algorithm.h
@@ -1,33 +1,33 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/result.h"
-
-namespace arrow {
-
-template <typename InputIterator, typename OutputIterator, typename UnaryOperation>
-Status MaybeTransform(InputIterator first, InputIterator last, OutputIterator out,
- UnaryOperation unary_op) {
- for (; first != last; ++first, (void)++out) {
- ARROW_ASSIGN_OR_RAISE(*out, unary_op(*first));
- }
- return Status::OK();
-}
-
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/result.h"
+
+namespace arrow {
+
+template <typename InputIterator, typename OutputIterator, typename UnaryOperation>
+Status MaybeTransform(InputIterator first, InputIterator last, OutputIterator out,
+ UnaryOperation unary_op) {
+ for (; first != last; ++first, (void)++out) {
+ ARROW_ASSIGN_OR_RAISE(*out, unary_op(*first));
+ }
+ return Status::OK();
+}
+
+} // namespace arrow
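`MaybeTransform` is a `Result`-aware `std::transform`: the first failing element short-circuits the loop via `ARROW_ASSIGN_OR_RAISE`. A sketch with a hypothetical fallible op:

    #include <vector>
    #include "arrow/util/algorithm.h"

    arrow::Status DoubleAll(const std::vector<int>& in, std::vector<int>* out) {
      out->resize(in.size());
      return arrow::MaybeTransform(
          in.begin(), in.end(), out->begin(),
          [](int x) -> arrow::Result<int> { return x * 2; });
    }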
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h
index c672ebab778..9d1021edff5 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/async_generator.h
@@ -1,1614 +1,1614 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <atomic>
-#include <cassert>
-#include <cstring>
-#include <deque>
-#include <limits>
-#include <queue>
-
-#include "arrow/util/functional.h"
-#include "arrow/util/future.h"
-#include "arrow/util/io_util.h"
-#include "arrow/util/iterator.h"
-#include "arrow/util/mutex.h"
-#include "arrow/util/optional.h"
-#include "arrow/util/queue.h"
-#include "arrow/util/thread_pool.h"
-
-namespace arrow {
-
-// The methods in this file create, modify, and utilize AsyncGenerator which is an
-// iterator of futures. This allows an asynchronous source (like file input) to be run
-// through a pipeline in the same way that iterators can be used to create pipelined
-// workflows.
-//
-// In order to support pipeline parallelism we introduce the concept of asynchronous
-// reentrancy. This is different than synchronous reentrancy. With synchronous code a
-// function is reentrant if the function can be called again while a previous call to that
-// function is still running. Unless otherwise specified none of these generators are
-// synchronously reentrant. Care should be taken to avoid calling them in such a way (and
-// the utilities Visit/Collect/Await take care to do this).
-//
-// Asynchronous reentrancy on the other hand means the function is called again before the
-// future returned by the function is marked finished (but after the call to get the
-// future returns). Some of these generators are async-reentrant while others (e.g.
-// those that depend on ordered processing like decompression) are not. Read the MakeXYZ
-// function comments to determine which generators support async reentrancy.
-//
-// Note: Generators that are not asynchronously reentrant can still support readahead
-// (\see MakeSerialReadaheadGenerator).
-//
-// Readahead operators, and some other operators, may introduce queueing. Any operators
-// that introduce buffering should detail the amount of buffering they introduce in their
-// MakeXYZ function comments.
-template <typename T>
-using AsyncGenerator = std::function<Future<T>()>;
-
-template <typename T>
-struct IterationTraits<AsyncGenerator<T>> {
- /// \brief by default when iterating through a sequence of AsyncGenerator<T>,
- /// an empty function indicates the end of iteration.
- static AsyncGenerator<T> End() { return AsyncGenerator<T>(); }
-
- static bool IsEnd(const AsyncGenerator<T>& val) { return !val; }
-};
-
-template <typename T>
-Future<T> AsyncGeneratorEnd() {
- return Future<T>::MakeFinished(IterationTraits<T>::End());
-}
-
-/// returning a future that completes when all have been visited
-template <typename T, typename Visitor>
-Future<> VisitAsyncGenerator(AsyncGenerator<T> generator, Visitor visitor) {
- struct LoopBody {
- struct Callback {
- Result<ControlFlow<>> operator()(const T& next) {
- if (IsIterationEnd(next)) {
- return Break();
- } else {
- auto visited = visitor(next);
- if (visited.ok()) {
- return Continue();
- } else {
- return visited;
- }
- }
- }
-
- Visitor visitor;
- };
-
- Future<ControlFlow<>> operator()() {
- Callback callback{visitor};
- auto next = generator();
- return next.Then(std::move(callback));
- }
-
- AsyncGenerator<T> generator;
- Visitor visitor;
- };
-
- return Loop(LoopBody{std::move(generator), std::move(visitor)});
-}
-
-/// \brief Waits for an async generator to complete, discarding results.
-template <typename T>
-Future<> DiscardAllFromAsyncGenerator(AsyncGenerator<T> generator) {
- std::function<Status(T)> visitor = [](const T&) { return Status::OK(); };
- return VisitAsyncGenerator(generator, visitor);
-}
-
-/// \brief Collects the results of an async generator into a vector
-template <typename T>
-Future<std::vector<T>> CollectAsyncGenerator(AsyncGenerator<T> generator) {
- auto vec = std::make_shared<std::vector<T>>();
- struct LoopBody {
- Future<ControlFlow<std::vector<T>>> operator()() {
- auto next = generator_();
- auto vec = vec_;
- return next.Then([vec](const T& result) -> Result<ControlFlow<std::vector<T>>> {
- if (IsIterationEnd(result)) {
- return Break(*vec);
- } else {
- vec->push_back(result);
- return Continue();
- }
- });
- }
- AsyncGenerator<T> generator_;
- std::shared_ptr<std::vector<T>> vec_;
- };
- return Loop(LoopBody{std::move(generator), std::move(vec)});
-}
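An `AsyncGenerator<T>` is just a callable returning futures, with the end signalled by `IterationTraits<T>::End()` — the default-constructed value, hence the pointer element type in this sketch so that the end token is a null pointer. A deliberately synchronous example that `CollectAsyncGenerator` can drain (real sources would yield genuinely pending futures):

    using Item = std::shared_ptr<int>;  // end token is a null pointer

    // Emits 0, 1, 2 as already-finished futures, then the end token.
    // A trivial source like this is not async-reentrant.
    arrow::AsyncGenerator<Item> counting = [i = 0]() mutable -> arrow::Future<Item> {
      if (i < 3) return arrow::Future<Item>::MakeFinished(std::make_shared<int>(i++));
      return arrow::AsyncGeneratorEnd<Item>();
    };
    arrow::Future<std::vector<Item>> all = arrow::CollectAsyncGenerator(counting);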
-
-/// \see MakeMappedGenerator
-template <typename T, typename V>
-class MappingGenerator {
- public:
- MappingGenerator(AsyncGenerator<T> source, std::function<Future<V>(const T&)> map)
- : state_(std::make_shared<State>(std::move(source), std::move(map))) {}
-
- Future<V> operator()() {
- auto future = Future<V>::Make();
- bool should_trigger;
- {
- auto guard = state_->mutex.Lock();
- if (state_->finished) {
- return AsyncGeneratorEnd<V>();
- }
- should_trigger = state_->waiting_jobs.empty();
- state_->waiting_jobs.push_back(future);
- }
- if (should_trigger) {
- state_->source().AddCallback(Callback{state_});
- }
- return future;
- }
-
- private:
- struct State {
- State(AsyncGenerator<T> source, std::function<Future<V>(const T&)> map)
- : source(std::move(source)),
- map(std::move(map)),
- waiting_jobs(),
- mutex(),
- finished(false) {}
-
- void Purge() {
- // This might be called by an original callback (if the source iterator fails or
- // ends) or by a mapped callback (if the map function fails or ends prematurely).
- // Either way it should only be called once and after finished is set so there is no
- // need to guard access to `waiting_jobs`.
- while (!waiting_jobs.empty()) {
- waiting_jobs.front().MarkFinished(IterationTraits<V>::End());
- waiting_jobs.pop_front();
- }
- }
-
- AsyncGenerator<T> source;
- std::function<Future<V>(const T&)> map;
- std::deque<Future<V>> waiting_jobs;
- util::Mutex mutex;
- bool finished;
- };
-
- struct Callback;
-
- struct MappedCallback {
- void operator()(const Result<V>& maybe_next) {
- bool end = !maybe_next.ok() || IsIterationEnd(*maybe_next);
- bool should_purge = false;
- if (end) {
- {
- auto guard = state->mutex.Lock();
- should_purge = !state->finished;
- state->finished = true;
- }
- }
- sink.MarkFinished(maybe_next);
- if (should_purge) {
- state->Purge();
- }
- }
- std::shared_ptr<State> state;
- Future<V> sink;
- };
-
- struct Callback {
- void operator()(const Result<T>& maybe_next) {
- Future<V> sink;
- bool end = !maybe_next.ok() || IsIterationEnd(*maybe_next);
- bool should_purge = false;
- bool should_trigger;
- {
- auto guard = state->mutex.Lock();
- if (end) {
- should_purge = !state->finished;
- state->finished = true;
- }
- sink = state->waiting_jobs.front();
- state->waiting_jobs.pop_front();
- should_trigger = !end && !state->waiting_jobs.empty();
- }
- if (should_purge) {
- state->Purge();
- }
- if (should_trigger) {
- state->source().AddCallback(Callback{state});
- }
- if (maybe_next.ok()) {
- const T& val = maybe_next.ValueUnsafe();
- if (IsIterationEnd(val)) {
- sink.MarkFinished(IterationTraits<V>::End());
- } else {
- Future<V> mapped_fut = state->map(val);
- mapped_fut.AddCallback(MappedCallback{std::move(state), std::move(sink)});
- }
- } else {
- sink.MarkFinished(maybe_next.status());
- }
- }
-
- std::shared_ptr<State> state;
- };
-
- std::shared_ptr<State> state_;
-};
-
-/// \brief Creates a generator that will apply the map function to each element of
-/// source. The map function is not called on the end token.
-///
-/// Note: This function makes a copy of `map` for each item
-/// Note: Errors returned from the `map` function will be propagated
-///
-/// If the source generator is async-reentrant then this generator will be as well
-template <typename T, typename MapFn,
- typename Mapped = detail::result_of_t<MapFn(const T&)>,
- typename V = typename EnsureFuture<Mapped>::type::ValueType>
-AsyncGenerator<V> MakeMappedGenerator(AsyncGenerator<T> source_generator, MapFn map) {
- struct MapCallback {
- MapFn map_;
-
- Future<V> operator()(const T& val) { return ToFuture(map_(val)); }
- };
-
- return MappingGenerator<T, V>(std::move(source_generator), MapCallback{std::move(map)});
-}
-
-/// \see MakeSequencingGenerator
-template <typename T, typename ComesAfter, typename IsNext>
-class SequencingGenerator {
- public:
- SequencingGenerator(AsyncGenerator<T> source, ComesAfter compare, IsNext is_next,
- T initial_value)
- : state_(std::make_shared<State>(std::move(source), std::move(compare),
- std::move(is_next), std::move(initial_value))) {}
-
- Future<T> operator()() {
- {
- auto guard = state_->mutex.Lock();
- // We can send a result immediately if the top of the queue is either an
- // error or the next item
- if (!state_->queue.empty() &&
- (!state_->queue.top().ok() ||
- state_->is_next(state_->previous_value, *state_->queue.top()))) {
- auto result = std::move(state_->queue.top());
- if (result.ok()) {
- state_->previous_value = *result;
- }
- state_->queue.pop();
- return Future<T>::MakeFinished(result);
- }
- if (state_->finished) {
- return AsyncGeneratorEnd<T>();
- }
- // The next item is not in the queue so we will need to wait
- auto new_waiting_fut = Future<T>::Make();
- state_->waiting_future = new_waiting_fut;
- guard.Unlock();
- state_->source().AddCallback(Callback{state_});
- return new_waiting_fut;
- }
- }
-
- private:
- struct WrappedComesAfter {
- bool operator()(const Result<T>& left, const Result<T>& right) {
- if (!left.ok() || !right.ok()) {
- // Should never happen
- return false;
- }
- return compare(*left, *right);
- }
- ComesAfter compare;
- };
-
- struct State {
- State(AsyncGenerator<T> source, ComesAfter compare, IsNext is_next, T initial_value)
- : source(std::move(source)),
- is_next(std::move(is_next)),
- previous_value(std::move(initial_value)),
- waiting_future(),
- queue(WrappedComesAfter{compare}),
- finished(false),
- mutex() {}
-
- AsyncGenerator<T> source;
- IsNext is_next;
- T previous_value;
- Future<T> waiting_future;
- std::priority_queue<Result<T>, std::vector<Result<T>>, WrappedComesAfter> queue;
- bool finished;
- util::Mutex mutex;
- };
-
- class Callback {
- public:
- explicit Callback(std::shared_ptr<State> state) : state_(std::move(state)) {}
-
- void operator()(const Result<T> result) {
- Future<T> to_deliver;
- bool finished;
- {
- auto guard = state_->mutex.Lock();
- bool ready_to_deliver = false;
- if (!result.ok()) {
- // Clear any cached results
- while (!state_->queue.empty()) {
- state_->queue.pop();
- }
- ready_to_deliver = true;
- state_->finished = true;
- } else if (IsIterationEnd<T>(result.ValueUnsafe())) {
- ready_to_deliver = state_->queue.empty();
- state_->finished = true;
- } else {
- ready_to_deliver = state_->is_next(state_->previous_value, *result);
- }
-
- if (ready_to_deliver && state_->waiting_future.is_valid()) {
- to_deliver = state_->waiting_future;
- if (result.ok()) {
- state_->previous_value = *result;
- }
- } else {
- state_->queue.push(result);
- }
- // Capture state_->finished so we can access it outside the mutex
- finished = state_->finished;
- }
- // Must deliver result outside of the mutex
- if (to_deliver.is_valid()) {
- to_deliver.MarkFinished(result);
- } else {
- // Otherwise, if we didn't get the next item (or a terminal item), we
- // need to keep looking
- if (!finished) {
- state_->source().AddCallback(Callback{state_});
- }
- }
- }
-
- private:
- const std::shared_ptr<State> state_;
- };
-
- const std::shared_ptr<State> state_;
-};
-
-/// \brief Buffers an AsyncGenerator to return values in sequence order. ComesAfter
-/// and IsNext determine the sequence order.
-///
-/// ComesAfter should be a BinaryPredicate that only returns true if `a` comes after `b`
-///
-/// IsNext should be a BinaryPredicate that returns true, given `a` and `b`, only if
-/// `b` follows immediately after `a`. It should return true given `initial_value` and
-/// `b` if `b` is the first item in the sequence.
-///
-/// This operator will queue unboundedly while waiting for the next item. It is intended
-/// for jittery sources that might scatter an ordered sequence. It is NOT intended to
-/// sort. Using it to try and sort could result in excessive RAM usage. This generator
-/// will queue up to N blocks where N is the max "out of order"ness of the source.
-///
-/// For example, if the source is 1,6,2,5,4,3 it will queue 3 blocks because 3 is 3
-/// blocks beyond where it belongs.
-///
-/// This generator is not async-reentrant but it consists only of a simple log(n)
-/// insertion into a priority queue.
-template <typename T, typename ComesAfter, typename IsNext>
-AsyncGenerator<T> MakeSequencingGenerator(AsyncGenerator<T> source_generator,
- ComesAfter compare, IsNext is_next,
- T initial_value) {
- return SequencingGenerator<T, ComesAfter, IsNext>(
- std::move(source_generator), std::move(compare), std::move(is_next),
- std::move(initial_value));
-}
-
-/// \see MakeTransformedGenerator
-template <typename T, typename V>
-class TransformingGenerator {
- // The transforming generator state will be referenced as an async generator but will
- // also be referenced via callback to various futures. If the async generator owner
- // moves it around we need the state to be consistent for future callbacks.
- struct TransformingGeneratorState
- : std::enable_shared_from_this<TransformingGeneratorState> {
- TransformingGeneratorState(AsyncGenerator<T> generator, Transformer<T, V> transformer)
- : generator_(std::move(generator)),
- transformer_(std::move(transformer)),
- last_value_(),
- finished_() {}
-
- Future<V> operator()() {
- while (true) {
- auto maybe_next_result = Pump();
- if (!maybe_next_result.ok()) {
- return Future<V>::MakeFinished(maybe_next_result.status());
- }
- auto maybe_next = std::move(maybe_next_result).ValueUnsafe();
- if (maybe_next.has_value()) {
- return Future<V>::MakeFinished(*std::move(maybe_next));
- }
-
- auto next_fut = generator_();
- // If finished already, process results immediately inside the loop to avoid
- // stack overflow
- if (next_fut.is_finished()) {
- auto next_result = next_fut.result();
- if (next_result.ok()) {
- last_value_ = *next_result;
- } else {
- return Future<V>::MakeFinished(next_result.status());
- }
- // Otherwise, if not finished immediately, add callback to process results
- } else {
- auto self = this->shared_from_this();
- return next_fut.Then([self](const T& next_result) {
- self->last_value_ = next_result;
- return (*self)();
- });
- }
- }
- }
-
- // See comment on TransformingIterator::Pump
- Result<util::optional<V>> Pump() {
- if (!finished_ && last_value_.has_value()) {
- ARROW_ASSIGN_OR_RAISE(TransformFlow<V> next, transformer_(*last_value_));
- if (next.ReadyForNext()) {
- if (IsIterationEnd(*last_value_)) {
- finished_ = true;
- }
- last_value_.reset();
- }
- if (next.Finished()) {
- finished_ = true;
- }
- if (next.HasValue()) {
- return next.Value();
- }
- }
- if (finished_) {
- return IterationTraits<V>::End();
- }
- return util::nullopt;
- }
-
- AsyncGenerator<T> generator_;
- Transformer<T, V> transformer_;
- util::optional<T> last_value_;
- bool finished_;
- };
-
- public:
- explicit TransformingGenerator(AsyncGenerator<T> generator,
- Transformer<T, V> transformer)
- : state_(std::make_shared<TransformingGeneratorState>(std::move(generator),
- std::move(transformer))) {}
-
- Future<V> operator()() { return (*state_)(); }
-
- protected:
- std::shared_ptr<TransformingGeneratorState> state_;
-};
-
-/// \brief Transforms an async generator using a transformer function returning a new
-/// AsyncGenerator
-///
-/// The transform function here behaves exactly the same as the transform function in
-/// MakeTransformedIterator and you can safely use the same transform function to
-/// transform both synchronous and asynchronous streams.
-///
-/// This generator is not async-reentrant
-///
-/// This generator may queue up to 1 instance of T but will not delay
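-///
-/// Example (a minimal sketch of a filtering transformer, assuming the
-/// TransformYield/TransformSkip helpers declared in arrow/util/iterator.h):
-///
-/// \code
-/// Transformer<int, int> keep_even = [](int v) -> Result<TransformFlow<int>> {
-///   if (v % 2 == 0) return TransformYield(v);
-///   return TransformSkip<int>();
-/// };
-/// auto evens = MakeTransformedGenerator(
-///     MakeVectorGenerator(std::vector<int>{1, 2, 3, 4}), keep_even);
-/// \endcode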
-template <typename T, typename V>
-AsyncGenerator<V> MakeTransformedGenerator(AsyncGenerator<T> generator,
- Transformer<T, V> transformer) {
- return TransformingGenerator<T, V>(generator, transformer);
-}
-
-/// \see MakeSerialReadaheadGenerator
-template <typename T>
-class SerialReadaheadGenerator {
- public:
- SerialReadaheadGenerator(AsyncGenerator<T> source_generator, int max_readahead)
- : state_(std::make_shared<State>(std::move(source_generator), max_readahead)) {}
-
- Future<T> operator()() {
- if (state_->first_) {
- // Lazy generator, need to wait for the first ask to prime the pump
- state_->first_ = false;
- auto next = state_->source_();
- return next.Then(Callback{state_}, ErrCallback{state_});
- }
-
- // This generator is not async-reentrant. We won't be called until the last
- // future has finished, so we know there is something in the queue
- auto finished = state_->finished_.load();
- if (finished && state_->readahead_queue_.IsEmpty()) {
- return AsyncGeneratorEnd<T>();
- }
-
- std::shared_ptr<Future<T>> next;
- if (!state_->readahead_queue_.Read(next)) {
- return Status::UnknownError("Could not read from readahead_queue");
- }
-
- auto last_available = state_->spaces_available_.fetch_add(1);
- if (last_available == 0 && !finished) {
- // Reader idled out, we need to restart it
- ARROW_RETURN_NOT_OK(state_->Pump(state_));
- }
- return *next;
- }
-
- private:
- struct State {
- State(AsyncGenerator<T> source, int max_readahead)
- : first_(true),
- source_(std::move(source)),
- finished_(false),
- // There is one extra "space" for the in-flight request
- spaces_available_(max_readahead + 1),
- // The SPSC queue has size-1 "usable" slots so we need to overallocate 1
- readahead_queue_(max_readahead + 1) {}
-
- Status Pump(const std::shared_ptr<State>& self) {
- // Can't do readahead_queue.write(source().Then(...)) because then the
- // callback might run immediately and add itself to the queue before this gets added
- // to the queue, messing up the order.
- auto next_slot = std::make_shared<Future<T>>();
- auto written = readahead_queue_.Write(next_slot);
- if (!written) {
- return Status::UnknownError("Could not write to readahead_queue");
- }
- // If this Pump is being called from a callback it is possible for the source to
- // poll and read from the queue between the Write and this spot where we fill the
- // value in. However, it is not possible for the future to read this value we are
- // writing. That is because this callback (the callback for future X) must be
- // finished before future X is marked complete and this source is not pulled
- // reentrantly so it will not poll for future X+1 until this callback has completed.
- *next_slot = source_().Then(Callback{self}, ErrCallback{self});
- return Status::OK();
- }
-
- // Only accessed by the consumer end
- bool first_;
- // Accessed by both threads
- AsyncGenerator<T> source_;
- std::atomic<bool> finished_;
- // The queue has a size but it is not atomic. We keep track of how many spaces are
- // left in the queue here so we know if we've just written the last value and we need
- // to stop reading ahead or if we've just read from a full queue and we need to
- // restart reading ahead
- std::atomic<uint32_t> spaces_available_;
- // Needs to be a queue of shared_ptr and not Future because we set the value of the
- // future after we add it to the queue
- util::SpscQueue<std::shared_ptr<Future<T>>> readahead_queue_;
- };
-
- struct Callback {
- Result<T> operator()(const T& next) {
- if (IsIterationEnd(next)) {
- state_->finished_.store(true);
- return next;
- }
- auto last_available = state_->spaces_available_.fetch_sub(1);
- if (last_available > 1) {
- ARROW_RETURN_NOT_OK(state_->Pump(state_));
- }
- return next;
- }
-
- std::shared_ptr<State> state_;
- };
-
- struct ErrCallback {
- Result<T> operator()(const Status& st) {
- state_->finished_.store(true);
- return st;
- }
-
- std::shared_ptr<State> state_;
- };
-
- std::shared_ptr<State> state_;
-};
-
-/// \see MakeFromFuture
-template <typename T>
-class FutureFirstGenerator {
- public:
- explicit FutureFirstGenerator(Future<AsyncGenerator<T>> future)
- : state_(std::make_shared<State>(std::move(future))) {}
-
- Future<T> operator()() {
- if (state_->source_) {
- return state_->source_();
- } else {
- auto state = state_;
- return state_->future_.Then([state](const AsyncGenerator<T>& source) {
- state->source_ = source;
- return state->source_();
- });
- }
- }
-
- private:
- struct State {
- explicit State(Future<AsyncGenerator<T>> future) : future_(future), source_() {}
-
- Future<AsyncGenerator<T>> future_;
- AsyncGenerator<T> source_;
- };
-
- std::shared_ptr<State> state_;
-};
-
-/// \brief Transforms a Future<AsyncGenerator<T>> into an AsyncGenerator<T>
-/// that waits for the future to complete as part of the first item.
-///
-/// This generator is not async-reentrant (even if the generator yielded by future is)
-///
-/// This generator does not queue
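-///
-/// Example (a minimal sketch; in practice the future usually comes from an
-/// asynchronous open call rather than MakeFinished):
-///
-/// \code
-/// auto gen_fut = Future<AsyncGenerator<int>>::MakeFinished(
-///     MakeVectorGenerator(std::vector<int>{1, 2, 3}));
-/// AsyncGenerator<int> gen = MakeFromFuture(std::move(gen_fut));
-/// \endcode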
-template <typename T>
-AsyncGenerator<T> MakeFromFuture(Future<AsyncGenerator<T>> future) {
- return FutureFirstGenerator<T>(std::move(future));
-}
-
-/// \brief Creates a generator that will pull from the source into a queue. Unlike
-/// MakeReadaheadGenerator this will not pull reentrantly from the source.
-///
-/// The source generator does not need to be async-reentrant
-///
-/// This generator is not async-reentrant (even if the source is)
-///
-/// This generator may queue up to max_readahead additional instances of T
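-///
-/// Example (a minimal sketch; the source is only polled again after each
-/// result is delivered, so it need not be async-reentrant):
-///
-/// \code
-/// auto gen = MakeVectorGenerator(std::vector<int>{1, 2, 3, 4, 5});
-/// auto readahead = MakeSerialReadaheadGenerator(std::move(gen), /*max_readahead=*/2);
-/// \endcode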
-template <typename T>
-AsyncGenerator<T> MakeSerialReadaheadGenerator(AsyncGenerator<T> source_generator,
- int max_readahead) {
- return SerialReadaheadGenerator<T>(std::move(source_generator), max_readahead);
-}
-
-/// \see MakeReadaheadGenerator
-template <typename T>
-class ReadaheadGenerator {
- public:
- ReadaheadGenerator(AsyncGenerator<T> source_generator, int max_readahead)
- : state_(std::make_shared<State>(std::move(source_generator), max_readahead)) {}
-
- Future<T> AddMarkFinishedContinuation(Future<T> fut) {
- auto state = state_;
- return fut.Then(
- [state](const T& result) -> Result<T> {
- state->MarkFinishedIfDone(result);
- return result;
- },
- [state](const Status& err) -> Result<T> {
- state->finished.store(true);
- return err;
- });
- }
-
- Future<T> operator()() {
- if (state_->readahead_queue.empty()) {
- // This is the first request, let's pump the underlying queue
- for (int i = 0; i < state_->max_readahead; i++) {
- auto next = state_->source_generator();
- auto next_after_check = AddMarkFinishedContinuation(std::move(next));
- state_->readahead_queue.push(std::move(next_after_check));
- }
- }
- // Pop one and add one
- auto result = state_->readahead_queue.front();
- state_->readahead_queue.pop();
- if (state_->finished.load()) {
- state_->readahead_queue.push(AsyncGeneratorEnd<T>());
- } else {
- auto back_of_queue = state_->source_generator();
- auto back_of_queue_after_check =
- AddMarkFinishedContinuation(std::move(back_of_queue));
- state_->readahead_queue.push(std::move(back_of_queue_after_check));
- }
- return result;
- }
-
- private:
- struct State {
- State(AsyncGenerator<T> source_generator, int max_readahead)
- : source_generator(std::move(source_generator)), max_readahead(max_readahead) {
- finished.store(false);
- }
-
- void MarkFinishedIfDone(const T& next_result) {
- if (IsIterationEnd(next_result)) {
- finished.store(true);
- }
- }
-
- AsyncGenerator<T> source_generator;
- int max_readahead;
- std::atomic<bool> finished;
- std::queue<Future<T>> readahead_queue;
- };
-
- std::shared_ptr<State> state_;
-};
-
-/// \brief A generator where the producer pushes items on a queue.
-///
-/// No back-pressure is applied, so this generator is mostly useful when
-/// producing the values is neither CPU- nor memory-expensive (e.g. fetching
-/// filesystem metadata).
-///
-/// This generator is not async-reentrant.
-template <typename T>
-class PushGenerator {
- struct State {
- util::Mutex mutex;
- std::deque<Result<T>> result_q;
- util::optional<Future<T>> consumer_fut;
- bool finished = false;
- };
-
- public:
- /// Producer API for PushGenerator
- class Producer {
- public:
- explicit Producer(const std::shared_ptr<State>& state) : weak_state_(state) {}
-
- /// \brief Push a value on the queue
- ///
- /// True is returned if the value was pushed, false if the generator is
- /// already closed or destroyed. If the latter, it is recommended to stop
- /// producing any further values.
- bool Push(Result<T> result) {
- auto state = weak_state_.lock();
- if (!state) {
- // Generator was destroyed
- return false;
- }
- auto lock = state->mutex.Lock();
- if (state->finished) {
- // Closed early
- return false;
- }
- if (state->consumer_fut.has_value()) {
- auto fut = std::move(state->consumer_fut.value());
- state->consumer_fut.reset();
- lock.Unlock(); // unlock before potentially invoking a callback
- fut.MarkFinished(std::move(result));
- } else {
- state->result_q.push_back(std::move(result));
- }
- return true;
- }
-
- /// \brief Tell the consumer we have finished producing
- ///
- /// It is allowed to call this and later call Push() again ("early close").
- /// In this case, calls to Push() after the queue is closed are silently
- /// ignored. This can help implement non-trivial cancellation cases.
- ///
- /// True is returned on success, false if the generator is already closed
- /// or destroyed.
- bool Close() {
- auto state = weak_state_.lock();
- if (!state) {
- // Generator was destroyed
- return false;
- }
- auto lock = state->mutex.Lock();
- if (state->finished) {
- // Already closed
- return false;
- }
- state->finished = true;
- if (state->consumer_fut.has_value()) {
- auto fut = std::move(state->consumer_fut.value());
- state->consumer_fut.reset();
- lock.Unlock(); // unlock before potentially invoking a callback
- fut.MarkFinished(IterationTraits<T>::End());
- }
- return true;
- }
-
- /// Return whether the generator was closed or destroyed.
- bool is_closed() const {
- auto state = weak_state_.lock();
- if (!state) {
- // Generator was destroyed
- return true;
- }
- auto lock = state->mutex.Lock();
- return state->finished;
- }
-
- private:
- const std::weak_ptr<State> weak_state_;
- };
-
- PushGenerator() : state_(std::make_shared<State>()) {}
-
- /// Read an item from the queue
- Future<T> operator()() {
- auto lock = state_->mutex.Lock();
- assert(!state_->consumer_fut.has_value()); // Non-reentrant
- if (!state_->result_q.empty()) {
- auto fut = Future<T>::MakeFinished(std::move(state_->result_q.front()));
- state_->result_q.pop_front();
- return fut;
- }
- if (state_->finished) {
- return AsyncGeneratorEnd<T>();
- }
- auto fut = Future<T>::Make();
- state_->consumer_fut = fut;
- return fut;
- }
-
- /// \brief Return producer-side interface
- ///
- /// The returned object must be used by the producer to push values on the queue.
- /// Only a single Producer object should be instantiated.
- Producer producer() { return Producer{state_}; }
-
- private:
- const std::shared_ptr<State> state_;
-};
-
-/// \brief Creates a generator that pulls reentrantly from a source.
-/// This generator will pull reentrantly from a source, ensuring that max_readahead
-/// requests are active at any given time.
-///
-/// The source generator must be async-reentrant
-///
-/// This generator itself is async-reentrant.
-///
-/// This generator may queue up to max_readahead instances of T
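-///
-/// Example (a minimal sketch; `reentrant_source` is a placeholder for a source
-/// that is safe to poll again before its previous future finishes, which
-/// MakeVectorGenerator happens to be):
-///
-/// \code
-/// AsyncGenerator<int> reentrant_source =
-///     MakeVectorGenerator(std::vector<int>{1, 2, 3});
-/// auto gen = MakeReadaheadGenerator(std::move(reentrant_source), /*max_readahead=*/8);
-/// \endcode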
-template <typename T>
-AsyncGenerator<T> MakeReadaheadGenerator(AsyncGenerator<T> source_generator,
- int max_readahead) {
- return ReadaheadGenerator<T>(std::move(source_generator), max_readahead);
-}
-
-/// \brief Creates a generator that will yield finished futures from a vector
-///
-/// This generator is async-reentrant
-template <typename T>
-AsyncGenerator<T> MakeVectorGenerator(std::vector<T> vec) {
- struct State {
- explicit State(std::vector<T> vec_) : vec(std::move(vec_)), vec_idx(0) {}
-
- std::vector<T> vec;
- std::atomic<std::size_t> vec_idx;
- };
-
- auto state = std::make_shared<State>(std::move(vec));
- return [state]() {
- auto idx = state->vec_idx.fetch_add(1);
- if (idx >= state->vec.size()) {
- // Eagerly return memory
- state->vec.clear();
- return AsyncGeneratorEnd<T>();
- }
- return Future<T>::MakeFinished(state->vec[idx]);
- };
-}
-
-/// \see MakeMergedGenerator
-template <typename T>
-class MergedGenerator {
- public:
- explicit MergedGenerator(AsyncGenerator<AsyncGenerator<T>> source,
- int max_subscriptions)
- : state_(std::make_shared<State>(std::move(source), max_subscriptions)) {}
-
- Future<T> operator()() {
- Future<T> waiting_future;
- std::shared_ptr<DeliveredJob> delivered_job;
- {
- auto guard = state_->mutex.Lock();
- if (!state_->delivered_jobs.empty()) {
- delivered_job = std::move(state_->delivered_jobs.front());
- state_->delivered_jobs.pop_front();
- } else if (state_->finished) {
- return IterationTraits<T>::End();
- } else {
- waiting_future = Future<T>::Make();
- state_->waiting_jobs.push_back(std::make_shared<Future<T>>(waiting_future));
- }
- }
- if (delivered_job) {
- // deliverer will be invalid if outer callback encounters an error and delivers a
- // failed result
- if (delivered_job->deliverer) {
- delivered_job->deliverer().AddCallback(
- InnerCallback{state_, delivered_job->index});
- }
- return std::move(delivered_job->value);
- }
- if (state_->first) {
- state_->first = false;
- for (std::size_t i = 0; i < state_->active_subscriptions.size(); i++) {
- state_->PullSource().AddCallback(OuterCallback{state_, i});
- }
- }
- return waiting_future;
- }
-
- private:
- struct DeliveredJob {
- explicit DeliveredJob(AsyncGenerator<T> deliverer_, Result<T> value_,
- std::size_t index_)
- : deliverer(deliverer_), value(std::move(value_)), index(index_) {}
-
- AsyncGenerator<T> deliverer;
- Result<T> value;
- std::size_t index;
- };
-
- struct State {
- State(AsyncGenerator<AsyncGenerator<T>> source, int max_subscriptions)
- : source(std::move(source)),
- active_subscriptions(max_subscriptions),
- delivered_jobs(),
- waiting_jobs(),
- mutex(),
- first(true),
- source_exhausted(false),
- finished(false),
- num_active_subscriptions(max_subscriptions) {}
-
- Future<AsyncGenerator<T>> PullSource() {
- // Need to guard access to source() so we don't pull sync-reentrantly which
- // is never valid.
- auto lock = mutex.Lock();
- return source();
- }
-
- AsyncGenerator<AsyncGenerator<T>> source;
- // active_subscriptions and delivered_jobs will be bounded by max_subscriptions
- std::vector<AsyncGenerator<T>> active_subscriptions;
- std::deque<std::shared_ptr<DeliveredJob>> delivered_jobs;
- // waiting_jobs is unbounded, reentrant pulls (e.g. AddReadahead) will provide the
- // backpressure
- std::deque<std::shared_ptr<Future<T>>> waiting_jobs;
- util::Mutex mutex;
- bool first;
- bool source_exhausted;
- bool finished;
- int num_active_subscriptions;
- };
-
- struct InnerCallback {
- void operator()(const Result<T>& maybe_next) {
- Future<T> sink;
- bool sub_finished = maybe_next.ok() && IsIterationEnd(*maybe_next);
- {
- auto guard = state->mutex.Lock();
- if (state->finished) {
- // We've errored out so just ignore this result and don't keep pumping
- return;
- }
- if (!sub_finished) {
- if (state->waiting_jobs.empty()) {
- state->delivered_jobs.push_back(std::make_shared<DeliveredJob>(
- state->active_subscriptions[index], maybe_next, index));
- } else {
- sink = std::move(*state->waiting_jobs.front());
- state->waiting_jobs.pop_front();
- }
- }
- }
- if (sub_finished) {
- state->PullSource().AddCallback(OuterCallback{state, index});
- } else if (sink.is_valid()) {
- sink.MarkFinished(maybe_next);
- if (maybe_next.ok()) {
- state->active_subscriptions[index]().AddCallback(*this);
- }
- }
- }
- std::shared_ptr<State> state;
- std::size_t index;
- };
-
- struct OuterCallback {
- void operator()(const Result<AsyncGenerator<T>>& maybe_next) {
- bool should_purge = false;
- bool should_continue = false;
- Future<T> error_sink;
- {
- auto guard = state->mutex.Lock();
- if (!maybe_next.ok() || IsIterationEnd(*maybe_next)) {
- state->source_exhausted = true;
- if (!maybe_next.ok() || --state->num_active_subscriptions == 0) {
- state->finished = true;
- should_purge = true;
- }
- if (!maybe_next.ok()) {
- if (state->waiting_jobs.empty()) {
- state->delivered_jobs.push_back(std::make_shared<DeliveredJob>(
- AsyncGenerator<T>(), maybe_next.status(), index));
- } else {
- error_sink = std::move(*state->waiting_jobs.front());
- state->waiting_jobs.pop_front();
- }
- }
- } else {
- state->active_subscriptions[index] = *maybe_next;
- should_continue = true;
- }
- }
- if (error_sink.is_valid()) {
- error_sink.MarkFinished(maybe_next.status());
- }
- if (should_continue) {
- (*maybe_next)().AddCallback(InnerCallback{state, index});
- } else if (should_purge) {
- // At this point state->finished has been marked true so no one else
- // will be interacting with waiting_jobs and we can iterate outside lock
- while (!state->waiting_jobs.empty()) {
- state->waiting_jobs.front()->MarkFinished(IterationTraits<T>::End());
- state->waiting_jobs.pop_front();
- }
- }
- }
- std::shared_ptr<State> state;
- std::size_t index;
- };
-
- std::shared_ptr<State> state_;
-};
-
-/// \brief Creates a generator that takes in a stream of generators and pulls from up to
-/// max_subscriptions at a time
-///
-/// Note: This may deliver items out of sequence. For example, items from the third
-/// AsyncGenerator generated by the source may be emitted before some items from the first
-/// AsyncGenerator generated by the source.
-///
-/// This generator will pull from source async-reentrantly unless max_subscriptions is 1
-/// This generator will not pull from the individual subscriptions reentrantly. Add
-/// readahead to the individual subscriptions if that is desired.
-/// This generator is async-reentrant
-///
-/// This generator may queue up to max_subscriptions instances of T
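-///
-/// Example (a minimal sketch merging two vector-backed sources):
-///
-/// \code
-/// auto outer = MakeVectorGenerator<AsyncGenerator<int>>(
-///     {MakeVectorGenerator(std::vector<int>{1, 2}),
-///      MakeVectorGenerator(std::vector<int>{3, 4})});
-/// auto merged = MakeMergedGenerator(std::move(outer), /*max_subscriptions=*/2);
-/// // May yield 1,3,2,4 or any interleaving that preserves per-source order
-/// \endcode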
-template <typename T>
-AsyncGenerator<T> MakeMergedGenerator(AsyncGenerator<AsyncGenerator<T>> source,
- int max_subscriptions) {
- return MergedGenerator<T>(std::move(source), max_subscriptions);
-}
-
-/// \brief Creates a generator that takes in a stream of generators and pulls from each
-/// one in sequence.
-///
-/// This generator is async-reentrant but will never pull from source reentrantly and
-/// will never pull from any subscription reentrantly.
-///
-/// This generator may queue 1 instance of T
-///
-/// TODO: Could potentially make a bespoke implementation instead of MergedGenerator that
-/// forwards async-reentrant requests instead of buffering them (which is what
-/// MergedGenerator does)
-template <typename T>
-AsyncGenerator<T> MakeConcatenatedGenerator(AsyncGenerator<AsyncGenerator<T>> source) {
- return MergedGenerator<T>(std::move(source), 1);
-}
-
-template <typename T>
-struct Enumerated {
- T value;
- int index;
- bool last;
-};
-
-template <typename T>
-struct IterationTraits<Enumerated<T>> {
- static Enumerated<T> End() { return Enumerated<T>{IterationEnd<T>(), -1, false}; }
- static bool IsEnd(const Enumerated<T>& val) { return val.index < 0; }
-};
-
-/// \see MakeEnumeratedGenerator
-template <typename T>
-class EnumeratingGenerator {
- public:
- EnumeratingGenerator(AsyncGenerator<T> source, T initial_value)
- : state_(std::make_shared<State>(std::move(source), std::move(initial_value))) {}
-
- Future<Enumerated<T>> operator()() {
- if (state_->finished) {
- return AsyncGeneratorEnd<Enumerated<T>>();
- } else {
- auto state = state_;
- return state->source().Then([state](const T& next) {
- auto finished = IsIterationEnd<T>(next);
- auto prev = Enumerated<T>{state->prev_value, state->prev_index, finished};
- state->prev_value = next;
- state->prev_index++;
- state->finished = finished;
- return prev;
- });
- }
- }
-
- private:
- struct State {
- State(AsyncGenerator<T> source, T initial_value)
- : source(std::move(source)), prev_value(std::move(initial_value)), prev_index(0) {
- finished = IsIterationEnd<T>(prev_value);
- }
-
- AsyncGenerator<T> source;
- T prev_value;
- int prev_index;
- bool finished;
- };
-
- std::shared_ptr<State> state_;
-};
-
-/// Wraps items from a source generator with positional information
-///
-/// When used with MakeMergedGenerator and MakeSequencingGenerator this allows items to be
-/// processed in a "first-available" fashion and later resequenced which can reduce the
-/// impact of sources with erratic performance (e.g. a filesystem where some items may
-/// take longer to read than others).
-///
-/// TODO(ARROW-12371) Would require this generator be async-reentrant
-///
-/// \see MakeSequencingGenerator for an example of putting items back in order
-///
-/// This generator is not async-reentrant
-///
-/// This generator buffers one item (so it knows which item is the last item)
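-///
-/// Example (a minimal sketch):
-///
-/// \code
-/// auto enumerated = MakeEnumeratedGenerator(
-///     MakeVectorGenerator(std::vector<int>{7, 8}));
-/// // Yields Enumerated<int>{7, 0, false} then Enumerated<int>{8, 1, true}
-/// \endcode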
-template <typename T>
-AsyncGenerator<Enumerated<T>> MakeEnumeratedGenerator(AsyncGenerator<T> source) {
- return FutureFirstGenerator<Enumerated<T>>(
- source().Then([source](const T& initial_value) -> AsyncGenerator<Enumerated<T>> {
- return EnumeratingGenerator<T>(std::move(source), initial_value);
- }));
-}
-
-/// \see MakeTransferredGenerator
-template <typename T>
-class TransferringGenerator {
- public:
- explicit TransferringGenerator(AsyncGenerator<T> source, internal::Executor* executor)
- : source_(std::move(source)), executor_(executor) {}
-
- Future<T> operator()() { return executor_->Transfer(source_()); }
-
- private:
- AsyncGenerator<T> source_;
- internal::Executor* executor_;
-};
-
-/// \brief Transfers the futures produced by a generator to an underlying executor.
-///
-/// Continuations run on the returned generator's futures will be run on the given
-/// executor if they cannot be run synchronously.
-///
-/// This is often needed to move computation off I/O threads or other external
-/// completion sources and back on to the CPU executor so the I/O thread can
-/// stay busy and focused on I/O
-///
-/// Keep in mind that continuations called on an already completed future will
-/// always be run synchronously and so no transfer will happen in that case.
-///
-/// This generator is async reentrant if the source is
-///
-/// This generator will not queue
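-///
-/// Example (a minimal sketch; `io_gen` is a placeholder for a generator whose
-/// futures complete on an I/O thread):
-///
-/// \code
-/// AsyncGenerator<int> io_gen = MakeVectorGenerator(std::vector<int>{1, 2, 3});
-/// auto cpu_gen = MakeTransferredGenerator(std::move(io_gen),
-///                                         ::arrow::internal::GetCpuThreadPool());
-/// \endcode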
-template <typename T>
-AsyncGenerator<T> MakeTransferredGenerator(AsyncGenerator<T> source,
- internal::Executor* executor) {
- return TransferringGenerator<T>(std::move(source), executor);
-}
-
-/// \see MakeBackgroundGenerator
-template <typename T>
-class BackgroundGenerator {
- public:
- explicit BackgroundGenerator(Iterator<T> it, internal::Executor* io_executor, int max_q,
- int q_restart)
- : state_(std::make_shared<State>(io_executor, std::move(it), max_q, q_restart)),
- cleanup_(std::make_shared<Cleanup>(state_.get())) {}
-
- Future<T> operator()() {
- auto guard = state_->mutex.Lock();
- Future<T> waiting_future;
- if (state_->queue.empty()) {
- if (state_->finished) {
- return AsyncGeneratorEnd<T>();
- } else {
- waiting_future = Future<T>::Make();
- state_->waiting_future = waiting_future;
- }
- } else {
- auto next = Future<T>::MakeFinished(std::move(state_->queue.front()));
- state_->queue.pop();
- if (state_->NeedsRestart()) {
- return state_->RestartTask(state_, std::move(guard), std::move(next));
- }
- return next;
- }
- // This should only trigger the very first time this method is called
- if (state_->NeedsRestart()) {
- return state_->RestartTask(state_, std::move(guard), std::move(waiting_future));
- }
- return waiting_future;
- }
-
- protected:
- static constexpr uint64_t kUnlikelyThreadId{std::numeric_limits<uint64_t>::max()};
-
- struct State {
- State(internal::Executor* io_executor, Iterator<T> it, int max_q, int q_restart)
- : io_executor(io_executor),
- max_q(max_q),
- q_restart(q_restart),
- it(std::move(it)),
- reading(false),
- finished(false),
- should_shutdown(false) {}
-
- void ClearQueue() {
- while (!queue.empty()) {
- queue.pop();
- }
- }
-
- bool TaskIsRunning() const { return task_finished.is_valid(); }
-
- bool NeedsRestart() const {
- return !finished && !reading && static_cast<int>(queue.size()) <= q_restart;
- }
-
- void DoRestartTask(std::shared_ptr<State> state, util::Mutex::Guard guard) {
- // If we get here we are actually going to start a new task so let's create a
- // task_finished future for it
- state->task_finished = Future<>::Make();
- state->reading = true;
- auto spawn_status = io_executor->Spawn(
- [state]() { BackgroundGenerator::WorkerTask(std::move(state)); });
- if (!spawn_status.ok()) {
- // If we can't spawn a new task then send an error to the consumer (either via a
- // waiting future or the queue) and mark ourselves finished
- state->finished = true;
- state->task_finished = Future<>();
- if (waiting_future.has_value()) {
- auto to_deliver = std::move(waiting_future.value());
- waiting_future.reset();
- guard.Unlock();
- to_deliver.MarkFinished(spawn_status);
- } else {
- ClearQueue();
- queue.push(spawn_status);
- }
- }
- }
-
- Future<T> RestartTask(std::shared_ptr<State> state, util::Mutex::Guard guard,
- Future<T> next) {
- if (TaskIsRunning()) {
- // If the task is still cleaning up we need to wait for it to finish before
- // restarting. We also want to block the consumer until we've restarted the
- // reader to avoid multiple restarts
- return task_finished.Then([state, next]() {
- // This may appear dangerous (recursive mutex) but we should be guaranteed the
- // outer guard has been released by this point. We know...
- // * task_finished is not already finished (it would be invalid in that case)
- // * task_finished will not be marked complete until we've given up the mutex
- auto guard_ = state->mutex.Lock();
- state->DoRestartTask(state, std::move(guard_));
- return next;
- });
- }
- // Otherwise we can restart immediately
- DoRestartTask(std::move(state), std::move(guard));
- return next;
- }
-
- internal::Executor* io_executor;
- const int max_q;
- const int q_restart;
- Iterator<T> it;
- std::atomic<uint64_t> worker_thread_id{kUnlikelyThreadId};
-
- // If true, the task is actively pumping items from the queue and does not need a
- // restart
- bool reading;
- // Set to true when a terminal item arrives
- bool finished;
- // Signal to the background task to end early because consumers have given up on it
- bool should_shutdown;
- // If the queue is empty, the consumer will create a waiting future and wait for it
- std::queue<Result<T>> queue;
- util::optional<Future<T>> waiting_future;
- // Every background task is given a future to complete when it is entirely finished
- // processing and ready for the next task to start or for State to be destroyed
- Future<> task_finished;
- util::Mutex mutex;
- };
-
- // Cleanup task that will be run when all consumer references to the generator are lost
- struct Cleanup {
- explicit Cleanup(State* state) : state(state) {}
- ~Cleanup() {
- /// TODO: Once ARROW-13109 is available then we can force consumers to spawn and
- /// there will be no need to perform this check.
- ///
- /// It's a deadlock if we enter cleanup from the worker thread, but it can happen
- /// if the consumer doesn't transfer away
- assert(state->worker_thread_id.load() != ::arrow::internal::GetThreadId());
- Future<> finish_fut;
- {
- auto lock = state->mutex.Lock();
- if (!state->TaskIsRunning()) {
- return;
- }
- // Signal the current task to stop and wait for it to finish
- state->should_shutdown = true;
- finish_fut = state->task_finished;
- }
- // Using future as a condition variable here
- Status st = finish_fut.status();
- ARROW_UNUSED(st);
- }
- State* state;
- };
-
- static void WorkerTask(std::shared_ptr<State> state) {
- state->worker_thread_id.store(::arrow::internal::GetThreadId());
- // We need to capture the state to read while outside the mutex
- bool reading = true;
- while (reading) {
- auto next = state->it.Next();
- // Need to capture state->waiting_future inside the mutex to mark finished outside
- Future<T> waiting_future;
- {
- auto guard = state->mutex.Lock();
-
- if (state->should_shutdown) {
- state->finished = true;
- break;
- }
-
- if (!next.ok() || IsIterationEnd<T>(*next)) {
- // Terminal item. Mark finished to true, send this last item, and quit
- state->finished = true;
- if (!next.ok()) {
- state->ClearQueue();
- }
- }
- // At this point we are going to send an item. Either we will add it to the
- // queue or deliver it to a waiting future.
- if (state->waiting_future.has_value()) {
- waiting_future = std::move(state->waiting_future.value());
- state->waiting_future.reset();
- } else {
- state->queue.push(std::move(next));
- // We just filled up the queue so it is time to quit. We may need to notify
- // a cleanup task so we transition to Quitting
- if (static_cast<int>(state->queue.size()) >= state->max_q) {
- state->reading = false;
- }
- }
- reading = state->reading && !state->finished;
- }
- // This should happen outside the mutex. Presumably there is a
- // transferring generator on the other end that will quickly transfer any
- // callbacks off of this thread so we can continue looping. Still, best not to
- // rely on that
- if (waiting_future.is_valid()) {
- waiting_future.MarkFinished(next);
- }
- }
- // Once we've sent our last item we can notify any waiters that we are done and so
- // either state can be cleaned up or a new background task can be started
- Future<> task_finished;
- {
- auto guard = state->mutex.Lock();
- // After we give up the mutex, state can be safely deleted. We will no longer
- // reference it. We can safely transition to idle now.
- task_finished = state->task_finished;
- state->task_finished = Future<>();
- state->worker_thread_id.store(kUnlikelyThreadId);
- }
- task_finished.MarkFinished();
- }
-
- std::shared_ptr<State> state_;
- // state_ is held by both the generator and the background thread so it won't be cleaned
- // up when all consumer references are relinquished. cleanup_ is only held by the
- // generator so it will be destructed when the last consumer reference is gone. We use
-// this to clean up / stop the background generator in case the consuming end stops
- // listening (e.g. due to a downstream error)
- std::shared_ptr<Cleanup> cleanup_;
-};
-
-constexpr int kDefaultBackgroundMaxQ = 32;
-constexpr int kDefaultBackgroundQRestart = 16;
-
-/// \brief Creates an AsyncGenerator<T> by iterating over an Iterator<T> on a background
-/// thread
-///
-/// The parameter max_q and q_restart control queue size and background thread task
-/// management. If the background task is fast you typically don't want it creating a
-/// thread task for every item. Instead the background thread will run until it fills
-/// up a readahead queue.
-///
-/// Once the queue has filled up the background thread task will terminate (allowing other
-/// I/O tasks to use the thread). Once the queue has been drained enough (specified by
-/// q_restart) then the background thread task will be restarted. If q_restart is too low
-/// then you may exhaust the queue waiting for the background thread task to start running
-/// again. If it is too high then the background thread task will be constantly
-/// stopping and restarting.
-///
-/// The "background thread" is a logical thread and will run as tasks on the io_executor.
-/// This thread may stop and start when the queue fills up but there will only be one
-/// active background thread task at any given time. You MUST transfer away from this
-/// background generator. Otherwise there could be a race condition if a callback on the
-/// background thread deletes the last consumer reference to the background generator. You
-/// can transfer onto the same executor as the background thread; it is only necessary to
-/// create a new thread task, not to switch executors.
-///
-/// This generator is not async-reentrant
-///
-/// This generator will queue up to max_q blocks
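-///
-/// Example (a minimal sketch; `io_executor` is a placeholder I/O executor, and
-/// the result is immediately transferred back to the CPU pool as required above):
-///
-/// \code
-/// Iterator<int> it = MakeVectorIterator(std::vector<int>{1, 2, 3});
-/// ARROW_ASSIGN_OR_RAISE(auto bg_gen,
-///                       MakeBackgroundGenerator(std::move(it), io_executor));
-/// auto gen = MakeTransferredGenerator(std::move(bg_gen),
-///                                     ::arrow::internal::GetCpuThreadPool());
-/// \endcode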
-template <typename T>
-static Result<AsyncGenerator<T>> MakeBackgroundGenerator(
- Iterator<T> iterator, internal::Executor* io_executor,
- int max_q = kDefaultBackgroundMaxQ, int q_restart = kDefaultBackgroundQRestart) {
- if (max_q < q_restart) {
- return Status::Invalid("max_q must be >= q_restart");
- }
- return BackgroundGenerator<T>(std::move(iterator), io_executor, max_q, q_restart);
-}
-
-/// \see MakeGeneratorIterator
-template <typename T>
-class GeneratorIterator {
- public:
- explicit GeneratorIterator(AsyncGenerator<T> source) : source_(std::move(source)) {}
-
- Result<T> Next() { return source_().result(); }
-
- private:
- AsyncGenerator<T> source_;
-};
-
-/// \brief Converts an AsyncGenerator<T> to an Iterator<T> by blocking until each future
-/// is finished
-template <typename T>
-Iterator<T> MakeGeneratorIterator(AsyncGenerator<T> source) {
- return Iterator<T>(GeneratorIterator<T>(std::move(source)));
-}
-
-/// \brief Adds readahead to an iterator using a background thread.
-///
-/// Under the hood this is converting the iterator to a generator using
-/// MakeBackgroundGenerator, adding readahead to the converted generator with
-/// MakeReadaheadGenerator, and then converting back to an iterator using
-/// MakeGeneratorIterator.
-template <typename T>
-Result<Iterator<T>> MakeReadaheadIterator(Iterator<T> it, int readahead_queue_size) {
- ARROW_ASSIGN_OR_RAISE(auto io_executor, internal::ThreadPool::Make(1));
- auto max_q = readahead_queue_size;
- auto q_restart = std::max(1, max_q / 2);
- ARROW_ASSIGN_OR_RAISE(
- auto background_generator,
- MakeBackgroundGenerator(std::move(it), io_executor.get(), max_q, q_restart));
- // Capture io_executor to keep it alive as long as owned_bg_generator is still
- // referenced
- AsyncGenerator<T> owned_bg_generator = [io_executor, background_generator]() {
- return background_generator();
- };
- return MakeGeneratorIterator(std::move(owned_bg_generator));
-}
-
-/// \brief Make a generator that returns a single pre-generated future
-///
-/// This generator is async-reentrant.
-template <typename T>
-std::function<Future<T>()> MakeSingleFutureGenerator(Future<T> future) {
- assert(future.is_valid());
- auto state = std::make_shared<Future<T>>(std::move(future));
- return [state]() -> Future<T> {
- auto fut = std::move(*state);
- if (fut.is_valid()) {
- return fut;
- } else {
- return AsyncGeneratorEnd<T>();
- }
- };
-}
-
-/// \brief Make a generator that immediately ends.
-///
-/// This generator is async-reentrant.
-template <typename T>
-std::function<Future<T>()> MakeEmptyGenerator() {
- return []() -> Future<T> { return AsyncGeneratorEnd<T>(); };
-}
-
-/// \brief Make a generator that always fails with a given error
-///
-/// This generator is async-reentrant.
-template <typename T>
-AsyncGenerator<T> MakeFailingGenerator(Status st) {
- assert(!st.ok());
- auto state = std::make_shared<Status>(std::move(st));
- return [state]() -> Future<T> {
- auto st = std::move(*state);
- if (!st.ok()) {
- return std::move(st);
- } else {
- return AsyncGeneratorEnd<T>();
- }
- };
-}
-
-/// \brief Make a generator that always fails with a given error
-///
-/// This overload allows inferring the return type from the argument.
-template <typename T>
-AsyncGenerator<T> MakeFailingGenerator(const Result<T>& result) {
- return MakeFailingGenerator<T>(result.status());
-}
-
-/// \brief Prepends initial_values onto a generator
-///
-/// This generator is async-reentrant but will buffer requests and will not
-/// pull from following_values async-reentrantly.
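-///
-/// Example (a minimal sketch; yields 1, 2, 3, 4):
-///
-/// \code
-/// auto gen = MakeGeneratorStartsWith<int>(
-///     {1, 2}, MakeVectorGenerator(std::vector<int>{3, 4}));
-/// \endcode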
-template <typename T>
-AsyncGenerator<T> MakeGeneratorStartsWith(std::vector<T> initial_values,
- AsyncGenerator<T> following_values) {
- auto initial_values_vec_gen = MakeVectorGenerator(std::move(initial_values));
- auto gen_gen = MakeVectorGenerator<AsyncGenerator<T>>(
- {std::move(initial_values_vec_gen), std::move(following_values)});
- return MakeConcatenatedGenerator(std::move(gen_gen));
-}
-
-template <typename T>
-struct CancellableGenerator {
- Future<T> operator()() {
- if (stop_token.IsStopRequested()) {
- return stop_token.Poll();
- }
- return source();
- }
-
- AsyncGenerator<T> source;
- StopToken stop_token;
-};
-
-/// \brief Allows an async generator to be cancelled
-///
-/// This generator is async-reentrant
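-///
-/// Example (a minimal sketch; `source` is a placeholder generator and StopSource
-/// comes from arrow/util/cancel.h):
-///
-/// \code
-/// StopSource stop_source;
-/// AsyncGenerator<int> source = MakeVectorGenerator(std::vector<int>{1, 2, 3});
-/// auto gen = MakeCancellable(std::move(source), stop_source.token());
-/// // Later, possibly from another thread:
-/// stop_source.RequestStop();
-/// \endcode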
-template <typename T>
-AsyncGenerator<T> MakeCancellable(AsyncGenerator<T> source, StopToken stop_token) {
- return CancellableGenerator<T>{std::move(source), std::move(stop_token)};
-}
-
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <cstring>
+#include <deque>
+#include <limits>
+#include <queue>
+
+#include "arrow/util/functional.h"
+#include "arrow/util/future.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/mutex.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/queue.h"
+#include "arrow/util/thread_pool.h"
+
+namespace arrow {
+
+// The methods in this file create, modify, and utilize AsyncGenerator which is an
+// iterator of futures. This allows an asynchronous source (like file input) to be run
+// through a pipeline in the same way that iterators can be used to create pipelined
+// workflows.
+//
+// In order to support pipeline parallelism we introduce the concept of asynchronous
+ // reentrancy. This is different from synchronous reentrancy. With synchronous code a
+// function is reentrant if the function can be called again while a previous call to that
+// function is still running. Unless otherwise specified none of these generators are
+// synchronously reentrant. Care should be taken to avoid calling them in such a way (and
+// the utilities Visit/Collect/Await take care to do this).
+//
+// Asynchronous reentrancy on the other hand means the function is called again before the
+// future returned by the function is marked finished (but after the call to get the
+// future returns). Some of these generators are async-reentrant while others (e.g.
+// those that depend on ordered processing like decompression) are not. Read the MakeXYZ
+// function comments to determine which generators support async reentrancy.
+//
+// Note: Generators that are not asynchronously reentrant can still support readahead
+// (\see MakeSerialReadaheadGenerator).
+//
+// Readahead operators, and some other operators, may introduce queueing. Any operators
+// that introduce buffering should detail the amount of buffering they introduce in their
+// MakeXYZ function comments.
+template <typename T>
+using AsyncGenerator = std::function<Future<T>()>;
+
+template <typename T>
+struct IterationTraits<AsyncGenerator<T>> {
+ /// \brief By default, when iterating through a sequence of AsyncGenerator<T>,
+ /// an empty function indicates the end of iteration.
+ static AsyncGenerator<T> End() { return AsyncGenerator<T>(); }
+
+ static bool IsEnd(const AsyncGenerator<T>& val) { return !val; }
+};
+
+template <typename T>
+Future<T> AsyncGeneratorEnd() {
+ return Future<T>::MakeFinished(IterationTraits<T>::End());
+}
+
+/// \brief Applies a visitor function to each item of an async generator,
+/// returning a future that completes when all have been visited
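+///
+/// Example (a minimal sketch; relies on the default IterationTraits<int>, whose
+/// end token is 0, so only non-zero values are yielded):
+///
+/// \code
+/// AsyncGenerator<int> gen = MakeVectorGenerator(std::vector<int>{1, 2, 3});
+/// Future<> visited = VisitAsyncGenerator<int>(std::move(gen), [](const int& v) {
+///   return v > 0 ? Status::OK() : Status::Invalid("unexpected value");
+/// });
+/// visited.Wait();  // blocks until every item has been visited or an error occurs
+/// \endcode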
+template <typename T, typename Visitor>
+Future<> VisitAsyncGenerator(AsyncGenerator<T> generator, Visitor visitor) {
+ struct LoopBody {
+ struct Callback {
+ Result<ControlFlow<>> operator()(const T& next) {
+ if (IsIterationEnd(next)) {
+ return Break();
+ } else {
+ auto visited = visitor(next);
+ if (visited.ok()) {
+ return Continue();
+ } else {
+ return visited;
+ }
+ }
+ }
+
+ Visitor visitor;
+ };
+
+ Future<ControlFlow<>> operator()() {
+ Callback callback{visitor};
+ auto next = generator();
+ return next.Then(std::move(callback));
+ }
+
+ AsyncGenerator<T> generator;
+ Visitor visitor;
+ };
+
+ return Loop(LoopBody{std::move(generator), std::move(visitor)});
+}
+
+/// \brief Waits for an async generator to complete, discarding results.
+template <typename T>
+Future<> DiscardAllFromAsyncGenerator(AsyncGenerator<T> generator) {
+ std::function<Status(T)> visitor = [](const T&) { return Status::OK(); };
+ return VisitAsyncGenerator(generator, visitor);
+}
+
+/// \brief Collects the results of an async generator into a vector
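+///
+/// Example (a minimal sketch; with the default IterationTraits<int> the value 0
+/// acts as the end token, so only non-zero values are safe to yield):
+///
+/// \code
+/// auto gen = MakeVectorGenerator(std::vector<int>{1, 2, 3});
+/// Future<std::vector<int>> collected = CollectAsyncGenerator(std::move(gen));
+/// // collected.result() eventually holds the vector {1, 2, 3}
+/// \endcode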
+template <typename T>
+Future<std::vector<T>> CollectAsyncGenerator(AsyncGenerator<T> generator) {
+ auto vec = std::make_shared<std::vector<T>>();
+ struct LoopBody {
+ Future<ControlFlow<std::vector<T>>> operator()() {
+ auto next = generator_();
+ auto vec = vec_;
+ return next.Then([vec](const T& result) -> Result<ControlFlow<std::vector<T>>> {
+ if (IsIterationEnd(result)) {
+ return Break(*vec);
+ } else {
+ vec->push_back(result);
+ return Continue();
+ }
+ });
+ }
+ AsyncGenerator<T> generator_;
+ std::shared_ptr<std::vector<T>> vec_;
+ };
+ return Loop(LoopBody{std::move(generator), std::move(vec)});
+}
+
+/// \see MakeMappedGenerator
+template <typename T, typename V>
+class MappingGenerator {
+ public:
+ MappingGenerator(AsyncGenerator<T> source, std::function<Future<V>(const T&)> map)
+ : state_(std::make_shared<State>(std::move(source), std::move(map))) {}
+
+ Future<V> operator()() {
+ auto future = Future<V>::Make();
+ bool should_trigger;
+ {
+ auto guard = state_->mutex.Lock();
+ if (state_->finished) {
+ return AsyncGeneratorEnd<V>();
+ }
+ should_trigger = state_->waiting_jobs.empty();
+ state_->waiting_jobs.push_back(future);
+ }
+ if (should_trigger) {
+ state_->source().AddCallback(Callback{state_});
+ }
+ return future;
+ }
+
+ private:
+ struct State {
+ State(AsyncGenerator<T> source, std::function<Future<V>(const T&)> map)
+ : source(std::move(source)),
+ map(std::move(map)),
+ waiting_jobs(),
+ mutex(),
+ finished(false) {}
+
+ void Purge() {
+ // This might be called by an original callback (if the source iterator fails or
+ // ends) or by a mapped callback (if the map function fails or ends prematurely).
+ // Either way it should only be called once and after finished is set so there is no
+ // need to guard access to `waiting_jobs`.
+ while (!waiting_jobs.empty()) {
+ waiting_jobs.front().MarkFinished(IterationTraits<V>::End());
+ waiting_jobs.pop_front();
+ }
+ }
+
+ AsyncGenerator<T> source;
+ std::function<Future<V>(const T&)> map;
+ std::deque<Future<V>> waiting_jobs;
+ util::Mutex mutex;
+ bool finished;
+ };
+
+ struct Callback;
+
+ struct MappedCallback {
+ void operator()(const Result<V>& maybe_next) {
+ bool end = !maybe_next.ok() || IsIterationEnd(*maybe_next);
+ bool should_purge = false;
+ if (end) {
+ {
+ auto guard = state->mutex.Lock();
+ should_purge = !state->finished;
+ state->finished = true;
+ }
+ }
+ sink.MarkFinished(maybe_next);
+ if (should_purge) {
+ state->Purge();
+ }
+ }
+ std::shared_ptr<State> state;
+ Future<V> sink;
+ };
+
+ struct Callback {
+ void operator()(const Result<T>& maybe_next) {
+ Future<V> sink;
+ bool end = !maybe_next.ok() || IsIterationEnd(*maybe_next);
+ bool should_purge = false;
+ bool should_trigger;
+ {
+ auto guard = state->mutex.Lock();
+ if (end) {
+ should_purge = !state->finished;
+ state->finished = true;
+ }
+ sink = state->waiting_jobs.front();
+ state->waiting_jobs.pop_front();
+ should_trigger = !end && !state->waiting_jobs.empty();
+ }
+ if (should_purge) {
+ state->Purge();
+ }
+ if (should_trigger) {
+ state->source().AddCallback(Callback{state});
+ }
+ if (maybe_next.ok()) {
+ const T& val = maybe_next.ValueUnsafe();
+ if (IsIterationEnd(val)) {
+ sink.MarkFinished(IterationTraits<V>::End());
+ } else {
+ Future<V> mapped_fut = state->map(val);
+ mapped_fut.AddCallback(MappedCallback{std::move(state), std::move(sink)});
+ }
+ } else {
+ sink.MarkFinished(maybe_next.status());
+ }
+ }
+
+ std::shared_ptr<State> state;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \brief Creates a generator that will apply the map function to each element of
+/// source. The map function is not called on the end token.
+///
+/// Note: This function makes a copy of `map` for each item
+/// Note: Errors returned from the `map` function will be propagated
+///
+/// If the source generator is async-reentrant then this generator will be as well
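+///
+/// Example (a minimal sketch mapping ints to their string representation; the
+/// map function may also return Result<V> or Future<V>):
+///
+/// \code
+/// auto source = MakeVectorGenerator(std::vector<int>{1, 2, 3});
+/// AsyncGenerator<std::string> mapped = MakeMappedGenerator(
+///     std::move(source), [](const int& v) { return std::to_string(v); });
+/// \endcode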
+template <typename T, typename MapFn,
+ typename Mapped = detail::result_of_t<MapFn(const T&)>,
+ typename V = typename EnsureFuture<Mapped>::type::ValueType>
+AsyncGenerator<V> MakeMappedGenerator(AsyncGenerator<T> source_generator, MapFn map) {
+ struct MapCallback {
+ MapFn map_;
+
+ Future<V> operator()(const T& val) { return ToFuture(map_(val)); }
+ };
+
+ return MappingGenerator<T, V>(std::move(source_generator), MapCallback{std::move(map)});
+}
+
+/// \see MakeSequencingGenerator
+template <typename T, typename ComesAfter, typename IsNext>
+class SequencingGenerator {
+ public:
+ SequencingGenerator(AsyncGenerator<T> source, ComesAfter compare, IsNext is_next,
+ T initial_value)
+ : state_(std::make_shared<State>(std::move(source), std::move(compare),
+ std::move(is_next), std::move(initial_value))) {}
+
+ Future<T> operator()() {
+ {
+ auto guard = state_->mutex.Lock();
+ // We can send a result immediately if the top of the queue is either an
+ // error or the next item
+ if (!state_->queue.empty() &&
+ (!state_->queue.top().ok() ||
+ state_->is_next(state_->previous_value, *state_->queue.top()))) {
+ auto result = std::move(state_->queue.top());
+ if (result.ok()) {
+ state_->previous_value = *result;
+ }
+ state_->queue.pop();
+ return Future<T>::MakeFinished(result);
+ }
+ if (state_->finished) {
+ return AsyncGeneratorEnd<T>();
+ }
+ // The next item is not in the queue so we will need to wait
+ auto new_waiting_fut = Future<T>::Make();
+ state_->waiting_future = new_waiting_fut;
+ guard.Unlock();
+ state_->source().AddCallback(Callback{state_});
+ return new_waiting_fut;
+ }
+ }
+
+ private:
+ struct WrappedComesAfter {
+ bool operator()(const Result<T>& left, const Result<T>& right) {
+ if (!left.ok() || !right.ok()) {
+ // Should never happen
+ return false;
+ }
+ return compare(*left, *right);
+ }
+ ComesAfter compare;
+ };
+
+ struct State {
+ State(AsyncGenerator<T> source, ComesAfter compare, IsNext is_next, T initial_value)
+ : source(std::move(source)),
+ is_next(std::move(is_next)),
+ previous_value(std::move(initial_value)),
+ waiting_future(),
+ queue(WrappedComesAfter{compare}),
+ finished(false),
+ mutex() {}
+
+ AsyncGenerator<T> source;
+ IsNext is_next;
+ T previous_value;
+ Future<T> waiting_future;
+ std::priority_queue<Result<T>, std::vector<Result<T>>, WrappedComesAfter> queue;
+ bool finished;
+ util::Mutex mutex;
+ };
+
+ class Callback {
+ public:
+ explicit Callback(std::shared_ptr<State> state) : state_(std::move(state)) {}
+
+ void operator()(const Result<T> result) {
+ Future<T> to_deliver;
+ bool finished;
+ {
+ auto guard = state_->mutex.Lock();
+ bool ready_to_deliver = false;
+ if (!result.ok()) {
+ // Clear any cached results
+ while (!state_->queue.empty()) {
+ state_->queue.pop();
+ }
+ ready_to_deliver = true;
+ state_->finished = true;
+ } else if (IsIterationEnd<T>(result.ValueUnsafe())) {
+ ready_to_deliver = state_->queue.empty();
+ state_->finished = true;
+ } else {
+ ready_to_deliver = state_->is_next(state_->previous_value, *result);
+ }
+
+ if (ready_to_deliver && state_->waiting_future.is_valid()) {
+ to_deliver = state_->waiting_future;
+ if (result.ok()) {
+ state_->previous_value = *result;
+ }
+ } else {
+ state_->queue.push(result);
+ }
+ // Capture state_->finished so we can access it outside the mutex
+ finished = state_->finished;
+ }
+ // Must deliver result outside of the mutex
+ if (to_deliver.is_valid()) {
+ to_deliver.MarkFinished(result);
+ } else {
+ // Otherwise, if we didn't get the next item (or a terminal item), we
+ // need to keep looking
+ if (!finished) {
+ state_->source().AddCallback(Callback{state_});
+ }
+ }
+ }
+
+ private:
+ const std::shared_ptr<State> state_;
+ };
+
+ const std::shared_ptr<State> state_;
+};
+
+/// \brief Buffers an AsyncGenerator to return values in sequence order. ComesAfter
+/// and IsNext determine the sequence order.
+///
+/// ComesAfter should be a BinaryPredicate that only returns true if `a` comes after `b`
+///
+/// IsNext should be a BinaryPredicate that returns true, given `a` and `b`, only if
+/// `b` follows immediately after `a`. It should return true given `initial_value` and
+/// `b` if `b` is the first item in the sequence.
+///
+/// This operator will queue unboundedly while waiting for the next item. It is intended
+/// for jittery sources that might scatter an ordered sequence. It is NOT intended to
+/// sort. Using it to try to sort could result in excessive RAM usage. This generator
+/// will queue up to N blocks where N is the max "out of order"ness of the source.
+///
+/// For example, if the source is 1,6,2,5,4,3 it will queue 3 blocks because 3 is 3
+/// blocks beyond where it belongs.
+///
+/// This generator is not async-reentrant but it consists only of a simple log(n)
+/// insertion into a priority queue.
+template <typename T, typename ComesAfter, typename IsNext>
+AsyncGenerator<T> MakeSequencingGenerator(AsyncGenerator<T> source_generator,
+ ComesAfter compare, IsNext is_next,
+ T initial_value) {
+ return SequencingGenerator<T, ComesAfter, IsNext>(
+ std::move(source_generator), std::move(compare), std::move(is_next),
+ std::move(initial_value));
+}
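A minimal sketch, assuming the same includes as the earlier sketch; it re-sequences the out-of-order stream 2,1,3 into 1,2,3:

    arrow::AsyncGenerator<int> jittery =
        arrow::MakeVectorGenerator(std::vector<int>{2, 1, 3});
    auto sequenced = arrow::MakeSequencingGenerator(
        std::move(jittery),
        /*compare=*/[](const int& a, const int& b) { return a > b; },  // a comes after b
        /*is_next=*/[](const int& a, const int& b) { return b == a + 1; },
        /*initial_value=*/0);
    assert(sequenced().result().ValueOrDie() == 1);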
+
+/// \see MakeTransformedGenerator
+template <typename T, typename V>
+class TransformingGenerator {
+ // The transforming generator state will be referenced as an async generator but will
+ // also be referenced via callback to various futures. If the async generator owner
+ // moves it around we need the state to be consistent for future callbacks.
+ struct TransformingGeneratorState
+ : std::enable_shared_from_this<TransformingGeneratorState> {
+ TransformingGeneratorState(AsyncGenerator<T> generator, Transformer<T, V> transformer)
+ : generator_(std::move(generator)),
+ transformer_(std::move(transformer)),
+ last_value_(),
+ finished_() {}
+
+ Future<V> operator()() {
+ while (true) {
+ auto maybe_next_result = Pump();
+ if (!maybe_next_result.ok()) {
+ return Future<V>::MakeFinished(maybe_next_result.status());
+ }
+ auto maybe_next = std::move(maybe_next_result).ValueUnsafe();
+ if (maybe_next.has_value()) {
+ return Future<V>::MakeFinished(*std::move(maybe_next));
+ }
+
+ auto next_fut = generator_();
+ // If finished already, process results immediately inside the loop to avoid
+ // stack overflow
+ if (next_fut.is_finished()) {
+ auto next_result = next_fut.result();
+ if (next_result.ok()) {
+ last_value_ = *next_result;
+ } else {
+ return Future<V>::MakeFinished(next_result.status());
+ }
+ // Otherwise, if not finished immediately, add callback to process results
+ } else {
+ auto self = this->shared_from_this();
+ return next_fut.Then([self](const T& next_result) {
+ self->last_value_ = next_result;
+ return (*self)();
+ });
+ }
+ }
+ }
+
+ // See comment on TransformingIterator::Pump
+ Result<util::optional<V>> Pump() {
+ if (!finished_ && last_value_.has_value()) {
+ ARROW_ASSIGN_OR_RAISE(TransformFlow<V> next, transformer_(*last_value_));
+ if (next.ReadyForNext()) {
+ if (IsIterationEnd(*last_value_)) {
+ finished_ = true;
+ }
+ last_value_.reset();
+ }
+ if (next.Finished()) {
+ finished_ = true;
+ }
+ if (next.HasValue()) {
+ return next.Value();
+ }
+ }
+ if (finished_) {
+ return IterationTraits<V>::End();
+ }
+ return util::nullopt;
+ }
+
+ AsyncGenerator<T> generator_;
+ Transformer<T, V> transformer_;
+ util::optional<T> last_value_;
+ bool finished_;
+ };
+
+ public:
+ explicit TransformingGenerator(AsyncGenerator<T> generator,
+ Transformer<T, V> transformer)
+ : state_(std::make_shared<TransformingGeneratorState>(std::move(generator),
+ std::move(transformer))) {}
+
+ Future<V> operator()() { return (*state_)(); }
+
+ protected:
+ std::shared_ptr<TransformingGeneratorState> state_;
+};
+
+/// \brief Transforms an async generator using a transformer function returning a new
+/// AsyncGenerator
+///
+/// The transform function here behaves exactly the same as the transform function in
+/// MakeTransformedIterator and you can safely use the same transform function to
+/// transform both synchronous and asynchronous streams.
+///
+/// This generator is not async-reentrant
+///
+/// This generator may queue up to 1 instance of T but will not delay
+template <typename T, typename V>
+AsyncGenerator<V> MakeTransformedGenerator(AsyncGenerator<T> generator,
+ Transformer<T, V> transformer) {
+ return TransformingGenerator<T, V>(generator, transformer);
+}
+
+/// \see MakeSerialReadaheadGenerator
+template <typename T>
+class SerialReadaheadGenerator {
+ public:
+ SerialReadaheadGenerator(AsyncGenerator<T> source_generator, int max_readahead)
+ : state_(std::make_shared<State>(std::move(source_generator), max_readahead)) {}
+
+ Future<T> operator()() {
+ if (state_->first_) {
+ // Lazy generator, need to wait for the first ask to prime the pump
+ state_->first_ = false;
+ auto next = state_->source_();
+ return next.Then(Callback{state_}, ErrCallback{state_});
+ }
+
+    // This generator is not async-reentrant. We won't be called until the last
+    // future has finished, so we know there is something in the queue
+ auto finished = state_->finished_.load();
+ if (finished && state_->readahead_queue_.IsEmpty()) {
+ return AsyncGeneratorEnd<T>();
+ }
+
+ std::shared_ptr<Future<T>> next;
+ if (!state_->readahead_queue_.Read(next)) {
+ return Status::UnknownError("Could not read from readahead_queue");
+ }
+
+ auto last_available = state_->spaces_available_.fetch_add(1);
+ if (last_available == 0 && !finished) {
+ // Reader idled out, we need to restart it
+ ARROW_RETURN_NOT_OK(state_->Pump(state_));
+ }
+ return *next;
+ }
+
+ private:
+ struct State {
+ State(AsyncGenerator<T> source, int max_readahead)
+ : first_(true),
+ source_(std::move(source)),
+ finished_(false),
+ // There is one extra "space" for the in-flight request
+ spaces_available_(max_readahead + 1),
+ // The SPSC queue has size-1 "usable" slots so we need to overallocate 1
+ readahead_queue_(max_readahead + 1) {}
+
+ Status Pump(const std::shared_ptr<State>& self) {
+      // Can't do readahead_queue.Write(source().Then(...)) because the callback
+      // might run immediately and add itself to the queue before this future gets
+      // added to the queue, messing up the order.
+ auto next_slot = std::make_shared<Future<T>>();
+ auto written = readahead_queue_.Write(next_slot);
+ if (!written) {
+ return Status::UnknownError("Could not write to readahead_queue");
+ }
+ // If this Pump is being called from a callback it is possible for the source to
+ // poll and read from the queue between the Write and this spot where we fill the
+      // value in. However, it is not possible for the consumer to read the value we
+      // are writing. That is because this callback (the callback for future X) must be
+ // finished before future X is marked complete and this source is not pulled
+ // reentrantly so it will not poll for future X+1 until this callback has completed.
+ *next_slot = source_().Then(Callback{self}, ErrCallback{self});
+ return Status::OK();
+ }
+
+ // Only accessed by the consumer end
+ bool first_;
+ // Accessed by both threads
+ AsyncGenerator<T> source_;
+ std::atomic<bool> finished_;
+    // The queue has a size but it is not atomic. We keep track of how many spaces are
+    // left in the queue here so we know if we've just written the last value (and need
+    // to stop reading ahead) or if we've just read from a full queue (and need to
+    // restart reading ahead)
+ std::atomic<uint32_t> spaces_available_;
+ // Needs to be a queue of shared_ptr and not Future because we set the value of the
+ // future after we add it to the queue
+ util::SpscQueue<std::shared_ptr<Future<T>>> readahead_queue_;
+ };
+
+ struct Callback {
+ Result<T> operator()(const T& next) {
+ if (IsIterationEnd(next)) {
+ state_->finished_.store(true);
+ return next;
+ }
+ auto last_available = state_->spaces_available_.fetch_sub(1);
+ if (last_available > 1) {
+ ARROW_RETURN_NOT_OK(state_->Pump(state_));
+ }
+ return next;
+ }
+
+ std::shared_ptr<State> state_;
+ };
+
+ struct ErrCallback {
+ Result<T> operator()(const Status& st) {
+ state_->finished_.store(true);
+ return st;
+ }
+
+ std::shared_ptr<State> state_;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \see MakeFromFuture
+template <typename T>
+class FutureFirstGenerator {
+ public:
+ explicit FutureFirstGenerator(Future<AsyncGenerator<T>> future)
+ : state_(std::make_shared<State>(std::move(future))) {}
+
+ Future<T> operator()() {
+ if (state_->source_) {
+ return state_->source_();
+ } else {
+ auto state = state_;
+ return state_->future_.Then([state](const AsyncGenerator<T>& source) {
+ state->source_ = source;
+ return state->source_();
+ });
+ }
+ }
+
+ private:
+ struct State {
+ explicit State(Future<AsyncGenerator<T>> future) : future_(future), source_() {}
+
+ Future<AsyncGenerator<T>> future_;
+ AsyncGenerator<T> source_;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \brief Transforms a Future<AsyncGenerator<T>> into an AsyncGenerator<T>
+/// that waits for the future to complete as part of the first item.
+///
+/// This generator is not async-reentrant (even if the generator yielded by future is)
+///
+/// This generator does not queue
+template <typename T>
+AsyncGenerator<T> MakeFromFuture(Future<AsyncGenerator<T>> future) {
+ return FutureFirstGenerator<T>(std::move(future));
+}
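A minimal sketch, assuming the same includes as above; the generator is usable before the future resolves, and the first pull waits on it:

    arrow::Future<arrow::AsyncGenerator<int>> gen_fut =
        arrow::Future<arrow::AsyncGenerator<int>>::MakeFinished(
            arrow::MakeVectorGenerator(std::vector<int>{1, 2, 3}));
    arrow::AsyncGenerator<int> gen = arrow::MakeFromFuture(std::move(gen_fut));
    assert(gen().result().ValueOrDie() == 1);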
+
+/// \brief Creates a generator that will pull from the source into a queue. Unlike
+/// MakeReadaheadGenerator this will not pull reentrantly from the source.
+///
+/// The source generator does not need to be async-reentrant
+///
+/// This generator is not async-reentrant (even if the source is)
+///
+/// This generator may queue up to max_readahead additional instances of T
+template <typename T>
+AsyncGenerator<T> MakeSerialReadaheadGenerator(AsyncGenerator<T> source_generator,
+ int max_readahead) {
+ return SerialReadaheadGenerator<T>(std::move(source_generator), max_readahead);
+}
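A minimal sketch, assuming the same includes as above. The source here is a trivial vector generator; in practice the readahead pays off with sources whose futures finish asynchronously:

    arrow::AsyncGenerator<int> source =
        arrow::MakeVectorGenerator(std::vector<int>{1, 2, 3, 4, 5});
    arrow::AsyncGenerator<int> buffered =
        arrow::MakeSerialReadaheadGenerator(std::move(source), /*max_readahead=*/4);
    assert(buffered().result().ValueOrDie() == 1);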
+
+/// \see MakeReadaheadGenerator
+template <typename T>
+class ReadaheadGenerator {
+ public:
+ ReadaheadGenerator(AsyncGenerator<T> source_generator, int max_readahead)
+ : state_(std::make_shared<State>(std::move(source_generator), max_readahead)) {}
+
+ Future<T> AddMarkFinishedContinuation(Future<T> fut) {
+ auto state = state_;
+ return fut.Then(
+ [state](const T& result) -> Result<T> {
+ state->MarkFinishedIfDone(result);
+ return result;
+ },
+ [state](const Status& err) -> Result<T> {
+ state->finished.store(true);
+ return err;
+ });
+ }
+
+ Future<T> operator()() {
+ if (state_->readahead_queue.empty()) {
+ // This is the first request, let's pump the underlying queue
+ for (int i = 0; i < state_->max_readahead; i++) {
+ auto next = state_->source_generator();
+ auto next_after_check = AddMarkFinishedContinuation(std::move(next));
+ state_->readahead_queue.push(std::move(next_after_check));
+ }
+ }
+ // Pop one and add one
+ auto result = state_->readahead_queue.front();
+ state_->readahead_queue.pop();
+ if (state_->finished.load()) {
+ state_->readahead_queue.push(AsyncGeneratorEnd<T>());
+ } else {
+ auto back_of_queue = state_->source_generator();
+ auto back_of_queue_after_check =
+ AddMarkFinishedContinuation(std::move(back_of_queue));
+ state_->readahead_queue.push(std::move(back_of_queue_after_check));
+ }
+ return result;
+ }
+
+ private:
+ struct State {
+ State(AsyncGenerator<T> source_generator, int max_readahead)
+ : source_generator(std::move(source_generator)), max_readahead(max_readahead) {
+ finished.store(false);
+ }
+
+ void MarkFinishedIfDone(const T& next_result) {
+ if (IsIterationEnd(next_result)) {
+ finished.store(true);
+ }
+ }
+
+ AsyncGenerator<T> source_generator;
+ int max_readahead;
+ std::atomic<bool> finished;
+ std::queue<Future<T>> readahead_queue;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \brief A generator where the producer pushes items on a queue.
+///
+/// No back-pressure is applied, so this generator is mostly useful when
+/// producing the values is neither CPU- nor memory-expensive (e.g. fetching
+/// filesystem metadata).
+///
+/// This generator is not async-reentrant.
+template <typename T>
+class PushGenerator {
+ struct State {
+ util::Mutex mutex;
+ std::deque<Result<T>> result_q;
+ util::optional<Future<T>> consumer_fut;
+ bool finished = false;
+ };
+
+ public:
+ /// Producer API for PushGenerator
+ class Producer {
+ public:
+ explicit Producer(const std::shared_ptr<State>& state) : weak_state_(state) {}
+
+ /// \brief Push a value on the queue
+ ///
+ /// True is returned if the value was pushed, false if the generator is
+ /// already closed or destroyed. If the latter, it is recommended to stop
+ /// producing any further values.
+ bool Push(Result<T> result) {
+ auto state = weak_state_.lock();
+ if (!state) {
+ // Generator was destroyed
+ return false;
+ }
+ auto lock = state->mutex.Lock();
+ if (state->finished) {
+ // Closed early
+ return false;
+ }
+ if (state->consumer_fut.has_value()) {
+ auto fut = std::move(state->consumer_fut.value());
+ state->consumer_fut.reset();
+ lock.Unlock(); // unlock before potentially invoking a callback
+ fut.MarkFinished(std::move(result));
+ } else {
+ state->result_q.push_back(std::move(result));
+ }
+ return true;
+ }
+
+ /// \brief Tell the consumer we have finished producing
+ ///
+ /// It is allowed to call this and later call Push() again ("early close").
+ /// In this case, calls to Push() after the queue is closed are silently
+    /// ignored. This can help when implementing non-trivial cancellation cases.
+ ///
+ /// True is returned on success, false if the generator is already closed
+ /// or destroyed.
+ bool Close() {
+ auto state = weak_state_.lock();
+ if (!state) {
+ // Generator was destroyed
+ return false;
+ }
+ auto lock = state->mutex.Lock();
+ if (state->finished) {
+ // Already closed
+ return false;
+ }
+ state->finished = true;
+ if (state->consumer_fut.has_value()) {
+ auto fut = std::move(state->consumer_fut.value());
+ state->consumer_fut.reset();
+ lock.Unlock(); // unlock before potentially invoking a callback
+ fut.MarkFinished(IterationTraits<T>::End());
+ }
+ return true;
+ }
+
+ /// Return whether the generator was closed or destroyed.
+ bool is_closed() const {
+ auto state = weak_state_.lock();
+ if (!state) {
+ // Generator was destroyed
+ return true;
+ }
+ auto lock = state->mutex.Lock();
+ return state->finished;
+ }
+
+ private:
+ const std::weak_ptr<State> weak_state_;
+ };
+
+ PushGenerator() : state_(std::make_shared<State>()) {}
+
+ /// Read an item from the queue
+ Future<T> operator()() {
+ auto lock = state_->mutex.Lock();
+ assert(!state_->consumer_fut.has_value()); // Non-reentrant
+ if (!state_->result_q.empty()) {
+ auto fut = Future<T>::MakeFinished(std::move(state_->result_q.front()));
+ state_->result_q.pop_front();
+ return fut;
+ }
+ if (state_->finished) {
+ return AsyncGeneratorEnd<T>();
+ }
+ auto fut = Future<T>::Make();
+ state_->consumer_fut = fut;
+ return fut;
+ }
+
+ /// \brief Return producer-side interface
+ ///
+ /// The returned object must be used by the producer to push values on the queue.
+ /// Only a single Producer object should be instantiated.
+ Producer producer() { return Producer{state_}; }
+
+ private:
+ const std::shared_ptr<State> state_;
+};
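A minimal sketch, assuming the same includes as above; one side pushes, the other pulls, with no backpressure:

    arrow::PushGenerator<int> gen;
    auto producer = gen.producer();
    producer.Push(1);
    producer.Push(2);
    producer.Close();
    assert(gen().result().ValueOrDie() == 1);
    assert(gen().result().ValueOrDie() == 2);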
+
+/// \brief Creates a generator that pulls reentrantly from a source
+///
+/// This generator will pull reentrantly from the source, ensuring that max_readahead
+/// requests are active at any given time.
+///
+/// The source generator must be async-reentrant
+///
+/// This generator itself is async-reentrant.
+///
+/// This generator may queue up to max_readahead instances of T
+template <typename T>
+AsyncGenerator<T> MakeReadaheadGenerator(AsyncGenerator<T> source_generator,
+ int max_readahead) {
+ return ReadaheadGenerator<T>(std::move(source_generator), max_readahead);
+}
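A minimal sketch, assuming the same includes as above. The vector generator below is async-reentrant (see MakeVectorGenerator), as this wrapper requires; up to 8 requests stay in flight:

    arrow::AsyncGenerator<int> source =
        arrow::MakeVectorGenerator(std::vector<int>{1, 2, 3});
    arrow::AsyncGenerator<int> ahead =
        arrow::MakeReadaheadGenerator(std::move(source), /*max_readahead=*/8);
    assert(ahead().result().ValueOrDie() == 1);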
+
+/// \brief Creates a generator that will yield finished futures from a vector
+///
+/// This generator is async-reentrant
+template <typename T>
+AsyncGenerator<T> MakeVectorGenerator(std::vector<T> vec) {
+ struct State {
+ explicit State(std::vector<T> vec_) : vec(std::move(vec_)), vec_idx(0) {}
+
+ std::vector<T> vec;
+ std::atomic<std::size_t> vec_idx;
+ };
+
+ auto state = std::make_shared<State>(std::move(vec));
+ return [state]() {
+ auto idx = state->vec_idx.fetch_add(1);
+ if (idx >= state->vec.size()) {
+ // Eagerly return memory
+ state->vec.clear();
+ return AsyncGeneratorEnd<T>();
+ }
+ return Future<T>::MakeFinished(state->vec[idx]);
+ };
+}
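A minimal sketch, assuming the same includes as above. Note that the default end token for int is the default-constructed value (0) in this header's iteration traits, so 0 should not appear as a payload here:

    auto gen = arrow::MakeVectorGenerator(std::vector<int>{1, 2, 3});
    assert(gen().result().ValueOrDie() == 1);
    assert(gen().result().ValueOrDie() == 2);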
+
+/// \see MakeMergedGenerator
+template <typename T>
+class MergedGenerator {
+ public:
+ explicit MergedGenerator(AsyncGenerator<AsyncGenerator<T>> source,
+ int max_subscriptions)
+ : state_(std::make_shared<State>(std::move(source), max_subscriptions)) {}
+
+ Future<T> operator()() {
+ Future<T> waiting_future;
+ std::shared_ptr<DeliveredJob> delivered_job;
+ {
+ auto guard = state_->mutex.Lock();
+ if (!state_->delivered_jobs.empty()) {
+ delivered_job = std::move(state_->delivered_jobs.front());
+ state_->delivered_jobs.pop_front();
+ } else if (state_->finished) {
+ return IterationTraits<T>::End();
+ } else {
+ waiting_future = Future<T>::Make();
+ state_->waiting_jobs.push_back(std::make_shared<Future<T>>(waiting_future));
+ }
+ }
+ if (delivered_job) {
+ // deliverer will be invalid if outer callback encounters an error and delivers a
+ // failed result
+ if (delivered_job->deliverer) {
+ delivered_job->deliverer().AddCallback(
+ InnerCallback{state_, delivered_job->index});
+ }
+ return std::move(delivered_job->value);
+ }
+ if (state_->first) {
+ state_->first = false;
+ for (std::size_t i = 0; i < state_->active_subscriptions.size(); i++) {
+ state_->PullSource().AddCallback(OuterCallback{state_, i});
+ }
+ }
+ return waiting_future;
+ }
+
+ private:
+ struct DeliveredJob {
+ explicit DeliveredJob(AsyncGenerator<T> deliverer_, Result<T> value_,
+ std::size_t index_)
+ : deliverer(deliverer_), value(std::move(value_)), index(index_) {}
+
+ AsyncGenerator<T> deliverer;
+ Result<T> value;
+ std::size_t index;
+ };
+
+ struct State {
+ State(AsyncGenerator<AsyncGenerator<T>> source, int max_subscriptions)
+ : source(std::move(source)),
+ active_subscriptions(max_subscriptions),
+ delivered_jobs(),
+ waiting_jobs(),
+ mutex(),
+ first(true),
+ source_exhausted(false),
+ finished(false),
+ num_active_subscriptions(max_subscriptions) {}
+
+ Future<AsyncGenerator<T>> PullSource() {
+ // Need to guard access to source() so we don't pull sync-reentrantly which
+ // is never valid.
+ auto lock = mutex.Lock();
+ return source();
+ }
+
+ AsyncGenerator<AsyncGenerator<T>> source;
+ // active_subscriptions and delivered_jobs will be bounded by max_subscriptions
+ std::vector<AsyncGenerator<T>> active_subscriptions;
+ std::deque<std::shared_ptr<DeliveredJob>> delivered_jobs;
+ // waiting_jobs is unbounded, reentrant pulls (e.g. AddReadahead) will provide the
+ // backpressure
+ std::deque<std::shared_ptr<Future<T>>> waiting_jobs;
+ util::Mutex mutex;
+ bool first;
+ bool source_exhausted;
+ bool finished;
+ int num_active_subscriptions;
+ };
+
+ struct InnerCallback {
+ void operator()(const Result<T>& maybe_next) {
+ Future<T> sink;
+ bool sub_finished = maybe_next.ok() && IsIterationEnd(*maybe_next);
+ {
+ auto guard = state->mutex.Lock();
+ if (state->finished) {
+ // We've errored out so just ignore this result and don't keep pumping
+ return;
+ }
+ if (!sub_finished) {
+ if (state->waiting_jobs.empty()) {
+ state->delivered_jobs.push_back(std::make_shared<DeliveredJob>(
+ state->active_subscriptions[index], maybe_next, index));
+ } else {
+ sink = std::move(*state->waiting_jobs.front());
+ state->waiting_jobs.pop_front();
+ }
+ }
+ }
+ if (sub_finished) {
+ state->PullSource().AddCallback(OuterCallback{state, index});
+ } else if (sink.is_valid()) {
+ sink.MarkFinished(maybe_next);
+ if (maybe_next.ok()) {
+ state->active_subscriptions[index]().AddCallback(*this);
+ }
+ }
+ }
+ std::shared_ptr<State> state;
+ std::size_t index;
+ };
+
+ struct OuterCallback {
+ void operator()(const Result<AsyncGenerator<T>>& maybe_next) {
+ bool should_purge = false;
+ bool should_continue = false;
+ Future<T> error_sink;
+ {
+ auto guard = state->mutex.Lock();
+ if (!maybe_next.ok() || IsIterationEnd(*maybe_next)) {
+ state->source_exhausted = true;
+ if (!maybe_next.ok() || --state->num_active_subscriptions == 0) {
+ state->finished = true;
+ should_purge = true;
+ }
+ if (!maybe_next.ok()) {
+ if (state->waiting_jobs.empty()) {
+ state->delivered_jobs.push_back(std::make_shared<DeliveredJob>(
+ AsyncGenerator<T>(), maybe_next.status(), index));
+ } else {
+ error_sink = std::move(*state->waiting_jobs.front());
+ state->waiting_jobs.pop_front();
+ }
+ }
+ } else {
+ state->active_subscriptions[index] = *maybe_next;
+ should_continue = true;
+ }
+ }
+ if (error_sink.is_valid()) {
+ error_sink.MarkFinished(maybe_next.status());
+ }
+ if (should_continue) {
+ (*maybe_next)().AddCallback(InnerCallback{state, index});
+ } else if (should_purge) {
+ // At this point state->finished has been marked true so no one else
+ // will be interacting with waiting_jobs and we can iterate outside lock
+ while (!state->waiting_jobs.empty()) {
+ state->waiting_jobs.front()->MarkFinished(IterationTraits<T>::End());
+ state->waiting_jobs.pop_front();
+ }
+ }
+ }
+ std::shared_ptr<State> state;
+ std::size_t index;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// \brief Creates a generator that takes in a stream of generators and pulls from up to
+/// max_subscriptions at a time
+///
+/// Note: This may deliver items out of sequence. For example, items from the third
+/// AsyncGenerator generated by the source may be emitted before some items from the first
+/// AsyncGenerator generated by the source.
+///
+/// This generator will pull from source async-reentrantly unless max_subscriptions is 1
+/// This generator will not pull from the individual subscriptions reentrantly. Add
+/// readahead to the individual subscriptions if that is desired.
+/// This generator is async-reentrant
+///
+/// This generator may queue up to max_subscriptions instances of T
+template <typename T>
+AsyncGenerator<T> MakeMergedGenerator(AsyncGenerator<AsyncGenerator<T>> source,
+ int max_subscriptions) {
+ return MergedGenerator<T>(std::move(source), max_subscriptions);
+}
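A minimal sketch, assuming the same includes as above; items from the two inner generators may interleave in any order:

    std::vector<arrow::AsyncGenerator<int>> inner;
    inner.push_back(arrow::MakeVectorGenerator(std::vector<int>{1, 2}));
    inner.push_back(arrow::MakeVectorGenerator(std::vector<int>{3, 4}));
    auto merged = arrow::MakeMergedGenerator(
        arrow::MakeVectorGenerator(std::move(inner)), /*max_subscriptions=*/2);
    // merged() yields 1..4, possibly interleaved across the two inner streams.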
+
+/// \brief Creates a generator that takes in a stream of generators and pulls from each
+/// one in sequence.
+///
+/// This generator is async-reentrant but will never pull from source reentrantly and
+/// will never pull from any subscription reentrantly.
+///
+/// This generator may queue 1 instance of T
+///
+/// TODO: Could potentially make a bespoke implementation instead of MergedGenerator that
+/// forwards async-reentrant requests instead of buffering them (which is what
+/// MergedGenerator does)
+template <typename T>
+AsyncGenerator<T> MakeConcatenatedGenerator(AsyncGenerator<AsyncGenerator<T>> source) {
+ return MergedGenerator<T>(std::move(source), 1);
+}
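The same shape as the merged sketch above, but strictly in order (1, 2, 3, 4):

    std::vector<arrow::AsyncGenerator<int>> parts;
    parts.push_back(arrow::MakeVectorGenerator(std::vector<int>{1, 2}));
    parts.push_back(arrow::MakeVectorGenerator(std::vector<int>{3, 4}));
    auto concatenated = arrow::MakeConcatenatedGenerator(
        arrow::MakeVectorGenerator(std::move(parts)));
    assert(concatenated().result().ValueOrDie() == 1);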
+
+template <typename T>
+struct Enumerated {
+ T value;
+ int index;
+ bool last;
+};
+
+template <typename T>
+struct IterationTraits<Enumerated<T>> {
+ static Enumerated<T> End() { return Enumerated<T>{IterationEnd<T>(), -1, false}; }
+ static bool IsEnd(const Enumerated<T>& val) { return val.index < 0; }
+};
+
+/// \see MakeEnumeratedGenerator
+template <typename T>
+class EnumeratingGenerator {
+ public:
+ EnumeratingGenerator(AsyncGenerator<T> source, T initial_value)
+ : state_(std::make_shared<State>(std::move(source), std::move(initial_value))) {}
+
+ Future<Enumerated<T>> operator()() {
+ if (state_->finished) {
+ return AsyncGeneratorEnd<Enumerated<T>>();
+ } else {
+ auto state = state_;
+ return state->source().Then([state](const T& next) {
+ auto finished = IsIterationEnd<T>(next);
+ auto prev = Enumerated<T>{state->prev_value, state->prev_index, finished};
+ state->prev_value = next;
+ state->prev_index++;
+ state->finished = finished;
+ return prev;
+ });
+ }
+ }
+
+ private:
+ struct State {
+ State(AsyncGenerator<T> source, T initial_value)
+ : source(std::move(source)), prev_value(std::move(initial_value)), prev_index(0) {
+ finished = IsIterationEnd<T>(prev_value);
+ }
+
+ AsyncGenerator<T> source;
+ T prev_value;
+ int prev_index;
+ bool finished;
+ };
+
+ std::shared_ptr<State> state_;
+};
+
+/// Wraps items from a source generator with positional information
+///
+/// When used with MakeMergedGenerator and MakeSequencingGenerator this allows items to be
+/// processed in a "first-available" fashion and later resequenced which can reduce the
+/// impact of sources with erratic performance (e.g. a filesystem where some items may
+/// take longer to read than others).
+///
+/// TODO(ARROW-12371) Would require this generator to be async-reentrant
+///
+/// \see MakeSequencingGenerator for an example of putting items back in order
+///
+/// This generator is not async-reentrant
+///
+/// This generator buffers one item (so it knows which item is the last item)
+template <typename T>
+AsyncGenerator<Enumerated<T>> MakeEnumeratedGenerator(AsyncGenerator<T> source) {
+ return FutureFirstGenerator<Enumerated<T>>(
+ source().Then([source](const T& initial_value) -> AsyncGenerator<Enumerated<T>> {
+ return EnumeratingGenerator<T>(std::move(source), initial_value);
+ }));
+}
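A minimal sketch, assuming the same includes as above; each item is tagged with its position and a last flag:

    auto numbered = arrow::MakeEnumeratedGenerator(
        arrow::MakeVectorGenerator(std::vector<int>{7, 8, 9}));
    arrow::Enumerated<int> first = numbered().result().ValueOrDie();
    assert(first.value == 7 && first.index == 0 && !first.last);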
+
+/// \see MakeTransferredGenerator
+template <typename T>
+class TransferringGenerator {
+ public:
+ explicit TransferringGenerator(AsyncGenerator<T> source, internal::Executor* executor)
+ : source_(std::move(source)), executor_(executor) {}
+
+ Future<T> operator()() { return executor_->Transfer(source_()); }
+
+ private:
+ AsyncGenerator<T> source_;
+ internal::Executor* executor_;
+};
+
+/// \brief Transfers a future to an underlying executor.
+///
+/// Continuations run on the returned future will be run on the given executor
+/// if they cannot be run synchronously.
+///
+/// This is often needed to move computation off I/O threads or other external
+/// completion sources and back onto the CPU executor so the I/O thread can
+/// stay busy and focused on I/O
+///
+/// Keep in mind that continuations called on an already completed future will
+/// always be run synchronously and so no transfer will happen in that case.
+///
+/// This generator is async-reentrant if the source is
+///
+/// This generator will not queue
+template <typename T>
+AsyncGenerator<T> MakeTransferredGenerator(AsyncGenerator<T> source,
+ internal::Executor* executor) {
+ return TransferringGenerator<T>(std::move(source), executor);
+}
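A minimal sketch, assuming the same includes as above plus arrow/util/thread_pool.h (an assumption; ThreadPool::Make is used the same way later in this file):

    auto cpu_pool = arrow::internal::ThreadPool::Make(/*threads=*/4).ValueOrDie();
    auto transferred = arrow::MakeTransferredGenerator(
        arrow::MakeVectorGenerator(std::vector<int>{1, 2, 3}), cpu_pool.get());
    // Continuations added to transferred()'s futures now run on cpu_pool
    // (unless they can run synchronously).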
+
+/// \see MakeBackgroundGenerator
+template <typename T>
+class BackgroundGenerator {
+ public:
+ explicit BackgroundGenerator(Iterator<T> it, internal::Executor* io_executor, int max_q,
+ int q_restart)
+ : state_(std::make_shared<State>(io_executor, std::move(it), max_q, q_restart)),
+ cleanup_(std::make_shared<Cleanup>(state_.get())) {}
+
+ Future<T> operator()() {
+ auto guard = state_->mutex.Lock();
+ Future<T> waiting_future;
+ if (state_->queue.empty()) {
+ if (state_->finished) {
+ return AsyncGeneratorEnd<T>();
+ } else {
+ waiting_future = Future<T>::Make();
+ state_->waiting_future = waiting_future;
+ }
+ } else {
+ auto next = Future<T>::MakeFinished(std::move(state_->queue.front()));
+ state_->queue.pop();
+ if (state_->NeedsRestart()) {
+ return state_->RestartTask(state_, std::move(guard), std::move(next));
+ }
+ return next;
+ }
+ // This should only trigger the very first time this method is called
+ if (state_->NeedsRestart()) {
+ return state_->RestartTask(state_, std::move(guard), std::move(waiting_future));
+ }
+ return waiting_future;
+ }
+
+ protected:
+ static constexpr uint64_t kUnlikelyThreadId{std::numeric_limits<uint64_t>::max()};
+
+ struct State {
+ State(internal::Executor* io_executor, Iterator<T> it, int max_q, int q_restart)
+ : io_executor(io_executor),
+ max_q(max_q),
+ q_restart(q_restart),
+ it(std::move(it)),
+ reading(false),
+ finished(false),
+ should_shutdown(false) {}
+
+ void ClearQueue() {
+ while (!queue.empty()) {
+ queue.pop();
+ }
+ }
+
+ bool TaskIsRunning() const { return task_finished.is_valid(); }
+
+ bool NeedsRestart() const {
+ return !finished && !reading && static_cast<int>(queue.size()) <= q_restart;
+ }
+
+ void DoRestartTask(std::shared_ptr<State> state, util::Mutex::Guard guard) {
+ // If we get here we are actually going to start a new task so let's create a
+ // task_finished future for it
+ state->task_finished = Future<>::Make();
+ state->reading = true;
+ auto spawn_status = io_executor->Spawn(
+ [state]() { BackgroundGenerator::WorkerTask(std::move(state)); });
+ if (!spawn_status.ok()) {
+ // If we can't spawn a new task then send an error to the consumer (either via a
+ // waiting future or the queue) and mark ourselves finished
+ state->finished = true;
+ state->task_finished = Future<>();
+ if (waiting_future.has_value()) {
+ auto to_deliver = std::move(waiting_future.value());
+ waiting_future.reset();
+ guard.Unlock();
+ to_deliver.MarkFinished(spawn_status);
+ } else {
+ ClearQueue();
+ queue.push(spawn_status);
+ }
+ }
+ }
+
+ Future<T> RestartTask(std::shared_ptr<State> state, util::Mutex::Guard guard,
+ Future<T> next) {
+ if (TaskIsRunning()) {
+ // If the task is still cleaning up we need to wait for it to finish before
+ // restarting. We also want to block the consumer until we've restarted the
+ // reader to avoid multiple restarts
+ return task_finished.Then([state, next]() {
+ // This may appear dangerous (recursive mutex) but we should be guaranteed the
+ // outer guard has been released by this point. We know...
+ // * task_finished is not already finished (it would be invalid in that case)
+ // * task_finished will not be marked complete until we've given up the mutex
+ auto guard_ = state->mutex.Lock();
+ state->DoRestartTask(state, std::move(guard_));
+ return next;
+ });
+ }
+ // Otherwise we can restart immediately
+ DoRestartTask(std::move(state), std::move(guard));
+ return next;
+ }
+
+ internal::Executor* io_executor;
+ const int max_q;
+ const int q_restart;
+ Iterator<T> it;
+ std::atomic<uint64_t> worker_thread_id{kUnlikelyThreadId};
+
+ // If true, the task is actively pumping items from the queue and does not need a
+ // restart
+ bool reading;
+ // Set to true when a terminal item arrives
+ bool finished;
+ // Signal to the background task to end early because consumers have given up on it
+ bool should_shutdown;
+ // If the queue is empty, the consumer will create a waiting future and wait for it
+ std::queue<Result<T>> queue;
+ util::optional<Future<T>> waiting_future;
+ // Every background task is given a future to complete when it is entirely finished
+ // processing and ready for the next task to start or for State to be destroyed
+ Future<> task_finished;
+ util::Mutex mutex;
+ };
+
+ // Cleanup task that will be run when all consumer references to the generator are lost
+ struct Cleanup {
+ explicit Cleanup(State* state) : state(state) {}
+ ~Cleanup() {
+    /// TODO: Once ARROW-13109 is available we can force consumers to spawn and
+    /// there will be no need to perform this check.
+    ///
+    /// It's a deadlock if we enter cleanup from the worker thread, but that can
+    /// happen if the consumer doesn't transfer away
+ assert(state->worker_thread_id.load() != ::arrow::internal::GetThreadId());
+ Future<> finish_fut;
+ {
+ auto lock = state->mutex.Lock();
+ if (!state->TaskIsRunning()) {
+ return;
+ }
+ // Signal the current task to stop and wait for it to finish
+ state->should_shutdown = true;
+ finish_fut = state->task_finished;
+ }
+ // Using future as a condition variable here
+ Status st = finish_fut.status();
+ ARROW_UNUSED(st);
+ }
+ State* state;
+ };
+
+ static void WorkerTask(std::shared_ptr<State> state) {
+ state->worker_thread_id.store(::arrow::internal::GetThreadId());
+ // We need to capture the state to read while outside the mutex
+ bool reading = true;
+ while (reading) {
+ auto next = state->it.Next();
+ // Need to capture state->waiting_future inside the mutex to mark finished outside
+ Future<T> waiting_future;
+ {
+ auto guard = state->mutex.Lock();
+
+ if (state->should_shutdown) {
+ state->finished = true;
+ break;
+ }
+
+ if (!next.ok() || IsIterationEnd<T>(*next)) {
+          // Terminal item. Set finished to true, send this last item, and quit
+ state->finished = true;
+ if (!next.ok()) {
+ state->ClearQueue();
+ }
+ }
+ // At this point we are going to send an item. Either we will add it to the
+ // queue or deliver it to a waiting future.
+ if (state->waiting_future.has_value()) {
+ waiting_future = std::move(state->waiting_future.value());
+ state->waiting_future.reset();
+ } else {
+ state->queue.push(std::move(next));
+ // We just filled up the queue so it is time to quit. We may need to notify
+ // a cleanup task so we transition to Quitting
+ if (static_cast<int>(state->queue.size()) >= state->max_q) {
+ state->reading = false;
+ }
+ }
+ reading = state->reading && !state->finished;
+ }
+ // This should happen outside the mutex. Presumably there is a
+ // transferring generator on the other end that will quickly transfer any
+ // callbacks off of this thread so we can continue looping. Still, best not to
+ // rely on that
+ if (waiting_future.is_valid()) {
+ waiting_future.MarkFinished(next);
+ }
+ }
+ // Once we've sent our last item we can notify any waiters that we are done and so
+ // either state can be cleaned up or a new background task can be started
+ Future<> task_finished;
+ {
+ auto guard = state->mutex.Lock();
+ // After we give up the mutex state can be safely deleted. We will no longer
+ // reference it. We can safely transition to idle now.
+ task_finished = state->task_finished;
+ state->task_finished = Future<>();
+ state->worker_thread_id.store(kUnlikelyThreadId);
+ }
+ task_finished.MarkFinished();
+ }
+
+ std::shared_ptr<State> state_;
+ // state_ is held by both the generator and the background thread so it won't be cleaned
+ // up when all consumer references are relinquished. cleanup_ is only held by the
+ // generator so it will be destructed when the last consumer reference is gone. We use
+ // this to cleanup / stop the background generator in case the consuming end stops
+ // listening (e.g. due to a downstream error)
+ std::shared_ptr<Cleanup> cleanup_;
+};
+
+constexpr int kDefaultBackgroundMaxQ = 32;
+constexpr int kDefaultBackgroundQRestart = 16;
+
+/// \brief Creates an AsyncGenerator<T> by iterating over an Iterator<T> on a background
+/// thread
+///
+/// The parameters max_q and q_restart control queue size and background thread task
+/// management. If the background task is fast you typically don't want it creating a
+/// thread task for every item. Instead the background thread will run until it fills
+/// up a readahead queue.
+///
+/// Once the queue has filled up the background thread task will terminate (allowing other
+/// I/O tasks to use the thread). Once the queue has been drained enough (specified by
+/// q_restart) then the background thread task will be restarted. If q_restart is too low
+/// then you may exhaust the queue waiting for the background thread task to start running
+/// again. If it is too high then it will be constantly stopping and restarting the
+/// background queue task.
+///
+/// The "background thread" is a logical thread and will run as tasks on the io_executor.
+/// This thread may stop and start when the queue fills up but there will only be one
+/// active background thread task at any given time. You MUST transfer away from this
+/// background generator. Otherwise there could be a race condition if a callback on the
+/// background thread deletes the last consumer reference to the background generator. You
+/// can transfer onto the same executor as the background thread; it is only necessary to
+/// create a new thread task, not to switch executors.
+///
+/// This generator is not async-reentrant
+///
+/// This generator will queue up to max_q blocks
+template <typename T>
+static Result<AsyncGenerator<T>> MakeBackgroundGenerator(
+ Iterator<T> iterator, internal::Executor* io_executor,
+ int max_q = kDefaultBackgroundMaxQ, int q_restart = kDefaultBackgroundQRestart) {
+ if (max_q < q_restart) {
+ return Status::Invalid("max_q must be >= q_restart");
+ }
+ return BackgroundGenerator<T>(std::move(iterator), io_executor, max_q, q_restart);
+}
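A minimal sketch, assuming the same includes as above plus arrow/util/iterator.h for MakeVectorIterator and arrow/util/thread_pool.h (both assumptions; neither is shown in this diff). Per the warning above, transfer away before attaching continuations, and keep the pool alive as long as the generator is used:

    auto io_pool = arrow::internal::ThreadPool::Make(/*threads=*/1).ValueOrDie();
    auto bg = arrow::MakeBackgroundGenerator(
                  arrow::MakeVectorIterator(std::vector<int>{1, 2, 3}),
                  io_pool.get())
                  .ValueOrDie();
    // e.g.: bg = arrow::MakeTransferredGenerator(std::move(bg), cpu_executor);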
+
+/// \see MakeGeneratorIterator
+template <typename T>
+class GeneratorIterator {
+ public:
+ explicit GeneratorIterator(AsyncGenerator<T> source) : source_(std::move(source)) {}
+
+ Result<T> Next() { return source_().result(); }
+
+ private:
+ AsyncGenerator<T> source_;
+};
+
+/// \brief Converts an AsyncGenerator<T> to an Iterator<T> by blocking until each future
+/// is finished
+template <typename T>
+Iterator<T> MakeGeneratorIterator(AsyncGenerator<T> source) {
+ return Iterator<T>(GeneratorIterator<T>(std::move(source)));
+}
+
+/// \brief Adds readahead to an iterator using a background thread.
+///
+/// Under the hood this is converting the iterator to a generator using
+/// MakeBackgroundGenerator, adding readahead to the converted generator with
+/// MakeReadaheadGenerator, and then converting back to an iterator using
+/// MakeGeneratorIterator.
+template <typename T>
+Result<Iterator<T>> MakeReadaheadIterator(Iterator<T> it, int readahead_queue_size) {
+ ARROW_ASSIGN_OR_RAISE(auto io_executor, internal::ThreadPool::Make(1));
+ auto max_q = readahead_queue_size;
+ auto q_restart = std::max(1, max_q / 2);
+ ARROW_ASSIGN_OR_RAISE(
+ auto background_generator,
+ MakeBackgroundGenerator(std::move(it), io_executor.get(), max_q, q_restart));
+ // Capture io_executor to keep it alive as long as owned_bg_generator is still
+ // referenced
+ AsyncGenerator<T> owned_bg_generator = [io_executor, background_generator]() {
+ return background_generator();
+ };
+ return MakeGeneratorIterator(std::move(owned_bg_generator));
+}
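A minimal sketch, assuming the same includes as above plus arrow/util/iterator.h for MakeVectorIterator (an assumption):

    auto it = arrow::MakeReadaheadIterator(
                  arrow::MakeVectorIterator(std::vector<int>{1, 2, 3}),
                  /*readahead_queue_size=*/4)
                  .ValueOrDie();
    assert(it.Next().ValueOrDie() == 1);  // blocks until the background thread delivers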
+
+/// \brief Make a generator that returns a single pre-generated future
+///
+/// This generator is async-reentrant.
+template <typename T>
+std::function<Future<T>()> MakeSingleFutureGenerator(Future<T> future) {
+ assert(future.is_valid());
+ auto state = std::make_shared<Future<T>>(std::move(future));
+ return [state]() -> Future<T> {
+ auto fut = std::move(*state);
+ if (fut.is_valid()) {
+ return fut;
+ } else {
+ return AsyncGeneratorEnd<T>();
+ }
+ };
+}
+
+/// \brief Make a generator that immediately ends.
+///
+/// This generator is async-reentrant.
+template <typename T>
+std::function<Future<T>()> MakeEmptyGenerator() {
+ return []() -> Future<T> { return AsyncGeneratorEnd<T>(); };
+}
+
+/// \brief Make a generator that always fails with a given error
+///
+/// This generator is async-reentrant.
+template <typename T>
+AsyncGenerator<T> MakeFailingGenerator(Status st) {
+ assert(!st.ok());
+ auto state = std::make_shared<Status>(std::move(st));
+ return [state]() -> Future<T> {
+ auto st = std::move(*state);
+ if (!st.ok()) {
+ return std::move(st);
+ } else {
+ return AsyncGeneratorEnd<T>();
+ }
+ };
+}
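A minimal sketch, assuming the same includes as above; the first pull yields the error, after which the generator ends:

    auto failing = arrow::MakeFailingGenerator<int>(arrow::Status::Invalid("boom"));
    assert(failing().result().status().IsInvalid());
    assert(arrow::IsIterationEnd(failing().result().ValueOrDie()));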
+
+/// \brief Make a generator that always fails with a given error
+///
+/// This overload allows inferring the return type from the argument.
+template <typename T>
+AsyncGenerator<T> MakeFailingGenerator(const Result<T>& result) {
+ return MakeFailingGenerator<T>(result.status());
+}
+
+/// \brief Prepends initial_values onto a generator
+///
+/// This generator is async-reentrant but will buffer requests and will not
+/// pull from following_values async-reentrantly.
+template <typename T>
+AsyncGenerator<T> MakeGeneratorStartsWith(std::vector<T> initial_values,
+ AsyncGenerator<T> following_values) {
+ auto initial_values_vec_gen = MakeVectorGenerator(std::move(initial_values));
+ auto gen_gen = MakeVectorGenerator<AsyncGenerator<T>>(
+ {std::move(initial_values_vec_gen), std::move(following_values)});
+ return MakeConcatenatedGenerator(std::move(gen_gen));
+}
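A minimal sketch, assuming the same includes as above; 1 and 2 are replayed before the tail stream:

    auto tail = arrow::MakeVectorGenerator(std::vector<int>{3, 4});
    auto gen =
        arrow::MakeGeneratorStartsWith(std::vector<int>{1, 2}, std::move(tail));
    assert(gen().result().ValueOrDie() == 1);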
+
+template <typename T>
+struct CancellableGenerator {
+ Future<T> operator()() {
+ if (stop_token.IsStopRequested()) {
+ return stop_token.Poll();
+ }
+ return source();
+ }
+
+ AsyncGenerator<T> source;
+ StopToken stop_token;
+};
+
+/// \brief Allows an async generator to be cancelled
+///
+/// This generator is async-reentrant
+template <typename T>
+AsyncGenerator<T> MakeCancellable(AsyncGenerator<T> source, StopToken stop_token) {
+ return CancellableGenerator<T>{std::move(source), std::move(stop_token)};
+}
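A minimal sketch, assuming the same includes as above plus arrow/util/cancel.h for StopSource/StopToken (an assumption; they are not shown in this diff):

    arrow::StopSource stop_source;
    auto gen = arrow::MakeCancellable(
        arrow::MakeVectorGenerator(std::vector<int>{1, 2, 3}), stop_source.token());
    assert(gen().result().ValueOrDie() == 1);
    stop_source.RequestStop();
    assert(gen().result().status().IsCancelled());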
+
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc
index d6640775c4f..56809f28165 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.cc
@@ -28,7 +28,7 @@
#include <string>
#include "arrow/util/bit_util.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/int128_internal.h"
#include "arrow/util/int_util_internal.h"
#include "arrow/util/logging.h"
@@ -121,223 +121,223 @@ static const BasicDecimal128 ScaleMultipliersHalf[] = {
BasicDecimal128(271050543121376108LL, 9257742014424809472ULL),
BasicDecimal128(2710505431213761085LL, 343699775700336640ULL)};
-static const BasicDecimal256 ScaleMultipliersDecimal256[] = {
- BasicDecimal256({1ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({100ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({1000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({100000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({1000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({100000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({1000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({100000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({1000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({100000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({1000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({100000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({1000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({10000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({7766279631452241920ULL, 5ULL, 0ULL, 0ULL}),
- BasicDecimal256({3875820019684212736ULL, 54ULL, 0ULL, 0ULL}),
- BasicDecimal256({1864712049423024128ULL, 542ULL, 0ULL, 0ULL}),
- BasicDecimal256({200376420520689664ULL, 5421ULL, 0ULL, 0ULL}),
- BasicDecimal256({2003764205206896640ULL, 54210ULL, 0ULL, 0ULL}),
- BasicDecimal256({1590897978359414784ULL, 542101ULL, 0ULL, 0ULL}),
- BasicDecimal256({15908979783594147840ULL, 5421010ULL, 0ULL, 0ULL}),
- BasicDecimal256({11515845246265065472ULL, 54210108ULL, 0ULL, 0ULL}),
- BasicDecimal256({4477988020393345024ULL, 542101086ULL, 0ULL, 0ULL}),
- BasicDecimal256({7886392056514347008ULL, 5421010862ULL, 0ULL, 0ULL}),
- BasicDecimal256({5076944270305263616ULL, 54210108624ULL, 0ULL, 0ULL}),
- BasicDecimal256({13875954555633532928ULL, 542101086242ULL, 0ULL, 0ULL}),
- BasicDecimal256({9632337040368467968ULL, 5421010862427ULL, 0ULL, 0ULL}),
- BasicDecimal256({4089650035136921600ULL, 54210108624275ULL, 0ULL, 0ULL}),
- BasicDecimal256({4003012203950112768ULL, 542101086242752ULL, 0ULL, 0ULL}),
- BasicDecimal256({3136633892082024448ULL, 5421010862427522ULL, 0ULL, 0ULL}),
- BasicDecimal256({12919594847110692864ULL, 54210108624275221ULL, 0ULL, 0ULL}),
- BasicDecimal256({68739955140067328ULL, 542101086242752217ULL, 0ULL, 0ULL}),
- BasicDecimal256({687399551400673280ULL, 5421010862427522170ULL, 0ULL, 0ULL}),
- BasicDecimal256({6873995514006732800ULL, 17316620476856118468ULL, 2ULL, 0ULL}),
- BasicDecimal256({13399722918938673152ULL, 7145508105175220139ULL, 29ULL, 0ULL}),
- BasicDecimal256({4870020673419870208ULL, 16114848830623546549ULL, 293ULL, 0ULL}),
- BasicDecimal256({11806718586779598848ULL, 13574535716559052564ULL, 2938ULL, 0ULL}),
- BasicDecimal256({7386721425538678784ULL, 6618148649623664334ULL, 29387ULL, 0ULL}),
- BasicDecimal256({80237960548581376ULL, 10841254275107988496ULL, 293873ULL, 0ULL}),
- BasicDecimal256({802379605485813760ULL, 16178822382532126880ULL, 2938735ULL, 0ULL}),
- BasicDecimal256({8023796054858137600ULL, 14214271235644855872ULL, 29387358ULL, 0ULL}),
- BasicDecimal256(
- {6450984253743169536ULL, 13015503840481697412ULL, 293873587ULL, 0ULL}),
- BasicDecimal256(
- {9169610316303040512ULL, 1027829888850112811ULL, 2938735877ULL, 0ULL}),
- BasicDecimal256(
- {17909126868192198656ULL, 10278298888501128114ULL, 29387358770ULL, 0ULL}),
- BasicDecimal256(
- {13070572018536022016ULL, 10549268516463523069ULL, 293873587705ULL, 0ULL}),
- BasicDecimal256(
- {1578511669393358848ULL, 13258964796087472617ULL, 2938735877055ULL, 0ULL}),
- BasicDecimal256(
- {15785116693933588480ULL, 3462439444907864858ULL, 29387358770557ULL, 0ULL}),
- BasicDecimal256(
- {10277214349659471872ULL, 16177650375369096972ULL, 293873587705571ULL, 0ULL}),
- BasicDecimal256(
- {10538423128046960640ULL, 14202551164014556797ULL, 2938735877055718ULL, 0ULL}),
- BasicDecimal256(
- {13150510911921848320ULL, 12898303124178706663ULL, 29387358770557187ULL, 0ULL}),
- BasicDecimal256(
- {2377900603251621888ULL, 18302566799529756941ULL, 293873587705571876ULL, 0ULL}),
- BasicDecimal256(
- {5332261958806667264ULL, 17004971331911604867ULL, 2938735877055718769ULL, 0ULL}),
- BasicDecimal256(
- {16429131440647569408ULL, 4029016655730084128ULL, 10940614696847636083ULL, 1ULL}),
- BasicDecimal256({16717361816799281152ULL, 3396678409881738056ULL,
- 17172426599928602752ULL, 15ULL}),
- BasicDecimal256({1152921504606846976ULL, 15520040025107828953ULL,
- 5703569335900062977ULL, 159ULL}),
- BasicDecimal256({11529215046068469760ULL, 7626447661401876602ULL,
- 1695461137871974930ULL, 1593ULL}),
- BasicDecimal256({4611686018427387904ULL, 2477500319180559562ULL,
- 16954611378719749304ULL, 15930ULL}),
- BasicDecimal256({9223372036854775808ULL, 6328259118096044006ULL,
- 3525417123811528497ULL, 159309ULL}),
- BasicDecimal256({0ULL, 7942358959831785217ULL, 16807427164405733357ULL, 1593091ULL}),
- BasicDecimal256({0ULL, 5636613303479645706ULL, 2053574980671369030ULL, 15930919ULL}),
- BasicDecimal256({0ULL, 1025900813667802212ULL, 2089005733004138687ULL, 159309191ULL}),
- BasicDecimal256(
- {0ULL, 10259008136678022120ULL, 2443313256331835254ULL, 1593091911ULL}),
- BasicDecimal256(
- {0ULL, 10356360998232463120ULL, 5986388489608800929ULL, 15930919111ULL}),
- BasicDecimal256(
- {0ULL, 11329889613776873120ULL, 4523652674959354447ULL, 159309191113ULL}),
- BasicDecimal256(
- {0ULL, 2618431695511421504ULL, 8343038602174441244ULL, 1593091911132ULL}),
- BasicDecimal256(
- {0ULL, 7737572881404663424ULL, 9643409726906205977ULL, 15930919111324ULL}),
- BasicDecimal256(
- {0ULL, 3588752519208427776ULL, 4200376900514301694ULL, 159309191113245ULL}),
- BasicDecimal256(
- {0ULL, 17440781118374726144ULL, 5110280857723913709ULL, 1593091911132452ULL}),
- BasicDecimal256(
- {0ULL, 8387114520361296896ULL, 14209320429820033867ULL, 15930919111324522ULL}),
- BasicDecimal256(
- {0ULL, 10084168908774762496ULL, 12965995782233477362ULL, 159309191113245227ULL}),
- BasicDecimal256(
- {0ULL, 8607968719199866880ULL, 532749306367912313ULL, 1593091911132452277ULL})};
-
-static const BasicDecimal256 ScaleMultipliersHalfDecimal256[] = {
- BasicDecimal256({0ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({50ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({500ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({50000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({500000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({50000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({500000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({50000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({500000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({50000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({500000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({50000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({500000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({5000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
- BasicDecimal256({13106511852580896768ULL, 2ULL, 0ULL, 0ULL}),
- BasicDecimal256({1937910009842106368ULL, 27ULL, 0ULL, 0ULL}),
- BasicDecimal256({932356024711512064ULL, 271ULL, 0ULL, 0ULL}),
- BasicDecimal256({9323560247115120640ULL, 2710ULL, 0ULL, 0ULL}),
- BasicDecimal256({1001882102603448320ULL, 27105ULL, 0ULL, 0ULL}),
- BasicDecimal256({10018821026034483200ULL, 271050ULL, 0ULL, 0ULL}),
- BasicDecimal256({7954489891797073920ULL, 2710505ULL, 0ULL, 0ULL}),
- BasicDecimal256({5757922623132532736ULL, 27105054ULL, 0ULL, 0ULL}),
- BasicDecimal256({2238994010196672512ULL, 271050543ULL, 0ULL, 0ULL}),
- BasicDecimal256({3943196028257173504ULL, 2710505431ULL, 0ULL, 0ULL}),
- BasicDecimal256({2538472135152631808ULL, 27105054312ULL, 0ULL, 0ULL}),
- BasicDecimal256({6937977277816766464ULL, 271050543121ULL, 0ULL, 0ULL}),
- BasicDecimal256({14039540557039009792ULL, 2710505431213ULL, 0ULL, 0ULL}),
- BasicDecimal256({11268197054423236608ULL, 27105054312137ULL, 0ULL, 0ULL}),
- BasicDecimal256({2001506101975056384ULL, 271050543121376ULL, 0ULL, 0ULL}),
- BasicDecimal256({1568316946041012224ULL, 2710505431213761ULL, 0ULL, 0ULL}),
- BasicDecimal256({15683169460410122240ULL, 27105054312137610ULL, 0ULL, 0ULL}),
- BasicDecimal256({9257742014424809472ULL, 271050543121376108ULL, 0ULL, 0ULL}),
- BasicDecimal256({343699775700336640ULL, 2710505431213761085ULL, 0ULL, 0ULL}),
- BasicDecimal256({3436997757003366400ULL, 8658310238428059234ULL, 1ULL, 0ULL}),
- BasicDecimal256({15923233496324112384ULL, 12796126089442385877ULL, 14ULL, 0ULL}),
- BasicDecimal256({11658382373564710912ULL, 17280796452166549082ULL, 146ULL, 0ULL}),
- BasicDecimal256({5903359293389799424ULL, 6787267858279526282ULL, 1469ULL, 0ULL}),
- BasicDecimal256({3693360712769339392ULL, 12532446361666607975ULL, 14693ULL, 0ULL}),
- BasicDecimal256({40118980274290688ULL, 14643999174408770056ULL, 146936ULL, 0ULL}),
- BasicDecimal256({401189802742906880ULL, 17312783228120839248ULL, 1469367ULL, 0ULL}),
- BasicDecimal256({4011898027429068800ULL, 7107135617822427936ULL, 14693679ULL, 0ULL}),
- BasicDecimal256(
- {3225492126871584768ULL, 15731123957095624514ULL, 146936793ULL, 0ULL}),
- BasicDecimal256(
- {13808177195006296064ULL, 9737286981279832213ULL, 1469367938ULL, 0ULL}),
- BasicDecimal256(
- {8954563434096099328ULL, 5139149444250564057ULL, 14693679385ULL, 0ULL}),
- BasicDecimal256(
- {15758658046122786816ULL, 14498006295086537342ULL, 146936793852ULL, 0ULL}),
- BasicDecimal256(
- {10012627871551455232ULL, 15852854434898512116ULL, 1469367938527ULL, 0ULL}),
- BasicDecimal256(
- {7892558346966794240ULL, 10954591759308708237ULL, 14693679385278ULL, 0ULL}),
- BasicDecimal256(
- {5138607174829735936ULL, 17312197224539324294ULL, 146936793852785ULL, 0ULL}),
- BasicDecimal256(
- {14492583600878256128ULL, 7101275582007278398ULL, 1469367938527859ULL, 0ULL}),
- BasicDecimal256(
- {15798627492815699968ULL, 15672523598944129139ULL, 14693679385278593ULL, 0ULL}),
- BasicDecimal256(
- {10412322338480586752ULL, 9151283399764878470ULL, 146936793852785938ULL, 0ULL}),
- BasicDecimal256(
- {11889503016258109440ULL, 17725857702810578241ULL, 1469367938527859384ULL, 0ULL}),
- BasicDecimal256(
- {8214565720323784704ULL, 11237880364719817872ULL, 14693679385278593849ULL, 0ULL}),
- BasicDecimal256(
- {8358680908399640576ULL, 1698339204940869028ULL, 17809585336819077184ULL, 7ULL}),
- BasicDecimal256({9799832789158199296ULL, 16983392049408690284ULL,
- 12075156704804807296ULL, 79ULL}),
- BasicDecimal256({5764607523034234880ULL, 3813223830700938301ULL,
- 10071102605790763273ULL, 796ULL}),
- BasicDecimal256({2305843009213693952ULL, 1238750159590279781ULL,
- 8477305689359874652ULL, 7965ULL}),
- BasicDecimal256({4611686018427387904ULL, 12387501595902797811ULL,
- 10986080598760540056ULL, 79654ULL}),
- BasicDecimal256({9223372036854775808ULL, 13194551516770668416ULL,
- 17627085619057642486ULL, 796545ULL}),
- BasicDecimal256({0ULL, 2818306651739822853ULL, 10250159527190460323ULL, 7965459ULL}),
- BasicDecimal256({0ULL, 9736322443688676914ULL, 10267874903356845151ULL, 79654595ULL}),
- BasicDecimal256(
- {0ULL, 5129504068339011060ULL, 10445028665020693435ULL, 796545955ULL}),
- BasicDecimal256(
- {0ULL, 14401552535971007368ULL, 12216566281659176272ULL, 7965459555ULL}),
- BasicDecimal256(
- {0ULL, 14888316843743212368ULL, 11485198374334453031ULL, 79654595556ULL}),
- BasicDecimal256(
- {0ULL, 1309215847755710752ULL, 4171519301087220622ULL, 796545955566ULL}),
- BasicDecimal256(
- {0ULL, 13092158477557107520ULL, 4821704863453102988ULL, 7965459555662ULL}),
- BasicDecimal256(
- {0ULL, 1794376259604213888ULL, 11323560487111926655ULL, 79654595556622ULL}),
- BasicDecimal256(
- {0ULL, 17943762596042138880ULL, 2555140428861956854ULL, 796545955566226ULL}),
- BasicDecimal256(
- {0ULL, 13416929297035424256ULL, 7104660214910016933ULL, 7965459555662261ULL}),
- BasicDecimal256(
- {0ULL, 5042084454387381248ULL, 15706369927971514489ULL, 79654595556622613ULL}),
- BasicDecimal256(
- {0ULL, 13527356396454709248ULL, 9489746690038731964ULL, 796545955566226138ULL})};
-
+static const BasicDecimal256 ScaleMultipliersDecimal256[] = {
+ BasicDecimal256({1ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({100000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({7766279631452241920ULL, 5ULL, 0ULL, 0ULL}),
+ BasicDecimal256({3875820019684212736ULL, 54ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1864712049423024128ULL, 542ULL, 0ULL, 0ULL}),
+ BasicDecimal256({200376420520689664ULL, 5421ULL, 0ULL, 0ULL}),
+ BasicDecimal256({2003764205206896640ULL, 54210ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1590897978359414784ULL, 542101ULL, 0ULL, 0ULL}),
+ BasicDecimal256({15908979783594147840ULL, 5421010ULL, 0ULL, 0ULL}),
+ BasicDecimal256({11515845246265065472ULL, 54210108ULL, 0ULL, 0ULL}),
+ BasicDecimal256({4477988020393345024ULL, 542101086ULL, 0ULL, 0ULL}),
+ BasicDecimal256({7886392056514347008ULL, 5421010862ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5076944270305263616ULL, 54210108624ULL, 0ULL, 0ULL}),
+ BasicDecimal256({13875954555633532928ULL, 542101086242ULL, 0ULL, 0ULL}),
+ BasicDecimal256({9632337040368467968ULL, 5421010862427ULL, 0ULL, 0ULL}),
+ BasicDecimal256({4089650035136921600ULL, 54210108624275ULL, 0ULL, 0ULL}),
+ BasicDecimal256({4003012203950112768ULL, 542101086242752ULL, 0ULL, 0ULL}),
+ BasicDecimal256({3136633892082024448ULL, 5421010862427522ULL, 0ULL, 0ULL}),
+ BasicDecimal256({12919594847110692864ULL, 54210108624275221ULL, 0ULL, 0ULL}),
+ BasicDecimal256({68739955140067328ULL, 542101086242752217ULL, 0ULL, 0ULL}),
+ BasicDecimal256({687399551400673280ULL, 5421010862427522170ULL, 0ULL, 0ULL}),
+ BasicDecimal256({6873995514006732800ULL, 17316620476856118468ULL, 2ULL, 0ULL}),
+ BasicDecimal256({13399722918938673152ULL, 7145508105175220139ULL, 29ULL, 0ULL}),
+ BasicDecimal256({4870020673419870208ULL, 16114848830623546549ULL, 293ULL, 0ULL}),
+ BasicDecimal256({11806718586779598848ULL, 13574535716559052564ULL, 2938ULL, 0ULL}),
+ BasicDecimal256({7386721425538678784ULL, 6618148649623664334ULL, 29387ULL, 0ULL}),
+ BasicDecimal256({80237960548581376ULL, 10841254275107988496ULL, 293873ULL, 0ULL}),
+ BasicDecimal256({802379605485813760ULL, 16178822382532126880ULL, 2938735ULL, 0ULL}),
+ BasicDecimal256({8023796054858137600ULL, 14214271235644855872ULL, 29387358ULL, 0ULL}),
+ BasicDecimal256(
+ {6450984253743169536ULL, 13015503840481697412ULL, 293873587ULL, 0ULL}),
+ BasicDecimal256(
+ {9169610316303040512ULL, 1027829888850112811ULL, 2938735877ULL, 0ULL}),
+ BasicDecimal256(
+ {17909126868192198656ULL, 10278298888501128114ULL, 29387358770ULL, 0ULL}),
+ BasicDecimal256(
+ {13070572018536022016ULL, 10549268516463523069ULL, 293873587705ULL, 0ULL}),
+ BasicDecimal256(
+ {1578511669393358848ULL, 13258964796087472617ULL, 2938735877055ULL, 0ULL}),
+ BasicDecimal256(
+ {15785116693933588480ULL, 3462439444907864858ULL, 29387358770557ULL, 0ULL}),
+ BasicDecimal256(
+ {10277214349659471872ULL, 16177650375369096972ULL, 293873587705571ULL, 0ULL}),
+ BasicDecimal256(
+ {10538423128046960640ULL, 14202551164014556797ULL, 2938735877055718ULL, 0ULL}),
+ BasicDecimal256(
+ {13150510911921848320ULL, 12898303124178706663ULL, 29387358770557187ULL, 0ULL}),
+ BasicDecimal256(
+ {2377900603251621888ULL, 18302566799529756941ULL, 293873587705571876ULL, 0ULL}),
+ BasicDecimal256(
+ {5332261958806667264ULL, 17004971331911604867ULL, 2938735877055718769ULL, 0ULL}),
+ BasicDecimal256(
+ {16429131440647569408ULL, 4029016655730084128ULL, 10940614696847636083ULL, 1ULL}),
+ BasicDecimal256({16717361816799281152ULL, 3396678409881738056ULL,
+ 17172426599928602752ULL, 15ULL}),
+ BasicDecimal256({1152921504606846976ULL, 15520040025107828953ULL,
+ 5703569335900062977ULL, 159ULL}),
+ BasicDecimal256({11529215046068469760ULL, 7626447661401876602ULL,
+ 1695461137871974930ULL, 1593ULL}),
+ BasicDecimal256({4611686018427387904ULL, 2477500319180559562ULL,
+ 16954611378719749304ULL, 15930ULL}),
+ BasicDecimal256({9223372036854775808ULL, 6328259118096044006ULL,
+ 3525417123811528497ULL, 159309ULL}),
+ BasicDecimal256({0ULL, 7942358959831785217ULL, 16807427164405733357ULL, 1593091ULL}),
+ BasicDecimal256({0ULL, 5636613303479645706ULL, 2053574980671369030ULL, 15930919ULL}),
+ BasicDecimal256({0ULL, 1025900813667802212ULL, 2089005733004138687ULL, 159309191ULL}),
+ BasicDecimal256(
+ {0ULL, 10259008136678022120ULL, 2443313256331835254ULL, 1593091911ULL}),
+ BasicDecimal256(
+ {0ULL, 10356360998232463120ULL, 5986388489608800929ULL, 15930919111ULL}),
+ BasicDecimal256(
+ {0ULL, 11329889613776873120ULL, 4523652674959354447ULL, 159309191113ULL}),
+ BasicDecimal256(
+ {0ULL, 2618431695511421504ULL, 8343038602174441244ULL, 1593091911132ULL}),
+ BasicDecimal256(
+ {0ULL, 7737572881404663424ULL, 9643409726906205977ULL, 15930919111324ULL}),
+ BasicDecimal256(
+ {0ULL, 3588752519208427776ULL, 4200376900514301694ULL, 159309191113245ULL}),
+ BasicDecimal256(
+ {0ULL, 17440781118374726144ULL, 5110280857723913709ULL, 1593091911132452ULL}),
+ BasicDecimal256(
+ {0ULL, 8387114520361296896ULL, 14209320429820033867ULL, 15930919111324522ULL}),
+ BasicDecimal256(
+ {0ULL, 10084168908774762496ULL, 12965995782233477362ULL, 159309191113245227ULL}),
+ BasicDecimal256(
+ {0ULL, 8607968719199866880ULL, 532749306367912313ULL, 1593091911132452277ULL})};
+
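+// Each entry i stores 10^i / 2, the rounding threshold used by ReduceScaleBy.
+// As a quick check, entry 20 encodes 5 * 10^19:
+// 2 * 2^64 + 13106511852580896768 = 50000000000000000000.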
+static const BasicDecimal256 ScaleMultipliersHalfDecimal256[] = {
+ BasicDecimal256({0ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({50000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({500000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5000000000000000000ULL, 0ULL, 0ULL, 0ULL}),
+ BasicDecimal256({13106511852580896768ULL, 2ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1937910009842106368ULL, 27ULL, 0ULL, 0ULL}),
+ BasicDecimal256({932356024711512064ULL, 271ULL, 0ULL, 0ULL}),
+ BasicDecimal256({9323560247115120640ULL, 2710ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1001882102603448320ULL, 27105ULL, 0ULL, 0ULL}),
+ BasicDecimal256({10018821026034483200ULL, 271050ULL, 0ULL, 0ULL}),
+ BasicDecimal256({7954489891797073920ULL, 2710505ULL, 0ULL, 0ULL}),
+ BasicDecimal256({5757922623132532736ULL, 27105054ULL, 0ULL, 0ULL}),
+ BasicDecimal256({2238994010196672512ULL, 271050543ULL, 0ULL, 0ULL}),
+ BasicDecimal256({3943196028257173504ULL, 2710505431ULL, 0ULL, 0ULL}),
+ BasicDecimal256({2538472135152631808ULL, 27105054312ULL, 0ULL, 0ULL}),
+ BasicDecimal256({6937977277816766464ULL, 271050543121ULL, 0ULL, 0ULL}),
+ BasicDecimal256({14039540557039009792ULL, 2710505431213ULL, 0ULL, 0ULL}),
+ BasicDecimal256({11268197054423236608ULL, 27105054312137ULL, 0ULL, 0ULL}),
+ BasicDecimal256({2001506101975056384ULL, 271050543121376ULL, 0ULL, 0ULL}),
+ BasicDecimal256({1568316946041012224ULL, 2710505431213761ULL, 0ULL, 0ULL}),
+ BasicDecimal256({15683169460410122240ULL, 27105054312137610ULL, 0ULL, 0ULL}),
+ BasicDecimal256({9257742014424809472ULL, 271050543121376108ULL, 0ULL, 0ULL}),
+ BasicDecimal256({343699775700336640ULL, 2710505431213761085ULL, 0ULL, 0ULL}),
+ BasicDecimal256({3436997757003366400ULL, 8658310238428059234ULL, 1ULL, 0ULL}),
+ BasicDecimal256({15923233496324112384ULL, 12796126089442385877ULL, 14ULL, 0ULL}),
+ BasicDecimal256({11658382373564710912ULL, 17280796452166549082ULL, 146ULL, 0ULL}),
+ BasicDecimal256({5903359293389799424ULL, 6787267858279526282ULL, 1469ULL, 0ULL}),
+ BasicDecimal256({3693360712769339392ULL, 12532446361666607975ULL, 14693ULL, 0ULL}),
+ BasicDecimal256({40118980274290688ULL, 14643999174408770056ULL, 146936ULL, 0ULL}),
+ BasicDecimal256({401189802742906880ULL, 17312783228120839248ULL, 1469367ULL, 0ULL}),
+ BasicDecimal256({4011898027429068800ULL, 7107135617822427936ULL, 14693679ULL, 0ULL}),
+ BasicDecimal256(
+ {3225492126871584768ULL, 15731123957095624514ULL, 146936793ULL, 0ULL}),
+ BasicDecimal256(
+ {13808177195006296064ULL, 9737286981279832213ULL, 1469367938ULL, 0ULL}),
+ BasicDecimal256(
+ {8954563434096099328ULL, 5139149444250564057ULL, 14693679385ULL, 0ULL}),
+ BasicDecimal256(
+ {15758658046122786816ULL, 14498006295086537342ULL, 146936793852ULL, 0ULL}),
+ BasicDecimal256(
+ {10012627871551455232ULL, 15852854434898512116ULL, 1469367938527ULL, 0ULL}),
+ BasicDecimal256(
+ {7892558346966794240ULL, 10954591759308708237ULL, 14693679385278ULL, 0ULL}),
+ BasicDecimal256(
+ {5138607174829735936ULL, 17312197224539324294ULL, 146936793852785ULL, 0ULL}),
+ BasicDecimal256(
+ {14492583600878256128ULL, 7101275582007278398ULL, 1469367938527859ULL, 0ULL}),
+ BasicDecimal256(
+ {15798627492815699968ULL, 15672523598944129139ULL, 14693679385278593ULL, 0ULL}),
+ BasicDecimal256(
+ {10412322338480586752ULL, 9151283399764878470ULL, 146936793852785938ULL, 0ULL}),
+ BasicDecimal256(
+ {11889503016258109440ULL, 17725857702810578241ULL, 1469367938527859384ULL, 0ULL}),
+ BasicDecimal256(
+ {8214565720323784704ULL, 11237880364719817872ULL, 14693679385278593849ULL, 0ULL}),
+ BasicDecimal256(
+ {8358680908399640576ULL, 1698339204940869028ULL, 17809585336819077184ULL, 7ULL}),
+ BasicDecimal256({9799832789158199296ULL, 16983392049408690284ULL,
+ 12075156704804807296ULL, 79ULL}),
+ BasicDecimal256({5764607523034234880ULL, 3813223830700938301ULL,
+ 10071102605790763273ULL, 796ULL}),
+ BasicDecimal256({2305843009213693952ULL, 1238750159590279781ULL,
+ 8477305689359874652ULL, 7965ULL}),
+ BasicDecimal256({4611686018427387904ULL, 12387501595902797811ULL,
+ 10986080598760540056ULL, 79654ULL}),
+ BasicDecimal256({9223372036854775808ULL, 13194551516770668416ULL,
+ 17627085619057642486ULL, 796545ULL}),
+ BasicDecimal256({0ULL, 2818306651739822853ULL, 10250159527190460323ULL, 7965459ULL}),
+ BasicDecimal256({0ULL, 9736322443688676914ULL, 10267874903356845151ULL, 79654595ULL}),
+ BasicDecimal256(
+ {0ULL, 5129504068339011060ULL, 10445028665020693435ULL, 796545955ULL}),
+ BasicDecimal256(
+ {0ULL, 14401552535971007368ULL, 12216566281659176272ULL, 7965459555ULL}),
+ BasicDecimal256(
+ {0ULL, 14888316843743212368ULL, 11485198374334453031ULL, 79654595556ULL}),
+ BasicDecimal256(
+ {0ULL, 1309215847755710752ULL, 4171519301087220622ULL, 796545955566ULL}),
+ BasicDecimal256(
+ {0ULL, 13092158477557107520ULL, 4821704863453102988ULL, 7965459555662ULL}),
+ BasicDecimal256(
+ {0ULL, 1794376259604213888ULL, 11323560487111926655ULL, 79654595556622ULL}),
+ BasicDecimal256(
+ {0ULL, 17943762596042138880ULL, 2555140428861956854ULL, 796545955566226ULL}),
+ BasicDecimal256(
+ {0ULL, 13416929297035424256ULL, 7104660214910016933ULL, 7965459555662261ULL}),
+ BasicDecimal256(
+ {0ULL, 5042084454387381248ULL, 15706369927971514489ULL, 79654595556622613ULL}),
+ BasicDecimal256(
+ {0ULL, 13527356396454709248ULL, 9489746690038731964ULL, 796545955566226138ULL})};
+
#ifdef ARROW_USE_NATIVE_INT128
static constexpr uint64_t kInt64Mask = 0xFFFFFFFFFFFFFFFF;
#else
-static constexpr uint64_t kInt32Mask = 0xFFFFFFFF;
+static constexpr uint64_t kInt32Mask = 0xFFFFFFFF;
#endif
// same as ScaleMultipliers[38] - 1
@@ -468,127 +468,127 @@ BasicDecimal128& BasicDecimal128::operator>>=(uint32_t bits) {
namespace {
-// Convenience wrapper type over 128 bit unsigned integers. We opt not to
-// replace the uint128_t type in int128_internal.h because it would require
-// significantly more implementation work to be done. This class merely
-// provides the minimum necessary set of functions to perform 128+ bit
-// multiplication operations when there may or may not be native support.
+// Convenience wrapper type over 128 bit unsigned integers. We opt not to
+// replace the uint128_t type in int128_internal.h because it would require
+// significantly more implementation work to be done. This class merely
+// provides the minimum necessary set of functions to perform 128+ bit
+// multiplication operations when there may or may not be native support.
#ifdef ARROW_USE_NATIVE_INT128
-struct uint128_t {
- uint128_t() {}
- uint128_t(uint64_t hi, uint64_t lo) : val_((static_cast<__uint128_t>(hi) << 64) | lo) {}
- explicit uint128_t(const BasicDecimal128& decimal) {
- val_ = (static_cast<__uint128_t>(decimal.high_bits()) << 64) | decimal.low_bits();
- }
-
- explicit uint128_t(uint64_t value) : val_(value) {}
-
- uint64_t hi() { return val_ >> 64; }
- uint64_t lo() { return val_ & kInt64Mask; }
-
- uint128_t& operator+=(const uint128_t& other) {
- val_ += other.val_;
- return *this;
- }
-
- uint128_t& operator*=(const uint128_t& other) {
- val_ *= other.val_;
- return *this;
- }
-
- __uint128_t val_;
-};
-
+struct uint128_t {
+ uint128_t() {}
+ uint128_t(uint64_t hi, uint64_t lo) : val_((static_cast<__uint128_t>(hi) << 64) | lo) {}
+ explicit uint128_t(const BasicDecimal128& decimal) {
+ val_ = (static_cast<__uint128_t>(decimal.high_bits()) << 64) | decimal.low_bits();
+ }
+
+ explicit uint128_t(uint64_t value) : val_(value) {}
+
+ uint64_t hi() { return val_ >> 64; }
+ uint64_t lo() { return val_ & kInt64Mask; }
+
+ uint128_t& operator+=(const uint128_t& other) {
+ val_ += other.val_;
+ return *this;
+ }
+
+ uint128_t& operator*=(const uint128_t& other) {
+ val_ *= other.val_;
+ return *this;
+ }
+
+ __uint128_t val_;
+};
+
#else
-// Multiply two 64 bit word components into a 128 bit result, with high bits
-// stored in hi and low bits in lo.
-inline void ExtendAndMultiply(uint64_t x, uint64_t y, uint64_t* hi, uint64_t* lo) {
- // Perform multiplication on two 64 bit words x and y into a 128 bit result
+// Multiply two 64 bit word components into a 128 bit result, with high bits
+// stored in hi and low bits in lo.
+inline void ExtendAndMultiply(uint64_t x, uint64_t y, uint64_t* hi, uint64_t* lo) {
+ // Perform multiplication on two 64 bit words x and y into a 128 bit result
// by splitting up x and y into 32 bit high/low bit components,
// allowing us to represent the multiplication as
// x * y = x_lo * y_lo + x_hi * y_lo * 2^32 + y_hi * x_lo * 2^32
- // + x_hi * y_hi * 2^64
+ // + x_hi * y_hi * 2^64
//
- // Now, consider the final output as lo_lo || lo_hi || hi_lo || hi_hi
+ // Now, consider the final output as lo_lo || lo_hi || hi_lo || hi_hi
// Therefore,
// lo_lo is (x_lo * y_lo)_lo,
// lo_hi is ((x_lo * y_lo)_hi + (x_hi * y_lo)_lo + (x_lo * y_hi)_lo)_lo,
// hi_lo is ((x_hi * y_hi)_lo + (x_hi * y_lo)_hi + (x_lo * y_hi)_hi)_hi,
// hi_hi is (x_hi * y_hi)_hi
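+ // Worked example: x = (1ULL << 32) + 3 and y = (1ULL << 32) + 5 give
+ // t = 15, u = 5 and v = 8, so *hi = 1 and *lo = (8ULL << 32) + 15,
+ // matching x * y = 2^64 + 8 * 2^32 + 15.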
- const uint64_t x_lo = x & kInt32Mask;
- const uint64_t y_lo = y & kInt32Mask;
+ const uint64_t x_lo = x & kInt32Mask;
+ const uint64_t y_lo = y & kInt32Mask;
const uint64_t x_hi = x >> 32;
const uint64_t y_hi = y >> 32;
const uint64_t t = x_lo * y_lo;
- const uint64_t t_lo = t & kInt32Mask;
+ const uint64_t t_lo = t & kInt32Mask;
const uint64_t t_hi = t >> 32;
const uint64_t u = x_hi * y_lo + t_hi;
- const uint64_t u_lo = u & kInt32Mask;
+ const uint64_t u_lo = u & kInt32Mask;
const uint64_t u_hi = u >> 32;
const uint64_t v = x_lo * y_hi + u_lo;
const uint64_t v_hi = v >> 32;
*hi = x_hi * y_hi + u_hi + v_hi;
- *lo = (v << 32) + t_lo;
-}
-
-struct uint128_t {
- uint128_t() {}
- uint128_t(uint64_t hi, uint64_t lo) : hi_(hi), lo_(lo) {}
- explicit uint128_t(const BasicDecimal128& decimal) {
- hi_ = decimal.high_bits();
- lo_ = decimal.low_bits();
- }
-
- uint64_t hi() const { return hi_; }
- uint64_t lo() const { return lo_; }
-
- uint128_t& operator+=(const uint128_t& other) {
- // To deduce the carry bit, we perform "65 bit" addition on the low bits and
- // see if the resulting high bit is 1. This is accomplished by shifting the
- // low bits to the right by 1 (chopping off the lowest bit), then adding 1 if the
- // result of adding the two chopped bits would have produced a carry.
- uint64_t carry = (((lo_ & other.lo_) & 1) + (lo_ >> 1) + (other.lo_ >> 1)) >> 63;
- hi_ += other.hi_ + carry;
- lo_ += other.lo_;
- return *this;
- }
-
- uint128_t& operator*=(const uint128_t& other) {
- uint128_t r;
- ExtendAndMultiply(lo_, other.lo_, &r.hi_, &r.lo_);
- r.hi_ += (hi_ * other.lo_) + (lo_ * other.hi_);
- *this = r;
- return *this;
- }
-
- uint64_t hi_;
- uint64_t lo_;
-};
+ *lo = (v << 32) + t_lo;
+}
+
+struct uint128_t {
+ uint128_t() {}
+ uint128_t(uint64_t hi, uint64_t lo) : hi_(hi), lo_(lo) {}
+ explicit uint128_t(const BasicDecimal128& decimal) {
+ hi_ = decimal.high_bits();
+ lo_ = decimal.low_bits();
+ }
+
+ uint64_t hi() const { return hi_; }
+ uint64_t lo() const { return lo_; }
+
+ uint128_t& operator+=(const uint128_t& other) {
+ // To deduce the carry bit, we perform "65 bit" addition on the low bits and
+ // see if the resulting high bit is 1. This is accomplished by shifting the
+ // low bits to the right by 1 (chopping off the lowest bit), then adding 1 if the
+ // result of adding the two chopped bits would have produced a carry.
+ uint64_t carry = (((lo_ & other.lo_) & 1) + (lo_ >> 1) + (other.lo_ >> 1)) >> 63;
+ hi_ += other.hi_ + carry;
+ lo_ += other.lo_;
+ return *this;
+ }
+
+ uint128_t& operator*=(const uint128_t& other) {
+ uint128_t r;
+ ExtendAndMultiply(lo_, other.lo_, &r.hi_, &r.lo_);
+ r.hi_ += (hi_ * other.lo_) + (lo_ * other.hi_);
+ *this = r;
+ return *this;
+ }
+
+ uint64_t hi_;
+ uint64_t lo_;
+};
#endif
-// Multiplies two N * 64 bit unsigned integer types, represented by a uint64_t
-// array into a same sized output. Elements in the array should be in
-// little endian order, and output will be the same. Overflow in multiplication
-// will result in the lower N * 64 bits of the result being set.
-template <int N>
-inline void MultiplyUnsignedArray(const std::array<uint64_t, N>& lh,
- const std::array<uint64_t, N>& rh,
- std::array<uint64_t, N>* result) {
- for (int j = 0; j < N; ++j) {
- uint64_t carry = 0;
- for (int i = 0; i < N - j; ++i) {
- uint128_t tmp(lh[i]);
- tmp *= uint128_t(rh[j]);
- tmp += uint128_t((*result)[i + j]);
- tmp += uint128_t(carry);
- (*result)[i + j] = tmp.lo();
- carry = tmp.hi();
- }
- }
+// Multiplies two N * 64 bit unsigned integer types, represented by a uint64_t
+// array into a same sized output. Elements in the array should be in
+// little endian order, and output will be the same. Overflow in multiplication
+// will result in the lower N * 64 bits of the result being set.
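+// For example, with N = 2 and *result zeroed (as callers here do), the loops
+// compute result[0] = lo(lh[0] * rh[0]) and result[1] = lo(lh[0] * rh[1] +
+// lh[1] * rh[0] + hi(lh[0] * rh[0])); the lh[1] * rh[1] term would land at
+// index 2 and is dropped, which is the truncation described above.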
+template <int N>
+inline void MultiplyUnsignedArray(const std::array<uint64_t, N>& lh,
+ const std::array<uint64_t, N>& rh,
+ std::array<uint64_t, N>* result) {
+ for (int j = 0; j < N; ++j) {
+ uint64_t carry = 0;
+ for (int i = 0; i < N - j; ++i) {
+ uint128_t tmp(lh[i]);
+ tmp *= uint128_t(rh[j]);
+ tmp += uint128_t((*result)[i + j]);
+ tmp += uint128_t(carry);
+ (*result)[i + j] = tmp.lo();
+ carry = tmp.hi();
+ }
+ }
}
} // namespace
@@ -599,62 +599,62 @@ BasicDecimal128& BasicDecimal128::operator*=(const BasicDecimal128& right) {
const bool negate = Sign() != right.Sign();
BasicDecimal128 x = BasicDecimal128::Abs(*this);
BasicDecimal128 y = BasicDecimal128::Abs(right);
- uint128_t r(x);
- r *= uint128_t{y};
- high_bits_ = r.hi();
- low_bits_ = r.lo();
+ uint128_t r(x);
+ r *= uint128_t{y};
+ high_bits_ = r.hi();
+ low_bits_ = r.lo();
if (negate) {
Negate();
}
return *this;
}
-/// Expands the given little endian array of uint64_t into a big endian array of
-/// uint32_t. The value of the input array is expected to be non-negative. Leading
-/// zeros in the input are not copied into result_array.
-/// \param value_array a little endian array to represent the value
-/// \param result_array a big endian array of length N*2 to set with the value
-/// \result the output length of the array
-template <size_t N>
-static int64_t FillInArray(const std::array<uint64_t, N>& value_array,
- uint32_t* result_array) {
- int64_t next_index = 0;
- // 1st loop to find the first non-zero value in the input
- int64_t i = N - 1;
- for (; i >= 0; i--) {
- if (value_array[i] != 0) {
- if (value_array[i] <= std::numeric_limits<uint32_t>::max()) {
- result_array[next_index++] = static_cast<uint32_t>(value_array[i]);
- i--;
- }
- break;
- }
- }
- // 2nd loop to fill in the rest of the array.
- for (int64_t j = i; j >= 0; j--) {
- result_array[next_index++] = static_cast<uint32_t>(value_array[j] >> 32);
- result_array[next_index++] = static_cast<uint32_t>(value_array[j]);
- }
- return next_index;
-}
-
-/// Expands the given value into a big endian array of ints so that we can work on
-/// it. The array will be converted to an absolute value and the was_negative
+/// Expands the given little endian array of uint64_t into a big endian array of
+/// uint32_t. The value of the input array is expected to be non-negative. Leading
+/// zeros in the input are not copied into result_array.
+/// \param value_array a little endian array to represent the value
+/// \param result_array a big endian array of length N*2 to set with the value
+/// \result the output length of the array
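+/// e.g. FillInArray<2>({0x0000000500000006ULL, 0ULL}, out) skips the zero high
+/// limb and writes out[0] = 5, out[1] = 6, returning 2.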
+template <size_t N>
+static int64_t FillInArray(const std::array<uint64_t, N>& value_array,
+ uint32_t* result_array) {
+ int64_t next_index = 0;
+ // 1st loop to find the first non-zero value in the input
+ int64_t i = N - 1;
+ for (; i >= 0; i--) {
+ if (value_array[i] != 0) {
+ if (value_array[i] <= std::numeric_limits<uint32_t>::max()) {
+ result_array[next_index++] = static_cast<uint32_t>(value_array[i]);
+ i--;
+ }
+ break;
+ }
+ }
+ // 2nd loop to fill in the rest of the array.
+ for (int64_t j = i; j >= 0; j--) {
+ result_array[next_index++] = static_cast<uint32_t>(value_array[j] >> 32);
+ result_array[next_index++] = static_cast<uint32_t>(value_array[j]);
+ }
+ return next_index;
+}
+
+/// Expands the given value into a big endian array of ints so that we can work on
+/// it. The array will be converted to an absolute value and the was_negative
/// flag will be set appropriately. The array will remove leading zeros from
/// the value.
-/// \param array a big endian array of length 4 to set with the value
+/// \param array a big endian array of length 4 to set with the value
/// \param was_negative a flag for whether the value was originally negative
/// \result the output length of the array
static int64_t FillInArray(const BasicDecimal128& value, uint32_t* array,
bool& was_negative) {
- BasicDecimal128 abs_value = BasicDecimal128::Abs(value);
- was_negative = value.high_bits() < 0;
- uint64_t high = static_cast<uint64_t>(abs_value.high_bits());
- uint64_t low = abs_value.low_bits();
-
- // FillInArray(std::array<uint64_t, N>& value_array, uint32_t* result_array) is not
- // called here as the following code has better performance, to avoid regression on
- // BasicDecimal128 Division.
+ BasicDecimal128 abs_value = BasicDecimal128::Abs(value);
+ was_negative = value.high_bits() < 0;
+ uint64_t high = static_cast<uint64_t>(abs_value.high_bits());
+ uint64_t low = abs_value.low_bits();
+
+ // FillInArray(std::array<uint64_t, N>& value_array, uint32_t* result_array) is not
+ // called here as the following code has better performance, to avoid regression on
+ // BasicDecimal128 Division.
if (high != 0) {
if (high > std::numeric_limits<uint32_t>::max()) {
array[0] = static_cast<uint32_t>(high >> 32);
@@ -670,7 +670,7 @@ static int64_t FillInArray(const BasicDecimal128& value, uint32_t* array,
return 3;
}
- if (low > std::numeric_limits<uint32_t>::max()) {
+ if (low > std::numeric_limits<uint32_t>::max()) {
array[0] = static_cast<uint32_t>(low >> 32);
array[1] = static_cast<uint32_t>(low);
return 2;
@@ -684,24 +684,24 @@ static int64_t FillInArray(const BasicDecimal128& value, uint32_t* array,
return 1;
}
-/// Expands the given value into a big endian array of ints so that we can work on
-/// it. The array will be converted to an absolute value and the was_negative
-/// flag will be set appropriately. The array will remove leading zeros from
-/// the value.
-/// \param array a big endian array of length 8 to set with the value
-/// \param was_negative a flag for whether the value was originally negative
-/// \result the output length of the array
-static int64_t FillInArray(const BasicDecimal256& value, uint32_t* array,
- bool& was_negative) {
- BasicDecimal256 positive_value = value;
- was_negative = false;
- if (positive_value.IsNegative()) {
- positive_value.Negate();
- was_negative = true;
- }
- return FillInArray<4>(positive_value.little_endian_array(), array);
-}
-
+/// Expands the given value into a big endian array of ints so that we can work on
+/// it. The array will be converted to an absolute value and the was_negative
+/// flag will be set appropriately. The array will remove leading zeros from
+/// the value.
+/// \param array a big endian array of length 8 to set with the value
+/// \param was_negative a flag for whether the value was originally negative
+/// \result the output length of the array
+static int64_t FillInArray(const BasicDecimal256& value, uint32_t* array,
+ bool& was_negative) {
+ BasicDecimal256 positive_value = value;
+ was_negative = false;
+ if (positive_value.IsNegative()) {
+ positive_value.Negate();
+ was_negative = true;
+ }
+ return FillInArray<4>(positive_value.little_endian_array(), array);
+}
+
/// Shift the number in the array left by bits positions.
/// \param array the number to shift, must have length elements
/// \param length the number of entries in the array
@@ -719,7 +719,7 @@ static void ShiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) {
/// \param array the number to shift, must have length elements
/// \param length the number of entries in the array
/// \param bits the number of bits to shift (0 <= bits < 32)
-static inline void ShiftArrayRight(uint32_t* array, int64_t length, int64_t bits) {
+static inline void ShiftArrayRight(uint32_t* array, int64_t length, int64_t bits) {
if (length > 0 && bits != 0) {
for (int64_t i = length - 1; i > 0; --i) {
array[i] = (array[i] >> bits) | (array[i - 1] << (32 - bits));
@@ -730,10 +730,10 @@ static inline void ShiftArrayRight(uint32_t* array, int64_t length, int64_t bits
/// \brief Fix the signs of the result and remainder at the end of the division based on
/// the signs of the dividend and divisor.
-template <class DecimalClass>
-static inline void FixDivisionSigns(DecimalClass* result, DecimalClass* remainder,
- bool dividend_was_negative,
- bool divisor_was_negative) {
+template <class DecimalClass>
+static inline void FixDivisionSigns(DecimalClass* result, DecimalClass* remainder,
+ bool dividend_was_negative,
+ bool divisor_was_negative) {
if (dividend_was_negative != divisor_was_negative) {
result->Negate();
}
@@ -743,65 +743,65 @@ static inline void FixDivisionSigns(DecimalClass* result, DecimalClass* remainde
}
}
-/// \brief Build a little endian array of uint64_t from a big endian array of uint32_t.
-template <size_t N>
-static DecimalStatus BuildFromArray(std::array<uint64_t, N>* result_array,
- const uint32_t* array, int64_t length) {
- for (int64_t i = length - 2 * N - 1; i >= 0; i--) {
- if (array[i] != 0) {
+/// \brief Build a little endian array of uint64_t from a big endian array of uint32_t.
+template <size_t N>
+static DecimalStatus BuildFromArray(std::array<uint64_t, N>* result_array,
+ const uint32_t* array, int64_t length) {
+ for (int64_t i = length - 2 * N - 1; i >= 0; i--) {
+ if (array[i] != 0) {
return DecimalStatus::kOverflow;
- }
+ }
+ }
+ int64_t next_index = length - 1;
+ size_t i = 0;
+ for (; i < N && next_index >= 0; i++) {
+ uint64_t lower_bits = array[next_index--];
+ (*result_array)[i] =
+ (next_index < 0)
+ ? lower_bits
+ : ((static_cast<uint64_t>(array[next_index--]) << 32) + lower_bits);
+ }
+ for (; i < N; i++) {
+ (*result_array)[i] = 0;
}
- int64_t next_index = length - 1;
- size_t i = 0;
- for (; i < N && next_index >= 0; i++) {
- uint64_t lower_bits = array[next_index--];
- (*result_array)[i] =
- (next_index < 0)
- ? lower_bits
- : ((static_cast<uint64_t>(array[next_index--]) << 32) + lower_bits);
- }
- for (; i < N; i++) {
- (*result_array)[i] = 0;
- }
- return DecimalStatus::kSuccess;
-}
-
-/// \brief Build a BasicDecimal128 from a big endian array of uint32_t.
-static DecimalStatus BuildFromArray(BasicDecimal128* value, const uint32_t* array,
- int64_t length) {
- std::array<uint64_t, 2> result_array;
- auto status = BuildFromArray(&result_array, array, length);
- if (status != DecimalStatus::kSuccess) {
- return status;
- }
- *value = {static_cast<int64_t>(result_array[1]), result_array[0]};
return DecimalStatus::kSuccess;
}
-/// \brief Build a BasicDecimal256 from a big endian array of uint32_t.
-static DecimalStatus BuildFromArray(BasicDecimal256* value, const uint32_t* array,
- int64_t length) {
- std::array<uint64_t, 4> result_array;
- auto status = BuildFromArray(&result_array, array, length);
- if (status != DecimalStatus::kSuccess) {
- return status;
- }
- *value = result_array;
- return DecimalStatus::kSuccess;
-}
-
+/// \brief Build a BasicDecimal128 from a big endian array of uint32_t.
+static DecimalStatus BuildFromArray(BasicDecimal128* value, const uint32_t* array,
+ int64_t length) {
+ std::array<uint64_t, 2> result_array;
+ auto status = BuildFromArray(&result_array, array, length);
+ if (status != DecimalStatus::kSuccess) {
+ return status;
+ }
+ *value = {static_cast<int64_t>(result_array[1]), result_array[0]};
+ return DecimalStatus::kSuccess;
+}
+
+/// \brief Build a BasicDecimal256 from a big endian array of uint32_t.
+static DecimalStatus BuildFromArray(BasicDecimal256* value, const uint32_t* array,
+ int64_t length) {
+ std::array<uint64_t, 4> result_array;
+ auto status = BuildFromArray(&result_array, array, length);
+ if (status != DecimalStatus::kSuccess) {
+ return status;
+ }
+ *value = result_array;
+ return DecimalStatus::kSuccess;
+}
+
/// \brief Do a division where the divisor fits into a single 32 bit value.
-template <class DecimalClass>
-static inline DecimalStatus SingleDivide(const uint32_t* dividend,
- int64_t dividend_length, uint32_t divisor,
- DecimalClass* remainder,
- bool dividend_was_negative,
- bool divisor_was_negative,
- DecimalClass* result) {
+template <class DecimalClass>
+static inline DecimalStatus SingleDivide(const uint32_t* dividend,
+ int64_t dividend_length, uint32_t divisor,
+ DecimalClass* remainder,
+ bool dividend_was_negative,
+ bool divisor_was_negative,
+ DecimalClass* result) {
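+ // Schoolbook long division in base 2^32. For example, dividend digits
+ // {1, 0} (the value 2^32) divided by 3 produce quotient digits
+ // {0, 1431655765} and remainder 1, since 2^32 == 3 * 1431655765 + 1.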
uint64_t r = 0;
- constexpr int64_t kDecimalArrayLength = DecimalClass::bit_width / sizeof(uint32_t) + 1;
- uint32_t result_array[kDecimalArrayLength];
+ constexpr int64_t kDecimalArrayLength = DecimalClass::bit_width / sizeof(uint32_t) + 1;
+ uint32_t result_array[kDecimalArrayLength];
for (int64_t j = 0; j < dividend_length; j++) {
r <<= 32;
r += dividend[j];
@@ -818,27 +818,27 @@ static inline DecimalStatus SingleDivide(const uint32_t* dividend,
return DecimalStatus::kSuccess;
}
-/// \brief Do a decimal division with remainder.
-template <class DecimalClass>
-static inline DecimalStatus DecimalDivide(const DecimalClass& dividend,
- const DecimalClass& divisor,
- DecimalClass* result, DecimalClass* remainder) {
- constexpr int64_t kDecimalArrayLength = DecimalClass::bit_width / sizeof(uint32_t);
+/// \brief Do a decimal division with remainder.
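+/// The quotient rounds toward zero and the remainder keeps the dividend's
+/// sign, e.g. dividing -21 by 5 yields -4 with remainder -1.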
+template <class DecimalClass>
+static inline DecimalStatus DecimalDivide(const DecimalClass& dividend,
+ const DecimalClass& divisor,
+ DecimalClass* result, DecimalClass* remainder) {
+ constexpr int64_t kDecimalArrayLength = DecimalClass::bit_width / sizeof(uint32_t);
// Split the dividend and divisor into integer pieces so that we can
// work on them.
- uint32_t dividend_array[kDecimalArrayLength + 1];
- uint32_t divisor_array[kDecimalArrayLength];
+ uint32_t dividend_array[kDecimalArrayLength + 1];
+ uint32_t divisor_array[kDecimalArrayLength];
bool dividend_was_negative;
bool divisor_was_negative;
// leave an extra zero before the dividend
dividend_array[0] = 0;
int64_t dividend_length =
- FillInArray(dividend, dividend_array + 1, dividend_was_negative) + 1;
+ FillInArray(dividend, dividend_array + 1, dividend_was_negative) + 1;
int64_t divisor_length = FillInArray(divisor, divisor_array, divisor_was_negative);
// Handle some of the easy cases.
if (dividend_length <= divisor_length) {
- *remainder = dividend;
+ *remainder = dividend;
*result = 0;
return DecimalStatus::kSuccess;
}
@@ -853,8 +853,8 @@ static inline DecimalStatus DecimalDivide(const DecimalClass& dividend,
}
int64_t result_length = dividend_length - divisor_length;
- uint32_t result_array[kDecimalArrayLength];
- DCHECK_LE(result_length, kDecimalArrayLength);
+ uint32_t result_array[kDecimalArrayLength];
+ DCHECK_LE(result_length, kDecimalArrayLength);
// Normalize by shifting both by a multiple of 2 so that
// the digit guessing is better. The requirement is that
@@ -933,12 +933,12 @@ static inline DecimalStatus DecimalDivide(const DecimalClass& dividend,
return DecimalStatus::kSuccess;
}
-DecimalStatus BasicDecimal128::Divide(const BasicDecimal128& divisor,
- BasicDecimal128* result,
- BasicDecimal128* remainder) const {
- return DecimalDivide(*this, divisor, result, remainder);
-}
-
+DecimalStatus BasicDecimal128::Divide(const BasicDecimal128& divisor,
+ BasicDecimal128* result,
+ BasicDecimal128* remainder) const {
+ return DecimalDivide(*this, divisor, result, remainder);
+}
+
bool operator==(const BasicDecimal128& left, const BasicDecimal128& right) {
return left.high_bits() == right.high_bits() && left.low_bits() == right.low_bits();
}
@@ -1008,13 +1008,13 @@ BasicDecimal128 operator%(const BasicDecimal128& left, const BasicDecimal128& ri
return remainder;
}
-template <class DecimalClass>
-static bool RescaleWouldCauseDataLoss(const DecimalClass& value, int32_t delta_scale,
- const DecimalClass& multiplier,
- DecimalClass* result) {
+template <class DecimalClass>
+static bool RescaleWouldCauseDataLoss(const DecimalClass& value, int32_t delta_scale,
+ const DecimalClass& multiplier,
+ DecimalClass* result) {
if (delta_scale < 0) {
DCHECK_NE(multiplier, 0);
- DecimalClass remainder;
+ DecimalClass remainder;
auto status = value.Divide(multiplier, result, &remainder);
DCHECK_EQ(status, DecimalStatus::kSuccess);
return remainder != 0;
@@ -1024,23 +1024,23 @@ static bool RescaleWouldCauseDataLoss(const DecimalClass& value, int32_t delta_s
return (value < 0) ? *result > value : *result < value;
}
-template <class DecimalClass>
-DecimalStatus DecimalRescale(const DecimalClass& value, int32_t original_scale,
- int32_t new_scale, DecimalClass* out) {
+template <class DecimalClass>
+DecimalStatus DecimalRescale(const DecimalClass& value, int32_t original_scale,
+ int32_t new_scale, DecimalClass* out) {
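+ // For example, rescaling 123 from scale 0 to scale 2 multiplies by 10^2 and
+ // yields 12300, while rescaling 123 from scale 2 to scale 0 divides by 10^2
+ // and fails, since truncating away the remainder 23 would lose data.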
DCHECK_NE(out, nullptr);
if (original_scale == new_scale) {
- *out = value;
+ *out = value;
return DecimalStatus::kSuccess;
}
const int32_t delta_scale = new_scale - original_scale;
const int32_t abs_delta_scale = std::abs(delta_scale);
- DecimalClass multiplier = DecimalClass::GetScaleMultiplier(abs_delta_scale);
+ DecimalClass multiplier = DecimalClass::GetScaleMultiplier(abs_delta_scale);
const bool rescale_would_cause_data_loss =
- RescaleWouldCauseDataLoss(value, delta_scale, multiplier, out);
+ RescaleWouldCauseDataLoss(value, delta_scale, multiplier, out);
// Fail if we overflow or truncate
if (ARROW_PREDICT_FALSE(rescale_would_cause_data_loss)) {
@@ -1050,11 +1050,11 @@ DecimalStatus DecimalRescale(const DecimalClass& value, int32_t original_scale,
return DecimalStatus::kSuccess;
}
-DecimalStatus BasicDecimal128::Rescale(int32_t original_scale, int32_t new_scale,
- BasicDecimal128* out) const {
- return DecimalRescale(*this, original_scale, new_scale, out);
-}
-
+DecimalStatus BasicDecimal128::Rescale(int32_t original_scale, int32_t new_scale,
+ BasicDecimal128* out) const {
+ return DecimalRescale(*this, original_scale, new_scale, out);
+}
+
void BasicDecimal128::GetWholeAndFraction(int scale, BasicDecimal128* whole,
BasicDecimal128* fraction) const {
DCHECK_GE(scale, 0);
@@ -1117,228 +1117,228 @@ int32_t BasicDecimal128::CountLeadingBinaryZeros() const {
}
}
-#if ARROW_LITTLE_ENDIAN
-BasicDecimal256::BasicDecimal256(const uint8_t* bytes)
- : little_endian_array_(
- std::array<uint64_t, 4>({reinterpret_cast<const uint64_t*>(bytes)[0],
- reinterpret_cast<const uint64_t*>(bytes)[1],
- reinterpret_cast<const uint64_t*>(bytes)[2],
- reinterpret_cast<const uint64_t*>(bytes)[3]})) {}
-#else
-BasicDecimal256::BasicDecimal256(const uint8_t* bytes)
- : little_endian_array_(
- std::array<uint64_t, 4>({reinterpret_cast<const uint64_t*>(bytes)[3],
- reinterpret_cast<const uint64_t*>(bytes)[2],
- reinterpret_cast<const uint64_t*>(bytes)[1],
- reinterpret_cast<const uint64_t*>(bytes)[0]})) {}
-#endif
-
-BasicDecimal256& BasicDecimal256::Negate() {
- uint64_t carry = 1;
- for (uint64_t& elem : little_endian_array_) {
- elem = ~elem + carry;
- carry &= (elem == 0);
- }
- return *this;
-}
-
-BasicDecimal256& BasicDecimal256::Abs() { return *this < 0 ? Negate() : *this; }
-
-BasicDecimal256 BasicDecimal256::Abs(const BasicDecimal256& in) {
- BasicDecimal256 result(in);
- return result.Abs();
-}
-
-BasicDecimal256& BasicDecimal256::operator+=(const BasicDecimal256& right) {
- uint64_t carry = 0;
- for (size_t i = 0; i < little_endian_array_.size(); i++) {
- const uint64_t right_value = right.little_endian_array_[i];
- uint64_t sum = right_value + carry;
- carry = 0;
- if (sum < right_value) {
- carry += 1;
- }
- sum += little_endian_array_[i];
- if (sum < little_endian_array_[i]) {
- carry += 1;
- }
- little_endian_array_[i] = sum;
- }
- return *this;
-}
-
-BasicDecimal256& BasicDecimal256::operator-=(const BasicDecimal256& right) {
- *this += -right;
- return *this;
-}
-
-BasicDecimal256& BasicDecimal256::operator<<=(uint32_t bits) {
- if (bits == 0) {
- return *this;
- }
- int cross_word_shift = bits / 64;
- if (static_cast<size_t>(cross_word_shift) >= little_endian_array_.size()) {
- little_endian_array_ = {0, 0, 0, 0};
- return *this;
- }
- uint32_t in_word_shift = bits % 64;
- for (int i = static_cast<int>(little_endian_array_.size() - 1); i >= cross_word_shift;
- i--) {
- // Account for shifts larger than 64 bits
- little_endian_array_[i] = little_endian_array_[i - cross_word_shift];
- little_endian_array_[i] <<= in_word_shift;
- if (in_word_shift != 0 && i >= cross_word_shift + 1) {
- little_endian_array_[i] |=
- little_endian_array_[i - (cross_word_shift + 1)] >> (64 - in_word_shift);
- }
- }
- for (int i = cross_word_shift - 1; i >= 0; i--) {
- little_endian_array_[i] = 0;
- }
- return *this;
-}
-
-std::array<uint8_t, 32> BasicDecimal256::ToBytes() const {
- std::array<uint8_t, 32> out{{0}};
- ToBytes(out.data());
- return out;
-}
-
-void BasicDecimal256::ToBytes(uint8_t* out) const {
- DCHECK_NE(out, nullptr);
-#if ARROW_LITTLE_ENDIAN
- reinterpret_cast<int64_t*>(out)[0] = little_endian_array_[0];
- reinterpret_cast<int64_t*>(out)[1] = little_endian_array_[1];
- reinterpret_cast<int64_t*>(out)[2] = little_endian_array_[2];
- reinterpret_cast<int64_t*>(out)[3] = little_endian_array_[3];
-#else
- reinterpret_cast<int64_t*>(out)[0] = little_endian_array_[3];
- reinterpret_cast<int64_t*>(out)[1] = little_endian_array_[2];
- reinterpret_cast<int64_t*>(out)[2] = little_endian_array_[1];
- reinterpret_cast<int64_t*>(out)[3] = little_endian_array_[0];
-#endif
-}
-
-BasicDecimal256& BasicDecimal256::operator*=(const BasicDecimal256& right) {
- // Since the max value of BasicDecimal256 is supposed to be 1e76 - 1 and the
- // min its negation, taking the absolute values here should always be safe.
- const bool negate = Sign() != right.Sign();
- BasicDecimal256 x = BasicDecimal256::Abs(*this);
- BasicDecimal256 y = BasicDecimal256::Abs(right);
-
- uint128_t r_hi;
- uint128_t r_lo;
- std::array<uint64_t, 4> res{0, 0, 0, 0};
- MultiplyUnsignedArray<4>(x.little_endian_array_, y.little_endian_array_, &res);
- little_endian_array_ = res;
- if (negate) {
- Negate();
- }
- return *this;
-}
-
-DecimalStatus BasicDecimal256::Divide(const BasicDecimal256& divisor,
- BasicDecimal256* result,
- BasicDecimal256* remainder) const {
- return DecimalDivide(*this, divisor, result, remainder);
-}
-
-DecimalStatus BasicDecimal256::Rescale(int32_t original_scale, int32_t new_scale,
- BasicDecimal256* out) const {
- return DecimalRescale(*this, original_scale, new_scale, out);
-}
-
-BasicDecimal256 BasicDecimal256::IncreaseScaleBy(int32_t increase_by) const {
- DCHECK_GE(increase_by, 0);
- DCHECK_LE(increase_by, 76);
-
- return (*this) * ScaleMultipliersDecimal256[increase_by];
-}
-
-BasicDecimal256 BasicDecimal256::ReduceScaleBy(int32_t reduce_by, bool round) const {
- DCHECK_GE(reduce_by, 0);
- DCHECK_LE(reduce_by, 76);
-
- if (reduce_by == 0) {
- return *this;
- }
-
- BasicDecimal256 divisor(ScaleMultipliersDecimal256[reduce_by]);
- BasicDecimal256 result;
- BasicDecimal256 remainder;
- auto s = Divide(divisor, &result, &remainder);
- DCHECK_EQ(s, DecimalStatus::kSuccess);
- if (round) {
- auto divisor_half = ScaleMultipliersHalfDecimal256[reduce_by];
- if (remainder.Abs() >= divisor_half) {
- if (result > 0) {
- result += 1;
- } else {
- result -= 1;
- }
- }
- }
- return result;
-}
-
-bool BasicDecimal256::FitsInPrecision(int32_t precision) const {
- DCHECK_GT(precision, 0);
- DCHECK_LE(precision, 76);
- return BasicDecimal256::Abs(*this) < ScaleMultipliersDecimal256[precision];
-}
-
-const BasicDecimal256& BasicDecimal256::GetScaleMultiplier(int32_t scale) {
- DCHECK_GE(scale, 0);
- DCHECK_LE(scale, 76);
-
- return ScaleMultipliersDecimal256[scale];
-}
-
-BasicDecimal256 operator*(const BasicDecimal256& left, const BasicDecimal256& right) {
- BasicDecimal256 result = left;
- result *= right;
- return result;
-}
-
-bool operator<(const BasicDecimal256& left, const BasicDecimal256& right) {
- const std::array<uint64_t, 4>& lhs = left.little_endian_array();
- const std::array<uint64_t, 4>& rhs = right.little_endian_array();
- return lhs[3] != rhs[3]
- ? static_cast<int64_t>(lhs[3]) < static_cast<int64_t>(rhs[3])
- : lhs[2] != rhs[2] ? lhs[2] < rhs[2]
- : lhs[1] != rhs[1] ? lhs[1] < rhs[1] : lhs[0] < rhs[0];
-}
-
-BasicDecimal256 operator-(const BasicDecimal256& operand) {
- BasicDecimal256 result(operand);
- return result.Negate();
-}
-
-BasicDecimal256 operator~(const BasicDecimal256& operand) {
- const std::array<uint64_t, 4>& arr = operand.little_endian_array();
- BasicDecimal256 result({~arr[0], ~arr[1], ~arr[2], ~arr[3]});
- return result;
-}
-
-BasicDecimal256& BasicDecimal256::operator/=(const BasicDecimal256& right) {
- BasicDecimal256 remainder;
- auto s = Divide(right, this, &remainder);
- DCHECK_EQ(s, DecimalStatus::kSuccess);
- return *this;
-}
-
-BasicDecimal256 operator+(const BasicDecimal256& left, const BasicDecimal256& right) {
- BasicDecimal256 sum = left;
- sum += right;
- return sum;
-}
-
-BasicDecimal256 operator/(const BasicDecimal256& left, const BasicDecimal256& right) {
- BasicDecimal256 remainder;
- BasicDecimal256 result;
- auto s = left.Divide(right, &result, &remainder);
- DCHECK_EQ(s, DecimalStatus::kSuccess);
- return result;
-}
-
+#if ARROW_LITTLE_ENDIAN
+BasicDecimal256::BasicDecimal256(const uint8_t* bytes)
+ : little_endian_array_(
+ std::array<uint64_t, 4>({reinterpret_cast<const uint64_t*>(bytes)[0],
+ reinterpret_cast<const uint64_t*>(bytes)[1],
+ reinterpret_cast<const uint64_t*>(bytes)[2],
+ reinterpret_cast<const uint64_t*>(bytes)[3]})) {}
+#else
+BasicDecimal256::BasicDecimal256(const uint8_t* bytes)
+ : little_endian_array_(
+ std::array<uint64_t, 4>({reinterpret_cast<const uint64_t*>(bytes)[3],
+ reinterpret_cast<const uint64_t*>(bytes)[2],
+ reinterpret_cast<const uint64_t*>(bytes)[1],
+ reinterpret_cast<const uint64_t*>(bytes)[0]})) {}
+#endif
+
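+// Two's complement negation with a rippling carry: e.g. negating {1, 0, 0, 0}
+// yields all-ones limbs (-1), and negating zero leaves zero because the carry
+// survives only while each complemented limb wraps around to 0.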
+BasicDecimal256& BasicDecimal256::Negate() {
+ uint64_t carry = 1;
+ for (uint64_t& elem : little_endian_array_) {
+ elem = ~elem + carry;
+ carry &= (elem == 0);
+ }
+ return *this;
+}
+
+BasicDecimal256& BasicDecimal256::Abs() { return *this < 0 ? Negate() : *this; }
+
+BasicDecimal256 BasicDecimal256::Abs(const BasicDecimal256& in) {
+ BasicDecimal256 result(in);
+ return result.Abs();
+}
+
+BasicDecimal256& BasicDecimal256::operator+=(const BasicDecimal256& right) {
+ uint64_t carry = 0;
+ for (size_t i = 0; i < little_endian_array_.size(); i++) {
+ const uint64_t right_value = right.little_endian_array_[i];
+ uint64_t sum = right_value + carry;
+ carry = 0;
+ if (sum < right_value) {
+ carry += 1;
+ }
+ sum += little_endian_array_[i];
+ if (sum < little_endian_array_[i]) {
+ carry += 1;
+ }
+ little_endian_array_[i] = sum;
+ }
+ return *this;
+}
+
+BasicDecimal256& BasicDecimal256::operator-=(const BasicDecimal256& right) {
+ *this += -right;
+ return *this;
+}
+
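+// For example, shifting {1, 0, 0, 0} left by 64 bits moves the value one whole
+// limb up to {0, 1, 0, 0}; shifting by 256 bits or more clears the value.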
+BasicDecimal256& BasicDecimal256::operator<<=(uint32_t bits) {
+ if (bits == 0) {
+ return *this;
+ }
+ int cross_word_shift = bits / 64;
+ if (static_cast<size_t>(cross_word_shift) >= little_endian_array_.size()) {
+ little_endian_array_ = {0, 0, 0, 0};
+ return *this;
+ }
+ uint32_t in_word_shift = bits % 64;
+ for (int i = static_cast<int>(little_endian_array_.size() - 1); i >= cross_word_shift;
+ i--) {
+ // Account for shifts larger than 64 bits
+ little_endian_array_[i] = little_endian_array_[i - cross_word_shift];
+ little_endian_array_[i] <<= in_word_shift;
+ if (in_word_shift != 0 && i >= cross_word_shift + 1) {
+ little_endian_array_[i] |=
+ little_endian_array_[i - (cross_word_shift + 1)] >> (64 - in_word_shift);
+ }
+ }
+ for (int i = cross_word_shift - 1; i >= 0; i--) {
+ little_endian_array_[i] = 0;
+ }
+ return *this;
+}
+
+std::array<uint8_t, 32> BasicDecimal256::ToBytes() const {
+ std::array<uint8_t, 32> out{{0}};
+ ToBytes(out.data());
+ return out;
+}
+
+void BasicDecimal256::ToBytes(uint8_t* out) const {
+ DCHECK_NE(out, nullptr);
+#if ARROW_LITTLE_ENDIAN
+ reinterpret_cast<int64_t*>(out)[0] = little_endian_array_[0];
+ reinterpret_cast<int64_t*>(out)[1] = little_endian_array_[1];
+ reinterpret_cast<int64_t*>(out)[2] = little_endian_array_[2];
+ reinterpret_cast<int64_t*>(out)[3] = little_endian_array_[3];
+#else
+ reinterpret_cast<int64_t*>(out)[0] = little_endian_array_[3];
+ reinterpret_cast<int64_t*>(out)[1] = little_endian_array_[2];
+ reinterpret_cast<int64_t*>(out)[2] = little_endian_array_[1];
+ reinterpret_cast<int64_t*>(out)[3] = little_endian_array_[0];
+#endif
+}
+
+BasicDecimal256& BasicDecimal256::operator*=(const BasicDecimal256& right) {
+ // Since the max value of BasicDecimal256 is supposed to be 1e76 - 1 and the
+ // min its negation, taking the absolute values here should always be safe.
+ const bool negate = Sign() != right.Sign();
+ BasicDecimal256 x = BasicDecimal256::Abs(*this);
+ BasicDecimal256 y = BasicDecimal256::Abs(right);
+
+ uint128_t r_hi;
+ uint128_t r_lo;
+ std::array<uint64_t, 4> res{0, 0, 0, 0};
+ MultiplyUnsignedArray<4>(x.little_endian_array_, y.little_endian_array_, &res);
+ little_endian_array_ = res;
+ if (negate) {
+ Negate();
+ }
+ return *this;
+}
+
+DecimalStatus BasicDecimal256::Divide(const BasicDecimal256& divisor,
+ BasicDecimal256* result,
+ BasicDecimal256* remainder) const {
+ return DecimalDivide(*this, divisor, result, remainder);
+}
+
+DecimalStatus BasicDecimal256::Rescale(int32_t original_scale, int32_t new_scale,
+ BasicDecimal256* out) const {
+ return DecimalRescale(*this, original_scale, new_scale, out);
+}
+
+BasicDecimal256 BasicDecimal256::IncreaseScaleBy(int32_t increase_by) const {
+ DCHECK_GE(increase_by, 0);
+ DCHECK_LE(increase_by, 76);
+
+ return (*this) * ScaleMultipliersDecimal256[increase_by];
+}
+
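+// For example, BasicDecimal256(12345).ReduceScaleBy(2) divides by 10^2 giving
+// 123 with remainder 45; |45| < 50 so no rounding occurs, while 12355 rounds
+// away from zero to 124 because its remainder 55 >= 50.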
+BasicDecimal256 BasicDecimal256::ReduceScaleBy(int32_t reduce_by, bool round) const {
+ DCHECK_GE(reduce_by, 0);
+ DCHECK_LE(reduce_by, 76);
+
+ if (reduce_by == 0) {
+ return *this;
+ }
+
+ BasicDecimal256 divisor(ScaleMultipliersDecimal256[reduce_by]);
+ BasicDecimal256 result;
+ BasicDecimal256 remainder;
+ auto s = Divide(divisor, &result, &remainder);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+ if (round) {
+ auto divisor_half = ScaleMultipliersHalfDecimal256[reduce_by];
+ if (remainder.Abs() >= divisor_half) {
+ if (result > 0) {
+ result += 1;
+ } else {
+ result -= 1;
+ }
+ }
+ }
+ return result;
+}
+
+bool BasicDecimal256::FitsInPrecision(int32_t precision) const {
+ DCHECK_GT(precision, 0);
+ DCHECK_LE(precision, 76);
+ return BasicDecimal256::Abs(*this) < ScaleMultipliersDecimal256[precision];
+}
+
+const BasicDecimal256& BasicDecimal256::GetScaleMultiplier(int32_t scale) {
+ DCHECK_GE(scale, 0);
+ DCHECK_LE(scale, 76);
+
+ return ScaleMultipliersDecimal256[scale];
+}
+
+BasicDecimal256 operator*(const BasicDecimal256& left, const BasicDecimal256& right) {
+ BasicDecimal256 result = left;
+ result *= right;
+ return result;
+}
+
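+// Compare limb-wise from the most significant end; the top limb is compared as
+// signed so that, e.g., BasicDecimal256(-1) (all-ones limbs) orders below
+// BasicDecimal256(0), while the lower limbs compare as unsigned.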
+bool operator<(const BasicDecimal256& left, const BasicDecimal256& right) {
+ const std::array<uint64_t, 4>& lhs = left.little_endian_array();
+ const std::array<uint64_t, 4>& rhs = right.little_endian_array();
+ return lhs[3] != rhs[3]
+ ? static_cast<int64_t>(lhs[3]) < static_cast<int64_t>(rhs[3])
+ : lhs[2] != rhs[2] ? lhs[2] < rhs[2]
+ : lhs[1] != rhs[1] ? lhs[1] < rhs[1] : lhs[0] < rhs[0];
+}
+
+BasicDecimal256 operator-(const BasicDecimal256& operand) {
+ BasicDecimal256 result(operand);
+ return result.Negate();
+}
+
+BasicDecimal256 operator~(const BasicDecimal256& operand) {
+ const std::array<uint64_t, 4>& arr = operand.little_endian_array();
+ BasicDecimal256 result({~arr[0], ~arr[1], ~arr[2], ~arr[3]});
+ return result;
+}
+
+BasicDecimal256& BasicDecimal256::operator/=(const BasicDecimal256& right) {
+ BasicDecimal256 remainder;
+ auto s = Divide(right, this, &remainder);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+ return *this;
+}
+
+BasicDecimal256 operator+(const BasicDecimal256& left, const BasicDecimal256& right) {
+ BasicDecimal256 sum = left;
+ sum += right;
+ return sum;
+}
+
+BasicDecimal256 operator/(const BasicDecimal256& left, const BasicDecimal256& right) {
+ BasicDecimal256 remainder;
+ BasicDecimal256 result;
+ auto s = left.Divide(right, &result, &remainder);
+ DCHECK_EQ(s, DecimalStatus::kSuccess);
+ return result;
+}
+
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h
index a8f61c73c87..acc8ea4930f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/basic_decimal.h
@@ -42,8 +42,8 @@ enum class DecimalStatus {
/// streams and boost.
class ARROW_EXPORT BasicDecimal128 {
public:
- static constexpr int bit_width = 128;
-
+ static constexpr int bit_width = 128;
+
/// \brief Create a BasicDecimal128 from the two's complement representation.
constexpr BasicDecimal128(int64_t high, uint64_t low) noexcept
: low_bits_(low), high_bits_(high) {}
@@ -111,10 +111,10 @@ class ARROW_EXPORT BasicDecimal128 {
BasicDecimal128& operator>>=(uint32_t bits);
/// \brief Get the high bits of the two's complement representation of the number.
- inline constexpr int64_t high_bits() const { return high_bits_; }
+ inline constexpr int64_t high_bits() const { return high_bits_; }
/// \brief Get the low bits of the two's complement representation of the number.
- inline constexpr uint64_t low_bits() const { return low_bits_; }
+ inline constexpr uint64_t low_bits() const { return low_bits_; }
/// \brief Return the raw bytes of the value in native-endian byte order.
std::array<uint8_t, 16> ToBytes() const;
@@ -180,163 +180,163 @@ ARROW_EXPORT BasicDecimal128 operator/(const BasicDecimal128& left,
ARROW_EXPORT BasicDecimal128 operator%(const BasicDecimal128& left,
const BasicDecimal128& right);
-class ARROW_EXPORT BasicDecimal256 {
- private:
- // Due to a bug in clang, we have to declare the extend method prior to its
- // usage.
- template <typename T>
- inline static constexpr uint64_t extend(T low_bits) noexcept {
- return low_bits >= T() ? uint64_t{0} : ~uint64_t{0};
- }
-
- public:
- static constexpr int bit_width = 256;
-
- /// \brief Create a BasicDecimal256 from the two's complement representation.
- constexpr BasicDecimal256(const std::array<uint64_t, 4>& little_endian_array) noexcept
- : little_endian_array_(little_endian_array) {}
-
- /// \brief Empty constructor creates a BasicDecimal256 with a value of 0.
- constexpr BasicDecimal256() noexcept : little_endian_array_({0, 0, 0, 0}) {}
-
- /// \brief Convert any integer value into a BasicDecimal256.
- template <typename T,
- typename = typename std::enable_if<
- std::is_integral<T>::value && (sizeof(T) <= sizeof(uint64_t)), T>::type>
- constexpr BasicDecimal256(T value) noexcept
- : little_endian_array_({static_cast<uint64_t>(value), extend(value), extend(value),
- extend(value)}) {}
-
- constexpr BasicDecimal256(const BasicDecimal128& value) noexcept
- : little_endian_array_({value.low_bits(), static_cast<uint64_t>(value.high_bits()),
- extend(value.high_bits()), extend(value.high_bits())}) {}
-
- /// \brief Create a BasicDecimal256 from an array of bytes. Bytes are assumed to be in
- /// native-endian byte order.
- explicit BasicDecimal256(const uint8_t* bytes);
-
- /// \brief Negate the current value (in-place)
- BasicDecimal256& Negate();
-
- /// \brief Absolute value (in-place)
- BasicDecimal256& Abs();
-
- /// \brief Absolute value
- static BasicDecimal256 Abs(const BasicDecimal256& left);
-
- /// \brief Add a number to this one. The result is truncated to 256 bits.
- BasicDecimal256& operator+=(const BasicDecimal256& right);
-
- /// \brief Subtract a number from this one. The result is truncated to 256 bits.
- BasicDecimal256& operator-=(const BasicDecimal256& right);
-
- /// \brief Get the bits of the two's complement representation of the number. The 4
- /// elements are in little endian order. The bits within each uint64_t element are in
- /// native endian order. For example,
- /// BasicDecimal256(123).little_endian_array() = {123, 0, 0, 0};
- /// BasicDecimal256(-2).little_endian_array() = {0xFF...FE, 0xFF...FF, 0xFF...FF,
- /// 0xFF...FF}.
- inline const std::array<uint64_t, 4>& little_endian_array() const {
- return little_endian_array_;
- }
-
- /// \brief Get the lowest bits of the two's complement representation of the number.
- inline constexpr uint64_t low_bits() const { return little_endian_array_[0]; }
-
- /// \brief Return the raw bytes of the value in native-endian byte order.
- std::array<uint8_t, 32> ToBytes() const;
- void ToBytes(uint8_t* out) const;
-
- /// \brief Scale multiplier for given scale value.
- static const BasicDecimal256& GetScaleMultiplier(int32_t scale);
-
- /// \brief Convert BasicDecimal256 from one scale to another
- DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
- BasicDecimal256* out) const;
-
- /// \brief Scale up.
- BasicDecimal256 IncreaseScaleBy(int32_t increase_by) const;
-
- /// \brief Scale down.
- /// - If 'round' is true, the right-most digits are dropped and the result value is
- /// rounded up (+1 for positive, -1 for negative) based on the value of the
- /// dropped digits (>= 10^reduce_by / 2).
- /// - If 'round' is false, the right-most digits are simply dropped.
- BasicDecimal256 ReduceScaleBy(int32_t reduce_by, bool round = true) const;
-
- /// \brief Whether this number fits in the given precision
- ///
- /// Return true if the number of significant digits is less or equal to `precision`.
- bool FitsInPrecision(int32_t precision) const;
-
- inline int64_t Sign() const {
- return 1 | (static_cast<int64_t>(little_endian_array_[3]) >> 63);
- }
-
- inline int64_t IsNegative() const {
- return static_cast<int64_t>(little_endian_array_[3]) < 0;
- }
-
- /// \brief Multiply this number by another number. The result is truncated to 256 bits.
- BasicDecimal256& operator*=(const BasicDecimal256& right);
-
- /// Divide this number by right and return the result.
- ///
- /// This operation is not destructive.
- /// The answer rounds to zero. Signs work like:
- /// 21 / 5 -> 4, 1
- /// -21 / 5 -> -4, -1
- /// 21 / -5 -> -4, 1
- /// -21 / -5 -> 4, -1
- /// \param[in] divisor the number to divide by
- /// \param[out] result the quotient
- /// \param[out] remainder the remainder after the division
- DecimalStatus Divide(const BasicDecimal256& divisor, BasicDecimal256* result,
- BasicDecimal256* remainder) const;
-
- /// \brief Shift left by the given number of bits.
- BasicDecimal256& operator<<=(uint32_t bits);
-
- /// \brief In-place division.
- BasicDecimal256& operator/=(const BasicDecimal256& right);
-
- private:
- std::array<uint64_t, 4> little_endian_array_;
-};
-
-ARROW_EXPORT inline bool operator==(const BasicDecimal256& left,
- const BasicDecimal256& right) {
- return left.little_endian_array() == right.little_endian_array();
-}
-
-ARROW_EXPORT inline bool operator!=(const BasicDecimal256& left,
- const BasicDecimal256& right) {
- return left.little_endian_array() != right.little_endian_array();
-}
-
-ARROW_EXPORT bool operator<(const BasicDecimal256& left, const BasicDecimal256& right);
-
-ARROW_EXPORT inline bool operator<=(const BasicDecimal256& left,
- const BasicDecimal256& right) {
- return !operator<(right, left);
-}
-
-ARROW_EXPORT inline bool operator>(const BasicDecimal256& left,
- const BasicDecimal256& right) {
- return operator<(right, left);
-}
-
-ARROW_EXPORT inline bool operator>=(const BasicDecimal256& left,
- const BasicDecimal256& right) {
- return !operator<(left, right);
-}
-
-ARROW_EXPORT BasicDecimal256 operator-(const BasicDecimal256& operand);
-ARROW_EXPORT BasicDecimal256 operator~(const BasicDecimal256& operand);
-ARROW_EXPORT BasicDecimal256 operator+(const BasicDecimal256& left,
- const BasicDecimal256& right);
-ARROW_EXPORT BasicDecimal256 operator*(const BasicDecimal256& left,
- const BasicDecimal256& right);
-ARROW_EXPORT BasicDecimal256 operator/(const BasicDecimal256& left,
- const BasicDecimal256& right);
+class ARROW_EXPORT BasicDecimal256 {
+ private:
+ // Due to a bug in clang, we have to declare the extend method prior to its
+ // usage.
+ template <typename T>
+ inline static constexpr uint64_t extend(T low_bits) noexcept {
+ return low_bits >= T() ? uint64_t{0} : ~uint64_t{0};
+ }
+
+ public:
+ static constexpr int bit_width = 256;
+
+ /// \brief Create a BasicDecimal256 from the two's complement representation.
+ constexpr BasicDecimal256(const std::array<uint64_t, 4>& little_endian_array) noexcept
+ : little_endian_array_(little_endian_array) {}
+
+ /// \brief Default constructor creates a BasicDecimal256 with a value of 0.
+ constexpr BasicDecimal256() noexcept : little_endian_array_({0, 0, 0, 0}) {}
+
+ /// \brief Convert any integer value into a BasicDecimal256.
+ template <typename T,
+ typename = typename std::enable_if<
+ std::is_integral<T>::value && (sizeof(T) <= sizeof(uint64_t)), T>::type>
+ constexpr BasicDecimal256(T value) noexcept
+ : little_endian_array_({static_cast<uint64_t>(value), extend(value), extend(value),
+ extend(value)}) {}
+
+ constexpr BasicDecimal256(const BasicDecimal128& value) noexcept
+ : little_endian_array_({value.low_bits(), static_cast<uint64_t>(value.high_bits()),
+ extend(value.high_bits()), extend(value.high_bits())}) {}
+
+ /// \brief Create a BasicDecimal256 from an array of bytes. Bytes are assumed to be in
+ /// native-endian byte order.
+ explicit BasicDecimal256(const uint8_t* bytes);
+
+ /// \brief Negate the current value (in-place)
+ BasicDecimal256& Negate();
+
+ /// \brief Absolute value (in-place)
+ BasicDecimal256& Abs();
+
+ /// \brief Absolute value
+ static BasicDecimal256 Abs(const BasicDecimal256& left);
+
+ /// \brief Add a number to this one. The result is truncated to 256 bits.
+ BasicDecimal256& operator+=(const BasicDecimal256& right);
+
+ /// \brief Subtract a number from this one. The result is truncated to 256 bits.
+ BasicDecimal256& operator-=(const BasicDecimal256& right);
+
+ /// \brief Get the bits of the two's complement representation of the number. The 4
+ /// elements are in little endian order. The bits within each uint64_t element are in
+ /// native endian order. For example,
+ /// BasicDecimal256(123).little_endian_array() = {123, 0, 0, 0};
+ /// BasicDecimal256(-2).little_endian_array() = {0xFF...FE, 0xFF...FF, 0xFF...FF,
+ /// 0xFF...FF}.
+ inline const std::array<uint64_t, 4>& little_endian_array() const {
+ return little_endian_array_;
+ }
+
+ /// \brief Get the lowest bits of the two's complement representation of the number.
+ inline constexpr uint64_t low_bits() const { return little_endian_array_[0]; }
+
+ /// \brief Return the raw bytes of the value in native-endian byte order.
+ std::array<uint8_t, 32> ToBytes() const;
+ void ToBytes(uint8_t* out) const;
+
+ /// \brief Scale multiplier for given scale value.
+ static const BasicDecimal256& GetScaleMultiplier(int32_t scale);
+
+ /// \brief Convert BasicDecimal256 from one scale to another
+ DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
+ BasicDecimal256* out) const;
+
+ /// \brief Scale up.
+ BasicDecimal256 IncreaseScaleBy(int32_t increase_by) const;
+
+ /// \brief Scale down.
+ /// - If 'round' is true, the right-most digits are dropped and the result value is
+ ///   rounded away from zero (+1 for positive, -1 for negative) when the value of
+ ///   the dropped digits is >= 10^reduce_by / 2.
+ /// - If 'round' is false, the right-most digits are simply dropped.
+ BasicDecimal256 ReduceScaleBy(int32_t reduce_by, bool round = true) const;
+
+ /// \brief Whether this number fits in the given precision
+ ///
+ /// Return true if the number of significant digits is less than or equal to `precision`.
+ bool FitsInPrecision(int32_t precision) const;
+
+ inline int64_t Sign() const {
+ return 1 | (static_cast<int64_t>(little_endian_array_[3]) >> 63);
+ }
+
+ inline int64_t IsNegative() const {
+ return static_cast<int64_t>(little_endian_array_[3]) < 0;
+ }
+
+ /// \brief Multiply this number by another number. The result is truncated to 256 bits.
+ BasicDecimal256& operator*=(const BasicDecimal256& right);
+
+ /// Divide this number by the given divisor and return the result.
+ ///
+ /// This operation is not destructive.
+ /// The quotient rounds toward zero. Signs work like:
+ /// 21 / 5 -> 4, 1
+ /// -21 / 5 -> -4, -1
+ /// 21 / -5 -> -4, 1
+ /// -21 / -5 -> 4, -1
+ /// \param[in] divisor the number to divide by
+ /// \param[out] result the quotient
+ /// \param[out] remainder the remainder after the division
+ DecimalStatus Divide(const BasicDecimal256& divisor, BasicDecimal256* result,
+ BasicDecimal256* remainder) const;
+
+ /// \brief Shift left by the given number of bits.
+ BasicDecimal256& operator<<=(uint32_t bits);
+
+ /// \brief In-place division.
+ BasicDecimal256& operator/=(const BasicDecimal256& right);
+
+ private:
+ std::array<uint64_t, 4> little_endian_array_;
+};
+
+ARROW_EXPORT inline bool operator==(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return left.little_endian_array() == right.little_endian_array();
+}
+
+ARROW_EXPORT inline bool operator!=(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return left.little_endian_array() != right.little_endian_array();
+}
+
+ARROW_EXPORT bool operator<(const BasicDecimal256& left, const BasicDecimal256& right);
+
+ARROW_EXPORT inline bool operator<=(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return !operator<(right, left);
+}
+
+ARROW_EXPORT inline bool operator>(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return operator<(right, left);
+}
+
+ARROW_EXPORT inline bool operator>=(const BasicDecimal256& left,
+ const BasicDecimal256& right) {
+ return !operator<(left, right);
+}
+
+ARROW_EXPORT BasicDecimal256 operator-(const BasicDecimal256& operand);
+ARROW_EXPORT BasicDecimal256 operator~(const BasicDecimal256& operand);
+ARROW_EXPORT BasicDecimal256 operator+(const BasicDecimal256& left,
+ const BasicDecimal256& right);
+ARROW_EXPORT BasicDecimal256 operator*(const BasicDecimal256& left,
+ const BasicDecimal256& right);
+ARROW_EXPORT BasicDecimal256 operator/(const BasicDecimal256& left,
+ const BasicDecimal256& right);
} // namespace arrow
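The division contract documented on BasicDecimal256::Divide above (quotient truncated toward zero, remainder taking the dividend's sign) is the same rule C++ built-in integer division follows, so the sign table can be verified with a minimal standalone sketch in plain int64_t arithmetic; this illustrates the documented semantics rather than the Arrow API itself:

    #include <cassert>
    #include <cstdint>

    int main() {
      struct Case { int64_t n, d, q, r; };
      // The four sign combinations from the Divide doc comment: 21 / 5 -> 4, 1 etc.
      const Case cases[] = {{21, 5, 4, 1}, {-21, 5, -4, -1}, {21, -5, -4, 1}, {-21, -5, 4, -1}};
      for (const Case& c : cases) {
        assert(c.n / c.d == c.q);        // built-in division truncates toward zero
        assert(c.n % c.d == c.r);        // remainder keeps the dividend's sign
        assert(c.q * c.d + c.r == c.n);  // invariant tying quotient and remainder together
      }
      return 0;
    }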
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc
index c7c97676f7c..c67cedc4a06 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.cc
@@ -27,7 +27,7 @@
namespace arrow {
namespace internal {
-BitBlockCount BitBlockCounter::GetBlockSlow(int64_t block_size) noexcept {
+BitBlockCount BitBlockCounter::GetBlockSlow(int64_t block_size) noexcept {
const int16_t run_length = static_cast<int16_t>(std::min(bits_remaining_, block_size));
int16_t popcount = static_cast<int16_t>(CountSetBits(bitmap_, offset_, run_length));
bits_remaining_ -= run_length;
@@ -37,11 +37,11 @@ BitBlockCount BitBlockCounter::GetBlockSlow(int64_t block_size) noexcept {
return {run_length, popcount};
}
-// Prevent pointer arithmetic on nullptr, which is undefined behavior even if the pointer
-// is never dereferenced.
-inline const uint8_t* EnsureNotNull(const uint8_t* ptr) {
- static const uint8_t byte{};
- return ptr == nullptr ? &byte : ptr;
+// Prevent pointer arithmetic on nullptr, which is undefined behavior even if the pointer
+// is never dereferenced.
+inline const uint8_t* EnsureNotNull(const uint8_t* ptr) {
+ static const uint8_t byte{};
+ return ptr == nullptr ? &byte : ptr;
}
OptionalBitBlockCounter::OptionalBitBlockCounter(const uint8_t* validity_bitmap,
@@ -49,7 +49,7 @@ OptionalBitBlockCounter::OptionalBitBlockCounter(const uint8_t* validity_bitmap,
: has_bitmap_(validity_bitmap != nullptr),
position_(0),
length_(length),
- counter_(EnsureNotNull(validity_bitmap), offset, length) {}
+ counter_(EnsureNotNull(validity_bitmap), offset, length) {}
OptionalBitBlockCounter::OptionalBitBlockCounter(
const std::shared_ptr<Buffer>& validity_bitmap, int64_t offset, int64_t length)
@@ -64,10 +64,10 @@ OptionalBinaryBitBlockCounter::OptionalBinaryBitBlockCounter(const uint8_t* left
: has_bitmap_(HasBitmapFromBitmaps(left_bitmap != nullptr, right_bitmap != nullptr)),
position_(0),
length_(length),
- unary_counter_(EnsureNotNull(left_bitmap != nullptr ? left_bitmap : right_bitmap),
+ unary_counter_(EnsureNotNull(left_bitmap != nullptr ? left_bitmap : right_bitmap),
left_bitmap != nullptr ? left_offset : right_offset, length),
- binary_counter_(EnsureNotNull(left_bitmap), left_offset,
- EnsureNotNull(right_bitmap), right_offset, length) {}
+ binary_counter_(EnsureNotNull(left_bitmap), left_offset,
+ EnsureNotNull(right_bitmap), right_offset, length) {}
OptionalBinaryBitBlockCounter::OptionalBinaryBitBlockCounter(
const std::shared_ptr<Buffer>& left_bitmap, int64_t left_offset,
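The EnsureNotNull helper in the hunk above exists because pointer arithmetic on nullptr is undefined behavior even when the result is never dereferenced. A self-contained sketch of the same pattern, with the helper copied from the diff for illustration rather than taken from a public Arrow header:

    #include <cstdint>

    // Substitute a static dummy byte for nullptr so that later pointer
    // arithmetic on the bitmap stays well defined; the dummy is never read
    // because callers consult has_bitmap_ first.
    inline const uint8_t* EnsureNotNull(const uint8_t* ptr) {
      static const uint8_t byte{};
      return ptr == nullptr ? &byte : ptr;
    }

    int main() {
      const uint8_t* bitmap = nullptr;
      const uint8_t* safe = EnsureNotNull(bitmap);
      const uint8_t* positioned = safe + 0;  // well defined even though bitmap was null
      (void)positioned;
      return 0;
    }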
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h
index 5a14031cf0e..63036af52a4 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_block_counter.h
@@ -25,26 +25,26 @@
#include "arrow/buffer.h"
#include "arrow/status.h"
#include "arrow/util/bit_util.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/macros.h"
-#include "arrow/util/ubsan.h"
+#include "arrow/util/ubsan.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace internal {
namespace detail {
-inline uint64_t LoadWord(const uint8_t* bytes) {
- return BitUtil::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
-}
-
-inline uint64_t ShiftWord(uint64_t current, uint64_t next, int64_t shift) {
- if (shift == 0) {
- return current;
- }
- return (current >> shift) | (next << (64 - shift));
-}
-
+inline uint64_t LoadWord(const uint8_t* bytes) {
+ return BitUtil::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
+}
+
+inline uint64_t ShiftWord(uint64_t current, uint64_t next, int64_t shift) {
+ if (shift == 0) {
+ return current;
+ }
+ return (current >> shift) | (next << (64 - shift));
+}
+
// These templates are here to help with unit tests
template <typename T>
@@ -58,16 +58,16 @@ struct BitBlockAnd<bool> {
};
template <typename T>
-struct BitBlockAndNot {
- static T Call(T left, T right) { return left & ~right; }
-};
-
-template <>
-struct BitBlockAndNot<bool> {
- static bool Call(bool left, bool right) { return left && !right; }
-};
-
-template <typename T>
+struct BitBlockAndNot {
+ static T Call(T left, T right) { return left & ~right; }
+};
+
+template <>
+struct BitBlockAndNot<bool> {
+ static bool Call(bool left, bool right) { return left && !right; }
+};
+
+template <typename T>
struct BitBlockOr {
static T Call(T left, T right) { return left | right; }
};
@@ -120,82 +120,82 @@ class ARROW_EXPORT BitBlockCounter {
/// block will have a length less than 256 if the bitmap length is not a
/// multiple of 256, and will return 0-length blocks in subsequent
/// invocations.
- BitBlockCount NextFourWords() {
- using detail::LoadWord;
- using detail::ShiftWord;
-
- if (!bits_remaining_) {
- return {0, 0};
- }
- int64_t total_popcount = 0;
- if (offset_ == 0) {
- if (bits_remaining_ < kFourWordsBits) {
- return GetBlockSlow(kFourWordsBits);
- }
- total_popcount += BitUtil::PopCount(LoadWord(bitmap_));
- total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 8));
- total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 16));
- total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 24));
- } else {
- // When the offset is > 0, we need there to be a word beyond the last
- // aligned word in the bitmap for the bit shifting logic.
- if (bits_remaining_ < 5 * kFourWordsBits - offset_) {
- return GetBlockSlow(kFourWordsBits);
- }
- auto current = LoadWord(bitmap_);
- auto next = LoadWord(bitmap_ + 8);
- total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
- current = next;
- next = LoadWord(bitmap_ + 16);
- total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
- current = next;
- next = LoadWord(bitmap_ + 24);
- total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
- current = next;
- next = LoadWord(bitmap_ + 32);
- total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
- }
- bitmap_ += BitUtil::BytesForBits(kFourWordsBits);
- bits_remaining_ -= kFourWordsBits;
- return {256, static_cast<int16_t>(total_popcount)};
- }
-
+ BitBlockCount NextFourWords() {
+ using detail::LoadWord;
+ using detail::ShiftWord;
+
+ if (!bits_remaining_) {
+ return {0, 0};
+ }
+ int64_t total_popcount = 0;
+ if (offset_ == 0) {
+ if (bits_remaining_ < kFourWordsBits) {
+ return GetBlockSlow(kFourWordsBits);
+ }
+ total_popcount += BitUtil::PopCount(LoadWord(bitmap_));
+ total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 8));
+ total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 16));
+ total_popcount += BitUtil::PopCount(LoadWord(bitmap_ + 24));
+ } else {
+ // When the offset is > 0, we need there to be a word beyond the last
+ // aligned word in the bitmap for the bit shifting logic.
+ if (bits_remaining_ < 5 * kFourWordsBits - offset_) {
+ return GetBlockSlow(kFourWordsBits);
+ }
+ auto current = LoadWord(bitmap_);
+ auto next = LoadWord(bitmap_ + 8);
+ total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
+ current = next;
+ next = LoadWord(bitmap_ + 16);
+ total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
+ current = next;
+ next = LoadWord(bitmap_ + 24);
+ total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
+ current = next;
+ next = LoadWord(bitmap_ + 32);
+ total_popcount += BitUtil::PopCount(ShiftWord(current, next, offset_));
+ }
+ bitmap_ += BitUtil::BytesForBits(kFourWordsBits);
+ bits_remaining_ -= kFourWordsBits;
+ return {256, static_cast<int16_t>(total_popcount)};
+ }
+
/// \brief Return the next run of available bits, usually 64. The returned
/// pair contains the size of run and the number of true values. The last
/// block will have a length less than 64 if the bitmap length is not a
/// multiple of 64, and will return 0-length blocks in subsequent
/// invocations.
- BitBlockCount NextWord() {
- using detail::LoadWord;
- using detail::ShiftWord;
-
- if (!bits_remaining_) {
- return {0, 0};
- }
- int64_t popcount = 0;
- if (offset_ == 0) {
- if (bits_remaining_ < kWordBits) {
- return GetBlockSlow(kWordBits);
- }
- popcount = BitUtil::PopCount(LoadWord(bitmap_));
- } else {
- // When the offset is > 0, we need there to be a word beyond the last
- // aligned word in the bitmap for the bit shifting logic.
- if (bits_remaining_ < 2 * kWordBits - offset_) {
- return GetBlockSlow(kWordBits);
- }
- popcount =
- BitUtil::PopCount(ShiftWord(LoadWord(bitmap_), LoadWord(bitmap_ + 8), offset_));
- }
- bitmap_ += kWordBits / 8;
- bits_remaining_ -= kWordBits;
- return {64, static_cast<int16_t>(popcount)};
- }
-
+ BitBlockCount NextWord() {
+ using detail::LoadWord;
+ using detail::ShiftWord;
+
+ if (!bits_remaining_) {
+ return {0, 0};
+ }
+ int64_t popcount = 0;
+ if (offset_ == 0) {
+ if (bits_remaining_ < kWordBits) {
+ return GetBlockSlow(kWordBits);
+ }
+ popcount = BitUtil::PopCount(LoadWord(bitmap_));
+ } else {
+ // When the offset is > 0, we need there to be a word beyond the last
+ // aligned word in the bitmap for the bit shifting logic.
+ if (bits_remaining_ < 2 * kWordBits - offset_) {
+ return GetBlockSlow(kWordBits);
+ }
+ popcount =
+ BitUtil::PopCount(ShiftWord(LoadWord(bitmap_), LoadWord(bitmap_ + 8), offset_));
+ }
+ bitmap_ += kWordBits / 8;
+ bits_remaining_ -= kWordBits;
+ return {64, static_cast<int16_t>(popcount)};
+ }
+
private:
/// \brief Return block with the requested size when doing word-wise
/// computation is not possible due to inadequate bits remaining.
- BitBlockCount GetBlockSlow(int64_t block_size) noexcept;
+ BitBlockCount GetBlockSlow(int64_t block_size) noexcept;
const uint8_t* bitmap_;
int64_t bits_remaining_;
@@ -274,67 +274,67 @@ class ARROW_EXPORT BinaryBitBlockCounter {
/// the number of true values. The last block will have a length less than 64
/// if the bitmap length is not a multiple of 64, and will return 0-length
/// blocks in subsequent invocations.
- BitBlockCount NextAndWord() { return NextWord<detail::BitBlockAnd>(); }
+ BitBlockCount NextAndWord() { return NextWord<detail::BitBlockAnd>(); }
+
+ /// \brief Computes "x & ~y" block for each available run of bits.
+ BitBlockCount NextAndNotWord() { return NextWord<detail::BitBlockAndNot>(); }
- /// \brief Computes "x & ~y" block for each available run of bits.
- BitBlockCount NextAndNotWord() { return NextWord<detail::BitBlockAndNot>(); }
-
/// \brief Computes "x | y" block for each available run of bits.
- BitBlockCount NextOrWord() { return NextWord<detail::BitBlockOr>(); }
+ BitBlockCount NextOrWord() { return NextWord<detail::BitBlockOr>(); }
/// \brief Computes "x | ~y" block for each available run of bits.
- BitBlockCount NextOrNotWord() { return NextWord<detail::BitBlockOrNot>(); }
+ BitBlockCount NextOrNotWord() { return NextWord<detail::BitBlockOrNot>(); }
private:
template <template <typename T> class Op>
- BitBlockCount NextWord() {
- using detail::LoadWord;
- using detail::ShiftWord;
-
- if (!bits_remaining_) {
- return {0, 0};
- }
- // When the offset is > 0, we need there to be a word beyond the last aligned
- // word in the bitmap for the bit shifting logic.
- constexpr int64_t kWordBits = BitBlockCounter::kWordBits;
- const int64_t bits_required_to_use_words =
- std::max(left_offset_ == 0 ? 64 : 64 + (64 - left_offset_),
- right_offset_ == 0 ? 64 : 64 + (64 - right_offset_));
- if (bits_remaining_ < bits_required_to_use_words) {
- const int16_t run_length =
- static_cast<int16_t>(std::min(bits_remaining_, kWordBits));
- int16_t popcount = 0;
- for (int64_t i = 0; i < run_length; ++i) {
- if (Op<bool>::Call(BitUtil::GetBit(left_bitmap_, left_offset_ + i),
- BitUtil::GetBit(right_bitmap_, right_offset_ + i))) {
- ++popcount;
- }
- }
- // This code path should trigger _at most_ 2 times. In the "two times"
- // case, the first time the run length will be a multiple of 8.
- left_bitmap_ += run_length / 8;
- right_bitmap_ += run_length / 8;
- bits_remaining_ -= run_length;
- return {run_length, popcount};
- }
-
- int64_t popcount = 0;
- if (left_offset_ == 0 && right_offset_ == 0) {
- popcount = BitUtil::PopCount(
- Op<uint64_t>::Call(LoadWord(left_bitmap_), LoadWord(right_bitmap_)));
- } else {
- auto left_word =
- ShiftWord(LoadWord(left_bitmap_), LoadWord(left_bitmap_ + 8), left_offset_);
- auto right_word =
- ShiftWord(LoadWord(right_bitmap_), LoadWord(right_bitmap_ + 8), right_offset_);
- popcount = BitUtil::PopCount(Op<uint64_t>::Call(left_word, right_word));
- }
- left_bitmap_ += kWordBits / 8;
- right_bitmap_ += kWordBits / 8;
- bits_remaining_ -= kWordBits;
- return {64, static_cast<int16_t>(popcount)};
- }
-
+ BitBlockCount NextWord() {
+ using detail::LoadWord;
+ using detail::ShiftWord;
+
+ if (!bits_remaining_) {
+ return {0, 0};
+ }
+ // When the offset is > 0, we need there to be a word beyond the last aligned
+ // word in the bitmap for the bit shifting logic.
+ constexpr int64_t kWordBits = BitBlockCounter::kWordBits;
+ const int64_t bits_required_to_use_words =
+ std::max(left_offset_ == 0 ? 64 : 64 + (64 - left_offset_),
+ right_offset_ == 0 ? 64 : 64 + (64 - right_offset_));
+ if (bits_remaining_ < bits_required_to_use_words) {
+ const int16_t run_length =
+ static_cast<int16_t>(std::min(bits_remaining_, kWordBits));
+ int16_t popcount = 0;
+ for (int64_t i = 0; i < run_length; ++i) {
+ if (Op<bool>::Call(BitUtil::GetBit(left_bitmap_, left_offset_ + i),
+ BitUtil::GetBit(right_bitmap_, right_offset_ + i))) {
+ ++popcount;
+ }
+ }
+ // This code path should trigger _at most_ 2 times. In the "two times"
+ // case, the first time the run length will be a multiple of 8.
+ left_bitmap_ += run_length / 8;
+ right_bitmap_ += run_length / 8;
+ bits_remaining_ -= run_length;
+ return {run_length, popcount};
+ }
+
+ int64_t popcount = 0;
+ if (left_offset_ == 0 && right_offset_ == 0) {
+ popcount = BitUtil::PopCount(
+ Op<uint64_t>::Call(LoadWord(left_bitmap_), LoadWord(right_bitmap_)));
+ } else {
+ auto left_word =
+ ShiftWord(LoadWord(left_bitmap_), LoadWord(left_bitmap_ + 8), left_offset_);
+ auto right_word =
+ ShiftWord(LoadWord(right_bitmap_), LoadWord(right_bitmap_ + 8), right_offset_);
+ popcount = BitUtil::PopCount(Op<uint64_t>::Call(left_word, right_word));
+ }
+ left_bitmap_ += kWordBits / 8;
+ right_bitmap_ += kWordBits / 8;
+ bits_remaining_ -= kWordBits;
+ return {64, static_cast<int16_t>(popcount)};
+ }
+
const uint8_t* left_bitmap_;
int64_t left_offset_;
const uint8_t* right_bitmap_;
@@ -379,30 +379,30 @@ class ARROW_EXPORT OptionalBinaryBitBlockCounter {
}
}
- BitBlockCount NextOrNotBlock() {
- static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
- switch (has_bitmap_) {
- case HasBitmap::BOTH: {
- BitBlockCount block = binary_counter_.NextOrNotWord();
- position_ += block.length;
- return block;
- }
- case HasBitmap::ONE: {
- BitBlockCount block = unary_counter_.NextWord();
- position_ += block.length;
- return block;
- }
- case HasBitmap::NONE:
- default: {
- const int16_t block_size =
- static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
- position_ += block_size;
- // All values are non-null
- return {block_size, block_size};
- }
- }
- }
-
+ BitBlockCount NextOrNotBlock() {
+ static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
+ switch (has_bitmap_) {
+ case HasBitmap::BOTH: {
+ BitBlockCount block = binary_counter_.NextOrNotWord();
+ position_ += block.length;
+ return block;
+ }
+ case HasBitmap::ONE: {
+ BitBlockCount block = unary_counter_.NextWord();
+ position_ += block.length;
+ return block;
+ }
+ case HasBitmap::NONE:
+ default: {
+ const int16_t block_size =
+ static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
+ position_ += block_size;
+ // All values are non-null
+ return {block_size, block_size};
+ }
+ }
+ }
+
private:
enum class HasBitmap : int { BOTH, ONE, NONE };
@@ -427,9 +427,9 @@ class ARROW_EXPORT OptionalBinaryBitBlockCounter {
// Functional-style bit block visitors.
template <typename VisitNotNull, typename VisitNull>
-static Status VisitBitBlocks(const std::shared_ptr<Buffer>& bitmap_buf, int64_t offset,
- int64_t length, VisitNotNull&& visit_not_null,
- VisitNull&& visit_null) {
+static Status VisitBitBlocks(const std::shared_ptr<Buffer>& bitmap_buf, int64_t offset,
+ int64_t length, VisitNotNull&& visit_not_null,
+ VisitNull&& visit_null) {
const uint8_t* bitmap = NULLPTR;
if (bitmap_buf != NULLPTR) {
bitmap = bitmap_buf->data();
@@ -460,9 +460,9 @@ static Status VisitBitBlocks(const std::shared_ptr<Buffer>& bitmap_buf, int64_t
}
template <typename VisitNotNull, typename VisitNull>
-static void VisitBitBlocksVoid(const std::shared_ptr<Buffer>& bitmap_buf, int64_t offset,
- int64_t length, VisitNotNull&& visit_not_null,
- VisitNull&& visit_null) {
+static void VisitBitBlocksVoid(const std::shared_ptr<Buffer>& bitmap_buf, int64_t offset,
+ int64_t length, VisitNotNull&& visit_not_null,
+ VisitNull&& visit_null) {
const uint8_t* bitmap = NULLPTR;
if (bitmap_buf != NULLPTR) {
bitmap = bitmap_buf->data();
@@ -492,11 +492,11 @@ static void VisitBitBlocksVoid(const std::shared_ptr<Buffer>& bitmap_buf, int64_
}
template <typename VisitNotNull, typename VisitNull>
-static void VisitTwoBitBlocksVoid(const std::shared_ptr<Buffer>& left_bitmap_buf,
- int64_t left_offset,
- const std::shared_ptr<Buffer>& right_bitmap_buf,
- int64_t right_offset, int64_t length,
- VisitNotNull&& visit_not_null, VisitNull&& visit_null) {
+static void VisitTwoBitBlocksVoid(const std::shared_ptr<Buffer>& left_bitmap_buf,
+ int64_t left_offset,
+ const std::shared_ptr<Buffer>& right_bitmap_buf,
+ int64_t right_offset, int64_t length,
+ VisitNotNull&& visit_not_null, VisitNull&& visit_null) {
if (left_bitmap_buf == NULLPTR || right_bitmap_buf == NULLPTR) {
// At most one bitmap is present
if (left_bitmap_buf == NULLPTR) {
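As a usage sketch for the counters defined in this header: callers loop on NextWord() until a zero-length BitBlockCount comes back, here simply totalling the popcounts. The include path and namespace follow the diff; treat the function as an illustration, not canonical Arrow usage:

    #include <cstdint>

    #include "arrow/util/bit_block_counter.h"  // assumed include path, per the diff

    // Total the set bits of a bitmap by consuming 64-bit blocks until the
    // counter reports a zero-length block.
    int64_t CountSetBitsViaBlocks(const uint8_t* bitmap, int64_t offset, int64_t length) {
      arrow::internal::BitBlockCounter counter(bitmap, offset, length);
      int64_t total = 0;
      while (true) {
        const arrow::internal::BitBlockCount block = counter.NextWord();
        if (block.length == 0) break;  // bitmap exhausted
        total += block.popcount;
      }
      return total;
    }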
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc
index 1114ec61f19..eda6088eb32 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.cc
@@ -45,7 +45,7 @@ BitRunReader::BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t
// Prepare for inversion in NextRun.
// Clear out any preceding bits.
- word_ = word_ & ~BitUtil::LeastSignificantBitMask(position_);
+ word_ = word_ & ~BitUtil::LeastSignificantBitMask(position_);
}
#endif
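For reference, LeastSignificantBitMask(n) in the hunk above yields a word whose n lowest bits are set, so AND-ing with its complement clears the bits that precede the current position. A standalone sketch with the mask helper reimplemented locally (the n >= 64 guard is this sketch's own addition, not part of the header):

    #include <cassert>
    #include <cstdint>

    // Local stand-in for BitUtil::LeastSignificantBitMask: the n lowest bits set.
    constexpr uint64_t LeastSignificantBitMask(int64_t n) {
      return n >= 64 ? ~uint64_t{0} : (uint64_t{1} << n) - 1;
    }

    int main() {
      uint64_t word = 0xFF;                 // bits 0..7 set
      word &= ~LeastSignificantBitMask(4);  // clear the 4 bits before the position
      assert(word == 0xF0);
      return 0;
    }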
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h
index 10155687a20..3e196628477 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_run_reader.h
@@ -17,14 +17,14 @@
#pragma once
-#include <cassert>
+#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_reader.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
@@ -42,14 +42,14 @@ struct BitRun {
}
};
-inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
+inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
return lhs.length == rhs.length && lhs.set == rhs.set;
}
-inline bool operator!=(const BitRun& lhs, const BitRun& rhs) {
- return lhs.length != rhs.length || lhs.set != rhs.set;
-}
-
+inline bool operator!=(const BitRun& lhs, const BitRun& rhs) {
+ return lhs.length != rhs.length || lhs.set != rhs.set;
+}
+
class BitRunReaderLinear {
public:
BitRunReaderLinear(const uint8_t* bitmap, int64_t start_offset, int64_t length)
@@ -70,7 +70,7 @@ class BitRunReaderLinear {
};
#if ARROW_LITTLE_ENDIAN
-/// A convenience class for counting the number of contiguous set/unset bits
+/// A convenience class for counting the number of contiguous set/unset bits
/// in a bitmap.
class ARROW_EXPORT BitRunReader {
public:
@@ -102,7 +102,7 @@ class ARROW_EXPORT BitRunReader {
int64_t start_bit_offset = start_position & 63;
// Invert the word for proper use of CountTrailingZeros and
    // clear bits so CountTrailingZeros can do its magic.
- word_ = ~word_ & ~BitUtil::LeastSignificantBitMask(start_bit_offset);
+ word_ = ~word_ & ~BitUtil::LeastSignificantBitMask(start_bit_offset);
// Go forward until the next change from unset to set.
int64_t new_bits = BitUtil::CountTrailingZeros(word_) - start_bit_offset;
@@ -151,7 +151,7 @@ class ARROW_EXPORT BitRunReader {
}
// Two cases:
- // 1. For unset, CountTrailingZeros works naturally so we don't
+ // 1. For unset, CountTrailingZeros works naturally so we don't
// invert the word.
// 2. Otherwise invert so we can use CountTrailingZeros.
if (current_run_bit_set_) {
@@ -168,348 +168,348 @@ class ARROW_EXPORT BitRunReader {
using BitRunReader = BitRunReaderLinear;
#endif
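A brief usage sketch for the reader above: NextRun() alternates between set and unset runs and signals exhaustion with a zero-length run. Include path and namespace follow the diff; the function name is illustrative:

    #include <cstdint>

    #include "arrow/util/bit_run_reader.h"  // assumed include path, per the diff

    // Sum the lengths of set and unset runs separately.
    void TallyRuns(const uint8_t* bitmap, int64_t offset, int64_t length,
                   int64_t* set_total, int64_t* unset_total) {
      arrow::internal::BitRunReader reader(bitmap, offset, length);
      *set_total = 0;
      *unset_total = 0;
      while (true) {
        const arrow::internal::BitRun run = reader.NextRun();
        if (run.length == 0) break;  // bitmap exhausted
        (run.set ? *set_total : *unset_total) += run.length;
      }
    }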
-struct SetBitRun {
- int64_t position;
- int64_t length;
-
- bool AtEnd() const { return length == 0; }
-
- std::string ToString() const {
- return std::string("{pos=") + std::to_string(position) +
- ", len=" + std::to_string(length) + "}";
- }
-
- bool operator==(const SetBitRun& other) const {
- return position == other.position && length == other.length;
- }
- bool operator!=(const SetBitRun& other) const {
- return position != other.position || length != other.length;
- }
-};
-
-template <bool Reverse>
-class BaseSetBitRunReader {
- public:
- /// \brief Constructs a new SetBitRunReader.
- ///
- /// \param[in] bitmap source data
- /// \param[in] start_offset bit offset into the source data
- /// \param[in] length number of bits to read
- ARROW_NOINLINE
- BaseSetBitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
- : bitmap_(bitmap),
- length_(length),
- remaining_(length_),
- current_word_(0),
- current_num_bits_(0) {
- if (Reverse) {
- bitmap_ += (start_offset + length) / 8;
- const int8_t end_bit_offset = static_cast<int8_t>((start_offset + length) % 8);
- if (length > 0 && end_bit_offset) {
- // Get LSBs from last byte
- ++bitmap_;
- current_num_bits_ =
- std::min(static_cast<int32_t>(length), static_cast<int32_t>(end_bit_offset));
- current_word_ = LoadPartialWord(8 - end_bit_offset, current_num_bits_);
- }
- } else {
- bitmap_ += start_offset / 8;
- const int8_t bit_offset = static_cast<int8_t>(start_offset % 8);
- if (length > 0 && bit_offset) {
- // Get MSBs from first byte
- current_num_bits_ =
- std::min(static_cast<int32_t>(length), static_cast<int32_t>(8 - bit_offset));
- current_word_ = LoadPartialWord(bit_offset, current_num_bits_);
- }
- }
- }
-
- ARROW_NOINLINE
- SetBitRun NextRun() {
- int64_t pos = 0;
- int64_t len = 0;
- if (current_num_bits_) {
- const auto run = FindCurrentRun();
- assert(remaining_ >= 0);
- if (run.length && current_num_bits_) {
- // The run ends in current_word_
- return AdjustRun(run);
- }
- pos = run.position;
- len = run.length;
- }
- if (!len) {
- // We didn't get any ones in current_word_, so we can skip any zeros
- // in the following words
- SkipNextZeros();
- if (remaining_ == 0) {
- return {0, 0};
- }
- assert(current_num_bits_);
- pos = position();
- } else if (!current_num_bits_) {
- if (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
- current_word_ = LoadFullWord();
- current_num_bits_ = 64;
- } else if (remaining_ > 0) {
- current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
- current_num_bits_ = static_cast<int32_t>(remaining_);
- } else {
- // No bits remaining, perhaps we found a run?
- return AdjustRun({pos, len});
- }
- // If current word starts with a zero, we got a full run
- if (!(current_word_ & kFirstBit)) {
- return AdjustRun({pos, len});
- }
- }
- // Current word should now start with a set bit
- len += CountNextOnes();
- return AdjustRun({pos, len});
- }
-
- protected:
- int64_t position() const {
- if (Reverse) {
- return remaining_;
- } else {
- return length_ - remaining_;
- }
- }
-
- SetBitRun AdjustRun(SetBitRun run) {
- if (Reverse) {
- assert(run.position >= run.length);
- run.position -= run.length;
- }
- return run;
- }
-
- uint64_t LoadFullWord() {
- uint64_t word;
- if (Reverse) {
- bitmap_ -= 8;
- }
- memcpy(&word, bitmap_, 8);
- if (!Reverse) {
- bitmap_ += 8;
- }
- return BitUtil::ToLittleEndian(word);
- }
-
- uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
- assert(num_bits > 0);
- uint64_t word = 0;
- const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
- if (Reverse) {
- // Read in the most significant bytes of the word
- bitmap_ -= num_bytes;
- memcpy(reinterpret_cast<char*>(&word) + 8 - num_bytes, bitmap_, num_bytes);
- // XXX MostSignificantBitmask
- return (BitUtil::ToLittleEndian(word) << bit_offset) &
- ~BitUtil::LeastSignificantBitMask(64 - num_bits);
- } else {
- memcpy(&word, bitmap_, num_bytes);
- bitmap_ += num_bytes;
- return (BitUtil::ToLittleEndian(word) >> bit_offset) &
- BitUtil::LeastSignificantBitMask(num_bits);
- }
- }
-
- void SkipNextZeros() {
- assert(current_num_bits_ == 0);
- while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
- current_word_ = LoadFullWord();
- const auto num_zeros = CountFirstZeros(current_word_);
- if (num_zeros < 64) {
- // Run of zeros ends here
- current_word_ = ConsumeBits(current_word_, num_zeros);
- current_num_bits_ = 64 - num_zeros;
- remaining_ -= num_zeros;
- assert(remaining_ >= 0);
- assert(current_num_bits_ >= 0);
- return;
- }
- remaining_ -= 64;
- }
- // Run of zeros continues in last bitmap word
- if (remaining_ > 0) {
- current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
- current_num_bits_ = static_cast<int32_t>(remaining_);
- const auto num_zeros =
- std::min<int32_t>(current_num_bits_, CountFirstZeros(current_word_));
- current_word_ = ConsumeBits(current_word_, num_zeros);
- current_num_bits_ -= num_zeros;
- remaining_ -= num_zeros;
- assert(remaining_ >= 0);
- assert(current_num_bits_ >= 0);
- }
- }
-
- int64_t CountNextOnes() {
- assert(current_word_ & kFirstBit);
-
- int64_t len;
- if (~current_word_) {
- const auto num_ones = CountFirstZeros(~current_word_);
- assert(num_ones <= current_num_bits_);
- assert(num_ones <= remaining_);
- remaining_ -= num_ones;
- current_word_ = ConsumeBits(current_word_, num_ones);
- current_num_bits_ -= num_ones;
- if (current_num_bits_) {
- // Run of ones ends here
- return num_ones;
- }
- len = num_ones;
- } else {
- // current_word_ is all ones
- remaining_ -= 64;
- current_num_bits_ = 0;
- len = 64;
- }
-
- while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
- current_word_ = LoadFullWord();
- const auto num_ones = CountFirstZeros(~current_word_);
- len += num_ones;
- remaining_ -= num_ones;
- if (num_ones < 64) {
- // Run of ones ends here
- current_word_ = ConsumeBits(current_word_, num_ones);
- current_num_bits_ = 64 - num_ones;
- return len;
- }
- }
- // Run of ones continues in last bitmap word
- if (remaining_ > 0) {
- current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
- current_num_bits_ = static_cast<int32_t>(remaining_);
- const auto num_ones = CountFirstZeros(~current_word_);
- assert(num_ones <= current_num_bits_);
- assert(num_ones <= remaining_);
- current_word_ = ConsumeBits(current_word_, num_ones);
- current_num_bits_ -= num_ones;
- remaining_ -= num_ones;
- len += num_ones;
- }
- return len;
- }
-
- SetBitRun FindCurrentRun() {
- // Skip any pending zeros
- const auto num_zeros = CountFirstZeros(current_word_);
- if (num_zeros >= current_num_bits_) {
- remaining_ -= current_num_bits_;
- current_word_ = 0;
- current_num_bits_ = 0;
- return {0, 0};
- }
- assert(num_zeros <= remaining_);
- current_word_ = ConsumeBits(current_word_, num_zeros);
- current_num_bits_ -= num_zeros;
- remaining_ -= num_zeros;
- const int64_t pos = position();
- // Count any ones
- const auto num_ones = CountFirstZeros(~current_word_);
- assert(num_ones <= current_num_bits_);
- assert(num_ones <= remaining_);
- current_word_ = ConsumeBits(current_word_, num_ones);
- current_num_bits_ -= num_ones;
- remaining_ -= num_ones;
- return {pos, num_ones};
- }
-
- inline int CountFirstZeros(uint64_t word);
- inline uint64_t ConsumeBits(uint64_t word, int32_t num_bits);
-
- const uint8_t* bitmap_;
- const int64_t length_;
- int64_t remaining_;
- uint64_t current_word_;
- int32_t current_num_bits_;
-
- static constexpr uint64_t kFirstBit = Reverse ? 0x8000000000000000ULL : 1;
-};
-
-template <>
-inline int BaseSetBitRunReader<false>::CountFirstZeros(uint64_t word) {
- return BitUtil::CountTrailingZeros(word);
-}
-
-template <>
-inline int BaseSetBitRunReader<true>::CountFirstZeros(uint64_t word) {
- return BitUtil::CountLeadingZeros(word);
-}
-
-template <>
-inline uint64_t BaseSetBitRunReader<false>::ConsumeBits(uint64_t word, int32_t num_bits) {
- return word >> num_bits;
-}
-
-template <>
-inline uint64_t BaseSetBitRunReader<true>::ConsumeBits(uint64_t word, int32_t num_bits) {
- return word << num_bits;
-}
-
-using SetBitRunReader = BaseSetBitRunReader</*Reverse=*/false>;
-using ReverseSetBitRunReader = BaseSetBitRunReader</*Reverse=*/true>;
-
-// Functional-style bit run visitors.
-
-// XXX: Try to make this function small so the compiler can inline and optimize
-// the `visit` function, which is normally a hot loop with vectorizable code.
-// - don't inline SetBitRunReader constructor, it doesn't hurt performance
-// - un-inline NextRun hurts 'many null' cases a bit, but improves normal cases
-template <typename Visit>
-inline Status VisitSetBitRuns(const uint8_t* bitmap, int64_t offset, int64_t length,
- Visit&& visit) {
- if (bitmap == NULLPTR) {
- // Assuming all set (as in a null bitmap)
- return visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
- }
- SetBitRunReader reader(bitmap, offset, length);
- while (true) {
- const auto run = reader.NextRun();
- if (run.length == 0) {
- break;
- }
- ARROW_RETURN_NOT_OK(visit(run.position, run.length));
- }
- return Status::OK();
-}
-
-template <typename Visit>
-inline void VisitSetBitRunsVoid(const uint8_t* bitmap, int64_t offset, int64_t length,
- Visit&& visit) {
- if (bitmap == NULLPTR) {
- // Assuming all set (as in a null bitmap)
- visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
- return;
- }
- SetBitRunReader reader(bitmap, offset, length);
- while (true) {
- const auto run = reader.NextRun();
- if (run.length == 0) {
- break;
- }
- visit(run.position, run.length);
- }
-}
-
-template <typename Visit>
-inline Status VisitSetBitRuns(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
- int64_t length, Visit&& visit) {
- return VisitSetBitRuns(bitmap ? bitmap->data() : NULLPTR, offset, length,
- std::forward<Visit>(visit));
-}
-
-template <typename Visit>
-inline void VisitSetBitRunsVoid(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
- int64_t length, Visit&& visit) {
- VisitSetBitRunsVoid(bitmap ? bitmap->data() : NULLPTR, offset, length,
- std::forward<Visit>(visit));
-}
-
+struct SetBitRun {
+ int64_t position;
+ int64_t length;
+
+ bool AtEnd() const { return length == 0; }
+
+ std::string ToString() const {
+ return std::string("{pos=") + std::to_string(position) +
+ ", len=" + std::to_string(length) + "}";
+ }
+
+ bool operator==(const SetBitRun& other) const {
+ return position == other.position && length == other.length;
+ }
+ bool operator!=(const SetBitRun& other) const {
+ return position != other.position || length != other.length;
+ }
+};
+
+template <bool Reverse>
+class BaseSetBitRunReader {
+ public:
+ /// \brief Constructs a new SetBitRunReader.
+ ///
+ /// \param[in] bitmap source data
+ /// \param[in] start_offset bit offset into the source data
+ /// \param[in] length number of bits to read
+ ARROW_NOINLINE
+ BaseSetBitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+ : bitmap_(bitmap),
+ length_(length),
+ remaining_(length_),
+ current_word_(0),
+ current_num_bits_(0) {
+ if (Reverse) {
+ bitmap_ += (start_offset + length) / 8;
+ const int8_t end_bit_offset = static_cast<int8_t>((start_offset + length) % 8);
+ if (length > 0 && end_bit_offset) {
+ // Get LSBs from last byte
+ ++bitmap_;
+ current_num_bits_ =
+ std::min(static_cast<int32_t>(length), static_cast<int32_t>(end_bit_offset));
+ current_word_ = LoadPartialWord(8 - end_bit_offset, current_num_bits_);
+ }
+ } else {
+ bitmap_ += start_offset / 8;
+ const int8_t bit_offset = static_cast<int8_t>(start_offset % 8);
+ if (length > 0 && bit_offset) {
+ // Get MSBs from first byte
+ current_num_bits_ =
+ std::min(static_cast<int32_t>(length), static_cast<int32_t>(8 - bit_offset));
+ current_word_ = LoadPartialWord(bit_offset, current_num_bits_);
+ }
+ }
+ }
+
+ ARROW_NOINLINE
+ SetBitRun NextRun() {
+ int64_t pos = 0;
+ int64_t len = 0;
+ if (current_num_bits_) {
+ const auto run = FindCurrentRun();
+ assert(remaining_ >= 0);
+ if (run.length && current_num_bits_) {
+ // The run ends in current_word_
+ return AdjustRun(run);
+ }
+ pos = run.position;
+ len = run.length;
+ }
+ if (!len) {
+ // We didn't get any ones in current_word_, so we can skip any zeros
+ // in the following words
+ SkipNextZeros();
+ if (remaining_ == 0) {
+ return {0, 0};
+ }
+ assert(current_num_bits_);
+ pos = position();
+ } else if (!current_num_bits_) {
+ if (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+ current_word_ = LoadFullWord();
+ current_num_bits_ = 64;
+ } else if (remaining_ > 0) {
+ current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+ current_num_bits_ = static_cast<int32_t>(remaining_);
+ } else {
+ // No bits remaining, perhaps we found a run?
+ return AdjustRun({pos, len});
+ }
+ // If current word starts with a zero, we got a full run
+ if (!(current_word_ & kFirstBit)) {
+ return AdjustRun({pos, len});
+ }
+ }
+ // Current word should now start with a set bit
+ len += CountNextOnes();
+ return AdjustRun({pos, len});
+ }
+
+ protected:
+ int64_t position() const {
+ if (Reverse) {
+ return remaining_;
+ } else {
+ return length_ - remaining_;
+ }
+ }
+
+ SetBitRun AdjustRun(SetBitRun run) {
+ if (Reverse) {
+ assert(run.position >= run.length);
+ run.position -= run.length;
+ }
+ return run;
+ }
+
+ uint64_t LoadFullWord() {
+ uint64_t word;
+ if (Reverse) {
+ bitmap_ -= 8;
+ }
+ memcpy(&word, bitmap_, 8);
+ if (!Reverse) {
+ bitmap_ += 8;
+ }
+ return BitUtil::ToLittleEndian(word);
+ }
+
+ uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
+ assert(num_bits > 0);
+ uint64_t word = 0;
+ const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
+ if (Reverse) {
+ // Read in the most significant bytes of the word
+ bitmap_ -= num_bytes;
+ memcpy(reinterpret_cast<char*>(&word) + 8 - num_bytes, bitmap_, num_bytes);
+ // XXX MostSignificantBitmask
+ return (BitUtil::ToLittleEndian(word) << bit_offset) &
+ ~BitUtil::LeastSignificantBitMask(64 - num_bits);
+ } else {
+ memcpy(&word, bitmap_, num_bytes);
+ bitmap_ += num_bytes;
+ return (BitUtil::ToLittleEndian(word) >> bit_offset) &
+ BitUtil::LeastSignificantBitMask(num_bits);
+ }
+ }
+
+ void SkipNextZeros() {
+ assert(current_num_bits_ == 0);
+ while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+ current_word_ = LoadFullWord();
+ const auto num_zeros = CountFirstZeros(current_word_);
+ if (num_zeros < 64) {
+ // Run of zeros ends here
+ current_word_ = ConsumeBits(current_word_, num_zeros);
+ current_num_bits_ = 64 - num_zeros;
+ remaining_ -= num_zeros;
+ assert(remaining_ >= 0);
+ assert(current_num_bits_ >= 0);
+ return;
+ }
+ remaining_ -= 64;
+ }
+ // Run of zeros continues in last bitmap word
+ if (remaining_ > 0) {
+ current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+ current_num_bits_ = static_cast<int32_t>(remaining_);
+ const auto num_zeros =
+ std::min<int32_t>(current_num_bits_, CountFirstZeros(current_word_));
+ current_word_ = ConsumeBits(current_word_, num_zeros);
+ current_num_bits_ -= num_zeros;
+ remaining_ -= num_zeros;
+ assert(remaining_ >= 0);
+ assert(current_num_bits_ >= 0);
+ }
+ }
+
+ int64_t CountNextOnes() {
+ assert(current_word_ & kFirstBit);
+
+ int64_t len;
+ if (~current_word_) {
+ const auto num_ones = CountFirstZeros(~current_word_);
+ assert(num_ones <= current_num_bits_);
+ assert(num_ones <= remaining_);
+ remaining_ -= num_ones;
+ current_word_ = ConsumeBits(current_word_, num_ones);
+ current_num_bits_ -= num_ones;
+ if (current_num_bits_) {
+ // Run of ones ends here
+ return num_ones;
+ }
+ len = num_ones;
+ } else {
+ // current_word_ is all ones
+ remaining_ -= 64;
+ current_num_bits_ = 0;
+ len = 64;
+ }
+
+ while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+ current_word_ = LoadFullWord();
+ const auto num_ones = CountFirstZeros(~current_word_);
+ len += num_ones;
+ remaining_ -= num_ones;
+ if (num_ones < 64) {
+ // Run of ones ends here
+ current_word_ = ConsumeBits(current_word_, num_ones);
+ current_num_bits_ = 64 - num_ones;
+ return len;
+ }
+ }
+ // Run of ones continues in last bitmap word
+ if (remaining_ > 0) {
+ current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+ current_num_bits_ = static_cast<int32_t>(remaining_);
+ const auto num_ones = CountFirstZeros(~current_word_);
+ assert(num_ones <= current_num_bits_);
+ assert(num_ones <= remaining_);
+ current_word_ = ConsumeBits(current_word_, num_ones);
+ current_num_bits_ -= num_ones;
+ remaining_ -= num_ones;
+ len += num_ones;
+ }
+ return len;
+ }
+
+ SetBitRun FindCurrentRun() {
+ // Skip any pending zeros
+ const auto num_zeros = CountFirstZeros(current_word_);
+ if (num_zeros >= current_num_bits_) {
+ remaining_ -= current_num_bits_;
+ current_word_ = 0;
+ current_num_bits_ = 0;
+ return {0, 0};
+ }
+ assert(num_zeros <= remaining_);
+ current_word_ = ConsumeBits(current_word_, num_zeros);
+ current_num_bits_ -= num_zeros;
+ remaining_ -= num_zeros;
+ const int64_t pos = position();
+ // Count any ones
+ const auto num_ones = CountFirstZeros(~current_word_);
+ assert(num_ones <= current_num_bits_);
+ assert(num_ones <= remaining_);
+ current_word_ = ConsumeBits(current_word_, num_ones);
+ current_num_bits_ -= num_ones;
+ remaining_ -= num_ones;
+ return {pos, num_ones};
+ }
+
+ inline int CountFirstZeros(uint64_t word);
+ inline uint64_t ConsumeBits(uint64_t word, int32_t num_bits);
+
+ const uint8_t* bitmap_;
+ const int64_t length_;
+ int64_t remaining_;
+ uint64_t current_word_;
+ int32_t current_num_bits_;
+
+ static constexpr uint64_t kFirstBit = Reverse ? 0x8000000000000000ULL : 1;
+};
+
+template <>
+inline int BaseSetBitRunReader<false>::CountFirstZeros(uint64_t word) {
+ return BitUtil::CountTrailingZeros(word);
+}
+
+template <>
+inline int BaseSetBitRunReader<true>::CountFirstZeros(uint64_t word) {
+ return BitUtil::CountLeadingZeros(word);
+}
+
+template <>
+inline uint64_t BaseSetBitRunReader<false>::ConsumeBits(uint64_t word, int32_t num_bits) {
+ return word >> num_bits;
+}
+
+template <>
+inline uint64_t BaseSetBitRunReader<true>::ConsumeBits(uint64_t word, int32_t num_bits) {
+ return word << num_bits;
+}
+
+using SetBitRunReader = BaseSetBitRunReader</*Reverse=*/false>;
+using ReverseSetBitRunReader = BaseSetBitRunReader</*Reverse=*/true>;
+
+// Functional-style bit run visitors.
+
+// XXX: Try to make this function small so the compiler can inline and optimize
+// the `visit` function, which is normally a hot loop with vectorizable code.
+// - don't inline SetBitRunReader constructor, it doesn't hurt performance
+// - un-inline NextRun hurts 'many null' cases a bit, but improves normal cases
+template <typename Visit>
+inline Status VisitSetBitRuns(const uint8_t* bitmap, int64_t offset, int64_t length,
+ Visit&& visit) {
+ if (bitmap == NULLPTR) {
+ // Assuming all set (as in a null bitmap)
+ return visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
+ }
+ SetBitRunReader reader(bitmap, offset, length);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ break;
+ }
+ ARROW_RETURN_NOT_OK(visit(run.position, run.length));
+ }
+ return Status::OK();
+}
+
+template <typename Visit>
+inline void VisitSetBitRunsVoid(const uint8_t* bitmap, int64_t offset, int64_t length,
+ Visit&& visit) {
+ if (bitmap == NULLPTR) {
+ // Assuming all set (as in a null bitmap)
+ visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
+ return;
+ }
+ SetBitRunReader reader(bitmap, offset, length);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ break;
+ }
+ visit(run.position, run.length);
+ }
+}
+
+template <typename Visit>
+inline Status VisitSetBitRuns(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
+ int64_t length, Visit&& visit) {
+ return VisitSetBitRuns(bitmap ? bitmap->data() : NULLPTR, offset, length,
+ std::forward<Visit>(visit));
+}
+
+template <typename Visit>
+inline void VisitSetBitRunsVoid(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
+ int64_t length, Visit&& visit) {
+ VisitSetBitRunsVoid(bitmap ? bitmap->data() : NULLPTR, offset, length,
+ std::forward<Visit>(visit));
+}
+
} // namespace internal
} // namespace arrow
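To round out the functional-style visitors that close this header, here is a sketch that collects set-bit runs into a vector using the VisitSetBitRunsVoid overload shown above (include path assumed; function name illustrative):

    #include <cstdint>
    #include <utility>
    #include <vector>

    #include "arrow/util/bit_run_reader.h"  // assumed include path, per the diff

    // Collect every run of set bits as (position, length) pairs. A null bitmap
    // is treated as all-set, matching the documented visitor behavior.
    std::vector<std::pair<int64_t, int64_t>> CollectSetRuns(const uint8_t* bitmap,
                                                            int64_t offset,
                                                            int64_t length) {
      std::vector<std::pair<int64_t, int64_t>> runs;
      arrow::internal::VisitSetBitRunsVoid(
          bitmap, offset, length, [&runs](int64_t position, int64_t run_length) {
            runs.emplace_back(position, run_length);
          });
      return runs;
    }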
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h
index cdd3683557c..b9e695dfcb0 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_stream_utils.h
@@ -1,433 +1,433 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// From Apache Impala (incubating) as of 2016-01-29
-
-#pragma once
-
-#include <string.h>
-#include <algorithm>
-#include <cstdint>
-
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bpacking.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/ubsan.h"
-
-namespace arrow {
-namespace BitUtil {
-
-/// Utility class to write bit/byte streams. This class can write data either
-/// bit packed or byte aligned (and a single stream may mix both).
-/// This class does not allocate memory.
-class BitWriter {
- public:
- /// buffer: buffer to write bits to. Buffer should be preallocated with
- /// 'buffer_len' bytes.
- BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) {
- Clear();
- }
-
- void Clear() {
- buffered_values_ = 0;
- byte_offset_ = 0;
- bit_offset_ = 0;
- }
-
- /// The number of bytes written so far, including the current byte (i.e. it may
- /// include a fraction of a byte). Includes buffered values.
- int bytes_written() const {
- return byte_offset_ + static_cast<int>(BitUtil::BytesForBits(bit_offset_));
- }
- uint8_t* buffer() const { return buffer_; }
- int buffer_len() const { return max_bytes_; }
-
- /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit
- /// packed. Returns false if there was not enough space. num_bits must be <= 32.
- bool PutValue(uint64_t v, int num_bits);
-
- /// Writes v to the next aligned byte using num_bytes. If T is larger than
- /// num_bytes, the extra high-order bytes will be ignored. Returns false if
- /// there was not enough space.
- /// Assume v is stored in buffer_ in little-endian format
- template <typename T>
- bool PutAligned(T v, int num_bytes);
-
- /// Write a Vlq encoded int to the buffer. Returns false if there was not enough
- /// room. The value is written byte aligned.
- /// For more details on vlq:
- /// en.wikipedia.org/wiki/Variable-length_quantity
- bool PutVlqInt(uint32_t v);
-
- // Writes a zigzag encoded int.
- bool PutZigZagVlqInt(int32_t v);
-
- /// Get a pointer to the next aligned byte and advance the underlying buffer
- /// by num_bytes.
- /// Returns NULL if there was not enough space.
- uint8_t* GetNextBytePtr(int num_bytes = 1);
-
- /// Flushes all buffered values to the buffer. Call this when done writing to
- /// the buffer. If 'align' is true, buffered_values_ is reset and any future
- /// writes will be written to the next byte boundary.
- void Flush(bool align = false);
-
- private:
- uint8_t* buffer_;
- int max_bytes_;
-
- /// Bit-packed values are initially written to this variable before being memcpy'd to
- /// buffer_. This is faster than writing values byte by byte directly to buffer_.
- uint64_t buffered_values_;
-
- int byte_offset_; // Offset in buffer_
- int bit_offset_; // Offset in buffered_values_
-};
-
-/// Utility class to read bit/byte streams. This class can read bits or bytes
-/// that are either byte aligned or not. It also has utilities to read multiple
-/// bytes in one read (e.g. encoded int).
-class BitReader {
- public:
- /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'.
- BitReader(const uint8_t* buffer, int buffer_len)
- : buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) {
- int num_bytes = std::min(8, max_bytes_ - byte_offset_);
- memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
- buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
- }
-
- BitReader()
- : buffer_(NULL),
- max_bytes_(0),
- buffered_values_(0),
- byte_offset_(0),
- bit_offset_(0) {}
-
- void Reset(const uint8_t* buffer, int buffer_len) {
- buffer_ = buffer;
- max_bytes_ = buffer_len;
- byte_offset_ = 0;
- bit_offset_ = 0;
- int num_bytes = std::min(8, max_bytes_ - byte_offset_);
- memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
- buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
- }
-
- /// Gets the next value from the buffer. Returns true if 'v' could be read or false if
- /// there are not enough bytes left. num_bits must be <= 32.
- template <typename T>
- bool GetValue(int num_bits, T* v);
-
- /// Get a number of values from the buffer. Return the number of values actually read.
- template <typename T>
- int GetBatch(int num_bits, T* v, int batch_size);
-
- /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T
- /// needs to be a little-endian native type and big enough to store
- /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
- /// be advanced to the start of the next byte before 'v' is read. Returns
- /// false if there are not enough bytes left.
- /// Assume v was stored in buffer_ in little-endian format
- template <typename T>
- bool GetAligned(int num_bytes, T* v);
-
- /// Reads a vlq encoded int from the stream. The encoded int must start at
- /// the beginning of a byte. Return false if there were not enough bytes in
- /// the buffer.
- bool GetVlqInt(uint32_t* v);
-
- // Reads a zigzag encoded int into `v`.
- bool GetZigZagVlqInt(int32_t* v);
-
- /// Returns the number of bytes left in the stream, not including the current
- /// byte (i.e., there may be an additional fraction of a byte).
- int bytes_left() {
- return max_bytes_ -
- (byte_offset_ + static_cast<int>(BitUtil::BytesForBits(bit_offset_)));
- }
-
- /// Maximum byte length of a vlq encoded int
- static constexpr int kMaxVlqByteLength = 5;
-
- private:
- const uint8_t* buffer_;
- int max_bytes_;
-
- /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is
- /// faster than reading values byte by byte directly from buffer_.
- uint64_t buffered_values_;
-
- int byte_offset_; // Offset in buffer_
- int bit_offset_; // Offset in buffered_values_
-};
-
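Before the inline implementations that follow, a round-trip sketch of the two classes just declared: bit-pack values with BitWriter, flush, then decode with BitReader. Buffer size and function name are illustrative; the calls mirror the declarations above:

    #include <cstdint>

    #include "arrow/util/bit_stream_utils.h"  // assumed include path, per the diff

    // Pack eight 3-bit values and read them back. Flush(/*align=*/true) pushes
    // the partially filled word into the buffer before the reader consumes it.
    bool RoundTrip3BitValues() {
      uint8_t buffer[16] = {};
      arrow::BitUtil::BitWriter writer(buffer, static_cast<int>(sizeof(buffer)));
      for (uint64_t v = 0; v < 8; ++v) {
        if (!writer.PutValue(v, /*num_bits=*/3)) return false;  // out of space
      }
      writer.Flush(/*align=*/true);

      arrow::BitUtil::BitReader reader(buffer, static_cast<int>(sizeof(buffer)));
      for (uint32_t expected = 0; expected < 8; ++expected) {
        uint32_t v = 0;
        if (!reader.GetValue(/*num_bits=*/3, &v) || v != expected) return false;
      }
      return true;
    }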
-inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
- // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases)
- DCHECK_LE(num_bits, 32);
- DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits;
-
- if (ARROW_PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8))
- return false;
-
- buffered_values_ |= v << bit_offset_;
- bit_offset_ += num_bits;
-
- if (ARROW_PREDICT_FALSE(bit_offset_ >= 64)) {
- // Flush buffered_values_ and write out bits of v that did not fit
- buffered_values_ = arrow::BitUtil::ToLittleEndian(buffered_values_);
- memcpy(buffer_ + byte_offset_, &buffered_values_, 8);
- buffered_values_ = 0;
- byte_offset_ += 8;
- bit_offset_ -= 64;
- buffered_values_ = v >> (num_bits - bit_offset_);
- }
- DCHECK_LT(bit_offset_, 64);
- return true;
-}
-
-inline void BitWriter::Flush(bool align) {
- int num_bytes = static_cast<int>(BitUtil::BytesForBits(bit_offset_));
- DCHECK_LE(byte_offset_ + num_bytes, max_bytes_);
- auto buffered_values = arrow::BitUtil::ToLittleEndian(buffered_values_);
- memcpy(buffer_ + byte_offset_, &buffered_values, num_bytes);
-
- if (align) {
- buffered_values_ = 0;
- byte_offset_ += num_bytes;
- bit_offset_ = 0;
- }
-}
-
-inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) {
- Flush(/* align */ true);
- DCHECK_LE(byte_offset_, max_bytes_);
- if (byte_offset_ + num_bytes > max_bytes_) return NULL;
- uint8_t* ptr = buffer_ + byte_offset_;
- byte_offset_ += num_bytes;
- return ptr;
-}
-
-template <typename T>
-inline bool BitWriter::PutAligned(T val, int num_bytes) {
- uint8_t* ptr = GetNextBytePtr(num_bytes);
- if (ptr == NULL) return false;
- val = arrow::BitUtil::ToLittleEndian(val);
- memcpy(ptr, &val, num_bytes);
- return true;
-}
-
-namespace detail {
-
-template <typename T>
-inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
- int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4800)
-#endif
- *v = static_cast<T>(BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >>
- *bit_offset);
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
- *bit_offset += num_bits;
- if (*bit_offset >= 64) {
- *byte_offset += 8;
- *bit_offset -= 64;
-
- int bytes_remaining = max_bytes - *byte_offset;
- if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
- memcpy(buffered_values, buffer + *byte_offset, 8);
- } else {
- memcpy(buffered_values, buffer + *byte_offset, bytes_remaining);
- }
- *buffered_values = arrow::BitUtil::FromLittleEndian(*buffered_values);
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4800 4805)
-#endif
- // Read bits of v that crossed into new buffered_values_
- *v = *v | static_cast<T>(BitUtil::TrailingBits(*buffered_values, *bit_offset)
- << (num_bits - *bit_offset));
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
- DCHECK_LE(*bit_offset, 64);
- }
-}
-
-} // namespace detail
-
-template <typename T>
-inline bool BitReader::GetValue(int num_bits, T* v) {
- return GetBatch(num_bits, v, 1) == 1;
-}
-
-template <typename T>
-inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
- DCHECK(buffer_ != NULL);
- // TODO: revisit this limit if necessary
- DCHECK_LE(num_bits, 32);
- DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8));
-
- int bit_offset = bit_offset_;
- int byte_offset = byte_offset_;
- uint64_t buffered_values = buffered_values_;
- int max_bytes = max_bytes_;
- const uint8_t* buffer = buffer_;
-
- uint64_t needed_bits = num_bits * batch_size;
- constexpr uint64_t kBitsPerByte = 8;
- uint64_t remaining_bits = (max_bytes - byte_offset) * kBitsPerByte - bit_offset;
- if (remaining_bits < needed_bits) {
- batch_size = static_cast<int>(remaining_bits) / num_bits;
- }
-
- int i = 0;
- if (ARROW_PREDICT_FALSE(bit_offset != 0)) {
- for (; i < batch_size && bit_offset != 0; ++i) {
- detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
- &buffered_values);
- }
- }
-
- if (sizeof(T) == 4) {
- int num_unpacked =
- internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
- reinterpret_cast<uint32_t*>(v + i), batch_size - i, num_bits);
- i += num_unpacked;
- byte_offset += num_unpacked * num_bits / 8;
- } else {
- const int buffer_size = 1024;
- uint32_t unpack_buffer[buffer_size];
- while (i < batch_size) {
- int unpack_size = std::min(buffer_size, batch_size - i);
- int num_unpacked =
- internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
- unpack_buffer, unpack_size, num_bits);
- if (num_unpacked == 0) {
- break;
- }
- for (int k = 0; k < num_unpacked; ++k) {
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4800)
-#endif
- v[i + k] = static_cast<T>(unpack_buffer[k]);
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
- }
- i += num_unpacked;
- byte_offset += num_unpacked * num_bits / 8;
- }
- }
-
- int bytes_remaining = max_bytes - byte_offset;
- if (bytes_remaining >= 8) {
- memcpy(&buffered_values, buffer + byte_offset, 8);
- } else {
- memcpy(&buffered_values, buffer + byte_offset, bytes_remaining);
- }
- buffered_values = arrow::BitUtil::FromLittleEndian(buffered_values);
-
- for (; i < batch_size; ++i) {
- detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
- &buffered_values);
- }
-
- bit_offset_ = bit_offset;
- byte_offset_ = byte_offset;
- buffered_values_ = buffered_values;
-
- return batch_size;
-}
-
-template <typename T>
-inline bool BitReader::GetAligned(int num_bytes, T* v) {
- if (ARROW_PREDICT_FALSE(num_bytes > static_cast<int>(sizeof(T)))) {
- return false;
- }
-
- int bytes_read = static_cast<int>(BitUtil::BytesForBits(bit_offset_));
- if (ARROW_PREDICT_FALSE(byte_offset_ + bytes_read + num_bytes > max_bytes_)) {
- return false;
- }
-
- // Advance byte_offset to next unread byte and read num_bytes
- byte_offset_ += bytes_read;
- memcpy(v, buffer_ + byte_offset_, num_bytes);
- *v = arrow::BitUtil::FromLittleEndian(*v);
- byte_offset_ += num_bytes;
-
- // Reset buffered_values_
- bit_offset_ = 0;
- int bytes_remaining = max_bytes_ - byte_offset_;
- if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
- memcpy(&buffered_values_, buffer_ + byte_offset_, 8);
- } else {
- memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
- }
- buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
- return true;
-}
-
-inline bool BitWriter::PutVlqInt(uint32_t v) {
- bool result = true;
- while ((v & 0xFFFFFF80UL) != 0UL) {
- result &= PutAligned<uint8_t>(static_cast<uint8_t>((v & 0x7F) | 0x80), 1);
- v >>= 7;
- }
- result &= PutAligned<uint8_t>(static_cast<uint8_t>(v & 0x7F), 1);
- return result;
-}
-
-inline bool BitReader::GetVlqInt(uint32_t* v) {
- uint32_t tmp = 0;
-
- for (int i = 0; i < kMaxVlqByteLength; i++) {
- uint8_t byte = 0;
- if (ARROW_PREDICT_FALSE(!GetAligned<uint8_t>(1, &byte))) {
- return false;
- }
- tmp |= static_cast<uint32_t>(byte & 0x7F) << (7 * i);
-
- if ((byte & 0x80) == 0) {
- *v = tmp;
- return true;
- }
- }
-
- return false;
-}
-
-inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
- auto u_v = ::arrow::util::SafeCopy<uint32_t>(v);
- return PutVlqInt((u_v << 1) ^ (u_v >> 31));
-}
-
-inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
- uint32_t u;
- if (!GetVlqInt(&u)) return false;
- *v = ::arrow::util::SafeCopy<int32_t>((u >> 1) ^ (u << 31));
- return true;
-}
-
-} // namespace BitUtil
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala (incubating) as of 2016-01-29
+
+#pragma once
+
+#include <string.h>
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bpacking.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace BitUtil {
+
+/// Utility class to write bit/byte streams. This class can write data either
+/// bit packed or byte aligned (and a single stream may mix both).
+/// This class does not allocate memory.
+class BitWriter {
+ public:
+ /// buffer: buffer to write bits to. Buffer should be preallocated with
+ /// 'buffer_len' bytes.
+ BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) {
+ Clear();
+ }
+
+ void Clear() {
+ buffered_values_ = 0;
+ byte_offset_ = 0;
+ bit_offset_ = 0;
+ }
+
+  /// The number of bytes written so far, including the current byte (i.e., this may
+  /// include a fraction of a byte). Includes buffered values.
+ int bytes_written() const {
+ return byte_offset_ + static_cast<int>(BitUtil::BytesForBits(bit_offset_));
+ }
+ uint8_t* buffer() const { return buffer_; }
+ int buffer_len() const { return max_bytes_; }
+
+ /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit
+ /// packed. Returns false if there was not enough space. num_bits must be <= 32.
+ bool PutValue(uint64_t v, int num_bits);
+
+ /// Writes v to the next aligned byte using num_bytes. If T is larger than
+ /// num_bytes, the extra high-order bytes will be ignored. Returns false if
+ /// there was not enough space.
+  /// The value is written to buffer_ in little-endian byte order.
+ template <typename T>
+ bool PutAligned(T v, int num_bytes);
+
+  /// Writes a VLQ-encoded int to the buffer. Returns false if there was not enough
+  /// room. The value is written byte aligned.
+  /// For more details on VLQ:
+  /// en.wikipedia.org/wiki/Variable-length_quantity
+ bool PutVlqInt(uint32_t v);
+
+  // Writes a zigzag-encoded int.
+ bool PutZigZagVlqInt(int32_t v);
+
+ /// Get a pointer to the next aligned byte and advance the underlying buffer
+ /// by num_bytes.
+ /// Returns NULL if there was not enough space.
+ uint8_t* GetNextBytePtr(int num_bytes = 1);
+
+ /// Flushes all buffered values to the buffer. Call this when done writing to
+ /// the buffer. If 'align' is true, buffered_values_ is reset and any future
+ /// writes will be written to the next byte boundary.
+ void Flush(bool align = false);
+
+ private:
+ uint8_t* buffer_;
+ int max_bytes_;
+
+ /// Bit-packed values are initially written to this variable before being memcpy'd to
+ /// buffer_. This is faster than writing values byte by byte directly to buffer_.
+ uint64_t buffered_values_;
+
+ int byte_offset_; // Offset in buffer_
+ int bit_offset_; // Offset in buffered_values_
+};
+
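
To make the packing order concrete, here is a minimal standalone sketch of the
scheme PutValue implements: values are OR'd into a 64-bit accumulator at the
running bit offset, then copied out in little-endian byte order. It deliberately
avoids this header, so every name in it is local to the sketch.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint64_t buffered = 0;  // plays the role of buffered_values_
  int bit_offset = 0;
  for (uint64_t v : {5, 1, 7}) {  // three 3-bit values
    buffered |= v << bit_offset;
    bit_offset += 3;
  }
  uint8_t out[8];
  // The real Flush byte-swaps via BitUtil::ToLittleEndian first (a no-op on
  // little-endian hosts); this sketch simply assumes a little-endian host.
  std::memcpy(out, &buffered, sizeof(buffered));
  std::printf("%02x %02x\n", out[0], out[1]);  // prints "cd 01"
  // out[0] = 0b11001101: bits 0-2 = 101 (5), bits 3-5 = 001 (1), bits 6-7 = 11
  // out[1] = 0b00000001: bit 8 carries the high bit of 7
  return 0;
}
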
+/// Utility class to read bit/byte streams. This class can read bits or bytes that
+/// are either byte aligned or not. It also has utilities to read multiple bytes
+/// in one read (e.g., an encoded int).
+class BitReader {
+ public:
+ /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'.
+ BitReader(const uint8_t* buffer, int buffer_len)
+ : buffer_(buffer), max_bytes_(buffer_len), byte_offset_(0), bit_offset_(0) {
+ int num_bytes = std::min(8, max_bytes_ - byte_offset_);
+ memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
+ buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
+ }
+
+ BitReader()
+ : buffer_(NULL),
+ max_bytes_(0),
+ buffered_values_(0),
+ byte_offset_(0),
+ bit_offset_(0) {}
+
+ void Reset(const uint8_t* buffer, int buffer_len) {
+ buffer_ = buffer;
+ max_bytes_ = buffer_len;
+ byte_offset_ = 0;
+ bit_offset_ = 0;
+ int num_bytes = std::min(8, max_bytes_ - byte_offset_);
+ memcpy(&buffered_values_, buffer_ + byte_offset_, num_bytes);
+ buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
+ }
+
+ /// Gets the next value from the buffer. Returns true if 'v' could be read or false if
+ /// there are not enough bytes left. num_bits must be <= 32.
+ template <typename T>
+ bool GetValue(int num_bits, T* v);
+
+  /// Gets a batch of values from the buffer. Returns the number of values actually read.
+ template <typename T>
+ int GetBatch(int num_bits, T* v, int batch_size);
+
+ /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T
+ /// needs to be a little-endian native type and big enough to store
+ /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will
+ /// be advanced to the start of the next byte before 'v' is read. Returns
+ /// false if there are not enough bytes left.
+  /// The value is assumed to be stored in buffer_ in little-endian byte order.
+ template <typename T>
+ bool GetAligned(int num_bytes, T* v);
+
+  /// Reads a VLQ-encoded int from the stream. The encoded int must start at
+  /// the beginning of a byte. Returns false if there were not enough bytes in
+  /// the buffer.
+ bool GetVlqInt(uint32_t* v);
+
+  // Reads a zigzag-encoded int into v.
+ bool GetZigZagVlqInt(int32_t* v);
+
+ /// Returns the number of bytes left in the stream, not including the current
+ /// byte (i.e., there may be an additional fraction of a byte).
+ int bytes_left() {
+ return max_bytes_ -
+ (byte_offset_ + static_cast<int>(BitUtil::BytesForBits(bit_offset_)));
+ }
+
+  /// Maximum byte length of a VLQ-encoded int
+ static constexpr int kMaxVlqByteLength = 5;
+
+ private:
+ const uint8_t* buffer_;
+ int max_bytes_;
+
+ /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is
+ /// faster than reading values byte by byte directly from buffer_.
+ uint64_t buffered_values_;
+
+ int byte_offset_; // Offset in buffer_
+ int bit_offset_; // Offset in buffered_values_
+};
+
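
A plausible end-to-end round trip through the two classes. This is a sketch:
the include path is an assumption (the header's own filename is not visible in
this hunk), and error handling is reduced to asserts.

#include <cassert>
#include <cstdint>

#include "arrow/util/bit_stream_utils.h"  // assumed location of these classes

void RoundTrip3BitValues() {
  uint8_t buf[16] = {0};
  arrow::BitUtil::BitWriter writer(buf, sizeof(buf));
  for (uint64_t v = 0; v < 8; ++v) {
    assert(writer.PutValue(v, /*num_bits=*/3));  // 8 values * 3 bits = 24 bits
  }
  writer.Flush();

  arrow::BitUtil::BitReader reader(buf, sizeof(buf));
  for (uint32_t expected = 0; expected < 8; ++expected) {
    uint32_t v = 0;
    assert(reader.GetValue(/*num_bits=*/3, &v));
    assert(v == expected);
  }
}
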
+inline bool BitWriter::PutValue(uint64_t v, int num_bits) {
+ // TODO: revisit this limit if necessary (can be raised to 64 by fixing some edge cases)
+ DCHECK_LE(num_bits, 32);
+ DCHECK_EQ(v >> num_bits, 0) << "v = " << v << ", num_bits = " << num_bits;
+
+ if (ARROW_PREDICT_FALSE(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8))
+ return false;
+
+ buffered_values_ |= v << bit_offset_;
+ bit_offset_ += num_bits;
+
+ if (ARROW_PREDICT_FALSE(bit_offset_ >= 64)) {
+ // Flush buffered_values_ and write out bits of v that did not fit
+ buffered_values_ = arrow::BitUtil::ToLittleEndian(buffered_values_);
+ memcpy(buffer_ + byte_offset_, &buffered_values_, 8);
+ buffered_values_ = 0;
+ byte_offset_ += 8;
+ bit_offset_ -= 64;
+ buffered_values_ = v >> (num_bits - bit_offset_);
+ }
+ DCHECK_LT(bit_offset_, 64);
+ return true;
+}
+
+inline void BitWriter::Flush(bool align) {
+ int num_bytes = static_cast<int>(BitUtil::BytesForBits(bit_offset_));
+ DCHECK_LE(byte_offset_ + num_bytes, max_bytes_);
+ auto buffered_values = arrow::BitUtil::ToLittleEndian(buffered_values_);
+ memcpy(buffer_ + byte_offset_, &buffered_values, num_bytes);
+
+ if (align) {
+ buffered_values_ = 0;
+ byte_offset_ += num_bytes;
+ bit_offset_ = 0;
+ }
+}
+
+inline uint8_t* BitWriter::GetNextBytePtr(int num_bytes) {
+ Flush(/* align */ true);
+ DCHECK_LE(byte_offset_, max_bytes_);
+ if (byte_offset_ + num_bytes > max_bytes_) return NULL;
+ uint8_t* ptr = buffer_ + byte_offset_;
+ byte_offset_ += num_bytes;
+ return ptr;
+}
+
+template <typename T>
+inline bool BitWriter::PutAligned(T val, int num_bytes) {
+ uint8_t* ptr = GetNextBytePtr(num_bytes);
+ if (ptr == NULL) return false;
+ val = arrow::BitUtil::ToLittleEndian(val);
+ memcpy(ptr, &val, num_bytes);
+ return true;
+}
+
+namespace detail {
+
+template <typename T>
+inline void GetValue_(int num_bits, T* v, int max_bytes, const uint8_t* buffer,
+ int* bit_offset, int* byte_offset, uint64_t* buffered_values) {
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800)
+#endif
+ *v = static_cast<T>(BitUtil::TrailingBits(*buffered_values, *bit_offset + num_bits) >>
+ *bit_offset);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+ *bit_offset += num_bits;
+ if (*bit_offset >= 64) {
+ *byte_offset += 8;
+ *bit_offset -= 64;
+
+ int bytes_remaining = max_bytes - *byte_offset;
+ if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
+ memcpy(buffered_values, buffer + *byte_offset, 8);
+ } else {
+ memcpy(buffered_values, buffer + *byte_offset, bytes_remaining);
+ }
+ *buffered_values = arrow::BitUtil::FromLittleEndian(*buffered_values);
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800 4805)
+#endif
+ // Read bits of v that crossed into new buffered_values_
+ *v = *v | static_cast<T>(BitUtil::TrailingBits(*buffered_values, *bit_offset)
+ << (num_bits - *bit_offset));
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+ DCHECK_LE(*bit_offset, 64);
+ }
+}
+
+} // namespace detail
+
+template <typename T>
+inline bool BitReader::GetValue(int num_bits, T* v) {
+ return GetBatch(num_bits, v, 1) == 1;
+}
+
+template <typename T>
+inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
+ DCHECK(buffer_ != NULL);
+ // TODO: revisit this limit if necessary
+ DCHECK_LE(num_bits, 32);
+ DCHECK_LE(num_bits, static_cast<int>(sizeof(T) * 8));
+
+ int bit_offset = bit_offset_;
+ int byte_offset = byte_offset_;
+ uint64_t buffered_values = buffered_values_;
+ int max_bytes = max_bytes_;
+ const uint8_t* buffer = buffer_;
+
+ uint64_t needed_bits = num_bits * batch_size;
+ constexpr uint64_t kBitsPerByte = 8;
+ uint64_t remaining_bits = (max_bytes - byte_offset) * kBitsPerByte - bit_offset;
+ if (remaining_bits < needed_bits) {
+ batch_size = static_cast<int>(remaining_bits) / num_bits;
+ }
+
+ int i = 0;
+ if (ARROW_PREDICT_FALSE(bit_offset != 0)) {
+ for (; i < batch_size && bit_offset != 0; ++i) {
+ detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
+ &buffered_values);
+ }
+ }
+
+ if (sizeof(T) == 4) {
+ int num_unpacked =
+ internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
+ reinterpret_cast<uint32_t*>(v + i), batch_size - i, num_bits);
+ i += num_unpacked;
+ byte_offset += num_unpacked * num_bits / 8;
+ } else {
+ const int buffer_size = 1024;
+ uint32_t unpack_buffer[buffer_size];
+ while (i < batch_size) {
+ int unpack_size = std::min(buffer_size, batch_size - i);
+ int num_unpacked =
+ internal::unpack32(reinterpret_cast<const uint32_t*>(buffer + byte_offset),
+ unpack_buffer, unpack_size, num_bits);
+ if (num_unpacked == 0) {
+ break;
+ }
+ for (int k = 0; k < num_unpacked; ++k) {
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4800)
+#endif
+ v[i + k] = static_cast<T>(unpack_buffer[k]);
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+ }
+ i += num_unpacked;
+ byte_offset += num_unpacked * num_bits / 8;
+ }
+ }
+
+ int bytes_remaining = max_bytes - byte_offset;
+ if (bytes_remaining >= 8) {
+ memcpy(&buffered_values, buffer + byte_offset, 8);
+ } else {
+ memcpy(&buffered_values, buffer + byte_offset, bytes_remaining);
+ }
+ buffered_values = arrow::BitUtil::FromLittleEndian(buffered_values);
+
+ for (; i < batch_size; ++i) {
+ detail::GetValue_(num_bits, &v[i], max_bytes, buffer, &bit_offset, &byte_offset,
+ &buffered_values);
+ }
+
+ bit_offset_ = bit_offset;
+ byte_offset_ = byte_offset;
+ buffered_values_ = buffered_values;
+
+ return batch_size;
+}
+
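
Continuing the sketch above: GetBatch is the bulk path. After the unaligned
prologue it unpacks directly through internal::unpack32, so reading a block in
one call is much cheaper than a loop of GetValue calls.

uint8_t buf[64] = {0};
arrow::BitUtil::BitWriter writer(buf, sizeof(buf));
for (uint64_t v = 0; v < 32; ++v) writer.PutValue(v, /*num_bits=*/5);
writer.Flush();

uint32_t values[32];
arrow::BitUtil::BitReader reader(buf, sizeof(buf));
int n = reader.GetBatch(/*num_bits=*/5, values, /*batch_size=*/32);
// n == 32 and values[k] == k; a shorter buffer would clamp n rather than fail
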
+template <typename T>
+inline bool BitReader::GetAligned(int num_bytes, T* v) {
+ if (ARROW_PREDICT_FALSE(num_bytes > static_cast<int>(sizeof(T)))) {
+ return false;
+ }
+
+ int bytes_read = static_cast<int>(BitUtil::BytesForBits(bit_offset_));
+ if (ARROW_PREDICT_FALSE(byte_offset_ + bytes_read + num_bytes > max_bytes_)) {
+ return false;
+ }
+
+ // Advance byte_offset to next unread byte and read num_bytes
+ byte_offset_ += bytes_read;
+ memcpy(v, buffer_ + byte_offset_, num_bytes);
+ *v = arrow::BitUtil::FromLittleEndian(*v);
+ byte_offset_ += num_bytes;
+
+ // Reset buffered_values_
+ bit_offset_ = 0;
+ int bytes_remaining = max_bytes_ - byte_offset_;
+ if (ARROW_PREDICT_TRUE(bytes_remaining >= 8)) {
+ memcpy(&buffered_values_, buffer_ + byte_offset_, 8);
+ } else {
+ memcpy(&buffered_values_, buffer_ + byte_offset_, bytes_remaining);
+ }
+ buffered_values_ = arrow::BitUtil::FromLittleEndian(buffered_values_);
+ return true;
+}
+
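
PutAligned and GetAligned round-trip through the byte-aligned path; note how an
in-flight bit-packed value forces padding up to the next byte boundary. Another
fragment of the same sketch:

uint8_t buf[8] = {0};
arrow::BitUtil::BitWriter writer(buf, sizeof(buf));
writer.PutValue(1, /*num_bits=*/3);  // leaves the stream mid-byte
writer.PutAligned<uint32_t>(0xABCD, /*num_bytes=*/2);  // pads, then 2 bytes LE
writer.Flush();

arrow::BitUtil::BitReader reader(buf, sizeof(buf));
uint32_t bits = 0;
uint32_t aligned = 0;  // zero-init matters: GetAligned overwrites only num_bytes
reader.GetValue(3, &bits);                 // bits == 1
reader.GetAligned<uint32_t>(2, &aligned);  // skips to byte 1; aligned == 0xABCD
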
+inline bool BitWriter::PutVlqInt(uint32_t v) {
+ bool result = true;
+ while ((v & 0xFFFFFF80UL) != 0UL) {
+ result &= PutAligned<uint8_t>(static_cast<uint8_t>((v & 0x7F) | 0x80), 1);
+ v >>= 7;
+ }
+ result &= PutAligned<uint8_t>(static_cast<uint8_t>(v & 0x7F), 1);
+ return result;
+}
+
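
A worked trace for v = 300 (binary 1'0010'1100): the loop body runs once,
because 300 has bits above the low seven, and the final byte carries the rest.

byte 0: (300 & 0x7F) | 0x80 = 0xAC   continuation bit set; v >>= 7 leaves 2
byte 1:    2 & 0x7F        = 0x02    continuation bit clear, encoding ends
encoded stream: AC 02
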
+inline bool BitReader::GetVlqInt(uint32_t* v) {
+ uint32_t tmp = 0;
+
+ for (int i = 0; i < kMaxVlqByteLength; i++) {
+ uint8_t byte = 0;
+ if (ARROW_PREDICT_FALSE(!GetAligned<uint8_t>(1, &byte))) {
+ return false;
+ }
+ tmp |= static_cast<uint32_t>(byte & 0x7F) << (7 * i);
+
+ if ((byte & 0x80) == 0) {
+ *v = tmp;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
+ auto u_v = ::arrow::util::SafeCopy<uint32_t>(v);
+ return PutVlqInt((u_v << 1) ^ (u_v >> 31));
+}
+
+inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
+ uint32_t u;
+ if (!GetVlqInt(&u)) return false;
+ *v = ::arrow::util::SafeCopy<int32_t>((u >> 1) ^ (u << 31));
+ return true;
+}
+
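
The two zigzag helpers are exact inverses: the encoder folds the sign bit into
the least-significant bit of the unsigned code (in effect a 1-bit left rotation
of the 32-bit value) and the decoder rotates it back. A round-trip fragment of
the earlier sketch:

uint8_t buf[16] = {0};
arrow::BitUtil::BitWriter writer(buf, sizeof(buf));
writer.PutZigZagVlqInt(-3);
writer.Flush();

arrow::BitUtil::BitReader reader(buf, sizeof(buf));
int32_t v = 0;
reader.GetZigZagVlqInt(&v);  // v == -3 again
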
+} // namespace BitUtil
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc
index 1b123f4153a..ee4bcde7713 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.cc
@@ -20,8 +20,8 @@
#include <cstdint>
#include <cstring>
-#include "arrow/util/logging.h"
-
+#include "arrow/util/logging.h"
+
namespace arrow {
namespace BitUtil {
@@ -69,59 +69,59 @@ void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_ar
bits[bytes_end - 1] |= static_cast<uint8_t>(fill_byte & ~last_byte_mask);
}
-template <bool value>
-void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
- // offset length
- // data |<------------->|
- // |--------|...|--------|...|--------|
- // |<--->| |<--->|
- // pro epi
- if (ARROW_PREDICT_FALSE(length == 0)) {
- return;
- }
-
- constexpr uint8_t set_byte = value ? UINT8_MAX : 0;
-
- auto prologue = static_cast<int32_t>(BitUtil::RoundUp(offset, 8) - offset);
- DCHECK_LT(prologue, 8);
-
- if (length < prologue) { // special case where a mask is required
- // offset length
- // data |<->|
- // |--------|...|--------|...
- // mask --> |111|
- // |<---->|
- // pro
- uint8_t mask = BitUtil::kPrecedingBitmask[8 - prologue] ^
- BitUtil::kPrecedingBitmask[8 - prologue + length];
- data[offset / 8] = value ? data[offset / 8] | mask : data[offset / 8] & ~mask;
- return;
- }
-
- // align to a byte boundary
- data[offset / 8] = BitUtil::SpliceWord(8 - prologue, data[offset / 8], set_byte);
- offset += prologue;
- length -= prologue;
-
- // set values per byte
- DCHECK_EQ(offset % 8, 0);
- std::memset(data + offset / 8, set_byte, length / 8);
- offset += BitUtil::RoundDown(length, 8);
- length -= BitUtil::RoundDown(length, 8);
-
- // clean up
- DCHECK_LT(length, 8);
- data[offset / 8] =
- BitUtil::SpliceWord(static_cast<int32_t>(length), set_byte, data[offset / 8]);
-}
-
-void SetBitmap(uint8_t* data, int64_t offset, int64_t length) {
- SetBitmapImpl<true>(data, offset, length);
-}
-
-void ClearBitmap(uint8_t* data, int64_t offset, int64_t length) {
- SetBitmapImpl<false>(data, offset, length);
-}
-
+template <bool value>
+void SetBitmapImpl(uint8_t* data, int64_t offset, int64_t length) {
+ // offset length
+ // data |<------------->|
+ // |--------|...|--------|...|--------|
+ // |<--->| |<--->|
+ // pro epi
+ if (ARROW_PREDICT_FALSE(length == 0)) {
+ return;
+ }
+
+ constexpr uint8_t set_byte = value ? UINT8_MAX : 0;
+
+ auto prologue = static_cast<int32_t>(BitUtil::RoundUp(offset, 8) - offset);
+ DCHECK_LT(prologue, 8);
+
+ if (length < prologue) { // special case where a mask is required
+ // offset length
+ // data |<->|
+ // |--------|...|--------|...
+ // mask --> |111|
+ // |<---->|
+ // pro
+ uint8_t mask = BitUtil::kPrecedingBitmask[8 - prologue] ^
+ BitUtil::kPrecedingBitmask[8 - prologue + length];
+ data[offset / 8] = value ? data[offset / 8] | mask : data[offset / 8] & ~mask;
+ return;
+ }
+
+ // align to a byte boundary
+ data[offset / 8] = BitUtil::SpliceWord(8 - prologue, data[offset / 8], set_byte);
+ offset += prologue;
+ length -= prologue;
+
+ // set values per byte
+ DCHECK_EQ(offset % 8, 0);
+ std::memset(data + offset / 8, set_byte, length / 8);
+ offset += BitUtil::RoundDown(length, 8);
+ length -= BitUtil::RoundDown(length, 8);
+
+ // clean up
+ DCHECK_LT(length, 8);
+ data[offset / 8] =
+ BitUtil::SpliceWord(static_cast<int32_t>(length), set_byte, data[offset / 8]);
+}
+
+void SetBitmap(uint8_t* data, int64_t offset, int64_t length) {
+ SetBitmapImpl<true>(data, offset, length);
+}
+
+void ClearBitmap(uint8_t* data, int64_t offset, int64_t length) {
+ SetBitmapImpl<false>(data, offset, length);
+}
+
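
A small worked case of the prologue / whole-byte / epilogue split, using the
functions as declared in bit_util.h:

uint8_t bits[2] = {0, 0};
arrow::BitUtil::SetBitmap(bits, /*offset=*/3, /*length=*/7);
// The prologue sets bits 3-7 of byte 0, the epilogue bits 0-1 of byte 1:
// bits[0] == 0xF8 (0b11111000), bits[1] == 0x03 (0b00000011)
arrow::BitUtil::ClearBitmap(bits, /*offset=*/4, /*length=*/2);
// Short masked case (length < prologue): bits[0] == 0xC8 (0b11001000)
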
} // namespace BitUtil
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h
index 216cf9fba88..c306ce7821b 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bit_util.h
@@ -112,7 +112,7 @@ constexpr bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
// Returns a mask for the bit_index lower order bits.
// Only valid for bit_index in the range [0, 64).
-constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) {
+constexpr uint64_t LeastSignificantBitMask(int64_t bit_index) {
return (static_cast<uint64_t>(1) << bit_index) - 1;
}
@@ -290,14 +290,14 @@ static constexpr uint8_t kPrecedingWrappingBitmask[] = {255, 1, 3, 7, 15, 31, 63
// the bitwise complement version of kPrecedingBitmask
static constexpr uint8_t kTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128};
-static constexpr bool GetBit(const uint8_t* bits, uint64_t i) {
+static constexpr bool GetBit(const uint8_t* bits, uint64_t i) {
return (bits[i >> 3] >> (i & 0x07)) & 1;
}
// Gets the i-th bit from a byte. Should only be used with i <= 7.
-static constexpr bool GetBitFromByte(uint8_t byte, uint8_t i) {
- return byte & kBitmask[i];
-}
+static constexpr bool GetBitFromByte(uint8_t byte, uint8_t i) {
+ return byte & kBitmask[i];
+}
static inline void ClearBit(uint8_t* bits, int64_t i) {
bits[i / 8] &= kFlippedBitmask[i % 8];
@@ -318,37 +318,37 @@ static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
ARROW_EXPORT
void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set);
-/// \brief Sets all bits in the bitmap to true
-ARROW_EXPORT
-void SetBitmap(uint8_t* data, int64_t offset, int64_t length);
-
-/// \brief Clears all bits in the bitmap (set to false)
-ARROW_EXPORT
-void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
-
-/// Returns a mask with lower i bits set to 1. If i >= sizeof(Word)*8, all-ones will be
-/// returned
-/// ex: PrecedingWordBitmask<uint8_t>(4) == 0x0f (see the static_asserts below)
-/// ref: https://stackoverflow.com/a/59523400
-template <typename Word>
-constexpr Word PrecedingWordBitmask(unsigned int const i) {
- return (static_cast<Word>(i < sizeof(Word) * 8) << (i & (sizeof(Word) * 8 - 1))) - 1;
-}
-static_assert(PrecedingWordBitmask<uint8_t>(0) == 0x00, "");
-static_assert(PrecedingWordBitmask<uint8_t>(4) == 0x0f, "");
-static_assert(PrecedingWordBitmask<uint8_t>(8) == 0xff, "");
-static_assert(PrecedingWordBitmask<uint16_t>(8) == 0x00ff, "");
-
-/// \brief Create a word with low `n` bits from `low` and high `sizeof(Word)-n` bits
-/// from `high`.
-/// Word ret
-/// for (i = 0; i < sizeof(Word)*8; i++){
-/// ret[i] = i < n ? low[i] : high[i];
-/// }
-template <typename Word>
-constexpr Word SpliceWord(int n, Word low, Word high) {
- return (high & ~PrecedingWordBitmask<Word>(n)) | (low & PrecedingWordBitmask<Word>(n));
-}
-
+/// \brief Sets all bits in the bitmap to true
+ARROW_EXPORT
+void SetBitmap(uint8_t* data, int64_t offset, int64_t length);
+
+/// \brief Clears all bits in the bitmap (set to false)
+ARROW_EXPORT
+void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
+
+/// Returns a mask with lower i bits set to 1. If i >= sizeof(Word)*8, all-ones will be
+/// returned
+/// ex: PrecedingWordBitmask<uint8_t>(4) == 0x0f (see the static_asserts below)
+/// ref: https://stackoverflow.com/a/59523400
+template <typename Word>
+constexpr Word PrecedingWordBitmask(unsigned int const i) {
+ return (static_cast<Word>(i < sizeof(Word) * 8) << (i & (sizeof(Word) * 8 - 1))) - 1;
+}
+static_assert(PrecedingWordBitmask<uint8_t>(0) == 0x00, "");
+static_assert(PrecedingWordBitmask<uint8_t>(4) == 0x0f, "");
+static_assert(PrecedingWordBitmask<uint8_t>(8) == 0xff, "");
+static_assert(PrecedingWordBitmask<uint16_t>(8) == 0x00ff, "");
+
+/// \brief Create a word with low `n` bits from `low` and high `sizeof(Word)-n` bits
+/// from `high`.
+/// Word ret
+/// for (i = 0; i < sizeof(Word)*8; i++){
+/// ret[i] = i < n ? low[i] : high[i];
+/// }
+template <typename Word>
+constexpr Word SpliceWord(int n, Word low, Word high) {
+ return (high & ~PrecedingWordBitmask<Word>(n)) | (low & PrecedingWordBitmask<Word>(n));
+}
+
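
In the same spirit as the PrecedingWordBitmask checks above, a few compile-time
examples for SpliceWord (they hold if placed inside this namespace):

static_assert(SpliceWord<uint8_t>(0, /*low=*/0xAA, /*high=*/0x55) == 0x55, "");
static_assert(SpliceWord<uint8_t>(3, /*low=*/0x05, /*high=*/0xF0) == 0xF5, "");
static_assert(SpliceWord<uint8_t>(8, /*low=*/0xAA, /*high=*/0x55) == 0xAA, "");
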
} // namespace BitUtil
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc
index bd389138316..33d1dee1957 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.cc
@@ -46,16 +46,16 @@ std::string Bitmap::Diff(const Bitmap& other) const {
return ToArray()->Diff(*other.ToArray());
}
-void Bitmap::CopyFrom(const Bitmap& other) {
- ::arrow::internal::CopyBitmap(other.buffer_->data(), other.offset_, other.length_,
- buffer_->mutable_data(), offset_);
-}
-
-void Bitmap::CopyFromInverted(const Bitmap& other) {
- ::arrow::internal::InvertBitmap(other.buffer_->data(), other.offset_, other.length_,
- buffer_->mutable_data(), offset_);
-}
-
+void Bitmap::CopyFrom(const Bitmap& other) {
+ ::arrow::internal::CopyBitmap(other.buffer_->data(), other.offset_, other.length_,
+ buffer_->mutable_data(), offset_);
+}
+
+void Bitmap::CopyFromInverted(const Bitmap& other) {
+ ::arrow::internal::InvertBitmap(other.buffer_->data(), other.offset_, other.length_,
+ buffer_->mutable_data(), offset_);
+}
+
bool Bitmap::Equals(const Bitmap& other) const {
if (length_ != other.length_) {
return false;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h
index 13e7c5dc00a..141f863c0b8 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap.h
@@ -29,11 +29,11 @@
#include "arrow/buffer.h"
#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/bitmap_reader.h"
-#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/bitmap_writer.h"
#include "arrow/util/compare.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/functional.h"
#include "arrow/util/string_builder.h"
#include "arrow/util/string_view.h"
@@ -90,13 +90,13 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
BitUtil::SetBitTo(buffer_->mutable_data(), i + offset_, v);
}
- void SetBitsTo(bool v) {
- BitUtil::SetBitsTo(buffer_->mutable_data(), offset_, length_, v);
- }
-
- void CopyFrom(const Bitmap& other);
- void CopyFromInverted(const Bitmap& other);
-
+ void SetBitsTo(bool v) {
+ BitUtil::SetBitsTo(buffer_->mutable_data(), offset_, length_, v);
+ }
+
+ void CopyFrom(const Bitmap& other);
+ void CopyFromInverted(const Bitmap& other);
+
/// \brief Visit bits from each bitmap as bitset<N>
///
/// All bitmaps must have identical length.
@@ -112,21 +112,21 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
}
}
- /// \brief Visit bits from each bitmap as bitset<N>
- ///
- /// All bitmaps must have identical length.
- template <size_t N, typename Visitor>
- static void VisitBits(const std::array<Bitmap, N>& bitmaps, Visitor&& visitor) {
- int64_t bit_length = BitLength(bitmaps);
- std::bitset<N> bits;
- for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
- for (size_t i = 0; i < N; ++i) {
- bits[i] = bitmaps[i].GetBit(bit_i);
- }
- visitor(bits);
- }
- }
-
+ /// \brief Visit bits from each bitmap as bitset<N>
+ ///
+ /// All bitmaps must have identical length.
+ template <size_t N, typename Visitor>
+ static void VisitBits(const std::array<Bitmap, N>& bitmaps, Visitor&& visitor) {
+ int64_t bit_length = BitLength(bitmaps);
+ std::bitset<N> bits;
+ for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
+ for (size_t i = 0; i < N; ++i) {
+ bits[i] = bitmaps[i].GetBit(bit_i);
+ }
+ visitor(bits);
+ }
+ }
+
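
For instance, counting the positions where two validity bitmaps are both set
could look like the fragment below. It is a sketch: the arrow::internal
namespace and the Buffer-based constructor are assumptions implied but not
shown by this hunk, and buffer_a / buffer_b stand for two
std::shared_ptr<Buffer> values covering at least 100 bits each.

std::array<arrow::internal::Bitmap, 2> bitmaps = {
    arrow::internal::Bitmap(buffer_a, /*offset=*/0, /*length=*/100),
    arrow::internal::Bitmap(buffer_b, /*offset=*/0, /*length=*/100)};
int64_t both_set = 0;
arrow::internal::Bitmap::VisitBits(bitmaps, [&](std::bitset<2> bits) {
  both_set += (bits[0] && bits[1]) ? 1 : 0;
});
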
/// \brief Visit words of bits from each bitmap as array<Word, N>
///
/// All bitmaps must have identical length. The first bit in a visited bitmap
@@ -135,14 +135,14 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
/// returned.
///
/// TODO(bkietz) allow for early termination
- // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
- // It also has a large prolog / epilog overhead and should be used
- // carefully in other cases.
- // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
- // and BitmapUInt64Reader.
+ // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
+ // It also has a large prolog / epilog overhead and should be used
+ // carefully in other cases.
+ // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
+ // and BitmapUInt64Reader.
template <size_t N, typename Visitor,
- typename Word = typename std::decay<
- internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+ typename Word = typename std::decay<
+ internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
static int64_t VisitWords(const Bitmap (&bitmaps_arg)[N], Visitor&& visitor) {
constexpr int64_t kBitWidth = sizeof(Word) * 8;
@@ -243,132 +243,132 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
return min_offset;
}
- template <size_t N, size_t M, typename ReaderT, typename WriterT, typename Visitor,
- typename Word = typename std::decay<
- internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
- static void RunVisitWordsAndWriteLoop(int64_t bit_length,
- std::array<ReaderT, N>& readers,
- std::array<WriterT, M>& writers,
- Visitor&& visitor) {
- constexpr int64_t kBitWidth = sizeof(Word) * 8;
-
- std::array<Word, N> visited_words;
- std::array<Word, M> output_words;
-
-    // every reader will have the same number of words, since the bitmaps all have the same length
- // TODO($JIRA) this will be inefficient in some cases. When there are offsets beyond
- // Word boundary, every Word would have to be created from 2 adjoining Words
- auto n_words = readers[0].words();
- bit_length -= n_words * kBitWidth;
- while (n_words--) {
- // first collect all words to visited_words array
- for (size_t i = 0; i < N; i++) {
- visited_words[i] = readers[i].NextWord();
- }
- visitor(visited_words, &output_words);
- for (size_t i = 0; i < M; i++) {
- writers[i].PutNextWord(output_words[i]);
- }
- }
-
-    // every reader will have the same number of trailing bytes, for the same reason as above
-    // the trailing portion can be more than one word! (ref: BitmapWordReader constructor)
-    // remaining full / partial words still need to be written
-
- if (bit_length) {
- // convert the word visitor lambda to a byte_visitor
- auto byte_visitor = [&](const std::array<uint8_t, N>& in,
- std::array<uint8_t, M>* out) {
- std::array<Word, N> in_words;
- std::array<Word, M> out_words;
- std::copy(in.begin(), in.end(), in_words.begin());
- visitor(in_words, &out_words);
- for (size_t i = 0; i < M; i++) {
- out->at(i) = static_cast<uint8_t>(out_words[i]);
- }
- };
-
- std::array<uint8_t, N> visited_bytes;
- std::array<uint8_t, M> output_bytes;
- int n_bytes = readers[0].trailing_bytes();
- while (n_bytes--) {
- visited_bytes.fill(0);
- output_bytes.fill(0);
- int valid_bits;
- for (size_t i = 0; i < N; i++) {
- visited_bytes[i] = readers[i].NextTrailingByte(valid_bits);
- }
- byte_visitor(visited_bytes, &output_bytes);
- for (size_t i = 0; i < M; i++) {
- writers[i].PutNextTrailingByte(output_bytes[i], valid_bits);
- }
- }
- }
- }
-
- /// \brief Visit words of bits from each input bitmap as array<Word, N> and collects
- /// outputs to an array<Word, M>, to be written into the output bitmaps accordingly.
- ///
- /// All bitmaps must have identical length. The first bit in a visited bitmap
- /// may be offset within the first visited word, but words will otherwise contain
- /// densely packed bits loaded from the bitmap. That offset within the first word is
- /// returned.
- /// Visitor is expected to have the following signature
- /// [](const std::array<Word, N>& in_words, std::array<Word, M>* out_words){...}
- ///
- // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
- // It also has a large prolog / epilog overhead and should be used
- // carefully in other cases.
- // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
- // and BitmapUInt64Reader.
- template <size_t N, size_t M, typename Visitor,
- typename Word = typename std::decay<
- internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
- static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
- std::array<Bitmap, M>* out_bitmaps_arg,
- Visitor&& visitor) {
- int64_t bit_length = BitLength(bitmaps_arg);
- assert(bit_length == BitLength(*out_bitmaps_arg));
-
- // if both input and output bitmaps have no byte offset, then use special template
- if (std::all_of(bitmaps_arg.begin(), bitmaps_arg.end(),
- [](const Bitmap& b) { return b.offset_ % 8 == 0; }) &&
- std::all_of(out_bitmaps_arg->begin(), out_bitmaps_arg->end(),
- [](const Bitmap& b) { return b.offset_ % 8 == 0; })) {
- std::array<BitmapWordReader<Word, /*may_have_byte_offset=*/false>, N> readers;
- for (size_t i = 0; i < N; ++i) {
- const Bitmap& in_bitmap = bitmaps_arg[i];
- readers[i] = BitmapWordReader<Word, /*may_have_byte_offset=*/false>(
- in_bitmap.buffer_->data(), in_bitmap.offset_, in_bitmap.length_);
- }
-
- std::array<BitmapWordWriter<Word, /*may_have_byte_offset=*/false>, M> writers;
- for (size_t i = 0; i < M; ++i) {
- const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
- writers[i] = BitmapWordWriter<Word, /*may_have_byte_offset=*/false>(
- out_bitmap.buffer_->mutable_data(), out_bitmap.offset_, out_bitmap.length_);
- }
-
- RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
- } else {
- std::array<BitmapWordReader<Word>, N> readers;
- for (size_t i = 0; i < N; ++i) {
- const Bitmap& in_bitmap = bitmaps_arg[i];
- readers[i] = BitmapWordReader<Word>(in_bitmap.buffer_->data(), in_bitmap.offset_,
- in_bitmap.length_);
- }
-
- std::array<BitmapWordWriter<Word>, M> writers;
- for (size_t i = 0; i < M; ++i) {
- const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
- writers[i] = BitmapWordWriter<Word>(out_bitmap.buffer_->mutable_data(),
- out_bitmap.offset_, out_bitmap.length_);
- }
-
- RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
- }
- }
-
+ template <size_t N, size_t M, typename ReaderT, typename WriterT, typename Visitor,
+ typename Word = typename std::decay<
+ internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+ static void RunVisitWordsAndWriteLoop(int64_t bit_length,
+ std::array<ReaderT, N>& readers,
+ std::array<WriterT, M>& writers,
+ Visitor&& visitor) {
+ constexpr int64_t kBitWidth = sizeof(Word) * 8;
+
+ std::array<Word, N> visited_words;
+ std::array<Word, M> output_words;
+
+    // every reader will have the same number of words, since the bitmaps all have the same length
+ // TODO($JIRA) this will be inefficient in some cases. When there are offsets beyond
+ // Word boundary, every Word would have to be created from 2 adjoining Words
+ auto n_words = readers[0].words();
+ bit_length -= n_words * kBitWidth;
+ while (n_words--) {
+ // first collect all words to visited_words array
+ for (size_t i = 0; i < N; i++) {
+ visited_words[i] = readers[i].NextWord();
+ }
+ visitor(visited_words, &output_words);
+ for (size_t i = 0; i < M; i++) {
+ writers[i].PutNextWord(output_words[i]);
+ }
+ }
+
+    // every reader will have the same number of trailing bytes, for the same reason as above
+    // the trailing portion can be more than one word! (ref: BitmapWordReader constructor)
+    // remaining full / partial words still need to be written
+
+ if (bit_length) {
+ // convert the word visitor lambda to a byte_visitor
+ auto byte_visitor = [&](const std::array<uint8_t, N>& in,
+ std::array<uint8_t, M>* out) {
+ std::array<Word, N> in_words;
+ std::array<Word, M> out_words;
+ std::copy(in.begin(), in.end(), in_words.begin());
+ visitor(in_words, &out_words);
+ for (size_t i = 0; i < M; i++) {
+ out->at(i) = static_cast<uint8_t>(out_words[i]);
+ }
+ };
+
+ std::array<uint8_t, N> visited_bytes;
+ std::array<uint8_t, M> output_bytes;
+ int n_bytes = readers[0].trailing_bytes();
+ while (n_bytes--) {
+ visited_bytes.fill(0);
+ output_bytes.fill(0);
+ int valid_bits;
+ for (size_t i = 0; i < N; i++) {
+ visited_bytes[i] = readers[i].NextTrailingByte(valid_bits);
+ }
+ byte_visitor(visited_bytes, &output_bytes);
+ for (size_t i = 0; i < M; i++) {
+ writers[i].PutNextTrailingByte(output_bytes[i], valid_bits);
+ }
+ }
+ }
+ }
+
+ /// \brief Visit words of bits from each input bitmap as array<Word, N> and collects
+ /// outputs to an array<Word, M>, to be written into the output bitmaps accordingly.
+ ///
+ /// All bitmaps must have identical length. The first bit in a visited bitmap
+ /// may be offset within the first visited word, but words will otherwise contain
+ /// densely packed bits loaded from the bitmap. That offset within the first word is
+ /// returned.
+ /// Visitor is expected to have the following signature
+ /// [](const std::array<Word, N>& in_words, std::array<Word, M>* out_words){...}
+ ///
+ // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
+ // It also has a large prolog / epilog overhead and should be used
+ // carefully in other cases.
+ // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
+ // and BitmapUInt64Reader.
+ template <size_t N, size_t M, typename Visitor,
+ typename Word = typename std::decay<
+ internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+ static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
+ std::array<Bitmap, M>* out_bitmaps_arg,
+ Visitor&& visitor) {
+ int64_t bit_length = BitLength(bitmaps_arg);
+ assert(bit_length == BitLength(*out_bitmaps_arg));
+
+ // if both input and output bitmaps have no byte offset, then use special template
+ if (std::all_of(bitmaps_arg.begin(), bitmaps_arg.end(),
+ [](const Bitmap& b) { return b.offset_ % 8 == 0; }) &&
+ std::all_of(out_bitmaps_arg->begin(), out_bitmaps_arg->end(),
+ [](const Bitmap& b) { return b.offset_ % 8 == 0; })) {
+ std::array<BitmapWordReader<Word, /*may_have_byte_offset=*/false>, N> readers;
+ for (size_t i = 0; i < N; ++i) {
+ const Bitmap& in_bitmap = bitmaps_arg[i];
+ readers[i] = BitmapWordReader<Word, /*may_have_byte_offset=*/false>(
+ in_bitmap.buffer_->data(), in_bitmap.offset_, in_bitmap.length_);
+ }
+
+ std::array<BitmapWordWriter<Word, /*may_have_byte_offset=*/false>, M> writers;
+ for (size_t i = 0; i < M; ++i) {
+ const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
+ writers[i] = BitmapWordWriter<Word, /*may_have_byte_offset=*/false>(
+ out_bitmap.buffer_->mutable_data(), out_bitmap.offset_, out_bitmap.length_);
+ }
+
+ RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
+ } else {
+ std::array<BitmapWordReader<Word>, N> readers;
+ for (size_t i = 0; i < N; ++i) {
+ const Bitmap& in_bitmap = bitmaps_arg[i];
+ readers[i] = BitmapWordReader<Word>(in_bitmap.buffer_->data(), in_bitmap.offset_,
+ in_bitmap.length_);
+ }
+
+ std::array<BitmapWordWriter<Word>, M> writers;
+ for (size_t i = 0; i < M; ++i) {
+ const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
+ writers[i] = BitmapWordWriter<Word>(out_bitmap.buffer_->mutable_data(),
+ out_bitmap.offset_, out_bitmap.length_);
+ }
+
+ RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
+ }
+ }
+
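
As a usage sketch (same assumptions as the VisitBits fragment above, with
bitmap_a, bitmap_b and bitmap_out standing for pre-built Bitmap instances of
equal length), an AND-NOT of two inputs into one output via the word visitor:

std::array<arrow::internal::Bitmap, 2> in = {bitmap_a, bitmap_b};
std::array<arrow::internal::Bitmap, 1> out = {bitmap_out};
arrow::internal::Bitmap::VisitWordsAndWrite(
    in, &out,
    [](const std::array<uint64_t, 2>& in_words, std::array<uint64_t, 1>* out_words) {
      out_words->at(0) = in_words[0] & ~in_words[1];
    });
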
const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
/// offset of first bit relative to buffer().data()
@@ -445,14 +445,14 @@ class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
/// assert bitmaps have identical length and return that length
static int64_t BitLength(const Bitmap* bitmaps, size_t N);
- template <size_t N>
- static int64_t BitLength(const std::array<Bitmap, N>& bitmaps) {
- for (size_t i = 1; i < N; ++i) {
- assert(bitmaps[i].length() == bitmaps[0].length());
- }
- return bitmaps[0].length();
- }
-
+ template <size_t N>
+ static int64_t BitLength(const std::array<Bitmap, N>& bitmaps) {
+ for (size_t i = 1; i < N; ++i) {
+ assert(bitmaps[i].length() == bitmaps[0].length());
+ }
+ return bitmaps[0].length();
+ }
+
std::shared_ptr<Buffer> buffer_;
int64_t offset_ = 0, length_ = 0;
};
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h
index 68a9016d8a0..129fa913231 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_generate.h
@@ -62,9 +62,9 @@ void GenerateBits(uint8_t* bitmap, int64_t start_offset, int64_t length, Generat
template <class Generator>
void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
Generator&& g) {
- static_assert(std::is_same<typename std::result_of<Generator && ()>::type, bool>::value,
- "Functor passed to GenerateBitsUnrolled must return bool");
-
+ static_assert(std::is_same<typename std::result_of<Generator && ()>::type, bool>::value,
+ "Functor passed to GenerateBitsUnrolled must return bool");
+
if (length == 0) {
return;
}
@@ -77,7 +77,7 @@ void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
if (bit_mask != 0x01) {
current_byte = *cur & BitUtil::kPrecedingBitmask[start_bit_offset];
while (bit_mask != 0 && remaining > 0) {
- current_byte |= g() * bit_mask;
+ current_byte |= g() * bit_mask;
bit_mask = static_cast<uint8_t>(bit_mask << 1);
--remaining;
}
@@ -85,14 +85,14 @@ void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
}
int64_t remaining_bytes = remaining / 8;
- uint8_t out_results[8];
+ uint8_t out_results[8];
while (remaining_bytes-- > 0) {
- for (int i = 0; i < 8; ++i) {
- out_results[i] = g();
- }
- *cur++ = (out_results[0] | out_results[1] << 1 | out_results[2] << 2 |
- out_results[3] << 3 | out_results[4] << 4 | out_results[5] << 5 |
- out_results[6] << 6 | out_results[7] << 7);
+ for (int i = 0; i < 8; ++i) {
+ out_results[i] = g();
+ }
+ *cur++ = (out_results[0] | out_results[1] << 1 | out_results[2] << 2 |
+ out_results[3] << 3 | out_results[4] << 4 | out_results[5] << 5 |
+ out_results[6] << 6 | out_results[7] << 7);
}
int64_t remaining_bits = remaining % 8;
@@ -100,7 +100,7 @@ void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
current_byte = 0;
bit_mask = 0x01;
while (remaining_bits-- > 0) {
- current_byte |= g() * bit_mask;
+ current_byte |= g() * bit_mask;
bit_mask = static_cast<uint8_t>(bit_mask << 1);
}
*cur++ = current_byte;
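
A usage sketch for the unrolled generator (the arrow::internal namespace is an
assumption; the enclosing namespace is not visible in this hunk). The functor
must return bool, per the static_assert earlier in this file:

uint8_t bitmap[4] = {0};
int i = 0;
arrow::internal::GenerateBitsUnrolled(
    bitmap, /*start_offset=*/0, /*length=*/32,
    [&]() -> bool { return (i++ % 2) == 0; });
// Every even bit is set, so each byte ends up 0x55 (0b01010101).
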
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc
index afbad3f8aba..63c8b008f4a 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.cc
@@ -172,7 +172,7 @@ Result<std::shared_ptr<Buffer>> CopyBitmap(MemoryPool* pool, const uint8_t* data
}
Result<std::shared_ptr<Buffer>> InvertBitmap(MemoryPool* pool, const uint8_t* data,
- int64_t offset, int64_t length) {
+ int64_t offset, int64_t length) {
return TransferBitmap<TransferMode::Invert>(pool, data, offset, length);
}
@@ -215,26 +215,26 @@ bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right
return true;
}
-bool OptionalBitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length) {
- if (left == nullptr && right == nullptr) {
- return true;
- } else if (left != nullptr && right != nullptr) {
- return BitmapEquals(left, left_offset, right, right_offset, length);
- } else if (left != nullptr) {
- return CountSetBits(left, left_offset, length) == length;
- } else {
- return CountSetBits(right, right_offset, length) == length;
- }
-}
-
-bool OptionalBitmapEquals(const std::shared_ptr<Buffer>& left, int64_t left_offset,
- const std::shared_ptr<Buffer>& right, int64_t right_offset,
- int64_t length) {
- return OptionalBitmapEquals(left ? left->data() : nullptr, left_offset,
- right ? right->data() : nullptr, right_offset, length);
-}
-
+bool OptionalBitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length) {
+ if (left == nullptr && right == nullptr) {
+ return true;
+ } else if (left != nullptr && right != nullptr) {
+ return BitmapEquals(left, left_offset, right, right_offset, length);
+ } else if (left != nullptr) {
+ return CountSetBits(left, left_offset, length) == length;
+ } else {
+ return CountSetBits(right, right_offset, length) == length;
+ }
+}
+
+bool OptionalBitmapEquals(const std::shared_ptr<Buffer>& left, int64_t left_offset,
+ const std::shared_ptr<Buffer>& right, int64_t right_offset,
+ int64_t length) {
+ return OptionalBitmapEquals(left ? left->data() : nullptr, left_offset,
+ right ? right->data() : nullptr, right_offset, length);
+}
+
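
The null-means-all-valid convention in action (a sketch):

uint8_t all_set = 0xFF;
uint8_t holes = 0xF0;
arrow::internal::OptionalBitmapEquals(nullptr, 0, &all_set, 0, 8);  // true
arrow::internal::OptionalBitmapEquals(nullptr, 0, &holes, 0, 8);    // false: four bits unset
arrow::internal::OptionalBitmapEquals(nullptr, 0, nullptr, 0, 8);   // true
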
namespace {
template <template <typename> class BitOp>
@@ -346,42 +346,42 @@ void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right,
BitmapOp<std::bit_xor>(left, left_offset, right, right_offset, length, out_offset, out);
}
-template <typename T>
-struct AndNotOp {
- constexpr T operator()(const T& l, const T& r) const { return l & ~r; }
-};
-
-Result<std::shared_ptr<Buffer>> BitmapAndNot(MemoryPool* pool, const uint8_t* left,
- int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length,
- int64_t out_offset) {
- return BitmapOp<AndNotOp>(pool, left, left_offset, right, right_offset, length,
- out_offset);
-}
-
-void BitmapAndNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length, int64_t out_offset,
- uint8_t* out) {
- BitmapOp<AndNotOp>(left, left_offset, right, right_offset, length, out_offset, out);
-}
-
-template <typename T>
-struct OrNotOp {
- constexpr T operator()(const T& l, const T& r) const { return l | ~r; }
-};
-
-Result<std::shared_ptr<Buffer>> BitmapOrNot(MemoryPool* pool, const uint8_t* left,
- int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length,
- int64_t out_offset) {
- return BitmapOp<OrNotOp>(pool, left, left_offset, right, right_offset, length,
- out_offset);
-}
-
-void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out) {
- BitmapOp<OrNotOp>(left, left_offset, right, right_offset, length, out_offset, out);
-}
-
+template <typename T>
+struct AndNotOp {
+ constexpr T operator()(const T& l, const T& r) const { return l & ~r; }
+};
+
+Result<std::shared_ptr<Buffer>> BitmapAndNot(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset) {
+ return BitmapOp<AndNotOp>(pool, left, left_offset, right, right_offset, length,
+ out_offset);
+}
+
+void BitmapAndNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset,
+ uint8_t* out) {
+ BitmapOp<AndNotOp>(left, left_offset, right, right_offset, length, out_offset, out);
+}
+
+template <typename T>
+struct OrNotOp {
+ constexpr T operator()(const T& l, const T& r) const { return l | ~r; }
+};
+
+Result<std::shared_ptr<Buffer>> BitmapOrNot(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset) {
+ return BitmapOp<OrNotOp>(pool, left, left_offset, right, right_offset, length,
+ out_offset);
+}
+
+void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out) {
+ BitmapOp<OrNotOp>(left, left_offset, right, right_offset, length, out_offset, out);
+}
+
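
And the preallocated-output variant of BitmapAndNot, on a single byte (a
sketch):

uint8_t a = 0b10101010;
uint8_t b = 0b11001100;
uint8_t out = 0;
arrow::internal::BitmapAndNot(&a, 0, &b, 0, /*length=*/8, /*out_offset=*/0, &out);
// out == (a & ~b) == 0b00100010
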
} // namespace internal
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h
index ad9990459de..40a7797a239 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_ops.h
@@ -96,17 +96,17 @@ ARROW_EXPORT
bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
int64_t right_offset, int64_t length);
-// Same as BitmapEquals, but considers a NULL bitmap pointer the same as an
-// all-ones bitmap.
-ARROW_EXPORT
-bool OptionalBitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length);
-
-ARROW_EXPORT
-bool OptionalBitmapEquals(const std::shared_ptr<Buffer>& left, int64_t left_offset,
- const std::shared_ptr<Buffer>& right, int64_t right_offset,
- int64_t length);
-
+// Same as BitmapEquals, but considers a NULL bitmap pointer the same as an
+// all-ones bitmap.
+ARROW_EXPORT
+bool OptionalBitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length);
+
+ARROW_EXPORT
+bool OptionalBitmapEquals(const std::shared_ptr<Buffer>& left, int64_t left_offset,
+ const std::shared_ptr<Buffer>& right, int64_t right_offset,
+ int64_t length);
+
/// \brief Do a "bitmap and" on right and left buffers starting at
/// their respective bit-offsets for the given bit-length and put
/// the results in out_buffer starting at the given bit-offset.
@@ -164,43 +164,43 @@ ARROW_EXPORT
void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right,
int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
-/// \brief Do a "bitmap and not" on right and left buffers starting at
-/// their respective bit-offsets for the given bit-length and put
-/// the results in out_buffer starting at the given bit-offset.
-///
-/// out_buffer will be allocated and initialized to zeros using pool before
-/// the operation.
-ARROW_EXPORT
-Result<std::shared_ptr<Buffer>> BitmapAndNot(MemoryPool* pool, const uint8_t* left,
- int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length,
- int64_t out_offset);
-
-/// \brief Do a "bitmap and not" on right and left buffers starting at
-/// their respective bit-offsets for the given bit-length and put
-/// the results in out starting at the given bit-offset.
-ARROW_EXPORT
-void BitmapAndNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
-
-/// \brief Do a "bitmap or not" on right and left buffers starting at
-/// their respective bit-offsets for the given bit-length and put
-/// the results in out_buffer starting at the given bit-offset.
-///
-/// out_buffer will be allocated and initialized to zeros using pool before
-/// the operation.
-ARROW_EXPORT
-Result<std::shared_ptr<Buffer>> BitmapOrNot(MemoryPool* pool, const uint8_t* left,
- int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length,
- int64_t out_offset);
-
-/// \brief Do a "bitmap or not" on right and left buffers starting at
-/// their respective bit-offsets for the given bit-length and put
-/// the results in out starting at the given bit-offset.
-ARROW_EXPORT
-void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
-
+/// \brief Do a "bitmap and not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out_buffer starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapAndNot(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset);
+
+/// \brief Do a "bitmap and not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapAndNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap or not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out_buffer starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapOrNot(MemoryPool* pool, const uint8_t* left,
+ int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length,
+ int64_t out_offset);
+
+/// \brief Do a "bitmap or not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+ int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
} // namespace internal
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h
index c0f08ff249c..7c43747fafb 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_reader.h
@@ -22,7 +22,7 @@
#include "arrow/buffer.h"
#include "arrow/util/bit_util.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/macros.h"
namespace arrow {
@@ -70,190 +70,190 @@ class BitmapReader {
int64_t bit_offset_;
};
-// XXX Cannot name it BitmapWordReader because the name is already used
-// in bitmap_ops.cc
-
-class BitmapUInt64Reader {
- public:
- BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
- : bitmap_(bitmap + start_offset / 8),
- num_carry_bits_(8 - start_offset % 8),
- length_(length),
- remaining_length_(length_) {
- if (length_ > 0) {
- // Load carry bits from the first byte's MSBs
- if (length_ >= num_carry_bits_) {
- carry_bits_ =
- LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_);
- } else {
- carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_);
- }
- }
- }
-
- uint64_t NextWord() {
- if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) {
- // We can load a full word
- uint64_t next_word = LoadFullWord();
- // Carry bits come first, then the (64 - num_carry_bits_) LSBs from next_word
- uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
- carry_bits_ = next_word >> (64 - num_carry_bits_);
- remaining_length_ -= 64;
- return word;
- } else if (remaining_length_ > num_carry_bits_) {
- // We can load a partial word
- uint64_t next_word =
- LoadPartialWord(/*bit_offset=*/0, remaining_length_ - num_carry_bits_);
- uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
- carry_bits_ = next_word >> (64 - num_carry_bits_);
- remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0);
- return word;
- } else {
- remaining_length_ = 0;
- return carry_bits_;
- }
- }
-
- int64_t position() const { return length_ - remaining_length_; }
-
- int64_t length() const { return length_; }
-
- private:
- uint64_t LoadFullWord() {
- uint64_t word;
- memcpy(&word, bitmap_, 8);
- bitmap_ += 8;
- return BitUtil::ToLittleEndian(word);
- }
-
- uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
- uint64_t word = 0;
- const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
- memcpy(&word, bitmap_, num_bytes);
- bitmap_ += num_bytes;
- return (BitUtil::ToLittleEndian(word) >> bit_offset) &
- BitUtil::LeastSignificantBitMask(num_bits);
- }
-
- const uint8_t* bitmap_;
- const int64_t num_carry_bits_; // in [1, 8]
- const int64_t length_;
- int64_t remaining_length_;
- uint64_t carry_bits_;
-};
-
-// BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h)
-// on sufficiently large inputs. However, it has a larger prolog / epilog overhead
-// and should probably not be used for small bitmaps.
-
-template <typename Word, bool may_have_byte_offset = true>
-class BitmapWordReader {
- public:
- BitmapWordReader() = default;
- BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
- : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
- bitmap_(bitmap + offset / 8),
- bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)) {
- // decrement word count by one as we may touch two adjacent words in one iteration
- nwords_ = length / (sizeof(Word) * 8) - 1;
- if (nwords_ < 0) {
- nwords_ = 0;
- }
- trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
- trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_));
-
- if (nwords_ > 0) {
- current_word_ = load<Word>(bitmap_);
- } else if (length > 0) {
- current_byte_ = load<uint8_t>(bitmap_);
- }
- }
-
- Word NextWord() {
- bitmap_ += sizeof(Word);
- const Word next_word = load<Word>(bitmap_);
- Word word = current_word_;
- if (may_have_byte_offset && offset_) {
- // combine two adjacent words into one word
- // |<------ next ----->|<---- current ---->|
- // +-------------+-----+-------------+-----+
- // | --- | A | B | --- |
- // +-------------+-----+-------------+-----+
- // | | offset
- // v v
- // +-----+-------------+
- // | A | B |
- // +-----+-------------+
- // |<------ word ----->|
- word >>= offset_;
- word |= next_word << (sizeof(Word) * 8 - offset_);
- }
- current_word_ = next_word;
- return word;
- }
-
- uint8_t NextTrailingByte(int& valid_bits) {
- uint8_t byte;
- assert(trailing_bits_ > 0);
-
- if (trailing_bits_ <= 8) {
- // last byte
- valid_bits = trailing_bits_;
- trailing_bits_ = 0;
- byte = 0;
- internal::BitmapReader reader(bitmap_, offset_, valid_bits);
- for (int i = 0; i < valid_bits; ++i) {
- byte >>= 1;
- if (reader.IsSet()) {
- byte |= 0x80;
- }
- reader.Next();
- }
- byte >>= (8 - valid_bits);
- } else {
- ++bitmap_;
- const uint8_t next_byte = load<uint8_t>(bitmap_);
- byte = current_byte_;
- if (may_have_byte_offset && offset_) {
- byte >>= offset_;
- byte |= next_byte << (8 - offset_);
- }
- current_byte_ = next_byte;
- trailing_bits_ -= 8;
- trailing_bytes_--;
- valid_bits = 8;
- }
- return byte;
- }
-
- int64_t words() const { return nwords_; }
- int trailing_bytes() const { return trailing_bytes_; }
-
- private:
- int64_t offset_;
- const uint8_t* bitmap_;
-
- const uint8_t* bitmap_end_;
- int64_t nwords_;
- int trailing_bits_;
- int trailing_bytes_;
- union {
- Word current_word_;
- struct {
-#if ARROW_LITTLE_ENDIAN == 0
- uint8_t padding_bytes_[sizeof(Word) - 1];
-#endif
- uint8_t current_byte_;
- };
- };
-
- template <typename DType>
- DType load(const uint8_t* bitmap) {
- assert(bitmap + sizeof(DType) <= bitmap_end_);
- return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
- }
-};
-
+// XXX Cannot name it BitmapWordReader because the name is already used
+// in bitmap_ops.cc
+
+class BitmapUInt64Reader {
+ public:
+ BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+ : bitmap_(bitmap + start_offset / 8),
+ num_carry_bits_(8 - start_offset % 8),
+ length_(length),
+ remaining_length_(length_) {
+ if (length_ > 0) {
+ // Load carry bits from the first byte's MSBs
+ if (length_ >= num_carry_bits_) {
+ carry_bits_ =
+ LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_);
+ } else {
+ carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_);
+ }
+ }
+ }
+
+ uint64_t NextWord() {
+ if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) {
+ // We can load a full word
+ uint64_t next_word = LoadFullWord();
+ // Carry bits come first, then the (64 - num_carry_bits_) LSBs from next_word
+ uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
+ carry_bits_ = next_word >> (64 - num_carry_bits_);
+ remaining_length_ -= 64;
+ return word;
+ } else if (remaining_length_ > num_carry_bits_) {
+ // We can load a partial word
+ uint64_t next_word =
+ LoadPartialWord(/*bit_offset=*/0, remaining_length_ - num_carry_bits_);
+ uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
+ carry_bits_ = next_word >> (64 - num_carry_bits_);
+ remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0);
+ return word;
+ } else {
+ remaining_length_ = 0;
+ return carry_bits_;
+ }
+ }
+
+ int64_t position() const { return length_ - remaining_length_; }
+
+ int64_t length() const { return length_; }
+
+ private:
+ uint64_t LoadFullWord() {
+ uint64_t word;
+ memcpy(&word, bitmap_, 8);
+ bitmap_ += 8;
+ return BitUtil::ToLittleEndian(word);
+ }
+
+ uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
+ uint64_t word = 0;
+ const int64_t num_bytes = BitUtil::BytesForBits(num_bits);
+ memcpy(&word, bitmap_, num_bytes);
+ bitmap_ += num_bytes;
+ return (BitUtil::ToLittleEndian(word) >> bit_offset) &
+ BitUtil::LeastSignificantBitMask(num_bits);
+ }
+
+ const uint8_t* bitmap_;
+ const int64_t num_carry_bits_; // in [1, 8]
+ const int64_t length_;
+ int64_t remaining_length_;
+ uint64_t carry_bits_;
+};
+
+// BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h)
+// on sufficiently large inputs. However, it has a larger prolog / epilog overhead
+// and should probably not be used for small bitmaps.
+
+template <typename Word, bool may_have_byte_offset = true>
+class BitmapWordReader {
+ public:
+ BitmapWordReader() = default;
+ BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
+ : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
+ bitmap_(bitmap + offset / 8),
+ bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)) {
+ // decrement word count by one as we may touch two adjacent words in one iteration
+ nwords_ = length / (sizeof(Word) * 8) - 1;
+ if (nwords_ < 0) {
+ nwords_ = 0;
+ }
+ trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
+ trailing_bytes_ = static_cast<int>(BitUtil::BytesForBits(trailing_bits_));
+
+ if (nwords_ > 0) {
+ current_word_ = load<Word>(bitmap_);
+ } else if (length > 0) {
+ current_byte_ = load<uint8_t>(bitmap_);
+ }
+ }
+
+ Word NextWord() {
+ bitmap_ += sizeof(Word);
+ const Word next_word = load<Word>(bitmap_);
+ Word word = current_word_;
+ if (may_have_byte_offset && offset_) {
+ // combine two adjacent words into one word
+ // |<------ next ----->|<---- current ---->|
+ // +-------------+-----+-------------+-----+
+ // | --- | A | B | --- |
+ // +-------------+-----+-------------+-----+
+ // | | offset
+ // v v
+ // +-----+-------------+
+ // | A | B |
+ // +-----+-------------+
+ // |<------ word ----->|
+ word >>= offset_;
+ word |= next_word << (sizeof(Word) * 8 - offset_);
+ }
+ current_word_ = next_word;
+ return word;
+ }
+
+ uint8_t NextTrailingByte(int& valid_bits) {
+ uint8_t byte;
+ assert(trailing_bits_ > 0);
+
+ if (trailing_bits_ <= 8) {
+ // last byte
+ valid_bits = trailing_bits_;
+ trailing_bits_ = 0;
+ byte = 0;
+ internal::BitmapReader reader(bitmap_, offset_, valid_bits);
+ for (int i = 0; i < valid_bits; ++i) {
+ byte >>= 1;
+ if (reader.IsSet()) {
+ byte |= 0x80;
+ }
+ reader.Next();
+ }
+ byte >>= (8 - valid_bits);
+ } else {
+ ++bitmap_;
+ const uint8_t next_byte = load<uint8_t>(bitmap_);
+ byte = current_byte_;
+ if (may_have_byte_offset && offset_) {
+ byte >>= offset_;
+ byte |= next_byte << (8 - offset_);
+ }
+ current_byte_ = next_byte;
+ trailing_bits_ -= 8;
+ trailing_bytes_--;
+ valid_bits = 8;
+ }
+ return byte;
+ }
+
+ int64_t words() const { return nwords_; }
+ int trailing_bytes() const { return trailing_bytes_; }
+
+ private:
+ int64_t offset_;
+ const uint8_t* bitmap_;
+
+ const uint8_t* bitmap_end_;
+ int64_t nwords_;
+ int trailing_bits_;
+ int trailing_bytes_;
+ union {
+ Word current_word_;
+ struct {
+#if ARROW_LITTLE_ENDIAN == 0
+ uint8_t padding_bytes_[sizeof(Word) - 1];
+#endif
+ uint8_t current_byte_;
+ };
+ };
+
+ template <typename DType>
+ DType load(const uint8_t* bitmap) {
+ assert(bitmap + sizeof(DType) <= bitmap_end_);
+ return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
+ }
+};
+
/// \brief Index into a possibly non-existent bitmap
struct OptionalBitIndexer {
const uint8_t* bitmap;
@@ -263,7 +263,7 @@ struct OptionalBitIndexer {
: bitmap(buffer == NULLPTR ? NULLPTR : buffer->data()), offset(offset) {}
bool operator[](int64_t i) const {
- return bitmap == NULLPTR || BitUtil::GetBit(bitmap, offset + i);
+ return bitmap == NULLPTR || BitUtil::GetBit(bitmap, offset + i);
}
};
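// Editor's note: a short sketch (not part of the diff) of the intended use of
// the BitmapUInt64Reader restored above: consume a bitmap 64 bits at a time
// from an arbitrary starting bit-offset. The final partial word is masked to
// zero past `length`, so popcounting whole words is safe. Assumes
// BitUtil::PopCount from arrow/util/bit_util.h.

#include <cstdint>
#include <iostream>

#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_reader.h"

int64_t CountSetBits(const uint8_t* bitmap, int64_t offset, int64_t length) {
  arrow::internal::BitmapUInt64Reader reader(bitmap, offset, length);
  int64_t count = 0;
  while (reader.position() < reader.length()) {
    count += arrow::BitUtil::PopCount(reader.NextWord());
  }
  return count;
}

int main() {
  const uint8_t bitmap[] = {0xFF, 0x0F, 0x00, 0xF0};
  // Bits 4..27: expected count is 8 (4 from 0xFF, 4 from 0x0F).
  std::cout << CountSetBits(bitmap, /*offset=*/4, /*length=*/24) << "\n";
  return 0;
}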
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h
index dc495d1135b..8a16993e052 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_visit.h
@@ -1,88 +1,88 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_reader.h"
-
-namespace arrow {
-namespace internal {
-
-// A function that visits each bit in a bitmap and calls a visitor function with a
-// boolean representation of that bit. This is intended to be analogous to
-// GenerateBits.
-template <class Visitor>
-void VisitBits(const uint8_t* bitmap, int64_t start_offset, int64_t length,
- Visitor&& visit) {
- BitmapReader reader(bitmap, start_offset, length);
- for (int64_t index = 0; index < length; ++index) {
- visit(reader.IsSet());
- reader.Next();
- }
-}
-
-// Like VisitBits(), but unrolls its main loop for better performance.
-template <class Visitor>
-void VisitBitsUnrolled(const uint8_t* bitmap, int64_t start_offset, int64_t length,
- Visitor&& visit) {
- if (length == 0) {
- return;
- }
-
- // Start by visiting any bits preceding the first full byte.
- int64_t num_bits_before_full_bytes =
- BitUtil::RoundUpToMultipleOf8(start_offset) - start_offset;
- // Truncate num_bits_before_full_bytes if it is greater than length.
- if (num_bits_before_full_bytes > length) {
- num_bits_before_full_bytes = length;
- }
- // Use the non-loop-unrolled VisitBits since we don't want to add branches.
- VisitBits<Visitor>(bitmap, start_offset, num_bits_before_full_bytes, visit);
-
- // Shift the start pointer to the first full byte and compute the
- // number of full bytes to be read.
- const uint8_t* first_full_byte = bitmap + BitUtil::CeilDiv(start_offset, 8);
- const int64_t num_full_bytes = (length - num_bits_before_full_bytes) / 8;
-
- // Iterate over each full byte of the input bitmap and call the visitor in
- // a loop-unrolled manner.
- for (int64_t byte_index = 0; byte_index < num_full_bytes; ++byte_index) {
- // Get the current bit-packed byte value from the bitmap.
- const uint8_t byte = *(first_full_byte + byte_index);
-
- // Execute the visitor function on each bit of the current byte.
- visit(BitUtil::GetBitFromByte(byte, 0));
- visit(BitUtil::GetBitFromByte(byte, 1));
- visit(BitUtil::GetBitFromByte(byte, 2));
- visit(BitUtil::GetBitFromByte(byte, 3));
- visit(BitUtil::GetBitFromByte(byte, 4));
- visit(BitUtil::GetBitFromByte(byte, 5));
- visit(BitUtil::GetBitFromByte(byte, 6));
- visit(BitUtil::GetBitFromByte(byte, 7));
- }
-
- // Visit any leftover bits in the last byte.
- const int64_t num_bits_after_full_bytes = (length - num_bits_before_full_bytes) % 8;
- VisitBits<Visitor>(first_full_byte + num_full_bytes, 0, num_bits_after_full_bytes,
- visit);
-}
-
-} // namespace internal
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
+
+namespace arrow {
+namespace internal {
+
+// A function that visits each bit in a bitmap and calls a visitor function with a
+// boolean representation of that bit. This is intended to be analogous to
+// GenerateBits.
+template <class Visitor>
+void VisitBits(const uint8_t* bitmap, int64_t start_offset, int64_t length,
+ Visitor&& visit) {
+ BitmapReader reader(bitmap, start_offset, length);
+ for (int64_t index = 0; index < length; ++index) {
+ visit(reader.IsSet());
+ reader.Next();
+ }
+}
+
+// Like VisitBits(), but unrolls its main loop for better performance.
+template <class Visitor>
+void VisitBitsUnrolled(const uint8_t* bitmap, int64_t start_offset, int64_t length,
+ Visitor&& visit) {
+ if (length == 0) {
+ return;
+ }
+
+ // Start by visiting any bits preceding the first full byte.
+ int64_t num_bits_before_full_bytes =
+ BitUtil::RoundUpToMultipleOf8(start_offset) - start_offset;
+ // Truncate num_bits_before_full_bytes if it is greater than length.
+ if (num_bits_before_full_bytes > length) {
+ num_bits_before_full_bytes = length;
+ }
+ // Use the non-loop-unrolled VisitBits since we don't want to add branches.
+ VisitBits<Visitor>(bitmap, start_offset, num_bits_before_full_bytes, visit);
+
+ // Shift the start pointer to the first full byte and compute the
+ // number of full bytes to be read.
+ const uint8_t* first_full_byte = bitmap + BitUtil::CeilDiv(start_offset, 8);
+ const int64_t num_full_bytes = (length - num_bits_before_full_bytes) / 8;
+
+ // Iterate over each full byte of the input bitmap and call the visitor in
+ // a loop-unrolled manner.
+ for (int64_t byte_index = 0; byte_index < num_full_bytes; ++byte_index) {
+ // Get the current bit-packed byte value from the bitmap.
+ const uint8_t byte = *(first_full_byte + byte_index);
+
+ // Execute the visitor function on each bit of the current byte.
+ visit(BitUtil::GetBitFromByte(byte, 0));
+ visit(BitUtil::GetBitFromByte(byte, 1));
+ visit(BitUtil::GetBitFromByte(byte, 2));
+ visit(BitUtil::GetBitFromByte(byte, 3));
+ visit(BitUtil::GetBitFromByte(byte, 4));
+ visit(BitUtil::GetBitFromByte(byte, 5));
+ visit(BitUtil::GetBitFromByte(byte, 6));
+ visit(BitUtil::GetBitFromByte(byte, 7));
+ }
+
+ // Visit any leftover bits in the last byte.
+ const int64_t num_bits_after_full_bytes = (length - num_bits_before_full_bytes) % 8;
+ VisitBits<Visitor>(first_full_byte + num_full_bytes, 0, num_bits_after_full_bytes,
+ visit);
+}
+
+} // namespace internal
+} // namespace arrow
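// Editor's note: a minimal sketch (not part of the diff) of the VisitBits API
// restored above; VisitBitsUnrolled is a drop-in replacement that pays off on
// larger bitmaps.

#include <cstdint>
#include <iostream>
#include <vector>

#include "arrow/util/bitmap_visit.h"

int main() {
  const uint8_t bitmap[] = {0xB2};  // 0b10110010, bits packed LSB-first
  std::vector<bool> bits;
  arrow::internal::VisitBits(bitmap, /*start_offset=*/1, /*length=*/6,
                             [&](bool bit) { bits.push_back(bit); });
  for (bool b : bits) std::cout << b;  // prints 100110 (bits 1..6)
  std::cout << "\n";
  return 0;
}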
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h
index 096cfc8655a..d5c6d909df0 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bitmap_writer.h
@@ -21,7 +21,7 @@
#include <cstring>
#include "arrow/util/bit_util.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/macros.h"
namespace arrow {
@@ -180,106 +180,106 @@ class FirstTimeBitmapWriter {
int64_t byte_offset_;
};
-template <typename Word, bool may_have_byte_offset = true>
-class BitmapWordWriter {
- public:
- BitmapWordWriter() = default;
- BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length)
- : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
- bitmap_(bitmap + offset / 8),
- bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)),
- mask_((1U << offset_) - 1) {
- if (offset_) {
- if (length >= static_cast<int>(sizeof(Word) * 8)) {
- current_word_ = load<Word>(bitmap_);
- } else if (length > 0) {
- current_byte_ = load<uint8_t>(bitmap_);
- }
- }
- }
-
- void PutNextWord(Word word) {
- if (may_have_byte_offset && offset_) {
- // split one word into two adjacent words, don't touch unused bits
- // |<------ word ----->|
- // +-----+-------------+
- // | A | B |
- // +-----+-------------+
- // | |
- // v v offset
- // +-------------+-----+-------------+-----+
- // | --- | A | B | --- |
- // +-------------+-----+-------------+-----+
- // |<------ next ----->|<---- current ---->|
- word = (word << offset_) | (word >> (sizeof(Word) * 8 - offset_));
- Word next_word = load<Word>(bitmap_ + sizeof(Word));
- current_word_ = (current_word_ & mask_) | (word & ~mask_);
- next_word = (next_word & ~mask_) | (word & mask_);
- store<Word>(bitmap_, current_word_);
- store<Word>(bitmap_ + sizeof(Word), next_word);
- current_word_ = next_word;
- } else {
- store<Word>(bitmap_, word);
- }
- bitmap_ += sizeof(Word);
- }
-
- void PutNextTrailingByte(uint8_t byte, int valid_bits) {
- if (valid_bits == 8) {
- if (may_have_byte_offset && offset_) {
- byte = (byte << offset_) | (byte >> (8 - offset_));
- uint8_t next_byte = load<uint8_t>(bitmap_ + 1);
- current_byte_ = (current_byte_ & mask_) | (byte & ~mask_);
- next_byte = (next_byte & ~mask_) | (byte & mask_);
- store<uint8_t>(bitmap_, current_byte_);
- store<uint8_t>(bitmap_ + 1, next_byte);
- current_byte_ = next_byte;
- } else {
- store<uint8_t>(bitmap_, byte);
- }
- ++bitmap_;
- } else {
- assert(valid_bits > 0);
- assert(valid_bits < 8);
- assert(bitmap_ + BitUtil::BytesForBits(offset_ + valid_bits) <= bitmap_end_);
- internal::BitmapWriter writer(bitmap_, offset_, valid_bits);
- for (int i = 0; i < valid_bits; ++i) {
- (byte & 0x01) ? writer.Set() : writer.Clear();
- writer.Next();
- byte >>= 1;
- }
- writer.Finish();
- }
- }
-
- private:
- int64_t offset_;
- uint8_t* bitmap_;
-
- const uint8_t* bitmap_end_;
- uint64_t mask_;
- union {
- Word current_word_;
- struct {
-#if ARROW_LITTLE_ENDIAN == 0
- uint8_t padding_bytes_[sizeof(Word) - 1];
-#endif
- uint8_t current_byte_;
- };
- };
-
- template <typename DType>
- DType load(const uint8_t* bitmap) {
- assert(bitmap + sizeof(DType) <= bitmap_end_);
- return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
- }
-
- template <typename DType>
- void store(uint8_t* bitmap, DType data) {
- assert(bitmap + sizeof(DType) <= bitmap_end_);
- util::SafeStore(bitmap, BitUtil::FromLittleEndian(data));
- }
-};
-
+template <typename Word, bool may_have_byte_offset = true>
+class BitmapWordWriter {
+ public:
+ BitmapWordWriter() = default;
+ BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length)
+ : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
+ bitmap_(bitmap + offset / 8),
+ bitmap_end_(bitmap_ + BitUtil::BytesForBits(offset_ + length)),
+ mask_((1U << offset_) - 1) {
+ if (offset_) {
+ if (length >= static_cast<int>(sizeof(Word) * 8)) {
+ current_word_ = load<Word>(bitmap_);
+ } else if (length > 0) {
+ current_byte_ = load<uint8_t>(bitmap_);
+ }
+ }
+ }
+
+ void PutNextWord(Word word) {
+ if (may_have_byte_offset && offset_) {
+ // split one word into two adjacent words, don't touch unused bits
+ // |<------ word ----->|
+ // +-----+-------------+
+ // | A | B |
+ // +-----+-------------+
+ // | |
+ // v v offset
+ // +-------------+-----+-------------+-----+
+ // | --- | A | B | --- |
+ // +-------------+-----+-------------+-----+
+ // |<------ next ----->|<---- current ---->|
+ word = (word << offset_) | (word >> (sizeof(Word) * 8 - offset_));
+ Word next_word = load<Word>(bitmap_ + sizeof(Word));
+ current_word_ = (current_word_ & mask_) | (word & ~mask_);
+ next_word = (next_word & ~mask_) | (word & mask_);
+ store<Word>(bitmap_, current_word_);
+ store<Word>(bitmap_ + sizeof(Word), next_word);
+ current_word_ = next_word;
+ } else {
+ store<Word>(bitmap_, word);
+ }
+ bitmap_ += sizeof(Word);
+ }
+
+ void PutNextTrailingByte(uint8_t byte, int valid_bits) {
+ if (valid_bits == 8) {
+ if (may_have_byte_offset && offset_) {
+ byte = (byte << offset_) | (byte >> (8 - offset_));
+ uint8_t next_byte = load<uint8_t>(bitmap_ + 1);
+ current_byte_ = (current_byte_ & mask_) | (byte & ~mask_);
+ next_byte = (next_byte & ~mask_) | (byte & mask_);
+ store<uint8_t>(bitmap_, current_byte_);
+ store<uint8_t>(bitmap_ + 1, next_byte);
+ current_byte_ = next_byte;
+ } else {
+ store<uint8_t>(bitmap_, byte);
+ }
+ ++bitmap_;
+ } else {
+ assert(valid_bits > 0);
+ assert(valid_bits < 8);
+ assert(bitmap_ + BitUtil::BytesForBits(offset_ + valid_bits) <= bitmap_end_);
+ internal::BitmapWriter writer(bitmap_, offset_, valid_bits);
+ for (int i = 0; i < valid_bits; ++i) {
+ (byte & 0x01) ? writer.Set() : writer.Clear();
+ writer.Next();
+ byte >>= 1;
+ }
+ writer.Finish();
+ }
+ }
+
+ private:
+ int64_t offset_;
+ uint8_t* bitmap_;
+
+ const uint8_t* bitmap_end_;
+ uint64_t mask_;
+ union {
+ Word current_word_;
+ struct {
+#if ARROW_LITTLE_ENDIAN == 0
+ uint8_t padding_bytes_[sizeof(Word) - 1];
+#endif
+ uint8_t current_byte_;
+ };
+ };
+
+ template <typename DType>
+ DType load(const uint8_t* bitmap) {
+ assert(bitmap + sizeof(DType) <= bitmap_end_);
+ return BitUtil::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
+ }
+
+ template <typename DType>
+ void store(uint8_t* bitmap, DType data) {
+ assert(bitmap + sizeof(DType) <= bitmap_end_);
+ util::SafeStore(bitmap, BitUtil::FromLittleEndian(data));
+ }
+};
+
} // namespace internal
} // namespace arrow
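// Editor's note: a sketch (not part of the diff) pairing BitmapWordReader
// (bitmap_reader.h) with the BitmapWordWriter restored above to copy a bit
// range between unaligned offsets, the usage pattern these classes are built
// for: a word loop over the full 64-bit words, then a trailing-byte loop.

#include <cstdint>
#include <iostream>

#include "arrow/util/bitmap_reader.h"
#include "arrow/util/bitmap_writer.h"

void CopyBits(const uint8_t* src, int64_t src_offset, uint8_t* dest,
              int64_t dest_offset, int64_t length) {
  arrow::internal::BitmapWordReader<uint64_t> reader(src, src_offset, length);
  arrow::internal::BitmapWordWriter<uint64_t> writer(dest, dest_offset, length);
  for (int64_t i = 0; i < reader.words(); ++i) {
    writer.PutNextWord(reader.NextWord());
  }
  for (int i = 0; i < reader.trailing_bytes(); ++i) {
    int valid_bits = 0;
    const uint8_t byte = reader.NextTrailingByte(valid_bits);
    writer.PutNextTrailingByte(byte, valid_bits);
  }
}

int main() {
  const uint8_t src[4] = {0xDE, 0xAD, 0xBE, 0xEF};
  uint8_t dest[4] = {0, 0, 0, 0};
  // Copy 24 bits from bit 3 of src into dest starting at bit 5.
  CopyBits(src, /*src_offset=*/3, dest, /*dest_offset=*/5, /*length=*/24);
  std::cout << std::hex << static_cast<int>(dest[0]) << "\n";  // 60
  return 0;
}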
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc
index 538b7382e43..d9cafd602a2 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.cc
@@ -27,9 +27,9 @@
#if defined(ARROW_HAVE_RUNTIME_AVX512)
#error #include "arrow/util/bpacking_avx512.h"
#endif
-#if defined(ARROW_HAVE_NEON)
-#error #include "arrow/util/bpacking_neon.h"
-#endif
+#if defined(ARROW_HAVE_NEON)
+#error #include "arrow/util/bpacking_neon.h"
+#endif
namespace arrow {
namespace internal {
@@ -166,12 +166,12 @@ struct Unpack32DynamicFunction {
} // namespace
int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
-#if defined(ARROW_HAVE_NEON)
- return unpack32_neon(in, out, batch_size, num_bits);
-#else
+#if defined(ARROW_HAVE_NEON)
+ return unpack32_neon(in, out, batch_size, num_bits);
+#else
static DynamicDispatch<Unpack32DynamicFunction> dispatch;
return dispatch.func(in, out, batch_size, num_bits);
-#endif
+#endif
}
} // namespace internal
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h
index 7f4ca3e384c..e5a4dbbed89 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/bpacking.h
@@ -17,7 +17,7 @@
#pragma once
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/visibility.h"
#include <stdint.h>
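// Editor's note: a small sketch (not part of the diff) of the unpack32 entry
// point declared in bpacking.h, assuming the LSB-first bit packing used by
// Parquet: 32 four-bit values occupy four 32-bit words and are expanded to
// one uint32_t per value.

#include <cstdint>
#include <iostream>

#include "arrow/util/bpacking.h"

int main() {
  // Each word packs eight 4-bit values from the low bits up, so
  // 0x76543210 holds the values 0, 1, 2, ..., 7.
  const uint32_t packed[4] = {0x76543210, 0xFEDCBA98,
                              0x76543210, 0xFEDCBA98};
  uint32_t out[32];
  arrow::internal::unpack32(packed, out, /*batch_size=*/32, /*num_bits=*/4);
  for (int i = 0; i < 8; ++i) std::cout << out[i] << " ";  // 0 1 2 ... 7
  std::cout << "\n";
  return 0;
}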
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h
index 53627aee18a..28dcce52bb8 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/byte_stream_split.h
@@ -1,626 +1,626 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/util/simd.h"
-#include "arrow/util/ubsan.h"
-
-#include <stdint.h>
-#include <algorithm>
-
-#ifdef ARROW_HAVE_SSE4_2
-// Enable SIMD for the ByteStreamSplit encoder/decoder
-#define ARROW_HAVE_SIMD_SPLIT
-#endif // ARROW_HAVE_SSE4_2
-
-namespace arrow {
-namespace util {
-namespace internal {
-
-#if defined(ARROW_HAVE_SSE4_2)
-template <typename T>
-void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t stride,
- T* out) {
- constexpr size_t kNumStreams = sizeof(T);
- static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
- constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
-
- const int64_t size = num_values * sizeof(T);
- constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams;
- const int64_t num_blocks = size / kBlockSize;
- uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
-
- // First handle suffix.
- // This helps catch cases where the SIMD-based processing overflows into
- // the suffix, since a test would then almost surely fail.
- const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
- for (int64_t i = num_processed_elements; i < num_values; ++i) {
- uint8_t gathered_byte_data[kNumStreams];
- for (size_t b = 0; b < kNumStreams; ++b) {
- const size_t byte_index = b * stride + i;
- gathered_byte_data[b] = data[byte_index];
- }
- out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
- }
-
- // The blocks get processed hierarchically using the unpack intrinsics.
- // Example with four streams:
- // Stage 1: AAAA BBBB CCCC DDDD
- // Stage 2: ACAC ACAC BDBD BDBD
- // Stage 3: ABCD ABCD ABCD ABCD
- __m128i stage[kNumStreamsLog2 + 1U][kNumStreams];
- constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
-
- for (int64_t i = 0; i < num_blocks; ++i) {
- for (size_t j = 0; j < kNumStreams; ++j) {
- stage[0][j] = _mm_loadu_si128(
- reinterpret_cast<const __m128i*>(&data[i * sizeof(__m128i) + j * stride]));
- }
- for (size_t step = 0; step < kNumStreamsLog2; ++step) {
- for (size_t j = 0; j < kNumStreamsHalf; ++j) {
- stage[step + 1U][j * 2] =
- _mm_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
- stage[step + 1U][j * 2 + 1U] =
- _mm_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
- }
- }
- for (size_t j = 0; j < kNumStreams; ++j) {
- _mm_storeu_si128(reinterpret_cast<__m128i*>(
- &output_data[(i * kNumStreams + j) * sizeof(__m128i)]),
- stage[kNumStreamsLog2][j]);
- }
- }
-}
-
-template <typename T>
-void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const size_t num_values,
- uint8_t* output_buffer_raw) {
- constexpr size_t kNumStreams = sizeof(T);
- static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
- __m128i stage[3][kNumStreams];
- __m128i final_result[kNumStreams];
-
- const size_t size = num_values * sizeof(T);
- constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams;
- const size_t num_blocks = size / kBlockSize;
- const __m128i* raw_values_sse = reinterpret_cast<const __m128i*>(raw_values);
- __m128i* output_buffer_streams[kNumStreams];
- for (size_t i = 0; i < kNumStreams; ++i) {
- output_buffer_streams[i] =
- reinterpret_cast<__m128i*>(&output_buffer_raw[num_values * i]);
- }
-
- // First handle suffix.
- const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
- for (size_t i = num_processed_elements; i < num_values; ++i) {
- for (size_t j = 0U; j < kNumStreams; ++j) {
- const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
- output_buffer_raw[j * num_values + i] = byte_in_value;
- }
- }
- // The current shuffling algorithm diverges for float and double types but the compiler
- // should be able to remove the branch since only one path is taken for each template
- // instantiation.
- // Example run for floats:
- // Step 0, copy:
- // 0: ABCD ABCD ABCD ABCD 1: ABCD ABCD ABCD ABCD ...
- // Step 1: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
- //   0: AABB CCDD AABB CCDD 1: AABB CCDD AABB CCDD ...
- // Step 2: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
- //   0: AAAA BBBB CCCC DDDD 1: AAAA BBBB CCCC DDDD ...
- // Step 3: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
- //   0: AAAA AAAA BBBB BBBB 1: CCCC CCCC DDDD DDDD ...
- // Step 4: _mm_unpacklo_epi64 and _mm_unpackhi_epi64:
- // 0: AAAA AAAA AAAA AAAA 1: BBBB BBBB BBBB BBBB ...
- for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
- // First copy the data to stage 0.
- for (size_t i = 0; i < kNumStreams; ++i) {
- stage[0][i] = _mm_loadu_si128(&raw_values_sse[block_index * kNumStreams + i]);
- }
-
- // The shuffling of bytes is performed through the unpack intrinsics.
- // In my measurements this gives better performance then an implementation
- // which uses the shuffle intrinsics.
- for (size_t stage_lvl = 0; stage_lvl < 2U; ++stage_lvl) {
- for (size_t i = 0; i < kNumStreams / 2U; ++i) {
- stage[stage_lvl + 1][i * 2] =
- _mm_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
- stage[stage_lvl + 1][i * 2 + 1] =
- _mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
- }
- }
- if (kNumStreams == 8U) {
- // This is the path for double.
- __m128i tmp[8];
- for (size_t i = 0; i < 4; ++i) {
- tmp[i * 2] = _mm_unpacklo_epi32(stage[2][i], stage[2][i + 4]);
- tmp[i * 2 + 1] = _mm_unpackhi_epi32(stage[2][i], stage[2][i + 4]);
- }
-
- for (size_t i = 0; i < 4; ++i) {
- final_result[i * 2] = _mm_unpacklo_epi32(tmp[i], tmp[i + 4]);
- final_result[i * 2 + 1] = _mm_unpackhi_epi32(tmp[i], tmp[i + 4]);
- }
- } else {
- // this is the path for float.
- __m128i tmp[4];
- for (size_t i = 0; i < 2; ++i) {
- tmp[i * 2] = _mm_unpacklo_epi8(stage[2][i * 2], stage[2][i * 2 + 1]);
- tmp[i * 2 + 1] = _mm_unpackhi_epi8(stage[2][i * 2], stage[2][i * 2 + 1]);
- }
- for (size_t i = 0; i < 2; ++i) {
- final_result[i * 2] = _mm_unpacklo_epi64(tmp[i], tmp[i + 2]);
- final_result[i * 2 + 1] = _mm_unpackhi_epi64(tmp[i], tmp[i + 2]);
- }
- }
- for (size_t i = 0; i < kNumStreams; ++i) {
- _mm_storeu_si128(&output_buffer_streams[i][block_index], final_result[i]);
- }
- }
-}
-#endif // ARROW_HAVE_SSE4_2
-
-#if defined(ARROW_HAVE_AVX2)
-template <typename T>
-void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t num_values, int64_t stride,
- T* out) {
- constexpr size_t kNumStreams = sizeof(T);
- static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
- constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
-
- const int64_t size = num_values * sizeof(T);
- constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams;
- if (size < kBlockSize) // Back to SSE for small size
- return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
- const int64_t num_blocks = size / kBlockSize;
- uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
-
- // First handle suffix.
- const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
- for (int64_t i = num_processed_elements; i < num_values; ++i) {
- uint8_t gathered_byte_data[kNumStreams];
- for (size_t b = 0; b < kNumStreams; ++b) {
- const size_t byte_index = b * stride + i;
- gathered_byte_data[b] = data[byte_index];
- }
- out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
- }
-
- // Processed hierarchically using unpack intrinsics, then permute intrinsics.
- __m256i stage[kNumStreamsLog2 + 1U][kNumStreams];
- __m256i final_result[kNumStreams];
- constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
-
- for (int64_t i = 0; i < num_blocks; ++i) {
- for (size_t j = 0; j < kNumStreams; ++j) {
- stage[0][j] = _mm256_loadu_si256(
- reinterpret_cast<const __m256i*>(&data[i * sizeof(__m256i) + j * stride]));
- }
-
- for (size_t step = 0; step < kNumStreamsLog2; ++step) {
- for (size_t j = 0; j < kNumStreamsHalf; ++j) {
- stage[step + 1U][j * 2] =
- _mm256_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
- stage[step + 1U][j * 2 + 1U] =
- _mm256_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
- }
- }
-
- if (kNumStreams == 8U) {
- // path for double, 128i index:
- // {0x00, 0x08}, {0x01, 0x09}, {0x02, 0x0A}, {0x03, 0x0B},
- // {0x04, 0x0C}, {0x05, 0x0D}, {0x06, 0x0E}, {0x07, 0x0F},
- final_result[0] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b00100000);
- final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b00100000);
- final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4],
- stage[kNumStreamsLog2][5], 0b00100000);
- final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6],
- stage[kNumStreamsLog2][7], 0b00100000);
- final_result[4] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b00110001);
- final_result[5] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b00110001);
- final_result[6] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4],
- stage[kNumStreamsLog2][5], 0b00110001);
- final_result[7] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6],
- stage[kNumStreamsLog2][7], 0b00110001);
- } else {
- // path for float, 128i index:
- // {0x00, 0x04}, {0x01, 0x05}, {0x02, 0x06}, {0x03, 0x07}
- final_result[0] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b00100000);
- final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b00100000);
- final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b00110001);
- final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b00110001);
- }
-
- for (size_t j = 0; j < kNumStreams; ++j) {
- _mm256_storeu_si256(reinterpret_cast<__m256i*>(
- &output_data[(i * kNumStreams + j) * sizeof(__m256i)]),
- final_result[j]);
- }
- }
-}
-
-template <typename T>
-void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_values,
- uint8_t* output_buffer_raw) {
- constexpr size_t kNumStreams = sizeof(T);
- static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
- if (kNumStreams == 8U) // Back to SSE, currently no path for double.
- return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
-
- const size_t size = num_values * sizeof(T);
- constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams;
- if (size < kBlockSize) // Back to SSE for small size
- return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
- const size_t num_blocks = size / kBlockSize;
- const __m256i* raw_values_simd = reinterpret_cast<const __m256i*>(raw_values);
- __m256i* output_buffer_streams[kNumStreams];
-
- for (size_t i = 0; i < kNumStreams; ++i) {
- output_buffer_streams[i] =
- reinterpret_cast<__m256i*>(&output_buffer_raw[num_values * i]);
- }
-
- // First handle suffix.
- const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
- for (size_t i = num_processed_elements; i < num_values; ++i) {
- for (size_t j = 0U; j < kNumStreams; ++j) {
- const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
- output_buffer_raw[j * num_values + i] = byte_in_value;
- }
- }
-
- // Path for float.
- // 1. Processed hierarchically to 32i blocks using the unpack intrinsics.
- // 2. Pack 128i block using _mm256_permutevar8x32_epi32.
- // 3. Pack final 256i block with _mm256_permute2x128_si256.
- constexpr size_t kNumUnpack = 3U;
- __m256i stage[kNumUnpack + 1][kNumStreams];
- static const __m256i kPermuteMask =
- _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
- __m256i permute[kNumStreams];
- __m256i final_result[kNumStreams];
-
- for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
- for (size_t i = 0; i < kNumStreams; ++i) {
- stage[0][i] = _mm256_loadu_si256(&raw_values_simd[block_index * kNumStreams + i]);
- }
-
- for (size_t stage_lvl = 0; stage_lvl < kNumUnpack; ++stage_lvl) {
- for (size_t i = 0; i < kNumStreams / 2U; ++i) {
- stage[stage_lvl + 1][i * 2] =
- _mm256_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
- stage[stage_lvl + 1][i * 2 + 1] =
- _mm256_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
- }
- }
-
- for (size_t i = 0; i < kNumStreams; ++i) {
- permute[i] = _mm256_permutevar8x32_epi32(stage[kNumUnpack][i], kPermuteMask);
- }
-
- final_result[0] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00100000);
- final_result[1] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00110001);
- final_result[2] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00100000);
- final_result[3] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00110001);
-
- for (size_t i = 0; i < kNumStreams; ++i) {
- _mm256_storeu_si256(&output_buffer_streams[i][block_index], final_result[i]);
- }
- }
-}
-#endif // ARROW_HAVE_AVX2
-
-#if defined(ARROW_HAVE_AVX512)
-template <typename T>
-void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_t stride,
- T* out) {
- constexpr size_t kNumStreams = sizeof(T);
- static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
- constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
-
- const int64_t size = num_values * sizeof(T);
- constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams;
- if (size < kBlockSize) // Back to AVX2 for small size
- return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
- const int64_t num_blocks = size / kBlockSize;
- uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
-
- // First handle suffix.
- const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
- for (int64_t i = num_processed_elements; i < num_values; ++i) {
- uint8_t gathered_byte_data[kNumStreams];
- for (size_t b = 0; b < kNumStreams; ++b) {
- const size_t byte_index = b * stride + i;
- gathered_byte_data[b] = data[byte_index];
- }
- out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
- }
-
- // Processed hierarchically using the unpack, then two shuffles.
- __m512i stage[kNumStreamsLog2 + 1U][kNumStreams];
- __m512i shuffle[kNumStreams];
- __m512i final_result[kNumStreams];
- constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
-
- for (int64_t i = 0; i < num_blocks; ++i) {
- for (size_t j = 0; j < kNumStreams; ++j) {
- stage[0][j] = _mm512_loadu_si512(
- reinterpret_cast<const __m512i*>(&data[i * sizeof(__m512i) + j * stride]));
- }
-
- for (size_t step = 0; step < kNumStreamsLog2; ++step) {
- for (size_t j = 0; j < kNumStreamsHalf; ++j) {
- stage[step + 1U][j * 2] =
- _mm512_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
- stage[step + 1U][j * 2 + 1U] =
- _mm512_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
- }
- }
-
- if (kNumStreams == 8U) {
- // path for double, 128i index:
- // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
- // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
- // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
- // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
- shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b01000100);
- shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b01000100);
- shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
- stage[kNumStreamsLog2][5], 0b01000100);
- shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
- stage[kNumStreamsLog2][7], 0b01000100);
- shuffle[4] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b11101110);
- shuffle[5] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b11101110);
- shuffle[6] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
- stage[kNumStreamsLog2][5], 0b11101110);
- shuffle[7] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
- stage[kNumStreamsLog2][7], 0b11101110);
-
- final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
- final_result[1] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
- final_result[2] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
- final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
- final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
- final_result[5] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
- final_result[6] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
- final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
- } else {
- // path for float, 128i index:
- // {0x00, 0x04, 0x08, 0x0C}, {0x01, 0x05, 0x09, 0x0D}
- // {0x02, 0x06, 0x0A, 0x0E}, {0x03, 0x07, 0x0B, 0x0F},
- shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b01000100);
- shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b01000100);
- shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
- stage[kNumStreamsLog2][1], 0b11101110);
- shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
- stage[kNumStreamsLog2][3], 0b11101110);
-
- final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
- final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
- final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
- final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
- }
-
- for (size_t j = 0; j < kNumStreams; ++j) {
- _mm512_storeu_si512(reinterpret_cast<__m512i*>(
- &output_data[(i * kNumStreams + j) * sizeof(__m512i)]),
- final_result[j]);
- }
- }
-}
-
-template <typename T>
-void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_values,
- uint8_t* output_buffer_raw) {
- constexpr size_t kNumStreams = sizeof(T);
- static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
- const size_t size = num_values * sizeof(T);
- constexpr size_t kBlockSize = sizeof(__m512i) * kNumStreams;
- if (size < kBlockSize) // Back to AVX2 for small size
- return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values, output_buffer_raw);
-
- const size_t num_blocks = size / kBlockSize;
- const __m512i* raw_values_simd = reinterpret_cast<const __m512i*>(raw_values);
- __m512i* output_buffer_streams[kNumStreams];
- for (size_t i = 0; i < kNumStreams; ++i) {
- output_buffer_streams[i] =
- reinterpret_cast<__m512i*>(&output_buffer_raw[num_values * i]);
- }
-
- // First handle suffix.
- const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
- for (size_t i = num_processed_elements; i < num_values; ++i) {
- for (size_t j = 0U; j < kNumStreams; ++j) {
- const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
- output_buffer_raw[j * num_values + i] = byte_in_value;
- }
- }
-
- constexpr size_t KNumUnpack = (kNumStreams == 8U) ? 2U : 3U;
- __m512i final_result[kNumStreams];
- __m512i unpack[KNumUnpack + 1][kNumStreams];
- __m512i permutex[kNumStreams];
- __m512i permutex_mask;
- if (kNumStreams == 8U) {
- // Use _mm512_set_epi32 since some older gcc versions lack _mm512_set_epi16.
- permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 0x000E0006,
- 0x001D0015, 0x000D0005, 0x001C0014, 0x000C0004,
- 0x001B0013, 0x000B0003, 0x001A0012, 0x000A0002,
- 0x00190011, 0x00090001, 0x00180010, 0x00080000);
- } else {
- permutex_mask = _mm512_set_epi32(0x0F, 0x0B, 0x07, 0x03, 0x0E, 0x0A, 0x06, 0x02, 0x0D,
- 0x09, 0x05, 0x01, 0x0C, 0x08, 0x04, 0x00);
- }
-
- for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
- for (size_t i = 0; i < kNumStreams; ++i) {
- unpack[0][i] = _mm512_loadu_si512(&raw_values_simd[block_index * kNumStreams + i]);
- }
-
- for (size_t unpack_lvl = 0; unpack_lvl < KNumUnpack; ++unpack_lvl) {
- for (size_t i = 0; i < kNumStreams / 2U; ++i) {
- unpack[unpack_lvl + 1][i * 2] = _mm512_unpacklo_epi8(
- unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
- unpack[unpack_lvl + 1][i * 2 + 1] = _mm512_unpackhi_epi8(
- unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
- }
- }
-
- if (kNumStreams == 8U) {
- // path for double
- // 1. unpack to epi16 block
- // 2. permutexvar_epi16 to 128i block
- // 3. shuffle 128i to final 512i target, index:
- // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
- // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
- // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
- // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
- for (size_t i = 0; i < kNumStreams; ++i)
- permutex[i] = _mm512_permutexvar_epi16(permutex_mask, unpack[KNumUnpack][i]);
-
- __m512i shuffle[kNumStreams];
- shuffle[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
- shuffle[1] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b01000100);
- shuffle[2] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
- shuffle[3] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b11101110);
- shuffle[4] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
- shuffle[5] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b01000100);
- shuffle[6] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
- shuffle[7] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b11101110);
-
- final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
- final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
- final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
- final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
- final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
- final_result[5] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
- final_result[6] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
- final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
- } else {
- // Path for float.
- // 1. Processed hierarchically to 32i blocks using the unpack intrinsics.
- // 2. Pack 128i lanes using _mm512_permutexvar_epi32.
- // 3. Pack the final 512i block with _mm512_shuffle_i32x4.
- for (size_t i = 0; i < kNumStreams; ++i)
- permutex[i] = _mm512_permutexvar_epi32(permutex_mask, unpack[KNumUnpack][i]);
-
- final_result[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
- final_result[1] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
- final_result[2] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
- final_result[3] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
- }
-
- for (size_t i = 0; i < kNumStreams; ++i) {
- _mm512_storeu_si512(&output_buffer_streams[i][block_index], final_result[i]);
- }
- }
-}
-#endif // ARROW_HAVE_AVX512
-
-#if defined(ARROW_HAVE_SIMD_SPLIT)
-template <typename T>
-void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values,
- int64_t stride, T* out) {
-#if defined(ARROW_HAVE_AVX512)
- return ByteStreamSplitDecodeAvx512(data, num_values, stride, out);
-#elif defined(ARROW_HAVE_AVX2)
- return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
-#elif defined(ARROW_HAVE_SSE4_2)
- return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
-#else
-#error "ByteStreamSplitDecodeSimd not implemented"
-#endif
-}
-
-template <typename T>
-void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, const size_t num_values,
- uint8_t* output_buffer_raw) {
-#if defined(ARROW_HAVE_AVX512)
- return ByteStreamSplitEncodeAvx512<T>(raw_values, num_values, output_buffer_raw);
-#elif defined(ARROW_HAVE_AVX2)
- return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values, output_buffer_raw);
-#elif defined(ARROW_HAVE_SSE4_2)
- return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
-#else
-#error "ByteStreamSplitEncodeSimd not implemented"
-#endif
-}
-#endif
-
-template <typename T>
-void ByteStreamSplitEncodeScalar(const uint8_t* raw_values, const size_t num_values,
- uint8_t* output_buffer_raw) {
- constexpr size_t kNumStreams = sizeof(T);
- for (size_t i = 0U; i < num_values; ++i) {
- for (size_t j = 0U; j < kNumStreams; ++j) {
- const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
- output_buffer_raw[j * num_values + i] = byte_in_value;
- }
- }
-}
-
-template <typename T>
-void ByteStreamSplitDecodeScalar(const uint8_t* data, int64_t num_values, int64_t stride,
- T* out) {
- constexpr size_t kNumStreams = sizeof(T);
- auto output_buffer_raw = reinterpret_cast<uint8_t*>(out);
-
- for (int64_t i = 0; i < num_values; ++i) {
- for (size_t b = 0; b < kNumStreams; ++b) {
- const size_t byte_index = b * stride + i;
- output_buffer_raw[i * kNumStreams + b] = data[byte_index];
- }
- }
-}
-
-template <typename T>
-void inline ByteStreamSplitEncode(const uint8_t* raw_values, const size_t num_values,
- uint8_t* output_buffer_raw) {
-#if defined(ARROW_HAVE_SIMD_SPLIT)
- return ByteStreamSplitEncodeSimd<T>(raw_values, num_values, output_buffer_raw);
-#else
- return ByteStreamSplitEncodeScalar<T>(raw_values, num_values, output_buffer_raw);
-#endif
-}
-
-template <typename T>
-void inline ByteStreamSplitDecode(const uint8_t* data, int64_t num_values, int64_t stride,
- T* out) {
-#if defined(ARROW_HAVE_SIMD_SPLIT)
- return ByteStreamSplitDecodeSimd(data, num_values, stride, out);
-#else
- return ByteStreamSplitDecodeScalar(data, num_values, stride, out);
-#endif
-}
-
-} // namespace internal
-} // namespace util
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/simd.h"
+#include "arrow/util/ubsan.h"
+
+#include <stdint.h>
+#include <algorithm>
+
+#ifdef ARROW_HAVE_SSE4_2
+// Enable SIMD for the ByteStreamSplit encoder/decoder
+#define ARROW_HAVE_SIMD_SPLIT
+#endif // ARROW_HAVE_SSE4_2
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+#if defined(ARROW_HAVE_SSE4_2)
+template <typename T>
+void ByteStreamSplitDecodeSse2(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+
+ const int64_t size = num_values * sizeof(T);
+ constexpr int64_t kBlockSize = sizeof(__m128i) * kNumStreams;
+ const int64_t num_blocks = size / kBlockSize;
+ uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
+
+ // First handle suffix.
+ // This helps catch cases where the SIMD-based processing overflows into
+ // the suffix, since a test would then almost surely fail.
+ const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
+ for (int64_t i = num_processed_elements; i < num_values; ++i) {
+ uint8_t gathered_byte_data[kNumStreams];
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * stride + i;
+ gathered_byte_data[b] = data[byte_index];
+ }
+ out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
+ }
+
+ // The blocks get processed hierarchically using the unpack intrinsics.
+ // Example with four streams:
+ // Stage 1: AAAA BBBB CCCC DDDD
+ // Stage 2: ACAC ACAC BDBD BDBD
+ // Stage 3: ABCD ABCD ABCD ABCD
+ __m128i stage[kNumStreamsLog2 + 1U][kNumStreams];
+ constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
+
+ for (int64_t i = 0; i < num_blocks; ++i) {
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ stage[0][j] = _mm_loadu_si128(
+ reinterpret_cast<const __m128i*>(&data[i * sizeof(__m128i) + j * stride]));
+ }
+ for (size_t step = 0; step < kNumStreamsLog2; ++step) {
+ for (size_t j = 0; j < kNumStreamsHalf; ++j) {
+ stage[step + 1U][j * 2] =
+ _mm_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ stage[step + 1U][j * 2 + 1U] =
+ _mm_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ }
+ }
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(
+ &output_data[(i * kNumStreams + j) * sizeof(__m128i)]),
+ stage[kNumStreamsLog2][j]);
+ }
+ }
+}
+
+template <typename T>
+void ByteStreamSplitEncodeSse2(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ __m128i stage[3][kNumStreams];
+ __m128i final_result[kNumStreams];
+
+ const size_t size = num_values * sizeof(T);
+ constexpr size_t kBlockSize = sizeof(__m128i) * kNumStreams;
+ const size_t num_blocks = size / kBlockSize;
+ const __m128i* raw_values_sse = reinterpret_cast<const __m128i*>(raw_values);
+ __m128i* output_buffer_streams[kNumStreams];
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ output_buffer_streams[i] =
+ reinterpret_cast<__m128i*>(&output_buffer_raw[num_values * i]);
+ }
+
+ // First handle suffix.
+ const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
+ for (size_t i = num_processed_elements; i < num_values; ++i) {
+ for (size_t j = 0U; j < kNumStreams; ++j) {
+ const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+ output_buffer_raw[j * num_values + i] = byte_in_value;
+ }
+ }
+ // The current shuffling algorithm diverges for float and double types but the compiler
+ // should be able to remove the branch since only one path is taken for each template
+ // instantiation.
+ // Example run for floats:
+ // Step 0, copy:
+ // 0: ABCD ABCD ABCD ABCD 1: ABCD ABCD ABCD ABCD ...
+ // Step 1: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
+ //   0: AABB CCDD AABB CCDD 1: AABB CCDD AABB CCDD ...
+ // Step 2: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
+ //   0: AAAA BBBB CCCC DDDD 1: AAAA BBBB CCCC DDDD ...
+ // Step 3: _mm_unpacklo_epi8 and _mm_unpackhi_epi8:
+ //   0: AAAA AAAA BBBB BBBB 1: CCCC CCCC DDDD DDDD ...
+ // Step 4: _mm_unpacklo_epi64 and _mm_unpackhi_epi64:
+ // 0: AAAA AAAA AAAA AAAA 1: BBBB BBBB BBBB BBBB ...
+ for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
+ // First copy the data to stage 0.
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ stage[0][i] = _mm_loadu_si128(&raw_values_sse[block_index * kNumStreams + i]);
+ }
+
+ // The shuffling of bytes is performed through the unpack intrinsics.
+    // In my measurements this gives better performance than an
+    // implementation which uses the shuffle intrinsics.
+ for (size_t stage_lvl = 0; stage_lvl < 2U; ++stage_lvl) {
+ for (size_t i = 0; i < kNumStreams / 2U; ++i) {
+ stage[stage_lvl + 1][i * 2] =
+ _mm_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+ stage[stage_lvl + 1][i * 2 + 1] =
+ _mm_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+ }
+ }
+ if (kNumStreams == 8U) {
+ // This is the path for double.
+ __m128i tmp[8];
+ for (size_t i = 0; i < 4; ++i) {
+ tmp[i * 2] = _mm_unpacklo_epi32(stage[2][i], stage[2][i + 4]);
+ tmp[i * 2 + 1] = _mm_unpackhi_epi32(stage[2][i], stage[2][i + 4]);
+ }
+
+ for (size_t i = 0; i < 4; ++i) {
+ final_result[i * 2] = _mm_unpacklo_epi32(tmp[i], tmp[i + 4]);
+ final_result[i * 2 + 1] = _mm_unpackhi_epi32(tmp[i], tmp[i + 4]);
+ }
+ } else {
+      // This is the path for float.
+ __m128i tmp[4];
+ for (size_t i = 0; i < 2; ++i) {
+ tmp[i * 2] = _mm_unpacklo_epi8(stage[2][i * 2], stage[2][i * 2 + 1]);
+ tmp[i * 2 + 1] = _mm_unpackhi_epi8(stage[2][i * 2], stage[2][i * 2 + 1]);
+ }
+ for (size_t i = 0; i < 2; ++i) {
+ final_result[i * 2] = _mm_unpacklo_epi64(tmp[i], tmp[i + 2]);
+ final_result[i * 2 + 1] = _mm_unpackhi_epi64(tmp[i], tmp[i + 2]);
+ }
+ }
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ _mm_storeu_si128(&output_buffer_streams[i][block_index], final_result[i]);
+ }
+ }
+}
+#endif // ARROW_HAVE_SSE4_2
+
+#if defined(ARROW_HAVE_AVX2)
+template <typename T>
+void ByteStreamSplitDecodeAvx2(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+
+ const int64_t size = num_values * sizeof(T);
+ constexpr int64_t kBlockSize = sizeof(__m256i) * kNumStreams;
+  if (size < kBlockSize) // Fall back to SSE for small sizes
+ return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
+ const int64_t num_blocks = size / kBlockSize;
+ uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
+
+ // First handle suffix.
+ const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
+ for (int64_t i = num_processed_elements; i < num_values; ++i) {
+ uint8_t gathered_byte_data[kNumStreams];
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * stride + i;
+ gathered_byte_data[b] = data[byte_index];
+ }
+ out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
+ }
+
+  // The blocks get processed hierarchically using the unpack intrinsics,
+  // then the permute intrinsics.
+ __m256i stage[kNumStreamsLog2 + 1U][kNumStreams];
+ __m256i final_result[kNumStreams];
+ constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
+
+ for (int64_t i = 0; i < num_blocks; ++i) {
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ stage[0][j] = _mm256_loadu_si256(
+ reinterpret_cast<const __m256i*>(&data[i * sizeof(__m256i) + j * stride]));
+ }
+
+ for (size_t step = 0; step < kNumStreamsLog2; ++step) {
+ for (size_t j = 0; j < kNumStreamsHalf; ++j) {
+ stage[step + 1U][j * 2] =
+ _mm256_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ stage[step + 1U][j * 2 + 1U] =
+ _mm256_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ }
+ }
+
+ if (kNumStreams == 8U) {
+ // path for double, 128i index:
+ // {0x00, 0x08}, {0x01, 0x09}, {0x02, 0x0A}, {0x03, 0x0B},
+ // {0x04, 0x0C}, {0x05, 0x0D}, {0x06, 0x0E}, {0x07, 0x0F},
+ final_result[0] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b00100000);
+ final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b00100000);
+ final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4],
+ stage[kNumStreamsLog2][5], 0b00100000);
+ final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6],
+ stage[kNumStreamsLog2][7], 0b00100000);
+ final_result[4] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b00110001);
+ final_result[5] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b00110001);
+ final_result[6] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][4],
+ stage[kNumStreamsLog2][5], 0b00110001);
+ final_result[7] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][6],
+ stage[kNumStreamsLog2][7], 0b00110001);
+ } else {
+ // path for float, 128i index:
+ // {0x00, 0x04}, {0x01, 0x05}, {0x02, 0x06}, {0x03, 0x07}
+ final_result[0] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b00100000);
+ final_result[1] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b00100000);
+ final_result[2] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b00110001);
+ final_result[3] = _mm256_permute2x128_si256(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b00110001);
+ }
+
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(
+ &output_data[(i * kNumStreams + j) * sizeof(__m256i)]),
+ final_result[j]);
+ }
+ }
+}
+
+template <typename T>
+void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+  if (kNumStreams == 8U) // Fall back to SSE; there is currently no AVX2 path for double.
+ return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
+
+ const size_t size = num_values * sizeof(T);
+ constexpr size_t kBlockSize = sizeof(__m256i) * kNumStreams;
+  if (size < kBlockSize) // Fall back to SSE for small sizes
+ return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
+ const size_t num_blocks = size / kBlockSize;
+ const __m256i* raw_values_simd = reinterpret_cast<const __m256i*>(raw_values);
+ __m256i* output_buffer_streams[kNumStreams];
+
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ output_buffer_streams[i] =
+ reinterpret_cast<__m256i*>(&output_buffer_raw[num_values * i]);
+ }
+
+ // First handle suffix.
+ const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
+ for (size_t i = num_processed_elements; i < num_values; ++i) {
+ for (size_t j = 0U; j < kNumStreams; ++j) {
+ const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+ output_buffer_raw[j * num_values + i] = byte_in_value;
+ }
+ }
+
+  // Path for float.
+  // 1. Process hierarchically to 32i blocks using the unpack intrinsics.
+  // 2. Pack 128i blocks using _mm256_permutevar8x32_epi32.
+  // 3. Pack the final 256i blocks with _mm256_permute2x128_si256.
+ constexpr size_t kNumUnpack = 3U;
+ __m256i stage[kNumUnpack + 1][kNumStreams];
+ static const __m256i kPermuteMask =
+ _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
+ __m256i permute[kNumStreams];
+ __m256i final_result[kNumStreams];
+
+ for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ stage[0][i] = _mm256_loadu_si256(&raw_values_simd[block_index * kNumStreams + i]);
+ }
+
+ for (size_t stage_lvl = 0; stage_lvl < kNumUnpack; ++stage_lvl) {
+ for (size_t i = 0; i < kNumStreams / 2U; ++i) {
+ stage[stage_lvl + 1][i * 2] =
+ _mm256_unpacklo_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+ stage[stage_lvl + 1][i * 2 + 1] =
+ _mm256_unpackhi_epi8(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+ }
+ }
+
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ permute[i] = _mm256_permutevar8x32_epi32(stage[kNumUnpack][i], kPermuteMask);
+ }
+
+ final_result[0] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00100000);
+ final_result[1] = _mm256_permute2x128_si256(permute[0], permute[2], 0b00110001);
+ final_result[2] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00100000);
+ final_result[3] = _mm256_permute2x128_si256(permute[1], permute[3], 0b00110001);
+
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ _mm256_storeu_si256(&output_buffer_streams[i][block_index], final_result[i]);
+ }
+ }
+}
+#endif // ARROW_HAVE_AVX2
+
+#if defined(ARROW_HAVE_AVX512)
+template <typename T>
+void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ constexpr size_t kNumStreamsLog2 = (kNumStreams == 8U ? 3U : 2U);
+
+ const int64_t size = num_values * sizeof(T);
+ constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams;
+  if (size < kBlockSize) // Fall back to AVX2 for small sizes
+ return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
+ const int64_t num_blocks = size / kBlockSize;
+ uint8_t* output_data = reinterpret_cast<uint8_t*>(out);
+
+ // First handle suffix.
+ const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams;
+ for (int64_t i = num_processed_elements; i < num_values; ++i) {
+ uint8_t gathered_byte_data[kNumStreams];
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * stride + i;
+ gathered_byte_data[b] = data[byte_index];
+ }
+ out[i] = arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]);
+ }
+
+  // The blocks get processed hierarchically using the unpack intrinsics,
+  // then two 128-bit lane shuffles.
+ __m512i stage[kNumStreamsLog2 + 1U][kNumStreams];
+ __m512i shuffle[kNumStreams];
+ __m512i final_result[kNumStreams];
+ constexpr size_t kNumStreamsHalf = kNumStreams / 2U;
+
+ for (int64_t i = 0; i < num_blocks; ++i) {
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ stage[0][j] = _mm512_loadu_si512(
+ reinterpret_cast<const __m512i*>(&data[i * sizeof(__m512i) + j * stride]));
+ }
+
+ for (size_t step = 0; step < kNumStreamsLog2; ++step) {
+ for (size_t j = 0; j < kNumStreamsHalf; ++j) {
+ stage[step + 1U][j * 2] =
+ _mm512_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ stage[step + 1U][j * 2 + 1U] =
+ _mm512_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]);
+ }
+ }
+
+ if (kNumStreams == 8U) {
+ // path for double, 128i index:
+ // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
+ // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
+ // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
+ // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
+ shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b01000100);
+ shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b01000100);
+ shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
+ stage[kNumStreamsLog2][5], 0b01000100);
+ shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
+ stage[kNumStreamsLog2][7], 0b01000100);
+ shuffle[4] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b11101110);
+ shuffle[5] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b11101110);
+ shuffle[6] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4],
+ stage[kNumStreamsLog2][5], 0b11101110);
+ shuffle[7] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6],
+ stage[kNumStreamsLog2][7], 0b11101110);
+
+ final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
+ final_result[1] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
+ final_result[2] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
+ final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
+ final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
+ final_result[5] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
+ final_result[6] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
+ final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
+ } else {
+ // path for float, 128i index:
+ // {0x00, 0x04, 0x08, 0x0C}, {0x01, 0x05, 0x09, 0x0D}
+ // {0x02, 0x06, 0x0A, 0x0E}, {0x03, 0x07, 0x0B, 0x0F},
+ shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b01000100);
+ shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b01000100);
+ shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0],
+ stage[kNumStreamsLog2][1], 0b11101110);
+ shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2],
+ stage[kNumStreamsLog2][3], 0b11101110);
+
+ final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
+ final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
+ final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
+ final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
+ }
+
+ for (size_t j = 0; j < kNumStreams; ++j) {
+ _mm512_storeu_si512(reinterpret_cast<__m512i*>(
+ &output_data[(i * kNumStreams + j) * sizeof(__m512i)]),
+ final_result[j]);
+ }
+ }
+}
+
+template <typename T>
+void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+ constexpr size_t kNumStreams = sizeof(T);
+ static_assert(kNumStreams == 4U || kNumStreams == 8U, "Invalid number of streams.");
+ const size_t size = num_values * sizeof(T);
+ constexpr size_t kBlockSize = sizeof(__m512i) * kNumStreams;
+  if (size < kBlockSize) // Fall back to AVX2 for small sizes
+ return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values, output_buffer_raw);
+
+ const size_t num_blocks = size / kBlockSize;
+ const __m512i* raw_values_simd = reinterpret_cast<const __m512i*>(raw_values);
+ __m512i* output_buffer_streams[kNumStreams];
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ output_buffer_streams[i] =
+ reinterpret_cast<__m512i*>(&output_buffer_raw[num_values * i]);
+ }
+
+ // First handle suffix.
+ const size_t num_processed_elements = (num_blocks * kBlockSize) / sizeof(T);
+ for (size_t i = num_processed_elements; i < num_values; ++i) {
+ for (size_t j = 0U; j < kNumStreams; ++j) {
+ const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+ output_buffer_raw[j * num_values + i] = byte_in_value;
+ }
+ }
+
+  constexpr size_t kNumUnpack = (kNumStreams == 8U) ? 2U : 3U;
+  __m512i final_result[kNumStreams];
+  __m512i unpack[kNumUnpack + 1][kNumStreams];
+ __m512i permutex[kNumStreams];
+ __m512i permutex_mask;
+ if (kNumStreams == 8U) {
+    // Use _mm512_set_epi32: some older gcc versions do not provide _mm512_set_epi16.
+ permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 0x000E0006,
+ 0x001D0015, 0x000D0005, 0x001C0014, 0x000C0004,
+ 0x001B0013, 0x000B0003, 0x001A0012, 0x000A0002,
+ 0x00190011, 0x00090001, 0x00180010, 0x00080000);
+ } else {
+ permutex_mask = _mm512_set_epi32(0x0F, 0x0B, 0x07, 0x03, 0x0E, 0x0A, 0x06, 0x02, 0x0D,
+ 0x09, 0x05, 0x01, 0x0C, 0x08, 0x04, 0x00);
+ }
+
+ for (size_t block_index = 0; block_index < num_blocks; ++block_index) {
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ unpack[0][i] = _mm512_loadu_si512(&raw_values_simd[block_index * kNumStreams + i]);
+ }
+
+    for (size_t unpack_lvl = 0; unpack_lvl < kNumUnpack; ++unpack_lvl) {
+ for (size_t i = 0; i < kNumStreams / 2U; ++i) {
+ unpack[unpack_lvl + 1][i * 2] = _mm512_unpacklo_epi8(
+ unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
+ unpack[unpack_lvl + 1][i * 2 + 1] = _mm512_unpackhi_epi8(
+ unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]);
+ }
+ }
+
+ if (kNumStreams == 8U) {
+ // path for double
+ // 1. unpack to epi16 block
+ // 2. permutexvar_epi16 to 128i block
+ // 3. shuffle 128i to final 512i target, index:
+ // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C},
+ // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D},
+ // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E},
+ // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F},
+ for (size_t i = 0; i < kNumStreams; ++i)
+        permutex[i] = _mm512_permutexvar_epi16(permutex_mask, unpack[kNumUnpack][i]);
+
+ __m512i shuffle[kNumStreams];
+ shuffle[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
+ shuffle[1] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b01000100);
+ shuffle[2] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
+ shuffle[3] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b11101110);
+ shuffle[4] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
+ shuffle[5] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b01000100);
+ shuffle[6] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
+ shuffle[7] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b11101110);
+
+ final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000);
+ final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101);
+ final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000);
+ final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101);
+ final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000);
+ final_result[5] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101);
+ final_result[6] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000);
+ final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101);
+ } else {
+      // Path for float.
+      // 1. Process hierarchically to 32i blocks using the unpack intrinsics.
+      // 2. Pack 128i blocks using _mm512_permutexvar_epi32.
+      // 3. Pack the final 512i blocks with _mm512_shuffle_i32x4.
+      for (size_t i = 0; i < kNumStreams; ++i)
+        permutex[i] = _mm512_permutexvar_epi32(permutex_mask, unpack[kNumUnpack][i]);
+
+ final_result[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100);
+ final_result[1] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110);
+ final_result[2] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100);
+ final_result[3] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110);
+ }
+
+ for (size_t i = 0; i < kNumStreams; ++i) {
+ _mm512_storeu_si512(&output_buffer_streams[i][block_index], final_result[i]);
+ }
+ }
+}
+#endif // ARROW_HAVE_AVX512
+
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+template <typename T>
+void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values,
+ int64_t stride, T* out) {
+#if defined(ARROW_HAVE_AVX512)
+ return ByteStreamSplitDecodeAvx512(data, num_values, stride, out);
+#elif defined(ARROW_HAVE_AVX2)
+ return ByteStreamSplitDecodeAvx2(data, num_values, stride, out);
+#elif defined(ARROW_HAVE_SSE4_2)
+ return ByteStreamSplitDecodeSse2(data, num_values, stride, out);
+#else
+#error "ByteStreamSplitDecodeSimd not implemented"
+#endif
+}
+
+template <typename T>
+void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+#if defined(ARROW_HAVE_AVX512)
+ return ByteStreamSplitEncodeAvx512<T>(raw_values, num_values, output_buffer_raw);
+#elif defined(ARROW_HAVE_AVX2)
+ return ByteStreamSplitEncodeAvx2<T>(raw_values, num_values, output_buffer_raw);
+#elif defined(ARROW_HAVE_SSE4_2)
+ return ByteStreamSplitEncodeSse2<T>(raw_values, num_values, output_buffer_raw);
+#else
+#error "ByteStreamSplitEncodeSimd not implemented"
+#endif
+}
+#endif
+
+template <typename T>
+void ByteStreamSplitEncodeScalar(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+ constexpr size_t kNumStreams = sizeof(T);
+ for (size_t i = 0U; i < num_values; ++i) {
+ for (size_t j = 0U; j < kNumStreams; ++j) {
+ const uint8_t byte_in_value = raw_values[i * kNumStreams + j];
+ output_buffer_raw[j * num_values + i] = byte_in_value;
+ }
+ }
+}
+
+template <typename T>
+void ByteStreamSplitDecodeScalar(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+ constexpr size_t kNumStreams = sizeof(T);
+ auto output_buffer_raw = reinterpret_cast<uint8_t*>(out);
+
+ for (int64_t i = 0; i < num_values; ++i) {
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * stride + i;
+ output_buffer_raw[i * kNumStreams + b] = data[byte_index];
+ }
+ }
+}
+
+template <typename T>
+void inline ByteStreamSplitEncode(const uint8_t* raw_values, const size_t num_values,
+ uint8_t* output_buffer_raw) {
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+ return ByteStreamSplitEncodeSimd<T>(raw_values, num_values, output_buffer_raw);
+#else
+ return ByteStreamSplitEncodeScalar<T>(raw_values, num_values, output_buffer_raw);
+#endif
+}
+
+template <typename T>
+void inline ByteStreamSplitDecode(const uint8_t* data, int64_t num_values, int64_t stride,
+ T* out) {
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+ return ByteStreamSplitDecodeSimd(data, num_values, stride, out);
+#else
+ return ByteStreamSplitDecodeScalar(data, num_values, stride, out);
+#endif
+}
+
+} // namespace internal
+} // namespace util
+} // namespace arrow
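Taken together, the encoder scatters byte j of value i to stream j at offset i, and the decoder gathers those bytes back; for a freshly encoded buffer the stride between streams is simply num_values. Below is a self-contained sketch of that round trip mirroring the scalar paths above (the names here are illustrative, not the Arrow API):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Scatter: byte j of value i goes to stream j at offset i.
template <typename T>
void EncodeScalar(const uint8_t* raw, size_t n, uint8_t* out) {
  constexpr size_t kNumStreams = sizeof(T);
  for (size_t i = 0; i < n; ++i)
    for (size_t j = 0; j < kNumStreams; ++j)
      out[j * n + i] = raw[i * kNumStreams + j];
}

// Gather: reassemble value i from byte i of each of the sizeof(T) streams.
template <typename T>
void DecodeScalar(const uint8_t* data, int64_t n, int64_t stride, T* out) {
  constexpr size_t kNumStreams = sizeof(T);
  auto* raw_out = reinterpret_cast<uint8_t*>(out);
  for (int64_t i = 0; i < n; ++i)
    for (size_t b = 0; b < kNumStreams; ++b)
      raw_out[i * kNumStreams + b] = data[b * stride + i];
}

int main() {
  const std::vector<float> values = {1.0f, -2.5f, 3.25f, 0.125f};
  std::vector<uint8_t> split(values.size() * sizeof(float));
  EncodeScalar<float>(reinterpret_cast<const uint8_t*>(values.data()),
                      values.size(), split.data());
  std::vector<float> round_trip(values.size());
  // For a freshly encoded buffer the stride between streams is num_values.
  DecodeScalar<float>(split.data(), static_cast<int64_t>(values.size()),
                      static_cast<int64_t>(values.size()), round_trip.data());
  assert(std::memcmp(values.data(), round_trip.data(),
                     values.size() * sizeof(float)) == 0);
  return 0;
}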
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc
index 671280a0a17..874b2c2c886 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.cc
@@ -1,226 +1,226 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/util/cancel.h"
-
-#include <atomic>
-#include <mutex>
-#include <sstream>
-#include <utility>
-
-#include "arrow/result.h"
-#include "arrow/util/atomic_shared_ptr.h"
-#include "arrow/util/io_util.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-#if ATOMIC_INT_LOCK_FREE != 2
-#error Lock-free atomic int required for signal safety
-#endif
-
-using internal::ReinstateSignalHandler;
-using internal::SetSignalHandler;
-using internal::SignalHandler;
-
-// NOTE: We care mainly about making the common case (not cancelled) fast.
-
-struct StopSourceImpl {
- std::atomic<int> requested_{0}; // will be -1 or signal number if requested
- std::mutex mutex_;
- Status cancel_error_;
-};
-
-StopSource::StopSource() : impl_(new StopSourceImpl) {}
-
-StopSource::~StopSource() = default;
-
-void StopSource::RequestStop() { RequestStop(Status::Cancelled("Operation cancelled")); }
-
-void StopSource::RequestStop(Status st) {
- std::lock_guard<std::mutex> lock(impl_->mutex_);
- DCHECK(!st.ok());
- if (!impl_->requested_) {
- impl_->requested_ = -1;
- impl_->cancel_error_ = std::move(st);
- }
-}
-
-void StopSource::RequestStopFromSignal(int signum) {
- // Only async-signal-safe code allowed here
- impl_->requested_.store(signum);
-}
-
-void StopSource::Reset() {
- std::lock_guard<std::mutex> lock(impl_->mutex_);
- impl_->cancel_error_ = Status::OK();
- impl_->requested_.store(0);
-}
-
-StopToken StopSource::token() { return StopToken(impl_); }
-
-bool StopToken::IsStopRequested() const {
- if (!impl_) {
- return false;
- }
- return impl_->requested_.load() != 0;
-}
-
-Status StopToken::Poll() const {
- if (!impl_) {
- return Status::OK();
- }
- if (!impl_->requested_.load()) {
- return Status::OK();
- }
-
- std::lock_guard<std::mutex> lock(impl_->mutex_);
- if (impl_->cancel_error_.ok()) {
- auto signum = impl_->requested_.load();
- DCHECK_GT(signum, 0);
- impl_->cancel_error_ = internal::CancelledFromSignal(signum, "Operation cancelled");
- }
- return impl_->cancel_error_;
-}
-
-namespace {
-
-struct SignalStopState {
- struct SavedSignalHandler {
- int signum;
- SignalHandler handler;
- };
-
- Status RegisterHandlers(const std::vector<int>& signals) {
- if (!saved_handlers_.empty()) {
- return Status::Invalid("Signal handlers already registered");
- }
- for (int signum : signals) {
- ARROW_ASSIGN_OR_RAISE(auto handler,
- SetSignalHandler(signum, SignalHandler{&HandleSignal}));
- saved_handlers_.push_back({signum, handler});
- }
- return Status::OK();
- }
-
- void UnregisterHandlers() {
- auto handlers = std::move(saved_handlers_);
- for (const auto& h : handlers) {
- ARROW_CHECK_OK(SetSignalHandler(h.signum, h.handler).status());
- }
- }
-
- ~SignalStopState() {
- UnregisterHandlers();
- Disable();
- }
-
- StopSource* stop_source() { return stop_source_.get(); }
-
- bool enabled() { return stop_source_ != nullptr; }
-
- void Enable() {
- // Before creating a new StopSource, delete any lingering reference to
- // the previous one in the trash can. See DoHandleSignal() for details.
- EmptyTrashCan();
- internal::atomic_store(&stop_source_, std::make_shared<StopSource>());
- }
-
- void Disable() { internal::atomic_store(&stop_source_, NullSource()); }
-
- static SignalStopState* instance() { return &instance_; }
-
- private:
- // For readability
- std::shared_ptr<StopSource> NullSource() { return nullptr; }
-
- void EmptyTrashCan() { internal::atomic_store(&trash_can_, NullSource()); }
-
- static void HandleSignal(int signum) { instance_.DoHandleSignal(signum); }
-
- void DoHandleSignal(int signum) {
- // async-signal-safe code only
- auto source = internal::atomic_load(&stop_source_);
- if (source) {
- source->RequestStopFromSignal(signum);
- // Disable() may have been called in the meantime, but we can't
- // deallocate a shared_ptr here, so instead move it to a "trash can".
- // This minimizes the possibility of running a deallocator here,
-      // though it doesn't entirely preclude it.
- //
- // Possible case:
- // - a signal handler (A) starts running, fetches the current source
- // - Disable() then Enable() are called, emptying the trash can and
- // replacing the current source
- // - a signal handler (B) starts running, fetches the current source
- // - signal handler A resumes, moves its source (the old source) into
- // the trash can (the only remaining reference)
- // - signal handler B resumes, moves its source (the current source)
- // into the trash can. This triggers deallocation of the old source,
- // since the trash can had the only remaining reference to it.
- //
- // This case should be sufficiently unlikely, but we cannot entirely
- // rule it out. The problem might be solved properly with a lock-free
- // linked list of StopSources.
- internal::atomic_store(&trash_can_, std::move(source));
- }
- ReinstateSignalHandler(signum, &HandleSignal);
- }
-
- std::shared_ptr<StopSource> stop_source_;
- std::shared_ptr<StopSource> trash_can_;
-
- std::vector<SavedSignalHandler> saved_handlers_;
-
- static SignalStopState instance_;
-};
-
-SignalStopState SignalStopState::instance_{};
-
-} // namespace
-
-Result<StopSource*> SetSignalStopSource() {
- auto stop_state = SignalStopState::instance();
- if (stop_state->enabled()) {
- return Status::Invalid("Signal stop source already set up");
- }
- stop_state->Enable();
- return stop_state->stop_source();
-}
-
-void ResetSignalStopSource() {
- auto stop_state = SignalStopState::instance();
- DCHECK(stop_state->enabled());
- stop_state->Disable();
-}
-
-Status RegisterCancellingSignalHandler(const std::vector<int>& signals) {
- auto stop_state = SignalStopState::instance();
- if (!stop_state->enabled()) {
- return Status::Invalid("Signal stop source was not set up");
- }
- return stop_state->RegisterHandlers(signals);
-}
-
-void UnregisterCancellingSignalHandler() {
- auto stop_state = SignalStopState::instance();
- DCHECK(stop_state->enabled());
- stop_state->UnregisterHandlers();
-}
-
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/cancel.h"
+
+#include <atomic>
+#include <mutex>
+#include <sstream>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/util/atomic_shared_ptr.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+#if ATOMIC_INT_LOCK_FREE != 2
+#error Lock-free atomic int required for signal safety
+#endif
+
+using internal::ReinstateSignalHandler;
+using internal::SetSignalHandler;
+using internal::SignalHandler;
+
+// NOTE: We care mainly about making the common case (not cancelled) fast.
+
+struct StopSourceImpl {
+ std::atomic<int> requested_{0}; // will be -1 or signal number if requested
+ std::mutex mutex_;
+ Status cancel_error_;
+};
+
+StopSource::StopSource() : impl_(new StopSourceImpl) {}
+
+StopSource::~StopSource() = default;
+
+void StopSource::RequestStop() { RequestStop(Status::Cancelled("Operation cancelled")); }
+
+void StopSource::RequestStop(Status st) {
+ std::lock_guard<std::mutex> lock(impl_->mutex_);
+ DCHECK(!st.ok());
+ if (!impl_->requested_) {
+ impl_->requested_ = -1;
+ impl_->cancel_error_ = std::move(st);
+ }
+}
+
+void StopSource::RequestStopFromSignal(int signum) {
+ // Only async-signal-safe code allowed here
+ impl_->requested_.store(signum);
+}
+
+void StopSource::Reset() {
+ std::lock_guard<std::mutex> lock(impl_->mutex_);
+ impl_->cancel_error_ = Status::OK();
+ impl_->requested_.store(0);
+}
+
+StopToken StopSource::token() { return StopToken(impl_); }
+
+bool StopToken::IsStopRequested() const {
+ if (!impl_) {
+ return false;
+ }
+ return impl_->requested_.load() != 0;
+}
+
+Status StopToken::Poll() const {
+ if (!impl_) {
+ return Status::OK();
+ }
+ if (!impl_->requested_.load()) {
+ return Status::OK();
+ }
+
+ std::lock_guard<std::mutex> lock(impl_->mutex_);
+ if (impl_->cancel_error_.ok()) {
+ auto signum = impl_->requested_.load();
+ DCHECK_GT(signum, 0);
+ impl_->cancel_error_ = internal::CancelledFromSignal(signum, "Operation cancelled");
+ }
+ return impl_->cancel_error_;
+}
+
+namespace {
+
+struct SignalStopState {
+ struct SavedSignalHandler {
+ int signum;
+ SignalHandler handler;
+ };
+
+ Status RegisterHandlers(const std::vector<int>& signals) {
+ if (!saved_handlers_.empty()) {
+ return Status::Invalid("Signal handlers already registered");
+ }
+ for (int signum : signals) {
+ ARROW_ASSIGN_OR_RAISE(auto handler,
+ SetSignalHandler(signum, SignalHandler{&HandleSignal}));
+ saved_handlers_.push_back({signum, handler});
+ }
+ return Status::OK();
+ }
+
+ void UnregisterHandlers() {
+ auto handlers = std::move(saved_handlers_);
+ for (const auto& h : handlers) {
+ ARROW_CHECK_OK(SetSignalHandler(h.signum, h.handler).status());
+ }
+ }
+
+ ~SignalStopState() {
+ UnregisterHandlers();
+ Disable();
+ }
+
+ StopSource* stop_source() { return stop_source_.get(); }
+
+ bool enabled() { return stop_source_ != nullptr; }
+
+ void Enable() {
+ // Before creating a new StopSource, delete any lingering reference to
+ // the previous one in the trash can. See DoHandleSignal() for details.
+ EmptyTrashCan();
+ internal::atomic_store(&stop_source_, std::make_shared<StopSource>());
+ }
+
+ void Disable() { internal::atomic_store(&stop_source_, NullSource()); }
+
+ static SignalStopState* instance() { return &instance_; }
+
+ private:
+ // For readability
+ std::shared_ptr<StopSource> NullSource() { return nullptr; }
+
+ void EmptyTrashCan() { internal::atomic_store(&trash_can_, NullSource()); }
+
+ static void HandleSignal(int signum) { instance_.DoHandleSignal(signum); }
+
+ void DoHandleSignal(int signum) {
+ // async-signal-safe code only
+ auto source = internal::atomic_load(&stop_source_);
+ if (source) {
+ source->RequestStopFromSignal(signum);
+ // Disable() may have been called in the meantime, but we can't
+ // deallocate a shared_ptr here, so instead move it to a "trash can".
+ // This minimizes the possibility of running a deallocator here,
+      // though it doesn't entirely preclude it.
+ //
+ // Possible case:
+ // - a signal handler (A) starts running, fetches the current source
+ // - Disable() then Enable() are called, emptying the trash can and
+ // replacing the current source
+ // - a signal handler (B) starts running, fetches the current source
+ // - signal handler A resumes, moves its source (the old source) into
+ // the trash can (the only remaining reference)
+ // - signal handler B resumes, moves its source (the current source)
+ // into the trash can. This triggers deallocation of the old source,
+ // since the trash can had the only remaining reference to it.
+ //
+ // This case should be sufficiently unlikely, but we cannot entirely
+ // rule it out. The problem might be solved properly with a lock-free
+ // linked list of StopSources.
+ internal::atomic_store(&trash_can_, std::move(source));
+ }
+ ReinstateSignalHandler(signum, &HandleSignal);
+ }
+
+ std::shared_ptr<StopSource> stop_source_;
+ std::shared_ptr<StopSource> trash_can_;
+
+ std::vector<SavedSignalHandler> saved_handlers_;
+
+ static SignalStopState instance_;
+};
+
+SignalStopState SignalStopState::instance_{};
+
+} // namespace
+
+Result<StopSource*> SetSignalStopSource() {
+ auto stop_state = SignalStopState::instance();
+ if (stop_state->enabled()) {
+ return Status::Invalid("Signal stop source already set up");
+ }
+ stop_state->Enable();
+ return stop_state->stop_source();
+}
+
+void ResetSignalStopSource() {
+ auto stop_state = SignalStopState::instance();
+ DCHECK(stop_state->enabled());
+ stop_state->Disable();
+}
+
+Status RegisterCancellingSignalHandler(const std::vector<int>& signals) {
+ auto stop_state = SignalStopState::instance();
+ if (!stop_state->enabled()) {
+ return Status::Invalid("Signal stop source was not set up");
+ }
+ return stop_state->RegisterHandlers(signals);
+}
+
+void UnregisterCancellingSignalHandler() {
+ auto stop_state = SignalStopState::instance();
+ DCHECK(stop_state->enabled());
+ stop_state->UnregisterHandlers();
+}
+
+} // namespace arrow
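As a usage sketch (assuming the Arrow headers above are available; LongRunningWork is a hypothetical worker), the producer side polls its token inside the hot loop, and per the note above the not-cancelled path is just a lock-free atomic load:

#include <utility>
#include <arrow/status.h>
#include <arrow/util/cancel.h>

// Hypothetical worker: returns early with the cancellation Status once the
// consumer side has called RequestStop().
arrow::Status LongRunningWork(arrow::StopToken token) {
  for (int i = 0; i < 1000000; ++i) {
    ARROW_RETURN_NOT_OK(token.Poll());  // cheap while no stop was requested
    // ... one unit of work ...
  }
  return arrow::Status::OK();
}

void Example() {
  arrow::StopSource source;
  arrow::StopToken token = source.token();
  // Another thread could call source.RequestStop() to interrupt the worker.
  arrow::Status st = LongRunningWork(std::move(token));
  (void)st;
}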
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h
index 7c755c02d68..9e00f673a21 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/cancel.h
@@ -1,102 +1,102 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/status.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-class StopToken;
-
-struct StopSourceImpl;
-
-/// EXPERIMENTAL
-class ARROW_EXPORT StopSource {
- public:
- StopSource();
- ~StopSource();
-
- // Consumer API (the side that stops)
- void RequestStop();
- void RequestStop(Status error);
- void RequestStopFromSignal(int signum);
-
- StopToken token();
-
- // For internal use only
- void Reset();
-
- protected:
- std::shared_ptr<StopSourceImpl> impl_;
-};
-
-/// EXPERIMENTAL
-class ARROW_EXPORT StopToken {
- public:
- // Public for Cython
- StopToken() {}
-
- explicit StopToken(std::shared_ptr<StopSourceImpl> impl) : impl_(std::move(impl)) {}
-
- // A trivial token that never propagates any stop request
- static StopToken Unstoppable() { return StopToken(); }
-
-  // Producer API (the side that gets asked to stop)
- Status Poll() const;
- bool IsStopRequested() const;
-
- protected:
- std::shared_ptr<StopSourceImpl> impl_;
-};
-
-/// EXPERIMENTAL: Set a global StopSource that can receive signals
-///
-/// The only allowed order of calls is the following:
-/// - SetSignalStopSource()
-/// - any number of pairs of (RegisterCancellingSignalHandler,
-/// UnregisterCancellingSignalHandler) calls
-/// - ResetSignalStopSource()
-///
-/// Beware that these settings are process-wide. Typically, only one
-/// thread should call these APIs, even in a multithreaded setting.
-ARROW_EXPORT
-Result<StopSource*> SetSignalStopSource();
-
-/// EXPERIMENTAL: Reset the global signal-receiving StopSource
-///
-/// This will invalidate the pointer returned by SetSignalStopSource.
-ARROW_EXPORT
-void ResetSignalStopSource();
-
-/// EXPERIMENTAL: Register signal handlers triggering the signal-receiving StopSource
-ARROW_EXPORT
-Status RegisterCancellingSignalHandler(const std::vector<int>& signals);
-
-/// EXPERIMENTAL: Unregister the signal handlers set up by RegisterCancellingSignalHandler
-ARROW_EXPORT
-void UnregisterCancellingSignalHandler();
-
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class StopToken;
+
+struct StopSourceImpl;
+
+/// EXPERIMENTAL
+class ARROW_EXPORT StopSource {
+ public:
+ StopSource();
+ ~StopSource();
+
+ // Consumer API (the side that stops)
+ void RequestStop();
+ void RequestStop(Status error);
+ void RequestStopFromSignal(int signum);
+
+ StopToken token();
+
+ // For internal use only
+ void Reset();
+
+ protected:
+ std::shared_ptr<StopSourceImpl> impl_;
+};
+
+/// EXPERIMENTAL
+class ARROW_EXPORT StopToken {
+ public:
+ // Public for Cython
+ StopToken() {}
+
+ explicit StopToken(std::shared_ptr<StopSourceImpl> impl) : impl_(std::move(impl)) {}
+
+ // A trivial token that never propagates any stop request
+ static StopToken Unstoppable() { return StopToken(); }
+
+  // Producer API (the side that gets asked to stop)
+ Status Poll() const;
+ bool IsStopRequested() const;
+
+ protected:
+ std::shared_ptr<StopSourceImpl> impl_;
+};
+
+/// EXPERIMENTAL: Set a global StopSource that can receive signals
+///
+/// The only allowed order of calls is the following:
+/// - SetSignalStopSource()
+/// - any number of pairs of (RegisterCancellingSignalHandler,
+/// UnregisterCancellingSignalHandler) calls
+/// - ResetSignalStopSource()
+///
+/// Beware that these settings are process-wide. Typically, only one
+/// thread should call these APIs, even in a multithreaded setting.
+ARROW_EXPORT
+Result<StopSource*> SetSignalStopSource();
+
+/// EXPERIMENTAL: Reset the global signal-receiving StopSource
+///
+/// This will invalidate the pointer returned by SetSignalStopSource.
+ARROW_EXPORT
+void ResetSignalStopSource();
+
+/// EXPERIMENTAL: Register signal handlers triggering the signal-receiving StopSource
+ARROW_EXPORT
+Status RegisterCancellingSignalHandler(const std::vector<int>& signals);
+
+/// EXPERIMENTAL: Unregister the signal handlers set up by RegisterCancellingSignalHandler
+ARROW_EXPORT
+void UnregisterCancellingSignalHandler();
+
+} // namespace arrow
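The allowed call order documented above, sketched end to end (DoWork is a hypothetical placeholder for the caller's own cancellable function):

#include <csignal>
#include <arrow/result.h>
#include <arrow/status.h>
#include <arrow/util/cancel.h>

arrow::Status DoWork(arrow::StopToken token);  // hypothetical, defined elsewhere

// Make Ctrl-C (SIGINT) cancel in-flight work, following the documented order:
// SetSignalStopSource, register/unregister pairs, then ResetSignalStopSource.
arrow::Status RunCancellableBySigint() {
  ARROW_ASSIGN_OR_RAISE(arrow::StopSource* source, arrow::SetSignalStopSource());
  ARROW_RETURN_NOT_OK(arrow::RegisterCancellingSignalHandler({SIGINT}));
  arrow::Status st = DoWork(source->token());
  arrow::UnregisterCancellingSignalHandler();
  arrow::ResetSignalStopSource();
  return st;
}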
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc
index 41109e80faa..8db199b4e76 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.cc
@@ -29,18 +29,18 @@
namespace arrow {
namespace util {
-namespace {
-
-Status CheckSupportsCompressionLevel(Compression::type type) {
- if (!Codec::SupportsCompressionLevel(type)) {
- return Status::Invalid(
- "The specified codec does not support the compression level parameter");
- }
- return Status::OK();
-}
-
-} // namespace
-
+namespace {
+
+Status CheckSupportsCompressionLevel(Compression::type type) {
+ if (!Codec::SupportsCompressionLevel(type)) {
+ return Status::Invalid(
+ "The specified codec does not support the compression level parameter");
+ }
+ return Status::OK();
+}
+
+} // namespace
+
int Codec::UseDefaultCompressionLevel() { return kUseDefaultCompressionLevel; }
Status Codec::Init() { return Status::OK(); }
@@ -115,24 +115,24 @@ bool Codec::SupportsCompressionLevel(Compression::type codec) {
}
}
-Result<int> Codec::MaximumCompressionLevel(Compression::type codec_type) {
- RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
- ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
- return codec->maximum_compression_level();
-}
-
-Result<int> Codec::MinimumCompressionLevel(Compression::type codec_type) {
- RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
- ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
- return codec->minimum_compression_level();
-}
-
-Result<int> Codec::DefaultCompressionLevel(Compression::type codec_type) {
- RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
- ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
- return codec->default_compression_level();
-}
-
+Result<int> Codec::MaximumCompressionLevel(Compression::type codec_type) {
+ RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
+ ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
+ return codec->maximum_compression_level();
+}
+
+Result<int> Codec::MinimumCompressionLevel(Compression::type codec_type) {
+ RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
+ ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
+ return codec->minimum_compression_level();
+}
+
+Result<int> Codec::DefaultCompressionLevel(Compression::type codec_type) {
+ RETURN_NOT_OK(CheckSupportsCompressionLevel(codec_type));
+ ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(codec_type));
+ return codec->default_compression_level();
+}
+
Result<std::unique_ptr<Codec>> Codec::Create(Compression::type codec_type,
int compression_level) {
if (!IsAvailable(codec_type)) {
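A sketch of how the static level helpers restored above compose with Codec::Create (assuming gzip support is compiled in):

#include <algorithm>
#include <memory>
#include <arrow/result.h>
#include <arrow/util/compression.h>

arrow::Result<std::unique_ptr<arrow::util::Codec>> MakeGzipCodec(int requested_level) {
  using arrow::Compression;
  using arrow::util::Codec;
  // Each of these creates a temporary Codec instance, as noted in the header.
  ARROW_ASSIGN_OR_RAISE(int min_level, Codec::MinimumCompressionLevel(Compression::GZIP));
  ARROW_ASSIGN_OR_RAISE(int max_level, Codec::MaximumCompressionLevel(Compression::GZIP));
  // Clamp the requested level into the supported range before creating.
  const int level = std::max(min_level, std::min(max_level, requested_level));
  return Codec::Create(Compression::GZIP, level);
}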
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h
index bd5f2d1c647..0832e82a606 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression.h
@@ -24,13 +24,13 @@
#include "arrow/result.h"
#include "arrow/status.h"
-#include "arrow/util/type_fwd.h"
+#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace util {
-constexpr int kUseDefaultCompressionLevel = std::numeric_limits<int>::min();
+constexpr int kUseDefaultCompressionLevel = std::numeric_limits<int>::min();
/// \brief Streaming compressor interface
///
@@ -132,27 +132,27 @@ class ARROW_EXPORT Codec {
/// \brief Return true if indicated codec supports setting a compression level
static bool SupportsCompressionLevel(Compression::type codec);
- /// \brief Return the smallest supported compression level for the codec
- /// Note: This function creates a temporary Codec instance
- static Result<int> MinimumCompressionLevel(Compression::type codec);
-
- /// \brief Return the largest supported compression level for the codec
- /// Note: This function creates a temporary Codec instance
- static Result<int> MaximumCompressionLevel(Compression::type codec);
-
- /// \brief Return the default compression level
- /// Note: This function creates a temporary Codec instance
- static Result<int> DefaultCompressionLevel(Compression::type codec);
-
- /// \brief Return the smallest supported compression level
- virtual int minimum_compression_level() const = 0;
-
- /// \brief Return the largest supported compression level
- virtual int maximum_compression_level() const = 0;
-
- /// \brief Return the default compression level
- virtual int default_compression_level() const = 0;
-
+ /// \brief Return the smallest supported compression level for the codec
+ /// Note: This function creates a temporary Codec instance
+ static Result<int> MinimumCompressionLevel(Compression::type codec);
+
+ /// \brief Return the largest supported compression level for the codec
+ /// Note: This function creates a temporary Codec instance
+ static Result<int> MaximumCompressionLevel(Compression::type codec);
+
+ /// \brief Return the default compression level
+ /// Note: This function creates a temporary Codec instance
+ static Result<int> DefaultCompressionLevel(Compression::type codec);
+
+ /// \brief Return the smallest supported compression level
+ virtual int minimum_compression_level() const = 0;
+
+ /// \brief Return the largest supported compression level
+ virtual int maximum_compression_level() const = 0;
+
+ /// \brief Return the default compression level
+ virtual int default_compression_level() const = 0;
+
/// \brief One-shot decompression function
///
/// output_buffer_len must be correct and therefore be obtained in advance.
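A one-shot round trip against this interface, as a sketch (it assumes the usual one-shot members MaxCompressedLen and Compress declared alongside the Decompress excerpt above, and that zstd support is compiled in); note the exact decompressed size must be known up front, as the comment above requires:

#include <cstdint>
#include <vector>
#include <arrow/result.h>
#include <arrow/status.h>
#include <arrow/util/compression.h>

arrow::Status RoundTrip(const std::vector<uint8_t>& input) {
  using arrow::util::Codec;
  const auto input_len = static_cast<int64_t>(input.size());
  ARROW_ASSIGN_OR_RAISE(auto codec, Codec::Create(arrow::Compression::ZSTD));
  std::vector<uint8_t> compressed(codec->MaxCompressedLen(input_len, input.data()));
  ARROW_ASSIGN_OR_RAISE(int64_t compressed_len,
                        codec->Compress(input_len, input.data(),
                                        compressed.size(), compressed.data()));
  // The exact decompressed size must be supplied; here it is the input size.
  std::vector<uint8_t> output(input.size());
  ARROW_ASSIGN_OR_RAISE(int64_t decompressed_len,
                        codec->Decompress(compressed_len, compressed.data(),
                                          output.size(), output.data()));
  if (decompressed_len != input_len) {
    return arrow::Status::IOError("unexpected decompressed length");
  }
  return arrow::Status::OK();
}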
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc
index cc41ce43f91..cb547c2c8cf 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_brotli.cc
@@ -224,11 +224,11 @@ class BrotliCodec : public Codec {
Compression::type compression_type() const override { return Compression::BROTLI; }
int compression_level() const override { return compression_level_; }
- int minimum_compression_level() const override { return BROTLI_MIN_QUALITY; }
- int maximum_compression_level() const override { return BROTLI_MAX_QUALITY; }
- int default_compression_level() const override {
- return kBrotliDefaultCompressionLevel;
- }
+ int minimum_compression_level() const override { return BROTLI_MIN_QUALITY; }
+ int maximum_compression_level() const override { return BROTLI_MAX_QUALITY; }
+ int default_compression_level() const override {
+ return kBrotliDefaultCompressionLevel;
+ }
private:
const int compression_level_;
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc
index 04c13cc4c5f..c783e405590 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_lz4.cc
@@ -27,7 +27,7 @@
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/bit_util.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"
@@ -300,9 +300,9 @@ class Lz4FrameCodec : public Codec {
}
Compression::type compression_type() const override { return Compression::LZ4_FRAME; }
- int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
- int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
- int default_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int default_compression_level() const override { return kUseDefaultCompressionLevel; }
protected:
const LZ4F_preferences_t prefs_;
@@ -353,9 +353,9 @@ class Lz4Codec : public Codec {
}
Compression::type compression_type() const override { return Compression::LZ4; }
- int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
- int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
- int default_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int default_compression_level() const override { return kUseDefaultCompressionLevel; }
};
// ----------------------------------------------------------------------
@@ -424,52 +424,52 @@ class Lz4HadoopCodec : public Lz4Codec {
int64_t TryDecompressHadoop(int64_t input_len, const uint8_t* input,
int64_t output_buffer_len, uint8_t* output_buffer) {
- // Parquet files written with the Hadoop Lz4Codec use their own framing.
- // The input buffer can contain an arbitrary number of "frames", each
- // with the following structure:
- // - bytes 0..3: big-endian uint32_t representing the frame decompressed size
- // - bytes 4..7: big-endian uint32_t representing the frame compressed size
- // - bytes 8...: frame compressed data
+ // Parquet files written with the Hadoop Lz4Codec use their own framing.
+ // The input buffer can contain an arbitrary number of "frames", each
+ // with the following structure:
+ // - bytes 0..3: big-endian uint32_t representing the frame decompressed size
+ // - bytes 4..7: big-endian uint32_t representing the frame compressed size
+ // - bytes 8...: frame compressed data
//
// The Hadoop Lz4Codec source code can be found here:
// https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/Lz4Codec.cc
- int64_t total_decompressed_size = 0;
-
- while (input_len >= kPrefixLength) {
- const uint32_t expected_decompressed_size =
- BitUtil::FromBigEndian(SafeLoadAs<uint32_t>(input));
- const uint32_t expected_compressed_size =
- BitUtil::FromBigEndian(SafeLoadAs<uint32_t>(input + sizeof(uint32_t)));
- input += kPrefixLength;
- input_len -= kPrefixLength;
-
- if (input_len < expected_compressed_size) {
- // Not enough bytes for Hadoop "frame"
- return kNotHadoop;
+ int64_t total_decompressed_size = 0;
+
+ while (input_len >= kPrefixLength) {
+ const uint32_t expected_decompressed_size =
+ BitUtil::FromBigEndian(SafeLoadAs<uint32_t>(input));
+ const uint32_t expected_compressed_size =
+ BitUtil::FromBigEndian(SafeLoadAs<uint32_t>(input + sizeof(uint32_t)));
+ input += kPrefixLength;
+ input_len -= kPrefixLength;
+
+ if (input_len < expected_compressed_size) {
+ // Not enough bytes for Hadoop "frame"
+ return kNotHadoop;
}
- if (output_buffer_len < expected_decompressed_size) {
- // Not enough bytes to hold advertised output => probably not Hadoop
- return kNotHadoop;
- }
- // Try decompressing and compare with expected decompressed length
- auto maybe_decompressed_size = Lz4Codec::Decompress(
- expected_compressed_size, input, output_buffer_len, output_buffer);
- if (!maybe_decompressed_size.ok() ||
- *maybe_decompressed_size != expected_decompressed_size) {
- return kNotHadoop;
- }
- input += expected_compressed_size;
- input_len -= expected_compressed_size;
- output_buffer += expected_decompressed_size;
- output_buffer_len -= expected_decompressed_size;
- total_decompressed_size += expected_decompressed_size;
+ if (output_buffer_len < expected_decompressed_size) {
+ // Not enough bytes to hold advertised output => probably not Hadoop
+ return kNotHadoop;
+ }
+ // Try decompressing and compare with expected decompressed length
+ auto maybe_decompressed_size = Lz4Codec::Decompress(
+ expected_compressed_size, input, output_buffer_len, output_buffer);
+ if (!maybe_decompressed_size.ok() ||
+ *maybe_decompressed_size != expected_decompressed_size) {
+ return kNotHadoop;
+ }
+ input += expected_compressed_size;
+ input_len -= expected_compressed_size;
+ output_buffer += expected_decompressed_size;
+ output_buffer_len -= expected_decompressed_size;
+ total_decompressed_size += expected_decompressed_size;
}
- if (input_len == 0) {
- return total_decompressed_size;
- } else {
- return kNotHadoop;
- }
+ if (input_len == 0) {
+ return total_decompressed_size;
+ } else {
+ return kNotHadoop;
+ }
}
};
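The Hadoop framing described in that comment is simple to parse by hand; a minimal sketch of reading one 8-byte frame prefix (the helper names here are illustrative):

#include <cstdint>

struct HadoopFramePrefix {
  uint32_t decompressed_size;  // bytes 0..3, big-endian
  uint32_t compressed_size;    // bytes 4..7, big-endian
};

// Portable big-endian load: byte 0 is the most significant.
inline uint32_t LoadBigEndian32(const uint8_t* p) {
  return (uint32_t(p[0]) << 24) | (uint32_t(p[1]) << 16) |
         (uint32_t(p[2]) << 8) | uint32_t(p[3]);
}

inline HadoopFramePrefix ParseHadoopPrefix(const uint8_t* frame) {
  return {LoadBigEndian32(frame), LoadBigEndian32(frame + 4)};
}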
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc
index da00607d13b..3756f957d04 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_snappy.cc
@@ -86,9 +86,9 @@ class SnappyCodec : public Codec {
}
Compression::type compression_type() const override { return Compression::SNAPPY; }
- int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
- int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
- int default_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int minimum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int maximum_compression_level() const override { return kUseDefaultCompressionLevel; }
+ int default_compression_level() const override { return kUseDefaultCompressionLevel; }
};
} // namespace
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc
index 51373cc227c..e9cb2470ee2 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zlib.cc
@@ -52,9 +52,9 @@ constexpr int GZIP_CODEC = 16;
// Determine if this is libz or gzip from header.
constexpr int DETECT_CODEC = 32;
-constexpr int kGZipMinCompressionLevel = 1;
-constexpr int kGZipMaxCompressionLevel = 9;
-
+constexpr int kGZipMinCompressionLevel = 1;
+constexpr int kGZipMaxCompressionLevel = 9;
+
int CompressionWindowBitsForFormat(GZipFormat::type format) {
int window_bits = WINDOW_BITS;
switch (format) {
@@ -249,9 +249,9 @@ class GZipCompressor : public Compressor {
// again with the same value of the flush parameter and more output space
// (updated avail_out), until the flush is complete (deflate returns
// with non-zero avail_out)."
- // "Note that Z_BUF_ERROR is not fatal, and deflate() can be called again
- // with more input and more output space to continue compressing."
- return FlushResult{bytes_written, stream_.avail_out == 0};
+ // "Note that Z_BUF_ERROR is not fatal, and deflate() can be called again
+ // with more input and more output space to continue compressing."
+ return FlushResult{bytes_written, stream_.avail_out == 0};
}
Result<EndResult> End(int64_t output_len, uint8_t* output) override {
@@ -471,9 +471,9 @@ class GZipCodec : public Codec {
Compression::type compression_type() const override { return Compression::GZIP; }
int compression_level() const override { return compression_level_; }
- int minimum_compression_level() const override { return kGZipMinCompressionLevel; }
- int maximum_compression_level() const override { return kGZipMaxCompressionLevel; }
- int default_compression_level() const override { return kGZipDefaultCompressionLevel; }
+ int minimum_compression_level() const override { return kGZipMinCompressionLevel; }
+ int maximum_compression_level() const override { return kGZipMaxCompressionLevel; }
+ int default_compression_level() const override { return kGZipDefaultCompressionLevel; }
private:
// zlib is stateful and the z_stream state variable must be initialized
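The zlib contract quoted in the Flush hunk above is surfaced through FlushResult: should_retry is set exactly when avail_out was exhausted, telling the caller to supply more output space and flush again. A minimal caller-side sketch; the buffer-growth policy is illustrative:

#include <vector>
#include "arrow/util/compression.h"

// Sketch: keep flushing while should_retry is set, growing the output
// buffer, per the deflate() contract quoted in the comment above.
arrow::Status FlushAll(arrow::util::Compressor* compressor,
                       std::vector<uint8_t>* out) {
  int64_t chunk = 4096;  // illustrative initial chunk size
  while (true) {
    const size_t old_size = out->size();
    out->resize(old_size + chunk);
    ARROW_ASSIGN_OR_RAISE(auto res,
                          compressor->Flush(chunk, out->data() + old_size));
    out->resize(old_size + static_cast<size_t>(res.bytes_written));
    if (!res.should_retry) break;
    chunk *= 2;  // more output space, then try again
  }
  return arrow::Status::OK();
}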
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc
index 715b6e7374a..e15ecb4e1fe 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/compression_zstd.cc
@@ -228,9 +228,9 @@ class ZSTDCodec : public Codec {
}
Compression::type compression_type() const override { return Compression::ZSTD; }
- int minimum_compression_level() const override { return ZSTD_minCLevel(); }
- int maximum_compression_level() const override { return ZSTD_maxCLevel(); }
- int default_compression_level() const override { return kZSTDDefaultCompressionLevel; }
+ int minimum_compression_level() const override { return ZSTD_minCLevel(); }
+ int maximum_compression_level() const override { return ZSTD_maxCLevel(); }
+ int default_compression_level() const override { return kZSTDDefaultCompressionLevel; }
int compression_level() const override { return compression_level_; }
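Note that ZSTD_minCLevel() is negative (the "fast" modes), so unlike gzip's 1..9 range the minimum level here is below zero. A hedged sketch of validating a requested level against the advertised bounds before constructing the codec:

#include <memory>
#include "arrow/util/compression.h"

// Sketch: clamp a requested zstd level into the codec's advertised range,
// falling back to the default level when it is out of bounds.
arrow::Result<std::unique_ptr<arrow::util::Codec>> MakeZstdCodec(int level) {
  ARROW_ASSIGN_OR_RAISE(auto probe,
                        arrow::util::Codec::Create(arrow::Compression::ZSTD));
  if (level < probe->minimum_compression_level() ||
      level > probe->maximum_compression_level()) {
    level = probe->default_compression_level();
  }
  return arrow::util::Codec::Create(arrow::Compression::ZSTD, level);
}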
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc
index 1f54969539a..d803521a2d9 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.cc
@@ -31,11 +31,11 @@
#endif
#ifdef _WIN32
-#include <immintrin.h>
+#include <immintrin.h>
#include <intrin.h>
#include <array>
#include <bitset>
-
+
#include "arrow/util/windows_compatibility.h"
#endif
@@ -51,19 +51,19 @@
#include "arrow/result.h"
#include "arrow/util/io_util.h"
#include "arrow/util/logging.h"
-#include "arrow/util/optional.h"
+#include "arrow/util/optional.h"
#include "arrow/util/string.h"
-namespace arrow {
-namespace internal {
-
-namespace {
-
+namespace arrow {
+namespace internal {
+
+namespace {
+
using std::max;
-constexpr int64_t kDefaultL1CacheSize = 32 * 1024; // Level 1: 32k
-constexpr int64_t kDefaultL2CacheSize = 256 * 1024; // Level 2: 256k
-constexpr int64_t kDefaultL3CacheSize = 3072 * 1024; // Level 3: 3M
+constexpr int64_t kDefaultL1CacheSize = 32 * 1024; // Level 1: 32k
+constexpr int64_t kDefaultL2CacheSize = 256 * 1024; // Level 2: 256k
+constexpr int64_t kDefaultL3CacheSize = 3072 * 1024; // Level 3: 3M
#if defined(__MINGW64_VERSION_MAJOR) && __MINGW64_VERSION_MAJOR < 5
void __cpuidex(int CPUInfo[4], int function_id, int subfunction_id) {
@@ -72,31 +72,31 @@ void __cpuidex(int CPUInfo[4], int function_id, int subfunction_id) {
"=d"(CPUInfo[3])
: "a"(function_id), "c"(subfunction_id));
}
-
-int64_t _xgetbv(int xcr) {
- int out = 0;
- __asm__ __volatile__("xgetbv" : "=a"(out) : "c"(xcr) : "%edx");
- return out;
-}
+
+int64_t _xgetbv(int xcr) {
+ int out = 0;
+ __asm__ __volatile__("xgetbv" : "=a"(out) : "c"(xcr) : "%edx");
+ return out;
+}
+#endif
+
+#ifdef __APPLE__
+util::optional<int64_t> IntegerSysCtlByName(const char* name) {
+ size_t len = sizeof(int64_t);
+ int64_t data = 0;
+ if (sysctlbyname(name, &data, &len, nullptr, 0) == 0) {
+ return data;
+ }
+  // ENOENT is the official errno value for non-existing sysctls,
+ // but EINVAL and ENOTSUP have been seen in the wild.
+ if (errno != ENOENT && errno != EINVAL && errno != ENOTSUP) {
+ auto st = IOErrorFromErrno(errno, "sysctlbyname failed for '", name, "'");
+ ARROW_LOG(WARNING) << st.ToString();
+ }
+ return util::nullopt;
+}
#endif
-#ifdef __APPLE__
-util::optional<int64_t> IntegerSysCtlByName(const char* name) {
- size_t len = sizeof(int64_t);
- int64_t data = 0;
- if (sysctlbyname(name, &data, &len, nullptr, 0) == 0) {
- return data;
- }
- // ENOENT is the official errno value for non-existing sysctl's,
- // but EINVAL and ENOTSUP have been seen in the wild.
- if (errno != ENOENT && errno != EINVAL && errno != ENOTSUP) {
- auto st = IOErrorFromErrno(errno, "sysctlbyname failed for '", name, "'");
- ARROW_LOG(WARNING) << st.ToString();
- }
- return util::nullopt;
-}
-#endif
-
#if defined(__GNUC__) && defined(__linux__) && defined(__aarch64__)
// There is no direct instruction to get cache size on Arm64 like '__cpuid' on x86;
// Get Arm64 cache size by reading '/sys/devices/system/cpu/cpu0/cache/index*/size';
@@ -105,11 +105,11 @@ util::optional<int64_t> IntegerSysCtlByName(const char* name) {
// index1: L1 Icache
// index2: L2 cache
// index3: L3 cache
-const char* kL1CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index0/size";
-const char* kL2CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index2/size";
-const char* kL3CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index3/size";
+const char* kL1CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index0/size";
+const char* kL2CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index2/size";
+const char* kL3CacheSizeFile = "/sys/devices/system/cpu/cpu0/cache/index3/size";
-int64_t GetArm64CacheSize(const char* filename, int64_t default_size = -1) {
+int64_t GetArm64CacheSize(const char* filename, int64_t default_size = -1) {
char* content = nullptr;
char* last_char = nullptr;
size_t file_len = 0;
@@ -148,8 +148,8 @@ int64_t GetArm64CacheSize(const char* filename, int64_t default_size = -1) {
}
#endif
-#if !defined(_WIN32) && !defined(__APPLE__)
-struct {
+#if !defined(_WIN32) && !defined(__APPLE__)
+struct {
std::string name;
int64_t flag;
} flag_mappings[] = {
@@ -166,7 +166,7 @@ struct {
{"asimd", CpuInfo::ASIMD},
#endif
};
-const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
+const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]);
// Helper function to parse for hardware flags.
// values contains a list of space-separated flags. check to see if the flags we
@@ -274,13 +274,13 @@ bool RetrieveCPUInfo(int64_t* hardware_flags, std::string* model_name,
}
}
- bool zmm_enabled = false;
- if (features_ECX[27]) { // OSXSAVE
- // Query if the OS supports saving ZMM registers when switching contexts
- int64_t xcr0 = _xgetbv(0);
- zmm_enabled = (xcr0 & 0xE0) == 0xE0;
- }
-
+ bool zmm_enabled = false;
+ if (features_ECX[27]) { // OSXSAVE
+ // Query if the OS supports saving ZMM registers when switching contexts
+ int64_t xcr0 = _xgetbv(0);
+ zmm_enabled = (xcr0 & 0xE0) == 0xE0;
+ }
+
if (features_ECX[9]) *hardware_flags |= CpuInfo::SSSE3;
if (features_ECX[19]) *hardware_flags |= CpuInfo::SSE4_1;
if (features_ECX[20]) *hardware_flags |= CpuInfo::SSE4_2;
@@ -296,22 +296,22 @@ bool RetrieveCPUInfo(int64_t* hardware_flags, std::string* model_name,
if (features_EBX[3]) *hardware_flags |= CpuInfo::BMI1;
if (features_EBX[5]) *hardware_flags |= CpuInfo::AVX2;
if (features_EBX[8]) *hardware_flags |= CpuInfo::BMI2;
- // ARROW-11427: only use AVX512 if enabled by the OS
- if (zmm_enabled) {
- if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F;
- if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ;
- if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD;
- if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW;
- if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL;
- }
+ // ARROW-11427: only use AVX512 if enabled by the OS
+ if (zmm_enabled) {
+ if (features_EBX[16]) *hardware_flags |= CpuInfo::AVX512F;
+ if (features_EBX[17]) *hardware_flags |= CpuInfo::AVX512DQ;
+ if (features_EBX[28]) *hardware_flags |= CpuInfo::AVX512CD;
+ if (features_EBX[30]) *hardware_flags |= CpuInfo::AVX512BW;
+ if (features_EBX[31]) *hardware_flags |= CpuInfo::AVX512VL;
+ }
}
return true;
}
#endif
-} // namespace
-
+} // namespace
+
CpuInfo::CpuInfo()
: hardware_flags_(0),
num_cores_(1),
@@ -348,37 +348,37 @@ void CpuInfo::Init() {
if (QueryPerformanceFrequency(&performance_frequency)) {
max_mhz = static_cast<float>(performance_frequency.QuadPart);
}
-#elif defined(__APPLE__)
- // On macOS, get CPU information from system information base
- struct SysCtlCpuFeature {
- const char* name;
- int64_t flag;
- };
- std::vector<SysCtlCpuFeature> features = {
-#if defined(__aarch64__)
- // ARM64 (note that this is exposed under Rosetta as well)
- {"hw.optional.neon", ASIMD},
+#elif defined(__APPLE__)
+ // On macOS, get CPU information from system information base
+ struct SysCtlCpuFeature {
+ const char* name;
+ int64_t flag;
+ };
+ std::vector<SysCtlCpuFeature> features = {
+#if defined(__aarch64__)
+ // ARM64 (note that this is exposed under Rosetta as well)
+ {"hw.optional.neon", ASIMD},
+#else
+ // x86
+ {"hw.optional.sse4_2", SSSE3 | SSE4_1 | SSE4_2 | POPCNT},
+ {"hw.optional.avx1_0", AVX},
+ {"hw.optional.avx2_0", AVX2},
+ {"hw.optional.bmi1", BMI1},
+ {"hw.optional.bmi2", BMI2},
+ {"hw.optional.avx512f", AVX512F},
+ {"hw.optional.avx512cd", AVX512CD},
+ {"hw.optional.avx512dq", AVX512DQ},
+ {"hw.optional.avx512bw", AVX512BW},
+ {"hw.optional.avx512vl", AVX512VL},
+#endif
+ };
+ for (const auto& feature : features) {
+ auto v = IntegerSysCtlByName(feature.name);
+ if (v.value_or(0)) {
+ hardware_flags_ |= feature.flag;
+ }
+ }
#else
- // x86
- {"hw.optional.sse4_2", SSSE3 | SSE4_1 | SSE4_2 | POPCNT},
- {"hw.optional.avx1_0", AVX},
- {"hw.optional.avx2_0", AVX2},
- {"hw.optional.bmi1", BMI1},
- {"hw.optional.bmi2", BMI2},
- {"hw.optional.avx512f", AVX512F},
- {"hw.optional.avx512cd", AVX512CD},
- {"hw.optional.avx512dq", AVX512DQ},
- {"hw.optional.avx512bw", AVX512BW},
- {"hw.optional.avx512vl", AVX512VL},
-#endif
- };
- for (const auto& feature : features) {
- auto v = IntegerSysCtlByName(feature.name);
- if (v.value_or(0)) {
- hardware_flags_ |= feature.flag;
- }
- }
-#else
// Read from /proc/cpuinfo
std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in);
while (cpuinfo) {
@@ -413,20 +413,20 @@ void CpuInfo::Init() {
#endif
#ifdef __APPLE__
- // On macOS, get cache size from system information base
- SetDefaultCacheSize();
- auto c = IntegerSysCtlByName("hw.l1dcachesize");
- if (c.has_value()) {
- cache_sizes_[0] = *c;
- }
- c = IntegerSysCtlByName("hw.l2cachesize");
- if (c.has_value()) {
- cache_sizes_[1] = *c;
- }
- c = IntegerSysCtlByName("hw.l3cachesize");
- if (c.has_value()) {
- cache_sizes_[2] = *c;
- }
+ // On macOS, get cache size from system information base
+ SetDefaultCacheSize();
+ auto c = IntegerSysCtlByName("hw.l1dcachesize");
+ if (c.has_value()) {
+ cache_sizes_[0] = *c;
+ }
+ c = IntegerSysCtlByName("hw.l2cachesize");
+ if (c.has_value()) {
+ cache_sizes_[1] = *c;
+ }
+ c = IntegerSysCtlByName("hw.l3cachesize");
+ if (c.has_value()) {
+ cache_sizes_[2] = *c;
+ }
#elif _WIN32
if (!RetrieveCacheSize(cache_sizes_)) {
SetDefaultCacheSize();
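The ARROW-11427 gate above deserves a gloss: CPUID leaf 1 bit ECX[27] (OSXSAVE) only says XGETBV may be executed; the AVX512 flags are additionally masked unless XCR0 bits 5..7 (mask 0xE0: opmask, ZMM_Hi256, Hi16_ZMM) show the OS saves ZMM state across context switches. A standalone sketch of the same check for GCC/Clang on x86-64, assuming inline asm mirroring the file's _xgetbv fallback:

#include <cstdint>
#if defined(__GNUC__) && defined(__x86_64__)
#include <cpuid.h>

// Sketch: AVX-512 should be used only when the OS saves ZMM register
// state, advertised by XCR0 bits 5..7 (mask 0xE0).
static bool OsSavesZmmState() {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return false;
  if (!(ecx & (1u << 27))) return false;  // OSXSAVE: XGETBV is usable
  uint32_t xcr0_lo, xcr0_hi;
  __asm__ __volatile__("xgetbv" : "=a"(xcr0_lo), "=d"(xcr0_hi) : "c"(0));
  return (xcr0_lo & 0xE0) == 0xE0;  // opmask + ZMM_Hi256 + Hi16_ZMM
}
#endif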
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h
index 7b434229c1b..83819c25519 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/cpu_info.h
@@ -70,18 +70,18 @@ class ARROW_EXPORT CpuInfo {
/// Returns all the flags for this cpu
int64_t hardware_flags();
- /// \brief Returns whether or not the given feature is enabled.
- ///
- /// IsSupported() is true iff IsDetected() is also true and the feature
- /// wasn't disabled by the user (for example by setting the ARROW_USER_SIMD_LEVEL
- /// environment variable).
+ /// \brief Returns whether or not the given feature is enabled.
+ ///
+ /// IsSupported() is true iff IsDetected() is also true and the feature
+ /// wasn't disabled by the user (for example by setting the ARROW_USER_SIMD_LEVEL
+ /// environment variable).
bool IsSupported(int64_t flags) const { return (hardware_flags_ & flags) == flags; }
- /// Returns whether or not the given feature is available on the CPU.
- bool IsDetected(int64_t flags) const {
- return (original_hardware_flags_ & flags) == flags;
- }
-
+ /// Returns whether or not the given feature is available on the CPU.
+ bool IsDetected(int64_t flags) const {
+ return (original_hardware_flags_ & flags) == flags;
+ }
+
/// \brief The processor supports SSE4.2 and the Arrow libraries are built
/// with support for it
bool CanUseSSE4_2() const;
@@ -113,15 +113,15 @@ class ARROW_EXPORT CpuInfo {
private:
CpuInfo();
- enum UserSimdLevel {
- USER_SIMD_NONE = 0,
- USER_SIMD_SSE4_2,
- USER_SIMD_AVX,
- USER_SIMD_AVX2,
- USER_SIMD_AVX512,
- USER_SIMD_MAX,
- };
-
+ enum UserSimdLevel {
+ USER_SIMD_NONE = 0,
+ USER_SIMD_SSE4_2,
+ USER_SIMD_AVX,
+ USER_SIMD_AVX2,
+ USER_SIMD_AVX512,
+ USER_SIMD_MAX,
+ };
+
void Init();
/// Inits CPU cache size variables with default values
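The IsSupported()/IsDetected() split above matters for dispatch code: IsDetected() reports raw hardware capability, while IsSupported() also honors a user downgrade via ARROW_USER_SIMD_LEVEL, so kernels should branch on the latter. A usage sketch, assuming the usual CpuInfo::GetInstance() singleton accessor:

#include "arrow/util/cpu_info.h"

// Sketch: branch on IsSupported() so ARROW_USER_SIMD_LEVEL can force the
// portable path even on AVX2-capable hardware.
void DispatchKernel() {
  auto* cpu = arrow::internal::CpuInfo::GetInstance();
  if (cpu->IsSupported(arrow::internal::CpuInfo::AVX2)) {
    // ... AVX2 kernel ...
  } else {
    // ... portable fallback ...
  }
}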
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc
index bbbb11c7252..7aefd1ab9cd 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.cc
@@ -30,7 +30,7 @@
#include "arrow/status.h"
#include "arrow/util/decimal.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/formatting.h"
#include "arrow/util/int128_internal.h"
#include "arrow/util/int_util_internal.h"
@@ -94,47 +94,47 @@ static constexpr double kDoublePowersOfTen[2 * 38 + 1] = {
1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27,
1e28, 1e29, 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38};
-// On the Windows R toolchain, INFINITY is double type instead of float
-static constexpr float kFloatInf = std::numeric_limits<float>::infinity();
-static constexpr float kFloatPowersOfTen76[2 * 76 + 1] = {
- 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 1e-45f, 1e-44f, 1e-43f, 1e-42f,
- 1e-41f, 1e-40f, 1e-39f, 1e-38f, 1e-37f, 1e-36f, 1e-35f,
- 1e-34f, 1e-33f, 1e-32f, 1e-31f, 1e-30f, 1e-29f, 1e-28f,
- 1e-27f, 1e-26f, 1e-25f, 1e-24f, 1e-23f, 1e-22f, 1e-21f,
- 1e-20f, 1e-19f, 1e-18f, 1e-17f, 1e-16f, 1e-15f, 1e-14f,
- 1e-13f, 1e-12f, 1e-11f, 1e-10f, 1e-9f, 1e-8f, 1e-7f,
- 1e-6f, 1e-5f, 1e-4f, 1e-3f, 1e-2f, 1e-1f, 1e0f,
- 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f,
- 1e8f, 1e9f, 1e10f, 1e11f, 1e12f, 1e13f, 1e14f,
- 1e15f, 1e16f, 1e17f, 1e18f, 1e19f, 1e20f, 1e21f,
- 1e22f, 1e23f, 1e24f, 1e25f, 1e26f, 1e27f, 1e28f,
- 1e29f, 1e30f, 1e31f, 1e32f, 1e33f, 1e34f, 1e35f,
- 1e36f, 1e37f, 1e38f, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
- kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
- kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
- kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
- kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
- kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf};
-
-static constexpr double kDoublePowersOfTen76[2 * 76 + 1] = {
- 1e-76, 1e-75, 1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66, 1e-65,
- 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, 1e-59, 1e-58, 1e-57, 1e-56, 1e-55, 1e-54, 1e-53,
- 1e-52, 1e-51, 1e-50, 1e-49, 1e-48, 1e-47, 1e-46, 1e-45, 1e-44, 1e-43, 1e-42, 1e-41,
- 1e-40, 1e-39, 1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29,
- 1e-28, 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17,
- 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5,
- 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7,
- 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
- 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31,
- 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42, 1e43,
- 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51, 1e52, 1e53, 1e54, 1e55,
- 1e56, 1e57, 1e58, 1e59, 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67,
- 1e68, 1e69, 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76};
-
+// On the Windows R toolchain, INFINITY is of type double instead of float
+static constexpr float kFloatInf = std::numeric_limits<float>::infinity();
+static constexpr float kFloatPowersOfTen76[2 * 76 + 1] = {
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 1e-45f, 1e-44f, 1e-43f, 1e-42f,
+ 1e-41f, 1e-40f, 1e-39f, 1e-38f, 1e-37f, 1e-36f, 1e-35f,
+ 1e-34f, 1e-33f, 1e-32f, 1e-31f, 1e-30f, 1e-29f, 1e-28f,
+ 1e-27f, 1e-26f, 1e-25f, 1e-24f, 1e-23f, 1e-22f, 1e-21f,
+ 1e-20f, 1e-19f, 1e-18f, 1e-17f, 1e-16f, 1e-15f, 1e-14f,
+ 1e-13f, 1e-12f, 1e-11f, 1e-10f, 1e-9f, 1e-8f, 1e-7f,
+ 1e-6f, 1e-5f, 1e-4f, 1e-3f, 1e-2f, 1e-1f, 1e0f,
+ 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f,
+ 1e8f, 1e9f, 1e10f, 1e11f, 1e12f, 1e13f, 1e14f,
+ 1e15f, 1e16f, 1e17f, 1e18f, 1e19f, 1e20f, 1e21f,
+ 1e22f, 1e23f, 1e24f, 1e25f, 1e26f, 1e27f, 1e28f,
+ 1e29f, 1e30f, 1e31f, 1e32f, 1e33f, 1e34f, 1e35f,
+ 1e36f, 1e37f, 1e38f, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf,
+ kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf, kFloatInf};
+
+static constexpr double kDoublePowersOfTen76[2 * 76 + 1] = {
+ 1e-76, 1e-75, 1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66, 1e-65,
+ 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, 1e-59, 1e-58, 1e-57, 1e-56, 1e-55, 1e-54, 1e-53,
+ 1e-52, 1e-51, 1e-50, 1e-49, 1e-48, 1e-47, 1e-46, 1e-45, 1e-44, 1e-43, 1e-42, 1e-41,
+ 1e-40, 1e-39, 1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, 1e-29,
+ 1e-28, 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17,
+ 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5,
+ 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7,
+ 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19,
+ 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31,
+ 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42, 1e43,
+ 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51, 1e52, 1e53, 1e54, 1e55,
+ 1e56, 1e57, 1e58, 1e59, 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67,
+ 1e68, 1e69, 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76};
+
namespace {
template <typename Real, typename Derived>
@@ -267,7 +267,7 @@ static void AppendLittleEndianArrayToString(const std::array<uint64_t, n>& array
// *elem = dividend / 1e9;
// remainder = dividend % 1e9.
uint32_t hi = static_cast<uint32_t>(*elem >> 32);
- uint32_t lo = static_cast<uint32_t>(*elem & BitUtil::LeastSignificantBitMask(32));
+ uint32_t lo = static_cast<uint32_t>(*elem & BitUtil::LeastSignificantBitMask(32));
uint64_t dividend_hi = (static_cast<uint64_t>(remainder) << 32) | hi;
uint64_t quotient_hi = dividend_hi / k1e9;
remainder = static_cast<uint32_t>(dividend_hi % k1e9);
@@ -486,24 +486,24 @@ bool ParseDecimalComponents(const char* s, size_t size, DecimalComponents* out)
return pos == size;
}
-inline Status ToArrowStatus(DecimalStatus dstatus, int num_bits) {
- switch (dstatus) {
- case DecimalStatus::kSuccess:
- return Status::OK();
-
- case DecimalStatus::kDivideByZero:
- return Status::Invalid("Division by 0 in Decimal", num_bits);
-
- case DecimalStatus::kOverflow:
- return Status::Invalid("Overflow occurred during Decimal", num_bits, " operation.");
-
- case DecimalStatus::kRescaleDataLoss:
- return Status::Invalid("Rescaling Decimal", num_bits,
- " value would cause data loss");
- }
- return Status::OK();
-}
-
+inline Status ToArrowStatus(DecimalStatus dstatus, int num_bits) {
+ switch (dstatus) {
+ case DecimalStatus::kSuccess:
+ return Status::OK();
+
+ case DecimalStatus::kDivideByZero:
+ return Status::Invalid("Division by 0 in Decimal", num_bits);
+
+ case DecimalStatus::kOverflow:
+ return Status::Invalid("Overflow occurred during Decimal", num_bits, " operation.");
+
+ case DecimalStatus::kRescaleDataLoss:
+ return Status::Invalid("Rescaling Decimal", num_bits,
+ " value would cause data loss");
+ }
+ return Status::OK();
+}
+
} // namespace
Status Decimal128::FromString(const util::string_view& s, Decimal128* out,
@@ -609,7 +609,7 @@ Result<Decimal128> Decimal128::FromBigEndian(const uint8_t* bytes, int32_t lengt
int64_t high, low;
- if (ARROW_PREDICT_FALSE(length < kMinDecimalBytes || length > kMaxDecimalBytes)) {
+ if (ARROW_PREDICT_FALSE(length < kMinDecimalBytes || length > kMaxDecimalBytes)) {
return Status::Invalid("Length of byte array passed to Decimal128::FromBigEndian ",
"was ", length, ", but must be between ", kMinDecimalBytes,
" and ", kMaxDecimalBytes);
@@ -657,275 +657,275 @@ Result<Decimal128> Decimal128::FromBigEndian(const uint8_t* bytes, int32_t lengt
}
Status Decimal128::ToArrowStatus(DecimalStatus dstatus) const {
- return arrow::ToArrowStatus(dstatus, 128);
-}
-
-std::ostream& operator<<(std::ostream& os, const Decimal128& decimal) {
- os << decimal.ToIntegerString();
- return os;
-}
-
-Decimal256::Decimal256(const std::string& str) : Decimal256() {
- *this = Decimal256::FromString(str).ValueOrDie();
-}
-
-std::string Decimal256::ToIntegerString() const {
- std::string result;
- if (static_cast<int64_t>(little_endian_array()[3]) < 0) {
- result.push_back('-');
- Decimal256 abs = *this;
- abs.Negate();
- AppendLittleEndianArrayToString(abs.little_endian_array(), &result);
- } else {
- AppendLittleEndianArrayToString(little_endian_array(), &result);
- }
- return result;
-}
-
-std::string Decimal256::ToString(int32_t scale) const {
- std::string str(ToIntegerString());
- AdjustIntegerStringWithScale(scale, &str);
- return str;
-}
-
-Status Decimal256::FromString(const util::string_view& s, Decimal256* out,
- int32_t* precision, int32_t* scale) {
- if (s.empty()) {
- return Status::Invalid("Empty string cannot be converted to decimal");
- }
-
- DecimalComponents dec;
- if (!ParseDecimalComponents(s.data(), s.size(), &dec)) {
- return Status::Invalid("The string '", s, "' is not a valid decimal number");
- }
-
- // Count number of significant digits (without leading zeros)
- size_t first_non_zero = dec.whole_digits.find_first_not_of('0');
- size_t significant_digits = dec.fractional_digits.size();
- if (first_non_zero != std::string::npos) {
- significant_digits += dec.whole_digits.size() - first_non_zero;
- }
-
- if (precision != nullptr) {
- *precision = static_cast<int32_t>(significant_digits);
- }
-
- if (scale != nullptr) {
- if (dec.has_exponent) {
- auto adjusted_exponent = dec.exponent;
- auto len = static_cast<int32_t>(significant_digits);
- *scale = -adjusted_exponent + len - 1;
- } else {
- *scale = static_cast<int32_t>(dec.fractional_digits.size());
- }
- }
-
- if (out != nullptr) {
- std::array<uint64_t, 4> little_endian_array = {0, 0, 0, 0};
- ShiftAndAdd(dec.whole_digits, little_endian_array.data(), little_endian_array.size());
- ShiftAndAdd(dec.fractional_digits, little_endian_array.data(),
- little_endian_array.size());
- *out = Decimal256(little_endian_array);
-
- if (dec.sign == '-') {
- out->Negate();
- }
- }
-
- return Status::OK();
-}
-
-Status Decimal256::FromString(const std::string& s, Decimal256* out, int32_t* precision,
- int32_t* scale) {
- return FromString(util::string_view(s), out, precision, scale);
-}
-
-Status Decimal256::FromString(const char* s, Decimal256* out, int32_t* precision,
- int32_t* scale) {
- return FromString(util::string_view(s), out, precision, scale);
-}
-
-Result<Decimal256> Decimal256::FromString(const util::string_view& s) {
- Decimal256 out;
- RETURN_NOT_OK(FromString(s, &out, nullptr, nullptr));
- return std::move(out);
-}
-
-Result<Decimal256> Decimal256::FromString(const std::string& s) {
- return FromString(util::string_view(s));
-}
-
-Result<Decimal256> Decimal256::FromString(const char* s) {
- return FromString(util::string_view(s));
-}
-
-Result<Decimal256> Decimal256::FromBigEndian(const uint8_t* bytes, int32_t length) {
- static constexpr int32_t kMinDecimalBytes = 1;
- static constexpr int32_t kMaxDecimalBytes = 32;
-
- std::array<uint64_t, 4> little_endian_array;
-
- if (ARROW_PREDICT_FALSE(length < kMinDecimalBytes || length > kMaxDecimalBytes)) {
- return Status::Invalid("Length of byte array passed to Decimal128::FromBigEndian ",
- "was ", length, ", but must be between ", kMinDecimalBytes,
- " and ", kMaxDecimalBytes);
- }
-
- // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the
- // sign bit.
- const bool is_negative = static_cast<int8_t>(bytes[0]) < 0;
-
- for (int word_idx = 0; word_idx < 4; word_idx++) {
- const int32_t word_length = std::min(length, static_cast<int32_t>(sizeof(uint64_t)));
-
- if (word_length == 8) {
- // Full words can be assigned as is (and are UB with the shift below).
- little_endian_array[word_idx] =
- UInt64FromBigEndian(bytes + length - word_length, word_length);
- } else {
- // Sign extend the word its if necessary
- uint64_t word = -1 * is_negative;
- if (length > 0) {
- // Incorporate the actual values if present.
- // Shift left enough bits to make room for the incoming int64_t
- word = SafeLeftShift(word, word_length * CHAR_BIT);
- // Preserve the upper bits by inplace OR-ing the int64_t
- word |= UInt64FromBigEndian(bytes + length - word_length, word_length);
- }
- little_endian_array[word_idx] = word;
- }
- // Move on to the next word.
- length -= word_length;
- }
-
- return Decimal256(little_endian_array);
-}
-
-Status Decimal256::ToArrowStatus(DecimalStatus dstatus) const {
- return arrow::ToArrowStatus(dstatus, 256);
-}
-
-namespace {
-
-template <typename Real, typename Derived>
-struct Decimal256RealConversion {
- static Result<Decimal256> FromPositiveReal(Real real, int32_t precision,
- int32_t scale) {
- auto x = real;
- if (scale >= -76 && scale <= 76) {
- x *= Derived::powers_of_ten()[scale + 76];
- } else {
- x *= std::pow(static_cast<Real>(10), static_cast<Real>(scale));
- }
- x = std::nearbyint(x);
- const auto max_abs = Derived::powers_of_ten()[precision + 76];
- if (x >= max_abs) {
- return Status::Invalid("Cannot convert ", real,
- " to Decimal256(precision = ", precision,
- ", scale = ", scale, "): overflow");
- }
- // Extract parts
- const auto part3 = std::floor(std::ldexp(x, -192));
- x -= std::ldexp(part3, 192);
- const auto part2 = std::floor(std::ldexp(x, -128));
- x -= std::ldexp(part2, 128);
- const auto part1 = std::floor(std::ldexp(x, -64));
- x -= std::ldexp(part1, 64);
- const auto part0 = x;
-
- DCHECK_GE(part3, 0);
- DCHECK_LT(part3, 1.8446744073709552e+19); // 2**64
- DCHECK_GE(part2, 0);
- DCHECK_LT(part2, 1.8446744073709552e+19); // 2**64
- DCHECK_GE(part1, 0);
- DCHECK_LT(part1, 1.8446744073709552e+19); // 2**64
- DCHECK_GE(part0, 0);
- DCHECK_LT(part0, 1.8446744073709552e+19); // 2**64
- return Decimal256(std::array<uint64_t, 4>{
- static_cast<uint64_t>(part0), static_cast<uint64_t>(part1),
- static_cast<uint64_t>(part2), static_cast<uint64_t>(part3)});
- }
-
- static Result<Decimal256> FromReal(Real x, int32_t precision, int32_t scale) {
- DCHECK_GT(precision, 0);
- DCHECK_LE(precision, 76);
-
- if (!std::isfinite(x)) {
- return Status::Invalid("Cannot convert ", x, " to Decimal256");
- }
- if (x < 0) {
- ARROW_ASSIGN_OR_RAISE(auto dec, FromPositiveReal(-x, precision, scale));
- return dec.Negate();
- } else {
- // Includes negative zero
- return FromPositiveReal(x, precision, scale);
- }
- }
-
- static Real ToRealPositive(const Decimal256& decimal, int32_t scale) {
- DCHECK_GE(decimal, 0);
- Real x = 0;
- const auto& parts = decimal.little_endian_array();
- x += Derived::two_to_192(static_cast<Real>(parts[3]));
- x += Derived::two_to_128(static_cast<Real>(parts[2]));
- x += Derived::two_to_64(static_cast<Real>(parts[1]));
- x += static_cast<Real>(parts[0]);
- if (scale >= -76 && scale <= 76) {
- x *= Derived::powers_of_ten()[-scale + 76];
- } else {
- x *= std::pow(static_cast<Real>(10), static_cast<Real>(-scale));
- }
- return x;
- }
-
- static Real ToReal(Decimal256 decimal, int32_t scale) {
- if (decimal.little_endian_array()[3] & (1ULL << 63)) {
- // Convert the absolute value to avoid precision loss
- decimal.Negate();
- return -ToRealPositive(decimal, scale);
- } else {
- return ToRealPositive(decimal, scale);
- }
- }
-};
-
-struct Decimal256FloatConversion
- : public Decimal256RealConversion<float, Decimal256FloatConversion> {
- static constexpr const float* powers_of_ten() { return kFloatPowersOfTen76; }
-
- static float two_to_64(float x) { return x * 1.8446744e+19f; }
- static float two_to_128(float x) { return x == 0 ? 0 : INFINITY; }
- static float two_to_192(float x) { return x == 0 ? 0 : INFINITY; }
-};
-
-struct Decimal256DoubleConversion
- : public Decimal256RealConversion<double, Decimal256DoubleConversion> {
- static constexpr const double* powers_of_ten() { return kDoublePowersOfTen76; }
-
- static double two_to_64(double x) { return x * 1.8446744073709552e+19; }
- static double two_to_128(double x) { return x * 3.402823669209385e+38; }
- static double two_to_192(double x) { return x * 6.277101735386681e+57; }
-};
-
-} // namespace
-
-Result<Decimal256> Decimal256::FromReal(float x, int32_t precision, int32_t scale) {
- return Decimal256FloatConversion::FromReal(x, precision, scale);
-}
-
-Result<Decimal256> Decimal256::FromReal(double x, int32_t precision, int32_t scale) {
- return Decimal256DoubleConversion::FromReal(x, precision, scale);
-}
-
-float Decimal256::ToFloat(int32_t scale) const {
- return Decimal256FloatConversion::ToReal(*this, scale);
-}
-
-double Decimal256::ToDouble(int32_t scale) const {
- return Decimal256DoubleConversion::ToReal(*this, scale);
-}
-
-std::ostream& operator<<(std::ostream& os, const Decimal256& decimal) {
+ return arrow::ToArrowStatus(dstatus, 128);
+}
+
+std::ostream& operator<<(std::ostream& os, const Decimal128& decimal) {
+ os << decimal.ToIntegerString();
+ return os;
+}
+
+Decimal256::Decimal256(const std::string& str) : Decimal256() {
+ *this = Decimal256::FromString(str).ValueOrDie();
+}
+
+std::string Decimal256::ToIntegerString() const {
+ std::string result;
+ if (static_cast<int64_t>(little_endian_array()[3]) < 0) {
+ result.push_back('-');
+ Decimal256 abs = *this;
+ abs.Negate();
+ AppendLittleEndianArrayToString(abs.little_endian_array(), &result);
+ } else {
+ AppendLittleEndianArrayToString(little_endian_array(), &result);
+ }
+ return result;
+}
+
+std::string Decimal256::ToString(int32_t scale) const {
+ std::string str(ToIntegerString());
+ AdjustIntegerStringWithScale(scale, &str);
+ return str;
+}
+
+Status Decimal256::FromString(const util::string_view& s, Decimal256* out,
+ int32_t* precision, int32_t* scale) {
+ if (s.empty()) {
+ return Status::Invalid("Empty string cannot be converted to decimal");
+ }
+
+ DecimalComponents dec;
+ if (!ParseDecimalComponents(s.data(), s.size(), &dec)) {
+ return Status::Invalid("The string '", s, "' is not a valid decimal number");
+ }
+
+ // Count number of significant digits (without leading zeros)
+ size_t first_non_zero = dec.whole_digits.find_first_not_of('0');
+ size_t significant_digits = dec.fractional_digits.size();
+ if (first_non_zero != std::string::npos) {
+ significant_digits += dec.whole_digits.size() - first_non_zero;
+ }
+
+ if (precision != nullptr) {
+ *precision = static_cast<int32_t>(significant_digits);
+ }
+
+ if (scale != nullptr) {
+ if (dec.has_exponent) {
+ auto adjusted_exponent = dec.exponent;
+ auto len = static_cast<int32_t>(significant_digits);
+ *scale = -adjusted_exponent + len - 1;
+ } else {
+ *scale = static_cast<int32_t>(dec.fractional_digits.size());
+ }
+ }
+
+ if (out != nullptr) {
+ std::array<uint64_t, 4> little_endian_array = {0, 0, 0, 0};
+ ShiftAndAdd(dec.whole_digits, little_endian_array.data(), little_endian_array.size());
+ ShiftAndAdd(dec.fractional_digits, little_endian_array.data(),
+ little_endian_array.size());
+ *out = Decimal256(little_endian_array);
+
+ if (dec.sign == '-') {
+ out->Negate();
+ }
+ }
+
+ return Status::OK();
+}
+
+Status Decimal256::FromString(const std::string& s, Decimal256* out, int32_t* precision,
+ int32_t* scale) {
+ return FromString(util::string_view(s), out, precision, scale);
+}
+
+Status Decimal256::FromString(const char* s, Decimal256* out, int32_t* precision,
+ int32_t* scale) {
+ return FromString(util::string_view(s), out, precision, scale);
+}
+
+Result<Decimal256> Decimal256::FromString(const util::string_view& s) {
+ Decimal256 out;
+ RETURN_NOT_OK(FromString(s, &out, nullptr, nullptr));
+ return std::move(out);
+}
+
+Result<Decimal256> Decimal256::FromString(const std::string& s) {
+ return FromString(util::string_view(s));
+}
+
+Result<Decimal256> Decimal256::FromString(const char* s) {
+ return FromString(util::string_view(s));
+}
+
+Result<Decimal256> Decimal256::FromBigEndian(const uint8_t* bytes, int32_t length) {
+ static constexpr int32_t kMinDecimalBytes = 1;
+ static constexpr int32_t kMaxDecimalBytes = 32;
+
+ std::array<uint64_t, 4> little_endian_array;
+
+ if (ARROW_PREDICT_FALSE(length < kMinDecimalBytes || length > kMaxDecimalBytes)) {
+    return Status::Invalid("Length of byte array passed to Decimal256::FromBigEndian ",
+ "was ", length, ", but must be between ", kMinDecimalBytes,
+ " and ", kMaxDecimalBytes);
+ }
+
+ // Bytes are coming in big-endian, so the first byte is the MSB and therefore holds the
+ // sign bit.
+ const bool is_negative = static_cast<int8_t>(bytes[0]) < 0;
+
+ for (int word_idx = 0; word_idx < 4; word_idx++) {
+ const int32_t word_length = std::min(length, static_cast<int32_t>(sizeof(uint64_t)));
+
+ if (word_length == 8) {
+      // Full words can be assigned as is (the shift below would be UB for them).
+ little_endian_array[word_idx] =
+ UInt64FromBigEndian(bytes + length - word_length, word_length);
+ } else {
+      // Sign extend the word if necessary
+ uint64_t word = -1 * is_negative;
+ if (length > 0) {
+ // Incorporate the actual values if present.
+ // Shift left enough bits to make room for the incoming int64_t
+ word = SafeLeftShift(word, word_length * CHAR_BIT);
+ // Preserve the upper bits by inplace OR-ing the int64_t
+ word |= UInt64FromBigEndian(bytes + length - word_length, word_length);
+ }
+ little_endian_array[word_idx] = word;
+ }
+ // Move on to the next word.
+ length -= word_length;
+ }
+
+ return Decimal256(little_endian_array);
+}
+
+Status Decimal256::ToArrowStatus(DecimalStatus dstatus) const {
+ return arrow::ToArrowStatus(dstatus, 256);
+}
+
+namespace {
+
+template <typename Real, typename Derived>
+struct Decimal256RealConversion {
+ static Result<Decimal256> FromPositiveReal(Real real, int32_t precision,
+ int32_t scale) {
+ auto x = real;
+ if (scale >= -76 && scale <= 76) {
+ x *= Derived::powers_of_ten()[scale + 76];
+ } else {
+ x *= std::pow(static_cast<Real>(10), static_cast<Real>(scale));
+ }
+ x = std::nearbyint(x);
+ const auto max_abs = Derived::powers_of_ten()[precision + 76];
+ if (x >= max_abs) {
+ return Status::Invalid("Cannot convert ", real,
+ " to Decimal256(precision = ", precision,
+ ", scale = ", scale, "): overflow");
+ }
+ // Extract parts
+ const auto part3 = std::floor(std::ldexp(x, -192));
+ x -= std::ldexp(part3, 192);
+ const auto part2 = std::floor(std::ldexp(x, -128));
+ x -= std::ldexp(part2, 128);
+ const auto part1 = std::floor(std::ldexp(x, -64));
+ x -= std::ldexp(part1, 64);
+ const auto part0 = x;
+
+ DCHECK_GE(part3, 0);
+ DCHECK_LT(part3, 1.8446744073709552e+19); // 2**64
+ DCHECK_GE(part2, 0);
+ DCHECK_LT(part2, 1.8446744073709552e+19); // 2**64
+ DCHECK_GE(part1, 0);
+ DCHECK_LT(part1, 1.8446744073709552e+19); // 2**64
+ DCHECK_GE(part0, 0);
+ DCHECK_LT(part0, 1.8446744073709552e+19); // 2**64
+ return Decimal256(std::array<uint64_t, 4>{
+ static_cast<uint64_t>(part0), static_cast<uint64_t>(part1),
+ static_cast<uint64_t>(part2), static_cast<uint64_t>(part3)});
+ }
+
+ static Result<Decimal256> FromReal(Real x, int32_t precision, int32_t scale) {
+ DCHECK_GT(precision, 0);
+ DCHECK_LE(precision, 76);
+
+ if (!std::isfinite(x)) {
+ return Status::Invalid("Cannot convert ", x, " to Decimal256");
+ }
+ if (x < 0) {
+ ARROW_ASSIGN_OR_RAISE(auto dec, FromPositiveReal(-x, precision, scale));
+ return dec.Negate();
+ } else {
+ // Includes negative zero
+ return FromPositiveReal(x, precision, scale);
+ }
+ }
+
+ static Real ToRealPositive(const Decimal256& decimal, int32_t scale) {
+ DCHECK_GE(decimal, 0);
+ Real x = 0;
+ const auto& parts = decimal.little_endian_array();
+ x += Derived::two_to_192(static_cast<Real>(parts[3]));
+ x += Derived::two_to_128(static_cast<Real>(parts[2]));
+ x += Derived::two_to_64(static_cast<Real>(parts[1]));
+ x += static_cast<Real>(parts[0]);
+ if (scale >= -76 && scale <= 76) {
+ x *= Derived::powers_of_ten()[-scale + 76];
+ } else {
+ x *= std::pow(static_cast<Real>(10), static_cast<Real>(-scale));
+ }
+ return x;
+ }
+
+ static Real ToReal(Decimal256 decimal, int32_t scale) {
+ if (decimal.little_endian_array()[3] & (1ULL << 63)) {
+ // Convert the absolute value to avoid precision loss
+ decimal.Negate();
+ return -ToRealPositive(decimal, scale);
+ } else {
+ return ToRealPositive(decimal, scale);
+ }
+ }
+};
+
+struct Decimal256FloatConversion
+ : public Decimal256RealConversion<float, Decimal256FloatConversion> {
+ static constexpr const float* powers_of_ten() { return kFloatPowersOfTen76; }
+
+ static float two_to_64(float x) { return x * 1.8446744e+19f; }
+ static float two_to_128(float x) { return x == 0 ? 0 : INFINITY; }
+ static float two_to_192(float x) { return x == 0 ? 0 : INFINITY; }
+};
+
+struct Decimal256DoubleConversion
+ : public Decimal256RealConversion<double, Decimal256DoubleConversion> {
+ static constexpr const double* powers_of_ten() { return kDoublePowersOfTen76; }
+
+ static double two_to_64(double x) { return x * 1.8446744073709552e+19; }
+ static double two_to_128(double x) { return x * 3.402823669209385e+38; }
+ static double two_to_192(double x) { return x * 6.277101735386681e+57; }
+};
+
+} // namespace
+
+Result<Decimal256> Decimal256::FromReal(float x, int32_t precision, int32_t scale) {
+ return Decimal256FloatConversion::FromReal(x, precision, scale);
+}
+
+Result<Decimal256> Decimal256::FromReal(double x, int32_t precision, int32_t scale) {
+ return Decimal256DoubleConversion::FromReal(x, precision, scale);
+}
+
+float Decimal256::ToFloat(int32_t scale) const {
+ return Decimal256FloatConversion::ToReal(*this, scale);
+}
+
+double Decimal256::ToDouble(int32_t scale) const {
+ return Decimal256DoubleConversion::ToReal(*this, scale);
+}
+
+std::ostream& operator<<(std::ostream& os, const Decimal256& decimal) {
os << decimal.ToIntegerString();
return os;
}
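To make the FromString bookkeeping above concrete: precision counts significant digits (leading zeros of the whole part stripped), and scale is either the fractional digit count or, with an exponent, -exponent + digits - 1. A small sketch with values worked out from that arithmetic:

#include <cassert>
#include "arrow/util/decimal.h"

// Sketch: precision/scale reported by Decimal256::FromString, per the
// parsing code above.
void FromStringExamples() {
  arrow::Decimal256 v;
  int32_t precision = 0, scale = 0;
  // 5 significant digits, 3 fractional digits.
  assert(arrow::Decimal256::FromString("12.345", &v, &precision, &scale).ok());
  assert(precision == 5 && scale == 3);
  // With an exponent: scale = -3 + 3 - 1 = -1, i.e. the stored digits
  // 123 are scaled by 10^1 to represent 1230.
  assert(arrow::Decimal256::FromString("1.23E+3", &v, &precision, &scale).ok());
  assert(precision == 3 && scale == -1);
}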
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h
index 4c6cc9dd1db..4a158728833 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/decimal.h
@@ -55,8 +55,8 @@ class ARROW_EXPORT Decimal128 : public BasicDecimal128 {
/// \endcond
/// \brief constructor creates a Decimal128 from a BasicDecimal128.
- constexpr Decimal128(const BasicDecimal128& value) noexcept // NOLINT runtime/explicit
- : BasicDecimal128(value) {}
+ constexpr Decimal128(const BasicDecimal128& value) noexcept // NOLINT runtime/explicit
+ : BasicDecimal128(value) {}
/// \brief Parse the number from a base 10 string representation.
explicit Decimal128(const std::string& value);
@@ -173,119 +173,119 @@ struct Decimal128::ToRealConversion<double> {
}
};
-/// Represents a signed 256-bit integer in two's complement.
-/// The max decimal precision that can be safely represented is
-/// 76 significant digits.
-///
-/// The implementation is split into two parts :
-///
-/// 1. BasicDecimal256
-/// - can be safely compiled to IR without references to libstdc++.
-/// 2. Decimal256
-/// - (TODO) has additional functionality on top of BasicDecimal256 to deal with
-/// strings and streams.
-class ARROW_EXPORT Decimal256 : public BasicDecimal256 {
- public:
- /// \cond FALSE
- // (need to avoid a duplicate definition in Sphinx)
- using BasicDecimal256::BasicDecimal256;
- /// \endcond
-
- /// \brief constructor creates a Decimal256 from a BasicDecimal256.
- constexpr Decimal256(const BasicDecimal256& value) noexcept : BasicDecimal256(value) {}
-
- /// \brief Parse the number from a base 10 string representation.
- explicit Decimal256(const std::string& value);
-
- /// \brief Empty constructor creates a Decimal256 with a value of 0.
- // This is required on some older compilers.
- constexpr Decimal256() noexcept : BasicDecimal256() {}
-
- /// \brief Convert the Decimal256 value to a base 10 decimal string with the given
- /// scale.
- std::string ToString(int32_t scale) const;
-
- /// \brief Convert the value to an integer string
- std::string ToIntegerString() const;
-
- /// \brief Convert a decimal string to a Decimal256 value, optionally including
- /// precision and scale if they're passed in and not null.
- static Status FromString(const util::string_view& s, Decimal256* out,
- int32_t* precision, int32_t* scale = NULLPTR);
- static Status FromString(const std::string& s, Decimal256* out, int32_t* precision,
- int32_t* scale = NULLPTR);
- static Status FromString(const char* s, Decimal256* out, int32_t* precision,
- int32_t* scale = NULLPTR);
- static Result<Decimal256> FromString(const util::string_view& s);
- static Result<Decimal256> FromString(const std::string& s);
- static Result<Decimal256> FromString(const char* s);
-
- /// \brief Convert Decimal256 from one scale to another
- Result<Decimal256> Rescale(int32_t original_scale, int32_t new_scale) const {
- Decimal256 out;
- auto dstatus = BasicDecimal256::Rescale(original_scale, new_scale, &out);
- ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus));
- return std::move(out);
- }
-
- /// Divide this number by right and return the result.
- ///
- /// This operation is not destructive.
- /// The answer rounds to zero. Signs work like:
- /// 21 / 5 -> 4, 1
- /// -21 / 5 -> -4, -1
- /// 21 / -5 -> -4, 1
- /// -21 / -5 -> 4, -1
- /// \param[in] divisor the number to divide by
- /// \return the pair of the quotient and the remainder
- Result<std::pair<Decimal256, Decimal256>> Divide(const Decimal256& divisor) const {
- std::pair<Decimal256, Decimal256> result;
- auto dstatus = BasicDecimal256::Divide(divisor, &result.first, &result.second);
- ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus));
- return std::move(result);
- }
-
- /// \brief Convert from a big-endian byte representation. The length must be
- /// between 1 and 32.
- /// \return error status if the length is an invalid value
- static Result<Decimal256> FromBigEndian(const uint8_t* data, int32_t length);
-
- static Result<Decimal256> FromReal(double real, int32_t precision, int32_t scale);
- static Result<Decimal256> FromReal(float real, int32_t precision, int32_t scale);
-
- /// \brief Convert to a floating-point number (scaled).
- /// May return infinity in case of overflow.
- float ToFloat(int32_t scale) const;
- /// \brief Convert to a floating-point number (scaled)
- double ToDouble(int32_t scale) const;
-
- /// \brief Convert to a floating-point number (scaled)
- template <typename T>
- T ToReal(int32_t scale) const {
- return ToRealConversion<T>::ToReal(*this, scale);
- }
-
- friend ARROW_EXPORT std::ostream& operator<<(std::ostream& os,
- const Decimal256& decimal);
-
- private:
- /// Converts internal error code to Status
- Status ToArrowStatus(DecimalStatus dstatus) const;
-
- template <typename T>
- struct ToRealConversion {};
-};
-
-template <>
-struct Decimal256::ToRealConversion<float> {
- static float ToReal(const Decimal256& dec, int32_t scale) { return dec.ToFloat(scale); }
-};
-
-template <>
-struct Decimal256::ToRealConversion<double> {
- static double ToReal(const Decimal256& dec, int32_t scale) {
- return dec.ToDouble(scale);
- }
-};
-
+/// Represents a signed 256-bit integer in two's complement.
+/// The max decimal precision that can be safely represented is
+/// 76 significant digits.
+///
+/// The implementation is split into two parts:
+///
+/// 1. BasicDecimal256
+/// - can be safely compiled to IR without references to libstdc++.
+/// 2. Decimal256
+/// - (TODO) has additional functionality on top of BasicDecimal256 to deal with
+/// strings and streams.
+class ARROW_EXPORT Decimal256 : public BasicDecimal256 {
+ public:
+ /// \cond FALSE
+ // (need to avoid a duplicate definition in Sphinx)
+ using BasicDecimal256::BasicDecimal256;
+ /// \endcond
+
+ /// \brief constructor creates a Decimal256 from a BasicDecimal256.
+ constexpr Decimal256(const BasicDecimal256& value) noexcept : BasicDecimal256(value) {}
+
+ /// \brief Parse the number from a base 10 string representation.
+ explicit Decimal256(const std::string& value);
+
+ /// \brief Empty constructor creates a Decimal256 with a value of 0.
+ // This is required on some older compilers.
+ constexpr Decimal256() noexcept : BasicDecimal256() {}
+
+ /// \brief Convert the Decimal256 value to a base 10 decimal string with the given
+ /// scale.
+ std::string ToString(int32_t scale) const;
+
+ /// \brief Convert the value to an integer string
+ std::string ToIntegerString() const;
+
+ /// \brief Convert a decimal string to a Decimal256 value, optionally including
+ /// precision and scale if they're passed in and not null.
+ static Status FromString(const util::string_view& s, Decimal256* out,
+ int32_t* precision, int32_t* scale = NULLPTR);
+ static Status FromString(const std::string& s, Decimal256* out, int32_t* precision,
+ int32_t* scale = NULLPTR);
+ static Status FromString(const char* s, Decimal256* out, int32_t* precision,
+ int32_t* scale = NULLPTR);
+ static Result<Decimal256> FromString(const util::string_view& s);
+ static Result<Decimal256> FromString(const std::string& s);
+ static Result<Decimal256> FromString(const char* s);
+
+ /// \brief Convert Decimal256 from one scale to another
+ Result<Decimal256> Rescale(int32_t original_scale, int32_t new_scale) const {
+ Decimal256 out;
+ auto dstatus = BasicDecimal256::Rescale(original_scale, new_scale, &out);
+ ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus));
+ return std::move(out);
+ }
+
+ /// Divide this number by right and return the result.
+ ///
+ /// This operation is not destructive.
+ /// The answer rounds to zero. Signs work like:
+ /// 21 / 5 -> 4, 1
+ /// -21 / 5 -> -4, -1
+ /// 21 / -5 -> -4, 1
+ /// -21 / -5 -> 4, -1
+ /// \param[in] divisor the number to divide by
+ /// \return the pair of the quotient and the remainder
+ Result<std::pair<Decimal256, Decimal256>> Divide(const Decimal256& divisor) const {
+ std::pair<Decimal256, Decimal256> result;
+ auto dstatus = BasicDecimal256::Divide(divisor, &result.first, &result.second);
+ ARROW_RETURN_NOT_OK(ToArrowStatus(dstatus));
+ return std::move(result);
+ }
+
+ /// \brief Convert from a big-endian byte representation. The length must be
+ /// between 1 and 32.
+ /// \return error status if the length is an invalid value
+ static Result<Decimal256> FromBigEndian(const uint8_t* data, int32_t length);
+
+ static Result<Decimal256> FromReal(double real, int32_t precision, int32_t scale);
+ static Result<Decimal256> FromReal(float real, int32_t precision, int32_t scale);
+
+ /// \brief Convert to a floating-point number (scaled).
+ /// May return infinity in case of overflow.
+ float ToFloat(int32_t scale) const;
+ /// \brief Convert to a floating-point number (scaled)
+ double ToDouble(int32_t scale) const;
+
+ /// \brief Convert to a floating-point number (scaled)
+ template <typename T>
+ T ToReal(int32_t scale) const {
+ return ToRealConversion<T>::ToReal(*this, scale);
+ }
+
+ friend ARROW_EXPORT std::ostream& operator<<(std::ostream& os,
+ const Decimal256& decimal);
+
+ private:
+ /// Converts internal error code to Status
+ Status ToArrowStatus(DecimalStatus dstatus) const;
+
+ template <typename T>
+ struct ToRealConversion {};
+};
+
+template <>
+struct Decimal256::ToRealConversion<float> {
+ static float ToReal(const Decimal256& dec, int32_t scale) { return dec.ToFloat(scale); }
+};
+
+template <>
+struct Decimal256::ToRealConversion<double> {
+ static double ToReal(const Decimal256& dec, int32_t scale) {
+ return dec.ToDouble(scale);
+ }
+};
+
} // namespace arrow
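The Divide() documentation above pins down truncation-toward-zero semantics, with the remainder taking the dividend's sign. A short sketch checking one row of the documented sign table, assuming the integer constructor inherited from BasicDecimal256:

#include <cassert>
#include "arrow/util/decimal.h"

// Sketch: -21 / 5 truncates toward zero to -4 with remainder -1, matching
// the sign table in the doc comment.
void DivideExample() {
  arrow::Decimal256 a(-21), b(5);
  auto qr = a.Divide(b).ValueOrDie();
  assert(qr.first == arrow::Decimal256(-4));
  assert(qr.second == arrow::Decimal256(-1));
}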
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc
index a499fdd2562..fe1b6ea3126 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.cc
@@ -17,7 +17,7 @@
#include "arrow/util/delimiting.h"
#include "arrow/buffer.h"
-#include "arrow/util/logging.h"
+#include "arrow/util/logging.h"
namespace arrow {
@@ -61,35 +61,35 @@ class NewlineBoundaryFinder : public BoundaryFinder {
return Status::OK();
}
- Status FindNth(util::string_view partial, util::string_view block, int64_t count,
- int64_t* out_pos, int64_t* num_found) override {
- DCHECK(partial.find_first_of(newline_delimiters) == util::string_view::npos);
-
- int64_t found = 0;
- int64_t pos = kNoDelimiterFound;
-
- auto cur_pos = block.find_first_of(newline_delimiters);
- while (cur_pos != util::string_view::npos) {
- if (block[cur_pos] == '\r' && cur_pos + 1 < block.length() &&
- block[cur_pos + 1] == '\n') {
- cur_pos += 2;
- } else {
- ++cur_pos;
- }
-
- pos = static_cast<int64_t>(cur_pos);
- if (++found >= count) {
- break;
- }
-
- cur_pos = block.find_first_of(newline_delimiters, cur_pos);
- }
-
- *out_pos = pos;
- *num_found = found;
- return Status::OK();
- }
-
+ Status FindNth(util::string_view partial, util::string_view block, int64_t count,
+ int64_t* out_pos, int64_t* num_found) override {
+ DCHECK(partial.find_first_of(newline_delimiters) == util::string_view::npos);
+
+ int64_t found = 0;
+ int64_t pos = kNoDelimiterFound;
+
+ auto cur_pos = block.find_first_of(newline_delimiters);
+ while (cur_pos != util::string_view::npos) {
+ if (block[cur_pos] == '\r' && cur_pos + 1 < block.length() &&
+ block[cur_pos + 1] == '\n') {
+ cur_pos += 2;
+ } else {
+ ++cur_pos;
+ }
+
+ pos = static_cast<int64_t>(cur_pos);
+ if (++found >= count) {
+ break;
+ }
+
+ cur_pos = block.find_first_of(newline_delimiters, cur_pos);
+ }
+
+ *out_pos = pos;
+ *num_found = found;
+ return Status::OK();
+ }
+
protected:
static constexpr const char* newline_delimiters = "\r\n";
};
@@ -168,26 +168,26 @@ Status Chunker::ProcessFinal(std::shared_ptr<Buffer> partial,
return Status::OK();
}
-Status Chunker::ProcessSkip(std::shared_ptr<Buffer> partial,
- std::shared_ptr<Buffer> block, bool final, int64_t* count,
- std::shared_ptr<Buffer>* rest) {
- DCHECK_GT(*count, 0);
- int64_t pos;
- int64_t num_found;
- ARROW_RETURN_NOT_OK(boundary_finder_->FindNth(
- util::string_view(*partial), util::string_view(*block), *count, &pos, &num_found));
- if (pos == BoundaryFinder::kNoDelimiterFound) {
- return StraddlingTooLarge();
- }
- if (ARROW_PREDICT_FALSE(final && *count > num_found && block->size() != pos)) {
- // Skip the last row in the final block which does not have a delimiter
- ++num_found;
- *rest = SliceBuffer(block, 0, 0);
- } else {
- *rest = SliceBuffer(block, pos);
- }
- *count -= num_found;
- return Status::OK();
-}
-
+Status Chunker::ProcessSkip(std::shared_ptr<Buffer> partial,
+ std::shared_ptr<Buffer> block, bool final, int64_t* count,
+ std::shared_ptr<Buffer>* rest) {
+ DCHECK_GT(*count, 0);
+ int64_t pos;
+ int64_t num_found;
+ ARROW_RETURN_NOT_OK(boundary_finder_->FindNth(
+ util::string_view(*partial), util::string_view(*block), *count, &pos, &num_found));
+ if (pos == BoundaryFinder::kNoDelimiterFound) {
+ return StraddlingTooLarge();
+ }
+ if (ARROW_PREDICT_FALSE(final && *count > num_found && block->size() != pos)) {
+ // Skip the last row in the final block which does not have a delimiter
+ ++num_found;
+ *rest = SliceBuffer(block, 0, 0);
+ } else {
+ *rest = SliceBuffer(block, pos);
+ }
+ *count -= num_found;
+ return Status::OK();
+}
+
} // namespace arrow
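One subtlety in FindNth above: a "\r\n" pair is consumed as a single delimiter, and out_pos always points just past the last delimiter found. A standalone re-implementation of the same loop, with a worked trace in the comments:

#include <cstdint>
#include <string>

// Sketch mirroring NewlineBoundaryFinder::FindNth. On block = "a\r\nb\nc"
// with count = 2: "\r\n" at index 1 counts once (cursor -> 3), "\n" at
// index 4 counts again (cursor -> 5), so it returns 5 with *num_found = 2.
int64_t FindNthNewline(const std::string& block, int64_t count,
                       int64_t* num_found) {
  int64_t found = 0;
  int64_t pos = -1;  // kNoDelimiterFound
  auto cur = block.find_first_of("\r\n");
  while (cur != std::string::npos) {
    if (block[cur] == '\r' && cur + 1 < block.length() &&
        block[cur + 1] == '\n') {
      cur += 2;  // CRLF counts as one delimiter
    } else {
      ++cur;
    }
    pos = static_cast<int64_t>(cur);
    if (++found >= count) break;
    cur = block.find_first_of("\r\n", cur);
  }
  *num_found = found;
  return pos;
}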
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h
index 0ffe652441d..b4b868340db 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/delimiting.h
@@ -53,19 +53,19 @@ class ARROW_EXPORT BoundaryFinder {
/// `out_pos` will be -1 if no delimiter is found.
virtual Status FindLast(util::string_view block, int64_t* out_pos) = 0;
- /// \brief Find the position of the Nth delimiter inside the block
- ///
- /// `partial` is taken to be the beginning of the block, and `block`
- /// its continuation. Also, `partial` doesn't contain a delimiter.
- ///
- /// The returned `out_pos` is relative to `block`'s start and should point
- /// to the first character after the first delimiter.
- /// `out_pos` will be -1 if no delimiter is found.
- ///
- /// The returned `num_found` is the number of delimiters actually found
- virtual Status FindNth(util::string_view partial, util::string_view block,
- int64_t count, int64_t* out_pos, int64_t* num_found) = 0;
-
+ /// \brief Find the position of the Nth delimiter inside the block
+ ///
+ /// `partial` is taken to be the beginning of the block, and `block`
+ /// its continuation. Also, `partial` doesn't contain a delimiter.
+ ///
+ /// The returned `out_pos` is relative to `block`'s start and should point
+ /// to the first character after the first delimiter.
+ /// `out_pos` will be -1 if no delimiter is found.
+ ///
+ /// The returned `num_found` is the number of delimiters actually found
+ virtual Status FindNth(util::string_view partial, util::string_view block,
+ int64_t count, int64_t* out_pos, int64_t* num_found) = 0;
+
static constexpr int64_t kNoDelimiterFound = -1;
protected:
@@ -151,27 +151,27 @@ class ARROW_EXPORT Chunker {
Status ProcessFinal(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
std::shared_ptr<Buffer>* completion, std::shared_ptr<Buffer>* rest);
- /// \brief Skip count number of rows
- /// Pre-conditions:
- /// - `partial` is the start of a valid block of delimited data
- /// (i.e. starts just after a delimiter)
- /// - `block` follows `partial` in file order
- ///
- /// Post-conditions:
- /// - `count` is updated to indicate the number of rows that still need to be skipped
- /// - If `count` is > 0 then `rest` is an incomplete block that should be a future
- /// `partial`
- /// - Else `rest` could be one or more valid blocks of delimited data which need to be
- /// parsed
- ///
- /// \param[in] partial incomplete delimited data
- /// \param[in] block delimited data following partial
- /// \param[in] final whether this is the final chunk
- /// \param[in,out] count number of rows that need to be skipped
- /// \param[out] rest subrange of block containing what was not skipped
- Status ProcessSkip(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
- bool final, int64_t* count, std::shared_ptr<Buffer>* rest);
-
+  /// \brief Skip up to `count` rows
+ /// Pre-conditions:
+ /// - `partial` is the start of a valid block of delimited data
+ /// (i.e. starts just after a delimiter)
+ /// - `block` follows `partial` in file order
+ ///
+ /// Post-conditions:
+ /// - `count` is updated to indicate the number of rows that still need to be skipped
+ /// - If `count` is > 0 then `rest` is an incomplete block that should be a future
+ /// `partial`
+ /// - Else `rest` could be one or more valid blocks of delimited data which need to be
+ /// parsed
+ ///
+ /// \param[in] partial incomplete delimited data
+ /// \param[in] block delimited data following partial
+ /// \param[in] final whether this is the final chunk
+ /// \param[in,out] count number of rows that need to be skipped
+ /// \param[out] rest subrange of block containing what was not skipped
+ Status ProcessSkip(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
+ bool final, int64_t* count, std::shared_ptr<Buffer>* rest);
+
protected:
ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker);
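Per the pre/post-conditions above, ProcessSkip is meant to be driven block by block: while *count stays positive, `rest` is an incomplete tail to be carried forward as the next `partial`; once it reaches zero, `rest` holds unskipped data ready for normal parsing. A hedged caller-side sketch:

#include <memory>
#include "arrow/buffer.h"
#include "arrow/util/delimiting.h"

// Sketch: skip header rows with Chunker::ProcessSkip, then hand `rest`
// back to the regular Process()/ProcessFinal() pipeline.
arrow::Status SkipHeaderRows(arrow::Chunker* chunker,
                             std::shared_ptr<arrow::Buffer> partial,
                             std::shared_ptr<arrow::Buffer> block,
                             bool final_block, int64_t* rows_to_skip,
                             std::shared_ptr<arrow::Buffer>* rest) {
  ARROW_RETURN_NOT_OK(
      chunker->ProcessSkip(partial, block, final_block, rows_to_skip, rest));
  // If *rows_to_skip > 0, *rest becomes the next `partial`; otherwise
  // *rest is ready to be parsed as one or more complete rows.
  return arrow::Status::OK();
}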
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h
index a1d953d12ad..0cb2e44d275 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/endian.h
@@ -1,181 +1,181 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#ifdef _WIN32
-#define ARROW_LITTLE_ENDIAN 1
-#else
-#if defined(__APPLE__) || defined(__FreeBSD__)
-#include <machine/endian.h> // IWYU pragma: keep
-#elif defined(sun) || defined(__sun)
-#include <sys/byteorder.h> // IWYU pragma: keep
-#else
-#include <endian.h> // IWYU pragma: keep
-#endif
-#
-#ifndef __BYTE_ORDER__
-#error "__BYTE_ORDER__ not defined"
-#endif
-#
-#ifndef __ORDER_LITTLE_ENDIAN__
-#error "__ORDER_LITTLE_ENDIAN__ not defined"
-#endif
-#
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define ARROW_LITTLE_ENDIAN 1
-#else
-#define ARROW_LITTLE_ENDIAN 0
-#endif
-#endif
-
-#if defined(_MSC_VER)
-#include <intrin.h> // IWYU pragma: keep
-#define ARROW_BYTE_SWAP64 _byteswap_uint64
-#define ARROW_BYTE_SWAP32 _byteswap_ulong
-#else
-#define ARROW_BYTE_SWAP64 __builtin_bswap64
-#define ARROW_BYTE_SWAP32 __builtin_bswap32
-#endif
-
-#include "arrow/util/type_traits.h"
-#include "arrow/util/ubsan.h"
-
-namespace arrow {
-namespace BitUtil {
-
-//
-// Byte-swap 16-bit, 32-bit and 64-bit values
-//
-
-// Swap the byte order (i.e. endianness)
-static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); }
-static inline uint64_t ByteSwap(uint64_t value) {
- return static_cast<uint64_t>(ARROW_BYTE_SWAP64(value));
-}
-static inline int32_t ByteSwap(int32_t value) { return ARROW_BYTE_SWAP32(value); }
-static inline uint32_t ByteSwap(uint32_t value) {
- return static_cast<uint32_t>(ARROW_BYTE_SWAP32(value));
-}
-static inline int16_t ByteSwap(int16_t value) {
- constexpr auto m = static_cast<int16_t>(0xff);
- return static_cast<int16_t>(((value >> 8) & m) | ((value & m) << 8));
-}
-static inline uint16_t ByteSwap(uint16_t value) {
- return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
-}
-static inline uint8_t ByteSwap(uint8_t value) { return value; }
-static inline int8_t ByteSwap(int8_t value) { return value; }
-static inline double ByteSwap(double value) {
- const uint64_t swapped = ARROW_BYTE_SWAP64(util::SafeCopy<uint64_t>(value));
- return util::SafeCopy<double>(swapped);
-}
-static inline float ByteSwap(float value) {
- const uint32_t swapped = ARROW_BYTE_SWAP32(util::SafeCopy<uint32_t>(value));
- return util::SafeCopy<float>(swapped);
-}
-
-// Write the swapped bytes into dst. Src and dst cannot overlap.
-static inline void ByteSwap(void* dst, const void* src, int len) {
- switch (len) {
- case 1:
- *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src);
- return;
- case 2:
- *reinterpret_cast<int16_t*>(dst) = ByteSwap(*reinterpret_cast<const int16_t*>(src));
- return;
- case 4:
- *reinterpret_cast<int32_t*>(dst) = ByteSwap(*reinterpret_cast<const int32_t*>(src));
- return;
- case 8:
- *reinterpret_cast<int64_t*>(dst) = ByteSwap(*reinterpret_cast<const int64_t*>(src));
- return;
- default:
- break;
- }
-
- auto d = reinterpret_cast<uint8_t*>(dst);
- auto s = reinterpret_cast<const uint8_t*>(src);
- for (int i = 0; i < len; ++i) {
- d[i] = s[len - i - 1];
- }
-}
-
-// Convert to little/big endian format from the machine's native endian format.
-#if ARROW_LITTLE_ENDIAN
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T ToBigEndian(T value) {
- return ByteSwap(value);
-}
-
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T ToLittleEndian(T value) {
- return value;
-}
-#else
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T ToBigEndian(T value) {
- return value;
-}
-
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T ToLittleEndian(T value) {
- return ByteSwap(value);
-}
-#endif
-
-// Convert from big/little endian format to the machine's native endian format.
-#if ARROW_LITTLE_ENDIAN
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T FromBigEndian(T value) {
- return ByteSwap(value);
-}
-
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T FromLittleEndian(T value) {
- return value;
-}
-#else
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T FromBigEndian(T value) {
- return value;
-}
-
-template <typename T, typename = internal::EnableIfIsOneOf<
- T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
- uint8_t, int8_t, float, double>>
-static inline T FromLittleEndian(T value) {
- return ByteSwap(value);
-}
-#endif
-
-} // namespace BitUtil
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifdef _WIN32
+#define ARROW_LITTLE_ENDIAN 1
+#else
+#if defined(__APPLE__) || defined(__FreeBSD__)
+#include <machine/endian.h> // IWYU pragma: keep
+#elif defined(sun) || defined(__sun)
+#include <sys/byteorder.h> // IWYU pragma: keep
+#else
+#include <endian.h> // IWYU pragma: keep
+#endif
+#
+#ifndef __BYTE_ORDER__
+#error "__BYTE_ORDER__ not defined"
+#endif
+#
+#ifndef __ORDER_LITTLE_ENDIAN__
+#error "__ORDER_LITTLE_ENDIAN__ not defined"
+#endif
+#
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define ARROW_LITTLE_ENDIAN 1
+#else
+#define ARROW_LITTLE_ENDIAN 0
+#endif
+#endif
+
+#if defined(_MSC_VER)
+#include <intrin.h> // IWYU pragma: keep
+#define ARROW_BYTE_SWAP64 _byteswap_uint64
+#define ARROW_BYTE_SWAP32 _byteswap_ulong
+#else
+#define ARROW_BYTE_SWAP64 __builtin_bswap64
+#define ARROW_BYTE_SWAP32 __builtin_bswap32
+#endif
+
+#include "arrow/util/type_traits.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace BitUtil {
+
+//
+// Byte-swap 16-bit, 32-bit and 64-bit values
+//
+
+// Swap the byte order (i.e. endianness)
+static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); }
+static inline uint64_t ByteSwap(uint64_t value) {
+ return static_cast<uint64_t>(ARROW_BYTE_SWAP64(value));
+}
+static inline int32_t ByteSwap(int32_t value) { return ARROW_BYTE_SWAP32(value); }
+static inline uint32_t ByteSwap(uint32_t value) {
+ return static_cast<uint32_t>(ARROW_BYTE_SWAP32(value));
+}
+static inline int16_t ByteSwap(int16_t value) {
+ constexpr auto m = static_cast<int16_t>(0xff);
+ return static_cast<int16_t>(((value >> 8) & m) | ((value & m) << 8));
+}
+static inline uint16_t ByteSwap(uint16_t value) {
+ return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
+}
+static inline uint8_t ByteSwap(uint8_t value) { return value; }
+static inline int8_t ByteSwap(int8_t value) { return value; }
+static inline double ByteSwap(double value) {
+ const uint64_t swapped = ARROW_BYTE_SWAP64(util::SafeCopy<uint64_t>(value));
+ return util::SafeCopy<double>(swapped);
+}
+static inline float ByteSwap(float value) {
+ const uint32_t swapped = ARROW_BYTE_SWAP32(util::SafeCopy<uint32_t>(value));
+ return util::SafeCopy<float>(swapped);
+}
+
+// Write the swapped bytes into dst. Src and dst cannot overlap.
+static inline void ByteSwap(void* dst, const void* src, int len) {
+ switch (len) {
+ case 1:
+ *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src);
+ return;
+ case 2:
+ *reinterpret_cast<int16_t*>(dst) = ByteSwap(*reinterpret_cast<const int16_t*>(src));
+ return;
+ case 4:
+ *reinterpret_cast<int32_t*>(dst) = ByteSwap(*reinterpret_cast<const int32_t*>(src));
+ return;
+ case 8:
+ *reinterpret_cast<int64_t*>(dst) = ByteSwap(*reinterpret_cast<const int64_t*>(src));
+ return;
+ default:
+ break;
+ }
+
+ auto d = reinterpret_cast<uint8_t*>(dst);
+ auto s = reinterpret_cast<const uint8_t*>(src);
+ for (int i = 0; i < len; ++i) {
+ d[i] = s[len - i - 1];
+ }
+}
+
+// Convert to little/big endian format from the machine's native endian format.
+#if ARROW_LITTLE_ENDIAN
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T ToBigEndian(T value) {
+ return ByteSwap(value);
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T ToLittleEndian(T value) {
+ return value;
+}
+#else
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T ToBigEndian(T value) {
+ return value;
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T ToLittleEndian(T value) {
+ return ByteSwap(value);
+}
+#endif
+
+// Convert from big/little endian format to the machine's native endian format.
+#if ARROW_LITTLE_ENDIAN
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T FromBigEndian(T value) {
+ return ByteSwap(value);
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T FromLittleEndian(T value) {
+ return value;
+}
+#else
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T FromBigEndian(T value) {
+ return value;
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+ T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+ uint8_t, int8_t, float, double>>
+static inline T FromLittleEndian(T value) {
+ return ByteSwap(value);
+}
+#endif
+
+} // namespace BitUtil
+} // namespace arrow
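// Illustrative sketch (not part of the patch): the conversion helpers above
// are inverse pairs on any host; on a little-endian machine ToLittleEndian is
// the identity and ToBigEndian is a byte swap (and vice versa on big-endian).
#include <cassert>
#include <cstdint>
inline void EndianRoundTrip() {
  const uint32_t native = 0x11223344u;
  const uint32_t big = arrow::BitUtil::ToBigEndian(native);
  assert(arrow::BitUtil::FromBigEndian(big) == native);
  assert(arrow::BitUtil::ToLittleEndian(native) ==
         arrow::BitUtil::FromLittleEndian(native));
}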
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc
index efa8a997efe..c16d42ce5cf 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.cc
@@ -43,29 +43,29 @@ struct FloatToStringFormatter::Impl {
: converter_(DoubleToStringConverter::EMIT_POSITIVE_EXPONENT_SIGN, "inf", "nan",
'e', -6, 10, 6, 0) {}
- Impl(int flags, const char* inf_symbol, const char* nan_symbol, char exp_character,
- int decimal_in_shortest_low, int decimal_in_shortest_high,
- int max_leading_padding_zeroes_in_precision_mode,
- int max_trailing_padding_zeroes_in_precision_mode)
- : converter_(flags, inf_symbol, nan_symbol, exp_character, decimal_in_shortest_low,
- decimal_in_shortest_high, max_leading_padding_zeroes_in_precision_mode,
- max_trailing_padding_zeroes_in_precision_mode) {}
-
+ Impl(int flags, const char* inf_symbol, const char* nan_symbol, char exp_character,
+ int decimal_in_shortest_low, int decimal_in_shortest_high,
+ int max_leading_padding_zeroes_in_precision_mode,
+ int max_trailing_padding_zeroes_in_precision_mode)
+ : converter_(flags, inf_symbol, nan_symbol, exp_character, decimal_in_shortest_low,
+ decimal_in_shortest_high, max_leading_padding_zeroes_in_precision_mode,
+ max_trailing_padding_zeroes_in_precision_mode) {}
+
DoubleToStringConverter converter_;
};
FloatToStringFormatter::FloatToStringFormatter() : impl_(new Impl()) {}
-FloatToStringFormatter::FloatToStringFormatter(
- int flags, const char* inf_symbol, const char* nan_symbol, char exp_character,
- int decimal_in_shortest_low, int decimal_in_shortest_high,
- int max_leading_padding_zeroes_in_precision_mode,
- int max_trailing_padding_zeroes_in_precision_mode)
- : impl_(new Impl(flags, inf_symbol, nan_symbol, exp_character,
- decimal_in_shortest_low, decimal_in_shortest_high,
- max_leading_padding_zeroes_in_precision_mode,
- max_trailing_padding_zeroes_in_precision_mode)) {}
-
+FloatToStringFormatter::FloatToStringFormatter(
+ int flags, const char* inf_symbol, const char* nan_symbol, char exp_character,
+ int decimal_in_shortest_low, int decimal_in_shortest_high,
+ int max_leading_padding_zeroes_in_precision_mode,
+ int max_trailing_padding_zeroes_in_precision_mode)
+ : impl_(new Impl(flags, inf_symbol, nan_symbol, exp_character,
+ decimal_in_shortest_low, decimal_in_shortest_high,
+ max_leading_padding_zeroes_in_precision_mode,
+ max_trailing_padding_zeroes_in_precision_mode)) {}
+
FloatToStringFormatter::~FloatToStringFormatter() {}
int FloatToStringFormatter::FormatFloat(float v, char* out_buffer, int out_size) {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h
index ac91ec6a123..566c9795f83 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/formatting.h
@@ -31,7 +31,7 @@
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
-#include "arrow/util/double_conversion.h"
+#include "arrow/util/double_conversion.h"
#include "arrow/util/string_view.h"
#include "arrow/util/time.h"
#include "arrow/util/visibility.h"
@@ -220,11 +220,11 @@ class StringFormatter<UInt64Type> : public IntToStringFormatterMixin<UInt64Type>
class ARROW_EXPORT FloatToStringFormatter {
public:
FloatToStringFormatter();
- FloatToStringFormatter(int flags, const char* inf_symbol, const char* nan_symbol,
- char exp_character, int decimal_in_shortest_low,
- int decimal_in_shortest_high,
- int max_leading_padding_zeroes_in_precision_mode,
- int max_trailing_padding_zeroes_in_precision_mode);
+ FloatToStringFormatter(int flags, const char* inf_symbol, const char* nan_symbol,
+ char exp_character, int decimal_in_shortest_low,
+ int decimal_in_shortest_high,
+ int max_leading_padding_zeroes_in_precision_mode,
+ int max_trailing_padding_zeroes_in_precision_mode);
~FloatToStringFormatter();
// Returns the number of characters written
@@ -245,16 +245,16 @@ class FloatToStringFormatterMixin : public FloatToStringFormatter {
explicit FloatToStringFormatterMixin(const std::shared_ptr<DataType>& = NULLPTR) {}
- FloatToStringFormatterMixin(int flags, const char* inf_symbol, const char* nan_symbol,
- char exp_character, int decimal_in_shortest_low,
- int decimal_in_shortest_high,
- int max_leading_padding_zeroes_in_precision_mode,
- int max_trailing_padding_zeroes_in_precision_mode)
- : FloatToStringFormatter(flags, inf_symbol, nan_symbol, exp_character,
- decimal_in_shortest_low, decimal_in_shortest_high,
- max_leading_padding_zeroes_in_precision_mode,
- max_trailing_padding_zeroes_in_precision_mode) {}
-
+ FloatToStringFormatterMixin(int flags, const char* inf_symbol, const char* nan_symbol,
+ char exp_character, int decimal_in_shortest_low,
+ int decimal_in_shortest_high,
+ int max_leading_padding_zeroes_in_precision_mode,
+ int max_trailing_padding_zeroes_in_precision_mode)
+ : FloatToStringFormatter(flags, inf_symbol, nan_symbol, exp_character,
+ decimal_in_shortest_low, decimal_in_shortest_high,
+ max_leading_padding_zeroes_in_precision_mode,
+ max_trailing_padding_zeroes_in_precision_mode) {}
+
template <typename Appender>
Return<Appender> operator()(value_type value, Appender&& append) {
char buffer[buffer_size];
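// Illustrative sketch (not part of the patch): FormatFloat fills a
// caller-owned buffer and returns the number of characters written; the
// 64-byte buffer is an illustrative guess, and the namespace is assumed to
// follow formatting.h (arrow::internal).
#include <string>
inline std::string FormatOneFloat(float v) {
  arrow::internal::FloatToStringFormatter formatter;  // shortest-form default
  char buffer[64];
  const int n = formatter.FormatFloat(v, buffer, static_cast<int>(sizeof(buffer)));
  return std::string(buffer, static_cast<size_t>(n));
}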
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h
index 30b3066d06e..9da79046fec 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/functional.h
@@ -17,27 +17,27 @@
#pragma once
-#include <memory>
+#include <memory>
#include <tuple>
#include <type_traits>
-#include "arrow/result.h"
+#include "arrow/result.h"
#include "arrow/util/macros.h"
namespace arrow {
namespace internal {
-struct Empty {
- static Result<Empty> ToResult(Status s) {
- if (ARROW_PREDICT_TRUE(s.ok())) {
- return Empty{};
- }
- return s;
- }
-};
-
+struct Empty {
+ static Result<Empty> ToResult(Status s) {
+ if (ARROW_PREDICT_TRUE(s.ok())) {
+ return Empty{};
+ }
+ return s;
+ }
+};
+
/// Helper struct for examining lambdas and other callables.
-/// TODO(ARROW-12655) support function pointers
+/// TODO(ARROW-12655) support function pointers
struct call_traits {
public:
template <typename R, typename... A>
@@ -63,20 +63,20 @@ struct call_traits {
static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
R (F::*)(A...) const);
- template <std::size_t I, typename F, typename R, typename... A>
- static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
- R (F::*)(A...) &&);
-
- template <typename F, typename R, typename... A>
- static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...));
-
- template <typename F, typename R, typename... A>
- static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...)
- const);
-
- template <typename F, typename R, typename... A>
- static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...) &&);
-
+ template <std::size_t I, typename F, typename R, typename... A>
+ static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
+ R (F::*)(A...) &&);
+
+ template <typename F, typename R, typename... A>
+ static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...));
+
+ template <typename F, typename R, typename... A>
+ static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...)
+ const);
+
+ template <typename F, typename R, typename... A>
+ static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...) &&);
+
/// bool constant indicating whether F is a callable with more than one possible
/// signature. Will be true_type for objects which define multiple operator() or which
/// define a template operator()
@@ -97,64 +97,64 @@ struct call_traits {
using argument_type = decltype(argument_type_impl<I>(&std::decay<F>::type::operator()));
template <typename F>
- using argument_count = decltype(argument_count_impl(&std::decay<F>::type::operator()));
-
- template <typename F>
+ using argument_count = decltype(argument_count_impl(&std::decay<F>::type::operator()));
+
+ template <typename F>
using return_type = decltype(return_type_impl(&std::decay<F>::type::operator()));
template <typename F, typename T, typename RT = T>
using enable_if_return =
typename std::enable_if<std::is_same<return_type<F>, T>::value, RT>;
-
- template <typename T, typename R = void>
- using enable_if_empty = typename std::enable_if<std::is_same<T, Empty>::value, R>::type;
-
- template <typename T, typename R = void>
- using enable_if_not_empty =
- typename std::enable_if<!std::is_same<T, Empty>::value, R>::type;
+
+ template <typename T, typename R = void>
+ using enable_if_empty = typename std::enable_if<std::is_same<T, Empty>::value, R>::type;
+
+ template <typename T, typename R = void>
+ using enable_if_not_empty =
+ typename std::enable_if<!std::is_same<T, Empty>::value, R>::type;
+};
+
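// Illustrative sketch (not part of the patch): what call_traits recovers from
// a plain (non-generic) lambda via &F::operator().
#include <type_traits>
inline void CallTraitsExample() {
  auto f = [](int x, double) -> float { return static_cast<float>(x); };
  using F = decltype(f);
  using traits = arrow::internal::call_traits;
  static_assert(std::is_same<traits::return_type<F>, float>::value, "");
  static_assert(traits::argument_count<F>::value == 2, "");
  static_assert(std::is_same<traits::argument_type<0, F>, int>::value, "");
}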
+/// A type erased callable object which may only be invoked once.
+/// It can be constructed from any lambda which matches the provided call signature.
+/// Invoking it results in destruction of the lambda, freeing any state/references
+/// immediately. Invoking a default constructed FnOnce or one which has already been
+/// invoked will segfault.
+template <typename Signature>
+class FnOnce;
+
+template <typename R, typename... A>
+class FnOnce<R(A...)> {
+ public:
+ FnOnce() = default;
+
+ template <typename Fn,
+ typename = typename std::enable_if<std::is_convertible<
+ typename std::result_of<Fn && (A...)>::type, R>::value>::type>
+ FnOnce(Fn fn) : impl_(new FnImpl<Fn>(std::move(fn))) { // NOLINT runtime/explicit
+ }
+
+ explicit operator bool() const { return impl_ != NULLPTR; }
+
+ R operator()(A... a) && {
+ auto bye = std::move(impl_);
+ return bye->invoke(std::forward<A&&>(a)...);
+ }
+
+ private:
+ struct Impl {
+ virtual ~Impl() = default;
+ virtual R invoke(A&&... a) = 0;
+ };
+
+ template <typename Fn>
+ struct FnImpl : Impl {
+ explicit FnImpl(Fn fn) : fn_(std::move(fn)) {}
+ R invoke(A&&... a) override { return std::move(fn_)(std::forward<A&&>(a)...); }
+ Fn fn_;
+ };
+
+ std::unique_ptr<Impl> impl_;
};
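// Illustrative sketch (not part of the patch): FnOnce must be invoked as an
// rvalue; the wrapped callable (and its captured state) is destroyed by the
// call itself, so a second invocation is undefined (see the warning above).
#include <memory>
#include <utility>
inline int FnOnceExample() {
  std::shared_ptr<int> p(new int(42));
  arrow::internal::FnOnce<int()> once = [p] { return *p; };
  return std::move(once)();  // `once` is empty afterwards
}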
-/// A type erased callable object which may only be invoked once.
-/// It can be constructed from any lambda which matches the provided call signature.
-/// Invoking it results in destruction of the lambda, freeing any state/references
-/// immediately. Invoking a default constructed FnOnce or one which has already been
-/// invoked will segfault.
-template <typename Signature>
-class FnOnce;
-
-template <typename R, typename... A>
-class FnOnce<R(A...)> {
- public:
- FnOnce() = default;
-
- template <typename Fn,
- typename = typename std::enable_if<std::is_convertible<
- typename std::result_of<Fn && (A...)>::type, R>::value>::type>
- FnOnce(Fn fn) : impl_(new FnImpl<Fn>(std::move(fn))) { // NOLINT runtime/explicit
- }
-
- explicit operator bool() const { return impl_ != NULLPTR; }
-
- R operator()(A... a) && {
- auto bye = std::move(impl_);
- return bye->invoke(std::forward<A&&>(a)...);
- }
-
- private:
- struct Impl {
- virtual ~Impl() = default;
- virtual R invoke(A&&... a) = 0;
- };
-
- template <typename Fn>
- struct FnImpl : Impl {
- explicit FnImpl(Fn fn) : fn_(std::move(fn)) {}
- R invoke(A&&... a) override { return std::move(fn_)(std::forward<A&&>(a)...); }
- Fn fn_;
- };
-
- std::unique_ptr<Impl> impl_;
-};
-
} // namespace internal
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc
index 640ff63655a..f288a15be3f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/future.cc
@@ -26,7 +26,7 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
-#include "arrow/util/thread_pool.h"
+#include "arrow/util/thread_pool.h"
namespace arrow {
@@ -40,8 +40,8 @@ using internal::checked_cast;
// should ideally not limit scalability.
static std::mutex global_waiter_mutex;
-const double FutureWaiter::kInfinity = HUGE_VAL;
-
+const double FutureWaiter::kInfinity = HUGE_VAL;
+
class FutureWaiterImpl : public FutureWaiter {
public:
FutureWaiterImpl(Kind kind, std::vector<FutureImpl*> futures)
@@ -76,7 +76,7 @@ class FutureWaiterImpl : public FutureWaiter {
}
}
- ~FutureWaiterImpl() override {
+ ~FutureWaiterImpl() override {
for (auto future : futures_) {
future->RemoveWaiter(this);
}
@@ -177,9 +177,9 @@ FutureWaiterImpl* GetConcreteWaiter(FutureWaiter* waiter) {
} // namespace
-FutureWaiter::FutureWaiter() = default;
+FutureWaiter::FutureWaiter() = default;
-FutureWaiter::~FutureWaiter() = default;
+FutureWaiter::~FutureWaiter() = default;
std::unique_ptr<FutureWaiter> FutureWaiter::Make(Kind kind,
std::vector<FutureImpl*> futures) {
@@ -232,70 +232,70 @@ class ConcreteFutureImpl : public FutureImpl {
void DoMarkFailed() { DoMarkFinishedOrFailed(FutureState::FAILURE); }
- void CheckOptions(const CallbackOptions& opts) {
- if (opts.should_schedule != ShouldSchedule::Never) {
- DCHECK_NE(opts.executor, nullptr)
- << "An executor must be specified when adding a callback that might schedule";
- }
- }
-
- void AddCallback(Callback callback, CallbackOptions opts) {
- CheckOptions(opts);
- std::unique_lock<std::mutex> lock(mutex_);
- CallbackRecord callback_record{std::move(callback), opts};
- if (IsFutureFinished(state_)) {
- lock.unlock();
- RunOrScheduleCallback(std::move(callback_record), /*in_add_callback=*/true);
- } else {
- callbacks_.push_back(std::move(callback_record));
- }
- }
-
- bool TryAddCallback(const std::function<Callback()>& callback_factory,
- CallbackOptions opts) {
- CheckOptions(opts);
- std::unique_lock<std::mutex> lock(mutex_);
- if (IsFutureFinished(state_)) {
- return false;
- } else {
- callbacks_.push_back({callback_factory(), opts});
- return true;
- }
- }
-
- bool ShouldScheduleCallback(const CallbackRecord& callback_record,
- bool in_add_callback) {
- switch (callback_record.options.should_schedule) {
- case ShouldSchedule::Never:
- return false;
- case ShouldSchedule::Always:
- return true;
- case ShouldSchedule::IfUnfinished:
- return !in_add_callback;
- case ShouldSchedule::IfDifferentExecutor:
- return !callback_record.options.executor->OwnsThisThread();
- default:
- DCHECK(false) << "Unrecognized ShouldSchedule option";
- return false;
- }
- }
-
- void RunOrScheduleCallback(CallbackRecord&& callback_record, bool in_add_callback) {
- if (ShouldScheduleCallback(callback_record, in_add_callback)) {
- struct CallbackTask {
- void operator()() { std::move(callback)(*self); }
-
- Callback callback;
- std::shared_ptr<FutureImpl> self;
- };
- // Need to keep `this` alive until the callback has a chance to be scheduled.
- CallbackTask task{std::move(callback_record.callback), shared_from_this()};
- DCHECK_OK(callback_record.options.executor->Spawn(std::move(task)));
- } else {
- std::move(callback_record.callback)(*this);
- }
- }
-
+ void CheckOptions(const CallbackOptions& opts) {
+ if (opts.should_schedule != ShouldSchedule::Never) {
+ DCHECK_NE(opts.executor, nullptr)
+ << "An executor must be specified when adding a callback that might schedule";
+ }
+ }
+
+ void AddCallback(Callback callback, CallbackOptions opts) {
+ CheckOptions(opts);
+ std::unique_lock<std::mutex> lock(mutex_);
+ CallbackRecord callback_record{std::move(callback), opts};
+ if (IsFutureFinished(state_)) {
+ lock.unlock();
+ RunOrScheduleCallback(std::move(callback_record), /*in_add_callback=*/true);
+ } else {
+ callbacks_.push_back(std::move(callback_record));
+ }
+ }
+
+ bool TryAddCallback(const std::function<Callback()>& callback_factory,
+ CallbackOptions opts) {
+ CheckOptions(opts);
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (IsFutureFinished(state_)) {
+ return false;
+ } else {
+ callbacks_.push_back({callback_factory(), opts});
+ return true;
+ }
+ }
+
+ bool ShouldScheduleCallback(const CallbackRecord& callback_record,
+ bool in_add_callback) {
+ switch (callback_record.options.should_schedule) {
+ case ShouldSchedule::Never:
+ return false;
+ case ShouldSchedule::Always:
+ return true;
+ case ShouldSchedule::IfUnfinished:
+ return !in_add_callback;
+ case ShouldSchedule::IfDifferentExecutor:
+ return !callback_record.options.executor->OwnsThisThread();
+ default:
+ DCHECK(false) << "Unrecognized ShouldSchedule option";
+ return false;
+ }
+ }
+
+ void RunOrScheduleCallback(CallbackRecord&& callback_record, bool in_add_callback) {
+ if (ShouldScheduleCallback(callback_record, in_add_callback)) {
+ struct CallbackTask {
+ void operator()() { std::move(callback)(*self); }
+
+ Callback callback;
+ std::shared_ptr<FutureImpl> self;
+ };
+ // Need to keep `this` alive until the callback has a chance to be scheduled.
+ CallbackTask task{std::move(callback_record.callback), shared_from_this()};
+ DCHECK_OK(callback_record.options.executor->Spawn(std::move(task)));
+ } else {
+ std::move(callback_record.callback)(*this);
+ }
+ }
+
void DoMarkFinishedOrFailed(FutureState state) {
{
// Lock the hypothetical waiter first, and the future after.
@@ -310,17 +310,17 @@ class ConcreteFutureImpl : public FutureImpl {
}
}
cv_.notify_all();
-
- // run callbacks, lock not needed since the future is finished by this
- // point so nothing else can modify the callbacks list and it is safe
- // to iterate.
- //
- // In fact, it is important not to hold the locks because the callback
- // may be slow or do its own locking on other resources
- for (auto& callback_record : callbacks_) {
- RunOrScheduleCallback(std::move(callback_record), /*in_add_callback=*/false);
- }
- callbacks_.clear();
+
+ // run callbacks, lock not needed since the future is finished by this
+ // point so nothing else can modify the callbacks list and it is safe
+ // to iterate.
+ //
+ // In fact, it is important not to hold the locks because the callback
+ // may be slow or do its own locking on other resources
+ for (auto& callback_record : callbacks_) {
+ RunOrScheduleCallback(std::move(callback_record), /*in_add_callback=*/false);
+ }
+ callbacks_.clear();
}
void DoWait() {
@@ -355,12 +355,12 @@ std::unique_ptr<FutureImpl> FutureImpl::Make() {
return std::unique_ptr<FutureImpl>(new ConcreteFutureImpl());
}
-std::unique_ptr<FutureImpl> FutureImpl::MakeFinished(FutureState state) {
- std::unique_ptr<ConcreteFutureImpl> ptr(new ConcreteFutureImpl());
- ptr->state_ = state;
- return std::move(ptr);
-}
-
+std::unique_ptr<FutureImpl> FutureImpl::MakeFinished(FutureState state) {
+ std::unique_ptr<ConcreteFutureImpl> ptr(new ConcreteFutureImpl());
+ ptr->state_ = state;
+ return std::move(ptr);
+}
+
FutureImpl::FutureImpl() : state_(FutureState::PENDING) {}
FutureState FutureImpl::SetWaiter(FutureWaiter* w, int future_num) {
@@ -379,43 +379,43 @@ void FutureImpl::MarkFinished() { GetConcreteFuture(this)->DoMarkFinished(); }
void FutureImpl::MarkFailed() { GetConcreteFuture(this)->DoMarkFailed(); }
-void FutureImpl::AddCallback(Callback callback, CallbackOptions opts) {
- GetConcreteFuture(this)->AddCallback(std::move(callback), opts);
-}
-
-bool FutureImpl::TryAddCallback(const std::function<Callback()>& callback_factory,
- CallbackOptions opts) {
- return GetConcreteFuture(this)->TryAddCallback(callback_factory, opts);
-}
-
-Future<> AllComplete(const std::vector<Future<>>& futures) {
- struct State {
- explicit State(int64_t n_futures) : mutex(), n_remaining(n_futures) {}
-
- std::mutex mutex;
- std::atomic<size_t> n_remaining;
- };
-
- if (futures.empty()) {
- return Future<>::MakeFinished();
- }
-
- auto state = std::make_shared<State>(futures.size());
- auto out = Future<>::Make();
- for (const auto& future : futures) {
- future.AddCallback([state, out](const Status& status) mutable {
- if (!status.ok()) {
- std::unique_lock<std::mutex> lock(state->mutex);
- if (!out.is_finished()) {
- out.MarkFinished(status);
- }
- return;
- }
- if (state->n_remaining.fetch_sub(1) != 1) return;
- out.MarkFinished();
- });
- }
- return out;
-}
-
+void FutureImpl::AddCallback(Callback callback, CallbackOptions opts) {
+ GetConcreteFuture(this)->AddCallback(std::move(callback), opts);
+}
+
+bool FutureImpl::TryAddCallback(const std::function<Callback()>& callback_factory,
+ CallbackOptions opts) {
+ return GetConcreteFuture(this)->TryAddCallback(callback_factory, opts);
+}
+
+Future<> AllComplete(const std::vector<Future<>>& futures) {
+ struct State {
+ explicit State(int64_t n_futures) : mutex(), n_remaining(n_futures) {}
+
+ std::mutex mutex;
+ std::atomic<size_t> n_remaining;
+ };
+
+ if (futures.empty()) {
+ return Future<>::MakeFinished();
+ }
+
+ auto state = std::make_shared<State>(futures.size());
+ auto out = Future<>::Make();
+ for (const auto& future : futures) {
+ future.AddCallback([state, out](const Status& status) mutable {
+ if (!status.ok()) {
+ std::unique_lock<std::mutex> lock(state->mutex);
+ if (!out.is_finished()) {
+ out.MarkFinished(status);
+ }
+ return;
+ }
+ if (state->n_remaining.fetch_sub(1) != 1) return;
+ out.MarkFinished();
+ });
+ }
+ return out;
+}
+
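// Illustrative sketch (not part of the patch): combining futures with
// AllComplete; the two component futures are placeholders.
#include <vector>
inline arrow::Future<> WaitForBoth(arrow::Future<> a, arrow::Future<> b) {
  std::vector<arrow::Future<>> futures{std::move(a), std::move(b)};
  // Finishes OK once every future succeeds, or with the first error status.
  return arrow::AllComplete(futures);
}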
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/future.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/future.h
index 9352de6596f..d9e0a939f25 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/future.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/future.h
@@ -18,8 +18,8 @@
#pragma once
#include <atomic>
-#include <cmath>
-#include <functional>
+#include <cmath>
+#include <functional>
#include <memory>
#include <type_traits>
#include <utility>
@@ -27,263 +27,263 @@
#include "arrow/result.h"
#include "arrow/status.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/functional.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/functional.h"
#include "arrow/util/macros.h"
-#include "arrow/util/optional.h"
-#include "arrow/util/type_fwd.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
namespace arrow {
-template <typename>
-struct EnsureFuture;
-
-namespace detail {
-
-template <typename>
-struct is_future : std::false_type {};
-
-template <typename T>
-struct is_future<Future<T>> : std::true_type {};
-
-template <typename Signature>
-using result_of_t = typename std::result_of<Signature>::type;
-
-// Helper to find the synchronous counterpart for a Future
-template <typename T>
-struct SyncType {
- using type = Result<T>;
-};
-
-template <>
-struct SyncType<internal::Empty> {
- using type = Status;
-};
-
-template <typename Fn>
-using first_arg_is_status =
- std::is_same<typename std::decay<internal::call_traits::argument_type<0, Fn>>::type,
- Status>;
-
-template <typename Fn, typename Then, typename Else,
- typename Count = internal::call_traits::argument_count<Fn>>
-using if_has_no_args = typename std::conditional<Count::value == 0, Then, Else>::type;
-
-/// Creates a callback that can be added to a future to mark a `dest` future finished
-template <typename Source, typename Dest, bool SourceEmpty = Source::is_empty,
- bool DestEmpty = Dest::is_empty>
-struct MarkNextFinished {};
-
-/// If the source and dest are both empty we can pass on the status
-template <typename Source, typename Dest>
-struct MarkNextFinished<Source, Dest, true, true> {
- void operator()(const Status& status) && { next.MarkFinished(status); }
- Dest next;
-};
-
-/// If the source is not empty but the dest is then we can take the
-/// status out of the result
-template <typename Source, typename Dest>
-struct MarkNextFinished<Source, Dest, false, true> {
- void operator()(const Result<typename Source::ValueType>& res) && {
- next.MarkFinished(internal::Empty::ToResult(res.status()));
- }
- Dest next;
-};
-
-/// If neither are empty we pass on the result
-template <typename Source, typename Dest>
-struct MarkNextFinished<Source, Dest, false, false> {
- void operator()(const Result<typename Source::ValueType>& res) && {
- next.MarkFinished(res);
- }
- Dest next;
-};
-
-/// Helper that contains information about how to apply a continuation
-struct ContinueFuture {
- template <typename Return>
- struct ForReturnImpl;
-
- template <typename Return>
- using ForReturn = typename ForReturnImpl<Return>::type;
-
- template <typename Signature>
- using ForSignature = ForReturn<result_of_t<Signature>>;
-
- // If the callback returns void then we return Future<> that always finishes OK.
- template <typename ContinueFunc, typename... Args,
- typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
- typename NextFuture = ForReturn<ContinueResult>>
- typename std::enable_if<std::is_void<ContinueResult>::value>::type operator()(
- NextFuture next, ContinueFunc&& f, Args&&... a) const {
- std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
- next.MarkFinished();
- }
-
- /// If the callback returns a non-future then we return Future<T>
- /// and mark the future finished with the callback result. It will get promoted
- /// to Result<T> as part of MarkFinished if it isn't already.
- ///
- /// If the callback returns Status and we return Future<> then also send the callback
- /// result as-is to the destination future.
- template <typename ContinueFunc, typename... Args,
- typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
- typename NextFuture = ForReturn<ContinueResult>>
- typename std::enable_if<
- !std::is_void<ContinueResult>::value && !is_future<ContinueResult>::value &&
- (!NextFuture::is_empty || std::is_same<ContinueResult, Status>::value)>::type
- operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
- next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...));
- }
-
- /// If the callback returns a Result and the next future is Future<> then we mark
- /// the future finished with the callback result.
- ///
- /// It may seem odd that the next future is Future<> when the callback returns a
- /// result but this can occur if the OnFailure callback returns a result while the
- /// OnSuccess callback is void/Status (e.g. you would get this calling the one-arg
- /// version of Then with an OnSuccess callback that returns void)
- template <typename ContinueFunc, typename... Args,
- typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
- typename NextFuture = ForReturn<ContinueResult>>
- typename std::enable_if<!std::is_void<ContinueResult>::value &&
- !is_future<ContinueResult>::value && NextFuture::is_empty &&
- !std::is_same<ContinueResult, Status>::value>::type
- operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
- next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...).status());
- }
-
- /// If the callback returns a Future<T> then we return Future<T>. We create a new
- /// future and add a callback to the future given to us by the user that forwards the
- /// result to the future we just created
- template <typename ContinueFunc, typename... Args,
- typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
- typename NextFuture = ForReturn<ContinueResult>>
- typename std::enable_if<is_future<ContinueResult>::value>::type operator()(
- NextFuture next, ContinueFunc&& f, Args&&... a) const {
- ContinueResult signal_to_complete_next =
- std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
- MarkNextFinished<ContinueResult, NextFuture> callback{std::move(next)};
- signal_to_complete_next.AddCallback(std::move(callback));
- }
-
- /// Helpers to conditionally ignore arguments to ContinueFunc
- template <typename ContinueFunc, typename NextFuture, typename... Args>
- void IgnoringArgsIf(std::true_type, NextFuture&& next, ContinueFunc&& f,
- Args&&...) const {
- operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f));
- }
- template <typename ContinueFunc, typename NextFuture, typename... Args>
- void IgnoringArgsIf(std::false_type, NextFuture&& next, ContinueFunc&& f,
- Args&&... a) const {
- operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f),
- std::forward<Args>(a)...);
- }
-};
-
-/// Helper struct which tells us what kind of Future gets returned from `Then` based on
-/// the return type of the OnSuccess callback
-template <>
-struct ContinueFuture::ForReturnImpl<void> {
- using type = Future<>;
-};
-
-template <>
-struct ContinueFuture::ForReturnImpl<Status> {
- using type = Future<>;
-};
-
-template <typename R>
-struct ContinueFuture::ForReturnImpl {
- using type = Future<R>;
-};
-
-template <typename T>
-struct ContinueFuture::ForReturnImpl<Result<T>> {
- using type = Future<T>;
-};
-
-template <typename T>
-struct ContinueFuture::ForReturnImpl<Future<T>> {
- using type = Future<T>;
-};
-
-} // namespace detail
-
+template <typename>
+struct EnsureFuture;
+
+namespace detail {
+
+template <typename>
+struct is_future : std::false_type {};
+
+template <typename T>
+struct is_future<Future<T>> : std::true_type {};
+
+template <typename Signature>
+using result_of_t = typename std::result_of<Signature>::type;
+
+// Helper to find the synchronous counterpart for a Future
+template <typename T>
+struct SyncType {
+ using type = Result<T>;
+};
+
+template <>
+struct SyncType<internal::Empty> {
+ using type = Status;
+};
+
+template <typename Fn>
+using first_arg_is_status =
+ std::is_same<typename std::decay<internal::call_traits::argument_type<0, Fn>>::type,
+ Status>;
+
+template <typename Fn, typename Then, typename Else,
+ typename Count = internal::call_traits::argument_count<Fn>>
+using if_has_no_args = typename std::conditional<Count::value == 0, Then, Else>::type;
+
+/// Creates a callback that can be added to a future to mark a `dest` future finished
+template <typename Source, typename Dest, bool SourceEmpty = Source::is_empty,
+ bool DestEmpty = Dest::is_empty>
+struct MarkNextFinished {};
+
+/// If the source and dest are both empty we can pass on the status
+template <typename Source, typename Dest>
+struct MarkNextFinished<Source, Dest, true, true> {
+ void operator()(const Status& status) && { next.MarkFinished(status); }
+ Dest next;
+};
+
+/// If the source is not empty but the dest is then we can take the
+/// status out of the result
+template <typename Source, typename Dest>
+struct MarkNextFinished<Source, Dest, false, true> {
+ void operator()(const Result<typename Source::ValueType>& res) && {
+ next.MarkFinished(internal::Empty::ToResult(res.status()));
+ }
+ Dest next;
+};
+
+/// If neither are empty we pass on the result
+template <typename Source, typename Dest>
+struct MarkNextFinished<Source, Dest, false, false> {
+ void operator()(const Result<typename Source::ValueType>& res) && {
+ next.MarkFinished(res);
+ }
+ Dest next;
+};
+
+/// Helper that contains information about how to apply a continuation
+struct ContinueFuture {
+ template <typename Return>
+ struct ForReturnImpl;
+
+ template <typename Return>
+ using ForReturn = typename ForReturnImpl<Return>::type;
+
+ template <typename Signature>
+ using ForSignature = ForReturn<result_of_t<Signature>>;
+
+ // If the callback returns void then we return Future<> that always finishes OK.
+ template <typename ContinueFunc, typename... Args,
+ typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+ typename NextFuture = ForReturn<ContinueResult>>
+ typename std::enable_if<std::is_void<ContinueResult>::value>::type operator()(
+ NextFuture next, ContinueFunc&& f, Args&&... a) const {
+ std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
+ next.MarkFinished();
+ }
+
+ /// If the callback returns a non-future then we return Future<T>
+ /// and mark the future finished with the callback result. It will get promoted
+ /// to Result<T> as part of MarkFinished if it isn't already.
+ ///
+ /// If the callback returns Status and we return Future<> then also send the callback
+ /// result as-is to the destination future.
+ template <typename ContinueFunc, typename... Args,
+ typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+ typename NextFuture = ForReturn<ContinueResult>>
+ typename std::enable_if<
+ !std::is_void<ContinueResult>::value && !is_future<ContinueResult>::value &&
+ (!NextFuture::is_empty || std::is_same<ContinueResult, Status>::value)>::type
+ operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
+ next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...));
+ }
+
+ /// If the callback returns a Result and the next future is Future<> then we mark
+ /// the future finished with the callback result.
+ ///
+ /// It may seem odd that the next future is Future<> when the callback returns a
+ /// result but this can occur if the OnFailure callback returns a result while the
+ /// OnSuccess callback is void/Status (e.g. you would get this calling the one-arg
+ /// version of Then with an OnSuccess callback that returns void)
+ template <typename ContinueFunc, typename... Args,
+ typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+ typename NextFuture = ForReturn<ContinueResult>>
+ typename std::enable_if<!std::is_void<ContinueResult>::value &&
+ !is_future<ContinueResult>::value && NextFuture::is_empty &&
+ !std::is_same<ContinueResult, Status>::value>::type
+ operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
+ next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...).status());
+ }
+
+ /// If the callback returns a Future<T> then we return Future<T>. We create a new
+ /// future and add a callback to the future given to us by the user that forwards the
+ /// result to the future we just created
+ template <typename ContinueFunc, typename... Args,
+ typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+ typename NextFuture = ForReturn<ContinueResult>>
+ typename std::enable_if<is_future<ContinueResult>::value>::type operator()(
+ NextFuture next, ContinueFunc&& f, Args&&... a) const {
+ ContinueResult signal_to_complete_next =
+ std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
+ MarkNextFinished<ContinueResult, NextFuture> callback{std::move(next)};
+ signal_to_complete_next.AddCallback(std::move(callback));
+ }
+
+ /// Helpers to conditionally ignore arguments to ContinueFunc
+ template <typename ContinueFunc, typename NextFuture, typename... Args>
+ void IgnoringArgsIf(std::true_type, NextFuture&& next, ContinueFunc&& f,
+ Args&&...) const {
+ operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f));
+ }
+ template <typename ContinueFunc, typename NextFuture, typename... Args>
+ void IgnoringArgsIf(std::false_type, NextFuture&& next, ContinueFunc&& f,
+ Args&&... a) const {
+ operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f),
+ std::forward<Args>(a)...);
+ }
+};
+
+/// Helper struct which tells us what kind of Future gets returned from `Then` based on
+/// the return type of the OnSuccess callback
+template <>
+struct ContinueFuture::ForReturnImpl<void> {
+ using type = Future<>;
+};
+
+template <>
+struct ContinueFuture::ForReturnImpl<Status> {
+ using type = Future<>;
+};
+
+template <typename R>
+struct ContinueFuture::ForReturnImpl {
+ using type = Future<R>;
+};
+
+template <typename T>
+struct ContinueFuture::ForReturnImpl<Result<T>> {
+ using type = Future<T>;
+};
+
+template <typename T>
+struct ContinueFuture::ForReturnImpl<Future<T>> {
+ using type = Future<T>;
+};
+
+} // namespace detail
+
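// Illustrative sketch (not part of the patch): the ForReturn mapping spelled
// out; this is what decides the Future type that Then() hands back.
static_assert(std::is_same<arrow::detail::ContinueFuture::ForReturn<void>,
                           arrow::Future<>>::value,
              "void-returning callbacks yield Future<>");
static_assert(std::is_same<arrow::detail::ContinueFuture::ForReturn<arrow::Status>,
                           arrow::Future<>>::value,
              "Status-returning callbacks yield Future<>");
static_assert(std::is_same<arrow::detail::ContinueFuture::ForReturn<arrow::Result<int>>,
                           arrow::Future<int>>::value,
              "Result<T>-returning callbacks yield Future<T>");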
/// A Future's execution or completion status
enum class FutureState : int8_t { PENDING, SUCCESS, FAILURE };
inline bool IsFutureFinished(FutureState state) { return state != FutureState::PENDING; }
-/// \brief Describe whether the callback should be scheduled or run synchronously
-enum class ShouldSchedule {
- /// Always run the callback synchronously (the default)
- Never = 0,
- /// Schedule a new task only if the future is not finished when the
- /// callback is added
- IfUnfinished = 1,
- /// Always schedule the callback as a new task
- Always = 2,
- /// Schedule a new task only if it would run on an executor other than
- /// the specified executor.
- IfDifferentExecutor = 3,
-};
-
-/// \brief Options that control how a continuation is run
-struct CallbackOptions {
- /// Describe whether the callback should be run synchronously or scheduled
- ShouldSchedule should_schedule = ShouldSchedule::Never;
- /// If the callback is scheduled then this is the executor it should be scheduled
- /// on. If this is NULL then should_schedule must be Never
- internal::Executor* executor = NULLPTR;
-
- static CallbackOptions Defaults() { return {}; }
-};
-
-// Untyped private implementation
-class ARROW_EXPORT FutureImpl : public std::enable_shared_from_this<FutureImpl> {
+/// \brief Describe whether the callback should be scheduled or run synchronously
+enum class ShouldSchedule {
+ /// Always run the callback synchronously (the default)
+ Never = 0,
+ /// Schedule a new task only if the future is not finished when the
+ /// callback is added
+ IfUnfinished = 1,
+ /// Always schedule the callback as a new task
+ Always = 2,
+ /// Schedule a new task only if it would run on an executor other than
+ /// the specified executor.
+ IfDifferentExecutor = 3,
+};
+
+/// \brief Options that control how a continuation is run
+struct CallbackOptions {
+ /// Describe whether the callback should be run synchronously or scheduled
+ ShouldSchedule should_schedule = ShouldSchedule::Never;
+ /// If the callback is scheduled then this is the executor it should be scheduled
+ /// on. If this is NULL then should_schedule must be Never
+ internal::Executor* executor = NULLPTR;
+
+ static CallbackOptions Defaults() { return {}; }
+};
+
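// Illustrative sketch (not part of the patch): opting into scheduled
// callbacks. The executor is assumed to come from somewhere like
// arrow::internal::GetCpuThreadPool() and must be non-null whenever
// should_schedule != ShouldSchedule::Never.
inline arrow::CallbackOptions AlwaysScheduleOn(arrow::internal::Executor* executor) {
  arrow::CallbackOptions options = arrow::CallbackOptions::Defaults();
  options.should_schedule = arrow::ShouldSchedule::Always;
  options.executor = executor;
  return options;
}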
+// Untyped private implementation
+class ARROW_EXPORT FutureImpl : public std::enable_shared_from_this<FutureImpl> {
public:
- FutureImpl();
+ FutureImpl();
virtual ~FutureImpl() = default;
FutureState state() { return state_.load(); }
static std::unique_ptr<FutureImpl> Make();
- static std::unique_ptr<FutureImpl> MakeFinished(FutureState state);
+ static std::unique_ptr<FutureImpl> MakeFinished(FutureState state);
// Future API
void MarkFinished();
void MarkFailed();
void Wait();
bool Wait(double seconds);
- template <typename ValueType>
- Result<ValueType>* CastResult() const {
- return static_cast<Result<ValueType>*>(result_.get());
- }
-
- using Callback = internal::FnOnce<void(const FutureImpl& impl)>;
- void AddCallback(Callback callback, CallbackOptions opts);
- bool TryAddCallback(const std::function<Callback()>& callback_factory,
- CallbackOptions opts);
-
+ template <typename ValueType>
+ Result<ValueType>* CastResult() const {
+ return static_cast<Result<ValueType>*>(result_.get());
+ }
+
+ using Callback = internal::FnOnce<void(const FutureImpl& impl)>;
+ void AddCallback(Callback callback, CallbackOptions opts);
+ bool TryAddCallback(const std::function<Callback()>& callback_factory,
+ CallbackOptions opts);
+
// Waiter API
inline FutureState SetWaiter(FutureWaiter* w, int future_num);
inline void RemoveWaiter(FutureWaiter* w);
- std::atomic<FutureState> state_{FutureState::PENDING};
-
- // Type erased storage for arbitrary results
- // XXX small objects could be stored inline instead of boxed in a pointer
- using Storage = std::unique_ptr<void, void (*)(void*)>;
- Storage result_{NULLPTR, NULLPTR};
-
- struct CallbackRecord {
- Callback callback;
- CallbackOptions options;
- };
- std::vector<CallbackRecord> callbacks_;
+ std::atomic<FutureState> state_{FutureState::PENDING};
+
+ // Type erased storage for arbitrary results
+ // XXX small objects could be stored inline instead of boxed in a pointer
+ using Storage = std::unique_ptr<void, void (*)(void*)>;
+ Storage result_{NULLPTR, NULLPTR};
+
+ struct CallbackRecord {
+ Callback callback;
+ CallbackOptions options;
+ };
+ std::vector<CallbackRecord> callbacks_;
};
// An object that waits on multiple futures at once. Only one waiter
@@ -292,9 +292,9 @@ class ARROW_EXPORT FutureWaiter {
public:
enum Kind : int8_t { ANY, ALL, ALL_OR_FIRST_FAILED, ITERATE };
- // HUGE_VAL isn't constexpr on Windows
- // https://social.msdn.microsoft.com/Forums/vstudio/en-US/47e8b9ff-b205-4189-968e-ee3bc3e2719f/constexpr-compile-error?forum=vclanguage
- static const double kInfinity;
+ // HUGE_VAL isn't constexpr on Windows
+ // https://social.msdn.microsoft.com/Forums/vstudio/en-US/47e8b9ff-b205-4189-968e-ee3bc3e2719f/constexpr-compile-error?forum=vclanguage
+ static const double kInfinity;
static std::unique_ptr<FutureWaiter> Make(Kind kind, std::vector<FutureImpl*> futures);
@@ -318,7 +318,7 @@ class ARROW_EXPORT FutureWaiter {
static std::vector<FutureImpl*> ExtractFutures(const std::vector<FutureType>& futures) {
std::vector<FutureImpl*> base_futures(futures.size());
for (int i = 0; i < static_cast<int>(futures.size()); ++i) {
- base_futures[i] = futures[i].impl_.get();
+ base_futures[i] = futures[i].impl_.get();
}
return base_futures;
}
@@ -329,7 +329,7 @@ class ARROW_EXPORT FutureWaiter {
const std::vector<FutureType*>& futures) {
std::vector<FutureImpl*> base_futures(futures.size());
for (int i = 0; i < static_cast<int>(futures.size()); ++i) {
- base_futures[i] = futures[i]->impl_.get();
+ base_futures[i] = futures[i]->impl_.get();
}
return base_futures;
}
@@ -358,11 +358,11 @@ class ARROW_EXPORT FutureWaiter {
/// to complete, or wait on multiple Futures at once (using WaitForAll,
/// WaitForAny or AsCompletedIterator).
template <typename T>
-class ARROW_MUST_USE_TYPE Future {
+class ARROW_MUST_USE_TYPE Future {
public:
- using ValueType = T;
- using SyncType = typename detail::SyncType<T>::type;
- static constexpr bool is_empty = std::is_same<T, internal::Empty>::value;
+ using ValueType = T;
+ using SyncType = typename detail::SyncType<T>::type;
+ static constexpr bool is_empty = std::is_same<T, internal::Empty>::value;
// The default constructor creates an invalid Future. Use Future::Make()
// for a valid Future. This constructor is mostly for the convenience
// of being able to presize a vector of Futures.
@@ -370,7 +370,7 @@ class ARROW_MUST_USE_TYPE Future {
// Consumer API
- bool is_valid() const { return impl_ != NULLPTR; }
+ bool is_valid() const { return impl_ != NULLPTR; }
/// \brief Return the Future's current state
///
@@ -381,41 +381,41 @@ class ARROW_MUST_USE_TYPE Future {
return impl_->state();
}
- /// \brief Whether the Future is finished
+ /// \brief Whether the Future is finished
///
- /// A false return value is only indicative, as the Future can complete
- /// concurrently. A true return value is definitive, though.
- bool is_finished() const {
+ /// A false return value is only indicative, as the Future can complete
+ /// concurrently. A true return value is definitive, though.
+ bool is_finished() const {
CheckValid();
- return IsFutureFinished(impl_->state());
- }
-
- /// \brief Wait for the Future to complete and return its Result
- const Result<ValueType>& result() const& {
+ return IsFutureFinished(impl_->state());
+ }
+
+ /// \brief Wait for the Future to complete and return its Result
+ const Result<ValueType>& result() const& {
Wait();
- return *GetResult();
+ return *GetResult();
}
- /// \brief Returns an rvalue reference to the result. This method is potentially unsafe
- ///
- /// The future is not the unique owner of the result, copies of a future will
- /// also point to the same result. You must make sure that no other copies
- /// of the future exist. Attempts to add callbacks after you move the result
- /// will result in undefined behavior.
- Result<ValueType>&& MoveResult() {
+ /// \brief Returns an rvalue reference to the result. This method is potentially unsafe
+ ///
+ /// The future is not the unique owner of the result, copies of a future will
+ /// also point to the same result. You must make sure that no other copies
+ /// of the future exist. Attempts to add callbacks after you move the result
+ /// will result in undefined behavior.
+ Result<ValueType>&& MoveResult() {
Wait();
- return std::move(*GetResult());
+ return std::move(*GetResult());
}
/// \brief Wait for the Future to complete and return its Status
- const Status& status() const { return result().status(); }
-
- /// \brief Future<T> is convertible to Future<>, which views only the
- /// Status of the original. Marking the returned Future Finished is not supported.
- explicit operator Future<>() const {
- Future<> status_future;
- status_future.impl_ = impl_;
- return status_future;
+ const Status& status() const { return result().status(); }
+
+ /// \brief Future<T> is convertible to Future<>, which views only the
+ /// Status of the original. Marking the returned Future Finished is not supported.
+ explicit operator Future<>() const {
+ Future<> status_future;
+ status_future.impl_ = impl_;
+ return status_future;
}
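// Illustrative sketch (not part of the patch): typical consumer-side calls.
inline arrow::Status ConsumeExample(arrow::Future<int> fut) {
  if (fut.is_finished()) {
    // A true answer here is definitive and cannot race with completion.
  }
  const arrow::Result<int>& res = fut.result();  // blocks until finished
  return res.status();
}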
/// \brief Wait for the Future to complete
@@ -441,270 +441,270 @@ class ARROW_MUST_USE_TYPE Future {
// Producer API
- /// \brief Producer API: mark Future finished
+ /// \brief Producer API: mark Future finished
///
- /// The Future's result is set to `res`.
- void MarkFinished(Result<ValueType> res) { DoMarkFinished(std::move(res)); }
-
- /// \brief Mark a Future<> completed with the provided Status.
- template <typename E = ValueType, typename = typename std::enable_if<
- std::is_same<E, internal::Empty>::value>::type>
- void MarkFinished(Status s = Status::OK()) {
- return DoMarkFinished(E::ToResult(std::move(s)));
+ /// The Future's result is set to `res`.
+ void MarkFinished(Result<ValueType> res) { DoMarkFinished(std::move(res)); }
+
+ /// \brief Mark a Future<> completed with the provided Status.
+ template <typename E = ValueType, typename = typename std::enable_if<
+ std::is_same<E, internal::Empty>::value>::type>
+ void MarkFinished(Status s = Status::OK()) {
+ return DoMarkFinished(E::ToResult(std::move(s)));
}
/// \brief Producer API: instantiate a valid Future
///
- /// The Future's state is initialized with PENDING. If you are creating a future with
- /// this method you must ensure that the future is eventually completed (with success or
- /// failure). Creating a future, returning it, and never completing the future can lead
- /// to memory leaks (for example, see Loop).
+ /// The Future's state is initialized with PENDING. If you are creating a future with
+ /// this method you must ensure that the future is eventually completed (with success or
+ /// failure). Creating a future, returning it, and never completing the future can lead
+ /// to memory leaks (for example, see Loop).
static Future Make() {
Future fut;
- fut.impl_ = FutureImpl::Make();
+ fut.impl_ = FutureImpl::Make();
return fut;
}
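// A minimal producer-side sketch of Make()/MarkFinished() above (illustrative;
// the helper names are assumptions, not Arrow API):
Future<int> MakePending() {
  auto fut = Future<int>::Make();  // state starts PENDING
  return fut;                      // the producer must eventually complete it
}
void CompleteIt(Future<int> fut) {
  fut.MarkFinished(42);            // callbacks run before this call returns
}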
/// \brief Producer API: instantiate a finished Future
- static Future<ValueType> MakeFinished(Result<ValueType> res) {
- Future<ValueType> fut;
- fut.InitializeFromResult(std::move(res));
+ static Future<ValueType> MakeFinished(Result<ValueType> res) {
+ Future<ValueType> fut;
+ fut.InitializeFromResult(std::move(res));
return fut;
}
- /// \brief Make a finished Future<> with the provided Status.
- template <typename E = ValueType, typename = typename std::enable_if<
- std::is_same<E, internal::Empty>::value>::type>
- static Future<> MakeFinished(Status s = Status::OK()) {
- return MakeFinished(E::ToResult(std::move(s)));
- }
-
- struct WrapResultyOnComplete {
- template <typename OnComplete>
- struct Callback {
- void operator()(const FutureImpl& impl) && {
- std::move(on_complete)(*impl.CastResult<ValueType>());
- }
- OnComplete on_complete;
- };
- };
-
- struct WrapStatusyOnComplete {
- template <typename OnComplete>
- struct Callback {
- static_assert(std::is_same<internal::Empty, ValueType>::value,
- "Only callbacks for Future<> should accept Status and not Result");
-
- void operator()(const FutureImpl& impl) && {
- std::move(on_complete)(impl.CastResult<ValueType>()->status());
- }
- OnComplete on_complete;
- };
- };
-
- template <typename OnComplete>
- using WrapOnComplete = typename std::conditional<
- detail::first_arg_is_status<OnComplete>::value, WrapStatusyOnComplete,
- WrapResultyOnComplete>::type::template Callback<OnComplete>;
-
- /// \brief Consumer API: Register a callback to run when this future completes
- ///
- /// The callback should receive the result of the future (const Result<T>&)
- /// For a void or statusy future this should be (const Status&)
- ///
- /// There is no guarantee to the order in which callbacks will run. In
- /// particular, callbacks added while the future is being marked complete
- /// may be executed immediately, ahead of, or even at the same time as, other
- /// callbacks that have been previously added.
- ///
- /// WARNING: callbacks may hold arbitrary references, including cyclic references.
- /// Since callbacks will only be destroyed after they are invoked, this can lead to
- /// memory leaks if a Future is never marked finished (abandoned):
- ///
- /// {
- /// auto fut = Future<>::Make();
- /// fut.AddCallback([fut]() {});
- /// }
- ///
- /// In this example `fut` falls out of scope but is not destroyed because it holds a
- /// cyclic reference to itself through the callback.
- template <typename OnComplete, typename Callback = WrapOnComplete<OnComplete>>
- void AddCallback(OnComplete on_complete,
- CallbackOptions opts = CallbackOptions::Defaults()) const {
- // We know impl_ will not be dangling when invoking callbacks because at least one
- // thread will be waiting for MarkFinished to return. Thus it's safe to keep a
- // weak reference to impl_ here
- impl_->AddCallback(Callback{std::move(on_complete)}, opts);
- }
-
- /// \brief Overload of AddCallback that will return false instead of running
- /// synchronously
- ///
- /// This overload will guarantee the callback is never run synchronously. If the future
- /// is already finished then it will simply return false. This can be useful to avoid
- /// stack overflow in a situation where you have recursive Futures. For an example
- /// see the Loop function.
- ///
- /// Takes in a callback factory function to allow moving callbacks (the factory function
- /// will only be called if the callback can successfully be added)
- ///
- /// Returns true if the callback was actually added and false if it could not be
- /// added because the future was already marked complete.
- template <typename CallbackFactory,
- typename OnComplete = detail::result_of_t<CallbackFactory()>,
- typename Callback = WrapOnComplete<OnComplete>>
- bool TryAddCallback(const CallbackFactory& callback_factory,
- CallbackOptions opts = CallbackOptions::Defaults()) const {
- return impl_->TryAddCallback([&]() { return Callback{callback_factory()}; }, opts);
- }
-
- template <typename OnSuccess, typename OnFailure>
- struct ThenOnComplete {
- static constexpr bool has_no_args =
- internal::call_traits::argument_count<OnSuccess>::value == 0;
-
- using ContinuedFuture = detail::ContinueFuture::ForSignature<
- detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;
-
- static_assert(
- std::is_same<detail::ContinueFuture::ForSignature<OnFailure && (const Status&)>,
- ContinuedFuture>::value,
- "OnSuccess and OnFailure must continue with the same future type");
-
- struct DummyOnSuccess {
- void operator()(const T&);
- };
- using OnSuccessArg = typename std::decay<internal::call_traits::argument_type<
- 0, detail::if_has_no_args<OnSuccess, DummyOnSuccess, OnSuccess>>>::type;
-
- static_assert(
- !std::is_same<OnSuccessArg, typename EnsureResult<OnSuccessArg>::type>::value,
- "OnSuccess' argument should not be a Result");
-
- void operator()(const Result<T>& result) && {
- detail::ContinueFuture continue_future;
- if (ARROW_PREDICT_TRUE(result.ok())) {
- // move on_failure to a(n immediately destroyed) temporary to free its resources
- ARROW_UNUSED(OnFailure(std::move(on_failure)));
- continue_future.IgnoringArgsIf(
- detail::if_has_no_args<OnSuccess, std::true_type, std::false_type>{},
- std::move(next), std::move(on_success), result.ValueOrDie());
- } else {
- ARROW_UNUSED(OnSuccess(std::move(on_success)));
- continue_future(std::move(next), std::move(on_failure), result.status());
- }
- }
-
- OnSuccess on_success;
- OnFailure on_failure;
- ContinuedFuture next;
- };
-
- template <typename OnSuccess>
- struct PassthruOnFailure {
- using ContinuedFuture = detail::ContinueFuture::ForSignature<
- detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;
-
- Result<typename ContinuedFuture::ValueType> operator()(const Status& s) { return s; }
- };
-
- /// \brief Consumer API: Register a continuation to run when this future completes
- ///
- /// The continuation will run in the same thread that called MarkFinished (whatever
- /// callback is registered with this function will run before MarkFinished returns).
- /// Avoid long-running callbacks in favor of submitting a task to an Executor and
- /// returning the future.
- ///
- /// Two callbacks are supported:
- /// - OnSuccess, called with the result (const ValueType&) on successful completion.
- /// for an empty future this will be called with nothing ()
- /// - OnFailure, called with the error (const Status&) on failed completion.
- /// This callback is optional and defaults to a passthru of any errors.
- ///
- /// Then() returns a Future whose ValueType is derived from the return type of the
- /// callbacks. If a callback returns:
- /// - void, a Future<> will be returned which will complete successfully as soon
- /// as the callback runs.
- /// - Status, a Future<> will be returned which will complete with the returned Status
- /// as soon as the callback runs.
- /// - V or Result<V>, a Future<V> will be returned which will complete with the result
- /// of invoking the callback as soon as the callback runs.
- /// - Future<V>, a Future<V> will be returned which will be marked complete when the
- /// future returned by the callback completes (and will complete with the same
- /// result).
- ///
- /// The continued Future type must be the same for both callbacks.
- ///
- /// Note that OnFailure can swallow errors, allowing continued Futures to successfully
- /// complete even if this Future fails.
- ///
- /// If this future is already completed then the callback will be run immediately
- /// and the returned future may already be marked complete.
- ///
- /// See AddCallback for general considerations when writing callbacks.
- template <typename OnSuccess, typename OnFailure = PassthruOnFailure<OnSuccess>,
- typename OnComplete = ThenOnComplete<OnSuccess, OnFailure>,
- typename ContinuedFuture = typename OnComplete::ContinuedFuture>
- ContinuedFuture Then(OnSuccess on_success, OnFailure on_failure = {},
- CallbackOptions options = CallbackOptions::Defaults()) const {
- auto next = ContinuedFuture::Make();
- AddCallback(OnComplete{std::forward<OnSuccess>(on_success),
- std::forward<OnFailure>(on_failure), next},
- options);
- return next;
- }
-
- /// \brief Implicit constructor to create a finished future from a value
- Future(ValueType val) : Future() { // NOLINT runtime/explicit
- impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
- SetResult(std::move(val));
- }
-
- /// \brief Implicit constructor to create a future from a Result, enabling use
- /// of macros like ARROW_ASSIGN_OR_RAISE.
- Future(Result<ValueType> res) : Future() { // NOLINT runtime/explicit
- if (ARROW_PREDICT_TRUE(res.ok())) {
- impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
- } else {
- impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
- }
- SetResult(std::move(res));
- }
-
- /// \brief Implicit constructor to create a future from a Status, enabling use
- /// of macros like ARROW_RETURN_NOT_OK.
- Future(Status s) // NOLINT runtime/explicit
- : Future(Result<ValueType>(std::move(s))) {}
-
+ /// \brief Make a finished Future<> with the provided Status.
+ template <typename E = ValueType, typename = typename std::enable_if<
+ std::is_same<E, internal::Empty>::value>::type>
+ static Future<> MakeFinished(Status s = Status::OK()) {
+ return MakeFinished(E::ToResult(std::move(s)));
+ }
+
+ struct WrapResultyOnComplete {
+ template <typename OnComplete>
+ struct Callback {
+ void operator()(const FutureImpl& impl) && {
+ std::move(on_complete)(*impl.CastResult<ValueType>());
+ }
+ OnComplete on_complete;
+ };
+ };
+
+ struct WrapStatusyOnComplete {
+ template <typename OnComplete>
+ struct Callback {
+ static_assert(std::is_same<internal::Empty, ValueType>::value,
+ "Only callbacks for Future<> should accept Status and not Result");
+
+ void operator()(const FutureImpl& impl) && {
+ std::move(on_complete)(impl.CastResult<ValueType>()->status());
+ }
+ OnComplete on_complete;
+ };
+ };
+
+ template <typename OnComplete>
+ using WrapOnComplete = typename std::conditional<
+ detail::first_arg_is_status<OnComplete>::value, WrapStatusyOnComplete,
+ WrapResultyOnComplete>::type::template Callback<OnComplete>;
+
+ /// \brief Consumer API: Register a callback to run when this future completes
+ ///
+ /// The callback should receive the result of the future (const Result<T>&)
+ /// For a void or statusy future this should be (const Status&)
+ ///
+ /// There is no guarantee to the order in which callbacks will run. In
+ /// particular, callbacks added while the future is being marked complete
+ /// may be executed immediately, ahead of, or even at the same time as, other
+ /// callbacks that have been previously added.
+ ///
+ /// WARNING: callbacks may hold arbitrary references, including cyclic references.
+ /// Since callbacks will only be destroyed after they are invoked, this can lead to
+ /// memory leaks if a Future is never marked finished (abandoned):
+ ///
+ /// {
+ /// auto fut = Future<>::Make();
+ /// fut.AddCallback([fut]() {});
+ /// }
+ ///
+ /// In this example `fut` falls out of scope but is not destroyed because it holds a
+ /// cyclic reference to itself through the callback.
+ template <typename OnComplete, typename Callback = WrapOnComplete<OnComplete>>
+ void AddCallback(OnComplete on_complete,
+ CallbackOptions opts = CallbackOptions::Defaults()) const {
+ // We know impl_ will not be dangling when invoking callbacks because at least one
+ // thread will be waiting for MarkFinished to return. Thus it's safe to keep a
+ // weak reference to impl_ here
+ impl_->AddCallback(Callback{std::move(on_complete)}, opts);
+ }
+
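// A consumer-side sketch of AddCallback (illustrative): the callback receives
// const Result<int>& because this is a Future<int>; capturing the future
// itself in the lambda would create the cyclic reference warned about above.
void WatchFuture(const Future<int>& fut) {
  fut.AddCallback([](const Result<int>& res) {
    if (res.ok()) {
      // use *res
    }
  });
}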
+ /// \brief Overload of AddCallback that will return false instead of running
+ /// synchronously
+ ///
+ /// This overload will guarantee the callback is never run synchronously. If the future
+ /// is already finished then it will simply return false. This can be useful to avoid
+ /// stack overflow in a situation where you have recursive Futures. For an example
+ /// see the Loop function.
+ ///
+ /// Takes in a callback factory function to allow moving callbacks (the factory function
+ /// will only be called if the callback can successfully be added)
+ ///
+ /// Returns true if the callback was actually added and false if it could not be
+ /// added because the future was already marked complete.
+ template <typename CallbackFactory,
+ typename OnComplete = detail::result_of_t<CallbackFactory()>,
+ typename Callback = WrapOnComplete<OnComplete>>
+ bool TryAddCallback(const CallbackFactory& callback_factory,
+ CallbackOptions opts = CallbackOptions::Defaults()) const {
+ return impl_->TryAddCallback([&]() { return Callback{callback_factory()}; }, opts);
+ }
+
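// A sketch of TryAddCallback (illustrative; HandleNow is an assumed helper):
// the factory runs only if registration succeeds, so a move-only callback is
// not consumed when the future turns out to be finished already.
void HandleNow(const Result<int>& res);
void WatchOrHandle(const Future<int>& fut) {
  bool added = fut.TryAddCallback(
      []() { return [](const Result<int>& res) { /* async path */ }; });
  if (!added) {
    HandleNow(fut.result());  // synchronous path, no recursion risk
  }
}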
+ template <typename OnSuccess, typename OnFailure>
+ struct ThenOnComplete {
+ static constexpr bool has_no_args =
+ internal::call_traits::argument_count<OnSuccess>::value == 0;
+
+ using ContinuedFuture = detail::ContinueFuture::ForSignature<
+ detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;
+
+ static_assert(
+ std::is_same<detail::ContinueFuture::ForSignature<OnFailure && (const Status&)>,
+ ContinuedFuture>::value,
+ "OnSuccess and OnFailure must continue with the same future type");
+
+ struct DummyOnSuccess {
+ void operator()(const T&);
+ };
+ using OnSuccessArg = typename std::decay<internal::call_traits::argument_type<
+ 0, detail::if_has_no_args<OnSuccess, DummyOnSuccess, OnSuccess>>>::type;
+
+ static_assert(
+ !std::is_same<OnSuccessArg, typename EnsureResult<OnSuccessArg>::type>::value,
+ "OnSuccess' argument should not be a Result");
+
+ void operator()(const Result<T>& result) && {
+ detail::ContinueFuture continue_future;
+ if (ARROW_PREDICT_TRUE(result.ok())) {
+ // move on_failure to a(n immediately destroyed) temporary to free its resources
+ ARROW_UNUSED(OnFailure(std::move(on_failure)));
+ continue_future.IgnoringArgsIf(
+ detail::if_has_no_args<OnSuccess, std::true_type, std::false_type>{},
+ std::move(next), std::move(on_success), result.ValueOrDie());
+ } else {
+ ARROW_UNUSED(OnSuccess(std::move(on_success)));
+ continue_future(std::move(next), std::move(on_failure), result.status());
+ }
+ }
+
+ OnSuccess on_success;
+ OnFailure on_failure;
+ ContinuedFuture next;
+ };
+
+ template <typename OnSuccess>
+ struct PassthruOnFailure {
+ using ContinuedFuture = detail::ContinueFuture::ForSignature<
+ detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;
+
+ Result<typename ContinuedFuture::ValueType> operator()(const Status& s) { return s; }
+ };
+
+ /// \brief Consumer API: Register a continuation to run when this future completes
+ ///
+ /// The continuation will run in the same thread that called MarkFinished (whatever
+ /// callback is registered with this function will run before MarkFinished returns).
+ /// Avoid long-running callbacks in favor of submitting a task to an Executor and
+ /// returning the future.
+ ///
+ /// Two callbacks are supported:
+ /// - OnSuccess, called with the result (const ValueType&) on successful completion.
+ /// for an empty future this will be called with nothing ()
+ /// - OnFailure, called with the error (const Status&) on failed completion.
+ /// This callback is optional and defaults to a passthru of any errors.
+ ///
+ /// Then() returns a Future whose ValueType is derived from the return type of the
+ /// callbacks. If a callback returns:
+ /// - void, a Future<> will be returned which will complete successfully as soon
+ /// as the callback runs.
+ /// - Status, a Future<> will be returned which will complete with the returned Status
+ /// as soon as the callback runs.
+ /// - V or Result<V>, a Future<V> will be returned which will complete with the result
+ /// of invoking the callback as soon as the callback runs.
+ /// - Future<V>, a Future<V> will be returned which will be marked complete when the
+ /// future returned by the callback completes (and will complete with the same
+ /// result).
+ ///
+ /// The continued Future type must be the same for both callbacks.
+ ///
+ /// Note that OnFailure can swallow errors, allowing continued Futures to successfully
+ /// complete even if this Future fails.
+ ///
+ /// If this future is already completed then the callback will be run immediately
+ /// and the returned future may already be marked complete.
+ ///
+ /// See AddCallback for general considerations when writing callbacks.
+ template <typename OnSuccess, typename OnFailure = PassthruOnFailure<OnSuccess>,
+ typename OnComplete = ThenOnComplete<OnSuccess, OnFailure>,
+ typename ContinuedFuture = typename OnComplete::ContinuedFuture>
+ ContinuedFuture Then(OnSuccess on_success, OnFailure on_failure = {},
+ CallbackOptions options = CallbackOptions::Defaults()) const {
+ auto next = ContinuedFuture::Make();
+ AddCallback(OnComplete{std::forward<OnSuccess>(on_success),
+ std::forward<OnFailure>(on_failure), next},
+ options);
+ return next;
+ }
+
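// A continuation sketch for Then() (illustrative): OnSuccess returns a
// std::string, so the continued future is Future<std::string>; the explicit
// OnFailure here just forwards the error, mirroring the default passthru.
Future<std::string> Render(const Future<int>& fut) {
  return fut.Then(
      [](const int& v) { return std::to_string(v); },
      [](const Status& s) -> Result<std::string> { return s; });
}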
+ /// \brief Implicit constructor to create a finished future from a value
+ Future(ValueType val) : Future() { // NOLINT runtime/explicit
+ impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
+ SetResult(std::move(val));
+ }
+
+ /// \brief Implicit constructor to create a future from a Result, enabling use
+ /// of macros like ARROW_ASSIGN_OR_RAISE.
+ Future(Result<ValueType> res) : Future() { // NOLINT runtime/explicit
+ if (ARROW_PREDICT_TRUE(res.ok())) {
+ impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
+ } else {
+ impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
+ }
+ SetResult(std::move(res));
+ }
+
+ /// \brief Implicit constructor to create a future from a Status, enabling use
+ /// of macros like ARROW_RETURN_NOT_OK.
+ Future(Status s) // NOLINT runtime/explicit
+ : Future(Result<ValueType>(std::move(s))) {}
+
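// A sketch of what the implicit constructors enable (illustrative; Reader and
// ReadInt are assumed): error-propagation macros written for Status/Result
// also work in a function returning a Future, since both convert to a
// finished future.
struct Reader { Result<int> ReadInt(); };
Future<int> ReadAsync(Reader& reader) {
  ARROW_ASSIGN_OR_RAISE(int v, reader.ReadInt());  // error -> failed Future<int>
  return v;                                        // value -> finished Future<int>
}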
protected:
- void InitializeFromResult(Result<ValueType> res) {
- if (ARROW_PREDICT_TRUE(res.ok())) {
- impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
- } else {
- impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
- }
- SetResult(std::move(res));
- }
-
- void Initialize() { impl_ = FutureImpl::Make(); }
-
- Result<ValueType>* GetResult() const { return impl_->CastResult<ValueType>(); }
-
- void SetResult(Result<ValueType> res) {
- impl_->result_ = {new Result<ValueType>(std::move(res)),
- [](void* p) { delete static_cast<Result<ValueType>*>(p); }};
- }
-
- void DoMarkFinished(Result<ValueType> res) {
- SetResult(std::move(res));
-
- if (ARROW_PREDICT_TRUE(GetResult()->ok())) {
- impl_->MarkFinished();
- } else {
- impl_->MarkFailed();
- }
- }
-
+ void InitializeFromResult(Result<ValueType> res) {
+ if (ARROW_PREDICT_TRUE(res.ok())) {
+ impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
+ } else {
+ impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
+ }
+ SetResult(std::move(res));
+ }
+
+ void Initialize() { impl_ = FutureImpl::Make(); }
+
+ Result<ValueType>* GetResult() const { return impl_->CastResult<ValueType>(); }
+
+ void SetResult(Result<ValueType> res) {
+ impl_->result_ = {new Result<ValueType>(std::move(res)),
+ [](void* p) { delete static_cast<Result<ValueType>*>(p); }};
+ }
+
+ void DoMarkFinished(Result<ValueType> res) {
+ SetResult(std::move(res));
+
+ if (ARROW_PREDICT_TRUE(GetResult()->ok())) {
+ impl_->MarkFinished();
+ } else {
+ impl_->MarkFailed();
+ }
+ }
+
void CheckValid() const {
#ifndef NDEBUG
if (!is_valid()) {
@@ -713,54 +713,54 @@ class ARROW_MUST_USE_TYPE Future {
#endif
}
- explicit Future(std::shared_ptr<FutureImpl> impl) : impl_(std::move(impl)) {}
+ explicit Future(std::shared_ptr<FutureImpl> impl) : impl_(std::move(impl)) {}
+
+ std::shared_ptr<FutureImpl> impl_;
- std::shared_ptr<FutureImpl> impl_;
-
friend class FutureWaiter;
- friend struct detail::ContinueFuture;
-
- template <typename U>
- friend class Future;
- friend class WeakFuture<T>;
-
- FRIEND_TEST(FutureRefTest, ChainRemoved);
- FRIEND_TEST(FutureRefTest, TailRemoved);
- FRIEND_TEST(FutureRefTest, HeadRemoved);
+ friend struct detail::ContinueFuture;
+
+ template <typename U>
+ friend class Future;
+ friend class WeakFuture<T>;
+
+ FRIEND_TEST(FutureRefTest, ChainRemoved);
+ FRIEND_TEST(FutureRefTest, TailRemoved);
+ FRIEND_TEST(FutureRefTest, HeadRemoved);
};
-template <typename T>
-typename Future<T>::SyncType FutureToSync(const Future<T>& fut) {
- return fut.result();
-}
-
-template <>
-inline typename Future<internal::Empty>::SyncType FutureToSync<internal::Empty>(
- const Future<internal::Empty>& fut) {
- return fut.status();
-}
-
-template <typename T>
-class WeakFuture {
- public:
- explicit WeakFuture(const Future<T>& future) : impl_(future.impl_) {}
-
- Future<T> get() { return Future<T>{impl_.lock()}; }
-
- private:
- std::weak_ptr<FutureImpl> impl_;
-};
-
-/// If a Result<Future> holds an error instead of a Future, construct a finished Future
-/// holding that error.
-template <typename T>
-static Future<T> DeferNotOk(Result<Future<T>> maybe_future) {
- if (ARROW_PREDICT_FALSE(!maybe_future.ok())) {
- return Future<T>::MakeFinished(std::move(maybe_future).status());
- }
- return std::move(maybe_future).MoveValueUnsafe();
-}
-
+template <typename T>
+typename Future<T>::SyncType FutureToSync(const Future<T>& fut) {
+ return fut.result();
+}
+
+template <>
+inline typename Future<internal::Empty>::SyncType FutureToSync<internal::Empty>(
+ const Future<internal::Empty>& fut) {
+ return fut.status();
+}
+
+template <typename T>
+class WeakFuture {
+ public:
+ explicit WeakFuture(const Future<T>& future) : impl_(future.impl_) {}
+
+ Future<T> get() { return Future<T>{impl_.lock()}; }
+
+ private:
+ std::weak_ptr<FutureImpl> impl_;
+};
+
+/// If a Result<Future> holds an error instead of a Future, construct a finished Future
+/// holding that error.
+template <typename T>
+static Future<T> DeferNotOk(Result<Future<T>> maybe_future) {
+ if (ARROW_PREDICT_FALSE(!maybe_future.ok())) {
+ return Future<T>::MakeFinished(std::move(maybe_future).status());
+ }
+ return std::move(maybe_future).MoveValueUnsafe();
+}
+
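// A sketch of DeferNotOk (illustrative; StartJob is an assumed factory): a
// synchronous failure to even start the work is folded into the same failed
// future the asynchronous path would produce.
Result<Future<int>> StartJob();
Future<int> StartOrFail() { return DeferNotOk(StartJob()); }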
/// \brief Wait for all the futures to end, or for the given timeout to expire.
///
/// `true` is returned if all the futures completed before the timeout was reached,
@@ -783,53 +783,53 @@ inline bool WaitForAll(const std::vector<Future<T>*>& futures,
return waiter->Wait(seconds);
}
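// A sketch of WaitForAll (illustrative; the timeout parameter is assumed to
// be a double of seconds, per the surrounding hunk): a bounded wait over raw
// future pointers.
bool BothDoneWithin(Future<int>& fut_a, Future<int>& fut_b, double seconds) {
  std::vector<Future<int>*> ptrs = {&fut_a, &fut_b};
  return WaitForAll(ptrs, seconds);
}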
-/// \brief Create a Future which completes when all of `futures` complete.
-///
-/// The future's result is a vector of the results of `futures`.
-/// Note that this future will never be marked "failed"; failed results
-/// will be stored in the result vector alongside successful results.
-template <typename T>
-Future<std::vector<Result<T>>> All(std::vector<Future<T>> futures) {
- struct State {
- explicit State(std::vector<Future<T>> f)
- : futures(std::move(f)), n_remaining(futures.size()) {}
-
- std::vector<Future<T>> futures;
- std::atomic<size_t> n_remaining;
- };
-
- if (futures.size() == 0) {
- return {std::vector<Result<T>>{}};
- }
-
- auto state = std::make_shared<State>(std::move(futures));
-
- auto out = Future<std::vector<Result<T>>>::Make();
- for (const Future<T>& future : state->futures) {
- future.AddCallback([state, out](const Result<T>&) mutable {
- if (state->n_remaining.fetch_sub(1) != 1) return;
-
- std::vector<Result<T>> results(state->futures.size());
- for (size_t i = 0; i < results.size(); ++i) {
- results[i] = state->futures[i].result();
- }
- out.MarkFinished(std::move(results));
- });
- }
- return out;
-}
-
-template <>
-inline Future<>::Future(Status s) : Future(internal::Empty::ToResult(std::move(s))) {}
-
-/// \brief Create a Future which completes when all of `futures` complete.
-///
-/// The future will be marked complete if all `futures` complete
-/// successfully. Otherwise, it will be marked failed with the status of
-/// the first failing future.
-ARROW_EXPORT
-Future<> AllComplete(const std::vector<Future<>>& futures);
-
+/// \brief Create a Future which completes when all of `futures` complete.
+///
+/// The future's result is a vector of the results of `futures`.
+/// Note that this future will never be marked "failed"; failed results
+/// will be stored in the result vector alongside successful results.
+template <typename T>
+Future<std::vector<Result<T>>> All(std::vector<Future<T>> futures) {
+ struct State {
+ explicit State(std::vector<Future<T>> f)
+ : futures(std::move(f)), n_remaining(futures.size()) {}
+
+ std::vector<Future<T>> futures;
+ std::atomic<size_t> n_remaining;
+ };
+
+ if (futures.size() == 0) {
+ return {std::vector<Result<T>>{}};
+ }
+
+ auto state = std::make_shared<State>(std::move(futures));
+
+ auto out = Future<std::vector<Result<T>>>::Make();
+ for (const Future<T>& future : state->futures) {
+ future.AddCallback([state, out](const Result<T>&) mutable {
+ if (state->n_remaining.fetch_sub(1) != 1) return;
+
+ std::vector<Result<T>> results(state->futures.size());
+ for (size_t i = 0; i < results.size(); ++i) {
+ results[i] = state->futures[i].result();
+ }
+ out.MarkFinished(std::move(results));
+ });
+ }
+ return out;
+}
+
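// A sketch of All() (illustrative): failures appear inside the result vector,
// so the combined future itself always completes "successfully".
void GatherExample() {
  std::vector<Future<int>> futures;
  futures.push_back(Future<int>::MakeFinished(Result<int>(1)));
  futures.push_back(Future<int>::MakeFinished(Result<int>(Status::IOError("x"))));
  All(std::move(futures))
      .AddCallback([](const Result<std::vector<Result<int>>>& results) {
        // (*results)[0].ok() is true, (*results)[1].ok() is false
      });
}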
+template <>
+inline Future<>::Future(Status s) : Future(internal::Empty::ToResult(std::move(s))) {}
+
+/// \brief Create a Future which completes when all of `futures` complete.
+///
+/// The future will be marked complete if all `futures` complete
+/// successfully. Otherwise, it will be marked failed with the status of
+/// the first failing future.
+ARROW_EXPORT
+Future<> AllComplete(const std::vector<Future<>>& futures);
+
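// A sketch of AllComplete (illustrative): unlike All(), the combined future
// fails with the status of the first failing input.
Future<> WhenAllDone(const std::vector<Future<>>& work) {
  return AllComplete(work);
}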
/// \brief Wait for one of the futures to end, or for the given timeout to expire.
///
/// The indices of all completed futures are returned. Note that some futures
@@ -854,104 +854,104 @@ inline std::vector<int> WaitForAny(const std::vector<Future<T>*>& futures,
return waiter->MoveFinishedFutures();
}
-struct Continue {
- template <typename T>
- operator util::optional<T>() && { // NOLINT explicit
- return {};
- }
-};
-
-template <typename T = internal::Empty>
-util::optional<T> Break(T break_value = {}) {
- return util::optional<T>{std::move(break_value)};
-}
-
-template <typename T = internal::Empty>
-using ControlFlow = util::optional<T>;
-
-/// \brief Loop through an asynchronous sequence
-///
-/// \param[in] iterate A generator of Future<ControlFlow<BreakValue>>. On completion
-/// of each yielded future the resulting ControlFlow will be examined. A Break will
-/// terminate the loop, while a Continue will re-invoke `iterate`.
-///
-/// \return A future which will complete when a Future returned by iterate completes with
-/// a Break
-template <typename Iterate,
- typename Control = typename detail::result_of_t<Iterate()>::ValueType,
- typename BreakValueType = typename Control::value_type>
-Future<BreakValueType> Loop(Iterate iterate) {
- struct Callback {
- bool CheckForTermination(const Result<Control>& control_res) {
- if (!control_res.ok()) {
- break_fut.MarkFinished(control_res.status());
- return true;
- }
- if (control_res->has_value()) {
- break_fut.MarkFinished(**control_res);
- return true;
- }
- return false;
- }
-
- void operator()(const Result<Control>& maybe_control) && {
- if (CheckForTermination(maybe_control)) return;
-
- auto control_fut = iterate();
- while (true) {
- if (control_fut.TryAddCallback([this]() { return *this; })) {
- // Adding a callback succeeded; control_fut was not finished
- // and we must wait to CheckForTermination.
- return;
- }
- // Adding a callback failed; control_fut was finished and we
- // can CheckForTermination immediately. This also avoids recursion and potential
- // stack overflow.
- if (CheckForTermination(control_fut.result())) return;
-
- control_fut = iterate();
- }
- }
-
- Iterate iterate;
-
- // If the future returned by control_fut is never completed then we will be hanging on
- // to break_fut forever even if the listener has given up listening on it. Instead we
- // rely on the fact that a producer (the caller of Future<>::Make) is always
- // responsible for completing the futures they create.
- // TODO: Could avoid this kind of situation with "future abandonment" similar to mesos
- Future<BreakValueType> break_fut;
- };
-
- auto break_fut = Future<BreakValueType>::Make();
- auto control_fut = iterate();
- control_fut.AddCallback(Callback{std::move(iterate), break_fut});
-
- return break_fut;
-}
-
-inline Future<> ToFuture(Status status) {
- return Future<>::MakeFinished(std::move(status));
-}
-
-template <typename T>
-Future<T> ToFuture(T value) {
- return Future<T>::MakeFinished(std::move(value));
-}
-
-template <typename T>
-Future<T> ToFuture(Result<T> maybe_value) {
- return Future<T>::MakeFinished(std::move(maybe_value));
-}
-
-template <typename T>
-Future<T> ToFuture(Future<T> fut) {
- return std::move(fut);
-}
-
-template <typename T>
-struct EnsureFuture {
- using type = decltype(ToFuture(std::declval<T>()));
-};
-
+struct Continue {
+ template <typename T>
+ operator util::optional<T>() && { // NOLINT explicit
+ return {};
+ }
+};
+
+template <typename T = internal::Empty>
+util::optional<T> Break(T break_value = {}) {
+ return util::optional<T>{std::move(break_value)};
+}
+
+template <typename T = internal::Empty>
+using ControlFlow = util::optional<T>;
+
+/// \brief Loop through an asynchronous sequence
+///
+/// \param[in] iterate A generator of Future<ControlFlow<BreakValue>>. On completion
+/// of each yielded future the resulting ControlFlow will be examined. A Break will
+/// terminate the loop, while a Continue will re-invoke `iterate`.
+///
+/// \return A future which will complete when a Future returned by iterate completes with
+/// a Break
+template <typename Iterate,
+ typename Control = typename detail::result_of_t<Iterate()>::ValueType,
+ typename BreakValueType = typename Control::value_type>
+Future<BreakValueType> Loop(Iterate iterate) {
+ struct Callback {
+ bool CheckForTermination(const Result<Control>& control_res) {
+ if (!control_res.ok()) {
+ break_fut.MarkFinished(control_res.status());
+ return true;
+ }
+ if (control_res->has_value()) {
+ break_fut.MarkFinished(**control_res);
+ return true;
+ }
+ return false;
+ }
+
+ void operator()(const Result<Control>& maybe_control) && {
+ if (CheckForTermination(maybe_control)) return;
+
+ auto control_fut = iterate();
+ while (true) {
+ if (control_fut.TryAddCallback([this]() { return *this; })) {
+ // Adding a callback succeeded; control_fut was not finished
+ // and we must wait to CheckForTermination.
+ return;
+ }
+ // Adding a callback failed; control_fut was finished and we
+ // can CheckForTermination immediately. This also avoids recursion and potential
+ // stack overflow.
+ if (CheckForTermination(control_fut.result())) return;
+
+ control_fut = iterate();
+ }
+ }
+
+ Iterate iterate;
+
+ // If the future returned by control_fut is never completed then we will be hanging on
+ // to break_fut forever even if the listener has given up listening on it. Instead we
+ // rely on the fact that a producer (the caller of Future<>::Make) is always
+ // responsible for completing the futures they create.
+ // TODO: Could avoid this kind of situation with "future abandonment" similar to mesos
+ Future<BreakValueType> break_fut;
+ };
+
+ auto break_fut = Future<BreakValueType>::Make();
+ auto control_fut = iterate();
+ control_fut.AddCallback(Callback{std::move(iterate), break_fut});
+
+ return break_fut;
+}
+
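// A sketch of Loop() (illustrative): iterate three times, then break with the
// counter value; the returned future completes as Future<int> with 3.
Future<int> CountToThree() {
  auto counter = std::make_shared<int>(0);
  return Loop([counter]() -> Future<ControlFlow<int>> {
    if (*counter < 3) {
      ++*counter;
      return Future<ControlFlow<int>>::MakeFinished(ControlFlow<int>(Continue()));
    }
    return Future<ControlFlow<int>>::MakeFinished(Break(*counter));
  });
}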
+inline Future<> ToFuture(Status status) {
+ return Future<>::MakeFinished(std::move(status));
+}
+
+template <typename T>
+Future<T> ToFuture(T value) {
+ return Future<T>::MakeFinished(std::move(value));
+}
+
+template <typename T>
+Future<T> ToFuture(Result<T> maybe_value) {
+ return Future<T>::MakeFinished(std::move(maybe_value));
+}
+
+template <typename T>
+Future<T> ToFuture(Future<T> fut) {
+ return std::move(fut);
+}
+
+template <typename T>
+struct EnsureFuture {
+ using type = decltype(ToFuture(std::declval<T>()));
+};
+
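// A sketch of ToFuture/EnsureFuture (illustrative): generic code can accept a
// plain value, a Status, a Result<T>, or an existing Future<T> and normalize
// all of them into a future of the matching type.
template <typename T>
typename EnsureFuture<T>::type Normalized(T value) {
  return ToFuture(std::move(value));
}
// Normalized(42)           -> finished Future<int>
// Normalized(Status::OK()) -> finished Future<>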
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h
index 2b887cfbfeb..ac1adcfb13e 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/hashing.h
@@ -39,7 +39,7 @@
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_builders.h"
-#include "arrow/util/endian.h"
+#include "arrow/util/endian.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"
@@ -329,7 +329,7 @@ class HashTable {
// Stash old entries and seal builder, effectively resetting the Buffer
const Entry* old_entries = entries_;
- ARROW_ASSIGN_OR_RAISE(auto previous, entries_builder_.FinishWithLength(capacity_));
+ ARROW_ASSIGN_OR_RAISE(auto previous, entries_builder_.FinishWithLength(capacity_));
// Allocate new buffer
RETURN_NOT_OK(UpsizeBuffer(new_capacity));
@@ -460,13 +460,13 @@ class ScalarMemoTable : public MemoTable {
out_data[index] = entry->payload.value;
}
});
- // Zero-initialize the null entry
- if (null_index_ != kKeyNotFound) {
- int32_t index = null_index_ - start;
- if (index >= 0) {
- out_data[index] = Scalar{};
- }
- }
+ // Zero-initialize the null entry
+ if (null_index_ != kKeyNotFound) {
+ int32_t index = null_index_ - start;
+ if (index >= 0) {
+ out_data[index] = Scalar{};
+ }
+ }
}
void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }
@@ -697,8 +697,8 @@ class BinaryMemoTable : public MemoTable {
DCHECK_LE(start, size());
const builder_offset_type* offsets = binary_builder_.offsets_data();
- const builder_offset_type delta =
- start < binary_builder_.length() ? offsets[start] : 0;
+ const builder_offset_type delta =
+ start < binary_builder_.length() ? offsets[start] : 0;
for (int32_t i = start; i < size(); ++i) {
const builder_offset_type adjusted_offset = offsets[i] - delta;
Offset cast_offset = static_cast<Offset>(adjusted_offset);
@@ -781,8 +781,8 @@ class BinaryMemoTable : public MemoTable {
if (left_size > 0) {
memcpy(out_data, in_data + left_offset, left_size);
}
- // Zero-initialize the null entry
- memset(out_data + left_size, 0, width_size);
+ // Zero-initialize the null entry
+ memset(out_data + left_size, 0, width_size);
auto right_size = values_size() - static_cast<size_t>(null_data_offset);
if (right_size > 0) {
@@ -852,8 +852,8 @@ struct HashTraits<T, enable_if_t<has_string_view<T>::value &&
using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};
-template <typename T>
-struct HashTraits<T, enable_if_decimal<T>> {
+template <typename T>
+struct HashTraits<T, enable_if_decimal<T>> {
using MemoTableType = BinaryMemoTable<BinaryBuilder>;
};
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h
index 34665dcf00c..1d494671a9f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/int128_internal.h
@@ -16,10 +16,10 @@
// under the License.
#pragma once
-#include "arrow/util/config.h"
+#include "arrow/util/config.h"
#include "arrow/util/macros.h"
-#ifndef ARROW_USE_NATIVE_INT128
+#ifndef ARROW_USE_NATIVE_INT128
#include <boost/multiprecision/cpp_int.hpp>
#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc
index 91ab77c64c7..24c5fe56eff 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.cc
@@ -26,13 +26,13 @@
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_block_counter.h"
-#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"
-#include "arrow/visitor_inline.h"
+#include "arrow/visitor_inline.h"
namespace arrow {
namespace internal {
@@ -59,7 +59,7 @@ static const uint64_t max_uints[] = {0, max_uint8, max_uint16, 0, max_ui
0, 0, 0, max_uint64};
// Check if we would need to expand the underlying storage type
-static inline uint8_t ExpandedUIntWidth(uint64_t val, uint8_t current_width) {
+static inline uint8_t ExpandedUIntWidth(uint64_t val, uint8_t current_width) {
// Optimize for the common case where width doesn't change
if (ARROW_PREDICT_TRUE(val <= max_uints[current_width])) {
return current_width;
@@ -366,7 +366,7 @@ width8:
}
template <typename Source, typename Dest>
-static inline void CastIntsInternal(const Source* src, Dest* dest, int64_t length) {
+static inline void CastIntsInternal(const Source* src, Dest* dest, int64_t length) {
while (length >= 4) {
dest[0] = static_cast<Dest>(src[0]);
dest[1] = static_cast<Dest>(src[1]);
@@ -383,15 +383,15 @@ static inline void CastIntsInternal(const Source* src, Dest* dest, int64_t lengt
}
void DowncastInts(const int64_t* source, int8_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
+ CastIntsInternal(source, dest, length);
}
void DowncastInts(const int64_t* source, int16_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
+ CastIntsInternal(source, dest, length);
}
void DowncastInts(const int64_t* source, int32_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
+ CastIntsInternal(source, dest, length);
}
void DowncastInts(const int64_t* source, int64_t* dest, int64_t length) {
@@ -399,25 +399,25 @@ void DowncastInts(const int64_t* source, int64_t* dest, int64_t length) {
}
void DowncastUInts(const uint64_t* source, uint8_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
+ CastIntsInternal(source, dest, length);
}
void DowncastUInts(const uint64_t* source, uint16_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
+ CastIntsInternal(source, dest, length);
}
void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
+ CastIntsInternal(source, dest, length);
}
void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length) {
memcpy(dest, source, length * sizeof(int64_t));
}
-void UpcastInts(const int32_t* source, int64_t* dest, int64_t length) {
- CastIntsInternal(source, dest, length);
-}
-
+void UpcastInts(const int32_t* source, int64_t* dest, int64_t length) {
+ CastIntsInternal(source, dest, length);
+}
+
template <typename InputInt, typename OutputInt>
void TransposeInts(const InputInt* src, OutputInt* dest, int64_t length,
const int32_t* transpose_map) {
@@ -466,72 +466,72 @@ INSTANTIATE_ALL()
#undef INSTANTIATE_ALL
#undef INSTANTIATE_ALL_DEST
-namespace {
-
-template <typename SrcType>
-struct TransposeIntsDest {
- const SrcType* src;
- uint8_t* dest;
- int64_t dest_offset;
- int64_t length;
- const int32_t* transpose_map;
-
- template <typename T>
- enable_if_integer<T, Status> Visit(const T&) {
- using DestType = typename T::c_type;
- TransposeInts(src, reinterpret_cast<DestType*>(dest) + dest_offset, length,
- transpose_map);
- return Status::OK();
- }
-
- Status Visit(const DataType& type) {
- return Status::TypeError("TransposeInts received non-integer dest_type");
- }
-
- Status operator()(const DataType& type) { return VisitTypeInline(type, this); }
-};
-
-struct TransposeIntsSrc {
- const uint8_t* src;
- uint8_t* dest;
- int64_t src_offset;
- int64_t dest_offset;
- int64_t length;
- const int32_t* transpose_map;
- const DataType& dest_type;
-
- template <typename T>
- enable_if_integer<T, Status> Visit(const T&) {
- using SrcType = typename T::c_type;
- return TransposeIntsDest<SrcType>{reinterpret_cast<const SrcType*>(src) + src_offset,
- dest, dest_offset, length,
- transpose_map}(dest_type);
- }
-
- Status Visit(const DataType& type) {
- return Status::TypeError("TransposeInts received non-integer dest_type");
- }
-
- Status operator()(const DataType& type) { return VisitTypeInline(type, this); }
-};
-
-}; // namespace
-
-Status TransposeInts(const DataType& src_type, const DataType& dest_type,
- const uint8_t* src, uint8_t* dest, int64_t src_offset,
- int64_t dest_offset, int64_t length, const int32_t* transpose_map) {
- TransposeIntsSrc transposer{src, dest, src_offset, dest_offset,
- length, transpose_map, dest_type};
- return transposer(src_type);
-}
-
+namespace {
+
+template <typename SrcType>
+struct TransposeIntsDest {
+ const SrcType* src;
+ uint8_t* dest;
+ int64_t dest_offset;
+ int64_t length;
+ const int32_t* transpose_map;
+
+ template <typename T>
+ enable_if_integer<T, Status> Visit(const T&) {
+ using DestType = typename T::c_type;
+ TransposeInts(src, reinterpret_cast<DestType*>(dest) + dest_offset, length,
+ transpose_map);
+ return Status::OK();
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::TypeError("TransposeInts received non-integer dest_type");
+ }
+
+ Status operator()(const DataType& type) { return VisitTypeInline(type, this); }
+};
+
+struct TransposeIntsSrc {
+ const uint8_t* src;
+ uint8_t* dest;
+ int64_t src_offset;
+ int64_t dest_offset;
+ int64_t length;
+ const int32_t* transpose_map;
+ const DataType& dest_type;
+
+ template <typename T>
+ enable_if_integer<T, Status> Visit(const T&) {
+ using SrcType = typename T::c_type;
+ return TransposeIntsDest<SrcType>{reinterpret_cast<const SrcType*>(src) + src_offset,
+ dest, dest_offset, length,
+ transpose_map}(dest_type);
+ }
+
+ Status Visit(const DataType& type) {
+ return Status::TypeError("TransposeInts received non-integer dest_type");
+ }
+
+ Status operator()(const DataType& type) { return VisitTypeInline(type, this); }
+};
+
+}; // namespace
+
+Status TransposeInts(const DataType& src_type, const DataType& dest_type,
+ const uint8_t* src, uint8_t* dest, int64_t src_offset,
+ int64_t dest_offset, int64_t length, const int32_t* transpose_map) {
+ TransposeIntsSrc transposer{src, dest, src_offset, dest_offset,
+ length, transpose_map, dest_type};
+ return transposer(src_type);
+}
+
template <typename T>
-static std::string FormatInt(T val) {
+static std::string FormatInt(T val) {
return std::to_string(val);
}
template <typename IndexCType, bool IsSigned = std::is_signed<IndexCType>::value>
-static Status CheckIndexBoundsImpl(const ArrayData& indices, uint64_t upper_limit) {
+static Status CheckIndexBoundsImpl(const ArrayData& indices, uint64_t upper_limit) {
// For unsigned integers, if the values array is larger than the maximum
// index value (e.g. especially for UINT8 / UINT16), then there is no need to
// boundscheck.
@@ -549,22 +549,22 @@ static Status CheckIndexBoundsImpl(const ArrayData& indices, uint64_t upper_limi
return ((IsSigned && val < 0) ||
(val >= 0 && static_cast<uint64_t>(val) >= upper_limit));
};
- return VisitSetBitRuns(
- bitmap, indices.offset, indices.length, [&](int64_t offset, int64_t length) {
- bool block_out_of_bounds = false;
- for (int64_t i = 0; i < length; ++i) {
- block_out_of_bounds |= IsOutOfBounds(indices_data[offset + i]);
+ return VisitSetBitRuns(
+ bitmap, indices.offset, indices.length, [&](int64_t offset, int64_t length) {
+ bool block_out_of_bounds = false;
+ for (int64_t i = 0; i < length; ++i) {
+ block_out_of_bounds |= IsOutOfBounds(indices_data[offset + i]);
}
- if (ARROW_PREDICT_FALSE(block_out_of_bounds)) {
- for (int64_t i = 0; i < length; ++i) {
- if (IsOutOfBounds(indices_data[offset + i])) {
- return Status::IndexError("Index ", FormatInt(indices_data[offset + i]),
- " out of bounds");
- }
+ if (ARROW_PREDICT_FALSE(block_out_of_bounds)) {
+ for (int64_t i = 0; i < length; ++i) {
+ if (IsOutOfBounds(indices_data[offset + i])) {
+ return Status::IndexError("Index ", FormatInt(indices_data[offset + i]),
+ " out of bounds");
+ }
}
}
- return Status::OK();
- });
+ return Status::OK();
+ });
}
/// \brief Branchless boundschecking of the indices. Processes batches of
@@ -596,8 +596,8 @@ Status CheckIndexBounds(const ArrayData& indices, uint64_t upper_limit) {
// ----------------------------------------------------------------------
// Utilities for casting from one integer type to another
-namespace {
-
+namespace {
+
template <typename InType, typename CType = typename InType::c_type>
Status IntegersInRange(const Datum& datum, CType bound_lower, CType bound_upper) {
if (std::numeric_limits<CType>::lowest() >= bound_lower &&
@@ -696,8 +696,8 @@ Status CheckIntegersInRangeImpl(const Datum& datum, const Scalar& bound_lower,
checked_cast<const ScalarType&>(bound_upper).value);
}
-} // namespace
-
+} // namespace
+
Status CheckIntegersInRange(const Datum& datum, const Scalar& bound_lower,
const Scalar& bound_upper) {
Type::type type_id = datum.type()->id();
@@ -729,8 +729,8 @@ Status CheckIntegersInRange(const Datum& datum, const Scalar& bound_lower,
}
}
-namespace {
-
+namespace {
+
template <typename O, typename I, typename Enable = void>
struct is_number_downcast {
static constexpr bool value = false;
@@ -919,8 +919,8 @@ Status IntegersCanFitImpl(const Datum& datum, const DataType& target_type) {
return CheckIntegersInRange(datum, ScalarType(bound_min), ScalarType(bound_max));
}
-} // namespace
-
+} // namespace
+
Status IntegersCanFit(const Datum& datum, const DataType& target_type) {
if (!is_integer(target_type.id())) {
return Status::Invalid("Target type is not an integer type: ", target_type);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h
index 145a83b3171..bf9226cdf12 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util.h
@@ -18,7 +18,7 @@
#pragma once
#include <cstdint>
-#include <type_traits>
+#include <type_traits>
#include "arrow/status.h"
#include "arrow/util/visibility.h"
@@ -70,30 +70,30 @@ void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length);
ARROW_EXPORT
void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length);
-ARROW_EXPORT
-void UpcastInts(const int32_t* source, int64_t* dest, int64_t length);
-
+ARROW_EXPORT
+void UpcastInts(const int32_t* source, int64_t* dest, int64_t length);
+
+template <typename InputInt, typename OutputInt>
+inline typename std::enable_if<(sizeof(InputInt) >= sizeof(OutputInt))>::type CastInts(
+ const InputInt* source, OutputInt* dest, int64_t length) {
+ DowncastInts(source, dest, length);
+}
+
+template <typename InputInt, typename OutputInt>
+inline typename std::enable_if<(sizeof(InputInt) < sizeof(OutputInt))>::type CastInts(
+ const InputInt* source, OutputInt* dest, int64_t length) {
+ UpcastInts(source, dest, length);
+}
+
template <typename InputInt, typename OutputInt>
-inline typename std::enable_if<(sizeof(InputInt) >= sizeof(OutputInt))>::type CastInts(
- const InputInt* source, OutputInt* dest, int64_t length) {
- DowncastInts(source, dest, length);
-}
-
-template <typename InputInt, typename OutputInt>
-inline typename std::enable_if<(sizeof(InputInt) < sizeof(OutputInt))>::type CastInts(
- const InputInt* source, OutputInt* dest, int64_t length) {
- UpcastInts(source, dest, length);
-}
-
-template <typename InputInt, typename OutputInt>
ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length,
const int32_t* transpose_map);
-ARROW_EXPORT
-Status TransposeInts(const DataType& src_type, const DataType& dest_type,
- const uint8_t* src, uint8_t* dest, int64_t src_offset,
- int64_t dest_offset, int64_t length, const int32_t* transpose_map);
-
+ARROW_EXPORT
+Status TransposeInts(const DataType& src_type, const DataType& dest_type,
+ const uint8_t* src, uint8_t* dest, int64_t src_offset,
+ int64_t dest_offset, int64_t length, const int32_t* transpose_map);
+
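// A dispatch sketch for CastInts above (illustrative): the enable_if pair
// routes widening casts to UpcastInts and narrowing or equal-width casts to
// DowncastInts.
void WidenExample(const int32_t* src, int64_t* dest, int64_t n) {
  CastInts(src, dest, n);  // sizeof(int32_t) < sizeof(int64_t) -> UpcastInts
}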
/// \brief Do vectorized boundschecking of integer-type array indices. The
/// indices must be non-negative and strictly less than the passed upper
/// limit (which is usually the length of an array that is being indexed-into).
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h
index 3760d03c9ff..4136706629f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/int_util_internal.h
@@ -63,27 +63,27 @@ OPS_WITH_OVERFLOW(DivideWithOverflow, div)
#undef OP_WITH_OVERFLOW
#undef OPS_WITH_OVERFLOW
-// Define function NegateWithOverflow with the signature `bool(T u, T* out)`
-// where T is a signed integer type. On overflow, these functions return true.
-// Otherwise, false is returned and `out` is updated with the result of the
-// operation.
-
-#define UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, _type, _psnip_type) \
- static inline bool _func_name(_type u, _type* out) { \
- return !psnip_safe_##_psnip_type##_##_psnip_op(out, u); \
- }
-
-#define SIGNED_UNARY_OPS_WITH_OVERFLOW(_func_name, _psnip_op) \
- UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int8_t, int8) \
- UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int16_t, int16) \
- UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int32_t, int32) \
- UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int64_t, int64)
-
-SIGNED_UNARY_OPS_WITH_OVERFLOW(NegateWithOverflow, neg)
-
-#undef UNARY_OP_WITH_OVERFLOW
-#undef SIGNED_UNARY_OPS_WITH_OVERFLOW
-
+// Define function NegateWithOverflow with the signature `bool(T u, T* out)`
+// where T is a signed integer type. On overflow, these functions return true.
+// Otherwise, false is returned and `out` is updated with the result of the
+// operation.
+
+#define UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, _type, _psnip_type) \
+ static inline bool _func_name(_type u, _type* out) { \
+ return !psnip_safe_##_psnip_type##_##_psnip_op(out, u); \
+ }
+
+#define SIGNED_UNARY_OPS_WITH_OVERFLOW(_func_name, _psnip_op) \
+ UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int8_t, int8) \
+ UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int16_t, int16) \
+ UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int32_t, int32) \
+ UNARY_OP_WITH_OVERFLOW(_func_name, _psnip_op, int64_t, int64)
+
+SIGNED_UNARY_OPS_WITH_OVERFLOW(NegateWithOverflow, neg)
+
+#undef UNARY_OP_WITH_OVERFLOW
+#undef SIGNED_UNARY_OPS_WITH_OVERFLOW
+
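// A usage sketch for the generated NegateWithOverflow overloads (illustrative;
// needs <cstdint> and <limits>): they return true on overflow, false on
// success.
bool NegatedMinOverflows() {
  int32_t out;
  return NegateWithOverflow(std::numeric_limits<int32_t>::min(), &out);  // true
}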
/// Signed addition with well-defined behaviour on overflow (as unsigned)
template <typename SignedInt>
SignedInt SafeSignedAdd(SignedInt u, SignedInt v) {
@@ -100,13 +100,13 @@ SignedInt SafeSignedSubtract(SignedInt u, SignedInt v) {
static_cast<UnsignedInt>(v));
}
-/// Signed negation with well-defined behaviour on overflow (as unsigned)
-template <typename SignedInt>
-SignedInt SafeSignedNegate(SignedInt u) {
- using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
- return static_cast<SignedInt>(~static_cast<UnsignedInt>(u) + 1);
-}
-
+/// Signed negation with well-defined behaviour on overflow (as unsigned)
+template <typename SignedInt>
+SignedInt SafeSignedNegate(SignedInt u) {
+ using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
+ return static_cast<SignedInt>(~static_cast<UnsignedInt>(u) + 1);
+}
+
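// A sketch of SafeSignedNegate (illustrative): the unsigned round-trip gives
// wrap-around semantics instead of undefined behaviour, so negating the most
// negative value yields itself.
void NegateExample() {
  int32_t v = SafeSignedNegate(std::numeric_limits<int32_t>::min());
  // v == std::numeric_limits<int32_t>::min(), with no UB on the way
}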
/// Signed left shift with well-defined behaviour on negative numbers or overflow
template <typename SignedInt, typename Shift>
SignedInt SafeLeftShift(SignedInt u, Shift shift) {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc
index 85f3843f715..f6566ea7e36 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.cc
@@ -22,15 +22,15 @@
#define _FILE_OFFSET_BITS 64
-#if defined(sun) || defined(__sun)
-// According to https://bugs.python.org/issue1759169#msg82201, __EXTENSIONS__
-// is the best way to enable modern POSIX APIs, such as posix_madvise(), on Solaris.
-// (see also
-// https://github.com/illumos/illumos-gate/blob/master/usr/src/uts/common/sys/mman.h)
-#undef __EXTENSIONS__
-#define __EXTENSIONS__
-#endif
-
+#if defined(sun) || defined(__sun)
+// According to https://bugs.python.org/issue1759169#msg82201, __EXTENSIONS__
+// is the best way to enable modern POSIX APIs, such as posix_madvise(), on Solaris.
+// (see also
+// https://github.com/illumos/illumos-gate/blob/master/usr/src/uts/common/sys/mman.h)
+#undef __EXTENSIONS__
+#define __EXTENSIONS__
+#endif
+
#include "arrow/util/windows_compatibility.h" // IWYU pragma: keep
#include <algorithm>
@@ -41,7 +41,7 @@
#include <random>
#include <sstream>
#include <string>
-#include <thread>
+#include <thread>
#include <utility>
#include <vector>
@@ -244,26 +244,26 @@ class WinErrorDetail : public StatusDetail {
};
#endif
-const char kSignalDetailTypeId[] = "arrow::SignalDetail";
-
-class SignalDetail : public StatusDetail {
- public:
- explicit SignalDetail(int signum) : signum_(signum) {}
-
- const char* type_id() const override { return kSignalDetailTypeId; }
-
- std::string ToString() const override {
- std::stringstream ss;
- ss << "received signal " << signum_;
- return ss.str();
- }
-
- int signum() const { return signum_; }
-
- protected:
- int signum_;
-};
-
+const char kSignalDetailTypeId[] = "arrow::SignalDetail";
+
+class SignalDetail : public StatusDetail {
+ public:
+ explicit SignalDetail(int signum) : signum_(signum) {}
+
+ const char* type_id() const override { return kSignalDetailTypeId; }
+
+ std::string ToString() const override {
+ std::stringstream ss;
+ ss << "received signal " << signum_;
+ return ss.str();
+ }
+
+ int signum() const { return signum_; }
+
+ protected:
+ int signum_;
+};
+
} // namespace
std::shared_ptr<StatusDetail> StatusDetailFromErrno(int errnum) {
@@ -276,10 +276,10 @@ std::shared_ptr<StatusDetail> StatusDetailFromWinError(int errnum) {
}
#endif
-std::shared_ptr<StatusDetail> StatusDetailFromSignal(int signum) {
- return std::make_shared<SignalDetail>(signum);
-}
-
+std::shared_ptr<StatusDetail> StatusDetailFromSignal(int signum) {
+ return std::make_shared<SignalDetail>(signum);
+}
+
int ErrnoFromStatus(const Status& status) {
const auto detail = status.detail();
if (detail != nullptr && detail->type_id() == kErrnoDetailTypeId) {
@@ -298,14 +298,14 @@ int WinErrorFromStatus(const Status& status) {
return 0;
}
-int SignalFromStatus(const Status& status) {
- const auto detail = status.detail();
- if (detail != nullptr && detail->type_id() == kSignalDetailTypeId) {
- return checked_cast<const SignalDetail&>(*detail).signum();
- }
- return 0;
-}
-
+int SignalFromStatus(const Status& status) {
+ const auto detail = status.detail();
+ if (detail != nullptr && detail->type_id() == kSignalDetailTypeId) {
+ return checked_cast<const SignalDetail&>(*detail).signum();
+ }
+ return 0;
+}
+
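// A sketch of the signal helpers above (illustrative): the detail can be
// inspected directly, and SignalFromStatus returns 0 for statuses that do not
// carry a SignalDetail.
void SignalDetailExample() {
  auto detail = StatusDetailFromSignal(2);  // e.g. SIGINT
  // detail->ToString() == "received signal 2"
  // SignalFromStatus(Status::OK()) == 0
}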
//
// PlatformFilename implementation
//
@@ -403,18 +403,18 @@ namespace {
Result<bool> DoCreateDir(const PlatformFilename& dir_path, bool create_parents) {
#ifdef _WIN32
- const auto s = dir_path.ToNative().c_str();
- if (CreateDirectoryW(s, nullptr)) {
+ const auto s = dir_path.ToNative().c_str();
+ if (CreateDirectoryW(s, nullptr)) {
return true;
}
int errnum = GetLastError();
if (errnum == ERROR_ALREADY_EXISTS) {
- const auto attrs = GetFileAttributesW(s);
- if (attrs == INVALID_FILE_ATTRIBUTES || !(attrs & FILE_ATTRIBUTE_DIRECTORY)) {
- // Note we propagate the original error, not the GetFileAttributesW() error
- return IOErrorFromWinError(ERROR_ALREADY_EXISTS, "Cannot create directory '",
- dir_path.ToString(), "': non-directory entry exists");
- }
+ const auto attrs = GetFileAttributesW(s);
+ if (attrs == INVALID_FILE_ATTRIBUTES || !(attrs & FILE_ATTRIBUTE_DIRECTORY)) {
+ // Note we propagate the original error, not the GetFileAttributesW() error
+ return IOErrorFromWinError(ERROR_ALREADY_EXISTS, "Cannot create directory '",
+ dir_path.ToString(), "': non-directory entry exists");
+ }
return false;
}
if (create_parents && errnum == ERROR_PATH_NOT_FOUND) {
@@ -427,17 +427,17 @@ Result<bool> DoCreateDir(const PlatformFilename& dir_path, bool create_parents)
return IOErrorFromWinError(GetLastError(), "Cannot create directory '",
dir_path.ToString(), "'");
#else
- const auto s = dir_path.ToNative().c_str();
- if (mkdir(s, S_IRWXU | S_IRWXG | S_IRWXO) == 0) {
+ const auto s = dir_path.ToNative().c_str();
+ if (mkdir(s, S_IRWXU | S_IRWXG | S_IRWXO) == 0) {
return true;
}
if (errno == EEXIST) {
- struct stat st;
- if (stat(s, &st) || !S_ISDIR(st.st_mode)) {
- // Note we propagate the original errno, not the stat() errno
- return IOErrorFromErrno(EEXIST, "Cannot create directory '", dir_path.ToString(),
- "': non-directory entry exists");
- }
+ struct stat st;
+ if (stat(s, &st) || !S_ISDIR(st.st_mode)) {
+ // Note we propagate the original errno, not the stat() errno
+ return IOErrorFromErrno(EEXIST, "Cannot create directory '", dir_path.ToString(),
+ "': non-directory entry exists");
+ }
return false;
}
if (create_parents && errno == ENOENT) {
@@ -1019,15 +1019,15 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes,
return StatusFromMmapErrno("MapViewOfFile failed");
}
return Status::OK();
-#elif defined(__linux__)
- if (ftruncate(fildes, new_size) == -1) {
- return StatusFromMmapErrno("ftruncate failed");
- }
- *new_addr = mremap(addr, old_size, new_size, MREMAP_MAYMOVE);
- if (*new_addr == MAP_FAILED) {
- return StatusFromMmapErrno("mremap failed");
- }
- return Status::OK();
+#elif defined(__linux__)
+ if (ftruncate(fildes, new_size) == -1) {
+ return StatusFromMmapErrno("ftruncate failed");
+ }
+ *new_addr = mremap(addr, old_size, new_size, MREMAP_MAYMOVE);
+ if (*new_addr == MAP_FAILED) {
+ return StatusFromMmapErrno("mremap failed");
+ }
+ return Status::OK();
#else
// we have to close the mmap first, truncate the file to the new size
// and recreate the mmap
@@ -1089,7 +1089,7 @@ Status MemoryAdviseWillNeed(const std::vector<MemoryRegion>& regions) {
}
}
return Status::OK();
-#elif defined(POSIX_MADV_WILLNEED)
+#elif defined(POSIX_MADV_WILLNEED)
for (const auto& region : regions) {
if (region.size != 0) {
const auto aligned = align_region(region);
@@ -1103,8 +1103,8 @@ Status MemoryAdviseWillNeed(const std::vector<MemoryRegion>& regions) {
}
}
return Status::OK();
-#else
- return Status::OK();
+#else
+ return Status::OK();
#endif
}
@@ -1468,51 +1468,51 @@ std::string MakeRandomName(int num_chars) {
} // namespace
Result<std::unique_ptr<TemporaryDir>> TemporaryDir::Make(const std::string& prefix) {
- const int kNumChars = 8;
-
+ const int kNumChars = 8;
+
NativePathString base_name;
- auto MakeBaseName = [&]() {
- std::string suffix = MakeRandomName(kNumChars);
- return StringToNative(prefix + suffix);
- };
-
- auto TryCreatingDirectory =
- [&](const NativePathString& base_dir) -> Result<std::unique_ptr<TemporaryDir>> {
- Status st;
- for (int attempt = 0; attempt < 3; ++attempt) {
- PlatformFilename fn(base_dir + kNativeSep + base_name + kNativeSep);
- auto result = CreateDir(fn);
- if (!result.ok()) {
- // Probably a permissions error or a non-existing base_dir
- return nullptr;
- }
- if (*result) {
- return std::unique_ptr<TemporaryDir>(new TemporaryDir(std::move(fn)));
- }
- // The random name already exists in base_dir, try with another name
- st = Status::IOError("Path already exists: '", fn.ToString(), "'");
- ARROW_ASSIGN_OR_RAISE(base_name, MakeBaseName());
- }
- return st;
- };
-
- ARROW_ASSIGN_OR_RAISE(base_name, MakeBaseName());
-
+ auto MakeBaseName = [&]() {
+ std::string suffix = MakeRandomName(kNumChars);
+ return StringToNative(prefix + suffix);
+ };
+
+ auto TryCreatingDirectory =
+ [&](const NativePathString& base_dir) -> Result<std::unique_ptr<TemporaryDir>> {
+ Status st;
+ for (int attempt = 0; attempt < 3; ++attempt) {
+ PlatformFilename fn(base_dir + kNativeSep + base_name + kNativeSep);
+ auto result = CreateDir(fn);
+ if (!result.ok()) {
+ // Probably a permissions error or a non-existing base_dir
+ return nullptr;
+ }
+ if (*result) {
+ return std::unique_ptr<TemporaryDir>(new TemporaryDir(std::move(fn)));
+ }
+ // The random name already exists in base_dir, try with another name
+ st = Status::IOError("Path already exists: '", fn.ToString(), "'");
+ ARROW_ASSIGN_OR_RAISE(base_name, MakeBaseName());
+ }
+ return st;
+ };
+
+ ARROW_ASSIGN_OR_RAISE(base_name, MakeBaseName());
+
auto base_dirs = GetPlatformTemporaryDirs();
DCHECK_NE(base_dirs.size(), 0);
- for (const auto& base_dir : base_dirs) {
- ARROW_ASSIGN_OR_RAISE(auto ptr, TryCreatingDirectory(base_dir));
- if (ptr) {
- return std::move(ptr);
+ for (const auto& base_dir : base_dirs) {
+ ARROW_ASSIGN_OR_RAISE(auto ptr, TryCreatingDirectory(base_dir));
+ if (ptr) {
+ return std::move(ptr);
}
- // Cannot create in this directory, try the next one
+ // Cannot create in this directory, try the next one
}
- return Status::IOError(
- "Cannot create temporary subdirectory in any "
- "of the platform temporary directories");
+ return Status::IOError(
+ "Cannot create temporary subdirectory in any "
+ "of the platform temporary directories");
}
TemporaryDir::TemporaryDir(PlatformFilename&& path) : path_(std::move(path)) {}
@@ -1594,64 +1594,64 @@ Result<SignalHandler> SetSignalHandler(int signum, const SignalHandler& handler)
return Status::OK();
}
-void ReinstateSignalHandler(int signum, SignalHandler::Callback handler) {
-#if !ARROW_HAVE_SIGACTION
- // Cannot report any errors from signal() (but there shouldn't be any)
- signal(signum, handler);
-#endif
-}
-
-Status SendSignal(int signum) {
- if (raise(signum) == 0) {
- return Status::OK();
- }
- if (errno == EINVAL) {
- return Status::Invalid("Invalid signal number ", signum);
- }
- return IOErrorFromErrno(errno, "Failed to raise signal");
-}
-
-Status SendSignalToThread(int signum, uint64_t thread_id) {
-#ifdef _WIN32
- return Status::NotImplemented("Cannot send signal to specific thread on Windows");
-#else
- // Have to use a C-style cast because pthread_t can be a pointer *or* integer type
- int r = pthread_kill((pthread_t)thread_id, signum); // NOLINT readability-casting
- if (r == 0) {
- return Status::OK();
- }
- if (r == EINVAL) {
- return Status::Invalid("Invalid signal number ", signum);
- }
- return IOErrorFromErrno(r, "Failed to raise signal");
-#endif
-}
-
+void ReinstateSignalHandler(int signum, SignalHandler::Callback handler) {
+#if !ARROW_HAVE_SIGACTION
+ // Cannot report any errors from signal() (but there shouldn't be any)
+ signal(signum, handler);
+#endif
+}
+
+Status SendSignal(int signum) {
+ if (raise(signum) == 0) {
+ return Status::OK();
+ }
+ if (errno == EINVAL) {
+ return Status::Invalid("Invalid signal number ", signum);
+ }
+ return IOErrorFromErrno(errno, "Failed to raise signal");
+}
+
+Status SendSignalToThread(int signum, uint64_t thread_id) {
+#ifdef _WIN32
+ return Status::NotImplemented("Cannot send signal to specific thread on Windows");
+#else
+ // Have to use a C-style cast because pthread_t can be a pointer *or* integer type
+ int r = pthread_kill((pthread_t)thread_id, signum); // NOLINT readability-casting
+ if (r == 0) {
+ return Status::OK();
+ }
+ if (r == EINVAL) {
+ return Status::Invalid("Invalid signal number ", signum);
+ }
+ return IOErrorFromErrno(r, "Failed to raise signal");
+#endif
+}
+
namespace {
-int64_t GetPid() {
-#ifdef _WIN32
- return GetCurrentProcessId();
-#else
- return getpid();
-#endif
-}
-
+int64_t GetPid() {
+#ifdef _WIN32
+ return GetCurrentProcessId();
+#else
+ return getpid();
+#endif
+}
+
std::mt19937_64 GetSeedGenerator() {
// Initialize Mersenne Twister PRNG with a true random seed.
- // Make sure to mix in the process id to minimize the risk of clashes when running tests in parallel.
+ // Make sure to mix in the process id to minimize the risk of clashes when running tests in parallel.
#ifdef ARROW_VALGRIND
// Valgrind can crash, hang or enter an infinite loop on std::random_device,
// use a crude initializer instead.
const uint8_t dummy = 0;
ARROW_UNUSED(dummy);
std::mt19937_64 seed_gen(reinterpret_cast<uintptr_t>(&dummy) ^
- static_cast<uintptr_t>(GetPid()));
+ static_cast<uintptr_t>(GetPid()));
#else
std::random_device true_random;
std::mt19937_64 seed_gen(static_cast<uint64_t>(true_random()) ^
- (static_cast<uint64_t>(true_random()) << 32) ^
- static_cast<uint64_t>(GetPid()));
+ (static_cast<uint64_t>(true_random()) << 32) ^
+ static_cast<uint64_t>(GetPid()));
#endif
return seed_gen;
}
@@ -1665,21 +1665,21 @@ int64_t GetRandomSeed() {
return static_cast<int64_t>(seed_gen());
}
-uint64_t GetThreadId() {
- uint64_t equiv{0};
- // std::thread::id is trivially copyable as per C++ spec,
- // so type punning as a uint64_t should work
- static_assert(sizeof(std::thread::id) <= sizeof(uint64_t),
- "std::thread::id can't fit into uint64_t");
- const auto tid = std::this_thread::get_id();
- memcpy(&equiv, reinterpret_cast<const void*>(&tid), sizeof(tid));
- return equiv;
-}
-
-uint64_t GetOptionalThreadId() {
- auto tid = GetThreadId();
- return (tid == 0) ? tid - 1 : tid;
-}
-
+uint64_t GetThreadId() {
+ uint64_t equiv{0};
+ // std::thread::id is trivially copyable as per C++ spec,
+ // so type punning as a uint64_t should work
+ static_assert(sizeof(std::thread::id) <= sizeof(uint64_t),
+ "std::thread::id can't fit into uint64_t");
+ const auto tid = std::this_thread::get_id();
+ memcpy(&equiv, reinterpret_cast<const void*>(&tid), sizeof(tid));
+ return equiv;
+}
+
+uint64_t GetOptionalThreadId() {
+ auto tid = GetThreadId();
+ return (tid == 0) ? tid - 1 : tid;
+}
+
} // namespace internal
} // namespace arrow
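The TemporaryDir::Make() body restored above picks a random 8-character suffix, retries up to three times per candidate base directory, and falls through to the next platform temporary directory when creation fails outright. A minimal usage sketch follows; the path() accessor and the cleanup-on-destruction behavior are assumptions about the class not shown in this hunk, and DoWork() is a stand-in for caller code.

#include "arrow/result.h"
#include "arrow/util/io_util.h"

arrow::Status DoWork(const arrow::internal::PlatformFilename&) {
  return arrow::Status::OK();  // stand-in for real work
}

arrow::Status UseScratchDir() {
  // Make() creates e.g. <platform-tmp>/arrow-example-a1b2c3d4/ and returns an
  // owning handle; the directory is assumed to be removed on destruction.
  ARROW_ASSIGN_OR_RAISE(auto tmp,
                        arrow::internal::TemporaryDir::Make("arrow-example-"));
  return DoWork(tmp->path());  // path() accessor assumed
}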
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h
index 7aa26f0819e..4255dd37105 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/io_util.h
@@ -209,8 +209,8 @@ std::shared_ptr<StatusDetail> StatusDetailFromErrno(int errnum);
ARROW_EXPORT
std::shared_ptr<StatusDetail> StatusDetailFromWinError(int errnum);
#endif
-ARROW_EXPORT
-std::shared_ptr<StatusDetail> StatusDetailFromSignal(int signum);
+ARROW_EXPORT
+std::shared_ptr<StatusDetail> StatusDetailFromSignal(int signum);
template <typename... Args>
Status StatusFromErrno(int errnum, StatusCode code, Args&&... args) {
@@ -236,17 +236,17 @@ Status IOErrorFromWinError(int errnum, Args&&... args) {
}
#endif
-template <typename... Args>
-Status StatusFromSignal(int signum, StatusCode code, Args&&... args) {
- return Status::FromDetailAndArgs(code, StatusDetailFromSignal(signum),
- std::forward<Args>(args)...);
-}
-
-template <typename... Args>
-Status CancelledFromSignal(int signum, Args&&... args) {
- return StatusFromSignal(signum, StatusCode::Cancelled, std::forward<Args>(args)...);
-}
-
+template <typename... Args>
+Status StatusFromSignal(int signum, StatusCode code, Args&&... args) {
+ return Status::FromDetailAndArgs(code, StatusDetailFromSignal(signum),
+ std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+Status CancelledFromSignal(int signum, Args&&... args) {
+ return StatusFromSignal(signum, StatusCode::Cancelled, std::forward<Args>(args)...);
+}
+
ARROW_EXPORT
int ErrnoFromStatus(const Status&);
@@ -254,9 +254,9 @@ int ErrnoFromStatus(const Status&);
ARROW_EXPORT
int WinErrorFromStatus(const Status&);
-ARROW_EXPORT
-int SignalFromStatus(const Status&);
-
+ARROW_EXPORT
+int SignalFromStatus(const Status&);
+
class ARROW_EXPORT TemporaryDir {
public:
~TemporaryDir();
@@ -309,26 +309,26 @@ Result<SignalHandler> GetSignalHandler(int signum);
ARROW_EXPORT
Result<SignalHandler> SetSignalHandler(int signum, const SignalHandler& handler);
-/// \brief Reinstate the signal handler
-///
-/// For use in signal handlers. This is needed on platforms without sigaction()
-/// such as Windows, as the default signal handler is restored there as
-/// soon as a signal is raised.
-ARROW_EXPORT
-void ReinstateSignalHandler(int signum, SignalHandler::Callback handler);
-
-/// \brief Send a signal to the current process
-///
-/// The thread which will receive the signal is unspecified.
-ARROW_EXPORT
-Status SendSignal(int signum);
-
-/// \brief Send a signal to the given thread
-///
-/// This function isn't supported on Windows.
-ARROW_EXPORT
-Status SendSignalToThread(int signum, uint64_t thread_id);
-
+/// \brief Reinstate the signal handler
+///
+/// For use in signal handlers. This is needed on platforms without sigaction()
+/// such as Windows, as the default signal handler is restored there as
+/// soon as a signal is raised.
+ARROW_EXPORT
+void ReinstateSignalHandler(int signum, SignalHandler::Callback handler);
+
+/// \brief Send a signal to the current process
+///
+/// The thread which will receive the signal is unspecified.
+ARROW_EXPORT
+Status SendSignal(int signum);
+
+/// \brief Send a signal to the given thread
+///
+/// This function isn't supported on Windows.
+ARROW_EXPORT
+Status SendSignalToThread(int signum, uint64_t thread_id);
+
/// \brief Get an unpredictable random seed
///
/// This function may be slightly costly, so should only be used to initialize
@@ -338,12 +338,12 @@ Status SendSignalToThread(int signum, uint64_t thread_id);
ARROW_EXPORT
int64_t GetRandomSeed();
-/// \brief Get the current thread id
-///
-/// In addition to having the same properties as std::thread::id, the returned value
-/// is a regular integer value, which is more convenient than an opaque type.
-ARROW_EXPORT
-uint64_t GetThreadId();
-
+/// \brief Get the current thread id
+///
+/// In addition to having the same properties as std::thread::id, the returned value
+/// is a regular integer value, which is more convenient than an opaque type.
+ARROW_EXPORT
+uint64_t GetThreadId();
+
} // namespace internal
} // namespace arrow
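Taken together, the restored declarations let a signal travel through the ordinary Status channel and be recovered at the other end. A short round-trip sketch, using only functions declared in this header:

#include <csignal>
#include "arrow/util/io_util.h"

void SignalStatusRoundTrip() {
  // Wrap SIGINT in a Cancelled status carrying a SignalDetail.
  arrow::Status st = arrow::internal::CancelledFromSignal(SIGINT, "interrupted");
  // Recover the signal number; the unrelated accessor returns 0.
  int signum = arrow::internal::SignalFromStatus(st);  // == SIGINT
  int errnum = arrow::internal::ErrnoFromStatus(st);   // == 0, no ErrnoDetail
  (void)signum;
  (void)errnum;
}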
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h
index 374ac1afd4e..2f42803d26f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/iterator.h
@@ -43,40 +43,40 @@ struct IterationTraits {
/// \brief a reserved value which indicates the end of iteration. By
/// default this is NULLPTR since most iterators yield pointer types.
/// Specialize IterationTraits if different end semantics are required.
- ///
- /// Note: This should not be used to determine if a given value is a
- /// terminal value. Use IsIterationEnd (which uses IsEnd) instead. This
- /// is only for returning terminal values.
+ ///
+ /// Note: This should not be used to determine if a given value is a
+ /// terminal value. Use IsIterationEnd (which uses IsEnd) instead. This
+ /// is only for returning terminal values.
static T End() { return T(NULLPTR); }
-
- /// \brief Checks to see if the value is a terminal value.
- /// A method is used here since T is not necessarily comparable in many
- /// cases even though it has a distinct final value
- static bool IsEnd(const T& val) { return val == End(); }
+
+ /// \brief Checks to see if the value is a terminal value.
+ /// A method is used here since T is not necessarily comparable in many
+ /// cases even though it has a distinct final value
+ static bool IsEnd(const T& val) { return val == End(); }
};
template <typename T>
-T IterationEnd() {
- return IterationTraits<T>::End();
-}
-
-template <typename T>
-bool IsIterationEnd(const T& val) {
- return IterationTraits<T>::IsEnd(val);
-}
-
-template <typename T>
+T IterationEnd() {
+ return IterationTraits<T>::End();
+}
+
+template <typename T>
+bool IsIterationEnd(const T& val) {
+ return IterationTraits<T>::IsEnd(val);
+}
+
+template <typename T>
struct IterationTraits<util::optional<T>> {
/// \brief by default when iterating through a sequence of optional,
/// nullopt indicates the end of iteration.
/// Specialize IterationTraits if different end semantics are required.
static util::optional<T> End() { return util::nullopt; }
- /// \brief by default when iterating through a sequence of optional,
- /// nullopt (!has_value()) indicates the end of iteration.
- /// Specialize IterationTraits if different end semantics are required.
- static bool IsEnd(const util::optional<T>& val) { return !val.has_value(); }
-
+ /// \brief by default when iterating through a sequence of optional,
+ /// nullopt (!has_value()) indicates the end of iteration.
+ /// Specialize IterationTraits if different end semantics are required.
+ static bool IsEnd(const util::optional<T>& val) { return !val.has_value(); }
+
// TODO(bkietz) The range-for loop over Iterator<optional<T>> yields
// Result<optional<T>> which is unnecessary (since only the unyielded end optional
// is nullopt). Add IterationTraits::GetRangeElement() to handle this case
@@ -87,8 +87,8 @@ template <typename T>
class Iterator : public util::EqualityComparable<Iterator<T>> {
public:
/// \brief Iterator may be constructed from any type which has a member function
- /// with signature Result<T> Next();
- /// End of iterator is signalled by returning IterationTraits<T>::End();
+ /// with signature Result<T> Next();
+ /// End of iterator is signalled by returning IterationTraits<T>::End();
///
/// The argument is moved or copied to the heap and kept in a unique_ptr<void>. Only
/// its destructor and its Next method (which are stored in function pointers) are
@@ -116,7 +116,7 @@ class Iterator : public util::EqualityComparable<Iterator<T>> {
for (;;) {
ARROW_ASSIGN_OR_RAISE(auto value, Next());
- if (IsIterationEnd(value)) break;
+ if (IsIterationEnd(value)) break;
ARROW_RETURN_NOT_OK(visitor(std::move(value)));
}
@@ -210,132 +210,132 @@ class Iterator : public util::EqualityComparable<Iterator<T>> {
};
template <typename T>
-struct TransformFlow {
- using YieldValueType = T;
-
- TransformFlow(YieldValueType value, bool ready_for_next)
- : finished_(false),
- ready_for_next_(ready_for_next),
- yield_value_(std::move(value)) {}
- TransformFlow(bool finished, bool ready_for_next)
- : finished_(finished), ready_for_next_(ready_for_next), yield_value_() {}
-
- bool HasValue() const { return yield_value_.has_value(); }
- bool Finished() const { return finished_; }
- bool ReadyForNext() const { return ready_for_next_; }
- T Value() const { return *yield_value_; }
-
- bool finished_ = false;
- bool ready_for_next_ = false;
- util::optional<YieldValueType> yield_value_;
-};
-
-struct TransformFinish {
- template <typename T>
- operator TransformFlow<T>() && { // NOLINT explicit
- return TransformFlow<T>(true, true);
- }
-};
-
-struct TransformSkip {
- template <typename T>
- operator TransformFlow<T>() && { // NOLINT explicit
- return TransformFlow<T>(false, true);
- }
-};
-
-template <typename T>
-TransformFlow<T> TransformYield(T value = {}, bool ready_for_next = true) {
- return TransformFlow<T>(std::move(value), ready_for_next);
-}
-
-template <typename T, typename V>
-using Transformer = std::function<Result<TransformFlow<V>>(T)>;
-
-template <typename T, typename V>
-class TransformIterator {
- public:
- explicit TransformIterator(Iterator<T> it, Transformer<T, V> transformer)
- : it_(std::move(it)),
- transformer_(std::move(transformer)),
- last_value_(),
- finished_() {}
-
- Result<V> Next() {
- while (!finished_) {
- ARROW_ASSIGN_OR_RAISE(util::optional<V> next, Pump());
- if (next.has_value()) {
- return std::move(*next);
- }
- ARROW_ASSIGN_OR_RAISE(last_value_, it_.Next());
- }
- return IterationTraits<V>::End();
- }
-
- private:
- // Calls the transform function on the current value. Can return in several ways
- // * If the next value is requested (e.g. skip) it will return an empty optional
- // * If an invalid status is encountered that will be returned
- // * If finished it will return IterationTraits<V>::End()
- // * If a value is returned by the transformer that will be returned
- Result<util::optional<V>> Pump() {
- if (!finished_ && last_value_.has_value()) {
- auto next_res = transformer_(*last_value_);
- if (!next_res.ok()) {
- finished_ = true;
- return next_res.status();
- }
- auto next = *next_res;
- if (next.ReadyForNext()) {
- if (IsIterationEnd(*last_value_)) {
- finished_ = true;
- }
- last_value_.reset();
- }
- if (next.Finished()) {
- finished_ = true;
- }
- if (next.HasValue()) {
- return next.Value();
- }
- }
- if (finished_) {
- return IterationTraits<V>::End();
- }
- return util::nullopt;
- }
-
- Iterator<T> it_;
- Transformer<T, V> transformer_;
- util::optional<T> last_value_;
- bool finished_ = false;
-};
-
-/// \brief Transforms an iterator according to a transformer, returning a new Iterator.
-///
-/// The transformer will be called on each element of the source iterator and for each
-/// call it can yield a value, skip, or finish the iteration. When yielding a value the
-/// transformer can choose to consume the source item (the default, ready_for_next = true)
-/// or to keep it and it will be called again on the same value.
-///
-/// This is essentially a more generic form of the map operation that can return 0, 1, or
-/// many values for each of the source items.
-///
-/// The transformer will be exposed to the end of the source sequence
-/// (IterationTraits::End) in case it needs to return some final item(s).
-///
-/// Any invalid status returned by the transformer will be returned immediately.
-template <typename T, typename V>
-Iterator<V> MakeTransformedIterator(Iterator<T> it, Transformer<T, V> op) {
- return Iterator<V>(TransformIterator<T, V>(std::move(it), std::move(op)));
-}
-
-template <typename T>
+struct TransformFlow {
+ using YieldValueType = T;
+
+ TransformFlow(YieldValueType value, bool ready_for_next)
+ : finished_(false),
+ ready_for_next_(ready_for_next),
+ yield_value_(std::move(value)) {}
+ TransformFlow(bool finished, bool ready_for_next)
+ : finished_(finished), ready_for_next_(ready_for_next), yield_value_() {}
+
+ bool HasValue() const { return yield_value_.has_value(); }
+ bool Finished() const { return finished_; }
+ bool ReadyForNext() const { return ready_for_next_; }
+ T Value() const { return *yield_value_; }
+
+ bool finished_ = false;
+ bool ready_for_next_ = false;
+ util::optional<YieldValueType> yield_value_;
+};
+
+struct TransformFinish {
+ template <typename T>
+ operator TransformFlow<T>() && { // NOLINT explicit
+ return TransformFlow<T>(true, true);
+ }
+};
+
+struct TransformSkip {
+ template <typename T>
+ operator TransformFlow<T>() && { // NOLINT explicit
+ return TransformFlow<T>(false, true);
+ }
+};
+
+template <typename T>
+TransformFlow<T> TransformYield(T value = {}, bool ready_for_next = true) {
+ return TransformFlow<T>(std::move(value), ready_for_next);
+}
+
+template <typename T, typename V>
+using Transformer = std::function<Result<TransformFlow<V>>(T)>;
+
+template <typename T, typename V>
+class TransformIterator {
+ public:
+ explicit TransformIterator(Iterator<T> it, Transformer<T, V> transformer)
+ : it_(std::move(it)),
+ transformer_(std::move(transformer)),
+ last_value_(),
+ finished_() {}
+
+ Result<V> Next() {
+ while (!finished_) {
+ ARROW_ASSIGN_OR_RAISE(util::optional<V> next, Pump());
+ if (next.has_value()) {
+ return std::move(*next);
+ }
+ ARROW_ASSIGN_OR_RAISE(last_value_, it_.Next());
+ }
+ return IterationTraits<V>::End();
+ }
+
+ private:
+ // Calls the transform function on the current value. Can return in several ways
+ // * If the next value is requested (e.g. skip) it will return an empty optional
+ // * If an invalid status is encountered that will be returned
+ // * If finished it will return IterationTraits<V>::End()
+ // * If a value is returned by the transformer that will be returned
+ Result<util::optional<V>> Pump() {
+ if (!finished_ && last_value_.has_value()) {
+ auto next_res = transformer_(*last_value_);
+ if (!next_res.ok()) {
+ finished_ = true;
+ return next_res.status();
+ }
+ auto next = *next_res;
+ if (next.ReadyForNext()) {
+ if (IsIterationEnd(*last_value_)) {
+ finished_ = true;
+ }
+ last_value_.reset();
+ }
+ if (next.Finished()) {
+ finished_ = true;
+ }
+ if (next.HasValue()) {
+ return next.Value();
+ }
+ }
+ if (finished_) {
+ return IterationTraits<V>::End();
+ }
+ return util::nullopt;
+ }
+
+ Iterator<T> it_;
+ Transformer<T, V> transformer_;
+ util::optional<T> last_value_;
+ bool finished_ = false;
+};
+
+/// \brief Transforms an iterator according to a transformer, returning a new Iterator.
+///
+/// The transformer will be called on each element of the source iterator and for each
+/// call it can yield a value, skip, or finish the iteration. When yielding a value the
+/// transformer can choose to consume the source item (the default, ready_for_next = true)
+/// or to keep it and it will be called again on the same value.
+///
+/// This is essentially a more generic form of the map operation that can return 0, 1, or
+/// many values for each of the source items.
+///
+/// The transformer will be exposed to the end of the source sequence
+/// (IterationTraits::End) in case it needs to return some final item(s).
+///
+/// Any invalid status returned by the transformer will be returned immediately.
+template <typename T, typename V>
+Iterator<V> MakeTransformedIterator(Iterator<T> it, Transformer<T, V> op) {
+ return Iterator<V>(TransformIterator<T, V>(std::move(it), std::move(op)));
+}
+
+template <typename T>
struct IterationTraits<Iterator<T>> {
// The end condition for an Iterator of Iterators is a default constructed (null)
// Iterator.
static Iterator<T> End() { return Iterator<T>(); }
- static bool IsEnd(const Iterator<T>& val) { return !val; }
+ static bool IsEnd(const Iterator<T>& val) { return !val; }
};
template <typename Fn, typename T>
@@ -427,7 +427,7 @@ class MapIterator {
Result<O> Next() {
ARROW_ASSIGN_OR_RAISE(I i, it_.Next());
- if (IsIterationEnd(i)) {
+ if (IsIterationEnd(i)) {
return IterationTraits<O>::End();
}
@@ -489,7 +489,7 @@ struct FilterIterator {
for (;;) {
ARROW_ASSIGN_OR_RAISE(From i, it_.Next());
- if (IsIterationEnd(i)) {
+ if (IsIterationEnd(i)) {
return IterationTraits<To>::End();
}
@@ -525,12 +525,12 @@ class FlattenIterator {
explicit FlattenIterator(Iterator<Iterator<T>> it) : parent_(std::move(it)) {}
Result<T> Next() {
- if (IsIterationEnd(child_)) {
+ if (IsIterationEnd(child_)) {
// Pop from parent's iterator.
ARROW_ASSIGN_OR_RAISE(child_, parent_.Next());
// Check if final iteration reached.
- if (IsIterationEnd(child_)) {
+ if (IsIterationEnd(child_)) {
return IterationTraits<T>::End();
}
@@ -539,7 +539,7 @@ class FlattenIterator {
// Pop from child_ and check for depletion.
ARROW_ASSIGN_OR_RAISE(T out, child_.Next());
- if (IsIterationEnd(out)) {
+ if (IsIterationEnd(out)) {
// Reset state such that we pop from parent on the recursive call
child_ = IterationTraits<Iterator<T>>::End();
@@ -559,10 +559,10 @@ Iterator<T> MakeFlattenIterator(Iterator<Iterator<T>> it) {
return Iterator<T>(FlattenIterator<T>(std::move(it)));
}
-template <typename Reader>
-Iterator<typename Reader::ValueType> MakeIteratorFromReader(
- const std::shared_ptr<Reader>& reader) {
- return MakeFunctionIterator([reader] { return reader->Next(); });
+template <typename Reader>
+Iterator<typename Reader::ValueType> MakeIteratorFromReader(
+ const std::shared_ptr<Reader>& reader) {
+ return MakeFunctionIterator([reader] { return reader->Next(); });
}
} // namespace arrow
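As a concrete instance of the transformer protocol restored above, the sketch below keeps only even values from a source iterator. Everything used here (TransformFlow, TransformYield, TransformSkip, TransformFinish, MakeTransformedIterator, and the util::optional end semantics) appears in this header:

#include "arrow/util/iterator.h"

using Item = arrow::util::optional<int>;

arrow::Iterator<Item> KeepEven(arrow::Iterator<Item> source) {
  arrow::Transformer<Item, Item> keep_even =
      [](Item v) -> arrow::Result<arrow::TransformFlow<Item>> {
    if (arrow::IsIterationEnd(v)) {
      return arrow::TransformFinish();  // end of the source sequence
    }
    if (*v % 2 != 0) {
      return arrow::TransformSkip();  // consume the item, emit nothing
    }
    return arrow::TransformYield(v);  // consume the item and emit it
  };
  return arrow::MakeTransformedIterator(std::move(source), std::move(keep_even));
}

Each source item is consumed exactly once here; yielding with ready_for_next = false instead would present the same item to the transformer again, which is how one source item can expand into several outputs.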
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc
index c4a3ac64aab..ad3b686a9bd 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.cc
@@ -70,11 +70,11 @@ KeyValueMetadata::KeyValueMetadata(std::vector<std::string> keys,
ARROW_CHECK_EQ(keys.size(), values.size());
}
-std::shared_ptr<KeyValueMetadata> KeyValueMetadata::Make(
- std::vector<std::string> keys, std::vector<std::string> values) {
- return std::make_shared<KeyValueMetadata>(std::move(keys), std::move(values));
-}
-
+std::shared_ptr<KeyValueMetadata> KeyValueMetadata::Make(
+ std::vector<std::string> keys, std::vector<std::string> values) {
+ return std::make_shared<KeyValueMetadata>(std::move(keys), std::move(values));
+}
+
void KeyValueMetadata::ToUnorderedMap(
std::unordered_map<std::string, std::string>* out) const {
DCHECK_NE(out, nullptr);
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h
index 9835b1739c7..d42ab78f667 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/key_value_metadata.h
@@ -39,9 +39,9 @@ class ARROW_EXPORT KeyValueMetadata {
explicit KeyValueMetadata(const std::unordered_map<std::string, std::string>& map);
virtual ~KeyValueMetadata() = default;
- static std::shared_ptr<KeyValueMetadata> Make(std::vector<std::string> keys,
- std::vector<std::string> values);
-
+ static std::shared_ptr<KeyValueMetadata> Make(std::vector<std::string> keys,
+ std::vector<std::string> values);
+
void ToUnorderedMap(std::unordered_map<std::string, std::string>* out) const;
void Append(const std::string& key, const std::string& value);
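Make() is a thin forwarding wrapper over std::make_shared, as the .cc hunk above shows; together with Append() and ToUnorderedMap() it covers the usual metadata round trip:

#include <string>
#include <unordered_map>
#include "arrow/util/key_value_metadata.h"

void MetadataRoundTrip() {
  auto md = arrow::KeyValueMetadata::Make({"compression", "origin"},
                                          {"zstd", "example"});
  md->Append("note", "hello");  // keys and values stay index-aligned
  std::unordered_map<std::string, std::string> map;
  md->ToUnorderedMap(&map);  // map now holds all three pairs
}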
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc
index 314b277a821..65359b44081 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.cc
@@ -24,31 +24,31 @@
#include <iostream>
#ifdef ARROW_USE_GLOG
-
+
#include <signal.h>
#include <vector>
-
+
#error #include "glog/logging.h"
-
-// Restore our versions of DCHECK and friends, as GLog defines its own
-#undef DCHECK
-#undef DCHECK_OK
-#undef DCHECK_EQ
-#undef DCHECK_NE
-#undef DCHECK_LE
-#undef DCHECK_LT
-#undef DCHECK_GE
-#undef DCHECK_GT
-
-#define DCHECK ARROW_DCHECK
-#define DCHECK_OK ARROW_DCHECK_OK
-#define DCHECK_EQ ARROW_DCHECK_EQ
-#define DCHECK_NE ARROW_DCHECK_NE
-#define DCHECK_LE ARROW_DCHECK_LE
-#define DCHECK_LT ARROW_DCHECK_LT
-#define DCHECK_GE ARROW_DCHECK_GE
-#define DCHECK_GT ARROW_DCHECK_GT
-
+
+// Restore our versions of DCHECK and friends, as GLog defines its own
+#undef DCHECK
+#undef DCHECK_OK
+#undef DCHECK_EQ
+#undef DCHECK_NE
+#undef DCHECK_LE
+#undef DCHECK_LT
+#undef DCHECK_GE
+#undef DCHECK_GT
+
+#define DCHECK ARROW_DCHECK
+#define DCHECK_OK ARROW_DCHECK_OK
+#define DCHECK_EQ ARROW_DCHECK_EQ
+#define DCHECK_NE ARROW_DCHECK_NE
+#define DCHECK_LE ARROW_DCHECK_LE
+#define DCHECK_LT ARROW_DCHECK_LT
+#define DCHECK_GE ARROW_DCHECK_GE
+#define DCHECK_GT ARROW_DCHECK_GT
+
#endif
namespace arrow {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h
index 286cca361b0..15a0188ab76 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/logging.h
@@ -92,33 +92,33 @@ enum class ArrowLogLevel : int {
// CAUTION: DCHECK_OK() always evaluates its argument, but other DCHECK*() macros
// only do so in debug mode.
-#define ARROW_DCHECK(condition) \
+#define ARROW_DCHECK(condition) \
while (false) ARROW_IGNORE_EXPR(condition); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_OK(s) \
- ARROW_IGNORE_EXPR(s); \
+#define ARROW_DCHECK_OK(s) \
+ ARROW_IGNORE_EXPR(s); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_EQ(val1, val2) \
+#define ARROW_DCHECK_EQ(val1, val2) \
while (false) ARROW_IGNORE_EXPR(val1); \
while (false) ARROW_IGNORE_EXPR(val2); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_NE(val1, val2) \
+#define ARROW_DCHECK_NE(val1, val2) \
while (false) ARROW_IGNORE_EXPR(val1); \
while (false) ARROW_IGNORE_EXPR(val2); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_LE(val1, val2) \
+#define ARROW_DCHECK_LE(val1, val2) \
while (false) ARROW_IGNORE_EXPR(val1); \
while (false) ARROW_IGNORE_EXPR(val2); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_LT(val1, val2) \
+#define ARROW_DCHECK_LT(val1, val2) \
while (false) ARROW_IGNORE_EXPR(val1); \
while (false) ARROW_IGNORE_EXPR(val2); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_GE(val1, val2) \
+#define ARROW_DCHECK_GE(val1, val2) \
while (false) ARROW_IGNORE_EXPR(val1); \
while (false) ARROW_IGNORE_EXPR(val2); \
while (false) ::arrow::util::detail::NullLog()
-#define ARROW_DCHECK_GT(val1, val2) \
+#define ARROW_DCHECK_GT(val1, val2) \
while (false) ARROW_IGNORE_EXPR(val1); \
while (false) ARROW_IGNORE_EXPR(val2); \
while (false) ::arrow::util::detail::NullLog()
@@ -126,26 +126,26 @@ enum class ArrowLogLevel : int {
#else
#define ARROW_DFATAL ::arrow::util::ArrowLogLevel::ARROW_FATAL
-#define ARROW_DCHECK ARROW_CHECK
-#define ARROW_DCHECK_OK ARROW_CHECK_OK
-#define ARROW_DCHECK_EQ ARROW_CHECK_EQ
-#define ARROW_DCHECK_NE ARROW_CHECK_NE
-#define ARROW_DCHECK_LE ARROW_CHECK_LE
-#define ARROW_DCHECK_LT ARROW_CHECK_LT
-#define ARROW_DCHECK_GE ARROW_CHECK_GE
-#define ARROW_DCHECK_GT ARROW_CHECK_GT
+#define ARROW_DCHECK ARROW_CHECK
+#define ARROW_DCHECK_OK ARROW_CHECK_OK
+#define ARROW_DCHECK_EQ ARROW_CHECK_EQ
+#define ARROW_DCHECK_NE ARROW_CHECK_NE
+#define ARROW_DCHECK_LE ARROW_CHECK_LE
+#define ARROW_DCHECK_LT ARROW_CHECK_LT
+#define ARROW_DCHECK_GE ARROW_CHECK_GE
+#define ARROW_DCHECK_GT ARROW_CHECK_GT
#endif // NDEBUG
-#define DCHECK ARROW_DCHECK
-#define DCHECK_OK ARROW_DCHECK_OK
-#define DCHECK_EQ ARROW_DCHECK_EQ
-#define DCHECK_NE ARROW_DCHECK_NE
-#define DCHECK_LE ARROW_DCHECK_LE
-#define DCHECK_LT ARROW_DCHECK_LT
-#define DCHECK_GE ARROW_DCHECK_GE
-#define DCHECK_GT ARROW_DCHECK_GT
-
+#define DCHECK ARROW_DCHECK
+#define DCHECK_OK ARROW_DCHECK_OK
+#define DCHECK_EQ ARROW_DCHECK_EQ
+#define DCHECK_NE ARROW_DCHECK_NE
+#define DCHECK_LE ARROW_DCHECK_LE
+#define DCHECK_LT ARROW_DCHECK_LT
+#define DCHECK_GE ARROW_DCHECK_GE
+#define DCHECK_GT ARROW_DCHECK_GT
+
// This code is adapted from
// https://github.com/ray-project/ray/blob/master/src/ray/util/logging.h.
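The CAUTION note above is worth a worked illustration: in release builds every DCHECK*() macro except DCHECK_OK() compiles its arguments away, so a side effect placed inside one silently vanishes. The names below (CloseThing, PopToken) are hypothetical:

#include "arrow/status.h"
#include "arrow/util/logging.h"

arrow::Status CloseThing();  // hypothetical, returns a Status
bool PopToken();             // hypothetical, has a side effect

void DcheckExample() {
  // Fine: DCHECK_OK always evaluates its argument, so CloseThing() runs
  // in every build.
  DCHECK_OK(CloseThing());

  // Bug in release builds: the condition is compiled out, so PopToken()
  // would never run:
  //   DCHECK(PopToken());
  // Hoist the side effect out of the macro instead:
  bool popped = PopToken();  // evaluated unconditionally
  DCHECK(popped);
}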
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h
index 3f665c01838..6c80be380ae 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/mutex.h
@@ -37,7 +37,7 @@ class ARROW_EXPORT Mutex {
/// A Guard is falsy if a lock could not be acquired.
class ARROW_EXPORT Guard {
public:
- Guard() : locked_(NULLPTR, [](Mutex* /* mutex */) {}) {}
+ Guard() : locked_(NULLPTR, [](Mutex* /* mutex */) {}) {}
Guard(Guard&&) = default;
Guard& operator=(Guard&&) = default;
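Because a default-constructed Guard holds no lock and a Guard is falsy when acquisition fails, the try-lock idiom reads naturally. A sketch, assuming the TryLock() member this Guard is paired with in the full header:

#include "arrow/util/mutex.h"

arrow::util::Mutex cache_mutex;

void MaybeRefreshCache() {
  if (auto guard = cache_mutex.TryLock()) {  // falsy: lock not acquired
    // ... critical section ...
  }  // lock released when guard goes out of scope
}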
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h
index b4858f0bf96..80f60fbdb36 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/parallel.h
@@ -21,9 +21,9 @@
#include <vector>
#include "arrow/status.h"
-#include "arrow/util/functional.h"
+#include "arrow/util/functional.h"
#include "arrow/util/thread_pool.h"
-#include "arrow/util/vector.h"
+#include "arrow/util/vector.h"
namespace arrow {
namespace internal {
@@ -32,12 +32,12 @@ namespace internal {
// arguments between 0 and `num_tasks - 1`, on an arbitrary number of threads.
template <class FUNCTION>
-Status ParallelFor(int num_tasks, FUNCTION&& func,
- Executor* executor = internal::GetCpuThreadPool()) {
- std::vector<Future<>> futures(num_tasks);
+Status ParallelFor(int num_tasks, FUNCTION&& func,
+ Executor* executor = internal::GetCpuThreadPool()) {
+ std::vector<Future<>> futures(num_tasks);
for (int i = 0; i < num_tasks; ++i) {
- ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i));
+ ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i));
}
auto st = Status::OK();
for (auto& fut : futures) {
@@ -46,30 +46,30 @@ Status ParallelFor(int num_tasks, FUNCTION&& func,
return st;
}
-template <class FUNCTION, typename T,
- typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
-Future<std::vector<R>> ParallelForAsync(
- std::vector<T> inputs, FUNCTION&& func,
- Executor* executor = internal::GetCpuThreadPool()) {
- std::vector<Future<R>> futures(inputs.size());
- for (size_t i = 0; i < inputs.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i, std::move(inputs[i])));
- }
- return All(std::move(futures))
- .Then([](const std::vector<Result<R>>& results) -> Result<std::vector<R>> {
- return UnwrapOrRaise(results);
- });
-}
-
+template <class FUNCTION, typename T,
+ typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
+Future<std::vector<R>> ParallelForAsync(
+ std::vector<T> inputs, FUNCTION&& func,
+ Executor* executor = internal::GetCpuThreadPool()) {
+ std::vector<Future<R>> futures(inputs.size());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i, std::move(inputs[i])));
+ }
+ return All(std::move(futures))
+ .Then([](const std::vector<Result<R>>& results) -> Result<std::vector<R>> {
+ return UnwrapOrRaise(results);
+ });
+}
+
// A parallelizer that takes a `Status(int)` function and calls it with
// arguments between 0 and `num_tasks - 1`, in sequence or in parallel,
// depending on the input boolean.
template <class FUNCTION>
-Status OptionalParallelFor(bool use_threads, int num_tasks, FUNCTION&& func,
- Executor* executor = internal::GetCpuThreadPool()) {
+Status OptionalParallelFor(bool use_threads, int num_tasks, FUNCTION&& func,
+ Executor* executor = internal::GetCpuThreadPool()) {
if (use_threads) {
- return ParallelFor(num_tasks, std::forward<FUNCTION>(func), executor);
+ return ParallelFor(num_tasks, std::forward<FUNCTION>(func), executor);
} else {
for (int i = 0; i < num_tasks; ++i) {
RETURN_NOT_OK(func(i));
@@ -78,25 +78,25 @@ Status OptionalParallelFor(bool use_threads, int num_tasks, FUNCTION&& func,
}
}
-// A parallelizer that takes a `Result<R>(int index, T item)` function and
-// calls it with each item from the input array, in sequence or in parallel,
-// depending on the input boolean.
-
-template <class FUNCTION, typename T,
- typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
-Future<std::vector<R>> OptionalParallelForAsync(
- bool use_threads, std::vector<T> inputs, FUNCTION&& func,
- Executor* executor = internal::GetCpuThreadPool()) {
- if (use_threads) {
- return ParallelForAsync(std::move(inputs), std::forward<FUNCTION>(func), executor);
- } else {
- std::vector<R> result(inputs.size());
- for (size_t i = 0; i < inputs.size(); ++i) {
- ARROW_ASSIGN_OR_RAISE(result[i], func(i, inputs[i]));
- }
- return result;
- }
-}
-
+// A parallelizer that takes a `Result<R>(int index, T item)` function and
+// calls it with each item from the input array, in sequence or in parallel,
+// depending on the input boolean.
+
+template <class FUNCTION, typename T,
+ typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
+Future<std::vector<R>> OptionalParallelForAsync(
+ bool use_threads, std::vector<T> inputs, FUNCTION&& func,
+ Executor* executor = internal::GetCpuThreadPool()) {
+ if (use_threads) {
+ return ParallelForAsync(std::move(inputs), std::forward<FUNCTION>(func), executor);
+ } else {
+ std::vector<R> result(inputs.size());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(result[i], func(i, inputs[i]));
+ }
+ return result;
+ }
+}
+
} // namespace internal
} // namespace arrow
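A usage sketch for the parallelizers above; OptionalParallelFor() keeps the call site identical whether the tasks run on the CPU thread pool or sequentially. ProcessChunk() is a stand-in for per-task work:

#include "arrow/status.h"
#include "arrow/util/parallel.h"

arrow::Status ProcessChunk(int /*i*/) {
  return arrow::Status::OK();  // stand-in for real per-chunk work
}

arrow::Status ProcessAll(bool use_threads, int num_chunks) {
  return arrow::internal::OptionalParallelFor(
      use_threads, num_chunks,
      [&](int i) -> arrow::Status { return ProcessChunk(i); });
}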
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h
index 677778774e3..6c71fa6e155 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/queue.h
@@ -1,29 +1,29 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/vendored/ProducerConsumerQueue.h"
-
-namespace arrow {
-namespace util {
-
-template <typename T>
-using SpscQueue = arrow_vendored::folly::ProducerConsumerQueue<T>;
-
-}
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/vendored/ProducerConsumerQueue.h"
+
+namespace arrow {
+namespace util {
+
+template <typename T>
+using SpscQueue = arrow_vendored::folly::ProducerConsumerQueue<T>;
+
+}
+} // namespace arrow
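SpscQueue is only an alias, so the vendored folly ProducerConsumerQueue API applies: capacity is fixed at construction, write() returns false when the queue is full, read() returns false when it is empty, and exactly one producer thread and one consumer thread may touch the queue. A sketch based on that API:

#include "arrow/util/queue.h"

void QueueSketch() {
  arrow::util::SpscQueue<int> queue(16);  // folly reserves one slot internally

  queue.write(42);  // producer side; false would mean the queue is full

  int value = 0;
  if (queue.read(value)) {  // consumer side; false means the queue is empty
    // use value
  }
}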
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h
index d9598a6eb34..0440a2eb563 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/reflection_internal.h
@@ -1,133 +1,133 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <string>
-#include <tuple>
-#include <utility>
-
-#include "arrow/type_traits.h"
-#include "arrow/util/string_view.h"
-
-namespace arrow {
-namespace internal {
-
-template <size_t...>
-struct index_sequence {};
-
-template <size_t N, size_t Head = N, size_t... Tail>
-struct make_index_sequence_impl;
-
-template <size_t N>
-using make_index_sequence = typename make_index_sequence_impl<N>::type;
-
-template <typename... T>
-using index_sequence_for = make_index_sequence<sizeof...(T)>;
-
-template <size_t N, size_t... I>
-struct make_index_sequence_impl<N, 0, I...> {
- using type = index_sequence<I...>;
-};
-
-template <size_t N, size_t H, size_t... I>
-struct make_index_sequence_impl : make_index_sequence_impl<N, H - 1, H - 1, I...> {};
-
-static_assert(std::is_same<index_sequence<>, make_index_sequence<0>>::value, "");
-static_assert(std::is_same<index_sequence<0, 1, 2>, make_index_sequence<3>>::value, "");
-
-template <typename...>
-struct all_same : std::true_type {};
-
-template <typename One>
-struct all_same<One> : std::true_type {};
-
-template <typename Same, typename... Rest>
-struct all_same<Same, Same, Rest...> : all_same<Same, Rest...> {};
-
-template <typename One, typename Other, typename... Rest>
-struct all_same<One, Other, Rest...> : std::false_type {};
-
-template <size_t... I, typename... T, typename Fn>
-void ForEachTupleMemberImpl(const std::tuple<T...>& tup, Fn&& fn, index_sequence<I...>) {
- (void)std::make_tuple((fn(std::get<I>(tup), I), std::ignore)...);
-}
-
-template <typename... T, typename Fn>
-void ForEachTupleMember(const std::tuple<T...>& tup, Fn&& fn) {
- ForEachTupleMemberImpl(tup, fn, index_sequence_for<T...>());
-}
-
-template <typename C, typename T>
-struct DataMemberProperty {
- using Class = C;
- using Type = T;
-
- constexpr const Type& get(const Class& obj) const { return obj.*ptr_; }
-
- void set(Class* obj, Type value) const { (*obj).*ptr_ = std::move(value); }
-
- constexpr util::string_view name() const { return name_; }
-
- util::string_view name_;
- Type Class::*ptr_;
-};
-
-template <typename Class, typename Type>
-constexpr DataMemberProperty<Class, Type> DataMember(util::string_view name,
- Type Class::*ptr) {
- return {name, ptr};
-}
-
-template <typename... Properties>
-struct PropertyTuple {
- template <typename Fn>
- void ForEach(Fn&& fn) const {
- ForEachTupleMember(props_, fn);
- }
-
- static_assert(all_same<typename Properties::Class...>::value,
- "All properties must be properties of the same class");
-
- size_t size() const { return sizeof...(Properties); }
-
- std::tuple<Properties...> props_;
-};
-
-template <typename... Properties>
-PropertyTuple<Properties...> MakeProperties(Properties... props) {
- return {std::make_tuple(props...)};
-}
-
-template <typename Enum>
-struct EnumTraits {};
-
-template <typename Enum, Enum... Values>
-struct BasicEnumTraits {
- using CType = typename std::underlying_type<Enum>::type;
- using Type = typename CTypeTraits<CType>::ArrowType;
- static std::array<Enum, sizeof...(Values)> values() { return {Values...}; }
-};
-
-template <typename T, typename Enable = void>
-struct has_enum_traits : std::false_type {};
-
-template <typename T>
-struct has_enum_traits<T, void_t<typename EnumTraits<T>::Type>> : std::true_type {};
-
-} // namespace internal
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <tuple>
+#include <utility>
+
+#include "arrow/type_traits.h"
+#include "arrow/util/string_view.h"
+
+namespace arrow {
+namespace internal {
+
+template <size_t...>
+struct index_sequence {};
+
+template <size_t N, size_t Head = N, size_t... Tail>
+struct make_index_sequence_impl;
+
+template <size_t N>
+using make_index_sequence = typename make_index_sequence_impl<N>::type;
+
+template <typename... T>
+using index_sequence_for = make_index_sequence<sizeof...(T)>;
+
+template <size_t N, size_t... I>
+struct make_index_sequence_impl<N, 0, I...> {
+ using type = index_sequence<I...>;
+};
+
+template <size_t N, size_t H, size_t... I>
+struct make_index_sequence_impl : make_index_sequence_impl<N, H - 1, H - 1, I...> {};
+
+static_assert(std::is_same<index_sequence<>, make_index_sequence<0>>::value, "");
+static_assert(std::is_same<index_sequence<0, 1, 2>, make_index_sequence<3>>::value, "");
+
+template <typename...>
+struct all_same : std::true_type {};
+
+template <typename One>
+struct all_same<One> : std::true_type {};
+
+template <typename Same, typename... Rest>
+struct all_same<Same, Same, Rest...> : all_same<Same, Rest...> {};
+
+template <typename One, typename Other, typename... Rest>
+struct all_same<One, Other, Rest...> : std::false_type {};
+
+template <size_t... I, typename... T, typename Fn>
+void ForEachTupleMemberImpl(const std::tuple<T...>& tup, Fn&& fn, index_sequence<I...>) {
+ (void)std::make_tuple((fn(std::get<I>(tup), I), std::ignore)...);
+}
+
+template <typename... T, typename Fn>
+void ForEachTupleMember(const std::tuple<T...>& tup, Fn&& fn) {
+ ForEachTupleMemberImpl(tup, fn, index_sequence_for<T...>());
+}
+
+template <typename C, typename T>
+struct DataMemberProperty {
+ using Class = C;
+ using Type = T;
+
+ constexpr const Type& get(const Class& obj) const { return obj.*ptr_; }
+
+ void set(Class* obj, Type value) const { (*obj).*ptr_ = std::move(value); }
+
+ constexpr util::string_view name() const { return name_; }
+
+ util::string_view name_;
+ Type Class::*ptr_;
+};
+
+template <typename Class, typename Type>
+constexpr DataMemberProperty<Class, Type> DataMember(util::string_view name,
+ Type Class::*ptr) {
+ return {name, ptr};
+}
+
+template <typename... Properties>
+struct PropertyTuple {
+ template <typename Fn>
+ void ForEach(Fn&& fn) const {
+ ForEachTupleMember(props_, fn);
+ }
+
+ static_assert(all_same<typename Properties::Class...>::value,
+ "All properties must be properties of the same class");
+
+ size_t size() const { return sizeof...(Properties); }
+
+ std::tuple<Properties...> props_;
+};
+
+template <typename... Properties>
+PropertyTuple<Properties...> MakeProperties(Properties... props) {
+ return {std::make_tuple(props...)};
+}
+
+template <typename Enum>
+struct EnumTraits {};
+
+template <typename Enum, Enum... Values>
+struct BasicEnumTraits {
+ using CType = typename std::underlying_type<Enum>::type;
+ using Type = typename CTypeTraits<CType>::ArrowType;
+ static std::array<Enum, sizeof...(Values)> values() { return {Values...}; }
+};
+
+template <typename T, typename Enable = void>
+struct has_enum_traits : std::false_type {};
+
+template <typename T>
+struct has_enum_traits<T, void_t<typename EnumTraits<T>::Type>> : std::true_type {};
+
+} // namespace internal
+} // namespace arrow
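The helpers restored above compose into compile-time property lists. The sketch below uses only DataMember(), MakeProperties(), and PropertyTuple::ForEach() as defined in this header:

#include <iostream>
#include <string>
#include "arrow/util/reflection_internal.h"

struct Point {
  int x;
  int y;
};

static const auto kPointProperties = arrow::internal::MakeProperties(
    arrow::internal::DataMember("x", &Point::x),
    arrow::internal::DataMember("y", &Point::y));

void PrintPoint(const Point& p) {
  // ForEach passes each property plus its index to the callback.
  kPointProperties.ForEach([&](const auto& prop, size_t i) {
    std::cout << i << ": "
              << std::string(prop.name().data(), prop.name().size())
              << " = " << prop.get(p) << "\n";
  });
}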
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h
index cf13264e41e..68d29930666 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/rle_encoding.h
@@ -1,826 +1,826 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Imported from Apache Impala (incubating) on 2016-01-29 and modified for use
-// in parquet-cpp, Arrow
-
-#pragma once
-
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <vector>
-
-#include "arrow/util/bit_block_counter.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/bit_stream_utils.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/macros.h"
-
-namespace arrow {
-namespace util {
-
-/// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs
-/// are sufficiently long, RLE is used, otherwise, the values are just bit-packed
-/// (literal encoding).
-/// For both types of runs, there is a byte-aligned indicator which encodes the length
-/// of the run and the type of the run.
-/// This encoding has the benefit that when there aren't any long enough runs, values
-/// are always decoded at fixed (can be precomputed) bit offsets OR both the value and
-/// the run length are byte aligned. This allows for very efficient decoding
-/// implementations.
-/// The encoding is:
-/// encoded-block := run*
-/// run := literal-run | repeated-run
-/// literal-run := literal-indicator < literal bytes >
-/// repeated-run := repeated-indicator < repeated value. padded to byte boundary >
-/// literal-indicator := varint_encode( number_of_groups << 1 | 1)
-/// repeated-indicator := varint_encode( number_of_repetitions << 1 )
-//
-/// Each run is preceded by a varint. The varint's least significant bit is
-/// used to indicate whether the run is a literal run or a repeated run. The rest
-/// of the varint is used to determine the length of the run (eg how many times the
-/// value repeats).
-//
-/// In the case of literal runs, the run length is always a multiple of 8 (i.e. encoded
-/// in groups of 8), so that no matter the bit-width of the value, the sequence will end
-/// on a byte boundary without padding.
-/// Given that we know it is a multiple of 8, we store the number of 8-groups rather than
-/// the actual number of encoded ints. (This means that the total number of encoded values
-/// cannot be determined from the encoded data, since the number of values in the last
-/// group may not be a multiple of 8). For the last group of literal runs, we pad
-/// the group to 8 with zeros. This allows for 8 at a time decoding on the read side
-/// without the need for additional checks.
-//
-/// There is a break-even point when it is more storage efficient to do run length
-/// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes
-/// for either the repeated encoding or the literal encoding. This value can always
-/// be computed based on the bit-width.
-/// TODO: think about how to use this for strings. The bit packing isn't quite the same.
-//
-/// Examples with bit-width 1 (eg encoding booleans):
-/// ----------------------------------------
-/// 100 1s followed by 100 0s:
-/// <varint(100 << 1)> <1, padded to 1 byte> <varint(100 << 1)> <0, padded to 1 byte>
-/// - (total 4 bytes)
-//
-/// alternating 1s and 0s (200 total):
-/// 200 ints = 25 groups of 8
-/// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
-/// (total 26 bytes, 1 byte overhead)
-//
-
-/// Decoder class for RLE encoded data.
-class RleDecoder {
- public:
- /// Create a decoder object. buffer/buffer_len is the decoded data.
- /// bit_width is the width of each value (before encoding).
- RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
- : bit_reader_(buffer, buffer_len),
- bit_width_(bit_width),
- current_value_(0),
- repeat_count_(0),
- literal_count_(0) {
- DCHECK_GE(bit_width_, 0);
- DCHECK_LE(bit_width_, 64);
- }
-
- RleDecoder() : bit_width_(-1) {}
-
- void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
- DCHECK_GE(bit_width, 0);
- DCHECK_LE(bit_width, 64);
- bit_reader_.Reset(buffer, buffer_len);
- bit_width_ = bit_width;
- current_value_ = 0;
- repeat_count_ = 0;
- literal_count_ = 0;
- }
-
- /// Gets the next value. Returns false if there are no more.
- template <typename T>
- bool Get(T* val);
-
- /// Gets a batch of values. Returns the number of decoded elements.
- template <typename T>
- int GetBatch(T* values, int batch_size);
-
- /// Like GetBatch but add spacing for null entries
- template <typename T>
- int GetBatchSpaced(int batch_size, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset, T* out);
-
- /// Like GetBatch but the values are then decoded using the provided dictionary
- template <typename T>
- int GetBatchWithDict(const T* dictionary, int32_t dictionary_length, T* values,
- int batch_size);
-
- /// Like GetBatchWithDict but add spacing for null entries
- ///
- /// Null entries will be zero-initialized in `values` to avoid leaking
- /// private data.
- template <typename T>
- int GetBatchWithDictSpaced(const T* dictionary, int32_t dictionary_length, T* values,
- int batch_size, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset);
-
- protected:
- BitUtil::BitReader bit_reader_;
- /// Number of bits needed to encode the value. Must be between 0 and 64.
- int bit_width_;
- uint64_t current_value_;
- int32_t repeat_count_;
- int32_t literal_count_;
-
- private:
- /// Fills literal_count_ and repeat_count_ with next values. Returns false if there
- /// are no more.
- template <typename T>
- bool NextCounts();
-
- /// Utility methods for retrieving spaced values.
- template <typename T, typename RunType, typename Converter>
- int GetSpaced(Converter converter, int batch_size, int null_count,
- const uint8_t* valid_bits, int64_t valid_bits_offset, T* out);
-};
-
-/// Class to incrementally build the RLE data. This class does not allocate any memory.
-/// The encoding has two modes: encoding repeated runs and literal runs.
-/// If the run is sufficiently short, it is more efficient to encode as a literal run.
-/// This class does so by buffering 8 values at a time. If they are not all the same
-/// they are added to the literal run. If they are the same, they are added to the
-/// repeated run. When we switch modes, the previous run is flushed out.
-class RleEncoder {
- public:
- /// buffer/buffer_len: preallocated output buffer.
- /// bit_width: max number of bits for value.
- /// TODO: consider adding a min_repeated_run_length so the caller can control
- /// when values should be encoded as repeated runs. Currently this is derived
- /// based on the bit_width, which can determine a storage optimal choice.
- /// TODO: allow 0 bit_width (and have dict encoder use it)
- RleEncoder(uint8_t* buffer, int buffer_len, int bit_width)
- : bit_width_(bit_width), bit_writer_(buffer, buffer_len) {
- DCHECK_GE(bit_width_, 0);
- DCHECK_LE(bit_width_, 64);
- max_run_byte_size_ = MinBufferSize(bit_width);
- DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough.";
- Clear();
- }
-
-  /// Returns the minimum buffer size needed to use the encoder for 'bit_width'.
-  /// This is the maximum length of a single run for 'bit_width'.
-  /// It is not valid to pass a buffer shorter than this length.
- static int MinBufferSize(int bit_width) {
- /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values.
- int max_literal_run_size =
- 1 +
- static_cast<int>(BitUtil::BytesForBits(MAX_VALUES_PER_LITERAL_RUN * bit_width));
- /// Up to kMaxVlqByteLength indicator and a single 'bit_width' value.
- int max_repeated_run_size = BitUtil::BitReader::kMaxVlqByteLength +
- static_cast<int>(BitUtil::BytesForBits(bit_width));
- return std::max(max_literal_run_size, max_repeated_run_size);
- }
-
- /// Returns the maximum byte size it could take to encode 'num_values'.
- static int MaxBufferSize(int bit_width, int num_values) {
- // For a bit_width > 1, the worst case is the repetition of "literal run of length 8
- // and then a repeated run of length 8".
- // 8 values per smallest run, 8 bits per byte
- int bytes_per_run = bit_width;
- int num_runs = static_cast<int>(BitUtil::CeilDiv(num_values, 8));
- int literal_max_size = num_runs + num_runs * bytes_per_run;
-
- // In the very worst case scenario, the data is a concatenation of repeated
- // runs of 8 values. Repeated run has a 1 byte varint followed by the
- // bit-packed repeated value
- int min_repeated_run_size = 1 + static_cast<int>(BitUtil::BytesForBits(bit_width));
- int repeated_max_size =
- static_cast<int>(BitUtil::CeilDiv(num_values, 8)) * min_repeated_run_size;
-
- return std::max(literal_max_size, repeated_max_size);
- }
-
- /// Encode value. Returns true if the value fits in buffer, false otherwise.
- /// This value must be representable with bit_width_ bits.
- bool Put(uint64_t value);
-
- /// Flushes any pending values to the underlying buffer.
- /// Returns the total number of bytes written
- int Flush();
-
- /// Resets all the state in the encoder.
- void Clear();
-
- /// Returns pointer to underlying buffer
- uint8_t* buffer() { return bit_writer_.buffer(); }
- int32_t len() { return bit_writer_.bytes_written(); }
-
- private:
- /// Flushes any buffered values. If this is part of a repeated run, this is largely
- /// a no-op.
- /// If it is part of a literal run, this will call FlushLiteralRun, which writes
- /// out the buffered literal values.
-  /// If 'done' is true, the current run is written even if it would normally
-  /// have been buffered further. This should only be done at the end, once the
-  /// encoder has received all values.
- void FlushBufferedValues(bool done);
-
- /// Flushes literal values to the underlying buffer. If update_indicator_byte,
- /// then the current literal run is complete and the indicator byte is updated.
- void FlushLiteralRun(bool update_indicator_byte);
-
- /// Flushes a repeated run to the underlying buffer.
- void FlushRepeatedRun();
-
- /// Checks and sets buffer_full_. This must be called after flushing a run to
- /// make sure there are enough bytes remaining to encode the next run.
- void CheckBufferFull();
-
- /// The maximum number of values in a single literal run
- /// (number of groups encodable by a 1-byte indicator * 8)
- static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8;
-
- /// Number of bits needed to encode the value. Must be between 0 and 64.
- const int bit_width_;
-
- /// Underlying buffer.
- BitUtil::BitWriter bit_writer_;
-
-  /// If true, the buffer is full and subsequent calls to Put() will fail.
- bool buffer_full_;
-
- /// The maximum byte size a single run can take.
- int max_run_byte_size_;
-
- /// We need to buffer at most 8 values for literals. This happens when the
- /// bit_width is 1 (so 8 values fit in one byte).
- /// TODO: generalize this to other bit widths
- int64_t buffered_values_[8];
-
- /// Number of values in buffered_values_
- int num_buffered_values_;
-
- /// The current (also last) value that was written and the count of how
- /// many times in a row that value has been seen. This is maintained even
-  /// if we are in a literal run. If the repeat_count_ gets high enough, we switch
- /// to encoding repeated runs.
- uint64_t current_value_;
- int repeat_count_;
-
- /// Number of literals in the current run. This does not include the literals
-  /// that might be in buffered_values_. Only once we have a full group can we
-  /// decide whether they should be part of the literal_count_ or repeat_count_.
- int literal_count_;
-
- /// Pointer to a byte in the underlying buffer that stores the indicator byte.
- /// This is reserved as soon as we need a literal run but the value is written
- /// when the literal run is complete.
- uint8_t* literal_indicator_byte_;
-};
-
-template <typename T>
-inline bool RleDecoder::Get(T* val) {
- return GetBatch(val, 1) == 1;
-}
-
-template <typename T>
-inline int RleDecoder::GetBatch(T* values, int batch_size) {
- DCHECK_GE(bit_width_, 0);
- int values_read = 0;
-
- auto* out = values;
-
- while (values_read < batch_size) {
- int remaining = batch_size - values_read;
-
- if (repeat_count_ > 0) { // Repeated value case.
- int repeat_batch = std::min(remaining, repeat_count_);
- std::fill(out, out + repeat_batch, static_cast<T>(current_value_));
-
- repeat_count_ -= repeat_batch;
- values_read += repeat_batch;
- out += repeat_batch;
- } else if (literal_count_ > 0) {
- int literal_batch = std::min(remaining, literal_count_);
- int actual_read = bit_reader_.GetBatch(bit_width_, out, literal_batch);
- if (actual_read != literal_batch) {
- return values_read;
- }
-
- literal_count_ -= literal_batch;
- values_read += literal_batch;
- out += literal_batch;
- } else {
- if (!NextCounts<T>()) return values_read;
- }
- }
-
- return values_read;
-}
-
-template <typename T, typename RunType, typename Converter>
-inline int RleDecoder::GetSpaced(Converter converter, int batch_size, int null_count,
- const uint8_t* valid_bits, int64_t valid_bits_offset,
- T* out) {
- if (ARROW_PREDICT_FALSE(null_count == batch_size)) {
- converter.FillZero(out, out + batch_size);
- return batch_size;
- }
-
- DCHECK_GE(bit_width_, 0);
- int values_read = 0;
- int values_remaining = batch_size - null_count;
-
- // Assume no bits to start.
- arrow::internal::BitRunReader bit_reader(valid_bits, valid_bits_offset,
- /*length=*/batch_size);
- arrow::internal::BitRun valid_run = bit_reader.NextRun();
- while (values_read < batch_size) {
- if (ARROW_PREDICT_FALSE(valid_run.length == 0)) {
- valid_run = bit_reader.NextRun();
- }
-
- DCHECK_GT(batch_size, 0);
- DCHECK_GT(valid_run.length, 0);
-
- if (valid_run.set) {
- if ((repeat_count_ == 0) && (literal_count_ == 0)) {
- if (!NextCounts<RunType>()) return values_read;
- DCHECK((repeat_count_ > 0) ^ (literal_count_ > 0));
- }
-
- if (repeat_count_ > 0) {
- int repeat_batch = 0;
-      // Consume the entire repeat count, incrementing repeat_batch so that it
-      // holds the total of nulls + values consumed. Only the total count is
-      // needed, because the same value is filled in for nulls and non-nulls
-      // alike. This proves to be a big efficiency win.
- while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) {
- DCHECK_GT(valid_run.length, 0);
- if (valid_run.set) {
- int update_size = std::min(static_cast<int>(valid_run.length), repeat_count_);
- repeat_count_ -= update_size;
- repeat_batch += update_size;
- valid_run.length -= update_size;
- values_remaining -= update_size;
- } else {
- // We can consume all nulls here because we would do so on
-          // the next loop anyway.
- repeat_batch += static_cast<int>(valid_run.length);
- valid_run.length = 0;
- }
- if (valid_run.length == 0) {
- valid_run = bit_reader.NextRun();
- }
- }
- RunType current_value = static_cast<RunType>(current_value_);
- if (ARROW_PREDICT_FALSE(!converter.IsValid(current_value))) {
- return values_read;
- }
- converter.Fill(out, out + repeat_batch, current_value);
- out += repeat_batch;
- values_read += repeat_batch;
- } else if (literal_count_ > 0) {
- int literal_batch = std::min(values_remaining, literal_count_);
- DCHECK_GT(literal_batch, 0);
-
- // Decode the literals
- constexpr int kBufferSize = 1024;
- RunType indices[kBufferSize];
- literal_batch = std::min(literal_batch, kBufferSize);
- int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch);
- if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) {
- return values_read;
- }
- if (!converter.IsValid(indices, /*length=*/actual_read)) {
- return values_read;
- }
- int skipped = 0;
- int literals_read = 0;
- while (literals_read < literal_batch) {
- if (valid_run.set) {
- int update_size = std::min(literal_batch - literals_read,
- static_cast<int>(valid_run.length));
- converter.Copy(out, indices + literals_read, update_size);
- literals_read += update_size;
- out += update_size;
- valid_run.length -= update_size;
- } else {
- converter.FillZero(out, out + valid_run.length);
- out += valid_run.length;
- skipped += static_cast<int>(valid_run.length);
- valid_run.length = 0;
- }
- if (valid_run.length == 0) {
- valid_run = bit_reader.NextRun();
- }
- }
- literal_count_ -= literal_batch;
- values_remaining -= literal_batch;
- values_read += literal_batch + skipped;
- }
- } else {
- converter.FillZero(out, out + valid_run.length);
- out += valid_run.length;
- values_read += static_cast<int>(valid_run.length);
- valid_run.length = 0;
- }
- }
- DCHECK_EQ(valid_run.length, 0);
- DCHECK_EQ(values_remaining, 0);
- return values_read;
-}
-
-// Converter for GetSpaced that handles runs that get returned
-// directly as output.
-template <typename T>
-struct PlainRleConverter {
- T kZero = {};
- inline bool IsValid(const T& values) const { return true; }
- inline bool IsValid(const T* values, int32_t length) const { return true; }
- inline void Fill(T* begin, T* end, const T& run_value) const {
- std::fill(begin, end, run_value);
- }
- inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); }
- inline void Copy(T* out, const T* values, int length) const {
- std::memcpy(out, values, length * sizeof(T));
- }
-};
-
-template <typename T>
-inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count,
- const uint8_t* valid_bits,
- int64_t valid_bits_offset, T* out) {
- if (null_count == 0) {
- return GetBatch<T>(out, batch_size);
- }
-
- PlainRleConverter<T> converter;
- arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset,
- batch_size);
-
- int total_processed = 0;
- int processed = 0;
- arrow::internal::BitBlockCount block;
-
- do {
- block = block_counter.NextFourWords();
- if (block.length == 0) {
- break;
- }
- if (block.AllSet()) {
- processed = GetBatch<T>(out, block.length);
- } else if (block.NoneSet()) {
- converter.FillZero(out, out + block.length);
- processed = block.length;
- } else {
- processed = GetSpaced<T, /*RunType=*/T, PlainRleConverter<T>>(
- converter, block.length, block.length - block.popcount, valid_bits,
- valid_bits_offset, out);
- }
- total_processed += processed;
- out += block.length;
- valid_bits_offset += block.length;
- } while (processed == block.length);
- return total_processed;
-}
-
-static inline bool IndexInRange(int32_t idx, int32_t dictionary_length) {
- return idx >= 0 && idx < dictionary_length;
-}
-
-// Converter for GetSpaced that handles runs of returned dictionary
-// indices.
-template <typename T>
-struct DictionaryConverter {
- T kZero = {};
- const T* dictionary;
- int32_t dictionary_length;
-
- inline bool IsValid(int32_t value) { return IndexInRange(value, dictionary_length); }
-
- inline bool IsValid(const int32_t* values, int32_t length) const {
- using IndexType = int32_t;
- IndexType min_index = std::numeric_limits<IndexType>::max();
- IndexType max_index = std::numeric_limits<IndexType>::min();
- for (int x = 0; x < length; x++) {
- min_index = std::min(values[x], min_index);
- max_index = std::max(values[x], max_index);
- }
-
- return IndexInRange(min_index, dictionary_length) &&
- IndexInRange(max_index, dictionary_length);
- }
- inline void Fill(T* begin, T* end, const int32_t& run_value) const {
- std::fill(begin, end, dictionary[run_value]);
- }
- inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); }
-
- inline void Copy(T* out, const int32_t* values, int length) const {
- for (int x = 0; x < length; x++) {
- out[x] = dictionary[values[x]];
- }
- }
-};
-
-template <typename T>
-inline int RleDecoder::GetBatchWithDict(const T* dictionary, int32_t dictionary_length,
- T* values, int batch_size) {
- // Per https://github.com/apache/parquet-format/blob/master/Encodings.md,
- // the maximum dictionary index width in Parquet is 32 bits.
- using IndexType = int32_t;
- DictionaryConverter<T> converter;
- converter.dictionary = dictionary;
- converter.dictionary_length = dictionary_length;
-
- DCHECK_GE(bit_width_, 0);
- int values_read = 0;
-
- auto* out = values;
-
- while (values_read < batch_size) {
- int remaining = batch_size - values_read;
-
- if (repeat_count_ > 0) {
- auto idx = static_cast<IndexType>(current_value_);
- if (ARROW_PREDICT_FALSE(!IndexInRange(idx, dictionary_length))) {
- return values_read;
- }
- T val = dictionary[idx];
-
- int repeat_batch = std::min(remaining, repeat_count_);
- std::fill(out, out + repeat_batch, val);
-
- /* Upkeep counters */
- repeat_count_ -= repeat_batch;
- values_read += repeat_batch;
- out += repeat_batch;
- } else if (literal_count_ > 0) {
- constexpr int kBufferSize = 1024;
- IndexType indices[kBufferSize];
-
- int literal_batch = std::min(remaining, literal_count_);
- literal_batch = std::min(literal_batch, kBufferSize);
-
- int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch);
- if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) {
- return values_read;
- }
- if (ARROW_PREDICT_FALSE(!converter.IsValid(indices, /*length=*/literal_batch))) {
- return values_read;
- }
- converter.Copy(out, indices, literal_batch);
-
- /* Upkeep counters */
- literal_count_ -= literal_batch;
- values_read += literal_batch;
- out += literal_batch;
- } else {
- if (!NextCounts<IndexType>()) return values_read;
- }
- }
-
- return values_read;
-}
-
-template <typename T>
-inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary,
- int32_t dictionary_length, T* out,
- int batch_size, int null_count,
- const uint8_t* valid_bits,
- int64_t valid_bits_offset) {
- if (null_count == 0) {
- return GetBatchWithDict<T>(dictionary, dictionary_length, out, batch_size);
- }
- arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset,
- batch_size);
- using IndexType = int32_t;
- DictionaryConverter<T> converter;
- converter.dictionary = dictionary;
- converter.dictionary_length = dictionary_length;
-
- int total_processed = 0;
- int processed = 0;
- arrow::internal::BitBlockCount block;
- do {
- block = block_counter.NextFourWords();
- if (block.length == 0) {
- break;
- }
- if (block.AllSet()) {
- processed = GetBatchWithDict<T>(dictionary, dictionary_length, out, block.length);
- } else if (block.NoneSet()) {
- converter.FillZero(out, out + block.length);
- processed = block.length;
- } else {
- processed = GetSpaced<T, /*RunType=*/IndexType, DictionaryConverter<T>>(
- converter, block.length, block.length - block.popcount, valid_bits,
- valid_bits_offset, out);
- }
- total_processed += processed;
- out += block.length;
- valid_bits_offset += block.length;
- } while (processed == block.length);
- return total_processed;
-}
-
-template <typename T>
-bool RleDecoder::NextCounts() {
-  // Read the next run's indicator int; it could start a literal or a repeated run.
- // The int is encoded as a vlq-encoded value.
- uint32_t indicator_value = 0;
- if (!bit_reader_.GetVlqInt(&indicator_value)) return false;
-
- // lsb indicates if it is a literal run or repeated run
- bool is_literal = indicator_value & 1;
- uint32_t count = indicator_value >> 1;
- if (is_literal) {
- if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast<uint32_t>(INT32_MAX) / 8)) {
- return false;
- }
- literal_count_ = count * 8;
- } else {
- if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast<uint32_t>(INT32_MAX))) {
- return false;
- }
- repeat_count_ = count;
- T value = {};
- if (!bit_reader_.GetAligned<T>(static_cast<int>(BitUtil::CeilDiv(bit_width_, 8)),
- &value)) {
- return false;
- }
- current_value_ = static_cast<uint64_t>(value);
- }
- return true;
-}
-
-/// This function buffers input values 8 at a time. After seeing all 8 values,
-/// it decides whether they should be encoded as a literal or repeated run.
-inline bool RleEncoder::Put(uint64_t value) {
- DCHECK(bit_width_ == 64 || value < (1ULL << bit_width_));
- if (ARROW_PREDICT_FALSE(buffer_full_)) return false;
-
- if (ARROW_PREDICT_TRUE(current_value_ == value)) {
- ++repeat_count_;
- if (repeat_count_ > 8) {
- // This is just a continuation of the current run, no need to buffer the
- // values.
- // Note that this is the fast path for long repeated runs.
- return true;
- }
- } else {
- if (repeat_count_ >= 8) {
- // We had a run that was long enough but it has ended. Flush the
- // current repeated run.
- DCHECK_EQ(literal_count_, 0);
- FlushRepeatedRun();
- }
- repeat_count_ = 1;
- current_value_ = value;
- }
-
- buffered_values_[num_buffered_values_] = value;
- if (++num_buffered_values_ == 8) {
- DCHECK_EQ(literal_count_ % 8, 0);
- FlushBufferedValues(false);
- }
- return true;
-}
-
-inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) {
- if (literal_indicator_byte_ == NULL) {
- // The literal indicator byte has not been reserved yet, get one now.
- literal_indicator_byte_ = bit_writer_.GetNextBytePtr();
- DCHECK(literal_indicator_byte_ != NULL);
- }
-
- // Write all the buffered values as bit packed literals
- for (int i = 0; i < num_buffered_values_; ++i) {
- bool success = bit_writer_.PutValue(buffered_values_[i], bit_width_);
- DCHECK(success) << "There is a bug in using CheckBufferFull()";
- }
- num_buffered_values_ = 0;
-
- if (update_indicator_byte) {
- // At this point we need to write the indicator byte for the literal run.
- // We only reserve one byte, to allow for streaming writes of literal values.
- // The logic makes sure we flush literal runs often enough to not overrun
- // the 1 byte.
- DCHECK_EQ(literal_count_ % 8, 0);
- int num_groups = literal_count_ / 8;
- int32_t indicator_value = (num_groups << 1) | 1;
- DCHECK_EQ(indicator_value & 0xFFFFFF00, 0);
- *literal_indicator_byte_ = static_cast<uint8_t>(indicator_value);
- literal_indicator_byte_ = NULL;
- literal_count_ = 0;
- CheckBufferFull();
- }
-}
-
-inline void RleEncoder::FlushRepeatedRun() {
- DCHECK_GT(repeat_count_, 0);
- bool result = true;
- // The lsb of 0 indicates this is a repeated run
- int32_t indicator_value = repeat_count_ << 1 | 0;
- result &= bit_writer_.PutVlqInt(indicator_value);
- result &= bit_writer_.PutAligned(current_value_,
- static_cast<int>(BitUtil::CeilDiv(bit_width_, 8)));
- DCHECK(result);
- num_buffered_values_ = 0;
- repeat_count_ = 0;
- CheckBufferFull();
-}
-
-/// Flush the values that have been buffered. At this point we decide whether
-/// we need to switch between the run types or continue the current one.
-inline void RleEncoder::FlushBufferedValues(bool done) {
- if (repeat_count_ >= 8) {
- // Clear the buffered values. They are part of the repeated run now and we
- // don't want to flush them out as literals.
- num_buffered_values_ = 0;
- if (literal_count_ != 0) {
- // There was a current literal run. All the values in it have been flushed
- // but we still need to update the indicator byte.
- DCHECK_EQ(literal_count_ % 8, 0);
- DCHECK_EQ(repeat_count_, 8);
- FlushLiteralRun(true);
- }
- DCHECK_EQ(literal_count_, 0);
- return;
- }
-
- literal_count_ += num_buffered_values_;
- DCHECK_EQ(literal_count_ % 8, 0);
- int num_groups = literal_count_ / 8;
- if (num_groups + 1 >= (1 << 6)) {
- // We need to start a new literal run because the indicator byte we've reserved
- // cannot store more values.
- DCHECK(literal_indicator_byte_ != NULL);
- FlushLiteralRun(true);
- } else {
- FlushLiteralRun(done);
- }
- repeat_count_ = 0;
-}
-
-inline int RleEncoder::Flush() {
- if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) {
- bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ ||
- num_buffered_values_ == 0);
- // There is something pending, figure out if it's a repeated or literal run
- if (repeat_count_ > 0 && all_repeat) {
- FlushRepeatedRun();
- } else {
- DCHECK_EQ(literal_count_ % 8, 0);
-      // Pad the last group of literals out to 8 values with 0s.
- for (; num_buffered_values_ != 0 && num_buffered_values_ < 8;
- ++num_buffered_values_) {
- buffered_values_[num_buffered_values_] = 0;
- }
- literal_count_ += num_buffered_values_;
- FlushLiteralRun(true);
- repeat_count_ = 0;
- }
- }
- bit_writer_.Flush();
- DCHECK_EQ(num_buffered_values_, 0);
- DCHECK_EQ(literal_count_, 0);
- DCHECK_EQ(repeat_count_, 0);
-
- return bit_writer_.bytes_written();
-}
-
-inline void RleEncoder::CheckBufferFull() {
- int bytes_written = bit_writer_.bytes_written();
- if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) {
- buffer_full_ = true;
- }
-}
-
-inline void RleEncoder::Clear() {
- buffer_full_ = false;
- current_value_ = 0;
- repeat_count_ = 0;
- num_buffered_values_ = 0;
- literal_count_ = 0;
- literal_indicator_byte_ = NULL;
- bit_writer_.Clear();
-}
-
-} // namespace util
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Imported from Apache Impala (incubating) on 2016-01-29 and modified for use
+// in parquet-cpp, Arrow
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <vector>
+
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_stream_utils.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace util {
+
+/// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs
+/// are sufficiently long, RLE is used, otherwise, the values are just bit-packed
+/// (literal encoding).
+/// For both types of runs, there is a byte-aligned indicator which encodes the length
+/// of the run and the type of the run.
+/// This encoding has the benefit that when there aren't any long enough runs, values
+/// are always decoded at fixed (can be precomputed) bit offsets OR both the value and
+/// the run length are byte aligned. This allows for very efficient decoding
+/// implementations.
+/// The encoding is:
+/// encoded-block := run*
+/// run := literal-run | repeated-run
+/// literal-run := literal-indicator < literal bytes >
+/// repeated-run := repeated-indicator < repeated value. padded to byte boundary >
+/// literal-indicator := varint_encode( number_of_groups << 1 | 1)
+/// repeated-indicator := varint_encode( number_of_repetitions << 1 )
+//
+/// Each run is preceded by a varint. The varint's least significant bit is
+/// used to indicate whether the run is a literal run or a repeated run. The rest
+/// of the varint is used to determine the length of the run (e.g. how many times the
+/// value repeats).
+//
+/// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode
+/// in groups of 8), so that no matter the bit-width of the value, the sequence will end
+/// on a byte boundary without padding.
+/// Given that we know it is a multiple of 8, we store the number of 8-groups rather than
+/// the actual number of encoded ints. (This means that the total number of encoded values
+/// cannot be determined from the encoded data, since the number of values in the last
+/// group may not be a multiple of 8.) For the last group of a literal run, we pad
+/// the group to 8 with zeros. This allows decoding 8 values at a time on the read side
+/// without the need for additional checks.
+//
+/// There is a break-even point at which run length encoding becomes the more
+/// storage-efficient choice. For 1 bit-width values, that point is 8 values: a run
+/// of 8 takes 2 bytes under either the repeated or the literal encoding. This point
+/// can always be computed from the bit-width.
+/// TODO: think about how to use this for strings. The bit packing isn't quite the same.
+//
+/// Examples with bit-width 1 (e.g. encoding booleans):
+/// ----------------------------------------
+/// 100 1s followed by 100 0s:
+/// <varint(100 << 1)> <1, padded to 1 byte> <varint(100 << 1)> <0, padded to 1 byte>
+/// - (total 4 bytes)
+//
+/// alternating 1s and 0s (200 total):
+/// 200 ints = 25 groups of 8
+/// <varint((25 << 1) | 1)> <25 bytes of values, bitpacked>
+/// (total 26 bytes, 1 byte overhead)
+//
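+/// A further worked example (illustrative, bit-width 3): a run of the value 7
+/// repeated 500 times is encoded as varint(500 << 1) = varint(1000) (2 bytes)
+/// followed by the value 7 padded to one byte, i.e. 3 bytes in total, versus
+/// 190 bytes (1 indicator byte + 504 zero-padded values * 3 bits) if it were
+/// bit-packed as a literal run.
+//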
+
+/// Decoder class for RLE encoded data.
+class RleDecoder {
+ public:
+  /// Create a decoder object. buffer/buffer_len is the RLE-encoded data to decode.
+ /// bit_width is the width of each value (before encoding).
+ RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width)
+ : bit_reader_(buffer, buffer_len),
+ bit_width_(bit_width),
+ current_value_(0),
+ repeat_count_(0),
+ literal_count_(0) {
+ DCHECK_GE(bit_width_, 0);
+ DCHECK_LE(bit_width_, 64);
+ }
+
+ RleDecoder() : bit_width_(-1) {}
+
+ void Reset(const uint8_t* buffer, int buffer_len, int bit_width) {
+ DCHECK_GE(bit_width, 0);
+ DCHECK_LE(bit_width, 64);
+ bit_reader_.Reset(buffer, buffer_len);
+ bit_width_ = bit_width;
+ current_value_ = 0;
+ repeat_count_ = 0;
+ literal_count_ = 0;
+ }
+
+ /// Gets the next value. Returns false if there are no more.
+ template <typename T>
+ bool Get(T* val);
+
+ /// Gets a batch of values. Returns the number of decoded elements.
+ template <typename T>
+ int GetBatch(T* values, int batch_size);
+
+  /// Like GetBatch but adds spacing for null entries
+ template <typename T>
+ int GetBatchSpaced(int batch_size, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, T* out);
+
+ /// Like GetBatch but the values are then decoded using the provided dictionary
+ template <typename T>
+ int GetBatchWithDict(const T* dictionary, int32_t dictionary_length, T* values,
+ int batch_size);
+
+  /// Like GetBatchWithDict but adds spacing for null entries
+ ///
+ /// Null entries will be zero-initialized in `values` to avoid leaking
+ /// private data.
+ template <typename T>
+ int GetBatchWithDictSpaced(const T* dictionary, int32_t dictionary_length, T* values,
+ int batch_size, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset);
+
+ protected:
+ BitUtil::BitReader bit_reader_;
+ /// Number of bits needed to encode the value. Must be between 0 and 64.
+ int bit_width_;
+ uint64_t current_value_;
+ int32_t repeat_count_;
+ int32_t literal_count_;
+
+ private:
+ /// Fills literal_count_ and repeat_count_ with next values. Returns false if there
+ /// are no more.
+ template <typename T>
+ bool NextCounts();
+
+ /// Utility methods for retrieving spaced values.
+ template <typename T, typename RunType, typename Converter>
+ int GetSpaced(Converter converter, int batch_size, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset, T* out);
+};
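+
+// A minimal usage sketch (illustrative, not part of the library): decoding a
+// buffer of RLE-encoded 3-bit values, e.g. from a Parquet data page. `encoded`
+// and `encoded_len` are assumed inputs.
+//
+//   RleDecoder decoder(encoded, encoded_len, /*bit_width=*/3);
+//   int32_t out[1024];
+//   int n = decoder.GetBatch(out, 1024);
+//   // n < 1024 means the encoded stream was exhausted (or malformed).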
+
+/// Class to incrementally build the RLE data. This class does not allocate any memory.
+/// The encoding has two modes: encoding repeated runs and literal runs.
+/// If the run is sufficiently short, it is more efficient to encode as a literal run.
+/// This class does so by buffering 8 values at a time. If they are not all the same
+/// they are added to the literal run. If they are the same, they are added to the
+/// repeated run. When we switch modes, the previous run is flushed out.
+class RleEncoder {
+ public:
+ /// buffer/buffer_len: preallocated output buffer.
+ /// bit_width: max number of bits for value.
+ /// TODO: consider adding a min_repeated_run_length so the caller can control
+ /// when values should be encoded as repeated runs. Currently this is derived
+ /// based on the bit_width, which can determine a storage optimal choice.
+ /// TODO: allow 0 bit_width (and have dict encoder use it)
+ RleEncoder(uint8_t* buffer, int buffer_len, int bit_width)
+ : bit_width_(bit_width), bit_writer_(buffer, buffer_len) {
+ DCHECK_GE(bit_width_, 0);
+ DCHECK_LE(bit_width_, 64);
+ max_run_byte_size_ = MinBufferSize(bit_width);
+ DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough.";
+ Clear();
+ }
+
+  /// Returns the minimum buffer size needed to use the encoder for 'bit_width'.
+  /// This is the maximum length of a single run for 'bit_width'.
+  /// It is not valid to pass a buffer shorter than this length.
+ static int MinBufferSize(int bit_width) {
+ /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values.
+ int max_literal_run_size =
+ 1 +
+ static_cast<int>(BitUtil::BytesForBits(MAX_VALUES_PER_LITERAL_RUN * bit_width));
+ /// Up to kMaxVlqByteLength indicator and a single 'bit_width' value.
+ int max_repeated_run_size = BitUtil::BitReader::kMaxVlqByteLength +
+ static_cast<int>(BitUtil::BytesForBits(bit_width));
+ return std::max(max_literal_run_size, max_repeated_run_size);
+ }
+
+ /// Returns the maximum byte size it could take to encode 'num_values'.
+ static int MaxBufferSize(int bit_width, int num_values) {
+ // For a bit_width > 1, the worst case is the repetition of "literal run of length 8
+ // and then a repeated run of length 8".
+ // 8 values per smallest run, 8 bits per byte
+ int bytes_per_run = bit_width;
+ int num_runs = static_cast<int>(BitUtil::CeilDiv(num_values, 8));
+ int literal_max_size = num_runs + num_runs * bytes_per_run;
+
+ // In the very worst case scenario, the data is a concatenation of repeated
+ // runs of 8 values. Repeated run has a 1 byte varint followed by the
+ // bit-packed repeated value
+ int min_repeated_run_size = 1 + static_cast<int>(BitUtil::BytesForBits(bit_width));
+ int repeated_max_size =
+ static_cast<int>(BitUtil::CeilDiv(num_values, 8)) * min_repeated_run_size;
+
+ return std::max(literal_max_size, repeated_max_size);
+ }
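+
+  // Sizing sketch (illustrative only, assuming `bit_width` and `num_values`
+  // are known): allocate a worst-case output buffer up front, then construct
+  // the encoder over it.
+  //
+  //   std::vector<uint8_t> buf(RleEncoder::MaxBufferSize(bit_width, num_values));
+  //   RleEncoder encoder(buf.data(), static_cast<int>(buf.size()), bit_width);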
+
+ /// Encode value. Returns true if the value fits in buffer, false otherwise.
+ /// This value must be representable with bit_width_ bits.
+ bool Put(uint64_t value);
+
+ /// Flushes any pending values to the underlying buffer.
+ /// Returns the total number of bytes written
+ int Flush();
+
+ /// Resets all the state in the encoder.
+ void Clear();
+
+ /// Returns pointer to underlying buffer
+ uint8_t* buffer() { return bit_writer_.buffer(); }
+ int32_t len() { return bit_writer_.bytes_written(); }
+
+ private:
+ /// Flushes any buffered values. If this is part of a repeated run, this is largely
+ /// a no-op.
+ /// If it is part of a literal run, this will call FlushLiteralRun, which writes
+ /// out the buffered literal values.
+  /// If 'done' is true, the current run is written even if it would normally
+  /// have been buffered further. This should only be done at the end, once the
+  /// encoder has received all values.
+ void FlushBufferedValues(bool done);
+
+ /// Flushes literal values to the underlying buffer. If update_indicator_byte,
+ /// then the current literal run is complete and the indicator byte is updated.
+ void FlushLiteralRun(bool update_indicator_byte);
+
+ /// Flushes a repeated run to the underlying buffer.
+ void FlushRepeatedRun();
+
+ /// Checks and sets buffer_full_. This must be called after flushing a run to
+ /// make sure there are enough bytes remaining to encode the next run.
+ void CheckBufferFull();
+
+ /// The maximum number of values in a single literal run
+ /// (number of groups encodable by a 1-byte indicator * 8)
+ static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8;
+
+ /// Number of bits needed to encode the value. Must be between 0 and 64.
+ const int bit_width_;
+
+ /// Underlying buffer.
+ BitUtil::BitWriter bit_writer_;
+
+  /// If true, the buffer is full and subsequent calls to Put() will fail.
+ bool buffer_full_;
+
+ /// The maximum byte size a single run can take.
+ int max_run_byte_size_;
+
+ /// We need to buffer at most 8 values for literals. This happens when the
+ /// bit_width is 1 (so 8 values fit in one byte).
+ /// TODO: generalize this to other bit widths
+ int64_t buffered_values_[8];
+
+ /// Number of values in buffered_values_
+ int num_buffered_values_;
+
+ /// The current (also last) value that was written and the count of how
+ /// many times in a row that value has been seen. This is maintained even
+  /// if we are in a literal run. If the repeat_count_ gets high enough, we switch
+ /// to encoding repeated runs.
+ uint64_t current_value_;
+ int repeat_count_;
+
+ /// Number of literals in the current run. This does not include the literals
+  /// that might be in buffered_values_. Only once we have a full group can we
+  /// decide whether they should be part of the literal_count_ or repeat_count_.
+ int literal_count_;
+
+ /// Pointer to a byte in the underlying buffer that stores the indicator byte.
+ /// This is reserved as soon as we need a literal run but the value is written
+ /// when the literal run is complete.
+ uint8_t* literal_indicator_byte_;
+};
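+
+// Illustrative round-trip sketch (assumed inputs, not library code): encode a
+// sequence with RleEncoder, then read it back with RleDecoder.
+//
+//   for (uint64_t v : values) {
+//     if (!encoder.Put(v)) break;  // output buffer is full
+//   }
+//   int encoded_len = encoder.Flush();
+//   RleDecoder decoder(buf.data(), encoded_len, bit_width);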
+
+template <typename T>
+inline bool RleDecoder::Get(T* val) {
+ return GetBatch(val, 1) == 1;
+}
+
+template <typename T>
+inline int RleDecoder::GetBatch(T* values, int batch_size) {
+ DCHECK_GE(bit_width_, 0);
+ int values_read = 0;
+
+ auto* out = values;
+
+ while (values_read < batch_size) {
+ int remaining = batch_size - values_read;
+
+ if (repeat_count_ > 0) { // Repeated value case.
+ int repeat_batch = std::min(remaining, repeat_count_);
+ std::fill(out, out + repeat_batch, static_cast<T>(current_value_));
+
+ repeat_count_ -= repeat_batch;
+ values_read += repeat_batch;
+ out += repeat_batch;
+ } else if (literal_count_ > 0) {
+ int literal_batch = std::min(remaining, literal_count_);
+ int actual_read = bit_reader_.GetBatch(bit_width_, out, literal_batch);
+ if (actual_read != literal_batch) {
+ return values_read;
+ }
+
+ literal_count_ -= literal_batch;
+ values_read += literal_batch;
+ out += literal_batch;
+ } else {
+ if (!NextCounts<T>()) return values_read;
+ }
+ }
+
+ return values_read;
+}
+
+template <typename T, typename RunType, typename Converter>
+inline int RleDecoder::GetSpaced(Converter converter, int batch_size, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset,
+ T* out) {
+ if (ARROW_PREDICT_FALSE(null_count == batch_size)) {
+ converter.FillZero(out, out + batch_size);
+ return batch_size;
+ }
+
+ DCHECK_GE(bit_width_, 0);
+ int values_read = 0;
+ int values_remaining = batch_size - null_count;
+
+ // Assume no bits to start.
+ arrow::internal::BitRunReader bit_reader(valid_bits, valid_bits_offset,
+ /*length=*/batch_size);
+ arrow::internal::BitRun valid_run = bit_reader.NextRun();
+ while (values_read < batch_size) {
+ if (ARROW_PREDICT_FALSE(valid_run.length == 0)) {
+ valid_run = bit_reader.NextRun();
+ }
+
+ DCHECK_GT(batch_size, 0);
+ DCHECK_GT(valid_run.length, 0);
+
+ if (valid_run.set) {
+ if ((repeat_count_ == 0) && (literal_count_ == 0)) {
+ if (!NextCounts<RunType>()) return values_read;
+ DCHECK((repeat_count_ > 0) ^ (literal_count_ > 0));
+ }
+
+ if (repeat_count_ > 0) {
+ int repeat_batch = 0;
+      // Consume the entire repeat count, incrementing repeat_batch so that it
+      // holds the total of nulls + values consumed. Only the total count is
+      // needed, because the same value is filled in for nulls and non-nulls
+      // alike. This proves to be a big efficiency win.
+ while (repeat_count_ > 0 && (values_read + repeat_batch) < batch_size) {
+ DCHECK_GT(valid_run.length, 0);
+ if (valid_run.set) {
+ int update_size = std::min(static_cast<int>(valid_run.length), repeat_count_);
+ repeat_count_ -= update_size;
+ repeat_batch += update_size;
+ valid_run.length -= update_size;
+ values_remaining -= update_size;
+ } else {
+ // We can consume all nulls here because we would do so on
+          // the next loop anyway.
+ repeat_batch += static_cast<int>(valid_run.length);
+ valid_run.length = 0;
+ }
+ if (valid_run.length == 0) {
+ valid_run = bit_reader.NextRun();
+ }
+ }
+ RunType current_value = static_cast<RunType>(current_value_);
+ if (ARROW_PREDICT_FALSE(!converter.IsValid(current_value))) {
+ return values_read;
+ }
+ converter.Fill(out, out + repeat_batch, current_value);
+ out += repeat_batch;
+ values_read += repeat_batch;
+ } else if (literal_count_ > 0) {
+ int literal_batch = std::min(values_remaining, literal_count_);
+ DCHECK_GT(literal_batch, 0);
+
+ // Decode the literals
+ constexpr int kBufferSize = 1024;
+ RunType indices[kBufferSize];
+ literal_batch = std::min(literal_batch, kBufferSize);
+ int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch);
+ if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) {
+ return values_read;
+ }
+ if (!converter.IsValid(indices, /*length=*/actual_read)) {
+ return values_read;
+ }
+ int skipped = 0;
+ int literals_read = 0;
+ while (literals_read < literal_batch) {
+ if (valid_run.set) {
+ int update_size = std::min(literal_batch - literals_read,
+ static_cast<int>(valid_run.length));
+ converter.Copy(out, indices + literals_read, update_size);
+ literals_read += update_size;
+ out += update_size;
+ valid_run.length -= update_size;
+ } else {
+ converter.FillZero(out, out + valid_run.length);
+ out += valid_run.length;
+ skipped += static_cast<int>(valid_run.length);
+ valid_run.length = 0;
+ }
+ if (valid_run.length == 0) {
+ valid_run = bit_reader.NextRun();
+ }
+ }
+ literal_count_ -= literal_batch;
+ values_remaining -= literal_batch;
+ values_read += literal_batch + skipped;
+ }
+ } else {
+ converter.FillZero(out, out + valid_run.length);
+ out += valid_run.length;
+ values_read += static_cast<int>(valid_run.length);
+ valid_run.length = 0;
+ }
+ }
+ DCHECK_EQ(valid_run.length, 0);
+ DCHECK_EQ(values_remaining, 0);
+ return values_read;
+}
+
+// Converter for GetSpaced that handles runs that get returned
+// directly as output.
+template <typename T>
+struct PlainRleConverter {
+ T kZero = {};
+ inline bool IsValid(const T& values) const { return true; }
+ inline bool IsValid(const T* values, int32_t length) const { return true; }
+ inline void Fill(T* begin, T* end, const T& run_value) const {
+ std::fill(begin, end, run_value);
+ }
+ inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); }
+ inline void Copy(T* out, const T* values, int length) const {
+ std::memcpy(out, values, length * sizeof(T));
+ }
+};
+
+template <typename T>
+inline int RleDecoder::GetBatchSpaced(int batch_size, int null_count,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset, T* out) {
+ if (null_count == 0) {
+ return GetBatch<T>(out, batch_size);
+ }
+
+ PlainRleConverter<T> converter;
+ arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset,
+ batch_size);
+
+ int total_processed = 0;
+ int processed = 0;
+ arrow::internal::BitBlockCount block;
+
+ do {
+ block = block_counter.NextFourWords();
+ if (block.length == 0) {
+ break;
+ }
+ if (block.AllSet()) {
+ processed = GetBatch<T>(out, block.length);
+ } else if (block.NoneSet()) {
+ converter.FillZero(out, out + block.length);
+ processed = block.length;
+ } else {
+ processed = GetSpaced<T, /*RunType=*/T, PlainRleConverter<T>>(
+ converter, block.length, block.length - block.popcount, valid_bits,
+ valid_bits_offset, out);
+ }
+ total_processed += processed;
+ out += block.length;
+ valid_bits_offset += block.length;
+ } while (processed == block.length);
+ return total_processed;
+}
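+
+// Illustrative example: spaced decoding zero-fills the slots whose validity
+// bit is 0. With the first byte of valid_bits = 0b00001011 (slots 0, 1 and 3
+// valid out of 4):
+//
+//   int32_t out[4];
+//   decoder.GetBatchSpaced(4, /*null_count=*/1, valid_bits, 0, out);
+//   // out == {v0, v1, 0, v2} for decoded values v0, v1, v2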
+
+static inline bool IndexInRange(int32_t idx, int32_t dictionary_length) {
+ return idx >= 0 && idx < dictionary_length;
+}
+
+// Converter for GetSpaced that handles runs of returned dictionary
+// indices.
+template <typename T>
+struct DictionaryConverter {
+ T kZero = {};
+ const T* dictionary;
+ int32_t dictionary_length;
+
+ inline bool IsValid(int32_t value) { return IndexInRange(value, dictionary_length); }
+
+ inline bool IsValid(const int32_t* values, int32_t length) const {
+ using IndexType = int32_t;
+ IndexType min_index = std::numeric_limits<IndexType>::max();
+ IndexType max_index = std::numeric_limits<IndexType>::min();
+ for (int x = 0; x < length; x++) {
+ min_index = std::min(values[x], min_index);
+ max_index = std::max(values[x], max_index);
+ }
+
+ return IndexInRange(min_index, dictionary_length) &&
+ IndexInRange(max_index, dictionary_length);
+ }
+ inline void Fill(T* begin, T* end, const int32_t& run_value) const {
+ std::fill(begin, end, dictionary[run_value]);
+ }
+ inline void FillZero(T* begin, T* end) { std::fill(begin, end, kZero); }
+
+ inline void Copy(T* out, const int32_t* values, int length) const {
+ for (int x = 0; x < length; x++) {
+ out[x] = dictionary[values[x]];
+ }
+ }
+};
+
+template <typename T>
+inline int RleDecoder::GetBatchWithDict(const T* dictionary, int32_t dictionary_length,
+ T* values, int batch_size) {
+ // Per https://github.com/apache/parquet-format/blob/master/Encodings.md,
+ // the maximum dictionary index width in Parquet is 32 bits.
+ using IndexType = int32_t;
+ DictionaryConverter<T> converter;
+ converter.dictionary = dictionary;
+ converter.dictionary_length = dictionary_length;
+
+ DCHECK_GE(bit_width_, 0);
+ int values_read = 0;
+
+ auto* out = values;
+
+ while (values_read < batch_size) {
+ int remaining = batch_size - values_read;
+
+ if (repeat_count_ > 0) {
+ auto idx = static_cast<IndexType>(current_value_);
+ if (ARROW_PREDICT_FALSE(!IndexInRange(idx, dictionary_length))) {
+ return values_read;
+ }
+ T val = dictionary[idx];
+
+ int repeat_batch = std::min(remaining, repeat_count_);
+ std::fill(out, out + repeat_batch, val);
+
+ /* Upkeep counters */
+ repeat_count_ -= repeat_batch;
+ values_read += repeat_batch;
+ out += repeat_batch;
+ } else if (literal_count_ > 0) {
+ constexpr int kBufferSize = 1024;
+ IndexType indices[kBufferSize];
+
+ int literal_batch = std::min(remaining, literal_count_);
+ literal_batch = std::min(literal_batch, kBufferSize);
+
+ int actual_read = bit_reader_.GetBatch(bit_width_, indices, literal_batch);
+ if (ARROW_PREDICT_FALSE(actual_read != literal_batch)) {
+ return values_read;
+ }
+ if (ARROW_PREDICT_FALSE(!converter.IsValid(indices, /*length=*/literal_batch))) {
+ return values_read;
+ }
+ converter.Copy(out, indices, literal_batch);
+
+ /* Upkeep counters */
+ literal_count_ -= literal_batch;
+ values_read += literal_batch;
+ out += literal_batch;
+ } else {
+ if (!NextCounts<IndexType>()) return values_read;
+ }
+ }
+
+ return values_read;
+}
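+
+// Illustrative sketch (the dictionary contents are assumed): decoding RLE
+// dictionary indices directly into dictionary values.
+//
+//   const double dict[] = {0.5, 1.5, 2.5};
+//   double out[256];
+//   int n = decoder.GetBatchWithDict(dict, /*dictionary_length=*/3, out, 256);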
+
+template <typename T>
+inline int RleDecoder::GetBatchWithDictSpaced(const T* dictionary,
+ int32_t dictionary_length, T* out,
+ int batch_size, int null_count,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset) {
+ if (null_count == 0) {
+ return GetBatchWithDict<T>(dictionary, dictionary_length, out, batch_size);
+ }
+ arrow::internal::BitBlockCounter block_counter(valid_bits, valid_bits_offset,
+ batch_size);
+ using IndexType = int32_t;
+ DictionaryConverter<T> converter;
+ converter.dictionary = dictionary;
+ converter.dictionary_length = dictionary_length;
+
+ int total_processed = 0;
+ int processed = 0;
+ arrow::internal::BitBlockCount block;
+ do {
+ block = block_counter.NextFourWords();
+ if (block.length == 0) {
+ break;
+ }
+ if (block.AllSet()) {
+ processed = GetBatchWithDict<T>(dictionary, dictionary_length, out, block.length);
+ } else if (block.NoneSet()) {
+ converter.FillZero(out, out + block.length);
+ processed = block.length;
+ } else {
+ processed = GetSpaced<T, /*RunType=*/IndexType, DictionaryConverter<T>>(
+ converter, block.length, block.length - block.popcount, valid_bits,
+ valid_bits_offset, out);
+ }
+ total_processed += processed;
+ out += block.length;
+ valid_bits_offset += block.length;
+ } while (processed == block.length);
+ return total_processed;
+}
+
+template <typename T>
+bool RleDecoder::NextCounts() {
+  // Read the next run's indicator int; it could start a literal or a repeated run.
+ // The int is encoded as a vlq-encoded value.
+ uint32_t indicator_value = 0;
+ if (!bit_reader_.GetVlqInt(&indicator_value)) return false;
+
+ // lsb indicates if it is a literal run or repeated run
+ bool is_literal = indicator_value & 1;
+ uint32_t count = indicator_value >> 1;
+ if (is_literal) {
+ if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast<uint32_t>(INT32_MAX) / 8)) {
+ return false;
+ }
+ literal_count_ = count * 8;
+ } else {
+ if (ARROW_PREDICT_FALSE(count == 0 || count > static_cast<uint32_t>(INT32_MAX))) {
+ return false;
+ }
+ repeat_count_ = count;
+ T value = {};
+ if (!bit_reader_.GetAligned<T>(static_cast<int>(BitUtil::CeilDiv(bit_width_, 8)),
+ &value)) {
+ return false;
+ }
+ current_value_ = static_cast<uint64_t>(value);
+ }
+ return true;
+}
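+
+// Worked example (illustrative): an indicator byte of 0x33 = 0b110011 has
+// lsb 1, so it starts a literal run, with count = 0x33 >> 1 = 25 groups,
+// i.e. literal_count_ = 25 * 8 = 200 values -- matching the bit-width-1
+// example in the header comment above.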
+
+/// This function buffers input values 8 at a time. After seeing all 8 values,
+/// it decides whether they should be encoded as a literal or repeated run.
+inline bool RleEncoder::Put(uint64_t value) {
+ DCHECK(bit_width_ == 64 || value < (1ULL << bit_width_));
+ if (ARROW_PREDICT_FALSE(buffer_full_)) return false;
+
+ if (ARROW_PREDICT_TRUE(current_value_ == value)) {
+ ++repeat_count_;
+ if (repeat_count_ > 8) {
+ // This is just a continuation of the current run, no need to buffer the
+ // values.
+ // Note that this is the fast path for long repeated runs.
+ return true;
+ }
+ } else {
+ if (repeat_count_ >= 8) {
+ // We had a run that was long enough but it has ended. Flush the
+ // current repeated run.
+ DCHECK_EQ(literal_count_, 0);
+ FlushRepeatedRun();
+ }
+ repeat_count_ = 1;
+ current_value_ = value;
+ }
+
+ buffered_values_[num_buffered_values_] = value;
+ if (++num_buffered_values_ == 8) {
+ DCHECK_EQ(literal_count_ % 8, 0);
+ FlushBufferedValues(false);
+ }
+ return true;
+}
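+
+// Illustrative trace: feeding nine equal values leaves the first eight in
+// buffered_values_ until the 8th Put() triggers FlushBufferedValues(); since
+// repeat_count_ == 8 at that point, they become a repeated run, and the 9th
+// value takes the repeat_count_ > 8 fast path above.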
+
+inline void RleEncoder::FlushLiteralRun(bool update_indicator_byte) {
+ if (literal_indicator_byte_ == NULL) {
+ // The literal indicator byte has not been reserved yet, get one now.
+ literal_indicator_byte_ = bit_writer_.GetNextBytePtr();
+ DCHECK(literal_indicator_byte_ != NULL);
+ }
+
+ // Write all the buffered values as bit packed literals
+ for (int i = 0; i < num_buffered_values_; ++i) {
+ bool success = bit_writer_.PutValue(buffered_values_[i], bit_width_);
+ DCHECK(success) << "There is a bug in using CheckBufferFull()";
+ }
+ num_buffered_values_ = 0;
+
+ if (update_indicator_byte) {
+ // At this point we need to write the indicator byte for the literal run.
+ // We only reserve one byte, to allow for streaming writes of literal values.
+ // The logic makes sure we flush literal runs often enough to not overrun
+ // the 1 byte.
+ DCHECK_EQ(literal_count_ % 8, 0);
+ int num_groups = literal_count_ / 8;
+ int32_t indicator_value = (num_groups << 1) | 1;
+ DCHECK_EQ(indicator_value & 0xFFFFFF00, 0);
+ *literal_indicator_byte_ = static_cast<uint8_t>(indicator_value);
+ literal_indicator_byte_ = NULL;
+ literal_count_ = 0;
+ CheckBufferFull();
+ }
+}
+
+inline void RleEncoder::FlushRepeatedRun() {
+ DCHECK_GT(repeat_count_, 0);
+ bool result = true;
+ // The lsb of 0 indicates this is a repeated run
+ int32_t indicator_value = repeat_count_ << 1 | 0;
+ result &= bit_writer_.PutVlqInt(indicator_value);
+ result &= bit_writer_.PutAligned(current_value_,
+ static_cast<int>(BitUtil::CeilDiv(bit_width_, 8)));
+ DCHECK(result);
+ num_buffered_values_ = 0;
+ repeat_count_ = 0;
+ CheckBufferFull();
+}
+
+/// Flush the values that have been buffered. At this point we decide whether
+/// we need to switch between the run types or continue the current one.
+inline void RleEncoder::FlushBufferedValues(bool done) {
+ if (repeat_count_ >= 8) {
+ // Clear the buffered values. They are part of the repeated run now and we
+ // don't want to flush them out as literals.
+ num_buffered_values_ = 0;
+ if (literal_count_ != 0) {
+ // There was a current literal run. All the values in it have been flushed
+ // but we still need to update the indicator byte.
+ DCHECK_EQ(literal_count_ % 8, 0);
+ DCHECK_EQ(repeat_count_, 8);
+ FlushLiteralRun(true);
+ }
+ DCHECK_EQ(literal_count_, 0);
+ return;
+ }
+
+ literal_count_ += num_buffered_values_;
+ DCHECK_EQ(literal_count_ % 8, 0);
+ int num_groups = literal_count_ / 8;
+ if (num_groups + 1 >= (1 << 6)) {
+ // We need to start a new literal run because the indicator byte we've reserved
+ // cannot store more values.
+ DCHECK(literal_indicator_byte_ != NULL);
+ FlushLiteralRun(true);
+ } else {
+ FlushLiteralRun(done);
+ }
+ repeat_count_ = 0;
+}
+
+inline int RleEncoder::Flush() {
+ if (literal_count_ > 0 || repeat_count_ > 0 || num_buffered_values_ > 0) {
+ bool all_repeat = literal_count_ == 0 && (repeat_count_ == num_buffered_values_ ||
+ num_buffered_values_ == 0);
+ // There is something pending, figure out if it's a repeated or literal run
+ if (repeat_count_ > 0 && all_repeat) {
+ FlushRepeatedRun();
+ } else {
+ DCHECK_EQ(literal_count_ % 8, 0);
+      // Pad the last group of literals out to 8 values with 0s.
+ for (; num_buffered_values_ != 0 && num_buffered_values_ < 8;
+ ++num_buffered_values_) {
+ buffered_values_[num_buffered_values_] = 0;
+ }
+ literal_count_ += num_buffered_values_;
+ FlushLiteralRun(true);
+ repeat_count_ = 0;
+ }
+ }
+ bit_writer_.Flush();
+ DCHECK_EQ(num_buffered_values_, 0);
+ DCHECK_EQ(literal_count_, 0);
+ DCHECK_EQ(repeat_count_, 0);
+
+ return bit_writer_.bytes_written();
+}
+
+inline void RleEncoder::CheckBufferFull() {
+ int bytes_written = bit_writer_.bytes_written();
+ if (bytes_written + max_run_byte_size_ > bit_writer_.buffer_len()) {
+ buffer_full_ = true;
+ }
+}
+
+inline void RleEncoder::Clear() {
+ buffer_full_ = false;
+ current_value_ = 0;
+ repeat_count_ = 0;
+ num_buffered_values_ = 0;
+ literal_count_ = 0;
+ literal_indicator_byte_ = NULL;
+ bit_writer_.Clear();
+}
+
+} // namespace util
+} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h
index 9414984663f..8265e1d22ae 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/spaced.h
@@ -1,98 +1,98 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-
-#include "arrow/util/bit_run_reader.h"
-
-namespace arrow {
-namespace util {
-namespace internal {
-
-/// \brief Compact a spaced buffer by dropping its null entries.
-///
-/// \param[in] src the source (spaced) buffer
-/// \param[in] num_values the size of the source buffer, including null slots
-/// \param[in] valid_bits bitmap data indicating the positions of valid slots
-/// \param[in] valid_bits_offset offset into valid_bits
-/// \param[out] output the densely packed output buffer
-/// \return The number of valid values written to output.
-template <typename T>
-inline int SpacedCompress(const T* src, int num_values, const uint8_t* valid_bits,
- int64_t valid_bits_offset, T* output) {
- int num_valid_values = 0;
-
- arrow::internal::SetBitRunReader reader(valid_bits, valid_bits_offset, num_values);
- while (true) {
- const auto run = reader.NextRun();
- if (run.length == 0) {
- break;
- }
- std::memcpy(output + num_valid_values, src + run.position, run.length * sizeof(T));
- num_valid_values += static_cast<int32_t>(run.length);
- }
-
- return num_valid_values;
-}
-
-/// \brief Relocate values in buffer into positions of non-null values as indicated by
-/// a validity bitmap.
-///
-/// \param[in, out] buffer the in-place buffer
-/// \param[in] num_values total size of buffer including null slots
-/// \param[in] null_count number of null slots
-/// \param[in] valid_bits bitmap data indicating position of valid slots
-/// \param[in] valid_bits_offset offset into valid_bits
-/// \return The number of values expanded, including nulls.
-template <typename T>
-inline int SpacedExpand(T* buffer, int num_values, int null_count,
- const uint8_t* valid_bits, int64_t valid_bits_offset) {
- // Point to end as we add the spacing from the back.
- int idx_decode = num_values - null_count;
-
- // Depending on the number of nulls, some of the value slots in buffer may
- // be uninitialized, and this will cause valgrind warnings / potentially UB
- std::memset(static_cast<void*>(buffer + idx_decode), 0, null_count * sizeof(T));
- if (idx_decode == 0) {
- // All nulls, nothing more to do
- return num_values;
- }
-
- arrow::internal::ReverseSetBitRunReader reader(valid_bits, valid_bits_offset,
- num_values);
- while (true) {
- const auto run = reader.NextRun();
- if (run.length == 0) {
- break;
- }
- idx_decode -= static_cast<int32_t>(run.length);
- assert(idx_decode >= 0);
- std::memmove(buffer + run.position, buffer + idx_decode, run.length * sizeof(T));
- }
-
-  // If this does not hold, the caller gave an incorrect null_count
- assert(idx_decode == 0);
- return num_values;
-}
-
-} // namespace internal
-} // namespace util
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/util/bit_run_reader.h"
+
+namespace arrow {
+namespace util {
+namespace internal {
+
+/// \brief Compress a spaced buffer into dense form, excluding the null entries.
+///
+/// \param[in] src the source buffer
+/// \param[in] num_values the size of the source buffer
+/// \param[in] valid_bits bitmap data indicating position of valid slots
+/// \param[in] valid_bits_offset offset into valid_bits
+/// \param[out] output the dense output buffer
+/// \return The number of valid values written to the output buffer.
+template <typename T>
+inline int SpacedCompress(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, T* output) {
+ int num_valid_values = 0;
+
+ arrow::internal::SetBitRunReader reader(valid_bits, valid_bits_offset, num_values);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ break;
+ }
+ std::memcpy(output + num_valid_values, src + run.position, run.length * sizeof(T));
+ num_valid_values += static_cast<int32_t>(run.length);
+ }
+
+ return num_valid_values;
+}
+
+/// \brief Relocate values in buffer into positions of non-null values as indicated by
+/// a validity bitmap.
+///
+/// \param[in, out] buffer the in-place buffer
+/// \param[in] num_values total size of buffer including null slots
+/// \param[in] null_count number of null slots
+/// \param[in] valid_bits bitmap data indicating position of valid slots
+/// \param[in] valid_bits_offset offset into valid_bits
+/// \return The number of values expanded, including nulls.
+template <typename T>
+inline int SpacedExpand(T* buffer, int num_values, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset) {
+ // Point to end as we add the spacing from the back.
+ int idx_decode = num_values - null_count;
+
+ // Depending on the number of nulls, some of the value slots in buffer may
+ // be uninitialized, and this will cause valgrind warnings / potentially UB
+ std::memset(static_cast<void*>(buffer + idx_decode), 0, null_count * sizeof(T));
+ if (idx_decode == 0) {
+ // All nulls, nothing more to do
+ return num_values;
+ }
+
+ arrow::internal::ReverseSetBitRunReader reader(valid_bits, valid_bits_offset,
+ num_values);
+ while (true) {
+ const auto run = reader.NextRun();
+ if (run.length == 0) {
+ break;
+ }
+ idx_decode -= static_cast<int32_t>(run.length);
+ assert(idx_decode >= 0);
+ std::memmove(buffer + run.position, buffer + idx_decode, run.length * sizeof(T));
+ }
+
+ // Otherwise caller gave an incorrect null_count
+ assert(idx_decode == 0);
+ return num_values;
+}
+
+} // namespace internal
+} // namespace util
+} // namespace arrow
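For reference, a minimal round-trip sketch of the two helpers above — a hypothetical standalone program, assuming the header path "arrow/util/spaced.h" and Arrow's LSB-first validity-bitmap convention; note that after expansion only the valid slots hold meaningful values:

#include <cassert>
#include <cstdint>

#include "arrow/util/spaced.h"

int main() {
  // slots 0 and 2 are valid (bitmap bits are LSB-first)
  const uint8_t valid_bits[] = {0b00000101};
  const int32_t spaced[4] = {10, -1, 20, -1};

  // compact the valid entries into a dense buffer
  int32_t dense[4] = {};
  const int n = arrow::util::internal::SpacedCompress<int32_t>(
      spaced, /*num_values=*/4, valid_bits, /*valid_bits_offset=*/0, dense);
  assert(n == 2);  // dense now begins with {10, 20}

  // expand back in place: the valid values return to slots 0 and 2;
  // the contents of the null slots are unspecified (but initialized)
  arrow::util::internal::SpacedExpand<int32_t>(dense, /*num_values=*/4,
                                               /*null_count=*/2, valid_bits,
                                               /*valid_bits_offset=*/0);
  assert(dense[0] == 10 && dense[2] == 20);
  return 0;
}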
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc
index 5abb2feb446..d922311df1c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/string.cc
@@ -92,23 +92,23 @@ Status ParseHexValue(const char* data, uint8_t* out) {
namespace internal {
-std::vector<util::string_view> SplitString(util::string_view v, char delimiter) {
- std::vector<util::string_view> parts;
- size_t start = 0, end;
- while (true) {
- end = v.find(delimiter, start);
- parts.push_back(v.substr(start, end - start));
- if (end == std::string::npos) {
- break;
- }
- start = end + 1;
- }
- return parts;
-}
-
-template <typename StringLike>
-static std::string JoinStringLikes(const std::vector<StringLike>& strings,
- util::string_view delimiter) {
+std::vector<util::string_view> SplitString(util::string_view v, char delimiter) {
+ std::vector<util::string_view> parts;
+ size_t start = 0, end;
+ while (true) {
+ end = v.find(delimiter, start);
+ parts.push_back(v.substr(start, end - start));
+ if (end == std::string::npos) {
+ break;
+ }
+ start = end + 1;
+ }
+ return parts;
+}
+
+template <typename StringLike>
+static std::string JoinStringLikes(const std::vector<StringLike>& strings,
+ util::string_view delimiter) {
if (strings.size() == 0) {
return "";
}
@@ -120,18 +120,18 @@ static std::string JoinStringLikes(const std::vector<StringLike>& strings,
return out;
}
-std::string JoinStrings(const std::vector<util::string_view>& strings,
- util::string_view delimiter) {
- return JoinStringLikes(strings, delimiter);
-}
-
-std::string JoinStrings(const std::vector<std::string>& strings,
- util::string_view delimiter) {
- return JoinStringLikes(strings, delimiter);
-}
-
-static constexpr bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
-
+std::string JoinStrings(const std::vector<util::string_view>& strings,
+ util::string_view delimiter) {
+ return JoinStringLikes(strings, delimiter);
+}
+
+std::string JoinStrings(const std::vector<std::string>& strings,
+ util::string_view delimiter) {
+ return JoinStringLikes(strings, delimiter);
+}
+
+static constexpr bool IsWhitespace(char c) { return c == ' ' || c == '\t'; }
+
std::string TrimString(std::string value) {
size_t ltrim_chars = 0;
while (ltrim_chars < value.size() && IsWhitespace(value[ltrim_chars])) {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/string.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/string.h
index 932e599fc21..68b8a54e313 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/string.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/string.h
@@ -42,20 +42,20 @@ ARROW_EXPORT Status ParseHexValue(const char* data, uint8_t* out);
namespace internal {
-/// \brief Split a string with a delimiter
-ARROW_EXPORT
-std::vector<util::string_view> SplitString(util::string_view v, char delim);
-
+/// \brief Split a string with a delimiter
+ARROW_EXPORT
+std::vector<util::string_view> SplitString(util::string_view v, char delim);
+
/// \brief Join strings with a delimiter
ARROW_EXPORT
std::string JoinStrings(const std::vector<util::string_view>& strings,
util::string_view delimiter);
-/// \brief Join strings with a delimiter
-ARROW_EXPORT
-std::string JoinStrings(const std::vector<std::string>& strings,
- util::string_view delimiter);
-
+/// \brief Join strings with a delimiter
+ARROW_EXPORT
+std::string JoinStrings(const std::vector<std::string>& strings,
+ util::string_view delimiter);
+
/// \brief Trim whitespace from left and right sides of string
ARROW_EXPORT
std::string TrimString(std::string value);
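A small usage sketch of the three helpers declared above (a hypothetical standalone program against this header; note that SplitString preserves empty fields, as its implementation shows):

#include <iostream>
#include <vector>

#include "arrow/util/string.h"

int main() {
  using arrow::internal::JoinStrings;
  using arrow::internal::SplitString;
  using arrow::internal::TrimString;

  // empty fields are preserved: {"a", "b", "", "c"}
  // (the views point into the string literal, which stays alive here)
  std::vector<arrow::util::string_view> parts = SplitString("a,b,,c", ',');

  // rejoin with a different delimiter: prints "a|b||c"
  std::cout << JoinStrings(parts, "|") << "\n";

  // trims spaces and tabs only, per IsWhitespace: prints "x y"
  std::cout << TrimString("  x y\t") << "\n";
  return 0;
}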
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc
index 04a6d95cacb..7e8ab64b703 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.cc
@@ -30,29 +30,29 @@
namespace arrow {
namespace internal {
-namespace {
-
+namespace {
+
////////////////////////////////////////////////////////////////////////
// Serial TaskGroup implementation
class SerialTaskGroup : public TaskGroup {
public:
- explicit SerialTaskGroup(StopToken stop_token) : stop_token_(std::move(stop_token)) {}
-
- void AppendReal(FnOnce<Status()> task) override {
+ explicit SerialTaskGroup(StopToken stop_token) : stop_token_(std::move(stop_token)) {}
+
+ void AppendReal(FnOnce<Status()> task) override {
DCHECK(!finished_);
- if (stop_token_.IsStopRequested()) {
- status_ &= stop_token_.Poll();
- return;
- }
+ if (stop_token_.IsStopRequested()) {
+ status_ &= stop_token_.Poll();
+ return;
+ }
if (status_.ok()) {
- status_ &= std::move(task)();
+ status_ &= std::move(task)();
}
}
Status current_status() override { return status_; }
- bool ok() const override { return status_.ok(); }
+ bool ok() const override { return status_.ok(); }
Status Finish() override {
if (!finished_) {
@@ -61,11 +61,11 @@ class SerialTaskGroup : public TaskGroup {
return status_;
}
- Future<> FinishAsync() override { return Future<>::MakeFinished(Finish()); }
-
+ Future<> FinishAsync() override { return Future<>::MakeFinished(Finish()); }
+
int parallelism() override { return 1; }
- StopToken stop_token_;
+ StopToken stop_token_;
Status status_;
bool finished_ = false;
};
@@ -75,11 +75,11 @@ class SerialTaskGroup : public TaskGroup {
class ThreadedTaskGroup : public TaskGroup {
public:
- ThreadedTaskGroup(Executor* executor, StopToken stop_token)
- : executor_(executor),
- stop_token_(std::move(stop_token)),
- nremaining_(0),
- ok_(true) {}
+ ThreadedTaskGroup(Executor* executor, StopToken stop_token)
+ : executor_(executor),
+ stop_token_(std::move(stop_token)),
+ nremaining_(0),
+ ok_(true) {}
~ThreadedTaskGroup() override {
// Make sure all pending tasks are finished, so that dangling references
@@ -87,42 +87,42 @@ class ThreadedTaskGroup : public TaskGroup {
ARROW_UNUSED(Finish());
}
- void AppendReal(FnOnce<Status()> task) override {
- DCHECK(!finished_);
- if (stop_token_.IsStopRequested()) {
- UpdateStatus(stop_token_.Poll());
- return;
- }
-
+ void AppendReal(FnOnce<Status()> task) override {
+ DCHECK(!finished_);
+ if (stop_token_.IsStopRequested()) {
+ UpdateStatus(stop_token_.Poll());
+ return;
+ }
+
// The hot path is unlocked thanks to atomics
// Only if an error occurs is the lock taken
if (ok_.load(std::memory_order_acquire)) {
nremaining_.fetch_add(1, std::memory_order_acquire);
auto self = checked_pointer_cast<ThreadedTaskGroup>(shared_from_this());
-
- struct Callable {
- void operator()() {
- if (self_->ok_.load(std::memory_order_acquire)) {
- Status st;
- if (stop_token_.IsStopRequested()) {
- st = stop_token_.Poll();
- } else {
- // XXX what about exceptions?
- st = std::move(task_)();
- }
- self_->UpdateStatus(std::move(st));
- }
- self_->OneTaskDone();
+
+ struct Callable {
+ void operator()() {
+ if (self_->ok_.load(std::memory_order_acquire)) {
+ Status st;
+ if (stop_token_.IsStopRequested()) {
+ st = stop_token_.Poll();
+ } else {
+ // XXX what about exceptions?
+ st = std::move(task_)();
+ }
+ self_->UpdateStatus(std::move(st));
+ }
+ self_->OneTaskDone();
}
-
- std::shared_ptr<ThreadedTaskGroup> self_;
- FnOnce<Status()> task_;
- StopToken stop_token_;
- };
-
- Status st =
- executor_->Spawn(Callable{std::move(self), std::move(task), stop_token_});
+
+ std::shared_ptr<ThreadedTaskGroup> self_;
+ FnOnce<Status()> task_;
+ StopToken stop_token_;
+ };
+
+ Status st =
+ executor_->Spawn(Callable{std::move(self), std::move(task), stop_token_});
UpdateStatus(std::move(st));
}
}
@@ -132,7 +132,7 @@ class ThreadedTaskGroup : public TaskGroup {
return status_;
}
- bool ok() const override { return ok_.load(); }
+ bool ok() const override { return ok_.load(); }
Status Finish() override {
std::unique_lock<std::mutex> lock(mutex_);
@@ -144,20 +144,20 @@ class ThreadedTaskGroup : public TaskGroup {
return status_;
}
- Future<> FinishAsync() override {
+ Future<> FinishAsync() override {
std::lock_guard<std::mutex> lock(mutex_);
- if (!completion_future_.has_value()) {
- if (nremaining_.load() == 0) {
- completion_future_ = Future<>::MakeFinished(status_);
- } else {
- completion_future_ = Future<>::Make();
- }
- }
- return *completion_future_;
+ if (!completion_future_.has_value()) {
+ if (nremaining_.load() == 0) {
+ completion_future_ = Future<>::MakeFinished(status_);
+ } else {
+ completion_future_ = Future<>::Make();
+ }
+ }
+ return *completion_future_;
}
- int parallelism() override { return executor_->GetCapacity(); }
-
+ int parallelism() override { return executor_->GetCapacity(); }
+
protected:
void UpdateStatus(Status&& st) {
// Must be called unlocked, only locks on error
@@ -177,27 +177,27 @@ class ThreadedTaskGroup : public TaskGroup {
// before cv.notify_one() has returned
std::unique_lock<std::mutex> lock(mutex_);
cv_.notify_one();
- if (completion_future_.has_value()) {
- // MarkFinished could be slow. We don't want to call it while we are holding
- // the lock.
- auto& future = *completion_future_;
- const auto finished = completion_future_->is_finished();
- const auto& status = status_;
- // This will be redundant if the user calls Finish and not FinishAsync
- if (!finished && !finished_) {
- finished_ = true;
- lock.unlock();
- future.MarkFinished(status);
- } else {
- lock.unlock();
- }
- }
+ if (completion_future_.has_value()) {
+ // MarkFinished could be slow. We don't want to call it while we are holding
+ // the lock.
+ auto& future = *completion_future_;
+ const auto finished = completion_future_->is_finished();
+ const auto& status = status_;
+ // This will be redundant if the user calls Finish and not FinishAsync
+ if (!finished && !finished_) {
+ finished_ = true;
+ lock.unlock();
+ future.MarkFinished(status);
+ } else {
+ lock.unlock();
+ }
+ }
}
}
// These members are usable unlocked
Executor* executor_;
- StopToken stop_token_;
+ StopToken stop_token_;
std::atomic<int32_t> nremaining_;
std::atomic<bool> ok_;
@@ -206,18 +206,18 @@ class ThreadedTaskGroup : public TaskGroup {
std::condition_variable cv_;
Status status_;
bool finished_ = false;
- util::optional<Future<>> completion_future_;
+ util::optional<Future<>> completion_future_;
};
-} // namespace
-
-std::shared_ptr<TaskGroup> TaskGroup::MakeSerial(StopToken stop_token) {
- return std::shared_ptr<TaskGroup>(new SerialTaskGroup{stop_token});
+} // namespace
+
+std::shared_ptr<TaskGroup> TaskGroup::MakeSerial(StopToken stop_token) {
+ return std::shared_ptr<TaskGroup>(new SerialTaskGroup{stop_token});
}
-std::shared_ptr<TaskGroup> TaskGroup::MakeThreaded(Executor* thread_pool,
- StopToken stop_token) {
- return std::shared_ptr<TaskGroup>(new ThreadedTaskGroup{thread_pool, stop_token});
+std::shared_ptr<TaskGroup> TaskGroup::MakeThreaded(Executor* thread_pool,
+ StopToken stop_token) {
+ return std::shared_ptr<TaskGroup>(new ThreadedTaskGroup{thread_pool, stop_token});
}
} // namespace internal
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h
index b3692cbcfeb..3bb72f0d9cb 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/task_group.h
@@ -21,9 +21,9 @@
#include <utility>
#include "arrow/status.h"
-#include "arrow/type_fwd.h"
-#include "arrow/util/cancel.h"
-#include "arrow/util/functional.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/cancel.h"
+#include "arrow/util/functional.h"
#include "arrow/util/macros.h"
#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"
@@ -38,18 +38,18 @@ namespace internal {
/// implementation. When Finish() returns, it is guaranteed that all
/// tasks have finished, or at least one has errored.
///
-/// Once an error has occurred, any tasks that are submitted to the task group
-/// will not run. The call to Append will simply return without scheduling the
-/// task.
-///
-/// If the task group is parallel, it is possible that multiple tasks are
-/// running at the same time and one of those tasks fails. This puts the
-/// task group in a failure state (so additional tasks cannot be run); however,
-/// it will not interrupt running tasks. Finish will not complete
-/// until all running tasks have finished, even if one task fails.
-///
-/// Once a task group has finished, new tasks may not be added to it. If you need to
-/// start a new batch of work, create a new task group.
+/// Once an error has occurred, any tasks that are submitted to the task group
+/// will not run. The call to Append will simply return without scheduling the
+/// task.
+///
+/// If the task group is parallel, it is possible that multiple tasks are
+/// running at the same time and one of those tasks fails. This puts the
+/// task group in a failure state (so additional tasks cannot be run); however,
+/// it will not interrupt running tasks. Finish will not complete
+/// until all running tasks have finished, even if one task fails.
+///
+/// Once a task group has finished, new tasks may not be added to it. If you need to
+/// start a new batch of work, create a new task group.
class ARROW_EXPORT TaskGroup : public std::enable_shared_from_this<TaskGroup> {
public:
/// Add a Status-returning function to execute. Execution order is
@@ -65,33 +65,33 @@ class ARROW_EXPORT TaskGroup : public std::enable_shared_from_this<TaskGroup> {
/// task (or subgroup).
virtual Status Finish() = 0;
-  /// Returns a future that will complete the first time all tasks are finished.
-  /// This should be called only after all top-level tasks
-  /// have been added to the task group.
-  ///
-  /// If you are using a TaskGroup asynchronously there are a few considerations to keep
-  /// in mind. The tasks should not block on I/O, etc. (that defeats the purpose of using
-  /// futures) and should not do any nested locking, or you run the risk of tasks
-  /// getting stuck in the thread pool waiting for tasks which cannot get scheduled.
-  ///
-  /// Primarily this call is intended to help migrate existing work written with TaskGroup
-  /// in mind to using futures without having to do a complete conversion on the first
-  /// pass.
-  virtual Future<> FinishAsync() = 0;
-
+  /// Returns a future that will complete the first time all tasks are finished.
+  /// This should be called only after all top-level tasks
+  /// have been added to the task group.
+  ///
+  /// If you are using a TaskGroup asynchronously there are a few considerations to keep
+  /// in mind. The tasks should not block on I/O, etc. (that defeats the purpose of using
+  /// futures) and should not do any nested locking, or you run the risk of tasks
+  /// getting stuck in the thread pool waiting for tasks which cannot get scheduled.
+  ///
+  /// Primarily this call is intended to help migrate existing work written with TaskGroup
+  /// in mind to using futures without having to do a complete conversion on the first
+  /// pass.
+  virtual Future<> FinishAsync() = 0;
+
/// The current aggregate error Status. Non-blocking, useful for stopping early.
virtual Status current_status() = 0;
- /// Whether some tasks have already failed. Non-blocking, useful for stopping early.
- virtual bool ok() const = 0;
+ /// Whether some tasks have already failed. Non-blocking, useful for stopping early.
+ virtual bool ok() const = 0;
/// How many tasks can typically be executed in parallel.
/// This is only a hint, useful for testing or debugging.
virtual int parallelism() = 0;
- static std::shared_ptr<TaskGroup> MakeSerial(StopToken = StopToken::Unstoppable());
- static std::shared_ptr<TaskGroup> MakeThreaded(internal::Executor*,
- StopToken = StopToken::Unstoppable());
+ static std::shared_ptr<TaskGroup> MakeSerial(StopToken = StopToken::Unstoppable());
+ static std::shared_ptr<TaskGroup> MakeThreaded(internal::Executor*,
+ StopToken = StopToken::Unstoppable());
virtual ~TaskGroup() = default;
@@ -99,7 +99,7 @@ class ARROW_EXPORT TaskGroup : public std::enable_shared_from_this<TaskGroup> {
TaskGroup() = default;
ARROW_DISALLOW_COPY_AND_ASSIGN(TaskGroup);
- virtual void AppendReal(FnOnce<Status()> task) = 0;
+ virtual void AppendReal(FnOnce<Status()> task) = 0;
};
} // namespace internal
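A sketch of that contract in use. It assumes the templated TaskGroup::Append wrapper (which forwards to AppendReal) and the process-wide arrow::internal::GetCpuThreadPool() accessor from thread_pool.h; neither appears in this diff, so treat both as assumptions:

#include <iostream>

#include "arrow/status.h"
#include "arrow/util/task_group.h"
#include "arrow/util/thread_pool.h"

int main() {
  // a threaded group runs each task on the given executor
  auto group = arrow::internal::TaskGroup::MakeThreaded(
      arrow::internal::GetCpuThreadPool());

  for (int i = 0; i < 8; ++i) {
    // tasks appended after a failure are silently dropped (see above)
    group->Append([i]() -> arrow::Status {
      return i == 5 ? arrow::Status::Invalid("task ", i, " failed")
                    : arrow::Status::OK();
    });
  }

  // blocks until all running tasks are done, then reports the aggregate status
  std::cout << group->Finish().ToString() << "\n";
  return 0;
}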
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc
index 93527f0c1f7..99b771ca0f2 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.cc
@@ -1,417 +1,417 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "arrow/util/tdigest.h"
-
-#include <algorithm>
-#include <cmath>
-#include <iostream>
-#include <limits>
-#include <queue>
-#include <tuple>
-#include <vector>
-
-#include "arrow/status.h"
-
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-namespace arrow {
-namespace internal {
-
-namespace {
-
-// a numerically stable lerp is unbelievably complex
-// but we are *approximating* the quantile, so let's keep it simple
-double Lerp(double a, double b, double t) { return a + t * (b - a); }
-
-// histogram bin
-struct Centroid {
- double mean;
- double weight; // # data points in this bin
-
- // merge with another centroid
- void Merge(const Centroid& centroid) {
- weight += centroid.weight;
- mean += (centroid.mean - mean) * centroid.weight / weight;
- }
-};
-
-// scale function K0: linear function, as baseline
-struct ScalerK0 {
- explicit ScalerK0(uint32_t delta) : delta_norm(delta / 2.0) {}
-
- double K(double q) const { return delta_norm * q; }
- double Q(double k) const { return k / delta_norm; }
-
- const double delta_norm;
-};
-
-// scale function K1
-struct ScalerK1 {
- explicit ScalerK1(uint32_t delta) : delta_norm(delta / (2.0 * M_PI)) {}
-
- double K(double q) const { return delta_norm * std::asin(2 * q - 1); }
- double Q(double k) const { return (std::sin(k / delta_norm) + 1) / 2; }
-
- const double delta_norm;
-};
-
-// implements t-digest merging algorithm
-template <class T = ScalerK1>
-class TDigestMerger : private T {
- public:
- explicit TDigestMerger(uint32_t delta) : T(delta) { Reset(0, nullptr); }
-
- void Reset(double total_weight, std::vector<Centroid>* tdigest) {
- total_weight_ = total_weight;
- tdigest_ = tdigest;
- if (tdigest_) {
- tdigest_->resize(0);
- }
- weight_so_far_ = 0;
- weight_limit_ = -1; // trigger first centroid merge
- }
-
- // merge one centroid from a sorted centroid stream
- void Add(const Centroid& centroid) {
- auto& td = *tdigest_;
- const double weight = weight_so_far_ + centroid.weight;
- if (weight <= weight_limit_) {
- td.back().Merge(centroid);
- } else {
- const double quantile = weight_so_far_ / total_weight_;
- const double next_weight_limit = total_weight_ * this->Q(this->K(quantile) + 1);
- // weight limit should be strictly increasing, until the last centroid
- if (next_weight_limit <= weight_limit_) {
- weight_limit_ = total_weight_;
- } else {
- weight_limit_ = next_weight_limit;
- }
- td.push_back(centroid); // should never exceed capacity and trigger reallocation
- }
- weight_so_far_ = weight;
- }
-
- // validate k-size of a tdigest
- Status Validate(const std::vector<Centroid>& tdigest, double total_weight) const {
- double q_prev = 0, k_prev = this->K(0);
- for (size_t i = 0; i < tdigest.size(); ++i) {
- const double q = q_prev + tdigest[i].weight / total_weight;
- const double k = this->K(q);
- if (tdigest[i].weight != 1 && (k - k_prev) > 1.001) {
- return Status::Invalid("oversized centroid: ", k - k_prev);
- }
- k_prev = k;
- q_prev = q;
- }
- return Status::OK();
- }
-
- private:
- double total_weight_; // total weight of this tdigest
-  double weight_so_far_;  // accumulated weight up to the current bin
- double weight_limit_; // max accumulated weight to move to next bin
- std::vector<Centroid>* tdigest_;
-};
-
-} // namespace
-
-class TDigest::TDigestImpl {
- public:
- explicit TDigestImpl(uint32_t delta)
- : delta_(delta > 10 ? delta : 10), merger_(delta_) {
- tdigests_[0].reserve(delta_);
- tdigests_[1].reserve(delta_);
- Reset();
- }
-
- void Reset() {
- tdigests_[0].resize(0);
- tdigests_[1].resize(0);
- current_ = 0;
- total_weight_ = 0;
- min_ = std::numeric_limits<double>::max();
- max_ = std::numeric_limits<double>::lowest();
- merger_.Reset(0, nullptr);
- }
-
- Status Validate() const {
- // check weight, centroid order
- double total_weight = 0, prev_mean = std::numeric_limits<double>::lowest();
- for (const auto& centroid : tdigests_[current_]) {
- if (std::isnan(centroid.mean) || std::isnan(centroid.weight)) {
- return Status::Invalid("NAN found in tdigest");
- }
- if (centroid.mean < prev_mean) {
- return Status::Invalid("centroid mean decreases");
- }
- if (centroid.weight < 1) {
- return Status::Invalid("invalid centroid weight");
- }
- prev_mean = centroid.mean;
- total_weight += centroid.weight;
- }
- if (total_weight != total_weight_) {
- return Status::Invalid("tdigest total weight mismatch");
- }
- // check if buffer expanded
- if (tdigests_[0].capacity() > delta_ || tdigests_[1].capacity() > delta_) {
- return Status::Invalid("oversized tdigest buffer");
- }
- // check k-size
- return merger_.Validate(tdigests_[current_], total_weight_);
- }
-
- void Dump() const {
- const auto& td = tdigests_[current_];
- for (size_t i = 0; i < td.size(); ++i) {
- std::cerr << i << ": mean = " << td[i].mean << ", weight = " << td[i].weight
- << std::endl;
- }
- std::cerr << "min = " << min_ << ", max = " << max_ << std::endl;
- }
-
- // merge with other tdigests
- void Merge(const std::vector<const TDigestImpl*>& tdigest_impls) {
- // current and end iterator
- using CentroidIter = std::vector<Centroid>::const_iterator;
- using CentroidIterPair = std::pair<CentroidIter, CentroidIter>;
- // use a min-heap to find next minimal centroid from all tdigests
- auto centroid_gt = [](const CentroidIterPair& lhs, const CentroidIterPair& rhs) {
- return lhs.first->mean > rhs.first->mean;
- };
- using CentroidQueue =
- std::priority_queue<CentroidIterPair, std::vector<CentroidIterPair>,
- decltype(centroid_gt)>;
-
- // trivial dynamic memory allocated at runtime
- std::vector<CentroidIterPair> queue_buffer;
- queue_buffer.reserve(tdigest_impls.size() + 1);
- CentroidQueue queue(std::move(centroid_gt), std::move(queue_buffer));
-
- const auto& this_tdigest = tdigests_[current_];
- if (this_tdigest.size() > 0) {
- queue.emplace(this_tdigest.cbegin(), this_tdigest.cend());
- }
- for (const TDigestImpl* td : tdigest_impls) {
- const auto& other_tdigest = td->tdigests_[td->current_];
- if (other_tdigest.size() > 0) {
- queue.emplace(other_tdigest.cbegin(), other_tdigest.cend());
- total_weight_ += td->total_weight_;
- min_ = std::min(min_, td->min_);
- max_ = std::max(max_, td->max_);
- }
- }
-
- merger_.Reset(total_weight_, &tdigests_[1 - current_]);
- CentroidIter current_iter, end_iter;
-    // do k-way merge until only one buffer is left
- while (queue.size() > 1) {
- std::tie(current_iter, end_iter) = queue.top();
- merger_.Add(*current_iter);
- queue.pop();
- if (++current_iter != end_iter) {
- queue.emplace(current_iter, end_iter);
- }
- }
- // merge last buffer
- if (!queue.empty()) {
- std::tie(current_iter, end_iter) = queue.top();
- while (current_iter != end_iter) {
- merger_.Add(*current_iter++);
- }
- }
- merger_.Reset(0, nullptr);
-
- current_ = 1 - current_;
- }
-
- // merge input data with current tdigest
- void MergeInput(std::vector<double>& input) {
- total_weight_ += input.size();
-
- std::sort(input.begin(), input.end());
- min_ = std::min(min_, input.front());
- max_ = std::max(max_, input.back());
-
- // pick next minimal centroid from input and tdigest, feed to merger
- merger_.Reset(total_weight_, &tdigests_[1 - current_]);
- const auto& td = tdigests_[current_];
- uint32_t tdigest_index = 0, input_index = 0;
- while (tdigest_index < td.size() && input_index < input.size()) {
- if (td[tdigest_index].mean < input[input_index]) {
- merger_.Add(td[tdigest_index++]);
- } else {
- merger_.Add(Centroid{input[input_index++], 1});
- }
- }
- while (tdigest_index < td.size()) {
- merger_.Add(td[tdigest_index++]);
- }
- while (input_index < input.size()) {
- merger_.Add(Centroid{input[input_index++], 1});
- }
- merger_.Reset(0, nullptr);
-
- input.resize(0);
- current_ = 1 - current_;
- }
-
- double Quantile(double q) const {
- const auto& td = tdigests_[current_];
-
- if (q < 0 || q > 1 || td.size() == 0) {
- return NAN;
- }
-
- const double index = q * total_weight_;
- if (index <= 1) {
- return min_;
- } else if (index >= total_weight_ - 1) {
- return max_;
- }
-
-    // find the centroid that contains the index
- uint32_t ci = 0;
- double weight_sum = 0;
- for (; ci < td.size(); ++ci) {
- weight_sum += td[ci].weight;
- if (index <= weight_sum) {
- break;
- }
- }
- DCHECK_LT(ci, td.size());
-
- // deviation of index from the centroid center
- double diff = index + td[ci].weight / 2 - weight_sum;
-
-    // index happens to fall in a unit-weight centroid
- if (td[ci].weight == 1 && std::abs(diff) < 0.5) {
- return td[ci].mean;
- }
-
- // find adjacent centroids for interpolation
- uint32_t ci_left = ci, ci_right = ci;
- if (diff > 0) {
- if (ci_right == td.size() - 1) {
- // index larger than center of last bin
- DCHECK_EQ(weight_sum, total_weight_);
- const Centroid* c = &td[ci_right];
- DCHECK_GE(c->weight, 2);
- return Lerp(c->mean, max_, diff / (c->weight / 2));
- }
- ++ci_right;
- } else {
- if (ci_left == 0) {
- // index smaller than center of first bin
- const Centroid* c = &td[0];
- DCHECK_GE(c->weight, 2);
- return Lerp(min_, c->mean, index / (c->weight / 2));
- }
- --ci_left;
- diff += td[ci_left].weight / 2 + td[ci_right].weight / 2;
- }
-
- // interpolate from adjacent centroids
- diff /= (td[ci_left].weight / 2 + td[ci_right].weight / 2);
- return Lerp(td[ci_left].mean, td[ci_right].mean, diff);
- }
-
- double Mean() const {
- double sum = 0;
- for (const auto& centroid : tdigests_[current_]) {
- sum += centroid.mean * centroid.weight;
- }
- return total_weight_ == 0 ? NAN : sum / total_weight_;
- }
-
- double total_weight() const { return total_weight_; }
-
- private:
-  // must be declared before merger_, see constructor initialization list
- const uint32_t delta_;
-
- TDigestMerger<> merger_;
- double total_weight_;
- double min_, max_;
-
- // ping-pong buffer holds two tdigests, size = 2 * delta * sizeof(Centroid)
- std::vector<Centroid> tdigests_[2];
- // index of active tdigest buffer, 0 or 1
- int current_;
-};
-
-TDigest::TDigest(uint32_t delta, uint32_t buffer_size) : impl_(new TDigestImpl(delta)) {
- input_.reserve(buffer_size);
- Reset();
-}
-
-TDigest::~TDigest() = default;
-TDigest::TDigest(TDigest&&) = default;
-TDigest& TDigest::operator=(TDigest&&) = default;
-
-void TDigest::Reset() {
- input_.resize(0);
- impl_->Reset();
-}
-
-Status TDigest::Validate() {
- MergeInput();
- return impl_->Validate();
-}
-
-void TDigest::Dump() {
- MergeInput();
- impl_->Dump();
-}
-
-void TDigest::Merge(std::vector<TDigest>* tdigests) {
- MergeInput();
-
- std::vector<const TDigestImpl*> tdigest_impls;
- tdigest_impls.reserve(tdigests->size());
- for (auto& td : *tdigests) {
- td.MergeInput();
- tdigest_impls.push_back(td.impl_.get());
- }
- impl_->Merge(tdigest_impls);
-}
-
-double TDigest::Quantile(double q) {
- MergeInput();
- return impl_->Quantile(q);
-}
-
-double TDigest::Mean() {
- MergeInput();
- return impl_->Mean();
-}
-
-bool TDigest::is_empty() const {
- return input_.size() == 0 && impl_->total_weight() == 0;
-}
-
-void TDigest::MergeInput() {
- if (input_.size() > 0) {
- impl_->MergeInput(input_); // will mutate input_
- }
-}
-
-} // namespace internal
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/tdigest.h"
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <limits>
+#include <queue>
+#include <tuple>
+#include <vector>
+
+#include "arrow/status.h"
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+namespace arrow {
+namespace internal {
+
+namespace {
+
+// a numerically stable lerp is unbelievably complex
+// but we are *approximating* the quantile, so let's keep it simple
+double Lerp(double a, double b, double t) { return a + t * (b - a); }
+
+// histogram bin
+struct Centroid {
+ double mean;
+ double weight; // # data points in this bin
+
+ // merge with another centroid
+ void Merge(const Centroid& centroid) {
+ weight += centroid.weight;
+ mean += (centroid.mean - mean) * centroid.weight / weight;
+ }
+};
+
+// scale function K0: linear function, as baseline
+struct ScalerK0 {
+ explicit ScalerK0(uint32_t delta) : delta_norm(delta / 2.0) {}
+
+ double K(double q) const { return delta_norm * q; }
+ double Q(double k) const { return k / delta_norm; }
+
+ const double delta_norm;
+};
+
+// scale function K1
+struct ScalerK1 {
+ explicit ScalerK1(uint32_t delta) : delta_norm(delta / (2.0 * M_PI)) {}
+
+ double K(double q) const { return delta_norm * std::asin(2 * q - 1); }
+ double Q(double k) const { return (std::sin(k / delta_norm) + 1) / 2; }
+
+ const double delta_norm;
+};
+
+// implements t-digest merging algorithm
+template <class T = ScalerK1>
+class TDigestMerger : private T {
+ public:
+ explicit TDigestMerger(uint32_t delta) : T(delta) { Reset(0, nullptr); }
+
+ void Reset(double total_weight, std::vector<Centroid>* tdigest) {
+ total_weight_ = total_weight;
+ tdigest_ = tdigest;
+ if (tdigest_) {
+ tdigest_->resize(0);
+ }
+ weight_so_far_ = 0;
+ weight_limit_ = -1; // trigger first centroid merge
+ }
+
+ // merge one centroid from a sorted centroid stream
+ void Add(const Centroid& centroid) {
+ auto& td = *tdigest_;
+ const double weight = weight_so_far_ + centroid.weight;
+ if (weight <= weight_limit_) {
+ td.back().Merge(centroid);
+ } else {
+ const double quantile = weight_so_far_ / total_weight_;
+ const double next_weight_limit = total_weight_ * this->Q(this->K(quantile) + 1);
+ // weight limit should be strictly increasing, until the last centroid
+ if (next_weight_limit <= weight_limit_) {
+ weight_limit_ = total_weight_;
+ } else {
+ weight_limit_ = next_weight_limit;
+ }
+ td.push_back(centroid); // should never exceed capacity and trigger reallocation
+ }
+ weight_so_far_ = weight;
+ }
+
+ // validate k-size of a tdigest
+ Status Validate(const std::vector<Centroid>& tdigest, double total_weight) const {
+ double q_prev = 0, k_prev = this->K(0);
+ for (size_t i = 0; i < tdigest.size(); ++i) {
+ const double q = q_prev + tdigest[i].weight / total_weight;
+ const double k = this->K(q);
+ if (tdigest[i].weight != 1 && (k - k_prev) > 1.001) {
+ return Status::Invalid("oversized centroid: ", k - k_prev);
+ }
+ k_prev = k;
+ q_prev = q;
+ }
+ return Status::OK();
+ }
+
+ private:
+ double total_weight_; // total weight of this tdigest
+  double weight_so_far_;  // accumulated weight up to the current bin
+ double weight_limit_; // max accumulated weight to move to next bin
+ std::vector<Centroid>* tdigest_;
+};
+
+} // namespace
+
+class TDigest::TDigestImpl {
+ public:
+ explicit TDigestImpl(uint32_t delta)
+ : delta_(delta > 10 ? delta : 10), merger_(delta_) {
+ tdigests_[0].reserve(delta_);
+ tdigests_[1].reserve(delta_);
+ Reset();
+ }
+
+ void Reset() {
+ tdigests_[0].resize(0);
+ tdigests_[1].resize(0);
+ current_ = 0;
+ total_weight_ = 0;
+ min_ = std::numeric_limits<double>::max();
+ max_ = std::numeric_limits<double>::lowest();
+ merger_.Reset(0, nullptr);
+ }
+
+ Status Validate() const {
+ // check weight, centroid order
+ double total_weight = 0, prev_mean = std::numeric_limits<double>::lowest();
+ for (const auto& centroid : tdigests_[current_]) {
+ if (std::isnan(centroid.mean) || std::isnan(centroid.weight)) {
+ return Status::Invalid("NAN found in tdigest");
+ }
+ if (centroid.mean < prev_mean) {
+ return Status::Invalid("centroid mean decreases");
+ }
+ if (centroid.weight < 1) {
+ return Status::Invalid("invalid centroid weight");
+ }
+ prev_mean = centroid.mean;
+ total_weight += centroid.weight;
+ }
+ if (total_weight != total_weight_) {
+ return Status::Invalid("tdigest total weight mismatch");
+ }
+ // check if buffer expanded
+ if (tdigests_[0].capacity() > delta_ || tdigests_[1].capacity() > delta_) {
+ return Status::Invalid("oversized tdigest buffer");
+ }
+ // check k-size
+ return merger_.Validate(tdigests_[current_], total_weight_);
+ }
+
+ void Dump() const {
+ const auto& td = tdigests_[current_];
+ for (size_t i = 0; i < td.size(); ++i) {
+ std::cerr << i << ": mean = " << td[i].mean << ", weight = " << td[i].weight
+ << std::endl;
+ }
+ std::cerr << "min = " << min_ << ", max = " << max_ << std::endl;
+ }
+
+ // merge with other tdigests
+ void Merge(const std::vector<const TDigestImpl*>& tdigest_impls) {
+ // current and end iterator
+ using CentroidIter = std::vector<Centroid>::const_iterator;
+ using CentroidIterPair = std::pair<CentroidIter, CentroidIter>;
+ // use a min-heap to find next minimal centroid from all tdigests
+ auto centroid_gt = [](const CentroidIterPair& lhs, const CentroidIterPair& rhs) {
+ return lhs.first->mean > rhs.first->mean;
+ };
+ using CentroidQueue =
+ std::priority_queue<CentroidIterPair, std::vector<CentroidIterPair>,
+ decltype(centroid_gt)>;
+
+ // trivial dynamic memory allocated at runtime
+ std::vector<CentroidIterPair> queue_buffer;
+ queue_buffer.reserve(tdigest_impls.size() + 1);
+ CentroidQueue queue(std::move(centroid_gt), std::move(queue_buffer));
+
+ const auto& this_tdigest = tdigests_[current_];
+ if (this_tdigest.size() > 0) {
+ queue.emplace(this_tdigest.cbegin(), this_tdigest.cend());
+ }
+ for (const TDigestImpl* td : tdigest_impls) {
+ const auto& other_tdigest = td->tdigests_[td->current_];
+ if (other_tdigest.size() > 0) {
+ queue.emplace(other_tdigest.cbegin(), other_tdigest.cend());
+ total_weight_ += td->total_weight_;
+ min_ = std::min(min_, td->min_);
+ max_ = std::max(max_, td->max_);
+ }
+ }
+
+ merger_.Reset(total_weight_, &tdigests_[1 - current_]);
+ CentroidIter current_iter, end_iter;
+    // do k-way merge until only one buffer is left
+ while (queue.size() > 1) {
+ std::tie(current_iter, end_iter) = queue.top();
+ merger_.Add(*current_iter);
+ queue.pop();
+ if (++current_iter != end_iter) {
+ queue.emplace(current_iter, end_iter);
+ }
+ }
+ // merge last buffer
+ if (!queue.empty()) {
+ std::tie(current_iter, end_iter) = queue.top();
+ while (current_iter != end_iter) {
+ merger_.Add(*current_iter++);
+ }
+ }
+ merger_.Reset(0, nullptr);
+
+ current_ = 1 - current_;
+ }
+
+ // merge input data with current tdigest
+ void MergeInput(std::vector<double>& input) {
+ total_weight_ += input.size();
+
+ std::sort(input.begin(), input.end());
+ min_ = std::min(min_, input.front());
+ max_ = std::max(max_, input.back());
+
+ // pick next minimal centroid from input and tdigest, feed to merger
+ merger_.Reset(total_weight_, &tdigests_[1 - current_]);
+ const auto& td = tdigests_[current_];
+ uint32_t tdigest_index = 0, input_index = 0;
+ while (tdigest_index < td.size() && input_index < input.size()) {
+ if (td[tdigest_index].mean < input[input_index]) {
+ merger_.Add(td[tdigest_index++]);
+ } else {
+ merger_.Add(Centroid{input[input_index++], 1});
+ }
+ }
+ while (tdigest_index < td.size()) {
+ merger_.Add(td[tdigest_index++]);
+ }
+ while (input_index < input.size()) {
+ merger_.Add(Centroid{input[input_index++], 1});
+ }
+ merger_.Reset(0, nullptr);
+
+ input.resize(0);
+ current_ = 1 - current_;
+ }
+
+ double Quantile(double q) const {
+ const auto& td = tdigests_[current_];
+
+ if (q < 0 || q > 1 || td.size() == 0) {
+ return NAN;
+ }
+
+ const double index = q * total_weight_;
+ if (index <= 1) {
+ return min_;
+ } else if (index >= total_weight_ - 1) {
+ return max_;
+ }
+
+    // find the centroid that contains the index
+ uint32_t ci = 0;
+ double weight_sum = 0;
+ for (; ci < td.size(); ++ci) {
+ weight_sum += td[ci].weight;
+ if (index <= weight_sum) {
+ break;
+ }
+ }
+ DCHECK_LT(ci, td.size());
+
+ // deviation of index from the centroid center
+ double diff = index + td[ci].weight / 2 - weight_sum;
+
+    // index happens to fall in a unit-weight centroid
+ if (td[ci].weight == 1 && std::abs(diff) < 0.5) {
+ return td[ci].mean;
+ }
+
+ // find adjacent centroids for interpolation
+ uint32_t ci_left = ci, ci_right = ci;
+ if (diff > 0) {
+ if (ci_right == td.size() - 1) {
+ // index larger than center of last bin
+ DCHECK_EQ(weight_sum, total_weight_);
+ const Centroid* c = &td[ci_right];
+ DCHECK_GE(c->weight, 2);
+ return Lerp(c->mean, max_, diff / (c->weight / 2));
+ }
+ ++ci_right;
+ } else {
+ if (ci_left == 0) {
+ // index smaller than center of first bin
+ const Centroid* c = &td[0];
+ DCHECK_GE(c->weight, 2);
+ return Lerp(min_, c->mean, index / (c->weight / 2));
+ }
+ --ci_left;
+ diff += td[ci_left].weight / 2 + td[ci_right].weight / 2;
+ }
+
+ // interpolate from adjacent centroids
+ diff /= (td[ci_left].weight / 2 + td[ci_right].weight / 2);
+ return Lerp(td[ci_left].mean, td[ci_right].mean, diff);
+ }
+
+ double Mean() const {
+ double sum = 0;
+ for (const auto& centroid : tdigests_[current_]) {
+ sum += centroid.mean * centroid.weight;
+ }
+ return total_weight_ == 0 ? NAN : sum / total_weight_;
+ }
+
+ double total_weight() const { return total_weight_; }
+
+ private:
+  // must be declared before merger_, see constructor initialization list
+ const uint32_t delta_;
+
+ TDigestMerger<> merger_;
+ double total_weight_;
+ double min_, max_;
+
+ // ping-pong buffer holds two tdigests, size = 2 * delta * sizeof(Centroid)
+ std::vector<Centroid> tdigests_[2];
+ // index of active tdigest buffer, 0 or 1
+ int current_;
+};
+
+TDigest::TDigest(uint32_t delta, uint32_t buffer_size) : impl_(new TDigestImpl(delta)) {
+ input_.reserve(buffer_size);
+ Reset();
+}
+
+TDigest::~TDigest() = default;
+TDigest::TDigest(TDigest&&) = default;
+TDigest& TDigest::operator=(TDigest&&) = default;
+
+void TDigest::Reset() {
+ input_.resize(0);
+ impl_->Reset();
+}
+
+Status TDigest::Validate() {
+ MergeInput();
+ return impl_->Validate();
+}
+
+void TDigest::Dump() {
+ MergeInput();
+ impl_->Dump();
+}
+
+void TDigest::Merge(std::vector<TDigest>* tdigests) {
+ MergeInput();
+
+ std::vector<const TDigestImpl*> tdigest_impls;
+ tdigest_impls.reserve(tdigests->size());
+ for (auto& td : *tdigests) {
+ td.MergeInput();
+ tdigest_impls.push_back(td.impl_.get());
+ }
+ impl_->Merge(tdigest_impls);
+}
+
+double TDigest::Quantile(double q) {
+ MergeInput();
+ return impl_->Quantile(q);
+}
+
+double TDigest::Mean() {
+ MergeInput();
+ return impl_->Mean();
+}
+
+bool TDigest::is_empty() const {
+ return input_.size() == 0 && impl_->total_weight() == 0;
+}
+
+void TDigest::MergeInput() {
+ if (input_.size() > 0) {
+ impl_->MergeInput(input_); // will mutate input_
+ }
+}
+
+} // namespace internal
+} // namespace arrow
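To see why the K1 scaler yields small bins at the tails and large ones near the median, here is a tiny standalone sketch, independent of Arrow, that mirrors ScalerK1 above with delta = 100 and prints the q-space width of the bin starting at each quantile:

#include <cmath>
#include <cstdio>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

// mirrors ScalerK1 above with delta = 100, so delta_norm = 100 / (2 * pi)
static double K(double q) { return (100 / (2.0 * M_PI)) * std::asin(2 * q - 1); }
static double Q(double k) { return (std::sin(k / (100 / (2.0 * M_PI))) + 1) / 2; }

int main() {
  // a bin starting at quantile q may grow until Q(K(q) + 1), per Add() above
  for (double q : {0.01, 0.25, 0.50, 0.75, 0.99}) {
    std::printf("q = %.2f  bin width = %.5f\n", q, Q(K(q) + 1) - q);
  }
  return 0;
}

With delta = 100 this prints a width of roughly 0.031 at the median and under 0.01 near the tails, which is what keeps tail quantiles accurate.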
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h
index 361d176bff4..ae42ce48e7d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/tdigest.h
@@ -1,103 +1,103 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// approximate quantiles from an arbitrary-length dataset with O(1) space
-// based on 'Computing Extremely Accurate Quantiles Using t-Digests' by Dunning & Ertl
-// - https://arxiv.org/abs/1902.04023
-// - https://github.com/tdunning/t-digest
-
-#pragma once
-
-#include <cmath>
-#include <memory>
-#include <vector>
-
-#include "arrow/util/logging.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-class Status;
-
-namespace internal {
-
-class ARROW_EXPORT TDigest {
- public:
- explicit TDigest(uint32_t delta = 100, uint32_t buffer_size = 500);
- ~TDigest();
- TDigest(TDigest&&);
- TDigest& operator=(TDigest&&);
-
- // reset and re-use this tdigest
- void Reset();
-
- // validate data integrity
- Status Validate();
-
- // dump internal data, only for debug
- void Dump();
-
-  // buffer a single data point; consume the internal buffer if it is full
-  // this function is called intensively and is performance-critical
-  // call it only if you are sure no NAN exists in the input data
- void Add(double value) {
- DCHECK(!std::isnan(value)) << "cannot add NAN";
- if (ARROW_PREDICT_FALSE(input_.size() == input_.capacity())) {
- MergeInput();
- }
- input_.push_back(value);
- }
-
- // skip NAN on adding
- template <typename T>
- typename std::enable_if<std::is_floating_point<T>::value>::type NanAdd(T value) {
- if (!std::isnan(value)) Add(value);
- }
-
- template <typename T>
- typename std::enable_if<std::is_integral<T>::value>::type NanAdd(T value) {
- Add(static_cast<double>(value));
- }
-
- // merge with other t-digests, called infrequently
- void Merge(std::vector<TDigest>* tdigests);
-
- // calculate quantile
- double Quantile(double q);
-
- double Min() { return Quantile(0); }
- double Max() { return Quantile(1); }
- double Mean();
-
- // check if this tdigest contains no valid data points
- bool is_empty() const;
-
- private:
- // merge input data with current tdigest
- void MergeInput();
-
- // input buffer, size = buffer_size * sizeof(double)
- std::vector<double> input_;
-
- // hide other members with pimpl
- class TDigestImpl;
- std::unique_ptr<TDigestImpl> impl_;
-};
-
-} // namespace internal
-} // namespace arrow
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// approximate quantiles from an arbitrary-length dataset with O(1) space
+// based on 'Computing Extremely Accurate Quantiles Using t-Digests' by Dunning & Ertl
+// - https://arxiv.org/abs/1902.04023
+// - https://github.com/tdunning/t-digest
+
+#pragma once
+
+#include <cmath>
+#include <memory>
+#include <vector>
+
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Status;
+
+namespace internal {
+
+class ARROW_EXPORT TDigest {
+ public:
+ explicit TDigest(uint32_t delta = 100, uint32_t buffer_size = 500);
+ ~TDigest();
+ TDigest(TDigest&&);
+ TDigest& operator=(TDigest&&);
+
+ // reset and re-use this tdigest
+ void Reset();
+
+ // validate data integrity
+ Status Validate();
+
+ // dump internal data, only for debug
+ void Dump();
+
+  // buffer a single data point; consume the internal buffer if it is full
+  // this function is called intensively and is performance-critical
+  // call it only if you are sure no NAN exists in the input data
+ void Add(double value) {
+ DCHECK(!std::isnan(value)) << "cannot add NAN";
+ if (ARROW_PREDICT_FALSE(input_.size() == input_.capacity())) {
+ MergeInput();
+ }
+ input_.push_back(value);
+ }
+
+ // skip NAN on adding
+ template <typename T>
+ typename std::enable_if<std::is_floating_point<T>::value>::type NanAdd(T value) {
+ if (!std::isnan(value)) Add(value);
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_integral<T>::value>::type NanAdd(T value) {
+ Add(static_cast<double>(value));
+ }
+
+ // merge with other t-digests, called infrequently
+ void Merge(std::vector<TDigest>* tdigests);
+
+ // calculate quantile
+ double Quantile(double q);
+
+ double Min() { return Quantile(0); }
+ double Max() { return Quantile(1); }
+ double Mean();
+
+ // check if this tdigest contains no valid data points
+ bool is_empty() const;
+
+ private:
+ // merge input data with current tdigest
+ void MergeInput();
+
+ // input buffer, size = buffer_size * sizeof(double)
+ std::vector<double> input_;
+
+ // hide other members with pimpl
+ class TDigestImpl;
+ std::unique_ptr<TDigestImpl> impl_;
+};
+
+} // namespace internal
+} // namespace arrow
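Finally, a minimal end-to-end sketch of this interface — a hypothetical standalone program; the expected outputs assume uniform input, and results are approximate by design:

#include <iostream>

#include "arrow/util/tdigest.h"

int main() {
  arrow::internal::TDigest td;  // delta = 100, buffer_size = 500
  for (int i = 1; i <= 10000; ++i) {
    td.Add(static_cast<double>(i));  // caller guarantees no NAN
  }
  // approximate percentiles of 1..10000
  std::cout << "p50 ~ " << td.Quantile(0.5) << "\n";   // close to 5000
  std::cout << "p99 ~ " << td.Quantile(0.99) << "\n";  // close to 9900
  std::cout << "min = " << td.Min() << ", max = " << td.Max() << "\n";
  return 0;
}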
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc
index ee480d0dec9..758295d01ed 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.cc
@@ -32,88 +32,88 @@
namespace arrow {
namespace internal {
-Executor::~Executor() = default;
-
-namespace {
-
-struct Task {
- FnOnce<void()> callable;
- StopToken stop_token;
- Executor::StopCallback stop_callback;
-};
-
-} // namespace
-
-struct SerialExecutor::State {
- std::deque<Task> task_queue;
- std::mutex mutex;
- std::condition_variable wait_for_tasks;
- bool finished{false};
-};
-
-SerialExecutor::SerialExecutor() : state_(std::make_shared<State>()) {}
-
-SerialExecutor::~SerialExecutor() = default;
-
-Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce<void()> task,
- StopToken stop_token, StopCallback&& stop_callback) {
- // While the SerialExecutor runs tasks synchronously on its main thread,
- // SpawnReal may be called from external threads (e.g. when transferring back
- // from blocking I/O threads), so we need to keep the state alive *and* to
- // lock its contents.
- //
- // Note that holding the lock while notifying the condition variable may
- // not be sufficient, as some exit paths in the main thread are unlocked.
- auto state = state_;
- {
- std::lock_guard<std::mutex> lk(state->mutex);
- state->task_queue.push_back(
- Task{std::move(task), std::move(stop_token), std::move(stop_callback)});
- }
- state->wait_for_tasks.notify_one();
- return Status::OK();
-}
-
-void SerialExecutor::MarkFinished() {
- // Same comment as SpawnReal above
- auto state = state_;
- {
- std::lock_guard<std::mutex> lk(state->mutex);
- state->finished = true;
- }
- state->wait_for_tasks.notify_one();
-}
-
-void SerialExecutor::RunLoop() {
- // This is called from the SerialExecutor's main thread, so the
- // state is guaranteed to be kept alive.
- std::unique_lock<std::mutex> lk(state_->mutex);
-
- while (!state_->finished) {
- while (!state_->task_queue.empty()) {
- Task task = std::move(state_->task_queue.front());
- state_->task_queue.pop_front();
- lk.unlock();
- if (!task.stop_token.IsStopRequested()) {
- std::move(task.callable)();
- } else {
- if (task.stop_callback) {
- std::move(task.stop_callback)(task.stop_token.Poll());
- }
- // Can't break here because there may be cleanup tasks down the chain we still
- // need to run.
- }
- lk.lock();
- }
- // In this case we must be waiting on work from external (e.g. I/O) executors. Wait
- // for tasks to arrive (typically via transferred futures).
- state_->wait_for_tasks.wait(
- lk, [&] { return state_->finished || !state_->task_queue.empty(); });
- }
-}
-
+Executor::~Executor() = default;
+
+namespace {
+
+struct Task {
+ FnOnce<void()> callable;
+ StopToken stop_token;
+ Executor::StopCallback stop_callback;
+};
+
+} // namespace
+
+struct SerialExecutor::State {
+ std::deque<Task> task_queue;
+ std::mutex mutex;
+ std::condition_variable wait_for_tasks;
+ bool finished{false};
+};
+
+SerialExecutor::SerialExecutor() : state_(std::make_shared<State>()) {}
+
+SerialExecutor::~SerialExecutor() = default;
+
+Status SerialExecutor::SpawnReal(TaskHints hints, FnOnce<void()> task,
+ StopToken stop_token, StopCallback&& stop_callback) {
+ // While the SerialExecutor runs tasks synchronously on its main thread,
+ // SpawnReal may be called from external threads (e.g. when transferring back
+ // from blocking I/O threads), so we need to keep the state alive *and* to
+ // lock its contents.
+ //
+ // Note that holding the lock while notifying the condition variable may
+ // not be sufficient, as some exit paths in the main thread are unlocked.
+ auto state = state_;
+ {
+ std::lock_guard<std::mutex> lk(state->mutex);
+ state->task_queue.push_back(
+ Task{std::move(task), std::move(stop_token), std::move(stop_callback)});
+ }
+ state->wait_for_tasks.notify_one();
+ return Status::OK();
+}
+
+void SerialExecutor::MarkFinished() {
+ // Same comment as SpawnReal above
+ auto state = state_;
+ {
+ std::lock_guard<std::mutex> lk(state->mutex);
+ state->finished = true;
+ }
+ state->wait_for_tasks.notify_one();
+}
+
+void SerialExecutor::RunLoop() {
+ // This is called from the SerialExecutor's main thread, so the
+ // state is guaranteed to be kept alive.
+ std::unique_lock<std::mutex> lk(state_->mutex);
+
+ while (!state_->finished) {
+ while (!state_->task_queue.empty()) {
+ Task task = std::move(state_->task_queue.front());
+ state_->task_queue.pop_front();
+ lk.unlock();
+ if (!task.stop_token.IsStopRequested()) {
+ std::move(task.callable)();
+ } else {
+ if (task.stop_callback) {
+ std::move(task.stop_callback)(task.stop_token.Poll());
+ }
+ // Can't break here because there may be cleanup tasks down the chain we still
+ // need to run.
+ }
+ lk.lock();
+ }
+ // In this case we must be waiting on work from external (e.g. I/O) executors. Wait
+ // for tasks to arrive (typically via transferred futures).
+ state_->wait_for_tasks.wait(
+ lk, [&] { return state_->finished || !state_->task_queue.empty(); });
+ }
+}
+
struct ThreadPool::State {
- State() = default;
+ State() = default;
// NOTE: in case locking becomes too expensive, we can investigate lock-free FIFOs
// such as https://github.com/cameron314/concurrentqueue
@@ -125,17 +125,17 @@ struct ThreadPool::State {
std::list<std::thread> workers_;
// Trashcan for finished threads
std::vector<std::thread> finished_workers_;
- std::deque<Task> pending_tasks_;
+ std::deque<Task> pending_tasks_;
// Desired number of threads
- int desired_capacity_ = 0;
-
- // Total number of tasks that are either queued or running
- int tasks_queued_or_running_ = 0;
-
+ int desired_capacity_ = 0;
+
+ // Total number of tasks that are either queued or running
+ int tasks_queued_or_running_ = 0;
+
// Are we shutting down?
- bool please_shutdown_ = false;
- bool quick_shutdown_ = false;
+ bool please_shutdown_ = false;
+ bool quick_shutdown_ = false;
};
// The worker loop is an independent function so that it can keep running
@@ -165,24 +165,24 @@ static void WorkerLoop(std::shared_ptr<ThreadPool::State> state,
if (should_secede()) {
break;
}
-
- DCHECK_GE(state->tasks_queued_or_running_, 0);
+
+ DCHECK_GE(state->tasks_queued_or_running_, 0);
{
- Task task = std::move(state->pending_tasks_.front());
+ Task task = std::move(state->pending_tasks_.front());
state->pending_tasks_.pop_front();
- StopToken* stop_token = &task.stop_token;
+ StopToken* stop_token = &task.stop_token;
lock.unlock();
- if (!stop_token->IsStopRequested()) {
- std::move(task.callable)();
- } else {
- if (task.stop_callback) {
- std::move(task.stop_callback)(stop_token->Poll());
- }
- }
- ARROW_UNUSED(std::move(task)); // release resources before waiting for lock
- lock.lock();
+ if (!stop_token->IsStopRequested()) {
+ std::move(task.callable)();
+ } else {
+ if (task.stop_callback) {
+ std::move(task.stop_callback)(stop_token->Poll());
+ }
+ }
+ ARROW_UNUSED(std::move(task)); // release resources before waiting for lock
+ lock.lock();
}
- state->tasks_queued_or_running_--;
+ state->tasks_queued_or_running_--;
}
// Now either the queue is empty *or* a quick shutdown was requested
if (state->please_shutdown_ || should_secede()) {
@@ -191,7 +191,7 @@ static void WorkerLoop(std::shared_ptr<ThreadPool::State> state,
// Wait for next wakeup
state->cv_.wait(lock);
}
- DCHECK_GE(state->tasks_queued_or_running_, 0);
+ DCHECK_GE(state->tasks_queued_or_running_, 0);
// We're done. Move our thread object to the trashcan of finished
// workers. This has two motivations:
@@ -262,14 +262,14 @@ Status ThreadPool::SetCapacity(int threads) {
CollectFinishedWorkersUnlocked();
state_->desired_capacity_ = threads;
- // See if we need to increase or decrease the number of running threads
- const int required = std::min(static_cast<int>(state_->pending_tasks_.size()),
- threads - static_cast<int>(state_->workers_.size()));
- if (required > 0) {
- // Some tasks are pending, spawn the number of needed threads immediately
- LaunchWorkersUnlocked(required);
- } else if (required < 0) {
- // Excess threads are running, wake them so that they stop
+ // See if we need to increase or decrease the number of running threads
+ const int required = std::min(static_cast<int>(state_->pending_tasks_.size()),
+ threads - static_cast<int>(state_->workers_.size()));
+ if (required > 0) {
+ // Some tasks are pending, spawn the number of needed threads immediately
+ LaunchWorkersUnlocked(required);
+ } else if (required < 0) {
+ // Excess threads are running, wake them so that they stop
state_->cv_.notify_all();
}
return Status::OK();
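
A worked example of the `required` arithmetic in the hunk above (values are illustrative):

    // pending_tasks_.size() == 5, workers_.size() == 2, threads == 4:
    //   required = min(5, 4 - 2) = 2   -> LaunchWorkersUnlocked(2)
    // pending_tasks_.size() == 5, workers_.size() == 2, threads == 1:
    //   required = min(5, 1 - 2) = -1  -> cv_.notify_all() so excess workers exit
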
@@ -281,12 +281,12 @@ int ThreadPool::GetCapacity() {
return state_->desired_capacity_;
}
-int ThreadPool::GetNumTasks() {
- ProtectAgainstFork();
- std::unique_lock<std::mutex> lock(state_->mutex_);
- return state_->tasks_queued_or_running_;
-}
-
+int ThreadPool::GetNumTasks() {
+ ProtectAgainstFork();
+ std::unique_lock<std::mutex> lock(state_->mutex_);
+ return state_->tasks_queued_or_running_;
+}
+
int ThreadPool::GetActualCapacity() {
ProtectAgainstFork();
std::unique_lock<std::mutex> lock(state_->mutex_);
@@ -321,25 +321,25 @@ void ThreadPool::CollectFinishedWorkersUnlocked() {
state_->finished_workers_.clear();
}
-thread_local ThreadPool* current_thread_pool_ = nullptr;
-
-bool ThreadPool::OwnsThisThread() { return current_thread_pool_ == this; }
-
+thread_local ThreadPool* current_thread_pool_ = nullptr;
+
+bool ThreadPool::OwnsThisThread() { return current_thread_pool_ == this; }
+
void ThreadPool::LaunchWorkersUnlocked(int threads) {
std::shared_ptr<State> state = sp_state_;
for (int i = 0; i < threads; i++) {
state_->workers_.emplace_back();
auto it = --(state_->workers_.end());
- *it = std::thread([this, state, it] {
- current_thread_pool_ = this;
- WorkerLoop(state, it);
- });
+ *it = std::thread([this, state, it] {
+ current_thread_pool_ = this;
+ WorkerLoop(state, it);
+ });
}
}
-Status ThreadPool::SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken stop_token,
- StopCallback&& stop_callback) {
+Status ThreadPool::SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken stop_token,
+ StopCallback&& stop_callback) {
{
ProtectAgainstFork();
std::lock_guard<std::mutex> lock(state_->mutex_);
@@ -347,14 +347,14 @@ Status ThreadPool::SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken sto
return Status::Invalid("operation forbidden during or after shutdown");
}
CollectFinishedWorkersUnlocked();
- state_->tasks_queued_or_running_++;
- if (static_cast<int>(state_->workers_.size()) < state_->tasks_queued_or_running_ &&
- state_->desired_capacity_ > static_cast<int>(state_->workers_.size())) {
-    // We can still spin up more workers, so spin up a new one
- LaunchWorkersUnlocked(/*threads=*/1);
- }
- state_->pending_tasks_.push_back(
- {std::move(task), std::move(stop_token), std::move(stop_callback)});
+ state_->tasks_queued_or_running_++;
+ if (static_cast<int>(state_->workers_.size()) < state_->tasks_queued_or_running_ &&
+ state_->desired_capacity_ > static_cast<int>(state_->workers_.size())) {
+      // We can still spin up more workers, so spin up a new one
+ LaunchWorkersUnlocked(/*threads=*/1);
+ }
+ state_->pending_tasks_.push_back(
+ {std::move(task), std::move(stop_token), std::move(stop_callback)});
}
state_->cv_.notify_one();
return Status::OK();
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h
index 5d866601ab1..9ac8e36a3d8 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/thread_pool.h
@@ -23,14 +23,14 @@
#include <cstdint>
#include <memory>
-#include <queue>
+#include <queue>
#include <type_traits>
#include <utility>
#include "arrow/result.h"
#include "arrow/status.h"
-#include "arrow/util/cancel.h"
-#include "arrow/util/functional.h"
+#include "arrow/util/cancel.h"
+#include "arrow/util/functional.h"
#include "arrow/util/future.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
@@ -76,229 +76,229 @@ struct TaskHints {
class ARROW_EXPORT Executor {
public:
- using StopCallback = internal::FnOnce<void(const Status&)>;
-
+ using StopCallback = internal::FnOnce<void(const Status&)>;
+
virtual ~Executor();
// Spawn a fire-and-forget task.
template <typename Function>
Status Spawn(Function&& func) {
- return SpawnReal(TaskHints{}, std::forward<Function>(func), StopToken::Unstoppable(),
- StopCallback{});
+ return SpawnReal(TaskHints{}, std::forward<Function>(func), StopToken::Unstoppable(),
+ StopCallback{});
+ }
+ template <typename Function>
+ Status Spawn(Function&& func, StopToken stop_token) {
+ return SpawnReal(TaskHints{}, std::forward<Function>(func), std::move(stop_token),
+ StopCallback{});
}
template <typename Function>
- Status Spawn(Function&& func, StopToken stop_token) {
- return SpawnReal(TaskHints{}, std::forward<Function>(func), std::move(stop_token),
- StopCallback{});
- }
- template <typename Function>
Status Spawn(TaskHints hints, Function&& func) {
- return SpawnReal(hints, std::forward<Function>(func), StopToken::Unstoppable(),
- StopCallback{});
+ return SpawnReal(hints, std::forward<Function>(func), StopToken::Unstoppable(),
+ StopCallback{});
+ }
+ template <typename Function>
+ Status Spawn(TaskHints hints, Function&& func, StopToken stop_token) {
+ return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
+ StopCallback{});
+ }
+ template <typename Function>
+ Status Spawn(TaskHints hints, Function&& func, StopToken stop_token,
+ StopCallback stop_callback) {
+ return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
+ std::move(stop_callback));
+ }
+
+ // Transfers a future to this executor. Any continuations added to the
+ // returned future will run in this executor. Otherwise they would run
+ // on the same thread that called MarkFinished.
+ //
+ // This is necessary when (for example) an I/O task is completing a future.
+ // The continuations of that future should run on the CPU thread pool keeping
+ // CPU heavy work off the I/O thread pool. So the I/O task should transfer
+ // the future to the CPU executor before returning.
+ //
+ // By default this method will only transfer if the future is not already completed. If
+ // the future is already completed then any callback would be run synchronously and so
+ // no transfer is typically necessary. However, in cases where you want to force a
+ // transfer (e.g. to help the scheduler break up units of work across multiple cores)
+ // then you can override this behavior with `always_transfer`.
+ template <typename T>
+ Future<T> Transfer(Future<T> future) {
+ return DoTransfer(std::move(future), false);
}
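
The Transfer comment above describes handing continuations from an I/O thread to the CPU pool. A minimal sketch of that pattern, assuming a caller that already holds an I/O-produced future; the callback's Result parameter follows the Future<T>::SyncType convention used later in this header:

    #include <memory>
    #include "arrow/buffer.h"
    #include "arrow/util/future.h"
    #include "arrow/util/thread_pool.h"

    // io_future is assumed to be completed by an I/O thread elsewhere.
    void ScheduleOnCpu(arrow::Future<std::shared_ptr<arrow::Buffer>> io_future) {
      auto on_cpu = arrow::internal::GetCpuThreadPool()->Transfer(std::move(io_future));
      on_cpu.AddCallback([](const arrow::Result<std::shared_ptr<arrow::Buffer>>& res) {
        // CPU-heavy continuation: runs on the CPU pool, not the I/O thread.
      });
    }
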
- template <typename Function>
- Status Spawn(TaskHints hints, Function&& func, StopToken stop_token) {
- return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
- StopCallback{});
- }
- template <typename Function>
- Status Spawn(TaskHints hints, Function&& func, StopToken stop_token,
- StopCallback stop_callback) {
- return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
- std::move(stop_callback));
- }
-
- // Transfers a future to this executor. Any continuations added to the
- // returned future will run in this executor. Otherwise they would run
- // on the same thread that called MarkFinished.
- //
- // This is necessary when (for example) an I/O task is completing a future.
- // The continuations of that future should run on the CPU thread pool keeping
- // CPU heavy work off the I/O thread pool. So the I/O task should transfer
- // the future to the CPU executor before returning.
- //
- // By default this method will only transfer if the future is not already completed. If
- // the future is already completed then any callback would be run synchronously and so
- // no transfer is typically necessary. However, in cases where you want to force a
- // transfer (e.g. to help the scheduler break up units of work across multiple cores)
- // then you can override this behavior with `always_transfer`.
- template <typename T>
- Future<T> Transfer(Future<T> future) {
- return DoTransfer(std::move(future), false);
- }
-
- // Overload of Transfer which will always schedule callbacks on new threads even if the
- // future is finished when the callback is added.
- //
- // This can be useful in cases where you want to ensure parallelism
- template <typename T>
- Future<T> TransferAlways(Future<T> future) {
- return DoTransfer(std::move(future), true);
- }
-
+
+ // Overload of Transfer which will always schedule callbacks on new threads even if the
+ // future is finished when the callback is added.
+ //
+ // This can be useful in cases where you want to ensure parallelism
+ template <typename T>
+ Future<T> TransferAlways(Future<T> future) {
+ return DoTransfer(std::move(future), true);
+ }
+
// Submit a callable and arguments for execution. Return a future that
// will yield the callable's result value once it completes.
// The callable's arguments are copied before execution.
- template <typename Function, typename... Args,
- typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
- Function && (Args && ...)>>
- Result<FutureType> Submit(TaskHints hints, StopToken stop_token, Function&& func,
- Args&&... args) {
- using ValueType = typename FutureType::ValueType;
-
- auto future = FutureType::Make();
- auto task = std::bind(::arrow::detail::ContinueFuture{}, future,
- std::forward<Function>(func), std::forward<Args>(args)...);
- struct {
- WeakFuture<ValueType> weak_fut;
-
- void operator()(const Status& st) {
- auto fut = weak_fut.get();
- if (fut.is_valid()) {
- fut.MarkFinished(st);
- }
- }
- } stop_callback{WeakFuture<ValueType>(future)};
- ARROW_RETURN_NOT_OK(SpawnReal(hints, std::move(task), std::move(stop_token),
- std::move(stop_callback)));
+ template <typename Function, typename... Args,
+ typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+ Function && (Args && ...)>>
+ Result<FutureType> Submit(TaskHints hints, StopToken stop_token, Function&& func,
+ Args&&... args) {
+ using ValueType = typename FutureType::ValueType;
+
+ auto future = FutureType::Make();
+ auto task = std::bind(::arrow::detail::ContinueFuture{}, future,
+ std::forward<Function>(func), std::forward<Args>(args)...);
+ struct {
+ WeakFuture<ValueType> weak_fut;
+
+ void operator()(const Status& st) {
+ auto fut = weak_fut.get();
+ if (fut.is_valid()) {
+ fut.MarkFinished(st);
+ }
+ }
+ } stop_callback{WeakFuture<ValueType>(future)};
+ ARROW_RETURN_NOT_OK(SpawnReal(hints, std::move(task), std::move(stop_token),
+ std::move(stop_callback)));
return future;
}
- template <typename Function, typename... Args,
- typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
- Function && (Args && ...)>>
- Result<FutureType> Submit(StopToken stop_token, Function&& func, Args&&... args) {
- return Submit(TaskHints{}, stop_token, std::forward<Function>(func),
- std::forward<Args>(args)...);
+ template <typename Function, typename... Args,
+ typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+ Function && (Args && ...)>>
+ Result<FutureType> Submit(StopToken stop_token, Function&& func, Args&&... args) {
+ return Submit(TaskHints{}, stop_token, std::forward<Function>(func),
+ std::forward<Args>(args)...);
+ }
+
+ template <typename Function, typename... Args,
+ typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+ Function && (Args && ...)>>
+ Result<FutureType> Submit(TaskHints hints, Function&& func, Args&&... args) {
+ return Submit(std::move(hints), StopToken::Unstoppable(),
+ std::forward<Function>(func), std::forward<Args>(args)...);
+ }
+
+ template <typename Function, typename... Args,
+ typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+ Function && (Args && ...)>>
+ Result<FutureType> Submit(Function&& func, Args&&... args) {
+ return Submit(TaskHints{}, StopToken::Unstoppable(), std::forward<Function>(func),
+ std::forward<Args>(args)...);
}
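
The Submit overloads above wrap a callable into a task that completes a future. A minimal usage sketch against the global CPU pool declared later in this diff; the blocking fut.result() call is an assumption based on Arrow's Future API and is for demonstration only:

    #include "arrow/result.h"
    #include "arrow/util/thread_pool.h"

    arrow::Result<int> SubmitDemo() {
      auto* pool = arrow::internal::GetCpuThreadPool();
      ARROW_ASSIGN_OR_RAISE(arrow::Future<int> fut, pool->Submit([] { return 40 + 2; }));
      return fut.result();  // blocks until the task ran; fine in a test, never in a pool task
    }
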
- template <typename Function, typename... Args,
- typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
- Function && (Args && ...)>>
- Result<FutureType> Submit(TaskHints hints, Function&& func, Args&&... args) {
- return Submit(std::move(hints), StopToken::Unstoppable(),
- std::forward<Function>(func), std::forward<Args>(args)...);
- }
-
- template <typename Function, typename... Args,
- typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
- Function && (Args && ...)>>
- Result<FutureType> Submit(Function&& func, Args&&... args) {
- return Submit(TaskHints{}, StopToken::Unstoppable(), std::forward<Function>(func),
- std::forward<Args>(args)...);
- }
-
// Return the level of parallelism (the number of tasks that may be executed
// concurrently). This may be an approximate number.
virtual int GetCapacity() = 0;
- // Return true if the thread from which this function is called is owned by this
- // Executor. Returns false if this Executor does not support this property.
- virtual bool OwnsThisThread() { return false; }
-
+ // Return true if the thread from which this function is called is owned by this
+ // Executor. Returns false if this Executor does not support this property.
+ virtual bool OwnsThisThread() { return false; }
+
protected:
ARROW_DISALLOW_COPY_AND_ASSIGN(Executor);
Executor() = default;
- template <typename T, typename FT = Future<T>, typename FTSync = typename FT::SyncType>
- Future<T> DoTransfer(Future<T> future, bool always_transfer = false) {
- auto transferred = Future<T>::Make();
- if (always_transfer) {
- CallbackOptions callback_options = CallbackOptions::Defaults();
- callback_options.should_schedule = ShouldSchedule::Always;
- callback_options.executor = this;
- auto sync_callback = [transferred](const FTSync& result) mutable {
- transferred.MarkFinished(result);
- };
- future.AddCallback(sync_callback, callback_options);
- return transferred;
- }
-
- // We could use AddCallback's ShouldSchedule::IfUnfinished but we can save a bit of
- // work by doing the test here.
- auto callback = [this, transferred](const FTSync& result) mutable {
- auto spawn_status =
- Spawn([transferred, result]() mutable { transferred.MarkFinished(result); });
- if (!spawn_status.ok()) {
- transferred.MarkFinished(spawn_status);
- }
- };
- auto callback_factory = [&callback]() { return callback; };
- if (future.TryAddCallback(callback_factory)) {
- return transferred;
- }
- // If the future is already finished and we aren't going to force spawn a thread
- // then we don't need to add another layer of callback and can return the original
- // future
- return future;
- }
-
+ template <typename T, typename FT = Future<T>, typename FTSync = typename FT::SyncType>
+ Future<T> DoTransfer(Future<T> future, bool always_transfer = false) {
+ auto transferred = Future<T>::Make();
+ if (always_transfer) {
+ CallbackOptions callback_options = CallbackOptions::Defaults();
+ callback_options.should_schedule = ShouldSchedule::Always;
+ callback_options.executor = this;
+ auto sync_callback = [transferred](const FTSync& result) mutable {
+ transferred.MarkFinished(result);
+ };
+ future.AddCallback(sync_callback, callback_options);
+ return transferred;
+ }
+
+ // We could use AddCallback's ShouldSchedule::IfUnfinished but we can save a bit of
+ // work by doing the test here.
+ auto callback = [this, transferred](const FTSync& result) mutable {
+ auto spawn_status =
+ Spawn([transferred, result]() mutable { transferred.MarkFinished(result); });
+ if (!spawn_status.ok()) {
+ transferred.MarkFinished(spawn_status);
+ }
+ };
+ auto callback_factory = [&callback]() { return callback; };
+ if (future.TryAddCallback(callback_factory)) {
+ return transferred;
+ }
+ // If the future is already finished and we aren't going to force spawn a thread
+ // then we don't need to add another layer of callback and can return the original
+ // future
+ return future;
+ }
+
// Subclassing API
- virtual Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
- StopCallback&&) = 0;
+ virtual Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
+ StopCallback&&) = 0;
};
-/// \brief An executor implementation that runs all tasks on a single thread using an
-/// event loop.
-///
-/// Note: Any sort of nested parallelism will deadlock this executor. Blocking waits are
-/// fine but if one task needs to wait for another task it must be expressed as an
-/// asynchronous continuation.
-class ARROW_EXPORT SerialExecutor : public Executor {
- public:
- template <typename T = ::arrow::internal::Empty>
- using TopLevelTask = internal::FnOnce<Future<T>(Executor*)>;
-
- ~SerialExecutor() override;
-
- int GetCapacity() override { return 1; };
- Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
- StopCallback&&) override;
-
- /// \brief Runs the TopLevelTask and any scheduled tasks
- ///
- /// The TopLevelTask (or one of the tasks it schedules) must either return an invalid
- /// status or call the finish signal. Failure to do this will result in a deadlock. For
- /// this reason it is preferable (if possible) to use the helper methods (below)
-  /// RunSynchronously/RunSerially which delegate the responsibility to a Future
-  /// producer's existing obligation to always mark a future finished (which can
- /// someday be aided by ARROW-12207).
- template <typename T = internal::Empty, typename FT = Future<T>,
- typename FTSync = typename FT::SyncType>
- static FTSync RunInSerialExecutor(TopLevelTask<T> initial_task) {
- Future<T> fut = SerialExecutor().Run<T>(std::move(initial_task));
- return FutureToSync(fut);
- }
-
- private:
- SerialExecutor();
-
- // State uses mutex
- struct State;
- std::shared_ptr<State> state_;
-
- template <typename T, typename FTSync = typename Future<T>::SyncType>
- Future<T> Run(TopLevelTask<T> initial_task) {
- auto final_fut = std::move(initial_task)(this);
- if (final_fut.is_finished()) {
- return final_fut;
- }
- final_fut.AddCallback([this](const FTSync&) { MarkFinished(); });
- RunLoop();
- return final_fut;
- }
- void RunLoop();
- void MarkFinished();
-};
-
-/// An Executor implementation spawning tasks in a FIFO manner on a fixed-size
-/// pool of worker threads.
-///
-/// Note: Any sort of nested parallelism will deadlock this executor. Blocking waits are
-/// fine but if one task needs to wait for another task it must be expressed as an
-/// asynchronous continuation.
+/// \brief An executor implementation that runs all tasks on a single thread using an
+/// event loop.
+///
+/// Note: Any sort of nested parallelism will deadlock this executor. Blocking waits are
+/// fine but if one task needs to wait for another task it must be expressed as an
+/// asynchronous continuation.
+class ARROW_EXPORT SerialExecutor : public Executor {
+ public:
+ template <typename T = ::arrow::internal::Empty>
+ using TopLevelTask = internal::FnOnce<Future<T>(Executor*)>;
+
+ ~SerialExecutor() override;
+
+ int GetCapacity() override { return 1; };
+ Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
+ StopCallback&&) override;
+
+ /// \brief Runs the TopLevelTask and any scheduled tasks
+ ///
+ /// The TopLevelTask (or one of the tasks it schedules) must either return an invalid
+ /// status or call the finish signal. Failure to do this will result in a deadlock. For
+ /// this reason it is preferable (if possible) to use the helper methods (below)
+  /// RunSynchronously/RunSerially which delegate the responsibility to a Future
+  /// producer's existing obligation to always mark a future finished (which can
+ /// someday be aided by ARROW-12207).
+ template <typename T = internal::Empty, typename FT = Future<T>,
+ typename FTSync = typename FT::SyncType>
+ static FTSync RunInSerialExecutor(TopLevelTask<T> initial_task) {
+ Future<T> fut = SerialExecutor().Run<T>(std::move(initial_task));
+ return FutureToSync(fut);
+ }
+
+ private:
+ SerialExecutor();
+
+ // State uses mutex
+ struct State;
+ std::shared_ptr<State> state_;
+
+ template <typename T, typename FTSync = typename Future<T>::SyncType>
+ Future<T> Run(TopLevelTask<T> initial_task) {
+ auto final_fut = std::move(initial_task)(this);
+ if (final_fut.is_finished()) {
+ return final_fut;
+ }
+ final_fut.AddCallback([this](const FTSync&) { MarkFinished(); });
+ RunLoop();
+ return final_fut;
+ }
+ void RunLoop();
+ void MarkFinished();
+};
+
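
The class comment above warns that some task must finish the returned future or RunLoop never exits. A minimal sketch of the contract, assuming a producer that immediately finishes its future:

    #include "arrow/result.h"
    #include "arrow/util/future.h"
    #include "arrow/util/thread_pool.h"

    arrow::Result<int> RunOnSingleThread() {
      return arrow::internal::SerialExecutor::RunInSerialExecutor<int>(
          [](arrow::internal::Executor* executor) -> arrow::Future<int> {
            // Any follow-up tasks must be scheduled on `executor`, and some
            // task must finish the returned future, or the run loop never exits.
            return arrow::Future<int>::MakeFinished(7);
          });
    }
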
+/// An Executor implementation spawning tasks in a FIFO manner on a fixed-size
+/// pool of worker threads.
+///
+/// Note: Any sort of nested parallelism will deadlock this executor. Blocking waits are
+/// fine but if one task needs to wait for another task it must be expressed as an
+/// asynchronous continuation.
class ARROW_EXPORT ThreadPool : public Executor {
public:
// Construct a thread pool with the given number of worker threads
@@ -309,25 +309,25 @@ class ARROW_EXPORT ThreadPool : public Executor {
static Result<std::shared_ptr<ThreadPool>> MakeEternal(int threads);
// Destroy thread pool; the pool will first be shut down
- ~ThreadPool() override;
+ ~ThreadPool() override;
// Return the desired number of worker threads.
// The actual number of workers may lag a bit before being adjusted to
// match this value.
int GetCapacity() override;
- bool OwnsThisThread() override;
-
- // Return the number of tasks either running or in the queue.
- int GetNumTasks();
-
+ bool OwnsThisThread() override;
+
+ // Return the number of tasks either running or in the queue.
+ int GetNumTasks();
+
// Dynamically change the number of worker threads.
- //
- // This function always returns immediately.
- // If fewer threads are running than this number, new threads are spawned
- // on-demand when needed for task execution.
- // If more threads are running than this number, excess threads are reaped
- // as soon as possible.
+ //
+ // This function always returns immediately.
+ // If fewer threads are running than this number, new threads are spawned
+ // on-demand when needed for task execution.
+ // If more threads are running than this number, excess threads are reaped
+ // as soon as possible.
Status SetCapacity(int threads);
// Heuristic for the default capacity of a thread pool for CPU-bound tasks.
@@ -350,8 +350,8 @@ class ARROW_EXPORT ThreadPool : public Executor {
ThreadPool();
- Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
- StopCallback&&) override;
+ Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
+ StopCallback&&) override;
// Collect finished worker threads, making sure the OS threads have exited
void CollectFinishedWorkersUnlocked();
@@ -375,24 +375,24 @@ class ARROW_EXPORT ThreadPool : public Executor {
// Return the process-global thread pool for CPU-bound tasks.
ARROW_EXPORT ThreadPool* GetCpuThreadPool();
-/// \brief Potentially run an async operation serially (if use_threads is false)
-/// \see RunSerially
-///
-/// If `use_threads` is true, the global CPU executor is used.
-/// If `use_threads` is false, a temporary SerialExecutor is used.
-/// `get_future` is called (from this thread) with the chosen executor and must
-/// return a future that will eventually finish. This function returns once the
-/// future has finished.
-template <typename Fut, typename ValueType = typename Fut::ValueType>
-typename Fut::SyncType RunSynchronously(FnOnce<Fut(Executor*)> get_future,
- bool use_threads) {
- if (use_threads) {
- auto fut = std::move(get_future)(GetCpuThreadPool());
- return FutureToSync(fut);
- } else {
- return SerialExecutor::RunInSerialExecutor<ValueType>(std::move(get_future));
- }
-}
-
+/// \brief Potentially run an async operation serially (if use_threads is false)
+/// \see RunSerially
+///
+/// If `use_threads` is true, the global CPU executor is used.
+/// If `use_threads` is false, a temporary SerialExecutor is used.
+/// `get_future` is called (from this thread) with the chosen executor and must
+/// return a future that will eventually finish. This function returns once the
+/// future has finished.
+template <typename Fut, typename ValueType = typename Fut::ValueType>
+typename Fut::SyncType RunSynchronously(FnOnce<Fut(Executor*)> get_future,
+ bool use_threads) {
+ if (use_threads) {
+ auto fut = std::move(get_future)(GetCpuThreadPool());
+ return FutureToSync(fut);
+ } else {
+ return SerialExecutor::RunInSerialExecutor<ValueType>(std::move(get_future));
+ }
+}
+
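
A usage sketch for RunSynchronously above; the use_threads flag selects between the global CPU pool and a temporary single-threaded executor, and the trivially finished future here stands in for real pipelined work:

    #include "arrow/result.h"
    #include "arrow/util/future.h"
    #include "arrow/util/thread_pool.h"

    arrow::Result<int> ComputeMaybeThreaded(bool use_threads) {
      return arrow::internal::RunSynchronously<arrow::Future<int>>(
          [](arrow::internal::Executor* executor) -> arrow::Future<int> {
            // Real code would fan work out onto `executor` and return the
            // future of the final step.
            return arrow::Future<int>::MakeFinished(7);
          },
          use_threads);
    }
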
} // namespace internal
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h
index ed73fdc6b04..b250cca647d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/trie.h
@@ -116,7 +116,7 @@ std::ostream& operator<<(std::ostream& os, const SmallString<N>& str) {
class ARROW_EXPORT Trie {
using index_type = int16_t;
using fast_index_type = int_fast16_t;
- static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
+ static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
public:
Trie() : size_(0) {}
@@ -126,9 +126,9 @@ class ARROW_EXPORT Trie {
int32_t Find(util::string_view s) const {
const Node* node = &nodes_[0];
fast_index_type pos = 0;
- if (s.length() > static_cast<size_t>(kMaxIndex)) {
- return -1;
- }
+ if (s.length() > static_cast<size_t>(kMaxIndex)) {
+ return -1;
+ }
fast_index_type remaining = static_cast<fast_index_type>(s.length());
while (remaining > 0) {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h
index b3e69aa632f..ca107c2c69d 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/type_fwd.h
@@ -19,16 +19,16 @@
namespace arrow {
-namespace internal {
-struct Empty;
-} // namespace internal
-
-template <typename T = internal::Empty>
-class WeakFuture;
-class FutureWaiter;
-
-class TimestampParser;
-
+namespace internal {
+struct Empty;
+} // namespace internal
+
+template <typename T = internal::Empty>
+class WeakFuture;
+class FutureWaiter;
+
+class TimestampParser;
+
namespace internal {
class Executor;
@@ -36,27 +36,27 @@ class TaskGroup;
class ThreadPool;
} // namespace internal
-
-struct Compression {
- /// \brief Compression algorithm
- enum type {
- UNCOMPRESSED,
- SNAPPY,
- GZIP,
- BROTLI,
- ZSTD,
- LZ4,
- LZ4_FRAME,
- LZO,
- BZ2,
- LZ4_HADOOP
- };
-};
-
-namespace util {
-class Compressor;
-class Decompressor;
-class Codec;
-} // namespace util
-
+
+struct Compression {
+ /// \brief Compression algorithm
+ enum type {
+ UNCOMPRESSED,
+ SNAPPY,
+ GZIP,
+ BROTLI,
+ ZSTD,
+ LZ4,
+ LZ4_FRAME,
+ LZO,
+ BZ2,
+ LZ4_HADOOP
+ };
+};
+
+namespace util {
+class Compressor;
+class Decompressor;
+class Codec;
+} // namespace util
+
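
These forward declarations pair the Compression enum with the codec classes. A hedged sketch of how they are typically combined, assuming the arrow::util::Codec::Create factory from Arrow's full compression header (not part of this forward-declaration file):

    #include <memory>
    #include "arrow/result.h"
    #include "arrow/util/compression.h"

    arrow::Result<std::unique_ptr<arrow::util::Codec>> MakeGzipCodec() {
      return arrow::util::Codec::Create(arrow::Compression::GZIP);
    }
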
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h
index a8cfec5cc04..80cc6297e39 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/type_traits.h
@@ -17,7 +17,7 @@
#pragma once
-#include <cstdint>
+#include <cstdint>
#include <type_traits>
namespace arrow {
@@ -42,45 +42,45 @@ template <typename T>
struct is_null_pointer : std::is_same<std::nullptr_t, typename std::remove_cv<T>::type> {
};
-#ifdef __GLIBCXX__
-
-// An aligned_union backport, because old libstdc++ versions don't include it.
-
-constexpr std::size_t max_size(std::size_t a, std::size_t b) { return (a > b) ? a : b; }
-
-template <typename...>
-struct max_size_traits;
-
-template <typename H, typename... T>
-struct max_size_traits<H, T...> {
- static constexpr std::size_t max_sizeof() {
- return max_size(sizeof(H), max_size_traits<T...>::max_sizeof());
- }
- static constexpr std::size_t max_alignof() {
- return max_size(alignof(H), max_size_traits<T...>::max_alignof());
- }
-};
-
-template <>
-struct max_size_traits<> {
- static constexpr std::size_t max_sizeof() { return 0; }
- static constexpr std::size_t max_alignof() { return 0; }
-};
-
-template <std::size_t Len, typename... T>
-struct aligned_union {
- static constexpr std::size_t alignment_value = max_size_traits<T...>::max_alignof();
- static constexpr std::size_t size_value =
- max_size(Len, max_size_traits<T...>::max_sizeof());
- using type = typename std::aligned_storage<size_value, alignment_value>::type;
-};
-
-#else
-
-template <std::size_t Len, typename... T>
-using aligned_union = std::aligned_union<Len, T...>;
-
-#endif
-
+#ifdef __GLIBCXX__
+
+// An aligned_union backport, because old libstdc++ versions don't include it.
+
+constexpr std::size_t max_size(std::size_t a, std::size_t b) { return (a > b) ? a : b; }
+
+template <typename...>
+struct max_size_traits;
+
+template <typename H, typename... T>
+struct max_size_traits<H, T...> {
+ static constexpr std::size_t max_sizeof() {
+ return max_size(sizeof(H), max_size_traits<T...>::max_sizeof());
+ }
+ static constexpr std::size_t max_alignof() {
+ return max_size(alignof(H), max_size_traits<T...>::max_alignof());
+ }
+};
+
+template <>
+struct max_size_traits<> {
+ static constexpr std::size_t max_sizeof() { return 0; }
+ static constexpr std::size_t max_alignof() { return 0; }
+};
+
+template <std::size_t Len, typename... T>
+struct aligned_union {
+ static constexpr std::size_t alignment_value = max_size_traits<T...>::max_alignof();
+ static constexpr std::size_t size_value =
+ max_size(Len, max_size_traits<T...>::max_sizeof());
+ using type = typename std::aligned_storage<size_value, alignment_value>::type;
+};
+
+#else
+
+template <std::size_t Len, typename... T>
+using aligned_union = std::aligned_union<Len, T...>;
+
+#endif
+
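
A small illustration of the backport above: on either preprocessor branch, aligned_union<Len, T...>::type is large and aligned enough for every listed member (the member types here are illustrative):

    #include <cstdint>
    #include "arrow/util/type_traits.h"

    using Storage = arrow::internal::aligned_union<0, int32_t, double>::type;
    static_assert(sizeof(Storage) >= sizeof(double), "fits the largest member");
    static_assert(alignof(Storage) >= alignof(double), "meets the strictest alignment");
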
} // namespace internal
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc
index f644f73fd8e..c19a7bc2eee 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.cc
@@ -23,7 +23,7 @@
#include "arrow/util/string_view.h"
#include "arrow/util/value_parsing.h"
-#include "contrib/restricted/uriparser/include/uriparser/Uri.h"
+#include "contrib/restricted/uriparser/include/uriparser/Uri.h"
namespace arrow {
namespace internal {
@@ -71,28 +71,28 @@ std::string UriEscape(const std::string& s) {
return escaped;
}
-std::string UriUnescape(const util::string_view s) {
- std::string result(s);
- if (!result.empty()) {
- auto end = uriUnescapeInPlaceA(&result[0]);
- result.resize(end - &result[0]);
- }
- return result;
-}
-
-std::string UriEncodeHost(const std::string& host) {
- // Fairly naive check: if it contains a ':', it's IPv6 and needs
- // brackets, else it's OK
- if (host.find(":") != std::string::npos) {
- std::string result = "[";
- result += host;
- result += ']';
- return result;
- } else {
- return host;
- }
-}
-
+std::string UriUnescape(const util::string_view s) {
+ std::string result(s);
+ if (!result.empty()) {
+ auto end = uriUnescapeInPlaceA(&result[0]);
+ result.resize(end - &result[0]);
+ }
+ return result;
+}
+
+std::string UriEncodeHost(const std::string& host) {
+ // Fairly naive check: if it contains a ':', it's IPv6 and needs
+ // brackets, else it's OK
+ if (host.find(":") != std::string::npos) {
+ std::string result = "[";
+ result += host;
+ result += ']';
+ return result;
+ } else {
+ return host;
+ }
+}
+
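
A usage sketch for the two helpers above; the expected results in the comments assume uriparser's in-place percent-decoding semantics:

    #include <string>
    #include "arrow/util/uri.h"

    std::string host = arrow::internal::UriEncodeHost("::1");   // "[::1]"
    std::string text = arrow::internal::UriUnescape("a%20b");   // "a b"
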
struct Uri::Impl {
Impl() : string_rep_(""), port_(-1) { memset(&uri_, 0, sizeof(uri_)); }
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h
index 35a9400f92b..b4ffbb04dec 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/uri.h
@@ -24,7 +24,7 @@
#include <vector>
#include "arrow/type_fwd.h"
-#include "arrow/util/string_view.h"
+#include "arrow/util/string_view.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -92,13 +92,13 @@ class ARROW_EXPORT Uri {
ARROW_EXPORT
std::string UriEscape(const std::string& s);
-ARROW_EXPORT
-std::string UriUnescape(const arrow::util::string_view s);
-
-/// Encode a host for use within a URI, such as "localhost",
-/// "127.0.0.1", or "[::1]".
-ARROW_EXPORT
-std::string UriEncodeHost(const std::string& host);
-
+ARROW_EXPORT
+std::string UriUnescape(const arrow::util::string_view s);
+
+/// Encode a host for use within a URI, such as "localhost",
+/// "127.0.0.1", or "[::1]".
+ARROW_EXPORT
+std::string UriEncodeHost(const std::string& host);
+
} // namespace internal
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc
index af850dfc523..11394d2e64c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.cc
@@ -64,8 +64,8 @@ const uint8_t utf8_small_table[] = { // NOLINT
uint16_t utf8_large_table[9 * 256] = {0xffff};
-const uint8_t utf8_byte_size_table[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
-
+const uint8_t utf8_byte_size_table[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
+
static void InitializeLargeTable() {
for (uint32_t state = 0; state < 9; ++state) {
for (uint32_t byte = 0; byte < 256; ++byte) {
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h
index 54ee9a2820b..0ec3538b95c 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/utf8.h
@@ -23,15 +23,15 @@
#include <memory>
#include <string>
-#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
-#error #include <xsimd/xsimd.hpp>
-#endif
-
+#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
+#error #include <xsimd/xsimd.hpp>
+#endif
+
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/simd.h"
#include "arrow/util/string_view.h"
-#include "arrow/util/ubsan.h"
+#include "arrow/util/ubsan.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -65,8 +65,8 @@ static constexpr uint8_t kUTF8DecodeReject = 12;
// In this table states are multiples of 256.
ARROW_EXPORT extern uint16_t utf8_large_table[9 * 256];
-ARROW_EXPORT extern const uint8_t utf8_byte_size_table[16];
-
+ARROW_EXPORT extern const uint8_t utf8_byte_size_table[16];
+
// Success / reject states when looked up in the large table
static constexpr uint16_t kUTF8ValidateAccept = 0;
static constexpr uint16_t kUTF8ValidateReject = 256;
@@ -94,9 +94,9 @@ ARROW_EXPORT void InitializeUTF8();
inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
static constexpr uint64_t high_bits_64 = 0x8080808080808080ULL;
- static constexpr uint32_t high_bits_32 = 0x80808080UL;
- static constexpr uint16_t high_bits_16 = 0x8080U;
- static constexpr uint8_t high_bits_8 = 0x80U;
+ static constexpr uint32_t high_bits_32 = 0x80808080UL;
+ static constexpr uint16_t high_bits_16 = 0x8080U;
+ static constexpr uint8_t high_bits_8 = 0x80U;
#ifndef NDEBUG
internal::CheckUTF8Initialized();
@@ -106,8 +106,8 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
// XXX This is doing an unaligned access. Contemporary architectures
// (x86-64, AArch64, PPC64) support it natively and often have good
// performance nevertheless.
- uint64_t mask64 = SafeLoadAs<uint64_t>(data);
- if (ARROW_PREDICT_TRUE((mask64 & high_bits_64) == 0)) {
+ uint64_t mask64 = SafeLoadAs<uint64_t>(data);
+ if (ARROW_PREDICT_TRUE((mask64 & high_bits_64) == 0)) {
// 8 bytes of pure ASCII, move forward
size -= 8;
data += 8;
@@ -162,50 +162,50 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
return false;
}
- // Check if string tail is full ASCII (common case, fast)
- if (size >= 4) {
- uint32_t tail_mask = SafeLoadAs<uint32_t>(data + size - 4);
- uint32_t head_mask = SafeLoadAs<uint32_t>(data);
- if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_32) == 0)) {
- return true;
- }
- } else if (size >= 2) {
- uint16_t tail_mask = SafeLoadAs<uint16_t>(data + size - 2);
- uint16_t head_mask = SafeLoadAs<uint16_t>(data);
- if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_16) == 0)) {
- return true;
- }
- } else if (size == 1) {
- if (ARROW_PREDICT_TRUE((*data & high_bits_8) == 0)) {
- return true;
- }
- } else {
- /* size == 0 */
- return true;
- }
-
- // Fall back to UTF8 validation of tail string.
+ // Check if string tail is full ASCII (common case, fast)
+ if (size >= 4) {
+ uint32_t tail_mask = SafeLoadAs<uint32_t>(data + size - 4);
+ uint32_t head_mask = SafeLoadAs<uint32_t>(data);
+ if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_32) == 0)) {
+ return true;
+ }
+ } else if (size >= 2) {
+ uint16_t tail_mask = SafeLoadAs<uint16_t>(data + size - 2);
+ uint16_t head_mask = SafeLoadAs<uint16_t>(data);
+ if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_16) == 0)) {
+ return true;
+ }
+ } else if (size == 1) {
+ if (ARROW_PREDICT_TRUE((*data & high_bits_8) == 0)) {
+ return true;
+ }
+ } else {
+ /* size == 0 */
+ return true;
+ }
+
+ // Fall back to UTF8 validation of tail string.
// Note the state table is designed so that, once in the reject state,
// we remain in that state until the end. So we needn't check for
// rejection at each char (we don't gain much by short-circuiting here).
uint16_t state = internal::kUTF8ValidateAccept;
- switch (size) {
- case 7:
- state = internal::ValidateOneUTF8Byte(data[size - 7], state);
- case 6:
- state = internal::ValidateOneUTF8Byte(data[size - 6], state);
- case 5:
- state = internal::ValidateOneUTF8Byte(data[size - 5], state);
- case 4:
- state = internal::ValidateOneUTF8Byte(data[size - 4], state);
- case 3:
- state = internal::ValidateOneUTF8Byte(data[size - 3], state);
- case 2:
- state = internal::ValidateOneUTF8Byte(data[size - 2], state);
- case 1:
- state = internal::ValidateOneUTF8Byte(data[size - 1], state);
- default:
- break;
+ switch (size) {
+ case 7:
+ state = internal::ValidateOneUTF8Byte(data[size - 7], state);
+ case 6:
+ state = internal::ValidateOneUTF8Byte(data[size - 6], state);
+ case 5:
+ state = internal::ValidateOneUTF8Byte(data[size - 5], state);
+ case 4:
+ state = internal::ValidateOneUTF8Byte(data[size - 4], state);
+ case 3:
+ state = internal::ValidateOneUTF8Byte(data[size - 3], state);
+ case 2:
+ state = internal::ValidateOneUTF8Byte(data[size - 2], state);
+ case 1:
+ state = internal::ValidateOneUTF8Byte(data[size - 1], state);
+ default:
+ break;
}
return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept);
}
@@ -246,26 +246,26 @@ inline bool ValidateAsciiSw(const uint8_t* data, int64_t len) {
}
}
-#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
+#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
inline bool ValidateAsciiSimd(const uint8_t* data, int64_t len) {
- using simd_batch = xsimd::batch<int8_t, 16>;
+ using simd_batch = xsimd::batch<int8_t, 16>;
if (len >= 32) {
- const simd_batch zero(static_cast<int8_t>(0));
+ const simd_batch zero(static_cast<int8_t>(0));
const uint8_t* data2 = data + 16;
- simd_batch or1 = zero, or2 = zero;
+ simd_batch or1 = zero, or2 = zero;
while (len >= 32) {
- or1 |= simd_batch(reinterpret_cast<const int8_t*>(data), xsimd::unaligned_mode{});
- or2 |= simd_batch(reinterpret_cast<const int8_t*>(data2), xsimd::unaligned_mode{});
+ or1 |= simd_batch(reinterpret_cast<const int8_t*>(data), xsimd::unaligned_mode{});
+ or2 |= simd_batch(reinterpret_cast<const int8_t*>(data2), xsimd::unaligned_mode{});
data += 32;
data2 += 32;
len -= 32;
}
- // To test for upper bit in all bytes, test whether any of them is negative
- or1 |= or2;
- if (xsimd::any(or1 < zero)) {
+ // To test for upper bit in all bytes, test whether any of them is negative
+ or1 |= or2;
+ if (xsimd::any(or1 < zero)) {
return false;
}
}
@@ -295,34 +295,34 @@ Result<const uint8_t*> SkipUTF8BOM(const uint8_t* data, int64_t size);
static constexpr uint32_t kMaxUnicodeCodepoint = 0x110000;
-// The size of a valid UTF-8 character can be determined from the leading 4 bits of BYTE1
-// utf8_byte_size_table[0..7] --> pure ascii chars --> 1B length
-// utf8_byte_size_table[8..11] --> internal bytes --> 1B length
-// utf8_byte_size_table[12,13] --> 2B long UTF8 chars
-// utf8_byte_size_table[14] --> 3B long UTF8 chars
-// utf8_byte_size_table[15] --> 4B long UTF8 chars
-// NOTE: Results for invalid/malformed UTF-8 sequences are undefined.
-// ex: \xFF... returns 4B
-static inline uint8_t ValidUtf8CodepointByteSize(const uint8_t* codeunit) {
- return internal::utf8_byte_size_table[*codeunit >> 4];
-}
-
+// The size of a valid UTF-8 character can be determined from the leading 4 bits of BYTE1
+// utf8_byte_size_table[0..7] --> pure ascii chars --> 1B length
+// utf8_byte_size_table[8..11] --> internal bytes --> 1B length
+// utf8_byte_size_table[12,13] --> 2B long UTF8 chars
+// utf8_byte_size_table[14] --> 3B long UTF8 chars
+// utf8_byte_size_table[15] --> 4B long UTF8 chars
+// NOTE: Results for invalid/malformed UTF-8 sequences are undefined.
+// ex: \xFF... returns 4B
+static inline uint8_t ValidUtf8CodepointByteSize(const uint8_t* codeunit) {
+ return internal::utf8_byte_size_table[*codeunit >> 4];
+}
+
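
A worked example of the table lookup above: the lead byte's high nibble indexes the table, so the two-byte sequence for U+00E9 (0xC3 0xA9) resolves through index 0xC:

    #include <cstdint>
    #include "arrow/util/utf8.h"

    const uint8_t lead = 0xC3;  // first byte of UTF-8 "é" (0xC3 0xA9)
    // 0xC3 >> 4 == 0xC, and utf8_byte_size_table[12] == 2
    const uint8_t n = arrow::util::ValidUtf8CodepointByteSize(&lead);  // n == 2
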
static inline bool Utf8IsContinuation(const uint8_t codeunit) {
return (codeunit & 0xC0) == 0x80; // upper two bits should be 10
}
-static inline bool Utf8Is2ByteStart(const uint8_t codeunit) {
- return (codeunit & 0xE0) == 0xC0; // upper three bits should be 110
-}
-
-static inline bool Utf8Is3ByteStart(const uint8_t codeunit) {
- return (codeunit & 0xF0) == 0xE0; // upper four bits should be 1110
-}
-
-static inline bool Utf8Is4ByteStart(const uint8_t codeunit) {
- return (codeunit & 0xF8) == 0xF0; // upper five bits should be 11110
-}
-
+static inline bool Utf8Is2ByteStart(const uint8_t codeunit) {
+ return (codeunit & 0xE0) == 0xC0; // upper three bits should be 110
+}
+
+static inline bool Utf8Is3ByteStart(const uint8_t codeunit) {
+ return (codeunit & 0xF0) == 0xE0; // upper four bits should be 1110
+}
+
+static inline bool Utf8Is4ByteStart(const uint8_t codeunit) {
+ return (codeunit & 0xF8) == 0xF0; // upper five bits should be 11110
+}
+
static inline uint8_t* UTF8Encode(uint8_t* str, uint32_t codepoint) {
if (codepoint < 0x80) {
*str++ = codepoint;
@@ -346,7 +346,7 @@ static inline uint8_t* UTF8Encode(uint8_t* str, uint32_t codepoint) {
static inline bool UTF8Decode(const uint8_t** data, uint32_t* codepoint) {
const uint8_t* str = *data;
- if (*str < 0x80) { // ascii
+ if (*str < 0x80) { // ascii
*codepoint = *str++;
} else if (ARROW_PREDICT_FALSE(*str < 0xC0)) { // invalid non-ascii char
return false;
@@ -391,45 +391,45 @@ static inline bool UTF8Decode(const uint8_t** data, uint32_t* codepoint) {
return true;
}
-static inline bool UTF8DecodeReverse(const uint8_t** data, uint32_t* codepoint) {
- const uint8_t* str = *data;
- if (*str < 0x80) { // ascii
- *codepoint = *str--;
- } else {
- if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
- return false;
- }
- uint8_t code_unit_N = (*str--) & 0x3F; // take last 6 bits
- if (Utf8Is2ByteStart(*str)) {
- uint8_t code_unit_1 = (*str--) & 0x1F; // take last 5 bits
- *codepoint = (code_unit_1 << 6) + code_unit_N;
- } else {
- if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
- return false;
- }
- uint8_t code_unit_Nmin1 = (*str--) & 0x3F; // take last 6 bits
- if (Utf8Is3ByteStart(*str)) {
- uint8_t code_unit_1 = (*str--) & 0x0F; // take last 4 bits
- *codepoint = (code_unit_1 << 12) + (code_unit_Nmin1 << 6) + code_unit_N;
- } else {
- if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
- return false;
- }
- uint8_t code_unit_Nmin2 = (*str--) & 0x3F; // take last 6 bits
- if (ARROW_PREDICT_TRUE(Utf8Is4ByteStart(*str))) {
- uint8_t code_unit_1 = (*str--) & 0x07; // take last 3 bits
- *codepoint = (code_unit_1 << 18) + (code_unit_Nmin2 << 12) +
- (code_unit_Nmin1 << 6) + code_unit_N;
- } else {
- return false;
- }
- }
- }
- }
- *data = str;
- return true;
-}
-
+static inline bool UTF8DecodeReverse(const uint8_t** data, uint32_t* codepoint) {
+ const uint8_t* str = *data;
+ if (*str < 0x80) { // ascii
+ *codepoint = *str--;
+ } else {
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_N = (*str--) & 0x3F; // take last 6 bits
+ if (Utf8Is2ByteStart(*str)) {
+ uint8_t code_unit_1 = (*str--) & 0x1F; // take last 5 bits
+ *codepoint = (code_unit_1 << 6) + code_unit_N;
+ } else {
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_Nmin1 = (*str--) & 0x3F; // take last 6 bits
+ if (Utf8Is3ByteStart(*str)) {
+ uint8_t code_unit_1 = (*str--) & 0x0F; // take last 4 bits
+ *codepoint = (code_unit_1 << 12) + (code_unit_Nmin1 << 6) + code_unit_N;
+ } else {
+ if (ARROW_PREDICT_FALSE(!Utf8IsContinuation(*str))) {
+ return false;
+ }
+ uint8_t code_unit_Nmin2 = (*str--) & 0x3F; // take last 6 bits
+ if (ARROW_PREDICT_TRUE(Utf8Is4ByteStart(*str))) {
+ uint8_t code_unit_1 = (*str--) & 0x07; // take last 3 bits
+ *codepoint = (code_unit_1 << 18) + (code_unit_Nmin2 << 12) +
+ (code_unit_Nmin1 << 6) + code_unit_N;
+ } else {
+ return false;
+ }
+ }
+ }
+ }
+ *data = str;
+ return true;
+}
+
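
A hedged sketch of UTF8DecodeReverse above: the cursor starts at the final byte and, on success, ends one byte before the codepoint's lead byte, matching the reverse-iterator convention that UTF8FindIfReverse relies on below:

    #include <cstdint>
    #include "arrow/util/utf8.h"

    const uint8_t buf[] = {0x61, 0xC3, 0xA9};  // "aé"
    const uint8_t* it = buf + 2;               // start at the last byte
    uint32_t cp = 0;
    const bool ok = arrow::util::UTF8DecodeReverse(&it, &cp);
    // ok == true, cp == 0xE9, and it == buf: one byte before the 0xC3 lead byte
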
template <class UnaryOperation>
static inline bool UTF8Transform(const uint8_t* first, const uint8_t* last,
uint8_t** destination, UnaryOperation&& unary_op) {
@@ -446,97 +446,97 @@ static inline bool UTF8Transform(const uint8_t* first, const uint8_t* last,
return true;
}
-template <class Predicate>
-static inline bool UTF8FindIf(const uint8_t* first, const uint8_t* last,
- Predicate&& predicate, const uint8_t** position) {
- const uint8_t* i = first;
- while (i < last) {
- uint32_t codepoint = 0;
- const uint8_t* current = i;
- if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
- return false;
- }
- if (predicate(codepoint)) {
- *position = current;
- return true;
- }
- }
- *position = last;
- return true;
-}
-
-// Same semantics as std::find_if using reverse iterators with the return value
-// having the same semantics as std::reverse_iterator<..>.base()
-// A reverse iterator physically points to the next address, e.g.:
-// &*reverse_iterator(i) == &*(i + 1)
-template <class Predicate>
-static inline bool UTF8FindIfReverse(const uint8_t* first, const uint8_t* last,
- Predicate&& predicate, const uint8_t** position) {
-  // converts to a normal pointer
- const uint8_t* i = last - 1;
- while (i >= first) {
- uint32_t codepoint = 0;
- const uint8_t* current = i;
- if (ARROW_PREDICT_FALSE(!UTF8DecodeReverse(&i, &codepoint))) {
- return false;
- }
- if (predicate(codepoint)) {
- // converts normal pointer to 'reverse iterator semantics'.
- *position = current + 1;
- return true;
- }
- }
-  // similar to how an end pointer points to one past the last element, reverse
-  // iterators point to the 'first' pointer to indicate out of range.
- *position = first;
- return true;
-}
-
-static inline bool UTF8AdvanceCodepoints(const uint8_t* first, const uint8_t* last,
- const uint8_t** destination, int64_t n) {
- return UTF8FindIf(
- first, last,
- [&](uint32_t codepoint) {
- bool done = n == 0;
- n--;
- return done;
- },
- destination);
-}
-
-static inline bool UTF8AdvanceCodepointsReverse(const uint8_t* first, const uint8_t* last,
- const uint8_t** destination, int64_t n) {
- return UTF8FindIfReverse(
- first, last,
- [&](uint32_t codepoint) {
- bool done = n == 0;
- n--;
- return done;
- },
- destination);
-}
-
-template <class UnaryFunction>
-static inline bool UTF8ForEach(const uint8_t* first, const uint8_t* last,
- UnaryFunction&& f) {
- const uint8_t* i = first;
- while (i < last) {
- uint32_t codepoint = 0;
- if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
- return false;
- }
- f(codepoint);
- }
- return true;
-}
-
-template <class UnaryFunction>
-static inline bool UTF8ForEach(const std::string& s, UnaryFunction&& f) {
- return UTF8ForEach(reinterpret_cast<const uint8_t*>(s.data()),
- reinterpret_cast<const uint8_t*>(s.data() + s.length()),
- std::forward<UnaryFunction>(f));
-}
-
+template <class Predicate>
+static inline bool UTF8FindIf(const uint8_t* first, const uint8_t* last,
+ Predicate&& predicate, const uint8_t** position) {
+ const uint8_t* i = first;
+ while (i < last) {
+ uint32_t codepoint = 0;
+ const uint8_t* current = i;
+ if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+ if (predicate(codepoint)) {
+ *position = current;
+ return true;
+ }
+ }
+ *position = last;
+ return true;
+}
+
+// Same semantics as std::find_if using reverse iterators with the return value
+// having the same semantics as std::reverse_iterator<..>.base()
+// A reverse iterator physically points to the next address, e.g.:
+// &*reverse_iterator(i) == &*(i + 1)
+template <class Predicate>
+static inline bool UTF8FindIfReverse(const uint8_t* first, const uint8_t* last,
+ Predicate&& predicate, const uint8_t** position) {
+  // converts to a normal pointer
+ const uint8_t* i = last - 1;
+ while (i >= first) {
+ uint32_t codepoint = 0;
+ const uint8_t* current = i;
+ if (ARROW_PREDICT_FALSE(!UTF8DecodeReverse(&i, &codepoint))) {
+ return false;
+ }
+ if (predicate(codepoint)) {
+ // converts normal pointer to 'reverse iterator semantics'.
+ *position = current + 1;
+ return true;
+ }
+ }
+  // similar to how an end pointer points to one past the last element, reverse
+  // iterators point to the 'first' pointer to indicate out of range.
+ *position = first;
+ return true;
+}
+
+static inline bool UTF8AdvanceCodepoints(const uint8_t* first, const uint8_t* last,
+ const uint8_t** destination, int64_t n) {
+ return UTF8FindIf(
+ first, last,
+ [&](uint32_t codepoint) {
+ bool done = n == 0;
+ n--;
+ return done;
+ },
+ destination);
+}
+
+static inline bool UTF8AdvanceCodepointsReverse(const uint8_t* first, const uint8_t* last,
+ const uint8_t** destination, int64_t n) {
+ return UTF8FindIfReverse(
+ first, last,
+ [&](uint32_t codepoint) {
+ bool done = n == 0;
+ n--;
+ return done;
+ },
+ destination);
+}
+
+template <class UnaryFunction>
+static inline bool UTF8ForEach(const uint8_t* first, const uint8_t* last,
+ UnaryFunction&& f) {
+ const uint8_t* i = first;
+ while (i < last) {
+ uint32_t codepoint = 0;
+ if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
+ return false;
+ }
+ f(codepoint);
+ }
+ return true;
+}
+
+template <class UnaryFunction>
+static inline bool UTF8ForEach(const std::string& s, UnaryFunction&& f) {
+ return UTF8ForEach(reinterpret_cast<const uint8_t*>(s.data()),
+ reinterpret_cast<const uint8_t*>(s.data() + s.length()),
+ std::forward<UnaryFunction>(f));
+}
+
template <class UnaryPredicate>
static inline bool UTF8AllOf(const uint8_t* first, const uint8_t* last, bool* result,
UnaryPredicate&& predicate) {
@@ -556,15 +556,15 @@ static inline bool UTF8AllOf(const uint8_t* first, const uint8_t* last, bool* re
return true;
}
-/// Count the number of codepoints in the given string (assuming it is valid UTF8).
-static inline int64_t UTF8Length(const uint8_t* first, const uint8_t* last) {
- int64_t length = 0;
- while (first != last) {
- length += ((*first & 0xc0) != 0x80);
- ++first;
- }
- return length;
-}
-
+/// Count the number of codepoints in the given string (assuming it is valid UTF8).
+static inline int64_t UTF8Length(const uint8_t* first, const uint8_t* last) {
+ int64_t length = 0;
+ while (first != last) {
+ length += ((*first & 0xc0) != 0x80);
+ ++first;
+ }
+ return length;
+}
+
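
A quick check of UTF8Length above, which counts non-continuation bytes (upper bits != 10) and therefore codepoints in valid UTF-8:

    #include <cstdint>
    #include "arrow/util/utf8.h"

    const char* s = "h\xC3\xA9llo";  // "héllo": 6 bytes, 5 codepoints
    const uint8_t* p = reinterpret_cast<const uint8_t*>(s);
    const int64_t n = arrow::util::UTF8Length(p, p + 6);  // n == 5
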
} // namespace util
} // namespace arrow
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc b/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc
index 5460dfb91f9..3b147366636 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.cc
@@ -20,19 +20,19 @@
#include <string>
#include <utility>
-#include "contrib/restricted/fast_float/include/fast_float/fast_float.h"
+#include "contrib/restricted/fast_float/include/fast_float/fast_float.h"
namespace arrow {
namespace internal {
bool StringToFloat(const char* s, size_t length, float* out) {
- const auto res = fast_float::from_chars(s, s + length, *out);
- return res.ec == std::errc() && res.ptr == s + length;
+ const auto res = fast_float::from_chars(s, s + length, *out);
+ return res.ec == std::errc() && res.ptr == s + length;
}
bool StringToFloat(const char* s, size_t length, double* out) {
- const auto res = fast_float::from_chars(s, s + length, *out);
- return res.ec == std::errc() && res.ptr == s + length;
+ const auto res = fast_float::from_chars(s, s + length, *out);
+ return res.ec == std::errc() && res.ptr == s + length;
}
// ----------------------------------------------------------------------
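
Both StringToFloat overloads above require fast_float to consume the entire input, so trailing bytes fail the parse. A small sketch:

    float out = 0.0f;
    bool ok1 = arrow::internal::StringToFloat("1.25", 4, &out);   // true, out == 1.25f
    bool ok2 = arrow::internal::StringToFloat("1.25x", 5, &out);  // false: trailing byte unparsed
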
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h
index e8de13287c1..00295d1b51f 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/value_parsing.h
@@ -486,80 +486,80 @@ static inline bool ParseHH_MM_SS(const char* s, Duration* out) {
static inline bool ParseSubSeconds(const char* s, size_t length, TimeUnit::type unit,
uint32_t* out) {
- // The decimal point has been peeled off at this point
-
- // Fail if number of decimal places provided exceeds what the unit can hold.
- // Calculate how many trailing decimal places are omitted for the unit
- // e.g. if 4 decimal places are provided and unit is MICRO, 2 are missing
- size_t omitted = 0;
- switch (unit) {
- case TimeUnit::MILLI:
- if (ARROW_PREDICT_FALSE(length > 3)) {
- return false;
- }
- if (length < 3) {
- omitted = 3 - length;
- }
+ // The decimal point has been peeled off at this point
+
+ // Fail if number of decimal places provided exceeds what the unit can hold.
+ // Calculate how many trailing decimal places are omitted for the unit
+ // e.g. if 4 decimal places are provided and unit is MICRO, 2 are missing
+ size_t omitted = 0;
+ switch (unit) {
+ case TimeUnit::MILLI:
+ if (ARROW_PREDICT_FALSE(length > 3)) {
+ return false;
+ }
+ if (length < 3) {
+ omitted = 3 - length;
+ }
break;
- case TimeUnit::MICRO:
- if (ARROW_PREDICT_FALSE(length > 6)) {
- return false;
- }
- if (length < 6) {
- omitted = 6 - length;
- }
+ case TimeUnit::MICRO:
+ if (ARROW_PREDICT_FALSE(length > 6)) {
+ return false;
+ }
+ if (length < 6) {
+ omitted = 6 - length;
+ }
break;
- case TimeUnit::NANO:
- if (ARROW_PREDICT_FALSE(length > 9)) {
- return false;
- }
- if (length < 9) {
- omitted = 9 - length;
- }
+ case TimeUnit::NANO:
+ if (ARROW_PREDICT_FALSE(length > 9)) {
+ return false;
+ }
+ if (length < 9) {
+ omitted = 9 - length;
+ }
break;
default:
return false;
}
- if (ARROW_PREDICT_TRUE(omitted == 0)) {
- return ParseUnsigned(s, length, out);
- } else {
- uint32_t subseconds;
- bool success = ParseUnsigned(s, length, &subseconds);
- if (ARROW_PREDICT_TRUE(success)) {
- switch (omitted) {
- case 1:
- *out = subseconds * 10;
- break;
- case 2:
- *out = subseconds * 100;
- break;
- case 3:
- *out = subseconds * 1000;
- break;
- case 4:
- *out = subseconds * 10000;
- break;
- case 5:
- *out = subseconds * 100000;
- break;
- case 6:
- *out = subseconds * 1000000;
- break;
- case 7:
- *out = subseconds * 10000000;
- break;
- case 8:
- *out = subseconds * 100000000;
- break;
- default:
- // Impossible case
- break;
- }
- return true;
- } else {
- return false;
- }
+ if (ARROW_PREDICT_TRUE(omitted == 0)) {
+ return ParseUnsigned(s, length, out);
+ } else {
+ uint32_t subseconds;
+ bool success = ParseUnsigned(s, length, &subseconds);
+ if (ARROW_PREDICT_TRUE(success)) {
+ switch (omitted) {
+ case 1:
+ *out = subseconds * 10;
+ break;
+ case 2:
+ *out = subseconds * 100;
+ break;
+ case 3:
+ *out = subseconds * 1000;
+ break;
+ case 4:
+ *out = subseconds * 10000;
+ break;
+ case 5:
+ *out = subseconds * 100000;
+ break;
+ case 6:
+ *out = subseconds * 1000000;
+ break;
+ case 7:
+ *out = subseconds * 10000000;
+ break;
+ case 8:
+ *out = subseconds * 100000000;
+ break;
+ default:
+ // Impossible case
+ break;
+ }
+ return true;
+ } else {
+ return false;
+ }
}
}
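
The scaling in ParseSubSeconds pads omitted trailing decimal digits up to the unit's precision. A minimal standalone sketch of that rule (not the Arrow API; the function name here is hypothetical):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // unit_digits is 3 for MILLI, 6 for MICRO, 9 for NANO.
    static uint32_t ScaleSubseconds(uint32_t parsed, size_t length, size_t unit_digits) {
      // e.g. ".4" with MICRO: length == 1, so 5 digits are omitted and the
      // parsed value is multiplied by 10^5: 4 -> 400000 microseconds.
      for (size_t omitted = unit_digits - length; omitted > 0; --omitted) {
        parsed *= 10;
      }
      return parsed;
    }

    int main() {
      std::printf("%u\n", static_cast<unsigned>(ScaleSubseconds(4, 1, 6)));    // 400000
      std::printf("%u\n", static_cast<unsigned>(ScaleSubseconds(123, 3, 9)));  // 123000000
      return 0;
    }
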
@@ -572,21 +572,21 @@ static inline bool ParseTimestampISO8601(const char* s, size_t length,
// We allow the following formats for all units:
// - "YYYY-MM-DD"
- // - "YYYY-MM-DD[ T]hhZ?"
- // - "YYYY-MM-DD[ T]hh:mmZ?"
- // - "YYYY-MM-DD[ T]hh:mm:ssZ?"
+ // - "YYYY-MM-DD[ T]hhZ?"
+ // - "YYYY-MM-DD[ T]hh:mmZ?"
+ // - "YYYY-MM-DD[ T]hh:mm:ssZ?"
//
- // We allow the following formats for unit == MILLI, MICRO, or NANO:
- // - "YYYY-MM-DD[ T]hh:mm:ss.s{1,3}Z?"
+ // We allow the following formats for unit == MILLI, MICRO, or NANO:
+ // - "YYYY-MM-DD[ T]hh:mm:ss.s{1,3}Z?"
//
- // We allow the following formats for unit == MICRO, or NANO:
- // - "YYYY-MM-DD[ T]hh:mm:ss.s{4,6}Z?"
+ // We allow the following formats for unit == MICRO, or NANO:
+ // - "YYYY-MM-DD[ T]hh:mm:ss.s{4,6}Z?"
//
- // We allow the following formats for unit == NANO:
- // - "YYYY-MM-DD[ T]hh:mm:ss.s{7,9}Z?"
+ // We allow the following formats for unit == NANO:
+ // - "YYYY-MM-DD[ T]hh:mm:ss.s{7,9}Z?"
//
// UTC is always assumed, and the DataType's timezone is ignored.
- //
+ //
if (ARROW_PREDICT_FALSE(length < 10)) return false;
@@ -621,15 +621,15 @@ static inline bool ParseTimestampISO8601(const char* s, size_t length,
}
break;
case 19: // YYYY-MM-DD[ T]hh:mm:ss
- case 21: // YYYY-MM-DD[ T]hh:mm:ss.s
- case 22: // YYYY-MM-DD[ T]hh:mm:ss.ss
- case 23: // YYYY-MM-DD[ T]hh:mm:ss.sss
- case 24: // YYYY-MM-DD[ T]hh:mm:ss.ssss
- case 25: // YYYY-MM-DD[ T]hh:mm:ss.sssss
- case 26: // YYYY-MM-DD[ T]hh:mm:ss.ssssss
- case 27: // YYYY-MM-DD[ T]hh:mm:ss.sssssss
- case 28: // YYYY-MM-DD[ T]hh:mm:ss.ssssssss
- case 29: // YYYY-MM-DD[ T]hh:mm:ss.sssssssss
+ case 21: // YYYY-MM-DD[ T]hh:mm:ss.s
+ case 22: // YYYY-MM-DD[ T]hh:mm:ss.ss
+ case 23: // YYYY-MM-DD[ T]hh:mm:ss.sss
+ case 24: // YYYY-MM-DD[ T]hh:mm:ss.ssss
+ case 25: // YYYY-MM-DD[ T]hh:mm:ss.sssss
+ case 26: // YYYY-MM-DD[ T]hh:mm:ss.ssssss
+ case 27: // YYYY-MM-DD[ T]hh:mm:ss.sssssss
+ case 28: // YYYY-MM-DD[ T]hh:mm:ss.ssssssss
+ case 29: // YYYY-MM-DD[ T]hh:mm:ss.sssssssss
if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s + 11, &seconds_since_midnight))) {
return false;
}
@@ -645,13 +645,13 @@ static inline bool ParseTimestampISO8601(const char* s, size_t length,
return true;
}
- if (ARROW_PREDICT_FALSE(s[19] != '.')) {
- return false;
- }
-
+ if (ARROW_PREDICT_FALSE(s[19] != '.')) {
+ return false;
+ }
+
uint32_t subseconds = 0;
if (ARROW_PREDICT_FALSE(
- !detail::ParseSubSeconds(s + 20, length - 20, unit, &subseconds))) {
+ !detail::ParseSubSeconds(s + 20, length - 20, unit, &subseconds))) {
return false;
}
@@ -753,7 +753,7 @@ struct StringConverter<TIME_TYPE, enable_if_time<TIME_TYPE>> {
uint32_t subseconds_count = 0;
if (ARROW_PREDICT_FALSE(
- !detail::ParseSubSeconds(s + 9, length - 9, unit, &subseconds_count))) {
+ !detail::ParseSubSeconds(s + 9, length - 9, unit, &subseconds_count))) {
return false;
}
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h
index 8f8d23c2b76..b4b0d8f6f31 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/variant.h
@@ -17,423 +17,423 @@
#pragma once
-#include <cstddef>
-#include <exception>
-#include <type_traits>
-#include <utility>
-
-#include "arrow/util/macros.h"
-#include "arrow/util/type_traits.h"
-
+#include <cstddef>
+#include <exception>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/type_traits.h"
+
namespace arrow {
namespace util {
-/// \brief a std::variant-like discriminated union
-///
-/// Simplifications from std::variant:
-///
-/// - Strictly defaultable. The first type of T... should be nothrow default constructible
-/// and it will be used for default Variants.
-///
-/// - Never valueless_by_exception. std::variant supports a state outside those specified
-/// by T... to which it can return in the event that a constructor throws. If a Variant
-/// would become valueless_by_exception it will instead return to its default state.
-///
-/// - Strictly nothrow move constructible and assignable
-///
-/// - Less sophisticated type deduction. std::variant<bool, std::string>("hello") will
-/// intelligently construct std::string while Variant<bool, std::string>("hello") will
-/// construct bool.
-///
-/// - Either both copy constructible and assignable or neither (std::variant independently
-/// enables copy construction and copy assignment). Variant is copy constructible if
-/// each of T... is copy constructible and assignable.
-///
-/// - Slimmer interface; several members of std::variant are omitted.
-///
-/// - Throws no exceptions; if a bad_variant_access would be thrown Variant will instead
-/// segfault (nullptr dereference).
-///
-/// - Mutable visit takes a pointer instead of mutable reference or rvalue reference,
-/// which is more conformant with our code style.
-template <typename... T>
-class Variant;
-
-namespace detail {
-
-template <typename T, typename = void>
-struct is_equality_comparable : std::false_type {};
-
-template <typename T>
-struct is_equality_comparable<
- T, typename std::enable_if<std::is_convertible<
- decltype(std::declval<T>() == std::declval<T>()), bool>::value>::type>
- : std::true_type {};
-
-template <bool C, typename T, typename E>
-using conditional_t = typename std::conditional<C, T, E>::type;
-
-template <typename T>
-struct type_constant {
- using type = T;
-};
-
-template <typename...>
-struct first;
-
-template <typename H, typename... T>
-struct first<H, T...> {
- using type = H;
-};
-
-template <typename T>
-using decay_t = typename std::decay<T>::type;
-
-template <bool...>
-struct all : std::true_type {};
-
-template <bool H, bool... T>
-struct all<H, T...> : conditional_t<H, all<T...>, std::false_type> {};
-
-struct delete_copy_constructor {
- template <typename>
- struct type {
- type() = default;
- type(const type& other) = delete;
- type& operator=(const type& other) = delete;
- };
-};
-
-struct explicit_copy_constructor {
- template <typename Copyable>
- struct type {
- type() = default;
- type(const type& other) { static_cast<const Copyable&>(other).copy_to(this); }
- type& operator=(const type& other) {
- static_cast<Copyable*>(this)->destroy();
- static_cast<const Copyable&>(other).copy_to(this);
- return *this;
- }
- };
-};
-
-template <typename... T>
-struct VariantStorage {
- VariantStorage() = default;
- VariantStorage(const VariantStorage&) {}
- VariantStorage& operator=(const VariantStorage&) { return *this; }
- VariantStorage(VariantStorage&&) noexcept {}
- VariantStorage& operator=(VariantStorage&&) noexcept { return *this; }
- ~VariantStorage() {
- static_assert(offsetof(VariantStorage, data_) == 0,
- "(void*)&VariantStorage::data_ == (void*)this");
- }
-
- typename arrow::internal::aligned_union<0, T...>::type data_;
- uint8_t index_ = 0;
-};
-
-template <typename V, typename...>
-struct VariantImpl;
-
-template <typename... T>
-struct VariantImpl<Variant<T...>> : VariantStorage<T...> {
- static void index_of() noexcept {}
- void destroy() noexcept {}
- void move_to(...) noexcept {}
- void copy_to(...) const {}
-
- template <typename R, typename Visitor>
- [[noreturn]] R visit_const(Visitor&& /* visitor */) const {
- std::terminate();
- }
- template <typename R, typename Visitor>
- [[noreturn]] R visit_mutable(Visitor&& /* visitor */) {
- std::terminate();
- }
-};
-
-template <typename... M, typename H, typename... T>
-struct VariantImpl<Variant<M...>, H, T...> : VariantImpl<Variant<M...>, T...> {
- using VariantType = Variant<M...>;
- using Impl = VariantImpl<VariantType, T...>;
-
- static constexpr uint8_t kIndex = sizeof...(M) - sizeof...(T) - 1;
-
- VariantImpl() = default;
-
- using VariantImpl<VariantType, T...>::VariantImpl;
- using Impl::operator=;
- using Impl::index_of;
-
- explicit VariantImpl(H value) {
- new (this) H(std::move(value));
- this->index_ = kIndex;
- }
-
- VariantImpl& operator=(H value) {
- static_cast<VariantType*>(this)->destroy();
- new (this) H(std::move(value));
- this->index_ = kIndex;
- return *this;
- }
-
- H& cast_this() { return *reinterpret_cast<H*>(this); }
- const H& cast_this() const { return *reinterpret_cast<const H*>(this); }
-
- void move_to(VariantType* target) noexcept {
- if (this->index_ == kIndex) {
- new (target) H(std::move(cast_this()));
- target->index_ = kIndex;
- } else {
- Impl::move_to(target);
- }
- }
-
- // Templated to avoid instantiation in case H is not copy constructible
- template <typename Void>
- void copy_to(Void* generic_target) const {
- const auto target = static_cast<VariantType*>(generic_target);
- try {
- if (this->index_ == kIndex) {
- new (target) H(cast_this());
- target->index_ = kIndex;
- } else {
- Impl::copy_to(target);
- }
- } catch (...) {
- target->construct_default();
- throw;
- }
- }
-
- void destroy() noexcept {
- if (this->index_ == kIndex) {
- if (!std::is_trivially_destructible<H>::value) {
- cast_this().~H();
- }
- } else {
- Impl::destroy();
- }
- }
-
- static constexpr std::integral_constant<uint8_t, kIndex> index_of(
- const type_constant<H>&) {
- return {};
- }
-
- template <typename R, typename Visitor>
- R visit_const(Visitor&& visitor) const {
- if (this->index_ == kIndex) {
- return std::forward<Visitor>(visitor)(cast_this());
- }
- return Impl::template visit_const<R>(std::forward<Visitor>(visitor));
- }
-
- template <typename R, typename Visitor>
- R visit_mutable(Visitor&& visitor) {
- if (this->index_ == kIndex) {
- return std::forward<Visitor>(visitor)(&cast_this());
- }
- return Impl::template visit_mutable<R>(std::forward<Visitor>(visitor));
- }
-};
-
-} // namespace detail
-
-template <typename... T>
-class Variant : detail::VariantImpl<Variant<T...>, T...>,
- detail::conditional_t<
- detail::all<(std::is_copy_constructible<T>::value &&
- std::is_copy_assignable<T>::value)...>::value,
- detail::explicit_copy_constructor,
- detail::delete_copy_constructor>::template type<Variant<T...>> {
- template <typename U>
- static constexpr uint8_t index_of() {
- return Impl::index_of(detail::type_constant<U>{});
- }
-
- using Impl = detail::VariantImpl<Variant<T...>, T...>;
-
- public:
- using default_type = typename util::detail::first<T...>::type;
-
- Variant() noexcept { construct_default(); }
-
- Variant(const Variant& other) = default;
- Variant& operator=(const Variant& other) = default;
- Variant& operator=(Variant&& other) noexcept {
- this->destroy();
- other.move_to(this);
- return *this;
- }
-
- using Impl::Impl;
- using Impl::operator=;
-
- Variant(Variant&& other) noexcept { other.move_to(this); }
-
- ~Variant() {
- static_assert(offsetof(Variant, data_) == 0, "(void*)&Variant::data_ == (void*)this");
- this->destroy();
- }
-
- /// \brief Return the zero-based type index of the value held by the variant
- uint8_t index() const noexcept { return this->index_; }
-
- /// \brief Get a const pointer to the value held by the variant
- ///
- /// If the type given as template argument doesn't match, a null pointer is returned.
- template <typename U, uint8_t I = index_of<U>()>
- const U* get() const noexcept {
- return index() == I ? reinterpret_cast<const U*>(this) : NULLPTR;
- }
-
- /// \brief Get a pointer to the value held by the variant
- ///
- /// If the type given as template argument doesn't match, a null pointer is returned.
- template <typename U, uint8_t I = index_of<U>()>
- U* get() noexcept {
- return index() == I ? reinterpret_cast<U*>(this) : NULLPTR;
- }
-
- /// \brief Replace the value held by the variant
- ///
- /// The intended type must be given as a template argument.
- /// The value is constructed in-place using the given function arguments.
- template <typename U, typename... A, uint8_t I = index_of<U>()>
- void emplace(A&&... args) try {
- this->destroy();
- new (this) U(std::forward<A>(args)...);
- this->index_ = I;
- } catch (...) {
- construct_default();
- throw;
- }
-
- template <typename U, typename E, typename... A, uint8_t I = index_of<U>()>
- void emplace(std::initializer_list<E> il, A&&... args) try {
- this->destroy();
- new (this) U(il, std::forward<A>(args)...);
- this->index_ = I;
- } catch (...) {
- construct_default();
- throw;
- }
-
- /// \brief Swap with another variant's contents
- void swap(Variant& other) noexcept { // NOLINT google-runtime-references
- Variant tmp = std::move(other);
- other = std::move(*this);
- *this = std::move(tmp);
- }
-
- using Impl::visit_const;
- using Impl::visit_mutable;
-
- private:
- void construct_default() noexcept {
- new (this) default_type();
- this->index_ = 0;
- }
-
- template <typename V>
- friend struct detail::explicit_copy_constructor::type;
-
- template <typename V, typename...>
- friend struct detail::VariantImpl;
-};
-
-/// \brief Call polymorphic visitor on a const variant's value
-///
-/// The visitor will receive a const reference to the value held by the variant.
-/// It must define overloads for each possible variant type.
-/// The overloads should all return the same type (no attempt
-/// is made to find a generalized return type).
-template <typename Visitor, typename... T,
- typename R = decltype(std::declval<Visitor&&>()(
- std::declval<const typename Variant<T...>::default_type&>()))>
-R visit(Visitor&& visitor, const util::Variant<T...>& v) {
- return v.template visit_const<R>(std::forward<Visitor>(visitor));
-}
-
-/// \brief Call polymorphic visitor on a non-const variant's value
-///
-/// The visitor will receive a pointer to the value held by the variant.
-/// It must define overloads for each possible variant type.
-/// The overloads should all return the same type (no attempt
-/// is made to find a generalized return type).
-template <typename Visitor, typename... T,
- typename R = decltype(std::declval<Visitor&&>()(
- std::declval<typename Variant<T...>::default_type*>()))>
-R visit(Visitor&& visitor, util::Variant<T...>* v) {
- return v->template visit_mutable<R>(std::forward<Visitor>(visitor));
-}
-
-/// \brief Get a const reference to the value held by the variant
-///
-/// If the type given as template argument doesn't match, behavior is undefined
-/// (a null pointer will be dereferenced).
-template <typename U, typename... T>
-const U& get(const Variant<T...>& v) {
- return *v.template get<U>();
-}
-
-/// \brief Get a reference to the value held by the variant
-///
-/// If the type given as template argument doesn't match, behavior is undefined
-/// (a null pointer will be dereferenced).
-template <typename U, typename... T>
-U& get(Variant<T...>& v) {
- return *v.template get<U>();
-}
-
-/// \brief Get a const pointer to the value held by the variant
-///
-/// If the type given as template argument doesn't match, a nullptr is returned.
-template <typename U, typename... T>
-const U* get_if(const Variant<T...>* v) {
- return v->template get<U>();
-}
-
-/// \brief Get a pointer to the value held by the variant
-///
-/// If the type given as template argument doesn't match, a nullptr is returned.
-template <typename U, typename... T>
-U* get_if(Variant<T...>* v) {
- return v->template get<U>();
-}
-
-namespace detail {
-
-template <typename... T>
-struct VariantsEqual {
- template <typename U>
- bool operator()(const U& r) const {
- return get<U>(l_) == r;
- }
- const Variant<T...>& l_;
-};
-
-} // namespace detail
-
-template <typename... T, typename = typename std::enable_if<detail::all<
- detail::is_equality_comparable<T>::value...>::value>>
-bool operator==(const Variant<T...>& l, const Variant<T...>& r) {
- if (l.index() != r.index()) return false;
- return visit(detail::VariantsEqual<T...>{l}, r);
-}
-
-template <typename... T>
-auto operator!=(const Variant<T...>& l, const Variant<T...>& r) -> decltype(l == r) {
- return !(l == r);
-}
-
-/// \brief Return whether the variant holds a value of the given type
-template <typename U, typename... T>
-bool holds_alternative(const Variant<T...>& v) {
- return v.template get<U>();
-}
-
+/// \brief a std::variant-like discriminated union
+///
+/// Simplifications from std::variant:
+///
+/// - Strictly defaultable. The first type of T... should be nothrow default constructible
+/// and it will be used for default Variants.
+///
+/// - Never valueless_by_exception. std::variant supports a state outside those specified
+/// by T... to which it can return in the event that a constructor throws. If a Variant
+/// would become valueless_by_exception it will instead return to its default state.
+///
+/// - Strictly nothrow move constructible and assignable
+///
+/// - Less sophisticated type deduction. std::variant<bool, std::string>("hello") will
+/// intelligently construct std::string while Variant<bool, std::string>("hello") will
+/// construct bool.
+///
+/// - Either both copy constructible and assignable or neither (std::variant independently
+/// enables copy construction and copy assignment). Variant is copy constructible if
+/// each of T... is copy constructible and assignable.
+///
+/// - Slimmer interface; several members of std::variant are omitted.
+///
+/// - Throws no exceptions; if a bad_variant_access would be thrown Variant will instead
+/// segfault (nullptr dereference).
+///
+/// - Mutable visit takes a pointer instead of mutable reference or rvalue reference,
+/// which is more conformant with our code style.
+template <typename... T>
+class Variant;
+
+namespace detail {
+
+template <typename T, typename = void>
+struct is_equality_comparable : std::false_type {};
+
+template <typename T>
+struct is_equality_comparable<
+ T, typename std::enable_if<std::is_convertible<
+ decltype(std::declval<T>() == std::declval<T>()), bool>::value>::type>
+ : std::true_type {};
+
+template <bool C, typename T, typename E>
+using conditional_t = typename std::conditional<C, T, E>::type;
+
+template <typename T>
+struct type_constant {
+ using type = T;
+};
+
+template <typename...>
+struct first;
+
+template <typename H, typename... T>
+struct first<H, T...> {
+ using type = H;
+};
+
+template <typename T>
+using decay_t = typename std::decay<T>::type;
+
+template <bool...>
+struct all : std::true_type {};
+
+template <bool H, bool... T>
+struct all<H, T...> : conditional_t<H, all<T...>, std::false_type> {};
+
+struct delete_copy_constructor {
+ template <typename>
+ struct type {
+ type() = default;
+ type(const type& other) = delete;
+ type& operator=(const type& other) = delete;
+ };
+};
+
+struct explicit_copy_constructor {
+ template <typename Copyable>
+ struct type {
+ type() = default;
+ type(const type& other) { static_cast<const Copyable&>(other).copy_to(this); }
+ type& operator=(const type& other) {
+ static_cast<Copyable*>(this)->destroy();
+ static_cast<const Copyable&>(other).copy_to(this);
+ return *this;
+ }
+ };
+};
+
+template <typename... T>
+struct VariantStorage {
+ VariantStorage() = default;
+ VariantStorage(const VariantStorage&) {}
+ VariantStorage& operator=(const VariantStorage&) { return *this; }
+ VariantStorage(VariantStorage&&) noexcept {}
+ VariantStorage& operator=(VariantStorage&&) noexcept { return *this; }
+ ~VariantStorage() {
+ static_assert(offsetof(VariantStorage, data_) == 0,
+ "(void*)&VariantStorage::data_ == (void*)this");
+ }
+
+ typename arrow::internal::aligned_union<0, T...>::type data_;
+ uint8_t index_ = 0;
+};
+
+template <typename V, typename...>
+struct VariantImpl;
+
+template <typename... T>
+struct VariantImpl<Variant<T...>> : VariantStorage<T...> {
+ static void index_of() noexcept {}
+ void destroy() noexcept {}
+ void move_to(...) noexcept {}
+ void copy_to(...) const {}
+
+ template <typename R, typename Visitor>
+ [[noreturn]] R visit_const(Visitor&& /* visitor */) const {
+ std::terminate();
+ }
+ template <typename R, typename Visitor>
+ [[noreturn]] R visit_mutable(Visitor&& /* visitor */) {
+ std::terminate();
+ }
+};
+
+template <typename... M, typename H, typename... T>
+struct VariantImpl<Variant<M...>, H, T...> : VariantImpl<Variant<M...>, T...> {
+ using VariantType = Variant<M...>;
+ using Impl = VariantImpl<VariantType, T...>;
+
+ static constexpr uint8_t kIndex = sizeof...(M) - sizeof...(T) - 1;
+
+ VariantImpl() = default;
+
+ using VariantImpl<VariantType, T...>::VariantImpl;
+ using Impl::operator=;
+ using Impl::index_of;
+
+ explicit VariantImpl(H value) {
+ new (this) H(std::move(value));
+ this->index_ = kIndex;
+ }
+
+ VariantImpl& operator=(H value) {
+ static_cast<VariantType*>(this)->destroy();
+ new (this) H(std::move(value));
+ this->index_ = kIndex;
+ return *this;
+ }
+
+ H& cast_this() { return *reinterpret_cast<H*>(this); }
+ const H& cast_this() const { return *reinterpret_cast<const H*>(this); }
+
+ void move_to(VariantType* target) noexcept {
+ if (this->index_ == kIndex) {
+ new (target) H(std::move(cast_this()));
+ target->index_ = kIndex;
+ } else {
+ Impl::move_to(target);
+ }
+ }
+
+ // Templated to avoid instantiation in case H is not copy constructible
+ template <typename Void>
+ void copy_to(Void* generic_target) const {
+ const auto target = static_cast<VariantType*>(generic_target);
+ try {
+ if (this->index_ == kIndex) {
+ new (target) H(cast_this());
+ target->index_ = kIndex;
+ } else {
+ Impl::copy_to(target);
+ }
+ } catch (...) {
+ target->construct_default();
+ throw;
+ }
+ }
+
+ void destroy() noexcept {
+ if (this->index_ == kIndex) {
+ if (!std::is_trivially_destructible<H>::value) {
+ cast_this().~H();
+ }
+ } else {
+ Impl::destroy();
+ }
+ }
+
+ static constexpr std::integral_constant<uint8_t, kIndex> index_of(
+ const type_constant<H>&) {
+ return {};
+ }
+
+ template <typename R, typename Visitor>
+ R visit_const(Visitor&& visitor) const {
+ if (this->index_ == kIndex) {
+ return std::forward<Visitor>(visitor)(cast_this());
+ }
+ return Impl::template visit_const<R>(std::forward<Visitor>(visitor));
+ }
+
+ template <typename R, typename Visitor>
+ R visit_mutable(Visitor&& visitor) {
+ if (this->index_ == kIndex) {
+ return std::forward<Visitor>(visitor)(&cast_this());
+ }
+ return Impl::template visit_mutable<R>(std::forward<Visitor>(visitor));
+ }
+};
+
+} // namespace detail
+
+template <typename... T>
+class Variant : detail::VariantImpl<Variant<T...>, T...>,
+ detail::conditional_t<
+ detail::all<(std::is_copy_constructible<T>::value &&
+ std::is_copy_assignable<T>::value)...>::value,
+ detail::explicit_copy_constructor,
+ detail::delete_copy_constructor>::template type<Variant<T...>> {
+ template <typename U>
+ static constexpr uint8_t index_of() {
+ return Impl::index_of(detail::type_constant<U>{});
+ }
+
+ using Impl = detail::VariantImpl<Variant<T...>, T...>;
+
+ public:
+ using default_type = typename util::detail::first<T...>::type;
+
+ Variant() noexcept { construct_default(); }
+
+ Variant(const Variant& other) = default;
+ Variant& operator=(const Variant& other) = default;
+ Variant& operator=(Variant&& other) noexcept {
+ this->destroy();
+ other.move_to(this);
+ return *this;
+ }
+
+ using Impl::Impl;
+ using Impl::operator=;
+
+ Variant(Variant&& other) noexcept { other.move_to(this); }
+
+ ~Variant() {
+ static_assert(offsetof(Variant, data_) == 0, "(void*)&Variant::data_ == (void*)this");
+ this->destroy();
+ }
+
+ /// \brief Return the zero-based type index of the value held by the variant
+ uint8_t index() const noexcept { return this->index_; }
+
+ /// \brief Get a const pointer to the value held by the variant
+ ///
+ /// If the type given as template argument doesn't match, a null pointer is returned.
+ template <typename U, uint8_t I = index_of<U>()>
+ const U* get() const noexcept {
+ return index() == I ? reinterpret_cast<const U*>(this) : NULLPTR;
+ }
+
+ /// \brief Get a pointer to the value held by the variant
+ ///
+ /// If the type given as template argument doesn't match, a null pointer is returned.
+ template <typename U, uint8_t I = index_of<U>()>
+ U* get() noexcept {
+ return index() == I ? reinterpret_cast<U*>(this) : NULLPTR;
+ }
+
+ /// \brief Replace the value held by the variant
+ ///
+ /// The intended type must be given as a template argument.
+ /// The value is constructed in-place using the given function arguments.
+ template <typename U, typename... A, uint8_t I = index_of<U>()>
+ void emplace(A&&... args) try {
+ this->destroy();
+ new (this) U(std::forward<A>(args)...);
+ this->index_ = I;
+ } catch (...) {
+ construct_default();
+ throw;
+ }
+
+ template <typename U, typename E, typename... A, uint8_t I = index_of<U>()>
+ void emplace(std::initializer_list<E> il, A&&... args) try {
+ this->destroy();
+ new (this) U(il, std::forward<A>(args)...);
+ this->index_ = I;
+ } catch (...) {
+ construct_default();
+ throw;
+ }
+
+ /// \brief Swap with another variant's contents
+ void swap(Variant& other) noexcept { // NOLINT google-runtime-references
+ Variant tmp = std::move(other);
+ other = std::move(*this);
+ *this = std::move(tmp);
+ }
+
+ using Impl::visit_const;
+ using Impl::visit_mutable;
+
+ private:
+ void construct_default() noexcept {
+ new (this) default_type();
+ this->index_ = 0;
+ }
+
+ template <typename V>
+ friend struct detail::explicit_copy_constructor::type;
+
+ template <typename V, typename...>
+ friend struct detail::VariantImpl;
+};
+
+/// \brief Call polymorphic visitor on a const variant's value
+///
+/// The visitor will receive a const reference to the value held by the variant.
+/// It must define overloads for each possible variant type.
+/// The overloads should all return the same type (no attempt
+/// is made to find a generalized return type).
+template <typename Visitor, typename... T,
+ typename R = decltype(std::declval<Visitor&&>()(
+ std::declval<const typename Variant<T...>::default_type&>()))>
+R visit(Visitor&& visitor, const util::Variant<T...>& v) {
+ return v.template visit_const<R>(std::forward<Visitor>(visitor));
+}
+
+/// \brief Call polymorphic visitor on a non-const variant's value
+///
+/// The visitor will receive a pointer to the value held by the variant.
+/// It must define overloads for each possible variant type.
+/// The overloads should all return the same type (no attempt
+/// is made to find a generalized return type).
+template <typename Visitor, typename... T,
+ typename R = decltype(std::declval<Visitor&&>()(
+ std::declval<typename Variant<T...>::default_type*>()))>
+R visit(Visitor&& visitor, util::Variant<T...>* v) {
+ return v->template visit_mutable<R>(std::forward<Visitor>(visitor));
+}
+
+/// \brief Get a const reference to the value held by the variant
+///
+/// If the type given as template argument doesn't match, behavior is undefined
+/// (a null pointer will be dereferenced).
+template <typename U, typename... T>
+const U& get(const Variant<T...>& v) {
+ return *v.template get<U>();
+}
+
+/// \brief Get a reference to the value held by the variant
+///
+/// If the type given as template argument doesn't match, behavior is undefined
+/// (a null pointer will be dereferenced).
+template <typename U, typename... T>
+U& get(Variant<T...>& v) {
+ return *v.template get<U>();
+}
+
+/// \brief Get a const pointer to the value held by the variant
+///
+/// If the type given as template argument doesn't match, a nullptr is returned.
+template <typename U, typename... T>
+const U* get_if(const Variant<T...>* v) {
+ return v->template get<U>();
+}
+
+/// \brief Get a pointer to the value held by the variant
+///
+/// If the type given as template argument doesn't match, a nullptr is returned.
+template <typename U, typename... T>
+U* get_if(Variant<T...>* v) {
+ return v->template get<U>();
+}
+
+namespace detail {
+
+template <typename... T>
+struct VariantsEqual {
+ template <typename U>
+ bool operator()(const U& r) const {
+ return get<U>(l_) == r;
+ }
+ const Variant<T...>& l_;
+};
+
+} // namespace detail
+
+template <typename... T, typename = typename std::enable_if<detail::all<
+ detail::is_equality_comparable<T>::value...>::value>>
+bool operator==(const Variant<T...>& l, const Variant<T...>& r) {
+ if (l.index() != r.index()) return false;
+ return visit(detail::VariantsEqual<T...>{l}, r);
+}
+
+template <typename... T>
+auto operator!=(const Variant<T...>& l, const Variant<T...>& r) -> decltype(l == r) {
+ return !(l == r);
+}
+
+/// \brief Return whether the variant holds a value of the given type
+template <typename U, typename... T>
+bool holds_alternative(const Variant<T...>& v) {
+ return v.template get<U>();
+}
+
} // namespace util
} // namespace arrow
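
A short usage sketch of the Variant interface defined above (default construction of the first alternative, typed get, const visit), assuming only that the header is reachable as arrow/util/variant.h:

    #include <iostream>
    #include <string>

    #include "arrow/util/variant.h"

    struct Printer {
      // Overloads must cover every alternative and share one return type.
      void operator()(const bool& b) const { std::cout << "bool: " << b << "\n"; }
      void operator()(const std::string& s) const { std::cout << "string: " << s << "\n"; }
    };

    int main() {
      // The default state holds the first alternative (bool), per "strictly defaultable".
      arrow::util::Variant<bool, std::string> v;
      std::cout << static_cast<int>(v.index()) << "\n";  // 0

      // Assign an explicit std::string: the doc comment above warns that a raw
      // string literal would construct bool, unlike std::variant.
      v = std::string("hello");
      if (const std::string* s = v.get<std::string>()) {
        std::cout << *s << "\n";
      }

      arrow::util::visit(Printer{}, v);                              // string: hello
      std::cout << arrow::util::holds_alternative<bool>(v) << "\n";  // 0
      return 0;
    }
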
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h
index 8bb6f44a4d5..041bdb424a7 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/vector.h
@@ -17,20 +17,20 @@
#pragma once
-#include <algorithm>
+#include <algorithm>
#include <utility>
#include <vector>
-#include "arrow/result.h"
-#include "arrow/util/algorithm.h"
-#include "arrow/util/functional.h"
+#include "arrow/result.h"
+#include "arrow/util/algorithm.h"
+#include "arrow/util/functional.h"
#include "arrow/util/logging.h"
namespace arrow {
namespace internal {
template <typename T>
-std::vector<T> DeleteVectorElement(const std::vector<T>& values, size_t index) {
+std::vector<T> DeleteVectorElement(const std::vector<T>& values, size_t index) {
DCHECK(!values.empty());
DCHECK_LT(index, values.size());
std::vector<T> out;
@@ -45,8 +45,8 @@ std::vector<T> DeleteVectorElement(const std::vector<T>& values, size_t index) {
}
template <typename T>
-std::vector<T> AddVectorElement(const std::vector<T>& values, size_t index,
- T new_element) {
+std::vector<T> AddVectorElement(const std::vector<T>& values, size_t index,
+ T new_element) {
DCHECK_LE(index, values.size());
std::vector<T> out;
out.reserve(values.size() + 1);
@@ -61,8 +61,8 @@ std::vector<T> AddVectorElement(const std::vector<T>& values, size_t index,
}
template <typename T>
-std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
- T new_element) {
+std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
+ T new_element) {
DCHECK_LE(index, values.size());
std::vector<T> out;
out.reserve(values.size());
@@ -76,97 +76,97 @@ std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
return out;
}
-template <typename T, typename Predicate>
-std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
- auto new_end =
- std::remove_if(values.begin(), values.end(), std::forward<Predicate>(predicate));
- values.erase(new_end, values.end());
- return values;
-}
-
-template <typename Fn, typename From,
- typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
-std::vector<To> MapVector(Fn&& map, const std::vector<From>& source) {
- std::vector<To> out;
- out.reserve(source.size());
- std::transform(source.begin(), source.end(), std::back_inserter(out),
- std::forward<Fn>(map));
- return out;
-}
-
-template <typename Fn, typename From,
- typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
-std::vector<To> MapVector(Fn&& map, std::vector<From>&& source) {
- std::vector<To> out;
- out.reserve(source.size());
- std::transform(std::make_move_iterator(source.begin()),
- std::make_move_iterator(source.end()), std::back_inserter(out),
- std::forward<Fn>(map));
- return out;
-}
-
-/// \brief Like MapVector, but where the function can fail.
-template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
- typename To = typename internal::call_traits::return_type<Fn>::ValueType>
-Result<std::vector<To>> MaybeMapVector(Fn&& map, const std::vector<From>& source) {
- std::vector<To> out;
- out.reserve(source.size());
- ARROW_RETURN_NOT_OK(MaybeTransform(source.begin(), source.end(),
- std::back_inserter(out), std::forward<Fn>(map)));
- return std::move(out);
-}
-
-template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
- typename To = typename internal::call_traits::return_type<Fn>::ValueType>
-Result<std::vector<To>> MaybeMapVector(Fn&& map, std::vector<From>&& source) {
- std::vector<To> out;
- out.reserve(source.size());
- ARROW_RETURN_NOT_OK(MaybeTransform(std::make_move_iterator(source.begin()),
- std::make_move_iterator(source.end()),
- std::back_inserter(out), std::forward<Fn>(map)));
- return std::move(out);
-}
-
-template <typename T>
-std::vector<T> FlattenVectors(const std::vector<std::vector<T>>& vecs) {
- std::size_t sum = 0;
- for (const auto& vec : vecs) {
- sum += vec.size();
- }
- std::vector<T> out;
- out.reserve(sum);
- for (const auto& vec : vecs) {
- out.insert(out.end(), vec.begin(), vec.end());
- }
- return out;
-}
-
-template <typename T>
-Result<std::vector<T>> UnwrapOrRaise(std::vector<Result<T>>&& results) {
- std::vector<T> out;
- out.reserve(results.size());
- auto end = std::make_move_iterator(results.end());
- for (auto it = std::make_move_iterator(results.begin()); it != end; it++) {
- if (!it->ok()) {
- return it->status();
- }
- out.push_back(it->MoveValueUnsafe());
- }
- return std::move(out);
-}
-
-template <typename T>
-Result<std::vector<T>> UnwrapOrRaise(const std::vector<Result<T>>& results) {
- std::vector<T> out;
- out.reserve(results.size());
- for (const auto& result : results) {
- if (!result.ok()) {
- return result.status();
- }
- out.push_back(result.ValueUnsafe());
- }
- return std::move(out);
-}
-
+template <typename T, typename Predicate>
+std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
+ auto new_end =
+ std::remove_if(values.begin(), values.end(), std::forward<Predicate>(predicate));
+ values.erase(new_end, values.end());
+ return values;
+}
+
+template <typename Fn, typename From,
+ typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
+std::vector<To> MapVector(Fn&& map, const std::vector<From>& source) {
+ std::vector<To> out;
+ out.reserve(source.size());
+ std::transform(source.begin(), source.end(), std::back_inserter(out),
+ std::forward<Fn>(map));
+ return out;
+}
+
+template <typename Fn, typename From,
+ typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
+std::vector<To> MapVector(Fn&& map, std::vector<From>&& source) {
+ std::vector<To> out;
+ out.reserve(source.size());
+ std::transform(std::make_move_iterator(source.begin()),
+ std::make_move_iterator(source.end()), std::back_inserter(out),
+ std::forward<Fn>(map));
+ return out;
+}
+
+/// \brief Like MapVector, but where the function can fail.
+template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
+ typename To = typename internal::call_traits::return_type<Fn>::ValueType>
+Result<std::vector<To>> MaybeMapVector(Fn&& map, const std::vector<From>& source) {
+ std::vector<To> out;
+ out.reserve(source.size());
+ ARROW_RETURN_NOT_OK(MaybeTransform(source.begin(), source.end(),
+ std::back_inserter(out), std::forward<Fn>(map)));
+ return std::move(out);
+}
+
+template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
+ typename To = typename internal::call_traits::return_type<Fn>::ValueType>
+Result<std::vector<To>> MaybeMapVector(Fn&& map, std::vector<From>&& source) {
+ std::vector<To> out;
+ out.reserve(source.size());
+ ARROW_RETURN_NOT_OK(MaybeTransform(std::make_move_iterator(source.begin()),
+ std::make_move_iterator(source.end()),
+ std::back_inserter(out), std::forward<Fn>(map)));
+ return std::move(out);
+}
+
+template <typename T>
+std::vector<T> FlattenVectors(const std::vector<std::vector<T>>& vecs) {
+ std::size_t sum = 0;
+ for (const auto& vec : vecs) {
+ sum += vec.size();
+ }
+ std::vector<T> out;
+ out.reserve(sum);
+ for (const auto& vec : vecs) {
+ out.insert(out.end(), vec.begin(), vec.end());
+ }
+ return out;
+}
+
+template <typename T>
+Result<std::vector<T>> UnwrapOrRaise(std::vector<Result<T>>&& results) {
+ std::vector<T> out;
+ out.reserve(results.size());
+ auto end = std::make_move_iterator(results.end());
+ for (auto it = std::make_move_iterator(results.begin()); it != end; it++) {
+ if (!it->ok()) {
+ return it->status();
+ }
+ out.push_back(it->MoveValueUnsafe());
+ }
+ return std::move(out);
+}
+
+template <typename T>
+Result<std::vector<T>> UnwrapOrRaise(const std::vector<Result<T>>& results) {
+ std::vector<T> out;
+ out.reserve(results.size());
+ for (const auto& result : results) {
+ if (!result.ok()) {
+ return result.status();
+ }
+ out.push_back(result.ValueUnsafe());
+ }
+ return std::move(out);
+}
+
} // namespace internal
} // namespace arrow
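
A hedged sketch of the helpers above (note these sit in arrow::internal, i.e. internal API). One subtlety worth showing: FilterVector forwards its predicate to std::remove_if, so the predicate selects elements to drop, not to keep:

    #include <iostream>
    #include <vector>

    #include "arrow/util/vector.h"

    int main() {
      std::vector<int> xs = {1, 2, 3, 4};

      // Drop odd values (the predicate marks elements for removal).
      std::vector<int> evens =
          arrow::internal::FilterVector(xs, [](int x) { return x % 2 != 0; });

      // MapVector deduces the output element type from the callable.
      std::vector<int> doubled =
          arrow::internal::MapVector([](int x) { return x * 2; }, evens);

      std::vector<int> flat = arrow::internal::FlattenVectors<int>({{1, 2}, {3}, {}});
      for (int x : doubled) std::cout << x << " ";  // 4 8
      for (int x : flat) std::cout << x << " ";     // 1 2 3
      std::cout << "\n";
      return 0;
    }
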
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h b/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h
index 6cb5a5e66be..2949ac4ab76 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/util/windows_fixup.h
@@ -19,13 +19,13 @@
#ifdef _WIN32
-#ifdef max
-#undef max
-#endif
-#ifdef min
-#undef min
-#endif
-
+#ifdef max
+#undef max
+#endif
+#ifdef min
+#undef min
+#endif
+
// The Windows API defines macros from *File resolving to either
// *FileA or *FileW. Need to undo them.
#ifdef CopyFile
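
For context, a minimal illustration of the problem these #undefs address: without them (or NOMINMAX), <windows.h> defines function-like min/max macros that break std::max at the call site:

    #include <algorithm>

    // With the min/max macros active, `std::max(a, b)` fails to compile because
    // the preprocessor rewrites `max(`. Extra parentheses suppress function-like
    // macro expansion, which is the usual portable workaround.
    int Largest(int a, int b) {
      return (std::max)(a, b);
    }
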
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h
index bdd776bfc4c..0b7cfa1cb16 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/ProducerConsumerQueue.h
@@ -1,217 +1,217 @@
-// Vendored from git tag v2021.02.15.00
-
-/*
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// @author Bo Hu ([email protected])
-// @author Jordan DeLong ([email protected])
-
-// This file has been modified as part of Apache Arrow to conform to
-// Apache Arrow's coding conventions
-
-#pragma once
-
-#include <atomic>
-#include <cassert>
-#include <cstdlib>
-#include <memory>
-#include <stdexcept>
-#include <type_traits>
-#include <utility>
-
-namespace arrow_vendored {
-namespace folly {
-
-// Vendored from folly/Portability.h
-namespace {
-#if defined(__arm__)
-#define FOLLY_ARM 1
-#else
-#define FOLLY_ARM 0
-#endif
-
-#if defined(__s390x__)
-#define FOLLY_S390X 1
-#else
-#define FOLLY_S390X 0
-#endif
-
-constexpr bool kIsArchArm = FOLLY_ARM == 1;
-constexpr bool kIsArchS390X = FOLLY_S390X == 1;
-} // namespace
-
-// Vendored from folly/lang/Align.h
-namespace {
-
-constexpr std::size_t hardware_destructive_interference_size =
- (kIsArchArm || kIsArchS390X) ? 64 : 128;
-
-} // namespace
-
-/*
- * ProducerConsumerQueue is a one producer and one consumer queue
- * without locks.
- */
-template <class T>
-struct ProducerConsumerQueue {
- typedef T value_type;
-
- ProducerConsumerQueue(const ProducerConsumerQueue&) = delete;
- ProducerConsumerQueue& operator=(const ProducerConsumerQueue&) = delete;
-
- // size must be >= 2.
- //
- // Also, note that the number of usable slots in the queue at any
- // given time is actually (size-1), so if you start with an empty queue,
- // IsFull() will return true after size-1 insertions.
- explicit ProducerConsumerQueue(uint32_t size)
- : size_(size),
- records_(static_cast<T*>(std::malloc(sizeof(T) * size))),
- readIndex_(0),
- writeIndex_(0) {
- assert(size >= 2);
- if (!records_) {
- throw std::bad_alloc();
- }
- }
-
- ~ProducerConsumerQueue() {
- // We need to destruct anything that may still exist in our queue.
- // (No real synchronization needed at destructor time: only one
- // thread can be doing this.)
- if (!std::is_trivially_destructible<T>::value) {
- size_t readIndex = readIndex_;
- size_t endIndex = writeIndex_;
- while (readIndex != endIndex) {
- records_[readIndex].~T();
- if (++readIndex == size_) {
- readIndex = 0;
- }
- }
- }
-
- std::free(records_);
- }
-
- template <class... Args>
- bool Write(Args&&... recordArgs) {
- auto const currentWrite = writeIndex_.load(std::memory_order_relaxed);
- auto nextRecord = currentWrite + 1;
- if (nextRecord == size_) {
- nextRecord = 0;
- }
- if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
- new (&records_[currentWrite]) T(std::forward<Args>(recordArgs)...);
- writeIndex_.store(nextRecord, std::memory_order_release);
- return true;
- }
-
- // queue is full
- return false;
- }
-
- // move the value at the front of the queue to given variable
- bool Read(T& record) {
- auto const currentRead = readIndex_.load(std::memory_order_relaxed);
- if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
- // queue is empty
- return false;
- }
-
- auto nextRecord = currentRead + 1;
- if (nextRecord == size_) {
- nextRecord = 0;
- }
- record = std::move(records_[currentRead]);
- records_[currentRead].~T();
- readIndex_.store(nextRecord, std::memory_order_release);
- return true;
- }
-
- // pointer to the value at the front of the queue (for use in-place) or
- // nullptr if empty.
- T* FrontPtr() {
- auto const currentRead = readIndex_.load(std::memory_order_relaxed);
- if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
- // queue is empty
- return nullptr;
- }
- return &records_[currentRead];
- }
-
- // queue must not be empty
- void PopFront() {
- auto const currentRead = readIndex_.load(std::memory_order_relaxed);
- assert(currentRead != writeIndex_.load(std::memory_order_acquire));
-
- auto nextRecord = currentRead + 1;
- if (nextRecord == size_) {
- nextRecord = 0;
- }
- records_[currentRead].~T();
- readIndex_.store(nextRecord, std::memory_order_release);
- }
-
- bool IsEmpty() const {
- return readIndex_.load(std::memory_order_acquire) ==
- writeIndex_.load(std::memory_order_acquire);
- }
-
- bool IsFull() const {
- auto nextRecord = writeIndex_.load(std::memory_order_acquire) + 1;
- if (nextRecord == size_) {
- nextRecord = 0;
- }
- if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
- return false;
- }
- // queue is full
- return true;
- }
-
- // * If called by consumer, then true size may be more (because producer may
- // be adding items concurrently).
- // * If called by producer, then true size may be less (because consumer may
- // be removing items concurrently).
- // * It is undefined to call this from any other thread.
- size_t SizeGuess() const {
- int ret = writeIndex_.load(std::memory_order_acquire) -
- readIndex_.load(std::memory_order_acquire);
- if (ret < 0) {
- ret += size_;
- }
- return ret;
- }
-
- // maximum number of items in the queue.
- size_t capacity() const { return size_ - 1; }
-
- private:
- using AtomicIndex = std::atomic<unsigned int>;
-
- char pad0_[hardware_destructive_interference_size];
- const uint32_t size_;
- T* const records_;
-
- AtomicIndex readIndex_;
- char pad1_[hardware_destructive_interference_size - sizeof(AtomicIndex)];
- AtomicIndex writeIndex_;
-
- char pad2_[hardware_destructive_interference_size - sizeof(AtomicIndex)];
-};
-
-} // namespace folly
-} // namespace arrow_vendored
+// Vendored from git tag v2021.02.15.00
+
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// @author Bo Hu ([email protected])
+// @author Jordan DeLong ([email protected])
+
+// This file has been modified as part of Apache Arrow to conform to
+// Apache Arrow's coding conventions
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <memory>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+namespace arrow_vendored {
+namespace folly {
+
+// Vendored from folly/Portability.h
+namespace {
+#if defined(__arm__)
+#define FOLLY_ARM 1
+#else
+#define FOLLY_ARM 0
+#endif
+
+#if defined(__s390x__)
+#define FOLLY_S390X 1
+#else
+#define FOLLY_S390X 0
+#endif
+
+constexpr bool kIsArchArm = FOLLY_ARM == 1;
+constexpr bool kIsArchS390X = FOLLY_S390X == 1;
+} // namespace
+
+// Vendored from folly/lang/Align.h
+namespace {
+
+constexpr std::size_t hardware_destructive_interference_size =
+ (kIsArchArm || kIsArchS390X) ? 64 : 128;
+
+} // namespace
+
+/*
+ * ProducerConsumerQueue is a one producer and one consumer queue
+ * without locks.
+ */
+template <class T>
+struct ProducerConsumerQueue {
+ typedef T value_type;
+
+ ProducerConsumerQueue(const ProducerConsumerQueue&) = delete;
+ ProducerConsumerQueue& operator=(const ProducerConsumerQueue&) = delete;
+
+ // size must be >= 2.
+ //
+ // Also, note that the number of usable slots in the queue at any
+ // given time is actually (size-1), so if you start with an empty queue,
+ // IsFull() will return true after size-1 insertions.
+ explicit ProducerConsumerQueue(uint32_t size)
+ : size_(size),
+ records_(static_cast<T*>(std::malloc(sizeof(T) * size))),
+ readIndex_(0),
+ writeIndex_(0) {
+ assert(size >= 2);
+ if (!records_) {
+ throw std::bad_alloc();
+ }
+ }
+
+ ~ProducerConsumerQueue() {
+ // We need to destruct anything that may still exist in our queue.
+ // (No real synchronization needed at destructor time: only one
+ // thread can be doing this.)
+ if (!std::is_trivially_destructible<T>::value) {
+ size_t readIndex = readIndex_;
+ size_t endIndex = writeIndex_;
+ while (readIndex != endIndex) {
+ records_[readIndex].~T();
+ if (++readIndex == size_) {
+ readIndex = 0;
+ }
+ }
+ }
+
+ std::free(records_);
+ }
+
+ template <class... Args>
+ bool Write(Args&&... recordArgs) {
+ auto const currentWrite = writeIndex_.load(std::memory_order_relaxed);
+ auto nextRecord = currentWrite + 1;
+ if (nextRecord == size_) {
+ nextRecord = 0;
+ }
+ if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
+ new (&records_[currentWrite]) T(std::forward<Args>(recordArgs)...);
+ writeIndex_.store(nextRecord, std::memory_order_release);
+ return true;
+ }
+
+ // queue is full
+ return false;
+ }
+
+ // move the value at the front of the queue to given variable
+ bool Read(T& record) {
+ auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+ if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
+ // queue is empty
+ return false;
+ }
+
+ auto nextRecord = currentRead + 1;
+ if (nextRecord == size_) {
+ nextRecord = 0;
+ }
+ record = std::move(records_[currentRead]);
+ records_[currentRead].~T();
+ readIndex_.store(nextRecord, std::memory_order_release);
+ return true;
+ }
+
+ // pointer to the value at the front of the queue (for use in-place) or
+ // nullptr if empty.
+ T* FrontPtr() {
+ auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+ if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
+ // queue is empty
+ return nullptr;
+ }
+ return &records_[currentRead];
+ }
+
+ // queue must not be empty
+ void PopFront() {
+ auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+ assert(currentRead != writeIndex_.load(std::memory_order_acquire));
+
+ auto nextRecord = currentRead + 1;
+ if (nextRecord == size_) {
+ nextRecord = 0;
+ }
+ records_[currentRead].~T();
+ readIndex_.store(nextRecord, std::memory_order_release);
+ }
+
+ bool IsEmpty() const {
+ return readIndex_.load(std::memory_order_acquire) ==
+ writeIndex_.load(std::memory_order_acquire);
+ }
+
+ bool IsFull() const {
+ auto nextRecord = writeIndex_.load(std::memory_order_acquire) + 1;
+ if (nextRecord == size_) {
+ nextRecord = 0;
+ }
+ if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
+ return false;
+ }
+ // queue is full
+ return true;
+ }
+
+ // * If called by consumer, then true size may be more (because producer may
+ // be adding items concurrently).
+ // * If called by producer, then true size may be less (because consumer may
+ // be removing items concurrently).
+ // * It is undefined to call this from any other thread.
+ size_t SizeGuess() const {
+ int ret = writeIndex_.load(std::memory_order_acquire) -
+ readIndex_.load(std::memory_order_acquire);
+ if (ret < 0) {
+ ret += size_;
+ }
+ return ret;
+ }
+
+ // maximum number of items in the queue.
+ size_t capacity() const { return size_ - 1; }
+
+ private:
+ using AtomicIndex = std::atomic<unsigned int>;
+
+ char pad0_[hardware_destructive_interference_size];
+ const uint32_t size_;
+ T* const records_;
+
+ AtomicIndex readIndex_;
+ char pad1_[hardware_destructive_interference_size - sizeof(AtomicIndex)];
+ AtomicIndex writeIndex_;
+
+ char pad2_[hardware_destructive_interference_size - sizeof(AtomicIndex)];
+};
+
+} // namespace folly
+} // namespace arrow_vendored
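
A usage sketch of the queue above. Correctness requires exactly one producer thread calling Write and one consumer thread calling Read, and a queue constructed with size N holds at most N-1 items:

    #include <iostream>
    #include <thread>

    #include "arrow/vendored/ProducerConsumerQueue.h"

    int main() {
      // size == 9 gives capacity() == 8 usable slots.
      arrow_vendored::folly::ProducerConsumerQueue<int> q(9);

      std::thread producer([&q] {
        for (int i = 0; i < 100; ++i) {
          while (!q.Write(i)) {
            // Spin: the queue is full, wait for the consumer to drain a slot.
          }
        }
      });

      long long sum = 0;
      for (int received = 0; received < 100;) {
        int value;
        if (q.Read(value)) {  // false means the queue is currently empty
          sum += value;
          ++received;
        }
      }
      producer.join();
      std::cout << sum << std::endl;  // 4950
      return 0;
    }
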
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h
index 6430a57af29..7f6426ac765 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/vendored/portable-snippets/safe-math.h
@@ -58,8 +58,8 @@
# define PSNIP_SAFE__FUNCTION PSNIP_SAFE__COMPILER_ATTRIBUTES static PSNIP_SAFE__INLINE
#endif
-// !defined(__cplusplus) added for Solaris support
-#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+// !defined(__cplusplus) added for Solaris support
+#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
# define psnip_safe_bool _Bool
#else
# define psnip_safe_bool int
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc b/contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc
index 83d4de210d3..851785081c7 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/visitor.cc
@@ -67,7 +67,7 @@ ARRAY_VISITOR_DEFAULT(SparseUnionArray)
ARRAY_VISITOR_DEFAULT(DenseUnionArray)
ARRAY_VISITOR_DEFAULT(DictionaryArray)
ARRAY_VISITOR_DEFAULT(Decimal128Array)
-ARRAY_VISITOR_DEFAULT(Decimal256Array)
+ARRAY_VISITOR_DEFAULT(Decimal256Array)
ARRAY_VISITOR_DEFAULT(ExtensionArray)
#undef ARRAY_VISITOR_DEFAULT
@@ -107,7 +107,7 @@ TYPE_VISITOR_DEFAULT(DayTimeIntervalType)
TYPE_VISITOR_DEFAULT(MonthIntervalType)
TYPE_VISITOR_DEFAULT(DurationType)
TYPE_VISITOR_DEFAULT(Decimal128Type)
-TYPE_VISITOR_DEFAULT(Decimal256Type)
+TYPE_VISITOR_DEFAULT(Decimal256Type)
TYPE_VISITOR_DEFAULT(ListType)
TYPE_VISITOR_DEFAULT(LargeListType)
TYPE_VISITOR_DEFAULT(MapType)
@@ -156,7 +156,7 @@ SCALAR_VISITOR_DEFAULT(DayTimeIntervalScalar)
SCALAR_VISITOR_DEFAULT(MonthIntervalScalar)
SCALAR_VISITOR_DEFAULT(DurationScalar)
SCALAR_VISITOR_DEFAULT(Decimal128Scalar)
-SCALAR_VISITOR_DEFAULT(Decimal256Scalar)
+SCALAR_VISITOR_DEFAULT(Decimal256Scalar)
SCALAR_VISITOR_DEFAULT(ListScalar)
SCALAR_VISITOR_DEFAULT(LargeListScalar)
SCALAR_VISITOR_DEFAULT(MapScalar)
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/visitor.h b/contrib/libs/apache/arrow/cpp/src/arrow/visitor.h
index fe49f51ce3d..0382e461199 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/visitor.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/visitor.h
@@ -54,7 +54,7 @@ class ARROW_EXPORT ArrayVisitor {
virtual Status Visit(const MonthIntervalArray& array);
virtual Status Visit(const DurationArray& array);
virtual Status Visit(const Decimal128Array& array);
- virtual Status Visit(const Decimal256Array& array);
+ virtual Status Visit(const Decimal256Array& array);
virtual Status Visit(const ListArray& array);
virtual Status Visit(const LargeListArray& array);
virtual Status Visit(const MapArray& array);
@@ -97,7 +97,7 @@ class ARROW_EXPORT TypeVisitor {
virtual Status Visit(const DayTimeIntervalType& type);
virtual Status Visit(const DurationType& type);
virtual Status Visit(const Decimal128Type& type);
- virtual Status Visit(const Decimal256Type& type);
+ virtual Status Visit(const Decimal256Type& type);
virtual Status Visit(const ListType& type);
virtual Status Visit(const LargeListType& type);
virtual Status Visit(const MapType& type);
@@ -140,7 +140,7 @@ class ARROW_EXPORT ScalarVisitor {
virtual Status Visit(const MonthIntervalScalar& scalar);
virtual Status Visit(const DurationScalar& scalar);
virtual Status Visit(const Decimal128Scalar& scalar);
- virtual Status Visit(const Decimal256Scalar& scalar);
+ virtual Status Visit(const Decimal256Scalar& scalar);
virtual Status Visit(const ListScalar& scalar);
virtual Status Visit(const LargeListScalar& scalar);
virtual Status Visit(const MapScalar& scalar);
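
The Decimal256 overloads added above slot into the standard visitor pattern. A hedged sketch of a subclass, assuming the upstream defaults (unhandled types return Status::NotImplemented) and the upstream Array::Accept(ArrayVisitor*) entry point:

    #include "arrow/api.h"
    #include "arrow/visitor.h"

    // Counts decimal-typed arrays; every other array type falls through to the
    // base class's default Visit.
    class DecimalCounter : public arrow::ArrayVisitor {
     public:
      arrow::Status Visit(const arrow::Decimal128Array& array) override {
        ++count_;
        return arrow::Status::OK();
      }
      arrow::Status Visit(const arrow::Decimal256Array& array) override {
        ++count_;
        return arrow::Status::OK();
      }
      int count_ = 0;
    };

    // Usage, for some std::shared_ptr<arrow::Array> arr:
    //   DecimalCounter counter;
    //   ARROW_RETURN_NOT_OK(arr->Accept(&counter));
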
diff --git a/contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h b/contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h
index 69d443b48cd..132c35aeaa1 100644
--- a/contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h
+++ b/contrib/libs/apache/arrow/cpp/src/arrow/visitor_inline.h
@@ -68,7 +68,7 @@ namespace arrow {
ACTION(MonthInterval); \
ACTION(DayTimeInterval); \
ACTION(Decimal128); \
- ACTION(Decimal256); \
+ ACTION(Decimal256); \
ACTION(List); \
ACTION(LargeList); \
ACTION(Map); \
@@ -199,9 +199,9 @@ struct ArrayDataInlineVisitor<T, enable_if_base_binary<T>> {
using offset_type = typename T::offset_type;
constexpr char empty_value = 0;
- if (arr.length == 0) {
- return Status::OK();
- }
+ if (arr.length == 0) {
+ return Status::OK();
+ }
const offset_type* offsets = arr.GetValues<offset_type>(1);
const char* data;
if (!arr.buffers[2]) {
@@ -232,9 +232,9 @@ struct ArrayDataInlineVisitor<T, enable_if_base_binary<T>> {
using offset_type = typename T::offset_type;
constexpr uint8_t empty_value = 0;
- if (arr.length == 0) {
- return;
- }
+ if (arr.length == 0) {
+ return;
+ }
const offset_type* offsets = arr.GetValues<offset_type>(1);
const uint8_t* data;
if (!arr.buffers[2]) {
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp b/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp
index 5df101f4369..b1b4ce62673 100644
--- a/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp
+++ b/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.cpp
@@ -1,17 +1,17 @@
-/**
- * Autogenerated by Thrift Compiler (0.13.0)
- *
- * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
- * @generated
- */
-#include "parquet_constants.h"
-
-namespace parquet { namespace format {
-
-const parquetConstants g_parquet_constants;
-
-parquetConstants::parquetConstants() {
-}
-
-}} // namespace
-
+/**
+ * Autogenerated by Thrift Compiler (0.13.0)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ * @generated
+ */
+#include "parquet_constants.h"
+
+namespace parquet { namespace format {
+
+const parquetConstants g_parquet_constants;
+
+parquetConstants::parquetConstants() {
+}
+
+}} // namespace
+
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h b/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h
index 98df7236774..1e288c7cd1f 100644
--- a/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h
+++ b/contrib/libs/apache/arrow/cpp/src/generated/parquet_constants.h
@@ -1,24 +1,24 @@
-/**
- * Autogenerated by Thrift Compiler (0.13.0)
- *
- * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
- * @generated
- */
-#ifndef parquet_CONSTANTS_H
-#define parquet_CONSTANTS_H
-
-#include "parquet_types.h"
-
-namespace parquet { namespace format {
-
-class parquetConstants {
- public:
- parquetConstants();
-
-};
-
-extern const parquetConstants g_parquet_constants;
-
-}} // namespace
-
-#endif
+/**
+ * Autogenerated by Thrift Compiler (0.13.0)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ * @generated
+ */
+#ifndef parquet_CONSTANTS_H
+#define parquet_CONSTANTS_H
+
+#include "parquet_types.h"
+
+namespace parquet { namespace format {
+
+class parquetConstants {
+ public:
+ parquetConstants();
+
+};
+
+extern const parquetConstants g_parquet_constants;
+
+}} // namespace
+
+#endif
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp b/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp
index ca55e9ab0ae..7c7289658ee 100644
--- a/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp
+++ b/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.cpp
@@ -1,7415 +1,7415 @@
-/**
- * Autogenerated by Thrift Compiler (0.13.0)
- *
- * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
- * @generated
- */
-#include "parquet_types.h"
-
-#include <algorithm>
-#include <ostream>
-
-#include <thrift/TToString.h>
-
-namespace parquet { namespace format {
-
-int _kTypeValues[] = {
- Type::BOOLEAN,
- Type::INT32,
- Type::INT64,
- Type::INT96,
- Type::FLOAT,
- Type::DOUBLE,
- Type::BYTE_ARRAY,
- Type::FIXED_LEN_BYTE_ARRAY
-};
-const char* _kTypeNames[] = {
- "BOOLEAN",
- "INT32",
- "INT64",
- "INT96",
- "FLOAT",
- "DOUBLE",
- "BYTE_ARRAY",
- "FIXED_LEN_BYTE_ARRAY"
-};
-const std::map<int, const char*> _Type_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(8, _kTypeValues, _kTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const Type::type& val) {
- std::map<int, const char*>::const_iterator it = _Type_VALUES_TO_NAMES.find(val);
- if (it != _Type_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const Type::type& val) {
- std::map<int, const char*>::const_iterator it = _Type_VALUES_TO_NAMES.find(val);
- if (it != _Type_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-int _kConvertedTypeValues[] = {
- ConvertedType::UTF8,
- ConvertedType::MAP,
- ConvertedType::MAP_KEY_VALUE,
- ConvertedType::LIST,
- ConvertedType::ENUM,
- ConvertedType::DECIMAL,
- ConvertedType::DATE,
- ConvertedType::TIME_MILLIS,
- ConvertedType::TIME_MICROS,
- ConvertedType::TIMESTAMP_MILLIS,
- ConvertedType::TIMESTAMP_MICROS,
- ConvertedType::UINT_8,
- ConvertedType::UINT_16,
- ConvertedType::UINT_32,
- ConvertedType::UINT_64,
- ConvertedType::INT_8,
- ConvertedType::INT_16,
- ConvertedType::INT_32,
- ConvertedType::INT_64,
- ConvertedType::JSON,
- ConvertedType::BSON,
- ConvertedType::INTERVAL
-};
-const char* _kConvertedTypeNames[] = {
- "UTF8",
- "MAP",
- "MAP_KEY_VALUE",
- "LIST",
- "ENUM",
- "DECIMAL",
- "DATE",
- "TIME_MILLIS",
- "TIME_MICROS",
- "TIMESTAMP_MILLIS",
- "TIMESTAMP_MICROS",
- "UINT_8",
- "UINT_16",
- "UINT_32",
- "UINT_64",
- "INT_8",
- "INT_16",
- "INT_32",
- "INT_64",
- "JSON",
- "BSON",
- "INTERVAL"
-};
-const std::map<int, const char*> _ConvertedType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(22, _kConvertedTypeValues, _kConvertedTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const ConvertedType::type& val) {
- std::map<int, const char*>::const_iterator it = _ConvertedType_VALUES_TO_NAMES.find(val);
- if (it != _ConvertedType_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const ConvertedType::type& val) {
- std::map<int, const char*>::const_iterator it = _ConvertedType_VALUES_TO_NAMES.find(val);
- if (it != _ConvertedType_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-int _kFieldRepetitionTypeValues[] = {
- FieldRepetitionType::REQUIRED,
- FieldRepetitionType::OPTIONAL,
- FieldRepetitionType::REPEATED
-};
-const char* _kFieldRepetitionTypeNames[] = {
- "REQUIRED",
- "OPTIONAL",
- "REPEATED"
-};
-const std::map<int, const char*> _FieldRepetitionType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(3, _kFieldRepetitionTypeValues, _kFieldRepetitionTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const FieldRepetitionType::type& val) {
- std::map<int, const char*>::const_iterator it = _FieldRepetitionType_VALUES_TO_NAMES.find(val);
- if (it != _FieldRepetitionType_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const FieldRepetitionType::type& val) {
- std::map<int, const char*>::const_iterator it = _FieldRepetitionType_VALUES_TO_NAMES.find(val);
- if (it != _FieldRepetitionType_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-int _kEncodingValues[] = {
- Encoding::PLAIN,
- Encoding::PLAIN_DICTIONARY,
- Encoding::RLE,
- Encoding::BIT_PACKED,
- Encoding::DELTA_BINARY_PACKED,
- Encoding::DELTA_LENGTH_BYTE_ARRAY,
- Encoding::DELTA_BYTE_ARRAY,
- Encoding::RLE_DICTIONARY,
- Encoding::BYTE_STREAM_SPLIT
-};
-const char* _kEncodingNames[] = {
- "PLAIN",
- "PLAIN_DICTIONARY",
- "RLE",
- "BIT_PACKED",
- "DELTA_BINARY_PACKED",
- "DELTA_LENGTH_BYTE_ARRAY",
- "DELTA_BYTE_ARRAY",
- "RLE_DICTIONARY",
- "BYTE_STREAM_SPLIT"
-};
-const std::map<int, const char*> _Encoding_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(9, _kEncodingValues, _kEncodingNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const Encoding::type& val) {
- std::map<int, const char*>::const_iterator it = _Encoding_VALUES_TO_NAMES.find(val);
- if (it != _Encoding_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const Encoding::type& val) {
- std::map<int, const char*>::const_iterator it = _Encoding_VALUES_TO_NAMES.find(val);
- if (it != _Encoding_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-int _kCompressionCodecValues[] = {
- CompressionCodec::UNCOMPRESSED,
- CompressionCodec::SNAPPY,
- CompressionCodec::GZIP,
- CompressionCodec::LZO,
- CompressionCodec::BROTLI,
- CompressionCodec::LZ4,
- CompressionCodec::ZSTD,
- CompressionCodec::LZ4_RAW
-};
-const char* _kCompressionCodecNames[] = {
- "UNCOMPRESSED",
- "SNAPPY",
- "GZIP",
- "LZO",
- "BROTLI",
- "LZ4",
- "ZSTD",
- "LZ4_RAW"
-};
-const std::map<int, const char*> _CompressionCodec_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(8, _kCompressionCodecValues, _kCompressionCodecNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const CompressionCodec::type& val) {
- std::map<int, const char*>::const_iterator it = _CompressionCodec_VALUES_TO_NAMES.find(val);
- if (it != _CompressionCodec_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const CompressionCodec::type& val) {
- std::map<int, const char*>::const_iterator it = _CompressionCodec_VALUES_TO_NAMES.find(val);
- if (it != _CompressionCodec_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-int _kPageTypeValues[] = {
- PageType::DATA_PAGE,
- PageType::INDEX_PAGE,
- PageType::DICTIONARY_PAGE,
- PageType::DATA_PAGE_V2
-};
-const char* _kPageTypeNames[] = {
- "DATA_PAGE",
- "INDEX_PAGE",
- "DICTIONARY_PAGE",
- "DATA_PAGE_V2"
-};
-const std::map<int, const char*> _PageType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(4, _kPageTypeValues, _kPageTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const PageType::type& val) {
- std::map<int, const char*>::const_iterator it = _PageType_VALUES_TO_NAMES.find(val);
- if (it != _PageType_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const PageType::type& val) {
- std::map<int, const char*>::const_iterator it = _PageType_VALUES_TO_NAMES.find(val);
- if (it != _PageType_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-int _kBoundaryOrderValues[] = {
- BoundaryOrder::UNORDERED,
- BoundaryOrder::ASCENDING,
- BoundaryOrder::DESCENDING
-};
-const char* _kBoundaryOrderNames[] = {
- "UNORDERED",
- "ASCENDING",
- "DESCENDING"
-};
-const std::map<int, const char*> _BoundaryOrder_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(3, _kBoundaryOrderValues, _kBoundaryOrderNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
-
-std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val) {
- std::map<int, const char*>::const_iterator it = _BoundaryOrder_VALUES_TO_NAMES.find(val);
- if (it != _BoundaryOrder_VALUES_TO_NAMES.end()) {
- out << it->second;
- } else {
- out << static_cast<int>(val);
- }
- return out;
-}
-
-std::string to_string(const BoundaryOrder::type& val) {
- std::map<int, const char*>::const_iterator it = _BoundaryOrder_VALUES_TO_NAMES.find(val);
- if (it != _BoundaryOrder_VALUES_TO_NAMES.end()) {
- return std::string(it->second);
- } else {
- return std::to_string(static_cast<int>(val));
- }
-}
-
-
-Statistics::~Statistics() noexcept {
-}
-
-
-void Statistics::__set_max(const std::string& val) {
- this->max = val;
-__isset.max = true;
-}
-
-void Statistics::__set_min(const std::string& val) {
- this->min = val;
-__isset.min = true;
-}
-
-void Statistics::__set_null_count(const int64_t val) {
- this->null_count = val;
-__isset.null_count = true;
-}
-
-void Statistics::__set_distinct_count(const int64_t val) {
- this->distinct_count = val;
-__isset.distinct_count = true;
-}
-
-void Statistics::__set_max_value(const std::string& val) {
- this->max_value = val;
-__isset.max_value = true;
-}
-
-void Statistics::__set_min_value(const std::string& val) {
- this->min_value = val;
-__isset.min_value = true;
-}
-std::ostream& operator<<(std::ostream& out, const Statistics& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t Statistics::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->max);
- this->__isset.max = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->min);
- this->__isset.min = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->null_count);
- this->__isset.null_count = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->distinct_count);
- this->__isset.distinct_count = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->max_value);
- this->__isset.max_value = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->min_value);
- this->__isset.min_value = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t Statistics::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("Statistics");
-
- if (this->__isset.max) {
- xfer += oprot->writeFieldBegin("max", ::apache::thrift::protocol::T_STRING, 1);
- xfer += oprot->writeBinary(this->max);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.min) {
- xfer += oprot->writeFieldBegin("min", ::apache::thrift::protocol::T_STRING, 2);
- xfer += oprot->writeBinary(this->min);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.null_count) {
- xfer += oprot->writeFieldBegin("null_count", ::apache::thrift::protocol::T_I64, 3);
- xfer += oprot->writeI64(this->null_count);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.distinct_count) {
- xfer += oprot->writeFieldBegin("distinct_count", ::apache::thrift::protocol::T_I64, 4);
- xfer += oprot->writeI64(this->distinct_count);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.max_value) {
- xfer += oprot->writeFieldBegin("max_value", ::apache::thrift::protocol::T_STRING, 5);
- xfer += oprot->writeBinary(this->max_value);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.min_value) {
- xfer += oprot->writeFieldBegin("min_value", ::apache::thrift::protocol::T_STRING, 6);
- xfer += oprot->writeBinary(this->min_value);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(Statistics &a, Statistics &b) {
- using ::std::swap;
- swap(a.max, b.max);
- swap(a.min, b.min);
- swap(a.null_count, b.null_count);
- swap(a.distinct_count, b.distinct_count);
- swap(a.max_value, b.max_value);
- swap(a.min_value, b.min_value);
- swap(a.__isset, b.__isset);
-}
-
-Statistics::Statistics(const Statistics& other0) {
- max = other0.max;
- min = other0.min;
- null_count = other0.null_count;
- distinct_count = other0.distinct_count;
- max_value = other0.max_value;
- min_value = other0.min_value;
- __isset = other0.__isset;
-}
-Statistics& Statistics::operator=(const Statistics& other1) {
- max = other1.max;
- min = other1.min;
- null_count = other1.null_count;
- distinct_count = other1.distinct_count;
- max_value = other1.max_value;
- min_value = other1.min_value;
- __isset = other1.__isset;
- return *this;
-}
-void Statistics::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "Statistics(";
- out << "max="; (__isset.max ? (out << to_string(max)) : (out << "<null>"));
- out << ", " << "min="; (__isset.min ? (out << to_string(min)) : (out << "<null>"));
- out << ", " << "null_count="; (__isset.null_count ? (out << to_string(null_count)) : (out << "<null>"));
- out << ", " << "distinct_count="; (__isset.distinct_count ? (out << to_string(distinct_count)) : (out << "<null>"));
- out << ", " << "max_value="; (__isset.max_value ? (out << to_string(max_value)) : (out << "<null>"));
- out << ", " << "min_value="; (__isset.min_value ? (out << to_string(min_value)) : (out << "<null>"));
- out << ")";
-}
-
-
-StringType::~StringType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const StringType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t StringType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t StringType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("StringType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(StringType &a, StringType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-StringType::StringType(const StringType& other2) {
- (void) other2;
-}
-StringType& StringType::operator=(const StringType& other3) {
- (void) other3;
- return *this;
-}
-void StringType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "StringType(";
- out << ")";
-}
-
-
-UUIDType::~UUIDType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const UUIDType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t UUIDType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t UUIDType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("UUIDType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(UUIDType &a, UUIDType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-UUIDType::UUIDType(const UUIDType& other4) {
- (void) other4;
-}
-UUIDType& UUIDType::operator=(const UUIDType& other5) {
- (void) other5;
- return *this;
-}
-void UUIDType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "UUIDType(";
- out << ")";
-}
-
-
-MapType::~MapType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const MapType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t MapType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t MapType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("MapType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(MapType &a, MapType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-MapType::MapType(const MapType& other6) {
- (void) other6;
-}
-MapType& MapType::operator=(const MapType& other7) {
- (void) other7;
- return *this;
-}
-void MapType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "MapType(";
- out << ")";
-}
-
-
-ListType::~ListType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const ListType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t ListType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t ListType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("ListType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(ListType &a, ListType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-ListType::ListType(const ListType& other8) {
- (void) other8;
-}
-ListType& ListType::operator=(const ListType& other9) {
- (void) other9;
- return *this;
-}
-void ListType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "ListType(";
- out << ")";
-}
-
-
-EnumType::~EnumType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const EnumType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t EnumType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t EnumType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("EnumType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(EnumType &a, EnumType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-EnumType::EnumType(const EnumType& other10) {
- (void) other10;
-}
-EnumType& EnumType::operator=(const EnumType& other11) {
- (void) other11;
- return *this;
-}
-void EnumType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "EnumType(";
- out << ")";
-}
-
-
-DateType::~DateType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const DateType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t DateType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t DateType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("DateType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(DateType &a, DateType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-DateType::DateType(const DateType& other12) {
- (void) other12;
-}
-DateType& DateType::operator=(const DateType& other13) {
- (void) other13;
- return *this;
-}
-void DateType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "DateType(";
- out << ")";
-}
-
-
-NullType::~NullType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const NullType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t NullType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t NullType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("NullType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(NullType &a, NullType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-NullType::NullType(const NullType& other14) {
- (void) other14;
-}
-NullType& NullType::operator=(const NullType& other15) {
- (void) other15;
- return *this;
-}
-void NullType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "NullType(";
- out << ")";
-}
-
-
-DecimalType::~DecimalType() noexcept {
-}
-
-
-void DecimalType::__set_scale(const int32_t val) {
- this->scale = val;
-}
-
-void DecimalType::__set_precision(const int32_t val) {
- this->precision = val;
-}
-std::ostream& operator<<(std::ostream& out, const DecimalType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t DecimalType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_scale = false;
- bool isset_precision = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->scale);
- isset_scale = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->precision);
- isset_precision = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_scale)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_precision)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t DecimalType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("DecimalType");
-
- xfer += oprot->writeFieldBegin("scale", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->scale);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("precision", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32(this->precision);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(DecimalType &a, DecimalType &b) {
- using ::std::swap;
- swap(a.scale, b.scale);
- swap(a.precision, b.precision);
-}
-
-DecimalType::DecimalType(const DecimalType& other16) {
- scale = other16.scale;
- precision = other16.precision;
-}
-DecimalType& DecimalType::operator=(const DecimalType& other17) {
- scale = other17.scale;
- precision = other17.precision;
- return *this;
-}
-void DecimalType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "DecimalType(";
- out << "scale=" << to_string(scale);
- out << ", " << "precision=" << to_string(precision);
- out << ")";
-}
-
-
-MilliSeconds::~MilliSeconds() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const MilliSeconds& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t MilliSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t MilliSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("MilliSeconds");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(MilliSeconds &a, MilliSeconds &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-MilliSeconds::MilliSeconds(const MilliSeconds& other18) {
- (void) other18;
-}
-MilliSeconds& MilliSeconds::operator=(const MilliSeconds& other19) {
- (void) other19;
- return *this;
-}
-void MilliSeconds::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "MilliSeconds(";
- out << ")";
-}
-
-
-MicroSeconds::~MicroSeconds() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const MicroSeconds& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t MicroSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t MicroSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("MicroSeconds");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(MicroSeconds &a, MicroSeconds &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-MicroSeconds::MicroSeconds(const MicroSeconds& other20) {
- (void) other20;
-}
-MicroSeconds& MicroSeconds::operator=(const MicroSeconds& other21) {
- (void) other21;
- return *this;
-}
-void MicroSeconds::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "MicroSeconds(";
- out << ")";
-}
-
-
-NanoSeconds::~NanoSeconds() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const NanoSeconds& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t NanoSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t NanoSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("NanoSeconds");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(NanoSeconds &a, NanoSeconds &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-NanoSeconds::NanoSeconds(const NanoSeconds& other22) {
- (void) other22;
-}
-NanoSeconds& NanoSeconds::operator=(const NanoSeconds& other23) {
- (void) other23;
- return *this;
-}
-void NanoSeconds::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "NanoSeconds(";
- out << ")";
-}
-
-
-TimeUnit::~TimeUnit() noexcept {
-}
-
-
-void TimeUnit::__set_MILLIS(const MilliSeconds& val) {
- this->MILLIS = val;
-__isset.MILLIS = true;
-}
-
-void TimeUnit::__set_MICROS(const MicroSeconds& val) {
- this->MICROS = val;
-__isset.MICROS = true;
-}
-
-void TimeUnit::__set_NANOS(const NanoSeconds& val) {
- this->NANOS = val;
-__isset.NANOS = true;
-}
-std::ostream& operator<<(std::ostream& out, const TimeUnit& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t TimeUnit::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->MILLIS.read(iprot);
- this->__isset.MILLIS = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->MICROS.read(iprot);
- this->__isset.MICROS = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->NANOS.read(iprot);
- this->__isset.NANOS = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t TimeUnit::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("TimeUnit");
-
- if (this->__isset.MILLIS) {
- xfer += oprot->writeFieldBegin("MILLIS", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->MILLIS.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.MICROS) {
- xfer += oprot->writeFieldBegin("MICROS", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->MICROS.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.NANOS) {
- xfer += oprot->writeFieldBegin("NANOS", ::apache::thrift::protocol::T_STRUCT, 3);
- xfer += this->NANOS.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(TimeUnit &a, TimeUnit &b) {
- using ::std::swap;
- swap(a.MILLIS, b.MILLIS);
- swap(a.MICROS, b.MICROS);
- swap(a.NANOS, b.NANOS);
- swap(a.__isset, b.__isset);
-}
-
-TimeUnit::TimeUnit(const TimeUnit& other24) {
- MILLIS = other24.MILLIS;
- MICROS = other24.MICROS;
- NANOS = other24.NANOS;
- __isset = other24.__isset;
-}
-TimeUnit& TimeUnit::operator=(const TimeUnit& other25) {
- MILLIS = other25.MILLIS;
- MICROS = other25.MICROS;
- NANOS = other25.NANOS;
- __isset = other25.__isset;
- return *this;
-}
-void TimeUnit::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "TimeUnit(";
- out << "MILLIS="; (__isset.MILLIS ? (out << to_string(MILLIS)) : (out << "<null>"));
- out << ", " << "MICROS="; (__isset.MICROS ? (out << to_string(MICROS)) : (out << "<null>"));
- out << ", " << "NANOS="; (__isset.NANOS ? (out << to_string(NANOS)) : (out << "<null>"));
- out << ")";
-}
-
-
-TimestampType::~TimestampType() noexcept {
-}
-
-
-void TimestampType::__set_isAdjustedToUTC(const bool val) {
- this->isAdjustedToUTC = val;
-}
-
-void TimestampType::__set_unit(const TimeUnit& val) {
- this->unit = val;
-}
-std::ostream& operator<<(std::ostream& out, const TimestampType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t TimestampType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_isAdjustedToUTC = false;
- bool isset_unit = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->isAdjustedToUTC);
- isset_isAdjustedToUTC = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->unit.read(iprot);
- isset_unit = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_isAdjustedToUTC)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_unit)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t TimestampType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("TimestampType");
-
- xfer += oprot->writeFieldBegin("isAdjustedToUTC", ::apache::thrift::protocol::T_BOOL, 1);
- xfer += oprot->writeBool(this->isAdjustedToUTC);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("unit", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->unit.write(oprot);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(TimestampType &a, TimestampType &b) {
- using ::std::swap;
- swap(a.isAdjustedToUTC, b.isAdjustedToUTC);
- swap(a.unit, b.unit);
-}
-
-TimestampType::TimestampType(const TimestampType& other26) {
- isAdjustedToUTC = other26.isAdjustedToUTC;
- unit = other26.unit;
-}
-TimestampType& TimestampType::operator=(const TimestampType& other27) {
- isAdjustedToUTC = other27.isAdjustedToUTC;
- unit = other27.unit;
- return *this;
-}
-void TimestampType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "TimestampType(";
- out << "isAdjustedToUTC=" << to_string(isAdjustedToUTC);
- out << ", " << "unit=" << to_string(unit);
- out << ")";
-}
-
-
-TimeType::~TimeType() noexcept {
-}
-
-
-void TimeType::__set_isAdjustedToUTC(const bool val) {
- this->isAdjustedToUTC = val;
-}
-
-void TimeType::__set_unit(const TimeUnit& val) {
- this->unit = val;
-}
-std::ostream& operator<<(std::ostream& out, const TimeType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t TimeType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_isAdjustedToUTC = false;
- bool isset_unit = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->isAdjustedToUTC);
- isset_isAdjustedToUTC = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->unit.read(iprot);
- isset_unit = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_isAdjustedToUTC)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_unit)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t TimeType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("TimeType");
-
- xfer += oprot->writeFieldBegin("isAdjustedToUTC", ::apache::thrift::protocol::T_BOOL, 1);
- xfer += oprot->writeBool(this->isAdjustedToUTC);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("unit", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->unit.write(oprot);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(TimeType &a, TimeType &b) {
- using ::std::swap;
- swap(a.isAdjustedToUTC, b.isAdjustedToUTC);
- swap(a.unit, b.unit);
-}
-
-TimeType::TimeType(const TimeType& other28) {
- isAdjustedToUTC = other28.isAdjustedToUTC;
- unit = other28.unit;
-}
-TimeType& TimeType::operator=(const TimeType& other29) {
- isAdjustedToUTC = other29.isAdjustedToUTC;
- unit = other29.unit;
- return *this;
-}
-void TimeType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "TimeType(";
- out << "isAdjustedToUTC=" << to_string(isAdjustedToUTC);
- out << ", " << "unit=" << to_string(unit);
- out << ")";
-}
-
-
-IntType::~IntType() noexcept {
-}
-
-
-void IntType::__set_bitWidth(const int8_t val) {
- this->bitWidth = val;
-}
-
-void IntType::__set_isSigned(const bool val) {
- this->isSigned = val;
-}
-std::ostream& operator<<(std::ostream& out, const IntType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t IntType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_bitWidth = false;
- bool isset_isSigned = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_BYTE) {
- xfer += iprot->readByte(this->bitWidth);
- isset_bitWidth = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->isSigned);
- isset_isSigned = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_bitWidth)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_isSigned)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t IntType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("IntType");
-
- xfer += oprot->writeFieldBegin("bitWidth", ::apache::thrift::protocol::T_BYTE, 1);
- xfer += oprot->writeByte(this->bitWidth);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("isSigned", ::apache::thrift::protocol::T_BOOL, 2);
- xfer += oprot->writeBool(this->isSigned);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(IntType &a, IntType &b) {
- using ::std::swap;
- swap(a.bitWidth, b.bitWidth);
- swap(a.isSigned, b.isSigned);
-}
-
-IntType::IntType(const IntType& other30) {
- bitWidth = other30.bitWidth;
- isSigned = other30.isSigned;
-}
-IntType& IntType::operator=(const IntType& other31) {
- bitWidth = other31.bitWidth;
- isSigned = other31.isSigned;
- return *this;
-}
-void IntType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "IntType(";
- out << "bitWidth=" << to_string(bitWidth);
- out << ", " << "isSigned=" << to_string(isSigned);
- out << ")";
-}
-
-
-JsonType::~JsonType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const JsonType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t JsonType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t JsonType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("JsonType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(JsonType &a, JsonType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-JsonType::JsonType(const JsonType& other32) {
- (void) other32;
-}
-JsonType& JsonType::operator=(const JsonType& other33) {
- (void) other33;
- return *this;
-}
-void JsonType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "JsonType(";
- out << ")";
-}
-
-
-BsonType::~BsonType() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const BsonType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t BsonType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t BsonType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("BsonType");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(BsonType &a, BsonType &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-BsonType::BsonType(const BsonType& other34) {
- (void) other34;
-}
-BsonType& BsonType::operator=(const BsonType& other35) {
- (void) other35;
- return *this;
-}
-void BsonType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "BsonType(";
- out << ")";
-}
-
-
-LogicalType::~LogicalType() noexcept {
-}
-
-
-void LogicalType::__set_STRING(const StringType& val) {
- this->STRING = val;
-__isset.STRING = true;
-}
-
-void LogicalType::__set_MAP(const MapType& val) {
- this->MAP = val;
-__isset.MAP = true;
-}
-
-void LogicalType::__set_LIST(const ListType& val) {
- this->LIST = val;
-__isset.LIST = true;
-}
-
-void LogicalType::__set_ENUM(const EnumType& val) {
- this->ENUM = val;
-__isset.ENUM = true;
-}
-
-void LogicalType::__set_DECIMAL(const DecimalType& val) {
- this->DECIMAL = val;
-__isset.DECIMAL = true;
-}
-
-void LogicalType::__set_DATE(const DateType& val) {
- this->DATE = val;
-__isset.DATE = true;
-}
-
-void LogicalType::__set_TIME(const TimeType& val) {
- this->TIME = val;
-__isset.TIME = true;
-}
-
-void LogicalType::__set_TIMESTAMP(const TimestampType& val) {
- this->TIMESTAMP = val;
-__isset.TIMESTAMP = true;
-}
-
-void LogicalType::__set_INTEGER(const IntType& val) {
- this->INTEGER = val;
-__isset.INTEGER = true;
-}
-
-void LogicalType::__set_UNKNOWN(const NullType& val) {
- this->UNKNOWN = val;
-__isset.UNKNOWN = true;
-}
-
-void LogicalType::__set_JSON(const JsonType& val) {
- this->JSON = val;
-__isset.JSON = true;
-}
-
-void LogicalType::__set_BSON(const BsonType& val) {
- this->BSON = val;
-__isset.BSON = true;
-}
-
-void LogicalType::__set_UUID(const UUIDType& val) {
- this->UUID = val;
-__isset.UUID = true;
-}
-std::ostream& operator<<(std::ostream& out, const LogicalType& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t LogicalType::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->STRING.read(iprot);
- this->__isset.STRING = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->MAP.read(iprot);
- this->__isset.MAP = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->LIST.read(iprot);
- this->__isset.LIST = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->ENUM.read(iprot);
- this->__isset.ENUM = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->DECIMAL.read(iprot);
- this->__isset.DECIMAL = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->DATE.read(iprot);
- this->__isset.DATE = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->TIME.read(iprot);
- this->__isset.TIME = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->TIMESTAMP.read(iprot);
- this->__isset.TIMESTAMP = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 10:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->INTEGER.read(iprot);
- this->__isset.INTEGER = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 11:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->UNKNOWN.read(iprot);
- this->__isset.UNKNOWN = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 12:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->JSON.read(iprot);
- this->__isset.JSON = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 13:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->BSON.read(iprot);
- this->__isset.BSON = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 14:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->UUID.read(iprot);
- this->__isset.UUID = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t LogicalType::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("LogicalType");
-
- if (this->__isset.STRING) {
- xfer += oprot->writeFieldBegin("STRING", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->STRING.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.MAP) {
- xfer += oprot->writeFieldBegin("MAP", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->MAP.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.LIST) {
- xfer += oprot->writeFieldBegin("LIST", ::apache::thrift::protocol::T_STRUCT, 3);
- xfer += this->LIST.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.ENUM) {
- xfer += oprot->writeFieldBegin("ENUM", ::apache::thrift::protocol::T_STRUCT, 4);
- xfer += this->ENUM.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.DECIMAL) {
- xfer += oprot->writeFieldBegin("DECIMAL", ::apache::thrift::protocol::T_STRUCT, 5);
- xfer += this->DECIMAL.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.DATE) {
- xfer += oprot->writeFieldBegin("DATE", ::apache::thrift::protocol::T_STRUCT, 6);
- xfer += this->DATE.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.TIME) {
- xfer += oprot->writeFieldBegin("TIME", ::apache::thrift::protocol::T_STRUCT, 7);
- xfer += this->TIME.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.TIMESTAMP) {
- xfer += oprot->writeFieldBegin("TIMESTAMP", ::apache::thrift::protocol::T_STRUCT, 8);
- xfer += this->TIMESTAMP.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.INTEGER) {
- xfer += oprot->writeFieldBegin("INTEGER", ::apache::thrift::protocol::T_STRUCT, 10);
- xfer += this->INTEGER.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.UNKNOWN) {
- xfer += oprot->writeFieldBegin("UNKNOWN", ::apache::thrift::protocol::T_STRUCT, 11);
- xfer += this->UNKNOWN.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.JSON) {
- xfer += oprot->writeFieldBegin("JSON", ::apache::thrift::protocol::T_STRUCT, 12);
- xfer += this->JSON.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.BSON) {
- xfer += oprot->writeFieldBegin("BSON", ::apache::thrift::protocol::T_STRUCT, 13);
- xfer += this->BSON.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.UUID) {
- xfer += oprot->writeFieldBegin("UUID", ::apache::thrift::protocol::T_STRUCT, 14);
- xfer += this->UUID.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(LogicalType &a, LogicalType &b) {
- using ::std::swap;
- swap(a.STRING, b.STRING);
- swap(a.MAP, b.MAP);
- swap(a.LIST, b.LIST);
- swap(a.ENUM, b.ENUM);
- swap(a.DECIMAL, b.DECIMAL);
- swap(a.DATE, b.DATE);
- swap(a.TIME, b.TIME);
- swap(a.TIMESTAMP, b.TIMESTAMP);
- swap(a.INTEGER, b.INTEGER);
- swap(a.UNKNOWN, b.UNKNOWN);
- swap(a.JSON, b.JSON);
- swap(a.BSON, b.BSON);
- swap(a.UUID, b.UUID);
- swap(a.__isset, b.__isset);
-}
-
-LogicalType::LogicalType(const LogicalType& other36) {
- STRING = other36.STRING;
- MAP = other36.MAP;
- LIST = other36.LIST;
- ENUM = other36.ENUM;
- DECIMAL = other36.DECIMAL;
- DATE = other36.DATE;
- TIME = other36.TIME;
- TIMESTAMP = other36.TIMESTAMP;
- INTEGER = other36.INTEGER;
- UNKNOWN = other36.UNKNOWN;
- JSON = other36.JSON;
- BSON = other36.BSON;
- UUID = other36.UUID;
- __isset = other36.__isset;
-}
-LogicalType& LogicalType::operator=(const LogicalType& other37) {
- STRING = other37.STRING;
- MAP = other37.MAP;
- LIST = other37.LIST;
- ENUM = other37.ENUM;
- DECIMAL = other37.DECIMAL;
- DATE = other37.DATE;
- TIME = other37.TIME;
- TIMESTAMP = other37.TIMESTAMP;
- INTEGER = other37.INTEGER;
- UNKNOWN = other37.UNKNOWN;
- JSON = other37.JSON;
- BSON = other37.BSON;
- UUID = other37.UUID;
- __isset = other37.__isset;
- return *this;
-}
-void LogicalType::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "LogicalType(";
- out << "STRING="; (__isset.STRING ? (out << to_string(STRING)) : (out << "<null>"));
- out << ", " << "MAP="; (__isset.MAP ? (out << to_string(MAP)) : (out << "<null>"));
- out << ", " << "LIST="; (__isset.LIST ? (out << to_string(LIST)) : (out << "<null>"));
- out << ", " << "ENUM="; (__isset.ENUM ? (out << to_string(ENUM)) : (out << "<null>"));
- out << ", " << "DECIMAL="; (__isset.DECIMAL ? (out << to_string(DECIMAL)) : (out << "<null>"));
- out << ", " << "DATE="; (__isset.DATE ? (out << to_string(DATE)) : (out << "<null>"));
- out << ", " << "TIME="; (__isset.TIME ? (out << to_string(TIME)) : (out << "<null>"));
- out << ", " << "TIMESTAMP="; (__isset.TIMESTAMP ? (out << to_string(TIMESTAMP)) : (out << "<null>"));
- out << ", " << "INTEGER="; (__isset.INTEGER ? (out << to_string(INTEGER)) : (out << "<null>"));
- out << ", " << "UNKNOWN="; (__isset.UNKNOWN ? (out << to_string(UNKNOWN)) : (out << "<null>"));
- out << ", " << "JSON="; (__isset.JSON ? (out << to_string(JSON)) : (out << "<null>"));
- out << ", " << "BSON="; (__isset.BSON ? (out << to_string(BSON)) : (out << "<null>"));
- out << ", " << "UUID="; (__isset.UUID ? (out << to_string(UUID)) : (out << "<null>"));
- out << ")";
-}
-
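The generated LogicalType above behaves like a Thrift union: write() only emits the members whose __isset flag is set, and printTo() renders every unset member as <null>, so callers are expected to set exactly one alternative. A minimal usage sketch, assuming the generated header is "parquet_types.h" and the parquet::format namespace used by Arrow's vendored copy (both assumptions, not shown in this diff):

// Sketch only; header name and namespace are assumptions.
#include <iostream>
#include "parquet_types.h"

int main() {
  parquet::format::LogicalType t;
  t.__set_STRING(parquet::format::StringType());  // also flips __isset.STRING
  std::cout << t << std::endl;  // STRING=..., every other member prints <null>
}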
-
-SchemaElement::~SchemaElement() noexcept {
-}
-
-
-void SchemaElement::__set_type(const Type::type val) {
- this->type = val;
- __isset.type = true;
-}
-
-void SchemaElement::__set_type_length(const int32_t val) {
- this->type_length = val;
- __isset.type_length = true;
-}
-
-void SchemaElement::__set_repetition_type(const FieldRepetitionType::type val) {
- this->repetition_type = val;
- __isset.repetition_type = true;
-}
-
-void SchemaElement::__set_name(const std::string& val) {
- this->name = val;
-}
-
-void SchemaElement::__set_num_children(const int32_t val) {
- this->num_children = val;
- __isset.num_children = true;
-}
-
-void SchemaElement::__set_converted_type(const ConvertedType::type val) {
- this->converted_type = val;
- __isset.converted_type = true;
-}
-
-void SchemaElement::__set_scale(const int32_t val) {
- this->scale = val;
- __isset.scale = true;
-}
-
-void SchemaElement::__set_precision(const int32_t val) {
- this->precision = val;
- __isset.precision = true;
-}
-
-void SchemaElement::__set_field_id(const int32_t val) {
- this->field_id = val;
- __isset.field_id = true;
-}
-
-void SchemaElement::__set_logicalType(const LogicalType& val) {
- this->logicalType = val;
- __isset.logicalType = true;
-}
-std::ostream& operator<<(std::ostream& out, const SchemaElement& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_name = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast38;
- xfer += iprot->readI32(ecast38);
- this->type = (Type::type)ecast38;
- this->__isset.type = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->type_length);
- this->__isset.type_length = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast39;
- xfer += iprot->readI32(ecast39);
- this->repetition_type = (FieldRepetitionType::type)ecast39;
- this->__isset.repetition_type = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readString(this->name);
- isset_name = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->num_children);
- this->__isset.num_children = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast40;
- xfer += iprot->readI32(ecast40);
- this->converted_type = (ConvertedType::type)ecast40;
- this->__isset.converted_type = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->scale);
- this->__isset.scale = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->precision);
- this->__isset.precision = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 9:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->field_id);
- this->__isset.field_id = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 10:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->logicalType.read(iprot);
- this->__isset.logicalType = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_name)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t SchemaElement::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("SchemaElement");
-
- if (this->__isset.type) {
- xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32((int32_t)this->type);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.type_length) {
- xfer += oprot->writeFieldBegin("type_length", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32(this->type_length);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.repetition_type) {
- xfer += oprot->writeFieldBegin("repetition_type", ::apache::thrift::protocol::T_I32, 3);
- xfer += oprot->writeI32((int32_t)this->repetition_type);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldBegin("name", ::apache::thrift::protocol::T_STRING, 4);
- xfer += oprot->writeString(this->name);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.num_children) {
- xfer += oprot->writeFieldBegin("num_children", ::apache::thrift::protocol::T_I32, 5);
- xfer += oprot->writeI32(this->num_children);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.converted_type) {
- xfer += oprot->writeFieldBegin("converted_type", ::apache::thrift::protocol::T_I32, 6);
- xfer += oprot->writeI32((int32_t)this->converted_type);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.scale) {
- xfer += oprot->writeFieldBegin("scale", ::apache::thrift::protocol::T_I32, 7);
- xfer += oprot->writeI32(this->scale);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.precision) {
- xfer += oprot->writeFieldBegin("precision", ::apache::thrift::protocol::T_I32, 8);
- xfer += oprot->writeI32(this->precision);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.field_id) {
- xfer += oprot->writeFieldBegin("field_id", ::apache::thrift::protocol::T_I32, 9);
- xfer += oprot->writeI32(this->field_id);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.logicalType) {
- xfer += oprot->writeFieldBegin("logicalType", ::apache::thrift::protocol::T_STRUCT, 10);
- xfer += this->logicalType.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(SchemaElement &a, SchemaElement &b) {
- using ::std::swap;
- swap(a.type, b.type);
- swap(a.type_length, b.type_length);
- swap(a.repetition_type, b.repetition_type);
- swap(a.name, b.name);
- swap(a.num_children, b.num_children);
- swap(a.converted_type, b.converted_type);
- swap(a.scale, b.scale);
- swap(a.precision, b.precision);
- swap(a.field_id, b.field_id);
- swap(a.logicalType, b.logicalType);
- swap(a.__isset, b.__isset);
-}
-
-SchemaElement::SchemaElement(const SchemaElement& other41) {
- type = other41.type;
- type_length = other41.type_length;
- repetition_type = other41.repetition_type;
- name = other41.name;
- num_children = other41.num_children;
- converted_type = other41.converted_type;
- scale = other41.scale;
- precision = other41.precision;
- field_id = other41.field_id;
- logicalType = other41.logicalType;
- __isset = other41.__isset;
-}
-SchemaElement& SchemaElement::operator=(const SchemaElement& other42) {
- type = other42.type;
- type_length = other42.type_length;
- repetition_type = other42.repetition_type;
- name = other42.name;
- num_children = other42.num_children;
- converted_type = other42.converted_type;
- scale = other42.scale;
- precision = other42.precision;
- field_id = other42.field_id;
- logicalType = other42.logicalType;
- __isset = other42.__isset;
- return *this;
-}
-void SchemaElement::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "SchemaElement(";
- out << "type="; (__isset.type ? (out << to_string(type)) : (out << "<null>"));
- out << ", " << "type_length="; (__isset.type_length ? (out << to_string(type_length)) : (out << "<null>"));
- out << ", " << "repetition_type="; (__isset.repetition_type ? (out << to_string(repetition_type)) : (out << "<null>"));
- out << ", " << "name=" << to_string(name);
- out << ", " << "num_children="; (__isset.num_children ? (out << to_string(num_children)) : (out << "<null>"));
- out << ", " << "converted_type="; (__isset.converted_type ? (out << to_string(converted_type)) : (out << "<null>"));
- out << ", " << "scale="; (__isset.scale ? (out << to_string(scale)) : (out << "<null>"));
- out << ", " << "precision="; (__isset.precision ? (out << to_string(precision)) : (out << "<null>"));
- out << ", " << "field_id="; (__isset.field_id ? (out << to_string(field_id)) : (out << "<null>"));
- out << ", " << "logicalType="; (__isset.logicalType ? (out << to_string(logicalType)) : (out << "<null>"));
- out << ")";
-}
-
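In the SchemaElement code above, name (field 4) is the only required field: write() emits it unconditionally and read() throws TProtocolException(INVALID_DATA) when it is missing, while every other field is gated by its __isset flag. A round-trip sketch under the same header/namespace assumptions, additionally assuming a Thrift release with std::shared_ptr-based transports (0.11 or later):

// Sketch only; namespace, header name, and Thrift version are assumptions.
#include <memory>
#include <thrift/protocol/TCompactProtocol.h>
#include <thrift/transport/TBufferTransports.h>
#include "parquet_types.h"

int main() {
  using apache::thrift::protocol::TCompactProtocol;
  using apache::thrift::transport::TMemoryBuffer;

  parquet::format::SchemaElement elem;
  elem.__set_name("my_column");                   // required: always written
  elem.__set_type(parquet::format::Type::INT32);  // optional: gated by __isset

  auto buf = std::make_shared<TMemoryBuffer>();
  TCompactProtocol proto(buf);
  elem.write(&proto);    // returns the number of bytes transferred

  parquet::format::SchemaElement decoded;
  decoded.read(&proto);  // would throw TProtocolException if 'name' were absent
  return decoded.name == elem.name ? 0 : 1;
}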
-
-DataPageHeader::~DataPageHeader() noexcept {
-}
-
-
-void DataPageHeader::__set_num_values(const int32_t val) {
- this->num_values = val;
-}
-
-void DataPageHeader::__set_encoding(const Encoding::type val) {
- this->encoding = val;
-}
-
-void DataPageHeader::__set_definition_level_encoding(const Encoding::type val) {
- this->definition_level_encoding = val;
-}
-
-void DataPageHeader::__set_repetition_level_encoding(const Encoding::type val) {
- this->repetition_level_encoding = val;
-}
-
-void DataPageHeader::__set_statistics(const Statistics& val) {
- this->statistics = val;
- __isset.statistics = true;
-}
-std::ostream& operator<<(std::ostream& out, const DataPageHeader& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_num_values = false;
- bool isset_encoding = false;
- bool isset_definition_level_encoding = false;
- bool isset_repetition_level_encoding = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->num_values);
- isset_num_values = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast43;
- xfer += iprot->readI32(ecast43);
- this->encoding = (Encoding::type)ecast43;
- isset_encoding = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast44;
- xfer += iprot->readI32(ecast44);
- this->definition_level_encoding = (Encoding::type)ecast44;
- isset_definition_level_encoding = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast45;
- xfer += iprot->readI32(ecast45);
- this->repetition_level_encoding = (Encoding::type)ecast45;
- isset_repetition_level_encoding = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->statistics.read(iprot);
- this->__isset.statistics = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_num_values)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_encoding)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_definition_level_encoding)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_repetition_level_encoding)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t DataPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("DataPageHeader");
-
- xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->num_values);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32((int32_t)this->encoding);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("definition_level_encoding", ::apache::thrift::protocol::T_I32, 3);
- xfer += oprot->writeI32((int32_t)this->definition_level_encoding);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("repetition_level_encoding", ::apache::thrift::protocol::T_I32, 4);
- xfer += oprot->writeI32((int32_t)this->repetition_level_encoding);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.statistics) {
- xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 5);
- xfer += this->statistics.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(DataPageHeader &a, DataPageHeader &b) {
- using ::std::swap;
- swap(a.num_values, b.num_values);
- swap(a.encoding, b.encoding);
- swap(a.definition_level_encoding, b.definition_level_encoding);
- swap(a.repetition_level_encoding, b.repetition_level_encoding);
- swap(a.statistics, b.statistics);
- swap(a.__isset, b.__isset);
-}
-
-DataPageHeader::DataPageHeader(const DataPageHeader& other46) {
- num_values = other46.num_values;
- encoding = other46.encoding;
- definition_level_encoding = other46.definition_level_encoding;
- repetition_level_encoding = other46.repetition_level_encoding;
- statistics = other46.statistics;
- __isset = other46.__isset;
-}
-DataPageHeader& DataPageHeader::operator=(const DataPageHeader& other47) {
- num_values = other47.num_values;
- encoding = other47.encoding;
- definition_level_encoding = other47.definition_level_encoding;
- repetition_level_encoding = other47.repetition_level_encoding;
- statistics = other47.statistics;
- __isset = other47.__isset;
- return *this;
-}
-void DataPageHeader::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "DataPageHeader(";
- out << "num_values=" << to_string(num_values);
- out << ", " << "encoding=" << to_string(encoding);
- out << ", " << "definition_level_encoding=" << to_string(definition_level_encoding);
- out << ", " << "repetition_level_encoding=" << to_string(repetition_level_encoding);
- out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
- out << ")";
-}
-
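DataPageHeader validates its four required fields with local isset_* booleans during read(), while the __isset bitfield tracks only the optional statistics; because write() emits the required fields unconditionally, the INVALID_DATA throw can only fire on truncated or foreign input. A round-trip sketch under the same assumptions as above:

// Sketch only; namespace, header name, and Thrift version are assumptions.
#include <memory>
#include <thrift/protocol/TCompactProtocol.h>
#include <thrift/transport/TBufferTransports.h>
#include "parquet_types.h"

int main() {
  using namespace apache::thrift;
  namespace pf = parquet::format;

  pf::DataPageHeader header;
  header.__set_num_values(100);
  header.__set_encoding(pf::Encoding::PLAIN);
  header.__set_definition_level_encoding(pf::Encoding::RLE);
  header.__set_repetition_level_encoding(pf::Encoding::RLE);
  // statistics left unset: the optional field is simply skipped on the wire

  auto buf = std::make_shared<transport::TMemoryBuffer>();
  protocol::TCompactProtocol proto(buf);
  header.write(&proto);

  pf::DataPageHeader back;
  back.read(&proto);  // all four required fields present, so no throw
  return back.num_values == 100 ? 0 : 1;
}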
-
-IndexPageHeader::~IndexPageHeader() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const IndexPageHeader& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t IndexPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t IndexPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("IndexPageHeader");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(IndexPageHeader &a, IndexPageHeader &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-IndexPageHeader::IndexPageHeader(const IndexPageHeader& other48) {
- (void) other48;
-}
-IndexPageHeader& IndexPageHeader::operator=(const IndexPageHeader& other49) {
- (void) other49;
- return *this;
-}
-void IndexPageHeader::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "IndexPageHeader(";
- out << ")";
-}
-
-
-DictionaryPageHeader::~DictionaryPageHeader() noexcept {
-}
-
-
-void DictionaryPageHeader::__set_num_values(const int32_t val) {
- this->num_values = val;
-}
-
-void DictionaryPageHeader::__set_encoding(const Encoding::type val) {
- this->encoding = val;
-}
-
-void DictionaryPageHeader::__set_is_sorted(const bool val) {
- this->is_sorted = val;
- __isset.is_sorted = true;
-}
-std::ostream& operator<<(std::ostream& out, const DictionaryPageHeader& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t DictionaryPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_num_values = false;
- bool isset_encoding = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->num_values);
- isset_num_values = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast50;
- xfer += iprot->readI32(ecast50);
- this->encoding = (Encoding::type)ecast50;
- isset_encoding = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->is_sorted);
- this->__isset.is_sorted = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_num_values)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_encoding)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t DictionaryPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("DictionaryPageHeader");
-
- xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->num_values);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32((int32_t)this->encoding);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.is_sorted) {
- xfer += oprot->writeFieldBegin("is_sorted", ::apache::thrift::protocol::T_BOOL, 3);
- xfer += oprot->writeBool(this->is_sorted);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(DictionaryPageHeader &a, DictionaryPageHeader &b) {
- using ::std::swap;
- swap(a.num_values, b.num_values);
- swap(a.encoding, b.encoding);
- swap(a.is_sorted, b.is_sorted);
- swap(a.__isset, b.__isset);
-}
-
-DictionaryPageHeader::DictionaryPageHeader(const DictionaryPageHeader& other51) {
- num_values = other51.num_values;
- encoding = other51.encoding;
- is_sorted = other51.is_sorted;
- __isset = other51.__isset;
-}
-DictionaryPageHeader& DictionaryPageHeader::operator=(const DictionaryPageHeader& other52) {
- num_values = other52.num_values;
- encoding = other52.encoding;
- is_sorted = other52.is_sorted;
- __isset = other52.__isset;
- return *this;
-}
-void DictionaryPageHeader::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "DictionaryPageHeader(";
- out << "num_values=" << to_string(num_values);
- out << ", " << "encoding=" << to_string(encoding);
- out << ", " << "is_sorted="; (__isset.is_sorted ? (out << to_string(is_sorted)) : (out << "<null>"));
- out << ")";
-}
-
-
-DataPageHeaderV2::~DataPageHeaderV2() noexcept {
-}
-
-
-void DataPageHeaderV2::__set_num_values(const int32_t val) {
- this->num_values = val;
-}
-
-void DataPageHeaderV2::__set_num_nulls(const int32_t val) {
- this->num_nulls = val;
-}
-
-void DataPageHeaderV2::__set_num_rows(const int32_t val) {
- this->num_rows = val;
-}
-
-void DataPageHeaderV2::__set_encoding(const Encoding::type val) {
- this->encoding = val;
-}
-
-void DataPageHeaderV2::__set_definition_levels_byte_length(const int32_t val) {
- this->definition_levels_byte_length = val;
-}
-
-void DataPageHeaderV2::__set_repetition_levels_byte_length(const int32_t val) {
- this->repetition_levels_byte_length = val;
-}
-
-void DataPageHeaderV2::__set_is_compressed(const bool val) {
- this->is_compressed = val;
- __isset.is_compressed = true;
-}
-
-void DataPageHeaderV2::__set_statistics(const Statistics& val) {
- this->statistics = val;
- __isset.statistics = true;
-}
-std::ostream& operator<<(std::ostream& out, const DataPageHeaderV2& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t DataPageHeaderV2::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_num_values = false;
- bool isset_num_nulls = false;
- bool isset_num_rows = false;
- bool isset_encoding = false;
- bool isset_definition_levels_byte_length = false;
- bool isset_repetition_levels_byte_length = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->num_values);
- isset_num_values = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->num_nulls);
- isset_num_nulls = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->num_rows);
- isset_num_rows = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast53;
- xfer += iprot->readI32(ecast53);
- this->encoding = (Encoding::type)ecast53;
- isset_encoding = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->definition_levels_byte_length);
- isset_definition_levels_byte_length = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->repetition_levels_byte_length);
- isset_repetition_levels_byte_length = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->is_compressed);
- this->__isset.is_compressed = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->statistics.read(iprot);
- this->__isset.statistics = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_num_values)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_num_nulls)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_num_rows)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_encoding)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_definition_levels_byte_length)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_repetition_levels_byte_length)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t DataPageHeaderV2::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("DataPageHeaderV2");
-
- xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->num_values);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("num_nulls", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32(this->num_nulls);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I32, 3);
- xfer += oprot->writeI32(this->num_rows);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 4);
- xfer += oprot->writeI32((int32_t)this->encoding);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("definition_levels_byte_length", ::apache::thrift::protocol::T_I32, 5);
- xfer += oprot->writeI32(this->definition_levels_byte_length);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("repetition_levels_byte_length", ::apache::thrift::protocol::T_I32, 6);
- xfer += oprot->writeI32(this->repetition_levels_byte_length);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.is_compressed) {
- xfer += oprot->writeFieldBegin("is_compressed", ::apache::thrift::protocol::T_BOOL, 7);
- xfer += oprot->writeBool(this->is_compressed);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.statistics) {
- xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 8);
- xfer += this->statistics.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b) {
- using ::std::swap;
- swap(a.num_values, b.num_values);
- swap(a.num_nulls, b.num_nulls);
- swap(a.num_rows, b.num_rows);
- swap(a.encoding, b.encoding);
- swap(a.definition_levels_byte_length, b.definition_levels_byte_length);
- swap(a.repetition_levels_byte_length, b.repetition_levels_byte_length);
- swap(a.is_compressed, b.is_compressed);
- swap(a.statistics, b.statistics);
- swap(a.__isset, b.__isset);
-}
-
-DataPageHeaderV2::DataPageHeaderV2(const DataPageHeaderV2& other54) {
- num_values = other54.num_values;
- num_nulls = other54.num_nulls;
- num_rows = other54.num_rows;
- encoding = other54.encoding;
- definition_levels_byte_length = other54.definition_levels_byte_length;
- repetition_levels_byte_length = other54.repetition_levels_byte_length;
- is_compressed = other54.is_compressed;
- statistics = other54.statistics;
- __isset = other54.__isset;
-}
-DataPageHeaderV2& DataPageHeaderV2::operator=(const DataPageHeaderV2& other55) {
- num_values = other55.num_values;
- num_nulls = other55.num_nulls;
- num_rows = other55.num_rows;
- encoding = other55.encoding;
- definition_levels_byte_length = other55.definition_levels_byte_length;
- repetition_levels_byte_length = other55.repetition_levels_byte_length;
- is_compressed = other55.is_compressed;
- statistics = other55.statistics;
- __isset = other55.__isset;
- return *this;
-}
-void DataPageHeaderV2::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "DataPageHeaderV2(";
- out << "num_values=" << to_string(num_values);
- out << ", " << "num_nulls=" << to_string(num_nulls);
- out << ", " << "num_rows=" << to_string(num_rows);
- out << ", " << "encoding=" << to_string(encoding);
- out << ", " << "definition_levels_byte_length=" << to_string(definition_levels_byte_length);
- out << ", " << "repetition_levels_byte_length=" << to_string(repetition_levels_byte_length);
- out << ", " << "is_compressed="; (__isset.is_compressed ? (out << to_string(is_compressed)) : (out << "<null>"));
- out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
- out << ")";
-}
-
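Relative to DataPageHeader, the V2 header above adds null and row counts and records the byte lengths of the repetition and definition level runs, which the Parquet format stores uncompressed at the front of the page even when the values are compressed (is_compressed is optional and defaults to true in parquet.thrift). A sketch of the offset arithmetic this enables, assuming the spec's V2 page layout of repetition levels, then definition levels, then values:

// Sketch; assumes the Parquet-spec V2 page layout described above.
#include <cstdint>
#include "parquet_types.h"  // header name is an assumption

// Offset of the encoded value bytes inside a V2 data page buffer.
int32_t ValueBytesOffset(const parquet::format::DataPageHeaderV2& h) {
  return h.repetition_levels_byte_length + h.definition_levels_byte_length;
}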
-
-SplitBlockAlgorithm::~SplitBlockAlgorithm() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const SplitBlockAlgorithm& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t SplitBlockAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t SplitBlockAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("SplitBlockAlgorithm");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(SplitBlockAlgorithm &a, SplitBlockAlgorithm &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-SplitBlockAlgorithm::SplitBlockAlgorithm(const SplitBlockAlgorithm& other56) {
- (void) other56;
-}
-SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(const SplitBlockAlgorithm& other57) {
- (void) other57;
- return *this;
-}
-void SplitBlockAlgorithm::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "SplitBlockAlgorithm(";
- out << ")";
-}
-
-
-BloomFilterAlgorithm::~BloomFilterAlgorithm() noexcept {
-}
-
-
-void BloomFilterAlgorithm::__set_BLOCK(const SplitBlockAlgorithm& val) {
- this->BLOCK = val;
- __isset.BLOCK = true;
-}
-std::ostream& operator<<(std::ostream& out, const BloomFilterAlgorithm& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t BloomFilterAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->BLOCK.read(iprot);
- this->__isset.BLOCK = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t BloomFilterAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("BloomFilterAlgorithm");
-
- if (this->__isset.BLOCK) {
- xfer += oprot->writeFieldBegin("BLOCK", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->BLOCK.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(BloomFilterAlgorithm &a, BloomFilterAlgorithm &b) {
- using ::std::swap;
- swap(a.BLOCK, b.BLOCK);
- swap(a.__isset, b.__isset);
-}
-
-BloomFilterAlgorithm::BloomFilterAlgorithm(const BloomFilterAlgorithm& other58) {
- BLOCK = other58.BLOCK;
- __isset = other58.__isset;
-}
-BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(const BloomFilterAlgorithm& other59) {
- BLOCK = other59.BLOCK;
- __isset = other59.__isset;
- return *this;
-}
-void BloomFilterAlgorithm::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "BloomFilterAlgorithm(";
- out << "BLOCK="; (__isset.BLOCK ? (out << to_string(BLOCK)) : (out << "<null>"));
- out << ")";
-}
-
-
-XxHash::~XxHash() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const XxHash& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t XxHash::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t XxHash::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("XxHash");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(XxHash &a, XxHash &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-XxHash::XxHash(const XxHash& other60) {
- (void) other60;
-}
-XxHash& XxHash::operator=(const XxHash& other61) {
- (void) other61;
- return *this;
-}
-void XxHash::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "XxHash(";
- out << ")";
-}
-
-
-BloomFilterHash::~BloomFilterHash() noexcept {
-}
-
-
-void BloomFilterHash::__set_XXHASH(const XxHash& val) {
- this->XXHASH = val;
- __isset.XXHASH = true;
-}
-std::ostream& operator<<(std::ostream& out, const BloomFilterHash& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t BloomFilterHash::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->XXHASH.read(iprot);
- this->__isset.XXHASH = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t BloomFilterHash::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("BloomFilterHash");
-
- if (this->__isset.XXHASH) {
- xfer += oprot->writeFieldBegin("XXHASH", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->XXHASH.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(BloomFilterHash &a, BloomFilterHash &b) {
- using ::std::swap;
- swap(a.XXHASH, b.XXHASH);
- swap(a.__isset, b.__isset);
-}
-
-BloomFilterHash::BloomFilterHash(const BloomFilterHash& other62) {
- XXHASH = other62.XXHASH;
- __isset = other62.__isset;
-}
-BloomFilterHash& BloomFilterHash::operator=(const BloomFilterHash& other63) {
- XXHASH = other63.XXHASH;
- __isset = other63.__isset;
- return *this;
-}
-void BloomFilterHash::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "BloomFilterHash(";
- out << "XXHASH="; (__isset.XXHASH ? (out << to_string(XXHASH)) : (out << "<null>"));
- out << ")";
-}
-
-
-Uncompressed::~Uncompressed() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const Uncompressed& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t Uncompressed::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t Uncompressed::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("Uncompressed");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(Uncompressed &a, Uncompressed &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-Uncompressed::Uncompressed(const Uncompressed& other64) {
- (void) other64;
-}
-Uncompressed& Uncompressed::operator=(const Uncompressed& other65) {
- (void) other65;
- return *this;
-}
-void Uncompressed::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "Uncompressed(";
- out << ")";
-}
-
-
-BloomFilterCompression::~BloomFilterCompression() noexcept {
-}
-
-
-void BloomFilterCompression::__set_UNCOMPRESSED(const Uncompressed& val) {
- this->UNCOMPRESSED = val;
- __isset.UNCOMPRESSED = true;
-}
-std::ostream& operator<<(std::ostream& out, const BloomFilterCompression& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t BloomFilterCompression::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->UNCOMPRESSED.read(iprot);
- this->__isset.UNCOMPRESSED = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t BloomFilterCompression::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("BloomFilterCompression");
-
- if (this->__isset.UNCOMPRESSED) {
- xfer += oprot->writeFieldBegin("UNCOMPRESSED", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->UNCOMPRESSED.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(BloomFilterCompression &a, BloomFilterCompression &b) {
- using ::std::swap;
- swap(a.UNCOMPRESSED, b.UNCOMPRESSED);
- swap(a.__isset, b.__isset);
-}
-
-BloomFilterCompression::BloomFilterCompression(const BloomFilterCompression& other66) {
- UNCOMPRESSED = other66.UNCOMPRESSED;
- __isset = other66.__isset;
-}
-BloomFilterCompression& BloomFilterCompression::operator=(const BloomFilterCompression& other67) {
- UNCOMPRESSED = other67.UNCOMPRESSED;
- __isset = other67.__isset;
- return *this;
-}
-void BloomFilterCompression::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "BloomFilterCompression(";
- out << "UNCOMPRESSED="; (__isset.UNCOMPRESSED ? (out << to_string(UNCOMPRESSED)) : (out << "<null>"));
- out << ")";
-}
-
-
-BloomFilterHeader::~BloomFilterHeader() noexcept {
-}
-
-
-void BloomFilterHeader::__set_numBytes(const int32_t val) {
- this->numBytes = val;
-}
-
-void BloomFilterHeader::__set_algorithm(const BloomFilterAlgorithm& val) {
- this->algorithm = val;
-}
-
-void BloomFilterHeader::__set_hash(const BloomFilterHash& val) {
- this->hash = val;
-}
-
-void BloomFilterHeader::__set_compression(const BloomFilterCompression& val) {
- this->compression = val;
-}
-std::ostream& operator<<(std::ostream& out, const BloomFilterHeader& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t BloomFilterHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_numBytes = false;
- bool isset_algorithm = false;
- bool isset_hash = false;
- bool isset_compression = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->numBytes);
- isset_numBytes = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->algorithm.read(iprot);
- isset_algorithm = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->hash.read(iprot);
- isset_hash = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->compression.read(iprot);
- isset_compression = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_numBytes)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_algorithm)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_hash)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_compression)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t BloomFilterHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("BloomFilterHeader");
-
- xfer += oprot->writeFieldBegin("numBytes", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->numBytes);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("algorithm", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->algorithm.write(oprot);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("hash", ::apache::thrift::protocol::T_STRUCT, 3);
- xfer += this->hash.write(oprot);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("compression", ::apache::thrift::protocol::T_STRUCT, 4);
- xfer += this->compression.write(oprot);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(BloomFilterHeader &a, BloomFilterHeader &b) {
- using ::std::swap;
- swap(a.numBytes, b.numBytes);
- swap(a.algorithm, b.algorithm);
- swap(a.hash, b.hash);
- swap(a.compression, b.compression);
-}
-
-BloomFilterHeader::BloomFilterHeader(const BloomFilterHeader& other68) {
- numBytes = other68.numBytes;
- algorithm = other68.algorithm;
- hash = other68.hash;
- compression = other68.compression;
-}
-BloomFilterHeader& BloomFilterHeader::operator=(const BloomFilterHeader& other69) {
- numBytes = other69.numBytes;
- algorithm = other69.algorithm;
- hash = other69.hash;
- compression = other69.compression;
- return *this;
-}
-void BloomFilterHeader::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "BloomFilterHeader(";
- out << "numBytes=" << to_string(numBytes);
- out << ", " << "algorithm=" << to_string(algorithm);
- out << ", " << "hash=" << to_string(hash);
- out << ", " << "compression=" << to_string(compression);
- out << ")";
-}
-
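SplitBlockAlgorithm, XxHash, and Uncompressed above are empty marker structs, which is why their generated swap and copy bodies are no-ops; BloomFilterAlgorithm, BloomFilterHash, and BloomFilterCompression wrap them as one-member unions, and BloomFilterHeader requires all four of its fields, so it carries no __isset bitfield at all. A population sketch using only the __set_ helpers shown above (namespace and header remain assumptions):

// Sketch only; uses the __set_ helpers from the generated code above.
#include <cstdint>
#include "parquet_types.h"

parquet::format::BloomFilterHeader MakeHeader(int32_t num_bytes) {
  parquet::format::BloomFilterHeader h;
  h.__set_numBytes(num_bytes);
  parquet::format::BloomFilterAlgorithm alg;
  alg.__set_BLOCK(parquet::format::SplitBlockAlgorithm());
  h.__set_algorithm(alg);
  parquet::format::BloomFilterHash hash;
  hash.__set_XXHASH(parquet::format::XxHash());
  h.__set_hash(hash);
  parquet::format::BloomFilterCompression comp;
  comp.__set_UNCOMPRESSED(parquet::format::Uncompressed());
  h.__set_compression(comp);
  return h;
}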
-
-PageHeader::~PageHeader() noexcept {
-}
-
-
-void PageHeader::__set_type(const PageType::type val) {
- this->type = val;
-}
-
-void PageHeader::__set_uncompressed_page_size(const int32_t val) {
- this->uncompressed_page_size = val;
-}
-
-void PageHeader::__set_compressed_page_size(const int32_t val) {
- this->compressed_page_size = val;
-}
-
-void PageHeader::__set_crc(const int32_t val) {
- this->crc = val;
- __isset.crc = true;
-}
-
-void PageHeader::__set_data_page_header(const DataPageHeader& val) {
- this->data_page_header = val;
- __isset.data_page_header = true;
-}
-
-void PageHeader::__set_index_page_header(const IndexPageHeader& val) {
- this->index_page_header = val;
- __isset.index_page_header = true;
-}
-
-void PageHeader::__set_dictionary_page_header(const DictionaryPageHeader& val) {
- this->dictionary_page_header = val;
- __isset.dictionary_page_header = true;
-}
-
-void PageHeader::__set_data_page_header_v2(const DataPageHeaderV2& val) {
- this->data_page_header_v2 = val;
- __isset.data_page_header_v2 = true;
-}
-std::ostream& operator<<(std::ostream& out, const PageHeader& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t PageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_type = false;
- bool isset_uncompressed_page_size = false;
- bool isset_compressed_page_size = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast70;
- xfer += iprot->readI32(ecast70);
- this->type = (PageType::type)ecast70;
- isset_type = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->uncompressed_page_size);
- isset_uncompressed_page_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->compressed_page_size);
- isset_compressed_page_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->crc);
- this->__isset.crc = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->data_page_header.read(iprot);
- this->__isset.data_page_header = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->index_page_header.read(iprot);
- this->__isset.index_page_header = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->dictionary_page_header.read(iprot);
- this->__isset.dictionary_page_header = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->data_page_header_v2.read(iprot);
- this->__isset.data_page_header_v2 = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_type)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_uncompressed_page_size)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_compressed_page_size)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t PageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("PageHeader");
-
- xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32((int32_t)this->type);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("uncompressed_page_size", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32(this->uncompressed_page_size);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("compressed_page_size", ::apache::thrift::protocol::T_I32, 3);
- xfer += oprot->writeI32(this->compressed_page_size);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.crc) {
- xfer += oprot->writeFieldBegin("crc", ::apache::thrift::protocol::T_I32, 4);
- xfer += oprot->writeI32(this->crc);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.data_page_header) {
- xfer += oprot->writeFieldBegin("data_page_header", ::apache::thrift::protocol::T_STRUCT, 5);
- xfer += this->data_page_header.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.index_page_header) {
- xfer += oprot->writeFieldBegin("index_page_header", ::apache::thrift::protocol::T_STRUCT, 6);
- xfer += this->index_page_header.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.dictionary_page_header) {
- xfer += oprot->writeFieldBegin("dictionary_page_header", ::apache::thrift::protocol::T_STRUCT, 7);
- xfer += this->dictionary_page_header.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.data_page_header_v2) {
- xfer += oprot->writeFieldBegin("data_page_header_v2", ::apache::thrift::protocol::T_STRUCT, 8);
- xfer += this->data_page_header_v2.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(PageHeader &a, PageHeader &b) {
- using ::std::swap;
- swap(a.type, b.type);
- swap(a.uncompressed_page_size, b.uncompressed_page_size);
- swap(a.compressed_page_size, b.compressed_page_size);
- swap(a.crc, b.crc);
- swap(a.data_page_header, b.data_page_header);
- swap(a.index_page_header, b.index_page_header);
- swap(a.dictionary_page_header, b.dictionary_page_header);
- swap(a.data_page_header_v2, b.data_page_header_v2);
- swap(a.__isset, b.__isset);
-}
-
-PageHeader::PageHeader(const PageHeader& other71) {
- type = other71.type;
- uncompressed_page_size = other71.uncompressed_page_size;
- compressed_page_size = other71.compressed_page_size;
- crc = other71.crc;
- data_page_header = other71.data_page_header;
- index_page_header = other71.index_page_header;
- dictionary_page_header = other71.dictionary_page_header;
- data_page_header_v2 = other71.data_page_header_v2;
- __isset = other71.__isset;
-}
-PageHeader& PageHeader::operator=(const PageHeader& other72) {
- type = other72.type;
- uncompressed_page_size = other72.uncompressed_page_size;
- compressed_page_size = other72.compressed_page_size;
- crc = other72.crc;
- data_page_header = other72.data_page_header;
- index_page_header = other72.index_page_header;
- dictionary_page_header = other72.dictionary_page_header;
- data_page_header_v2 = other72.data_page_header_v2;
- __isset = other72.__isset;
- return *this;
-}
-void PageHeader::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "PageHeader(";
- out << "type=" << to_string(type);
- out << ", " << "uncompressed_page_size=" << to_string(uncompressed_page_size);
- out << ", " << "compressed_page_size=" << to_string(compressed_page_size);
- out << ", " << "crc="; (__isset.crc ? (out << to_string(crc)) : (out << "<null>"));
- out << ", " << "data_page_header="; (__isset.data_page_header ? (out << to_string(data_page_header)) : (out << "<null>"));
- out << ", " << "index_page_header="; (__isset.index_page_header ? (out << to_string(index_page_header)) : (out << "<null>"));
- out << ", " << "dictionary_page_header="; (__isset.dictionary_page_header ? (out << to_string(dictionary_page_header)) : (out << "<null>"));
- out << ", " << "data_page_header_v2="; (__isset.data_page_header_v2 ? (out << to_string(data_page_header_v2)) : (out << "<null>"));
- out << ")";
-}
-
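PageHeader above is the envelope decoded ahead of every page: type and the two sizes are required, crc is optional, and exactly one of the four variant headers is expected to be set to match type, though the generated code does not enforce that pairing. A dispatch sketch under the same namespace assumption:

// Sketch; assumes callers keep 'type' consistent with which variant is set,
// which the generated code itself does not check.
#include <stdexcept>
#include "parquet_types.h"

int32_t NumValues(const parquet::format::PageHeader& h) {
  switch (h.type) {
    case parquet::format::PageType::DATA_PAGE:
      return h.data_page_header.num_values;
    case parquet::format::PageType::DATA_PAGE_V2:
      return h.data_page_header_v2.num_values;
    case parquet::format::PageType::DICTIONARY_PAGE:
      return h.dictionary_page_header.num_values;
    default:
      throw std::runtime_error("page type carries no value count");
  }
}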
-
-KeyValue::~KeyValue() noexcept {
-}
-
-
-void KeyValue::__set_key(const std::string& val) {
- this->key = val;
-}
-
-void KeyValue::__set_value(const std::string& val) {
- this->value = val;
- __isset.value = true;
-}
-std::ostream& operator<<(std::ostream& out, const KeyValue& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t KeyValue::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_key = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readString(this->key);
- isset_key = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readString(this->value);
- this->__isset.value = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_key)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t KeyValue::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("KeyValue");
-
- xfer += oprot->writeFieldBegin("key", ::apache::thrift::protocol::T_STRING, 1);
- xfer += oprot->writeString(this->key);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.value) {
- xfer += oprot->writeFieldBegin("value", ::apache::thrift::protocol::T_STRING, 2);
- xfer += oprot->writeString(this->value);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(KeyValue &a, KeyValue &b) {
- using ::std::swap;
- swap(a.key, b.key);
- swap(a.value, b.value);
- swap(a.__isset, b.__isset);
-}
-
-KeyValue::KeyValue(const KeyValue& other73) {
- key = other73.key;
- value = other73.value;
- __isset = other73.__isset;
-}
-KeyValue& KeyValue::operator=(const KeyValue& other74) {
- key = other74.key;
- value = other74.value;
- __isset = other74.__isset;
- return *this;
-}
-void KeyValue::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "KeyValue(";
- out << "key=" << to_string(key);
- out << ", " << "value="; (__isset.value ? (out << to_string(value)) : (out << "<null>"));
- out << ")";
-}
-
-
-SortingColumn::~SortingColumn() noexcept {
-}
-
-
-void SortingColumn::__set_column_idx(const int32_t val) {
- this->column_idx = val;
-}
-
-void SortingColumn::__set_descending(const bool val) {
- this->descending = val;
-}
-
-void SortingColumn::__set_nulls_first(const bool val) {
- this->nulls_first = val;
-}
-std::ostream& operator<<(std::ostream& out, const SortingColumn& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t SortingColumn::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_column_idx = false;
- bool isset_descending = false;
- bool isset_nulls_first = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->column_idx);
- isset_column_idx = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->descending);
- isset_descending = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->nulls_first);
- isset_nulls_first = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_column_idx)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_descending)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_nulls_first)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t SortingColumn::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("SortingColumn");
-
- xfer += oprot->writeFieldBegin("column_idx", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->column_idx);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("descending", ::apache::thrift::protocol::T_BOOL, 2);
- xfer += oprot->writeBool(this->descending);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("nulls_first", ::apache::thrift::protocol::T_BOOL, 3);
- xfer += oprot->writeBool(this->nulls_first);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(SortingColumn &a, SortingColumn &b) {
- using ::std::swap;
- swap(a.column_idx, b.column_idx);
- swap(a.descending, b.descending);
- swap(a.nulls_first, b.nulls_first);
-}
-
-SortingColumn::SortingColumn(const SortingColumn& other75) {
- column_idx = other75.column_idx;
- descending = other75.descending;
- nulls_first = other75.nulls_first;
-}
-SortingColumn& SortingColumn::operator=(const SortingColumn& other76) {
- column_idx = other76.column_idx;
- descending = other76.descending;
- nulls_first = other76.nulls_first;
- return *this;
-}
-void SortingColumn::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "SortingColumn(";
- out << "column_idx=" << to_string(column_idx);
- out << ", " << "descending=" << to_string(descending);
- out << ", " << "nulls_first=" << to_string(nulls_first);
- out << ")";
-}
-
-
-PageEncodingStats::~PageEncodingStats() noexcept {
-}
-
-
-void PageEncodingStats::__set_page_type(const PageType::type val) {
- this->page_type = val;
-}
-
-void PageEncodingStats::__set_encoding(const Encoding::type val) {
- this->encoding = val;
-}
-
-void PageEncodingStats::__set_count(const int32_t val) {
- this->count = val;
-}
-std::ostream& operator<<(std::ostream& out, const PageEncodingStats& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t PageEncodingStats::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_page_type = false;
- bool isset_encoding = false;
- bool isset_count = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast77;
- xfer += iprot->readI32(ecast77);
- this->page_type = (PageType::type)ecast77;
- isset_page_type = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast78;
- xfer += iprot->readI32(ecast78);
- this->encoding = (Encoding::type)ecast78;
- isset_encoding = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->count);
- isset_count = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_page_type)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_encoding)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_count)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t PageEncodingStats::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("PageEncodingStats");
-
- xfer += oprot->writeFieldBegin("page_type", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32((int32_t)this->page_type);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32((int32_t)this->encoding);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("count", ::apache::thrift::protocol::T_I32, 3);
- xfer += oprot->writeI32(this->count);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(PageEncodingStats &a, PageEncodingStats &b) {
- using ::std::swap;
- swap(a.page_type, b.page_type);
- swap(a.encoding, b.encoding);
- swap(a.count, b.count);
-}
-
-PageEncodingStats::PageEncodingStats(const PageEncodingStats& other79) {
- page_type = other79.page_type;
- encoding = other79.encoding;
- count = other79.count;
-}
-PageEncodingStats& PageEncodingStats::operator=(const PageEncodingStats& other80) {
- page_type = other80.page_type;
- encoding = other80.encoding;
- count = other80.count;
- return *this;
-}
-void PageEncodingStats::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "PageEncodingStats(";
- out << "page_type=" << to_string(page_type);
- out << ", " << "encoding=" << to_string(encoding);
- out << ", " << "count=" << to_string(count);
- out << ")";
-}
-
-
-ColumnMetaData::~ColumnMetaData() noexcept {
-}
-
-
-void ColumnMetaData::__set_type(const Type::type val) {
- this->type = val;
-}
-
-void ColumnMetaData::__set_encodings(const std::vector<Encoding::type> & val) {
- this->encodings = val;
-}
-
-void ColumnMetaData::__set_path_in_schema(const std::vector<std::string> & val) {
- this->path_in_schema = val;
-}
-
-void ColumnMetaData::__set_codec(const CompressionCodec::type val) {
- this->codec = val;
-}
-
-void ColumnMetaData::__set_num_values(const int64_t val) {
- this->num_values = val;
-}
-
-void ColumnMetaData::__set_total_uncompressed_size(const int64_t val) {
- this->total_uncompressed_size = val;
-}
-
-void ColumnMetaData::__set_total_compressed_size(const int64_t val) {
- this->total_compressed_size = val;
-}
-
-void ColumnMetaData::__set_key_value_metadata(const std::vector<KeyValue> & val) {
- this->key_value_metadata = val;
-__isset.key_value_metadata = true;
-}
-
-void ColumnMetaData::__set_data_page_offset(const int64_t val) {
- this->data_page_offset = val;
-}
-
-void ColumnMetaData::__set_index_page_offset(const int64_t val) {
- this->index_page_offset = val;
-__isset.index_page_offset = true;
-}
-
-void ColumnMetaData::__set_dictionary_page_offset(const int64_t val) {
- this->dictionary_page_offset = val;
-__isset.dictionary_page_offset = true;
-}
-
-void ColumnMetaData::__set_statistics(const Statistics& val) {
- this->statistics = val;
-__isset.statistics = true;
-}
-
-void ColumnMetaData::__set_encoding_stats(const std::vector<PageEncodingStats> & val) {
- this->encoding_stats = val;
-__isset.encoding_stats = true;
-}
-
-void ColumnMetaData::__set_bloom_filter_offset(const int64_t val) {
- this->bloom_filter_offset = val;
-__isset.bloom_filter_offset = true;
-}
-std::ostream& operator<<(std::ostream& out, const ColumnMetaData& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_type = false;
- bool isset_encodings = false;
- bool isset_path_in_schema = false;
- bool isset_codec = false;
- bool isset_num_values = false;
- bool isset_total_uncompressed_size = false;
- bool isset_total_compressed_size = false;
- bool isset_data_page_offset = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast81;
- xfer += iprot->readI32(ecast81);
- this->type = (Type::type)ecast81;
- isset_type = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->encodings.clear();
- uint32_t _size82;
- ::apache::thrift::protocol::TType _etype85;
- xfer += iprot->readListBegin(_etype85, _size82);
- this->encodings.resize(_size82);
- uint32_t _i86;
- for (_i86 = 0; _i86 < _size82; ++_i86)
- {
- int32_t ecast87;
- xfer += iprot->readI32(ecast87);
- this->encodings[_i86] = (Encoding::type)ecast87;
- }
- xfer += iprot->readListEnd();
- }
- isset_encodings = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->path_in_schema.clear();
- uint32_t _size88;
- ::apache::thrift::protocol::TType _etype91;
- xfer += iprot->readListBegin(_etype91, _size88);
- this->path_in_schema.resize(_size88);
- uint32_t _i92;
- for (_i92 = 0; _i92 < _size88; ++_i92)
- {
- xfer += iprot->readString(this->path_in_schema[_i92]);
- }
- xfer += iprot->readListEnd();
- }
- isset_path_in_schema = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast93;
- xfer += iprot->readI32(ecast93);
- this->codec = (CompressionCodec::type)ecast93;
- isset_codec = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->num_values);
- isset_num_values = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->total_uncompressed_size);
- isset_total_uncompressed_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->total_compressed_size);
- isset_total_compressed_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->key_value_metadata.clear();
- uint32_t _size94;
- ::apache::thrift::protocol::TType _etype97;
- xfer += iprot->readListBegin(_etype97, _size94);
- this->key_value_metadata.resize(_size94);
- uint32_t _i98;
- for (_i98 = 0; _i98 < _size94; ++_i98)
- {
- xfer += this->key_value_metadata[_i98].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- this->__isset.key_value_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 9:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->data_page_offset);
- isset_data_page_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 10:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->index_page_offset);
- this->__isset.index_page_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 11:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->dictionary_page_offset);
- this->__isset.dictionary_page_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 12:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->statistics.read(iprot);
- this->__isset.statistics = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 13:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->encoding_stats.clear();
- uint32_t _size99;
- ::apache::thrift::protocol::TType _etype102;
- xfer += iprot->readListBegin(_etype102, _size99);
- this->encoding_stats.resize(_size99);
- uint32_t _i103;
- for (_i103 = 0; _i103 < _size99; ++_i103)
- {
- xfer += this->encoding_stats[_i103].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- this->__isset.encoding_stats = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 14:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->bloom_filter_offset);
- this->__isset.bloom_filter_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_type)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_encodings)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_path_in_schema)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_codec)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_num_values)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_total_uncompressed_size)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_total_compressed_size)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_data_page_offset)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("ColumnMetaData");
-
- xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32((int32_t)this->type);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("encodings", ::apache::thrift::protocol::T_LIST, 2);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I32, static_cast<uint32_t>(this->encodings.size()));
- std::vector<Encoding::type> ::const_iterator _iter104;
- for (_iter104 = this->encodings.begin(); _iter104 != this->encodings.end(); ++_iter104)
- {
- xfer += oprot->writeI32((int32_t)(*_iter104));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 3);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->path_in_schema.size()));
- std::vector<std::string> ::const_iterator _iter105;
- for (_iter105 = this->path_in_schema.begin(); _iter105 != this->path_in_schema.end(); ++_iter105)
- {
- xfer += oprot->writeString((*_iter105));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("codec", ::apache::thrift::protocol::T_I32, 4);
- xfer += oprot->writeI32((int32_t)this->codec);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I64, 5);
- xfer += oprot->writeI64(this->num_values);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("total_uncompressed_size", ::apache::thrift::protocol::T_I64, 6);
- xfer += oprot->writeI64(this->total_uncompressed_size);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("total_compressed_size", ::apache::thrift::protocol::T_I64, 7);
- xfer += oprot->writeI64(this->total_compressed_size);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.key_value_metadata) {
- xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 8);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->key_value_metadata.size()));
- std::vector<KeyValue> ::const_iterator _iter106;
- for (_iter106 = this->key_value_metadata.begin(); _iter106 != this->key_value_metadata.end(); ++_iter106)
- {
- xfer += (*_iter106).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldBegin("data_page_offset", ::apache::thrift::protocol::T_I64, 9);
- xfer += oprot->writeI64(this->data_page_offset);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.index_page_offset) {
- xfer += oprot->writeFieldBegin("index_page_offset", ::apache::thrift::protocol::T_I64, 10);
- xfer += oprot->writeI64(this->index_page_offset);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.dictionary_page_offset) {
- xfer += oprot->writeFieldBegin("dictionary_page_offset", ::apache::thrift::protocol::T_I64, 11);
- xfer += oprot->writeI64(this->dictionary_page_offset);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.statistics) {
- xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 12);
- xfer += this->statistics.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.encoding_stats) {
- xfer += oprot->writeFieldBegin("encoding_stats", ::apache::thrift::protocol::T_LIST, 13);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->encoding_stats.size()));
- std::vector<PageEncodingStats> ::const_iterator _iter107;
- for (_iter107 = this->encoding_stats.begin(); _iter107 != this->encoding_stats.end(); ++_iter107)
- {
- xfer += (*_iter107).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.bloom_filter_offset) {
- xfer += oprot->writeFieldBegin("bloom_filter_offset", ::apache::thrift::protocol::T_I64, 14);
- xfer += oprot->writeI64(this->bloom_filter_offset);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(ColumnMetaData &a, ColumnMetaData &b) {
- using ::std::swap;
- swap(a.type, b.type);
- swap(a.encodings, b.encodings);
- swap(a.path_in_schema, b.path_in_schema);
- swap(a.codec, b.codec);
- swap(a.num_values, b.num_values);
- swap(a.total_uncompressed_size, b.total_uncompressed_size);
- swap(a.total_compressed_size, b.total_compressed_size);
- swap(a.key_value_metadata, b.key_value_metadata);
- swap(a.data_page_offset, b.data_page_offset);
- swap(a.index_page_offset, b.index_page_offset);
- swap(a.dictionary_page_offset, b.dictionary_page_offset);
- swap(a.statistics, b.statistics);
- swap(a.encoding_stats, b.encoding_stats);
- swap(a.bloom_filter_offset, b.bloom_filter_offset);
- swap(a.__isset, b.__isset);
-}
-
-ColumnMetaData::ColumnMetaData(const ColumnMetaData& other108) {
- type = other108.type;
- encodings = other108.encodings;
- path_in_schema = other108.path_in_schema;
- codec = other108.codec;
- num_values = other108.num_values;
- total_uncompressed_size = other108.total_uncompressed_size;
- total_compressed_size = other108.total_compressed_size;
- key_value_metadata = other108.key_value_metadata;
- data_page_offset = other108.data_page_offset;
- index_page_offset = other108.index_page_offset;
- dictionary_page_offset = other108.dictionary_page_offset;
- statistics = other108.statistics;
- encoding_stats = other108.encoding_stats;
- bloom_filter_offset = other108.bloom_filter_offset;
- __isset = other108.__isset;
-}
-ColumnMetaData& ColumnMetaData::operator=(const ColumnMetaData& other109) {
- type = other109.type;
- encodings = other109.encodings;
- path_in_schema = other109.path_in_schema;
- codec = other109.codec;
- num_values = other109.num_values;
- total_uncompressed_size = other109.total_uncompressed_size;
- total_compressed_size = other109.total_compressed_size;
- key_value_metadata = other109.key_value_metadata;
- data_page_offset = other109.data_page_offset;
- index_page_offset = other109.index_page_offset;
- dictionary_page_offset = other109.dictionary_page_offset;
- statistics = other109.statistics;
- encoding_stats = other109.encoding_stats;
- bloom_filter_offset = other109.bloom_filter_offset;
- __isset = other109.__isset;
- return *this;
-}
-void ColumnMetaData::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "ColumnMetaData(";
- out << "type=" << to_string(type);
- out << ", " << "encodings=" << to_string(encodings);
- out << ", " << "path_in_schema=" << to_string(path_in_schema);
- out << ", " << "codec=" << to_string(codec);
- out << ", " << "num_values=" << to_string(num_values);
- out << ", " << "total_uncompressed_size=" << to_string(total_uncompressed_size);
- out << ", " << "total_compressed_size=" << to_string(total_compressed_size);
- out << ", " << "key_value_metadata="; (__isset.key_value_metadata ? (out << to_string(key_value_metadata)) : (out << "<null>"));
- out << ", " << "data_page_offset=" << to_string(data_page_offset);
- out << ", " << "index_page_offset="; (__isset.index_page_offset ? (out << to_string(index_page_offset)) : (out << "<null>"));
- out << ", " << "dictionary_page_offset="; (__isset.dictionary_page_offset ? (out << to_string(dictionary_page_offset)) : (out << "<null>"));
- out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
- out << ", " << "encoding_stats="; (__isset.encoding_stats ? (out << to_string(encoding_stats)) : (out << "<null>"));
- out << ", " << "bloom_filter_offset="; (__isset.bloom_filter_offset ? (out << to_string(bloom_filter_offset)) : (out << "<null>"));
- out << ")";
-}
-
-
-EncryptionWithFooterKey::~EncryptionWithFooterKey() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const EncryptionWithFooterKey& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t EncryptionWithFooterKey::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t EncryptionWithFooterKey::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("EncryptionWithFooterKey");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(EncryptionWithFooterKey &a, EncryptionWithFooterKey &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-EncryptionWithFooterKey::EncryptionWithFooterKey(const EncryptionWithFooterKey& other110) {
- (void) other110;
-}
-EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(const EncryptionWithFooterKey& other111) {
- (void) other111;
- return *this;
-}
-void EncryptionWithFooterKey::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "EncryptionWithFooterKey(";
- out << ")";
-}
-
-
-EncryptionWithColumnKey::~EncryptionWithColumnKey() noexcept {
-}
-
-
-void EncryptionWithColumnKey::__set_path_in_schema(const std::vector<std::string> & val) {
- this->path_in_schema = val;
-}
-
-void EncryptionWithColumnKey::__set_key_metadata(const std::string& val) {
- this->key_metadata = val;
-__isset.key_metadata = true;
-}
-std::ostream& operator<<(std::ostream& out, const EncryptionWithColumnKey& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t EncryptionWithColumnKey::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_path_in_schema = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->path_in_schema.clear();
- uint32_t _size112;
- ::apache::thrift::protocol::TType _etype115;
- xfer += iprot->readListBegin(_etype115, _size112);
- this->path_in_schema.resize(_size112);
- uint32_t _i116;
- for (_i116 = 0; _i116 < _size112; ++_i116)
- {
- xfer += iprot->readString(this->path_in_schema[_i116]);
- }
- xfer += iprot->readListEnd();
- }
- isset_path_in_schema = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->key_metadata);
- this->__isset.key_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_path_in_schema)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t EncryptionWithColumnKey::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("EncryptionWithColumnKey");
-
- xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 1);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->path_in_schema.size()));
- std::vector<std::string> ::const_iterator _iter117;
- for (_iter117 = this->path_in_schema.begin(); _iter117 != this->path_in_schema.end(); ++_iter117)
- {
- xfer += oprot->writeString((*_iter117));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.key_metadata) {
- xfer += oprot->writeFieldBegin("key_metadata", ::apache::thrift::protocol::T_STRING, 2);
- xfer += oprot->writeBinary(this->key_metadata);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(EncryptionWithColumnKey &a, EncryptionWithColumnKey &b) {
- using ::std::swap;
- swap(a.path_in_schema, b.path_in_schema);
- swap(a.key_metadata, b.key_metadata);
- swap(a.__isset, b.__isset);
-}
-
-EncryptionWithColumnKey::EncryptionWithColumnKey(const EncryptionWithColumnKey& other118) {
- path_in_schema = other118.path_in_schema;
- key_metadata = other118.key_metadata;
- __isset = other118.__isset;
-}
-EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(const EncryptionWithColumnKey& other119) {
- path_in_schema = other119.path_in_schema;
- key_metadata = other119.key_metadata;
- __isset = other119.__isset;
- return *this;
-}
-void EncryptionWithColumnKey::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "EncryptionWithColumnKey(";
- out << "path_in_schema=" << to_string(path_in_schema);
- out << ", " << "key_metadata="; (__isset.key_metadata ? (out << to_string(key_metadata)) : (out << "<null>"));
- out << ")";
-}
-
-
-ColumnCryptoMetaData::~ColumnCryptoMetaData() noexcept {
-}
-
-
-void ColumnCryptoMetaData::__set_ENCRYPTION_WITH_FOOTER_KEY(const EncryptionWithFooterKey& val) {
- this->ENCRYPTION_WITH_FOOTER_KEY = val;
-__isset.ENCRYPTION_WITH_FOOTER_KEY = true;
-}
-
-void ColumnCryptoMetaData::__set_ENCRYPTION_WITH_COLUMN_KEY(const EncryptionWithColumnKey& val) {
- this->ENCRYPTION_WITH_COLUMN_KEY = val;
-__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
-}
-std::ostream& operator<<(std::ostream& out, const ColumnCryptoMetaData& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t ColumnCryptoMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->ENCRYPTION_WITH_FOOTER_KEY.read(iprot);
- this->__isset.ENCRYPTION_WITH_FOOTER_KEY = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->ENCRYPTION_WITH_COLUMN_KEY.read(iprot);
- this->__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t ColumnCryptoMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("ColumnCryptoMetaData");
-
- if (this->__isset.ENCRYPTION_WITH_FOOTER_KEY) {
- xfer += oprot->writeFieldBegin("ENCRYPTION_WITH_FOOTER_KEY", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->ENCRYPTION_WITH_FOOTER_KEY.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.ENCRYPTION_WITH_COLUMN_KEY) {
- xfer += oprot->writeFieldBegin("ENCRYPTION_WITH_COLUMN_KEY", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->ENCRYPTION_WITH_COLUMN_KEY.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(ColumnCryptoMetaData &a, ColumnCryptoMetaData &b) {
- using ::std::swap;
- swap(a.ENCRYPTION_WITH_FOOTER_KEY, b.ENCRYPTION_WITH_FOOTER_KEY);
- swap(a.ENCRYPTION_WITH_COLUMN_KEY, b.ENCRYPTION_WITH_COLUMN_KEY);
- swap(a.__isset, b.__isset);
-}
-
-ColumnCryptoMetaData::ColumnCryptoMetaData(const ColumnCryptoMetaData& other120) {
- ENCRYPTION_WITH_FOOTER_KEY = other120.ENCRYPTION_WITH_FOOTER_KEY;
- ENCRYPTION_WITH_COLUMN_KEY = other120.ENCRYPTION_WITH_COLUMN_KEY;
- __isset = other120.__isset;
-}
-ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(const ColumnCryptoMetaData& other121) {
- ENCRYPTION_WITH_FOOTER_KEY = other121.ENCRYPTION_WITH_FOOTER_KEY;
- ENCRYPTION_WITH_COLUMN_KEY = other121.ENCRYPTION_WITH_COLUMN_KEY;
- __isset = other121.__isset;
- return *this;
-}
-void ColumnCryptoMetaData::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "ColumnCryptoMetaData(";
- out << "ENCRYPTION_WITH_FOOTER_KEY="; (__isset.ENCRYPTION_WITH_FOOTER_KEY ? (out << to_string(ENCRYPTION_WITH_FOOTER_KEY)) : (out << "<null>"));
- out << ", " << "ENCRYPTION_WITH_COLUMN_KEY="; (__isset.ENCRYPTION_WITH_COLUMN_KEY ? (out << to_string(ENCRYPTION_WITH_COLUMN_KEY)) : (out << "<null>"));
- out << ")";
-}
-
-
-ColumnChunk::~ColumnChunk() noexcept {
-}
-
-
-void ColumnChunk::__set_file_path(const std::string& val) {
- this->file_path = val;
-__isset.file_path = true;
-}
-
-void ColumnChunk::__set_file_offset(const int64_t val) {
- this->file_offset = val;
-}
-
-void ColumnChunk::__set_meta_data(const ColumnMetaData& val) {
- this->meta_data = val;
-__isset.meta_data = true;
-}
-
-void ColumnChunk::__set_offset_index_offset(const int64_t val) {
- this->offset_index_offset = val;
-__isset.offset_index_offset = true;
-}
-
-void ColumnChunk::__set_offset_index_length(const int32_t val) {
- this->offset_index_length = val;
-__isset.offset_index_length = true;
-}
-
-void ColumnChunk::__set_column_index_offset(const int64_t val) {
- this->column_index_offset = val;
-__isset.column_index_offset = true;
-}
-
-void ColumnChunk::__set_column_index_length(const int32_t val) {
- this->column_index_length = val;
-__isset.column_index_length = true;
-}
-
-void ColumnChunk::__set_crypto_metadata(const ColumnCryptoMetaData& val) {
- this->crypto_metadata = val;
-__isset.crypto_metadata = true;
-}
-
-void ColumnChunk::__set_encrypted_column_metadata(const std::string& val) {
- this->encrypted_column_metadata = val;
-__isset.encrypted_column_metadata = true;
-}
-std::ostream& operator<<(std::ostream& out, const ColumnChunk& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t ColumnChunk::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_file_offset = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readString(this->file_path);
- this->__isset.file_path = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->file_offset);
- isset_file_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->meta_data.read(iprot);
- this->__isset.meta_data = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->offset_index_offset);
- this->__isset.offset_index_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->offset_index_length);
- this->__isset.offset_index_length = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->column_index_offset);
- this->__isset.column_index_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->column_index_length);
- this->__isset.column_index_length = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->crypto_metadata.read(iprot);
- this->__isset.crypto_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 9:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->encrypted_column_metadata);
- this->__isset.encrypted_column_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_file_offset)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t ColumnChunk::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("ColumnChunk");
-
- if (this->__isset.file_path) {
- xfer += oprot->writeFieldBegin("file_path", ::apache::thrift::protocol::T_STRING, 1);
- xfer += oprot->writeString(this->file_path);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldBegin("file_offset", ::apache::thrift::protocol::T_I64, 2);
- xfer += oprot->writeI64(this->file_offset);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.meta_data) {
- xfer += oprot->writeFieldBegin("meta_data", ::apache::thrift::protocol::T_STRUCT, 3);
- xfer += this->meta_data.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.offset_index_offset) {
- xfer += oprot->writeFieldBegin("offset_index_offset", ::apache::thrift::protocol::T_I64, 4);
- xfer += oprot->writeI64(this->offset_index_offset);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.offset_index_length) {
- xfer += oprot->writeFieldBegin("offset_index_length", ::apache::thrift::protocol::T_I32, 5);
- xfer += oprot->writeI32(this->offset_index_length);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.column_index_offset) {
- xfer += oprot->writeFieldBegin("column_index_offset", ::apache::thrift::protocol::T_I64, 6);
- xfer += oprot->writeI64(this->column_index_offset);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.column_index_length) {
- xfer += oprot->writeFieldBegin("column_index_length", ::apache::thrift::protocol::T_I32, 7);
- xfer += oprot->writeI32(this->column_index_length);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.crypto_metadata) {
- xfer += oprot->writeFieldBegin("crypto_metadata", ::apache::thrift::protocol::T_STRUCT, 8);
- xfer += this->crypto_metadata.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.encrypted_column_metadata) {
- xfer += oprot->writeFieldBegin("encrypted_column_metadata", ::apache::thrift::protocol::T_STRING, 9);
- xfer += oprot->writeBinary(this->encrypted_column_metadata);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(ColumnChunk &a, ColumnChunk &b) {
- using ::std::swap;
- swap(a.file_path, b.file_path);
- swap(a.file_offset, b.file_offset);
- swap(a.meta_data, b.meta_data);
- swap(a.offset_index_offset, b.offset_index_offset);
- swap(a.offset_index_length, b.offset_index_length);
- swap(a.column_index_offset, b.column_index_offset);
- swap(a.column_index_length, b.column_index_length);
- swap(a.crypto_metadata, b.crypto_metadata);
- swap(a.encrypted_column_metadata, b.encrypted_column_metadata);
- swap(a.__isset, b.__isset);
-}
-
-ColumnChunk::ColumnChunk(const ColumnChunk& other122) {
- file_path = other122.file_path;
- file_offset = other122.file_offset;
- meta_data = other122.meta_data;
- offset_index_offset = other122.offset_index_offset;
- offset_index_length = other122.offset_index_length;
- column_index_offset = other122.column_index_offset;
- column_index_length = other122.column_index_length;
- crypto_metadata = other122.crypto_metadata;
- encrypted_column_metadata = other122.encrypted_column_metadata;
- __isset = other122.__isset;
-}
-ColumnChunk& ColumnChunk::operator=(const ColumnChunk& other123) {
- file_path = other123.file_path;
- file_offset = other123.file_offset;
- meta_data = other123.meta_data;
- offset_index_offset = other123.offset_index_offset;
- offset_index_length = other123.offset_index_length;
- column_index_offset = other123.column_index_offset;
- column_index_length = other123.column_index_length;
- crypto_metadata = other123.crypto_metadata;
- encrypted_column_metadata = other123.encrypted_column_metadata;
- __isset = other123.__isset;
- return *this;
-}
-void ColumnChunk::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "ColumnChunk(";
- out << "file_path="; (__isset.file_path ? (out << to_string(file_path)) : (out << "<null>"));
- out << ", " << "file_offset=" << to_string(file_offset);
- out << ", " << "meta_data="; (__isset.meta_data ? (out << to_string(meta_data)) : (out << "<null>"));
- out << ", " << "offset_index_offset="; (__isset.offset_index_offset ? (out << to_string(offset_index_offset)) : (out << "<null>"));
- out << ", " << "offset_index_length="; (__isset.offset_index_length ? (out << to_string(offset_index_length)) : (out << "<null>"));
- out << ", " << "column_index_offset="; (__isset.column_index_offset ? (out << to_string(column_index_offset)) : (out << "<null>"));
- out << ", " << "column_index_length="; (__isset.column_index_length ? (out << to_string(column_index_length)) : (out << "<null>"));
- out << ", " << "crypto_metadata="; (__isset.crypto_metadata ? (out << to_string(crypto_metadata)) : (out << "<null>"));
- out << ", " << "encrypted_column_metadata="; (__isset.encrypted_column_metadata ? (out << to_string(encrypted_column_metadata)) : (out << "<null>"));
- out << ")";
-}
-
-
-RowGroup::~RowGroup() noexcept {
-}
-
-
-void RowGroup::__set_columns(const std::vector<ColumnChunk> & val) {
- this->columns = val;
-}
-
-void RowGroup::__set_total_byte_size(const int64_t val) {
- this->total_byte_size = val;
-}
-
-void RowGroup::__set_num_rows(const int64_t val) {
- this->num_rows = val;
-}
-
-void RowGroup::__set_sorting_columns(const std::vector<SortingColumn> & val) {
- this->sorting_columns = val;
-__isset.sorting_columns = true;
-}
-
-void RowGroup::__set_file_offset(const int64_t val) {
- this->file_offset = val;
-__isset.file_offset = true;
-}
-
-void RowGroup::__set_total_compressed_size(const int64_t val) {
- this->total_compressed_size = val;
-__isset.total_compressed_size = true;
-}
-
-void RowGroup::__set_ordinal(const int16_t val) {
- this->ordinal = val;
-__isset.ordinal = true;
-}
-std::ostream& operator<<(std::ostream& out, const RowGroup& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t RowGroup::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_columns = false;
- bool isset_total_byte_size = false;
- bool isset_num_rows = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->columns.clear();
- uint32_t _size124;
- ::apache::thrift::protocol::TType _etype127;
- xfer += iprot->readListBegin(_etype127, _size124);
- this->columns.resize(_size124);
- uint32_t _i128;
- for (_i128 = 0; _i128 < _size124; ++_i128)
- {
- xfer += this->columns[_i128].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- isset_columns = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->total_byte_size);
- isset_total_byte_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->num_rows);
- isset_num_rows = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->sorting_columns.clear();
- uint32_t _size129;
- ::apache::thrift::protocol::TType _etype132;
- xfer += iprot->readListBegin(_etype132, _size129);
- this->sorting_columns.resize(_size129);
- uint32_t _i133;
- for (_i133 = 0; _i133 < _size129; ++_i133)
- {
- xfer += this->sorting_columns[_i133].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- this->__isset.sorting_columns = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->file_offset);
- this->__isset.file_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->total_compressed_size);
- this->__isset.total_compressed_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_I16) {
- xfer += iprot->readI16(this->ordinal);
- this->__isset.ordinal = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_columns)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_total_byte_size)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_num_rows)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t RowGroup::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("RowGroup");
-
- xfer += oprot->writeFieldBegin("columns", ::apache::thrift::protocol::T_LIST, 1);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->columns.size()));
- std::vector<ColumnChunk> ::const_iterator _iter134;
- for (_iter134 = this->columns.begin(); _iter134 != this->columns.end(); ++_iter134)
- {
- xfer += (*_iter134).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("total_byte_size", ::apache::thrift::protocol::T_I64, 2);
- xfer += oprot->writeI64(this->total_byte_size);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I64, 3);
- xfer += oprot->writeI64(this->num_rows);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.sorting_columns) {
- xfer += oprot->writeFieldBegin("sorting_columns", ::apache::thrift::protocol::T_LIST, 4);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->sorting_columns.size()));
- std::vector<SortingColumn> ::const_iterator _iter135;
- for (_iter135 = this->sorting_columns.begin(); _iter135 != this->sorting_columns.end(); ++_iter135)
- {
- xfer += (*_iter135).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.file_offset) {
- xfer += oprot->writeFieldBegin("file_offset", ::apache::thrift::protocol::T_I64, 5);
- xfer += oprot->writeI64(this->file_offset);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.total_compressed_size) {
- xfer += oprot->writeFieldBegin("total_compressed_size", ::apache::thrift::protocol::T_I64, 6);
- xfer += oprot->writeI64(this->total_compressed_size);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.ordinal) {
- xfer += oprot->writeFieldBegin("ordinal", ::apache::thrift::protocol::T_I16, 7);
- xfer += oprot->writeI16(this->ordinal);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(RowGroup &a, RowGroup &b) {
- using ::std::swap;
- swap(a.columns, b.columns);
- swap(a.total_byte_size, b.total_byte_size);
- swap(a.num_rows, b.num_rows);
- swap(a.sorting_columns, b.sorting_columns);
- swap(a.file_offset, b.file_offset);
- swap(a.total_compressed_size, b.total_compressed_size);
- swap(a.ordinal, b.ordinal);
- swap(a.__isset, b.__isset);
-}
-
-RowGroup::RowGroup(const RowGroup& other136) {
- columns = other136.columns;
- total_byte_size = other136.total_byte_size;
- num_rows = other136.num_rows;
- sorting_columns = other136.sorting_columns;
- file_offset = other136.file_offset;
- total_compressed_size = other136.total_compressed_size;
- ordinal = other136.ordinal;
- __isset = other136.__isset;
-}
-RowGroup& RowGroup::operator=(const RowGroup& other137) {
- columns = other137.columns;
- total_byte_size = other137.total_byte_size;
- num_rows = other137.num_rows;
- sorting_columns = other137.sorting_columns;
- file_offset = other137.file_offset;
- total_compressed_size = other137.total_compressed_size;
- ordinal = other137.ordinal;
- __isset = other137.__isset;
- return *this;
-}
-void RowGroup::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "RowGroup(";
- out << "columns=" << to_string(columns);
- out << ", " << "total_byte_size=" << to_string(total_byte_size);
- out << ", " << "num_rows=" << to_string(num_rows);
- out << ", " << "sorting_columns="; (__isset.sorting_columns ? (out << to_string(sorting_columns)) : (out << "<null>"));
- out << ", " << "file_offset="; (__isset.file_offset ? (out << to_string(file_offset)) : (out << "<null>"));
- out << ", " << "total_compressed_size="; (__isset.total_compressed_size ? (out << to_string(total_compressed_size)) : (out << "<null>"));
- out << ", " << "ordinal="; (__isset.ordinal ? (out << to_string(ordinal)) : (out << "<null>"));
- out << ")";
-}
-
-
-TypeDefinedOrder::~TypeDefinedOrder() noexcept {
-}
-
-std::ostream& operator<<(std::ostream& out, const TypeDefinedOrder& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t TypeDefinedOrder::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- xfer += iprot->skip(ftype);
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t TypeDefinedOrder::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("TypeDefinedOrder");
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(TypeDefinedOrder &a, TypeDefinedOrder &b) {
- using ::std::swap;
- (void) a;
- (void) b;
-}
-
-TypeDefinedOrder::TypeDefinedOrder(const TypeDefinedOrder& other138) {
- (void) other138;
-}
-TypeDefinedOrder& TypeDefinedOrder::operator=(const TypeDefinedOrder& other139) {
- (void) other139;
- return *this;
-}
-void TypeDefinedOrder::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "TypeDefinedOrder(";
- out << ")";
-}
-
-
-ColumnOrder::~ColumnOrder() noexcept {
-}
-
-
-void ColumnOrder::__set_TYPE_ORDER(const TypeDefinedOrder& val) {
- this->TYPE_ORDER = val;
-__isset.TYPE_ORDER = true;
-}
-std::ostream& operator<<(std::ostream& out, const ColumnOrder& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t ColumnOrder::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->TYPE_ORDER.read(iprot);
- this->__isset.TYPE_ORDER = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t ColumnOrder::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("ColumnOrder");
-
- if (this->__isset.TYPE_ORDER) {
- xfer += oprot->writeFieldBegin("TYPE_ORDER", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->TYPE_ORDER.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(ColumnOrder &a, ColumnOrder &b) {
- using ::std::swap;
- swap(a.TYPE_ORDER, b.TYPE_ORDER);
- swap(a.__isset, b.__isset);
-}
-
-ColumnOrder::ColumnOrder(const ColumnOrder& other140) {
- TYPE_ORDER = other140.TYPE_ORDER;
- __isset = other140.__isset;
-}
-ColumnOrder& ColumnOrder::operator=(const ColumnOrder& other141) {
- TYPE_ORDER = other141.TYPE_ORDER;
- __isset = other141.__isset;
- return *this;
-}
-void ColumnOrder::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "ColumnOrder(";
- out << "TYPE_ORDER="; (__isset.TYPE_ORDER ? (out << to_string(TYPE_ORDER)) : (out << "<null>"));
- out << ")";
-}
-
-
-PageLocation::~PageLocation() noexcept {
-}
-
-
-void PageLocation::__set_offset(const int64_t val) {
- this->offset = val;
-}
-
-void PageLocation::__set_compressed_page_size(const int32_t val) {
- this->compressed_page_size = val;
-}
-
-void PageLocation::__set_first_row_index(const int64_t val) {
- this->first_row_index = val;
-}
-std::ostream& operator<<(std::ostream& out, const PageLocation& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t PageLocation::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_offset = false;
- bool isset_compressed_page_size = false;
- bool isset_first_row_index = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->offset);
- isset_offset = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->compressed_page_size);
- isset_compressed_page_size = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->first_row_index);
- isset_first_row_index = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_offset)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_compressed_page_size)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_first_row_index)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t PageLocation::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("PageLocation");
-
- xfer += oprot->writeFieldBegin("offset", ::apache::thrift::protocol::T_I64, 1);
- xfer += oprot->writeI64(this->offset);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("compressed_page_size", ::apache::thrift::protocol::T_I32, 2);
- xfer += oprot->writeI32(this->compressed_page_size);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("first_row_index", ::apache::thrift::protocol::T_I64, 3);
- xfer += oprot->writeI64(this->first_row_index);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(PageLocation &a, PageLocation &b) {
- using ::std::swap;
- swap(a.offset, b.offset);
- swap(a.compressed_page_size, b.compressed_page_size);
- swap(a.first_row_index, b.first_row_index);
-}
-
-PageLocation::PageLocation(const PageLocation& other142) {
- offset = other142.offset;
- compressed_page_size = other142.compressed_page_size;
- first_row_index = other142.first_row_index;
-}
-PageLocation& PageLocation::operator=(const PageLocation& other143) {
- offset = other143.offset;
- compressed_page_size = other143.compressed_page_size;
- first_row_index = other143.first_row_index;
- return *this;
-}
-void PageLocation::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "PageLocation(";
- out << "offset=" << to_string(offset);
- out << ", " << "compressed_page_size=" << to_string(compressed_page_size);
- out << ", " << "first_row_index=" << to_string(first_row_index);
- out << ")";
-}
-
-
-OffsetIndex::~OffsetIndex() noexcept {
-}
-
-
-void OffsetIndex::__set_page_locations(const std::vector<PageLocation> & val) {
- this->page_locations = val;
-}
-std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t OffsetIndex::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_page_locations = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->page_locations.clear();
- uint32_t _size144;
- ::apache::thrift::protocol::TType _etype147;
- xfer += iprot->readListBegin(_etype147, _size144);
- this->page_locations.resize(_size144);
- uint32_t _i148;
- for (_i148 = 0; _i148 < _size144; ++_i148)
- {
- xfer += this->page_locations[_i148].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- isset_page_locations = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_page_locations)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t OffsetIndex::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("OffsetIndex");
-
- xfer += oprot->writeFieldBegin("page_locations", ::apache::thrift::protocol::T_LIST, 1);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->page_locations.size()));
- std::vector<PageLocation> ::const_iterator _iter149;
- for (_iter149 = this->page_locations.begin(); _iter149 != this->page_locations.end(); ++_iter149)
- {
- xfer += (*_iter149).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(OffsetIndex &a, OffsetIndex &b) {
- using ::std::swap;
- swap(a.page_locations, b.page_locations);
-}
-
-OffsetIndex::OffsetIndex(const OffsetIndex& other150) {
- page_locations = other150.page_locations;
-}
-OffsetIndex& OffsetIndex::operator=(const OffsetIndex& other151) {
- page_locations = other151.page_locations;
- return *this;
-}
-void OffsetIndex::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "OffsetIndex(";
- out << "page_locations=" << to_string(page_locations);
- out << ")";
-}
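// Editorial sketch: the read() loop above sizes page_locations with resize()
// and deserializes each element in place; building the same list for writing
// only needs the generated setter. Same assumptions as the PageLocation
// sketch:
parquet::format::OffsetIndex index;
parquet::format::PageLocation loc;
loc.__set_offset(4);
loc.__set_compressed_page_size(512);
loc.__set_first_row_index(0);
index.__set_page_locations({loc});      // required field 1; read() rejects its absence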
-
-
-ColumnIndex::~ColumnIndex() noexcept {
-}
-
-
-void ColumnIndex::__set_null_pages(const std::vector<bool> & val) {
- this->null_pages = val;
-}
-
-void ColumnIndex::__set_min_values(const std::vector<std::string> & val) {
- this->min_values = val;
-}
-
-void ColumnIndex::__set_max_values(const std::vector<std::string> & val) {
- this->max_values = val;
-}
-
-void ColumnIndex::__set_boundary_order(const BoundaryOrder::type val) {
- this->boundary_order = val;
-}
-
-void ColumnIndex::__set_null_counts(const std::vector<int64_t> & val) {
- this->null_counts = val;
- __isset.null_counts = true;
-}
-std::ostream& operator<<(std::ostream& out, const ColumnIndex& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_null_pages = false;
- bool isset_min_values = false;
- bool isset_max_values = false;
- bool isset_boundary_order = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->null_pages.clear();
- uint32_t _size152;
- ::apache::thrift::protocol::TType _etype155;
- xfer += iprot->readListBegin(_etype155, _size152);
- this->null_pages.resize(_size152);
- uint32_t _i156;
- for (_i156 = 0; _i156 < _size152; ++_i156)
- {
- bool result;
- xfer += iprot->readBool(result);
- this->null_pages[_i156] = result;
- }
- xfer += iprot->readListEnd();
- }
- isset_null_pages = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->min_values.clear();
- uint32_t _size157;
- ::apache::thrift::protocol::TType _etype160;
- xfer += iprot->readListBegin(_etype160, _size157);
- this->min_values.resize(_size157);
- uint32_t _i161;
- for (_i161 = 0; _i161 < _size157; ++_i161)
- {
- xfer += iprot->readBinary(this->min_values[_i161]);
- }
- xfer += iprot->readListEnd();
- }
- isset_min_values = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->max_values.clear();
- uint32_t _size162;
- ::apache::thrift::protocol::TType _etype165;
- xfer += iprot->readListBegin(_etype165, _size162);
- this->max_values.resize(_size162);
- uint32_t _i166;
- for (_i166 = 0; _i166 < _size162; ++_i166)
- {
- xfer += iprot->readBinary(this->max_values[_i166]);
- }
- xfer += iprot->readListEnd();
- }
- isset_max_values = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- int32_t ecast167;
- xfer += iprot->readI32(ecast167);
- this->boundary_order = (BoundaryOrder::type)ecast167;
- isset_boundary_order = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->null_counts.clear();
- uint32_t _size168;
- ::apache::thrift::protocol::TType _etype171;
- xfer += iprot->readListBegin(_etype171, _size168);
- this->null_counts.resize(_size168);
- uint32_t _i172;
- for (_i172 = 0; _i172 < _size168; ++_i172)
- {
- xfer += iprot->readI64(this->null_counts[_i172]);
- }
- xfer += iprot->readListEnd();
- }
- this->__isset.null_counts = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_null_pages)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_min_values)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_max_values)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_boundary_order)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("ColumnIndex");
-
- xfer += oprot->writeFieldBegin("null_pages", ::apache::thrift::protocol::T_LIST, 1);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_BOOL, static_cast<uint32_t>(this->null_pages.size()));
- std::vector<bool> ::const_iterator _iter173;
- for (_iter173 = this->null_pages.begin(); _iter173 != this->null_pages.end(); ++_iter173)
- {
- xfer += oprot->writeBool((*_iter173));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("min_values", ::apache::thrift::protocol::T_LIST, 2);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->min_values.size()));
- std::vector<std::string> ::const_iterator _iter174;
- for (_iter174 = this->min_values.begin(); _iter174 != this->min_values.end(); ++_iter174)
- {
- xfer += oprot->writeBinary((*_iter174));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("max_values", ::apache::thrift::protocol::T_LIST, 3);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->max_values.size()));
- std::vector<std::string> ::const_iterator _iter175;
- for (_iter175 = this->max_values.begin(); _iter175 != this->max_values.end(); ++_iter175)
- {
- xfer += oprot->writeBinary((*_iter175));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("boundary_order", ::apache::thrift::protocol::T_I32, 4);
- xfer += oprot->writeI32((int32_t)this->boundary_order);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.null_counts) {
- xfer += oprot->writeFieldBegin("null_counts", ::apache::thrift::protocol::T_LIST, 5);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast<uint32_t>(this->null_counts.size()));
- std::vector<int64_t> ::const_iterator _iter176;
- for (_iter176 = this->null_counts.begin(); _iter176 != this->null_counts.end(); ++_iter176)
- {
- xfer += oprot->writeI64((*_iter176));
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(ColumnIndex &a, ColumnIndex &b) {
- using ::std::swap;
- swap(a.null_pages, b.null_pages);
- swap(a.min_values, b.min_values);
- swap(a.max_values, b.max_values);
- swap(a.boundary_order, b.boundary_order);
- swap(a.null_counts, b.null_counts);
- swap(a.__isset, b.__isset);
-}
-
-ColumnIndex::ColumnIndex(const ColumnIndex& other177) {
- null_pages = other177.null_pages;
- min_values = other177.min_values;
- max_values = other177.max_values;
- boundary_order = other177.boundary_order;
- null_counts = other177.null_counts;
- __isset = other177.__isset;
-}
-ColumnIndex& ColumnIndex::operator=(const ColumnIndex& other178) {
- null_pages = other178.null_pages;
- min_values = other178.min_values;
- max_values = other178.max_values;
- boundary_order = other178.boundary_order;
- null_counts = other178.null_counts;
- __isset = other178.__isset;
- return *this;
-}
-void ColumnIndex::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "ColumnIndex(";
- out << "null_pages=" << to_string(null_pages);
- out << ", " << "min_values=" << to_string(min_values);
- out << ", " << "max_values=" << to_string(max_values);
- out << ", " << "boundary_order=" << to_string(boundary_order);
- out << ", " << "null_counts="; (__isset.null_counts ? (out << to_string(null_counts)) : (out << "<null>"));
- out << ")";
-}
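// Editorial sketch: of ColumnIndex's five fields only null_counts is optional,
// so write() guards field id 5 with __isset while the other four are emitted
// unconditionally. Populating the required minimum by hand:
parquet::format::ColumnIndex ci;
ci.__set_null_pages({false});
ci.__set_min_values({std::string("a")});
ci.__set_max_values({std::string("z")});
ci.__set_boundary_order(parquet::format::BoundaryOrder::ASCENDING);
// ci.__isset.null_counts is still false here, so write() omits field id 5 and
// printTo() renders it as <null>.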
-
-
-AesGcmV1::~AesGcmV1() noexcept {
-}
-
-
-void AesGcmV1::__set_aad_prefix(const std::string& val) {
- this->aad_prefix = val;
- __isset.aad_prefix = true;
-}
-
-void AesGcmV1::__set_aad_file_unique(const std::string& val) {
- this->aad_file_unique = val;
- __isset.aad_file_unique = true;
-}
-
-void AesGcmV1::__set_supply_aad_prefix(const bool val) {
- this->supply_aad_prefix = val;
- __isset.supply_aad_prefix = true;
-}
-std::ostream& operator<<(std::ostream& out, const AesGcmV1& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t AesGcmV1::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->aad_prefix);
- this->__isset.aad_prefix = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->aad_file_unique);
- this->__isset.aad_file_unique = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->supply_aad_prefix);
- this->__isset.supply_aad_prefix = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t AesGcmV1::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("AesGcmV1");
-
- if (this->__isset.aad_prefix) {
- xfer += oprot->writeFieldBegin("aad_prefix", ::apache::thrift::protocol::T_STRING, 1);
- xfer += oprot->writeBinary(this->aad_prefix);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.aad_file_unique) {
- xfer += oprot->writeFieldBegin("aad_file_unique", ::apache::thrift::protocol::T_STRING, 2);
- xfer += oprot->writeBinary(this->aad_file_unique);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.supply_aad_prefix) {
- xfer += oprot->writeFieldBegin("supply_aad_prefix", ::apache::thrift::protocol::T_BOOL, 3);
- xfer += oprot->writeBool(this->supply_aad_prefix);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(AesGcmV1 &a, AesGcmV1 &b) {
- using ::std::swap;
- swap(a.aad_prefix, b.aad_prefix);
- swap(a.aad_file_unique, b.aad_file_unique);
- swap(a.supply_aad_prefix, b.supply_aad_prefix);
- swap(a.__isset, b.__isset);
-}
-
-AesGcmV1::AesGcmV1(const AesGcmV1& other179) {
- aad_prefix = other179.aad_prefix;
- aad_file_unique = other179.aad_file_unique;
- supply_aad_prefix = other179.supply_aad_prefix;
- __isset = other179.__isset;
-}
-AesGcmV1& AesGcmV1::operator=(const AesGcmV1& other180) {
- aad_prefix = other180.aad_prefix;
- aad_file_unique = other180.aad_file_unique;
- supply_aad_prefix = other180.supply_aad_prefix;
- __isset = other180.__isset;
- return *this;
-}
-void AesGcmV1::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "AesGcmV1(";
- out << "aad_prefix="; (__isset.aad_prefix ? (out << to_string(aad_prefix)) : (out << "<null>"));
- out << ", " << "aad_file_unique="; (__isset.aad_file_unique ? (out << to_string(aad_file_unique)) : (out << "<null>"));
- out << ", " << "supply_aad_prefix="; (__isset.supply_aad_prefix ? (out << to_string(supply_aad_prefix)) : (out << "<null>"));
- out << ")";
-}
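// Editorial sketch: every AesGcmV1 field is optional, so a default-constructed
// value serializes as nothing but a field stop; each setter flips the matching
// __isset bit, which write() and printTo() both consult:
parquet::format::AesGcmV1 gcm;
gcm.__set_aad_prefix("prefix");         // __isset.aad_prefix becomes true
gcm.__set_supply_aad_prefix(true);
// gcm.__isset.aad_file_unique stays false, so write() skips field id 2.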
-
-
-AesGcmCtrV1::~AesGcmCtrV1() noexcept {
-}
-
-
-void AesGcmCtrV1::__set_aad_prefix(const std::string& val) {
- this->aad_prefix = val;
- __isset.aad_prefix = true;
-}
-
-void AesGcmCtrV1::__set_aad_file_unique(const std::string& val) {
- this->aad_file_unique = val;
- __isset.aad_file_unique = true;
-}
-
-void AesGcmCtrV1::__set_supply_aad_prefix(const bool val) {
- this->supply_aad_prefix = val;
- __isset.supply_aad_prefix = true;
-}
-std::ostream& operator<<(std::ostream& out, const AesGcmCtrV1& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t AesGcmCtrV1::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->aad_prefix);
- this->__isset.aad_prefix = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->aad_file_unique);
- this->__isset.aad_file_unique = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_BOOL) {
- xfer += iprot->readBool(this->supply_aad_prefix);
- this->__isset.supply_aad_prefix = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t AesGcmCtrV1::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("AesGcmCtrV1");
-
- if (this->__isset.aad_prefix) {
- xfer += oprot->writeFieldBegin("aad_prefix", ::apache::thrift::protocol::T_STRING, 1);
- xfer += oprot->writeBinary(this->aad_prefix);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.aad_file_unique) {
- xfer += oprot->writeFieldBegin("aad_file_unique", ::apache::thrift::protocol::T_STRING, 2);
- xfer += oprot->writeBinary(this->aad_file_unique);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.supply_aad_prefix) {
- xfer += oprot->writeFieldBegin("supply_aad_prefix", ::apache::thrift::protocol::T_BOOL, 3);
- xfer += oprot->writeBool(this->supply_aad_prefix);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(AesGcmCtrV1 &a, AesGcmCtrV1 &b) {
- using ::std::swap;
- swap(a.aad_prefix, b.aad_prefix);
- swap(a.aad_file_unique, b.aad_file_unique);
- swap(a.supply_aad_prefix, b.supply_aad_prefix);
- swap(a.__isset, b.__isset);
-}
-
-AesGcmCtrV1::AesGcmCtrV1(const AesGcmCtrV1& other181) {
- aad_prefix = other181.aad_prefix;
- aad_file_unique = other181.aad_file_unique;
- supply_aad_prefix = other181.supply_aad_prefix;
- __isset = other181.__isset;
-}
-AesGcmCtrV1& AesGcmCtrV1::operator=(const AesGcmCtrV1& other182) {
- aad_prefix = other182.aad_prefix;
- aad_file_unique = other182.aad_file_unique;
- supply_aad_prefix = other182.supply_aad_prefix;
- __isset = other182.__isset;
- return *this;
-}
-void AesGcmCtrV1::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "AesGcmCtrV1(";
- out << "aad_prefix="; (__isset.aad_prefix ? (out << to_string(aad_prefix)) : (out << "<null>"));
- out << ", " << "aad_file_unique="; (__isset.aad_file_unique ? (out << to_string(aad_file_unique)) : (out << "<null>"));
- out << ", " << "supply_aad_prefix="; (__isset.supply_aad_prefix ? (out << to_string(supply_aad_prefix)) : (out << "<null>"));
- out << ")";
-}
-
-
-EncryptionAlgorithm::~EncryptionAlgorithm() noexcept {
-}
-
-
-void EncryptionAlgorithm::__set_AES_GCM_V1(const AesGcmV1& val) {
- this->AES_GCM_V1 = val;
- __isset.AES_GCM_V1 = true;
-}
-
-void EncryptionAlgorithm::__set_AES_GCM_CTR_V1(const AesGcmCtrV1& val) {
- this->AES_GCM_CTR_V1 = val;
- __isset.AES_GCM_CTR_V1 = true;
-}
-std::ostream& operator<<(std::ostream& out, const EncryptionAlgorithm& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t EncryptionAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->AES_GCM_V1.read(iprot);
- this->__isset.AES_GCM_V1 = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->AES_GCM_CTR_V1.read(iprot);
- this->__isset.AES_GCM_CTR_V1 = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- return xfer;
-}
-
-uint32_t EncryptionAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("EncryptionAlgorithm");
-
- if (this->__isset.AES_GCM_V1) {
- xfer += oprot->writeFieldBegin("AES_GCM_V1", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->AES_GCM_V1.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.AES_GCM_CTR_V1) {
- xfer += oprot->writeFieldBegin("AES_GCM_CTR_V1", ::apache::thrift::protocol::T_STRUCT, 2);
- xfer += this->AES_GCM_CTR_V1.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(EncryptionAlgorithm &a, EncryptionAlgorithm &b) {
- using ::std::swap;
- swap(a.AES_GCM_V1, b.AES_GCM_V1);
- swap(a.AES_GCM_CTR_V1, b.AES_GCM_CTR_V1);
- swap(a.__isset, b.__isset);
-}
-
-EncryptionAlgorithm::EncryptionAlgorithm(const EncryptionAlgorithm& other183) {
- AES_GCM_V1 = other183.AES_GCM_V1;
- AES_GCM_CTR_V1 = other183.AES_GCM_CTR_V1;
- __isset = other183.__isset;
-}
-EncryptionAlgorithm& EncryptionAlgorithm::operator=(const EncryptionAlgorithm& other184) {
- AES_GCM_V1 = other184.AES_GCM_V1;
- AES_GCM_CTR_V1 = other184.AES_GCM_CTR_V1;
- __isset = other184.__isset;
- return *this;
-}
-void EncryptionAlgorithm::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "EncryptionAlgorithm(";
- out << "AES_GCM_V1="; (__isset.AES_GCM_V1 ? (out << to_string(AES_GCM_V1)) : (out << "<null>"));
- out << ", " << "AES_GCM_CTR_V1="; (__isset.AES_GCM_CTR_V1 ? (out << to_string(AES_GCM_CTR_V1)) : (out << "<null>"));
- out << ")";
-}
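// Editorial sketch: EncryptionAlgorithm is a Thrift union rendered as a plain
// struct, so nothing here enforces exclusivity; callers are expected to set
// exactly one branch:
parquet::format::EncryptionAlgorithm alg;
parquet::format::AesGcmV1 gcm;          // as configured in the sketch above
alg.__set_AES_GCM_V1(gcm);              // sets __isset.AES_GCM_V1; the CTR branch stays unset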
-
-
-FileMetaData::~FileMetaData() noexcept {
-}
-
-
-void FileMetaData::__set_version(const int32_t val) {
- this->version = val;
-}
-
-void FileMetaData::__set_schema(const std::vector<SchemaElement> & val) {
- this->schema = val;
-}
-
-void FileMetaData::__set_num_rows(const int64_t val) {
- this->num_rows = val;
-}
-
-void FileMetaData::__set_row_groups(const std::vector<RowGroup> & val) {
- this->row_groups = val;
-}
-
-void FileMetaData::__set_key_value_metadata(const std::vector<KeyValue> & val) {
- this->key_value_metadata = val;
- __isset.key_value_metadata = true;
-}
-
-void FileMetaData::__set_created_by(const std::string& val) {
- this->created_by = val;
- __isset.created_by = true;
-}
-
-void FileMetaData::__set_column_orders(const std::vector<ColumnOrder> & val) {
- this->column_orders = val;
- __isset.column_orders = true;
-}
-
-void FileMetaData::__set_encryption_algorithm(const EncryptionAlgorithm& val) {
- this->encryption_algorithm = val;
- __isset.encryption_algorithm = true;
-}
-
-void FileMetaData::__set_footer_signing_key_metadata(const std::string& val) {
- this->footer_signing_key_metadata = val;
- __isset.footer_signing_key_metadata = true;
-}
-std::ostream& operator<<(std::ostream& out, const FileMetaData& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_version = false;
- bool isset_schema = false;
- bool isset_num_rows = false;
- bool isset_row_groups = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_I32) {
- xfer += iprot->readI32(this->version);
- isset_version = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->schema.clear();
- uint32_t _size185;
- ::apache::thrift::protocol::TType _etype188;
- xfer += iprot->readListBegin(_etype188, _size185);
- this->schema.resize(_size185);
- uint32_t _i189;
- for (_i189 = 0; _i189 < _size185; ++_i189)
- {
- xfer += this->schema[_i189].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- isset_schema = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 3:
- if (ftype == ::apache::thrift::protocol::T_I64) {
- xfer += iprot->readI64(this->num_rows);
- isset_num_rows = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 4:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->row_groups.clear();
- uint32_t _size190;
- ::apache::thrift::protocol::TType _etype193;
- xfer += iprot->readListBegin(_etype193, _size190);
- this->row_groups.resize(_size190);
- uint32_t _i194;
- for (_i194 = 0; _i194 < _size190; ++_i194)
- {
- xfer += this->row_groups[_i194].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- isset_row_groups = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 5:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->key_value_metadata.clear();
- uint32_t _size195;
- ::apache::thrift::protocol::TType _etype198;
- xfer += iprot->readListBegin(_etype198, _size195);
- this->key_value_metadata.resize(_size195);
- uint32_t _i199;
- for (_i199 = 0; _i199 < _size195; ++_i199)
- {
- xfer += this->key_value_metadata[_i199].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- this->__isset.key_value_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 6:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readString(this->created_by);
- this->__isset.created_by = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 7:
- if (ftype == ::apache::thrift::protocol::T_LIST) {
- {
- this->column_orders.clear();
- uint32_t _size200;
- ::apache::thrift::protocol::TType _etype203;
- xfer += iprot->readListBegin(_etype203, _size200);
- this->column_orders.resize(_size200);
- uint32_t _i204;
- for (_i204 = 0; _i204 < _size200; ++_i204)
- {
- xfer += this->column_orders[_i204].read(iprot);
- }
- xfer += iprot->readListEnd();
- }
- this->__isset.column_orders = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 8:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->encryption_algorithm.read(iprot);
- this->__isset.encryption_algorithm = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 9:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->footer_signing_key_metadata);
- this->__isset.footer_signing_key_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_version)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_schema)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_num_rows)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- if (!isset_row_groups)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("FileMetaData");
-
- xfer += oprot->writeFieldBegin("version", ::apache::thrift::protocol::T_I32, 1);
- xfer += oprot->writeI32(this->version);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("schema", ::apache::thrift::protocol::T_LIST, 2);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->schema.size()));
- std::vector<SchemaElement> ::const_iterator _iter205;
- for (_iter205 = this->schema.begin(); _iter205 != this->schema.end(); ++_iter205)
- {
- xfer += (*_iter205).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I64, 3);
- xfer += oprot->writeI64(this->num_rows);
- xfer += oprot->writeFieldEnd();
-
- xfer += oprot->writeFieldBegin("row_groups", ::apache::thrift::protocol::T_LIST, 4);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->row_groups.size()));
- std::vector<RowGroup> ::const_iterator _iter206;
- for (_iter206 = this->row_groups.begin(); _iter206 != this->row_groups.end(); ++_iter206)
- {
- xfer += (*_iter206).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.key_value_metadata) {
- xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 5);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->key_value_metadata.size()));
- std::vector<KeyValue> ::const_iterator _iter207;
- for (_iter207 = this->key_value_metadata.begin(); _iter207 != this->key_value_metadata.end(); ++_iter207)
- {
- xfer += (*_iter207).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.created_by) {
- xfer += oprot->writeFieldBegin("created_by", ::apache::thrift::protocol::T_STRING, 6);
- xfer += oprot->writeString(this->created_by);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.column_orders) {
- xfer += oprot->writeFieldBegin("column_orders", ::apache::thrift::protocol::T_LIST, 7);
- {
- xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->column_orders.size()));
- std::vector<ColumnOrder> ::const_iterator _iter208;
- for (_iter208 = this->column_orders.begin(); _iter208 != this->column_orders.end(); ++_iter208)
- {
- xfer += (*_iter208).write(oprot);
- }
- xfer += oprot->writeListEnd();
- }
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.encryption_algorithm) {
- xfer += oprot->writeFieldBegin("encryption_algorithm", ::apache::thrift::protocol::T_STRUCT, 8);
- xfer += this->encryption_algorithm.write(oprot);
- xfer += oprot->writeFieldEnd();
- }
- if (this->__isset.footer_signing_key_metadata) {
- xfer += oprot->writeFieldBegin("footer_signing_key_metadata", ::apache::thrift::protocol::T_STRING, 9);
- xfer += oprot->writeBinary(this->footer_signing_key_metadata);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(FileMetaData &a, FileMetaData &b) {
- using ::std::swap;
- swap(a.version, b.version);
- swap(a.schema, b.schema);
- swap(a.num_rows, b.num_rows);
- swap(a.row_groups, b.row_groups);
- swap(a.key_value_metadata, b.key_value_metadata);
- swap(a.created_by, b.created_by);
- swap(a.column_orders, b.column_orders);
- swap(a.encryption_algorithm, b.encryption_algorithm);
- swap(a.footer_signing_key_metadata, b.footer_signing_key_metadata);
- swap(a.__isset, b.__isset);
-}
-
-FileMetaData::FileMetaData(const FileMetaData& other209) {
- version = other209.version;
- schema = other209.schema;
- num_rows = other209.num_rows;
- row_groups = other209.row_groups;
- key_value_metadata = other209.key_value_metadata;
- created_by = other209.created_by;
- column_orders = other209.column_orders;
- encryption_algorithm = other209.encryption_algorithm;
- footer_signing_key_metadata = other209.footer_signing_key_metadata;
- __isset = other209.__isset;
-}
-FileMetaData& FileMetaData::operator=(const FileMetaData& other210) {
- version = other210.version;
- schema = other210.schema;
- num_rows = other210.num_rows;
- row_groups = other210.row_groups;
- key_value_metadata = other210.key_value_metadata;
- created_by = other210.created_by;
- column_orders = other210.column_orders;
- encryption_algorithm = other210.encryption_algorithm;
- footer_signing_key_metadata = other210.footer_signing_key_metadata;
- __isset = other210.__isset;
- return *this;
-}
-void FileMetaData::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "FileMetaData(";
- out << "version=" << to_string(version);
- out << ", " << "schema=" << to_string(schema);
- out << ", " << "num_rows=" << to_string(num_rows);
- out << ", " << "row_groups=" << to_string(row_groups);
- out << ", " << "key_value_metadata="; (__isset.key_value_metadata ? (out << to_string(key_value_metadata)) : (out << "<null>"));
- out << ", " << "created_by="; (__isset.created_by ? (out << to_string(created_by)) : (out << "<null>"));
- out << ", " << "column_orders="; (__isset.column_orders ? (out << to_string(column_orders)) : (out << "<null>"));
- out << ", " << "encryption_algorithm="; (__isset.encryption_algorithm ? (out << to_string(encryption_algorithm)) : (out << "<null>"));
- out << ", " << "footer_signing_key_metadata="; (__isset.footer_signing_key_metadata ? (out << to_string(footer_signing_key_metadata)) : (out << "<null>"));
- out << ")";
-}
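// Editorial sketch: read() tracks isset_* flags for the four required fields
// (version, schema, num_rows, row_groups) and throws INVALID_DATA when any is
// absent, so a truncated footer surfaces as an exception rather than as a
// half-filled object. Reusing a protocol set up as in the PageLocation sketch:
try {
  parquet::format::FileMetaData md;
  md.read(&proto);
} catch (const apache::thrift::protocol::TProtocolException&) {
  // reject the file: required footer fields were missing or malformed
}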
-
-
-FileCryptoMetaData::~FileCryptoMetaData() noexcept {
-}
-
-
-void FileCryptoMetaData::__set_encryption_algorithm(const EncryptionAlgorithm& val) {
- this->encryption_algorithm = val;
-}
-
-void FileCryptoMetaData::__set_key_metadata(const std::string& val) {
- this->key_metadata = val;
- __isset.key_metadata = true;
-}
-std::ostream& operator<<(std::ostream& out, const FileCryptoMetaData& obj)
-{
- obj.printTo(out);
- return out;
-}
-
-
-uint32_t FileCryptoMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
-
- ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
- uint32_t xfer = 0;
- std::string fname;
- ::apache::thrift::protocol::TType ftype;
- int16_t fid;
-
- xfer += iprot->readStructBegin(fname);
-
- using ::apache::thrift::protocol::TProtocolException;
-
- bool isset_encryption_algorithm = false;
-
- while (true)
- {
- xfer += iprot->readFieldBegin(fname, ftype, fid);
- if (ftype == ::apache::thrift::protocol::T_STOP) {
- break;
- }
- switch (fid)
- {
- case 1:
- if (ftype == ::apache::thrift::protocol::T_STRUCT) {
- xfer += this->encryption_algorithm.read(iprot);
- isset_encryption_algorithm = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- case 2:
- if (ftype == ::apache::thrift::protocol::T_STRING) {
- xfer += iprot->readBinary(this->key_metadata);
- this->__isset.key_metadata = true;
- } else {
- xfer += iprot->skip(ftype);
- }
- break;
- default:
- xfer += iprot->skip(ftype);
- break;
- }
- xfer += iprot->readFieldEnd();
- }
-
- xfer += iprot->readStructEnd();
-
- if (!isset_encryption_algorithm)
- throw TProtocolException(TProtocolException::INVALID_DATA);
- return xfer;
-}
-
-uint32_t FileCryptoMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
- uint32_t xfer = 0;
- ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
- xfer += oprot->writeStructBegin("FileCryptoMetaData");
-
- xfer += oprot->writeFieldBegin("encryption_algorithm", ::apache::thrift::protocol::T_STRUCT, 1);
- xfer += this->encryption_algorithm.write(oprot);
- xfer += oprot->writeFieldEnd();
-
- if (this->__isset.key_metadata) {
- xfer += oprot->writeFieldBegin("key_metadata", ::apache::thrift::protocol::T_STRING, 2);
- xfer += oprot->writeBinary(this->key_metadata);
- xfer += oprot->writeFieldEnd();
- }
- xfer += oprot->writeFieldStop();
- xfer += oprot->writeStructEnd();
- return xfer;
-}
-
-void swap(FileCryptoMetaData &a, FileCryptoMetaData &b) {
- using ::std::swap;
- swap(a.encryption_algorithm, b.encryption_algorithm);
- swap(a.key_metadata, b.key_metadata);
- swap(a.__isset, b.__isset);
-}
-
-FileCryptoMetaData::FileCryptoMetaData(const FileCryptoMetaData& other211) {
- encryption_algorithm = other211.encryption_algorithm;
- key_metadata = other211.key_metadata;
- __isset = other211.__isset;
-}
-FileCryptoMetaData& FileCryptoMetaData::operator=(const FileCryptoMetaData& other212) {
- encryption_algorithm = other212.encryption_algorithm;
- key_metadata = other212.key_metadata;
- __isset = other212.__isset;
- return *this;
-}
-void FileCryptoMetaData::printTo(std::ostream& out) const {
- using ::apache::thrift::to_string;
- out << "FileCryptoMetaData(";
- out << "encryption_algorithm=" << to_string(encryption_algorithm);
- out << ", " << "key_metadata="; (__isset.key_metadata ? (out << to_string(key_metadata)) : (out << "<null>"));
- out << ")";
-}
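// Editorial sketch: per the Parquet encryption layout, a file with an
// encrypted footer stores this struct in plaintext so readers can learn the
// algorithm and key metadata needed to decrypt FileMetaData itself;
// encryption_algorithm is required, key_metadata optional:
parquet::format::FileCryptoMetaData crypto;
crypto.read(&proto);                    // throws if encryption_algorithm is absent
const bool has_key_md = crypto.__isset.key_metadata;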
-
-}} // namespace
+/**
+ * Autogenerated by Thrift Compiler (0.13.0)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ * @generated
+ */
+#include "parquet_types.h"
+
+#include <algorithm>
+#include <ostream>
+
+#include <thrift/TToString.h>
+
+namespace parquet { namespace format {
+
+int _kTypeValues[] = {
+ Type::BOOLEAN,
+ Type::INT32,
+ Type::INT64,
+ Type::INT96,
+ Type::FLOAT,
+ Type::DOUBLE,
+ Type::BYTE_ARRAY,
+ Type::FIXED_LEN_BYTE_ARRAY
+};
+const char* _kTypeNames[] = {
+ "BOOLEAN",
+ "INT32",
+ "INT64",
+ "INT96",
+ "FLOAT",
+ "DOUBLE",
+ "BYTE_ARRAY",
+ "FIXED_LEN_BYTE_ARRAY"
+};
+const std::map<int, const char*> _Type_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(8, _kTypeValues, _kTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const Type::type& val) {
+ std::map<int, const char*>::const_iterator it = _Type_VALUES_TO_NAMES.find(val);
+ if (it != _Type_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const Type::type& val) {
+ std::map<int, const char*>::const_iterator it = _Type_VALUES_TO_NAMES.find(val);
+ if (it != _Type_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
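// Editorial sketch: the TEnumIterator pair materializes a value-to-name map
// once; operator<< and to_string both fall back to the raw number for values
// outside the map, e.g. enum members added by a newer writer:
std::string known = parquet::format::to_string(parquet::format::Type::BYTE_ARRAY);               // "BYTE_ARRAY"
std::string unknown = parquet::format::to_string(static_cast<parquet::format::Type::type>(42));  // "42"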
+
+int _kConvertedTypeValues[] = {
+ ConvertedType::UTF8,
+ ConvertedType::MAP,
+ ConvertedType::MAP_KEY_VALUE,
+ ConvertedType::LIST,
+ ConvertedType::ENUM,
+ ConvertedType::DECIMAL,
+ ConvertedType::DATE,
+ ConvertedType::TIME_MILLIS,
+ ConvertedType::TIME_MICROS,
+ ConvertedType::TIMESTAMP_MILLIS,
+ ConvertedType::TIMESTAMP_MICROS,
+ ConvertedType::UINT_8,
+ ConvertedType::UINT_16,
+ ConvertedType::UINT_32,
+ ConvertedType::UINT_64,
+ ConvertedType::INT_8,
+ ConvertedType::INT_16,
+ ConvertedType::INT_32,
+ ConvertedType::INT_64,
+ ConvertedType::JSON,
+ ConvertedType::BSON,
+ ConvertedType::INTERVAL
+};
+const char* _kConvertedTypeNames[] = {
+ "UTF8",
+ "MAP",
+ "MAP_KEY_VALUE",
+ "LIST",
+ "ENUM",
+ "DECIMAL",
+ "DATE",
+ "TIME_MILLIS",
+ "TIME_MICROS",
+ "TIMESTAMP_MILLIS",
+ "TIMESTAMP_MICROS",
+ "UINT_8",
+ "UINT_16",
+ "UINT_32",
+ "UINT_64",
+ "INT_8",
+ "INT_16",
+ "INT_32",
+ "INT_64",
+ "JSON",
+ "BSON",
+ "INTERVAL"
+};
+const std::map<int, const char*> _ConvertedType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(22, _kConvertedTypeValues, _kConvertedTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const ConvertedType::type& val) {
+ std::map<int, const char*>::const_iterator it = _ConvertedType_VALUES_TO_NAMES.find(val);
+ if (it != _ConvertedType_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const ConvertedType::type& val) {
+ std::map<int, const char*>::const_iterator it = _ConvertedType_VALUES_TO_NAMES.find(val);
+ if (it != _ConvertedType_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kFieldRepetitionTypeValues[] = {
+ FieldRepetitionType::REQUIRED,
+ FieldRepetitionType::OPTIONAL,
+ FieldRepetitionType::REPEATED
+};
+const char* _kFieldRepetitionTypeNames[] = {
+ "REQUIRED",
+ "OPTIONAL",
+ "REPEATED"
+};
+const std::map<int, const char*> _FieldRepetitionType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(3, _kFieldRepetitionTypeValues, _kFieldRepetitionTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const FieldRepetitionType::type& val) {
+ std::map<int, const char*>::const_iterator it = _FieldRepetitionType_VALUES_TO_NAMES.find(val);
+ if (it != _FieldRepetitionType_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const FieldRepetitionType::type& val) {
+ std::map<int, const char*>::const_iterator it = _FieldRepetitionType_VALUES_TO_NAMES.find(val);
+ if (it != _FieldRepetitionType_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kEncodingValues[] = {
+ Encoding::PLAIN,
+ Encoding::PLAIN_DICTIONARY,
+ Encoding::RLE,
+ Encoding::BIT_PACKED,
+ Encoding::DELTA_BINARY_PACKED,
+ Encoding::DELTA_LENGTH_BYTE_ARRAY,
+ Encoding::DELTA_BYTE_ARRAY,
+ Encoding::RLE_DICTIONARY,
+ Encoding::BYTE_STREAM_SPLIT
+};
+const char* _kEncodingNames[] = {
+ "PLAIN",
+ "PLAIN_DICTIONARY",
+ "RLE",
+ "BIT_PACKED",
+ "DELTA_BINARY_PACKED",
+ "DELTA_LENGTH_BYTE_ARRAY",
+ "DELTA_BYTE_ARRAY",
+ "RLE_DICTIONARY",
+ "BYTE_STREAM_SPLIT"
+};
+const std::map<int, const char*> _Encoding_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(9, _kEncodingValues, _kEncodingNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const Encoding::type& val) {
+ std::map<int, const char*>::const_iterator it = _Encoding_VALUES_TO_NAMES.find(val);
+ if (it != _Encoding_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const Encoding::type& val) {
+ std::map<int, const char*>::const_iterator it = _Encoding_VALUES_TO_NAMES.find(val);
+ if (it != _Encoding_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kCompressionCodecValues[] = {
+ CompressionCodec::UNCOMPRESSED,
+ CompressionCodec::SNAPPY,
+ CompressionCodec::GZIP,
+ CompressionCodec::LZO,
+ CompressionCodec::BROTLI,
+ CompressionCodec::LZ4,
+ CompressionCodec::ZSTD,
+ CompressionCodec::LZ4_RAW
+};
+const char* _kCompressionCodecNames[] = {
+ "UNCOMPRESSED",
+ "SNAPPY",
+ "GZIP",
+ "LZO",
+ "BROTLI",
+ "LZ4",
+ "ZSTD",
+ "LZ4_RAW"
+};
+const std::map<int, const char*> _CompressionCodec_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(8, _kCompressionCodecValues, _kCompressionCodecNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const CompressionCodec::type& val) {
+ std::map<int, const char*>::const_iterator it = _CompressionCodec_VALUES_TO_NAMES.find(val);
+ if (it != _CompressionCodec_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const CompressionCodec::type& val) {
+ std::map<int, const char*>::const_iterator it = _CompressionCodec_VALUES_TO_NAMES.find(val);
+ if (it != _CompressionCodec_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kPageTypeValues[] = {
+ PageType::DATA_PAGE,
+ PageType::INDEX_PAGE,
+ PageType::DICTIONARY_PAGE,
+ PageType::DATA_PAGE_V2
+};
+const char* _kPageTypeNames[] = {
+ "DATA_PAGE",
+ "INDEX_PAGE",
+ "DICTIONARY_PAGE",
+ "DATA_PAGE_V2"
+};
+const std::map<int, const char*> _PageType_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(4, _kPageTypeValues, _kPageTypeNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const PageType::type& val) {
+ std::map<int, const char*>::const_iterator it = _PageType_VALUES_TO_NAMES.find(val);
+ if (it != _PageType_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const PageType::type& val) {
+ std::map<int, const char*>::const_iterator it = _PageType_VALUES_TO_NAMES.find(val);
+ if (it != _PageType_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+int _kBoundaryOrderValues[] = {
+ BoundaryOrder::UNORDERED,
+ BoundaryOrder::ASCENDING,
+ BoundaryOrder::DESCENDING
+};
+const char* _kBoundaryOrderNames[] = {
+ "UNORDERED",
+ "ASCENDING",
+ "DESCENDING"
+};
+const std::map<int, const char*> _BoundaryOrder_VALUES_TO_NAMES(::apache::thrift::TEnumIterator(3, _kBoundaryOrderValues, _kBoundaryOrderNames), ::apache::thrift::TEnumIterator(-1, NULL, NULL));
+
+std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val) {
+ std::map<int, const char*>::const_iterator it = _BoundaryOrder_VALUES_TO_NAMES.find(val);
+ if (it != _BoundaryOrder_VALUES_TO_NAMES.end()) {
+ out << it->second;
+ } else {
+ out << static_cast<int>(val);
+ }
+ return out;
+}
+
+std::string to_string(const BoundaryOrder::type& val) {
+ std::map<int, const char*>::const_iterator it = _BoundaryOrder_VALUES_TO_NAMES.find(val);
+ if (it != _BoundaryOrder_VALUES_TO_NAMES.end()) {
+ return std::string(it->second);
+ } else {
+ return std::to_string(static_cast<int>(val));
+ }
+}
+
+
+Statistics::~Statistics() noexcept {
+}
+
+
+void Statistics::__set_max(const std::string& val) {
+ this->max = val;
+ __isset.max = true;
+}
+
+void Statistics::__set_min(const std::string& val) {
+ this->min = val;
+ __isset.min = true;
+}
+
+void Statistics::__set_null_count(const int64_t val) {
+ this->null_count = val;
+ __isset.null_count = true;
+}
+
+void Statistics::__set_distinct_count(const int64_t val) {
+ this->distinct_count = val;
+ __isset.distinct_count = true;
+}
+
+void Statistics::__set_max_value(const std::string& val) {
+ this->max_value = val;
+ __isset.max_value = true;
+}
+
+void Statistics::__set_min_value(const std::string& val) {
+ this->min_value = val;
+ __isset.min_value = true;
+}
+std::ostream& operator<<(std::ostream& out, const Statistics& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t Statistics::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->max);
+ this->__isset.max = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->min);
+ this->__isset.min = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->null_count);
+ this->__isset.null_count = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->distinct_count);
+ this->__isset.distinct_count = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->max_value);
+ this->__isset.max_value = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->min_value);
+ this->__isset.min_value = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t Statistics::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("Statistics");
+
+ if (this->__isset.max) {
+ xfer += oprot->writeFieldBegin("max", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeBinary(this->max);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.min) {
+ xfer += oprot->writeFieldBegin("min", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->min);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.null_count) {
+ xfer += oprot->writeFieldBegin("null_count", ::apache::thrift::protocol::T_I64, 3);
+ xfer += oprot->writeI64(this->null_count);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.distinct_count) {
+ xfer += oprot->writeFieldBegin("distinct_count", ::apache::thrift::protocol::T_I64, 4);
+ xfer += oprot->writeI64(this->distinct_count);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.max_value) {
+ xfer += oprot->writeFieldBegin("max_value", ::apache::thrift::protocol::T_STRING, 5);
+ xfer += oprot->writeBinary(this->max_value);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.min_value) {
+ xfer += oprot->writeFieldBegin("min_value", ::apache::thrift::protocol::T_STRING, 6);
+ xfer += oprot->writeBinary(this->min_value);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(Statistics &a, Statistics &b) {
+ using ::std::swap;
+ swap(a.max, b.max);
+ swap(a.min, b.min);
+ swap(a.null_count, b.null_count);
+ swap(a.distinct_count, b.distinct_count);
+ swap(a.max_value, b.max_value);
+ swap(a.min_value, b.min_value);
+ swap(a.__isset, b.__isset);
+}
+
+Statistics::Statistics(const Statistics& other0) {
+ max = other0.max;
+ min = other0.min;
+ null_count = other0.null_count;
+ distinct_count = other0.distinct_count;
+ max_value = other0.max_value;
+ min_value = other0.min_value;
+ __isset = other0.__isset;
+}
+Statistics& Statistics::operator=(const Statistics& other1) {
+ max = other1.max;
+ min = other1.min;
+ null_count = other1.null_count;
+ distinct_count = other1.distinct_count;
+ max_value = other1.max_value;
+ min_value = other1.min_value;
+ __isset = other1.__isset;
+ return *this;
+}
+void Statistics::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "Statistics(";
+ out << "max="; (__isset.max ? (out << to_string(max)) : (out << "<null>"));
+ out << ", " << "min="; (__isset.min ? (out << to_string(min)) : (out << "<null>"));
+ out << ", " << "null_count="; (__isset.null_count ? (out << to_string(null_count)) : (out << "<null>"));
+ out << ", " << "distinct_count="; (__isset.distinct_count ? (out << to_string(distinct_count)) : (out << "<null>"));
+ out << ", " << "max_value="; (__isset.max_value ? (out << to_string(max_value)) : (out << "<null>"));
+ out << ", " << "min_value="; (__isset.min_value ? (out << to_string(min_value)) : (out << "<null>"));
+ out << ")";
+}
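// Editorial sketch: all six Statistics fields are optional. min_value/max_value
// (fields 5 and 6) are the pair current Parquet writers populate; the older
// min/max (fields 1 and 2) survive for legacy readers. Values are raw bytes,
// matching the readBinary/writeBinary calls above:
parquet::format::Statistics st;
st.__set_min_value(std::string("\x01", 1));  // binary encoding of the column's minimum
st.__set_max_value(std::string("\x7f", 1));
st.__set_null_count(0);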
+
+
+StringType::~StringType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const StringType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t StringType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t StringType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("StringType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(StringType &a, StringType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+StringType::StringType(const StringType& other2) {
+ (void) other2;
+}
+StringType& StringType::operator=(const StringType& other3) {
+ (void) other3;
+ return *this;
+}
+void StringType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "StringType(";
+ out << ")";
+}
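// Editorial sketch: the empty logical-type markers (StringType, UUIDType,
// MapType, and the rest below) carry no fields, so write() emits only a field
// stop (a single zero byte under TCompactProtocol) and read() merely skips
// until T_STOP:
parquet::format::StringType marker;
marker.write(&proto);                   // one 0x00 stop byte on the wire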
+
+
+UUIDType::~UUIDType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const UUIDType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t UUIDType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t UUIDType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("UUIDType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(UUIDType &a, UUIDType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+UUIDType::UUIDType(const UUIDType& other4) {
+ (void) other4;
+}
+UUIDType& UUIDType::operator=(const UUIDType& other5) {
+ (void) other5;
+ return *this;
+}
+void UUIDType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "UUIDType(";
+ out << ")";
+}
+
+
+MapType::~MapType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const MapType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t MapType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t MapType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("MapType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(MapType &a, MapType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+MapType::MapType(const MapType& other6) {
+ (void) other6;
+}
+MapType& MapType::operator=(const MapType& other7) {
+ (void) other7;
+ return *this;
+}
+void MapType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "MapType(";
+ out << ")";
+}
+
+
+ListType::~ListType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const ListType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ListType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t ListType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ListType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ListType &a, ListType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+ListType::ListType(const ListType& other8) {
+ (void) other8;
+}
+ListType& ListType::operator=(const ListType& other9) {
+ (void) other9;
+ return *this;
+}
+void ListType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ListType(";
+ out << ")";
+}
+
+
+EnumType::~EnumType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const EnumType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t EnumType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t EnumType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("EnumType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(EnumType &a, EnumType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+EnumType::EnumType(const EnumType& other10) {
+ (void) other10;
+}
+EnumType& EnumType::operator=(const EnumType& other11) {
+ (void) other11;
+ return *this;
+}
+void EnumType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "EnumType(";
+ out << ")";
+}
+
+
+DateType::~DateType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const DateType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t DateType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t DateType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DateType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DateType &a, DateType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+DateType::DateType(const DateType& other12) {
+ (void) other12;
+}
+DateType& DateType::operator=(const DateType& other13) {
+ (void) other13;
+ return *this;
+}
+void DateType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DateType(";
+ out << ")";
+}
+
+
+NullType::~NullType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const NullType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t NullType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t NullType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("NullType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(NullType &a, NullType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+NullType::NullType(const NullType& other14) {
+ (void) other14;
+}
+NullType& NullType::operator=(const NullType& other15) {
+ (void) other15;
+ return *this;
+}
+void NullType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "NullType(";
+ out << ")";
+}
+
+
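+// DecimalType carries the two required fields of a decimal annotation;
+// read() rejects a payload missing either scale or precision with
+// INVALID_DATA.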
+DecimalType::~DecimalType() noexcept {
+}
+
+
+void DecimalType::__set_scale(const int32_t val) {
+ this->scale = val;
+}
+
+void DecimalType::__set_precision(const int32_t val) {
+ this->precision = val;
+}
+std::ostream& operator<<(std::ostream& out, const DecimalType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t DecimalType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_scale = false;
+ bool isset_precision = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->scale);
+ isset_scale = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->precision);
+ isset_precision = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_scale)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_precision)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t DecimalType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DecimalType");
+
+ xfer += oprot->writeFieldBegin("scale", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->scale);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("precision", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->precision);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DecimalType &a, DecimalType &b) {
+ using ::std::swap;
+ swap(a.scale, b.scale);
+ swap(a.precision, b.precision);
+}
+
+DecimalType::DecimalType(const DecimalType& other16) {
+ scale = other16.scale;
+ precision = other16.precision;
+}
+DecimalType& DecimalType::operator=(const DecimalType& other17) {
+ scale = other17.scale;
+ precision = other17.precision;
+ return *this;
+}
+void DecimalType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DecimalType(";
+ out << "scale=" << to_string(scale);
+ out << ", " << "precision=" << to_string(precision);
+ out << ")";
+}
+
+
+MilliSeconds::~MilliSeconds() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const MilliSeconds& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t MilliSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t MilliSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("MilliSeconds");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(MilliSeconds &a, MilliSeconds &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+MilliSeconds::MilliSeconds(const MilliSeconds& other18) {
+ (void) other18;
+}
+MilliSeconds& MilliSeconds::operator=(const MilliSeconds& other19) {
+ (void) other19;
+ return *this;
+}
+void MilliSeconds::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "MilliSeconds(";
+ out << ")";
+}
+
+
+MicroSeconds::~MicroSeconds() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const MicroSeconds& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t MicroSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t MicroSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("MicroSeconds");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(MicroSeconds &a, MicroSeconds &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+MicroSeconds::MicroSeconds(const MicroSeconds& other20) {
+ (void) other20;
+}
+MicroSeconds& MicroSeconds::operator=(const MicroSeconds& other21) {
+ (void) other21;
+ return *this;
+}
+void MicroSeconds::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "MicroSeconds(";
+ out << ")";
+}
+
+
+NanoSeconds::~NanoSeconds() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const NanoSeconds& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t NanoSeconds::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t NanoSeconds::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("NanoSeconds");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(NanoSeconds &a, NanoSeconds &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+NanoSeconds::NanoSeconds(const NanoSeconds& other22) {
+ (void) other22;
+}
+NanoSeconds& NanoSeconds::operator=(const NanoSeconds& other23) {
+ (void) other23;
+ return *this;
+}
+void NanoSeconds::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "NanoSeconds(";
+ out << ")";
+}
+
+
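+// TimeUnit is generated from a Thrift union: exactly one of MILLIS, MICROS
+// or NANOS is expected to be set, and __isset records which arm was read.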
+TimeUnit::~TimeUnit() noexcept {
+}
+
+
+void TimeUnit::__set_MILLIS(const MilliSeconds& val) {
+ this->MILLIS = val;
+  __isset.MILLIS = true;
+}
+
+void TimeUnit::__set_MICROS(const MicroSeconds& val) {
+ this->MICROS = val;
+  __isset.MICROS = true;
+}
+
+void TimeUnit::__set_NANOS(const NanoSeconds& val) {
+ this->NANOS = val;
+  __isset.NANOS = true;
+}
+std::ostream& operator<<(std::ostream& out, const TimeUnit& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t TimeUnit::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->MILLIS.read(iprot);
+ this->__isset.MILLIS = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->MICROS.read(iprot);
+ this->__isset.MICROS = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->NANOS.read(iprot);
+ this->__isset.NANOS = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t TimeUnit::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("TimeUnit");
+
+ if (this->__isset.MILLIS) {
+ xfer += oprot->writeFieldBegin("MILLIS", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->MILLIS.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.MICROS) {
+ xfer += oprot->writeFieldBegin("MICROS", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->MICROS.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.NANOS) {
+ xfer += oprot->writeFieldBegin("NANOS", ::apache::thrift::protocol::T_STRUCT, 3);
+ xfer += this->NANOS.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(TimeUnit &a, TimeUnit &b) {
+ using ::std::swap;
+ swap(a.MILLIS, b.MILLIS);
+ swap(a.MICROS, b.MICROS);
+ swap(a.NANOS, b.NANOS);
+ swap(a.__isset, b.__isset);
+}
+
+TimeUnit::TimeUnit(const TimeUnit& other24) {
+ MILLIS = other24.MILLIS;
+ MICROS = other24.MICROS;
+ NANOS = other24.NANOS;
+ __isset = other24.__isset;
+}
+TimeUnit& TimeUnit::operator=(const TimeUnit& other25) {
+ MILLIS = other25.MILLIS;
+ MICROS = other25.MICROS;
+ NANOS = other25.NANOS;
+ __isset = other25.__isset;
+ return *this;
+}
+void TimeUnit::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "TimeUnit(";
+ out << "MILLIS="; (__isset.MILLIS ? (out << to_string(MILLIS)) : (out << "<null>"));
+ out << ", " << "MICROS="; (__isset.MICROS ? (out << to_string(MICROS)) : (out << "<null>"));
+ out << ", " << "NANOS="; (__isset.NANOS ? (out << to_string(NANOS)) : (out << "<null>"));
+ out << ")";
+}
+
+
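+// TimestampType: a required UTC-adjustment flag plus a required TimeUnit.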
+TimestampType::~TimestampType() noexcept {
+}
+
+
+void TimestampType::__set_isAdjustedToUTC(const bool val) {
+ this->isAdjustedToUTC = val;
+}
+
+void TimestampType::__set_unit(const TimeUnit& val) {
+ this->unit = val;
+}
+std::ostream& operator<<(std::ostream& out, const TimestampType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t TimestampType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_isAdjustedToUTC = false;
+ bool isset_unit = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->isAdjustedToUTC);
+ isset_isAdjustedToUTC = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->unit.read(iprot);
+ isset_unit = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_isAdjustedToUTC)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_unit)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t TimestampType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("TimestampType");
+
+ xfer += oprot->writeFieldBegin("isAdjustedToUTC", ::apache::thrift::protocol::T_BOOL, 1);
+ xfer += oprot->writeBool(this->isAdjustedToUTC);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("unit", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->unit.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(TimestampType &a, TimestampType &b) {
+ using ::std::swap;
+ swap(a.isAdjustedToUTC, b.isAdjustedToUTC);
+ swap(a.unit, b.unit);
+}
+
+TimestampType::TimestampType(const TimestampType& other26) {
+ isAdjustedToUTC = other26.isAdjustedToUTC;
+ unit = other26.unit;
+}
+TimestampType& TimestampType::operator=(const TimestampType& other27) {
+ isAdjustedToUTC = other27.isAdjustedToUTC;
+ unit = other27.unit;
+ return *this;
+}
+void TimestampType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "TimestampType(";
+ out << "isAdjustedToUTC=" << to_string(isAdjustedToUTC);
+ out << ", " << "unit=" << to_string(unit);
+ out << ")";
+}
+
+
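+// TimeType mirrors TimestampType: required isAdjustedToUTC and unit fields.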
+TimeType::~TimeType() noexcept {
+}
+
+
+void TimeType::__set_isAdjustedToUTC(const bool val) {
+ this->isAdjustedToUTC = val;
+}
+
+void TimeType::__set_unit(const TimeUnit& val) {
+ this->unit = val;
+}
+std::ostream& operator<<(std::ostream& out, const TimeType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t TimeType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_isAdjustedToUTC = false;
+ bool isset_unit = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->isAdjustedToUTC);
+ isset_isAdjustedToUTC = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->unit.read(iprot);
+ isset_unit = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_isAdjustedToUTC)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_unit)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t TimeType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("TimeType");
+
+ xfer += oprot->writeFieldBegin("isAdjustedToUTC", ::apache::thrift::protocol::T_BOOL, 1);
+ xfer += oprot->writeBool(this->isAdjustedToUTC);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("unit", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->unit.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(TimeType &a, TimeType &b) {
+ using ::std::swap;
+ swap(a.isAdjustedToUTC, b.isAdjustedToUTC);
+ swap(a.unit, b.unit);
+}
+
+TimeType::TimeType(const TimeType& other28) {
+ isAdjustedToUTC = other28.isAdjustedToUTC;
+ unit = other28.unit;
+}
+TimeType& TimeType::operator=(const TimeType& other29) {
+ isAdjustedToUTC = other29.isAdjustedToUTC;
+ unit = other29.unit;
+ return *this;
+}
+void TimeType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "TimeType(";
+ out << "isAdjustedToUTC=" << to_string(isAdjustedToUTC);
+ out << ", " << "unit=" << to_string(unit);
+ out << ")";
+}
+
+
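+// IntType describes an integer annotation: bitWidth travels as a single
+// byte (T_BYTE) and isSigned as a bool; both fields are required.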
+IntType::~IntType() noexcept {
+}
+
+
+void IntType::__set_bitWidth(const int8_t val) {
+ this->bitWidth = val;
+}
+
+void IntType::__set_isSigned(const bool val) {
+ this->isSigned = val;
+}
+std::ostream& operator<<(std::ostream& out, const IntType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t IntType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_bitWidth = false;
+ bool isset_isSigned = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_BYTE) {
+ xfer += iprot->readByte(this->bitWidth);
+ isset_bitWidth = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->isSigned);
+ isset_isSigned = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_bitWidth)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_isSigned)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t IntType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("IntType");
+
+ xfer += oprot->writeFieldBegin("bitWidth", ::apache::thrift::protocol::T_BYTE, 1);
+ xfer += oprot->writeByte(this->bitWidth);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("isSigned", ::apache::thrift::protocol::T_BOOL, 2);
+ xfer += oprot->writeBool(this->isSigned);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(IntType &a, IntType &b) {
+ using ::std::swap;
+ swap(a.bitWidth, b.bitWidth);
+ swap(a.isSigned, b.isSigned);
+}
+
+IntType::IntType(const IntType& other30) {
+ bitWidth = other30.bitWidth;
+ isSigned = other30.isSigned;
+}
+IntType& IntType::operator=(const IntType& other31) {
+ bitWidth = other31.bitWidth;
+ isSigned = other31.isSigned;
+ return *this;
+}
+void IntType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "IntType(";
+ out << "bitWidth=" << to_string(bitWidth);
+ out << ", " << "isSigned=" << to_string(isSigned);
+ out << ")";
+}
+
+
+JsonType::~JsonType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const JsonType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t JsonType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t JsonType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("JsonType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(JsonType &a, JsonType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+JsonType::JsonType(const JsonType& other32) {
+ (void) other32;
+}
+JsonType& JsonType::operator=(const JsonType& other33) {
+ (void) other33;
+ return *this;
+}
+void JsonType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "JsonType(";
+ out << ")";
+}
+
+
+BsonType::~BsonType() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const BsonType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BsonType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t BsonType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BsonType");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BsonType &a, BsonType &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+BsonType::BsonType(const BsonType& other34) {
+ (void) other34;
+}
+BsonType& BsonType::operator=(const BsonType& other35) {
+ (void) other35;
+ return *this;
+}
+void BsonType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BsonType(";
+ out << ")";
+}
+
+
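+// LogicalType is the Thrift union of all logical type annotations. Field
+// ids run 1-8 and 10-14; id 9 is absent (reserved in the Parquet Thrift
+// definition), hence the jump from TIMESTAMP (8) to INTEGER (10) below.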
+LogicalType::~LogicalType() noexcept {
+}
+
+
+void LogicalType::__set_STRING(const StringType& val) {
+ this->STRING = val;
+  __isset.STRING = true;
+}
+
+void LogicalType::__set_MAP(const MapType& val) {
+ this->MAP = val;
+  __isset.MAP = true;
+}
+
+void LogicalType::__set_LIST(const ListType& val) {
+ this->LIST = val;
+  __isset.LIST = true;
+}
+
+void LogicalType::__set_ENUM(const EnumType& val) {
+ this->ENUM = val;
+  __isset.ENUM = true;
+}
+
+void LogicalType::__set_DECIMAL(const DecimalType& val) {
+ this->DECIMAL = val;
+  __isset.DECIMAL = true;
+}
+
+void LogicalType::__set_DATE(const DateType& val) {
+ this->DATE = val;
+  __isset.DATE = true;
+}
+
+void LogicalType::__set_TIME(const TimeType& val) {
+ this->TIME = val;
+  __isset.TIME = true;
+}
+
+void LogicalType::__set_TIMESTAMP(const TimestampType& val) {
+ this->TIMESTAMP = val;
+  __isset.TIMESTAMP = true;
+}
+
+void LogicalType::__set_INTEGER(const IntType& val) {
+ this->INTEGER = val;
+  __isset.INTEGER = true;
+}
+
+void LogicalType::__set_UNKNOWN(const NullType& val) {
+ this->UNKNOWN = val;
+  __isset.UNKNOWN = true;
+}
+
+void LogicalType::__set_JSON(const JsonType& val) {
+ this->JSON = val;
+  __isset.JSON = true;
+}
+
+void LogicalType::__set_BSON(const BsonType& val) {
+ this->BSON = val;
+  __isset.BSON = true;
+}
+
+void LogicalType::__set_UUID(const UUIDType& val) {
+ this->UUID = val;
+  __isset.UUID = true;
+}
+std::ostream& operator<<(std::ostream& out, const LogicalType& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t LogicalType::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->STRING.read(iprot);
+ this->__isset.STRING = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->MAP.read(iprot);
+ this->__isset.MAP = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->LIST.read(iprot);
+ this->__isset.LIST = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->ENUM.read(iprot);
+ this->__isset.ENUM = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->DECIMAL.read(iprot);
+ this->__isset.DECIMAL = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->DATE.read(iprot);
+ this->__isset.DATE = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->TIME.read(iprot);
+ this->__isset.TIME = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->TIMESTAMP.read(iprot);
+ this->__isset.TIMESTAMP = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 10:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->INTEGER.read(iprot);
+ this->__isset.INTEGER = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 11:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->UNKNOWN.read(iprot);
+ this->__isset.UNKNOWN = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 12:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->JSON.read(iprot);
+ this->__isset.JSON = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 13:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->BSON.read(iprot);
+ this->__isset.BSON = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 14:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->UUID.read(iprot);
+ this->__isset.UUID = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t LogicalType::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("LogicalType");
+
+ if (this->__isset.STRING) {
+ xfer += oprot->writeFieldBegin("STRING", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->STRING.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.MAP) {
+ xfer += oprot->writeFieldBegin("MAP", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->MAP.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.LIST) {
+ xfer += oprot->writeFieldBegin("LIST", ::apache::thrift::protocol::T_STRUCT, 3);
+ xfer += this->LIST.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.ENUM) {
+ xfer += oprot->writeFieldBegin("ENUM", ::apache::thrift::protocol::T_STRUCT, 4);
+ xfer += this->ENUM.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.DECIMAL) {
+ xfer += oprot->writeFieldBegin("DECIMAL", ::apache::thrift::protocol::T_STRUCT, 5);
+ xfer += this->DECIMAL.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.DATE) {
+ xfer += oprot->writeFieldBegin("DATE", ::apache::thrift::protocol::T_STRUCT, 6);
+ xfer += this->DATE.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.TIME) {
+ xfer += oprot->writeFieldBegin("TIME", ::apache::thrift::protocol::T_STRUCT, 7);
+ xfer += this->TIME.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.TIMESTAMP) {
+ xfer += oprot->writeFieldBegin("TIMESTAMP", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->TIMESTAMP.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.INTEGER) {
+ xfer += oprot->writeFieldBegin("INTEGER", ::apache::thrift::protocol::T_STRUCT, 10);
+ xfer += this->INTEGER.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.UNKNOWN) {
+ xfer += oprot->writeFieldBegin("UNKNOWN", ::apache::thrift::protocol::T_STRUCT, 11);
+ xfer += this->UNKNOWN.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.JSON) {
+ xfer += oprot->writeFieldBegin("JSON", ::apache::thrift::protocol::T_STRUCT, 12);
+ xfer += this->JSON.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.BSON) {
+ xfer += oprot->writeFieldBegin("BSON", ::apache::thrift::protocol::T_STRUCT, 13);
+ xfer += this->BSON.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.UUID) {
+ xfer += oprot->writeFieldBegin("UUID", ::apache::thrift::protocol::T_STRUCT, 14);
+ xfer += this->UUID.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(LogicalType &a, LogicalType &b) {
+ using ::std::swap;
+ swap(a.STRING, b.STRING);
+ swap(a.MAP, b.MAP);
+ swap(a.LIST, b.LIST);
+ swap(a.ENUM, b.ENUM);
+ swap(a.DECIMAL, b.DECIMAL);
+ swap(a.DATE, b.DATE);
+ swap(a.TIME, b.TIME);
+ swap(a.TIMESTAMP, b.TIMESTAMP);
+ swap(a.INTEGER, b.INTEGER);
+ swap(a.UNKNOWN, b.UNKNOWN);
+ swap(a.JSON, b.JSON);
+ swap(a.BSON, b.BSON);
+ swap(a.UUID, b.UUID);
+ swap(a.__isset, b.__isset);
+}
+
+LogicalType::LogicalType(const LogicalType& other36) {
+ STRING = other36.STRING;
+ MAP = other36.MAP;
+ LIST = other36.LIST;
+ ENUM = other36.ENUM;
+ DECIMAL = other36.DECIMAL;
+ DATE = other36.DATE;
+ TIME = other36.TIME;
+ TIMESTAMP = other36.TIMESTAMP;
+ INTEGER = other36.INTEGER;
+ UNKNOWN = other36.UNKNOWN;
+ JSON = other36.JSON;
+ BSON = other36.BSON;
+ UUID = other36.UUID;
+ __isset = other36.__isset;
+}
+LogicalType& LogicalType::operator=(const LogicalType& other37) {
+ STRING = other37.STRING;
+ MAP = other37.MAP;
+ LIST = other37.LIST;
+ ENUM = other37.ENUM;
+ DECIMAL = other37.DECIMAL;
+ DATE = other37.DATE;
+ TIME = other37.TIME;
+ TIMESTAMP = other37.TIMESTAMP;
+ INTEGER = other37.INTEGER;
+ UNKNOWN = other37.UNKNOWN;
+ JSON = other37.JSON;
+ BSON = other37.BSON;
+ UUID = other37.UUID;
+ __isset = other37.__isset;
+ return *this;
+}
+void LogicalType::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "LogicalType(";
+ out << "STRING="; (__isset.STRING ? (out << to_string(STRING)) : (out << "<null>"));
+ out << ", " << "MAP="; (__isset.MAP ? (out << to_string(MAP)) : (out << "<null>"));
+ out << ", " << "LIST="; (__isset.LIST ? (out << to_string(LIST)) : (out << "<null>"));
+ out << ", " << "ENUM="; (__isset.ENUM ? (out << to_string(ENUM)) : (out << "<null>"));
+ out << ", " << "DECIMAL="; (__isset.DECIMAL ? (out << to_string(DECIMAL)) : (out << "<null>"));
+ out << ", " << "DATE="; (__isset.DATE ? (out << to_string(DATE)) : (out << "<null>"));
+ out << ", " << "TIME="; (__isset.TIME ? (out << to_string(TIME)) : (out << "<null>"));
+ out << ", " << "TIMESTAMP="; (__isset.TIMESTAMP ? (out << to_string(TIMESTAMP)) : (out << "<null>"));
+ out << ", " << "INTEGER="; (__isset.INTEGER ? (out << to_string(INTEGER)) : (out << "<null>"));
+ out << ", " << "UNKNOWN="; (__isset.UNKNOWN ? (out << to_string(UNKNOWN)) : (out << "<null>"));
+ out << ", " << "JSON="; (__isset.JSON ? (out << to_string(JSON)) : (out << "<null>"));
+ out << ", " << "BSON="; (__isset.BSON ? (out << to_string(BSON)) : (out << "<null>"));
+ out << ", " << "UUID="; (__isset.UUID ? (out << to_string(UUID)) : (out << "<null>"));
+ out << ")";
+}
+
+
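+// SchemaElement is one node of the depth-first flattened Parquet schema
+// tree. Only `name` is required; every other field is optional and its
+// presence is tracked through __isset.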
+SchemaElement::~SchemaElement() noexcept {
+}
+
+
+void SchemaElement::__set_type(const Type::type val) {
+ this->type = val;
+  __isset.type = true;
+}
+
+void SchemaElement::__set_type_length(const int32_t val) {
+ this->type_length = val;
+  __isset.type_length = true;
+}
+
+void SchemaElement::__set_repetition_type(const FieldRepetitionType::type val) {
+ this->repetition_type = val;
+  __isset.repetition_type = true;
+}
+
+void SchemaElement::__set_name(const std::string& val) {
+ this->name = val;
+}
+
+void SchemaElement::__set_num_children(const int32_t val) {
+ this->num_children = val;
+  __isset.num_children = true;
+}
+
+void SchemaElement::__set_converted_type(const ConvertedType::type val) {
+ this->converted_type = val;
+  __isset.converted_type = true;
+}
+
+void SchemaElement::__set_scale(const int32_t val) {
+ this->scale = val;
+  __isset.scale = true;
+}
+
+void SchemaElement::__set_precision(const int32_t val) {
+ this->precision = val;
+  __isset.precision = true;
+}
+
+void SchemaElement::__set_field_id(const int32_t val) {
+ this->field_id = val;
+  __isset.field_id = true;
+}
+
+void SchemaElement::__set_logicalType(const LogicalType& val) {
+ this->logicalType = val;
+  __isset.logicalType = true;
+}
+std::ostream& operator<<(std::ostream& out, const SchemaElement& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t SchemaElement::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_name = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast38;
+ xfer += iprot->readI32(ecast38);
+ this->type = (Type::type)ecast38;
+ this->__isset.type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->type_length);
+ this->__isset.type_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast39;
+ xfer += iprot->readI32(ecast39);
+ this->repetition_type = (FieldRepetitionType::type)ecast39;
+ this->__isset.repetition_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->name);
+ isset_name = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_children);
+ this->__isset.num_children = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast40;
+ xfer += iprot->readI32(ecast40);
+ this->converted_type = (ConvertedType::type)ecast40;
+ this->__isset.converted_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->scale);
+ this->__isset.scale = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->precision);
+ this->__isset.precision = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 9:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->field_id);
+ this->__isset.field_id = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 10:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->logicalType.read(iprot);
+ this->__isset.logicalType = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_name)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t SchemaElement::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("SchemaElement");
+
+ if (this->__isset.type) {
+ xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32((int32_t)this->type);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.type_length) {
+ xfer += oprot->writeFieldBegin("type_length", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->type_length);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.repetition_type) {
+ xfer += oprot->writeFieldBegin("repetition_type", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32((int32_t)this->repetition_type);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldBegin("name", ::apache::thrift::protocol::T_STRING, 4);
+ xfer += oprot->writeString(this->name);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.num_children) {
+ xfer += oprot->writeFieldBegin("num_children", ::apache::thrift::protocol::T_I32, 5);
+ xfer += oprot->writeI32(this->num_children);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.converted_type) {
+ xfer += oprot->writeFieldBegin("converted_type", ::apache::thrift::protocol::T_I32, 6);
+ xfer += oprot->writeI32((int32_t)this->converted_type);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.scale) {
+ xfer += oprot->writeFieldBegin("scale", ::apache::thrift::protocol::T_I32, 7);
+ xfer += oprot->writeI32(this->scale);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.precision) {
+ xfer += oprot->writeFieldBegin("precision", ::apache::thrift::protocol::T_I32, 8);
+ xfer += oprot->writeI32(this->precision);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.field_id) {
+ xfer += oprot->writeFieldBegin("field_id", ::apache::thrift::protocol::T_I32, 9);
+ xfer += oprot->writeI32(this->field_id);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.logicalType) {
+ xfer += oprot->writeFieldBegin("logicalType", ::apache::thrift::protocol::T_STRUCT, 10);
+ xfer += this->logicalType.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(SchemaElement &a, SchemaElement &b) {
+ using ::std::swap;
+ swap(a.type, b.type);
+ swap(a.type_length, b.type_length);
+ swap(a.repetition_type, b.repetition_type);
+ swap(a.name, b.name);
+ swap(a.num_children, b.num_children);
+ swap(a.converted_type, b.converted_type);
+ swap(a.scale, b.scale);
+ swap(a.precision, b.precision);
+ swap(a.field_id, b.field_id);
+ swap(a.logicalType, b.logicalType);
+ swap(a.__isset, b.__isset);
+}
+
+SchemaElement::SchemaElement(const SchemaElement& other41) {
+ type = other41.type;
+ type_length = other41.type_length;
+ repetition_type = other41.repetition_type;
+ name = other41.name;
+ num_children = other41.num_children;
+ converted_type = other41.converted_type;
+ scale = other41.scale;
+ precision = other41.precision;
+ field_id = other41.field_id;
+ logicalType = other41.logicalType;
+ __isset = other41.__isset;
+}
+SchemaElement& SchemaElement::operator=(const SchemaElement& other42) {
+ type = other42.type;
+ type_length = other42.type_length;
+ repetition_type = other42.repetition_type;
+ name = other42.name;
+ num_children = other42.num_children;
+ converted_type = other42.converted_type;
+ scale = other42.scale;
+ precision = other42.precision;
+ field_id = other42.field_id;
+ logicalType = other42.logicalType;
+ __isset = other42.__isset;
+ return *this;
+}
+void SchemaElement::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "SchemaElement(";
+ out << "type="; (__isset.type ? (out << to_string(type)) : (out << "<null>"));
+ out << ", " << "type_length="; (__isset.type_length ? (out << to_string(type_length)) : (out << "<null>"));
+ out << ", " << "repetition_type="; (__isset.repetition_type ? (out << to_string(repetition_type)) : (out << "<null>"));
+ out << ", " << "name=" << to_string(name);
+ out << ", " << "num_children="; (__isset.num_children ? (out << to_string(num_children)) : (out << "<null>"));
+ out << ", " << "converted_type="; (__isset.converted_type ? (out << to_string(converted_type)) : (out << "<null>"));
+ out << ", " << "scale="; (__isset.scale ? (out << to_string(scale)) : (out << "<null>"));
+ out << ", " << "precision="; (__isset.precision ? (out << to_string(precision)) : (out << "<null>"));
+ out << ", " << "field_id="; (__isset.field_id ? (out << to_string(field_id)) : (out << "<null>"));
+ out << ", " << "logicalType="; (__isset.logicalType ? (out << to_string(logicalType)) : (out << "<null>"));
+ out << ")";
+}
+
+
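+// DataPageHeader (V1 pages): num_values, the value encoding and both level
+// encodings are required; statistics are optional.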
+DataPageHeader::~DataPageHeader() noexcept {
+}
+
+
+void DataPageHeader::__set_num_values(const int32_t val) {
+ this->num_values = val;
+}
+
+void DataPageHeader::__set_encoding(const Encoding::type val) {
+ this->encoding = val;
+}
+
+void DataPageHeader::__set_definition_level_encoding(const Encoding::type val) {
+ this->definition_level_encoding = val;
+}
+
+void DataPageHeader::__set_repetition_level_encoding(const Encoding::type val) {
+ this->repetition_level_encoding = val;
+}
+
+void DataPageHeader::__set_statistics(const Statistics& val) {
+ this->statistics = val;
+  __isset.statistics = true;
+}
+std::ostream& operator<<(std::ostream& out, const DataPageHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t DataPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_num_values = false;
+ bool isset_encoding = false;
+ bool isset_definition_level_encoding = false;
+ bool isset_repetition_level_encoding = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_values);
+ isset_num_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast43;
+ xfer += iprot->readI32(ecast43);
+ this->encoding = (Encoding::type)ecast43;
+ isset_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast44;
+ xfer += iprot->readI32(ecast44);
+ this->definition_level_encoding = (Encoding::type)ecast44;
+ isset_definition_level_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast45;
+ xfer += iprot->readI32(ecast45);
+ this->repetition_level_encoding = (Encoding::type)ecast45;
+ isset_repetition_level_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->statistics.read(iprot);
+ this->__isset.statistics = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_num_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_definition_level_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_repetition_level_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t DataPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DataPageHeader");
+
+ xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->num_values);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32((int32_t)this->encoding);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("definition_level_encoding", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32((int32_t)this->definition_level_encoding);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("repetition_level_encoding", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32((int32_t)this->repetition_level_encoding);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.statistics) {
+ xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 5);
+ xfer += this->statistics.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DataPageHeader &a, DataPageHeader &b) {
+ using ::std::swap;
+ swap(a.num_values, b.num_values);
+ swap(a.encoding, b.encoding);
+ swap(a.definition_level_encoding, b.definition_level_encoding);
+ swap(a.repetition_level_encoding, b.repetition_level_encoding);
+ swap(a.statistics, b.statistics);
+ swap(a.__isset, b.__isset);
+}
+
+DataPageHeader::DataPageHeader(const DataPageHeader& other46) {
+ num_values = other46.num_values;
+ encoding = other46.encoding;
+ definition_level_encoding = other46.definition_level_encoding;
+ repetition_level_encoding = other46.repetition_level_encoding;
+ statistics = other46.statistics;
+ __isset = other46.__isset;
+}
+DataPageHeader& DataPageHeader::operator=(const DataPageHeader& other47) {
+ num_values = other47.num_values;
+ encoding = other47.encoding;
+ definition_level_encoding = other47.definition_level_encoding;
+ repetition_level_encoding = other47.repetition_level_encoding;
+ statistics = other47.statistics;
+ __isset = other47.__isset;
+ return *this;
+}
+void DataPageHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DataPageHeader(";
+ out << "num_values=" << to_string(num_values);
+ out << ", " << "encoding=" << to_string(encoding);
+ out << ", " << "definition_level_encoding=" << to_string(definition_level_encoding);
+ out << ", " << "repetition_level_encoding=" << to_string(repetition_level_encoding);
+ out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
+ out << ")";
+}
+
+
+IndexPageHeader::~IndexPageHeader() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const IndexPageHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t IndexPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t IndexPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("IndexPageHeader");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(IndexPageHeader &a, IndexPageHeader &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+IndexPageHeader::IndexPageHeader(const IndexPageHeader& other48) {
+ (void) other48;
+}
+IndexPageHeader& IndexPageHeader::operator=(const IndexPageHeader& other49) {
+ (void) other49;
+ return *this;
+}
+void IndexPageHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "IndexPageHeader(";
+ out << ")";
+}
+
+
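+// DictionaryPageHeader: num_values and encoding are required; is_sorted is
+// optional and only serialized when explicitly set.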
+DictionaryPageHeader::~DictionaryPageHeader() noexcept {
+}
+
+
+void DictionaryPageHeader::__set_num_values(const int32_t val) {
+ this->num_values = val;
+}
+
+void DictionaryPageHeader::__set_encoding(const Encoding::type val) {
+ this->encoding = val;
+}
+
+void DictionaryPageHeader::__set_is_sorted(const bool val) {
+ this->is_sorted = val;
+  __isset.is_sorted = true;
+}
+std::ostream& operator<<(std::ostream& out, const DictionaryPageHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t DictionaryPageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_num_values = false;
+ bool isset_encoding = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_values);
+ isset_num_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast50;
+ xfer += iprot->readI32(ecast50);
+ this->encoding = (Encoding::type)ecast50;
+ isset_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->is_sorted);
+ this->__isset.is_sorted = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_num_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t DictionaryPageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DictionaryPageHeader");
+
+ xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->num_values);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32((int32_t)this->encoding);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.is_sorted) {
+ xfer += oprot->writeFieldBegin("is_sorted", ::apache::thrift::protocol::T_BOOL, 3);
+ xfer += oprot->writeBool(this->is_sorted);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DictionaryPageHeader &a, DictionaryPageHeader &b) {
+ using ::std::swap;
+ swap(a.num_values, b.num_values);
+ swap(a.encoding, b.encoding);
+ swap(a.is_sorted, b.is_sorted);
+ swap(a.__isset, b.__isset);
+}
+
+DictionaryPageHeader::DictionaryPageHeader(const DictionaryPageHeader& other51) {
+ num_values = other51.num_values;
+ encoding = other51.encoding;
+ is_sorted = other51.is_sorted;
+ __isset = other51.__isset;
+}
+DictionaryPageHeader& DictionaryPageHeader::operator=(const DictionaryPageHeader& other52) {
+ num_values = other52.num_values;
+ encoding = other52.encoding;
+ is_sorted = other52.is_sorted;
+ __isset = other52.__isset;
+ return *this;
+}
+void DictionaryPageHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DictionaryPageHeader(";
+ out << "num_values=" << to_string(num_values);
+ out << ", " << "encoding=" << to_string(encoding);
+ out << ", " << "is_sorted="; (__isset.is_sorted ? (out << to_string(is_sorted)) : (out << "<null>"));
+ out << ")";
+}
+
+
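+// DataPageHeaderV2 extends the V1 header with null/row counts and the byte
+// lengths of the definition/repetition level sections (levels are stored
+// uncompressed in V2 pages); is_compressed and statistics stay optional.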
+DataPageHeaderV2::~DataPageHeaderV2() noexcept {
+}
+
+
+void DataPageHeaderV2::__set_num_values(const int32_t val) {
+ this->num_values = val;
+}
+
+void DataPageHeaderV2::__set_num_nulls(const int32_t val) {
+ this->num_nulls = val;
+}
+
+void DataPageHeaderV2::__set_num_rows(const int32_t val) {
+ this->num_rows = val;
+}
+
+void DataPageHeaderV2::__set_encoding(const Encoding::type val) {
+ this->encoding = val;
+}
+
+void DataPageHeaderV2::__set_definition_levels_byte_length(const int32_t val) {
+ this->definition_levels_byte_length = val;
+}
+
+void DataPageHeaderV2::__set_repetition_levels_byte_length(const int32_t val) {
+ this->repetition_levels_byte_length = val;
+}
+
+void DataPageHeaderV2::__set_is_compressed(const bool val) {
+ this->is_compressed = val;
+ __isset.is_compressed = true;
+}
+
+void DataPageHeaderV2::__set_statistics(const Statistics& val) {
+ this->statistics = val;
+ __isset.statistics = true;
+}
+std::ostream& operator<<(std::ostream& out, const DataPageHeaderV2& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t DataPageHeaderV2::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_num_values = false;
+ bool isset_num_nulls = false;
+ bool isset_num_rows = false;
+ bool isset_encoding = false;
+ bool isset_definition_levels_byte_length = false;
+ bool isset_repetition_levels_byte_length = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_values);
+ isset_num_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_nulls);
+ isset_num_nulls = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->num_rows);
+ isset_num_rows = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast53;
+ xfer += iprot->readI32(ecast53);
+ this->encoding = (Encoding::type)ecast53;
+ isset_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->definition_levels_byte_length);
+ isset_definition_levels_byte_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->repetition_levels_byte_length);
+ isset_repetition_levels_byte_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->is_compressed);
+ this->__isset.is_compressed = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->statistics.read(iprot);
+ this->__isset.statistics = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_num_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_nulls)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_rows)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_definition_levels_byte_length)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_repetition_levels_byte_length)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t DataPageHeaderV2::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("DataPageHeaderV2");
+
+ xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->num_values);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_nulls", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->num_nulls);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32(this->num_rows);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32((int32_t)this->encoding);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("definition_levels_byte_length", ::apache::thrift::protocol::T_I32, 5);
+ xfer += oprot->writeI32(this->definition_levels_byte_length);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("repetition_levels_byte_length", ::apache::thrift::protocol::T_I32, 6);
+ xfer += oprot->writeI32(this->repetition_levels_byte_length);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.is_compressed) {
+ xfer += oprot->writeFieldBegin("is_compressed", ::apache::thrift::protocol::T_BOOL, 7);
+ xfer += oprot->writeBool(this->is_compressed);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.statistics) {
+ xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->statistics.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b) {
+ using ::std::swap;
+ swap(a.num_values, b.num_values);
+ swap(a.num_nulls, b.num_nulls);
+ swap(a.num_rows, b.num_rows);
+ swap(a.encoding, b.encoding);
+ swap(a.definition_levels_byte_length, b.definition_levels_byte_length);
+ swap(a.repetition_levels_byte_length, b.repetition_levels_byte_length);
+ swap(a.is_compressed, b.is_compressed);
+ swap(a.statistics, b.statistics);
+ swap(a.__isset, b.__isset);
+}
+
+DataPageHeaderV2::DataPageHeaderV2(const DataPageHeaderV2& other54) {
+ num_values = other54.num_values;
+ num_nulls = other54.num_nulls;
+ num_rows = other54.num_rows;
+ encoding = other54.encoding;
+ definition_levels_byte_length = other54.definition_levels_byte_length;
+ repetition_levels_byte_length = other54.repetition_levels_byte_length;
+ is_compressed = other54.is_compressed;
+ statistics = other54.statistics;
+ __isset = other54.__isset;
+}
+DataPageHeaderV2& DataPageHeaderV2::operator=(const DataPageHeaderV2& other55) {
+ num_values = other55.num_values;
+ num_nulls = other55.num_nulls;
+ num_rows = other55.num_rows;
+ encoding = other55.encoding;
+ definition_levels_byte_length = other55.definition_levels_byte_length;
+ repetition_levels_byte_length = other55.repetition_levels_byte_length;
+ is_compressed = other55.is_compressed;
+ statistics = other55.statistics;
+ __isset = other55.__isset;
+ return *this;
+}
+void DataPageHeaderV2::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "DataPageHeaderV2(";
+ out << "num_values=" << to_string(num_values);
+ out << ", " << "num_nulls=" << to_string(num_nulls);
+ out << ", " << "num_rows=" << to_string(num_rows);
+ out << ", " << "encoding=" << to_string(encoding);
+ out << ", " << "definition_levels_byte_length=" << to_string(definition_levels_byte_length);
+ out << ", " << "repetition_levels_byte_length=" << to_string(repetition_levels_byte_length);
+ out << ", " << "is_compressed="; (__isset.is_compressed ? (out << to_string(is_compressed)) : (out << "<null>"));
+ out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
+ out << ")";
+}
+
+
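+// SplitBlockAlgorithm carries no fields: read() skips everything until T_STOP
+// and write() emits an empty struct.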
+SplitBlockAlgorithm::~SplitBlockAlgorithm() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const SplitBlockAlgorithm& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t SplitBlockAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t SplitBlockAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("SplitBlockAlgorithm");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(SplitBlockAlgorithm &a, SplitBlockAlgorithm &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+SplitBlockAlgorithm::SplitBlockAlgorithm(const SplitBlockAlgorithm& other56) {
+ (void) other56;
+}
+SplitBlockAlgorithm& SplitBlockAlgorithm::operator=(const SplitBlockAlgorithm& other57) {
+ (void) other57;
+ return *this;
+}
+void SplitBlockAlgorithm::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "SplitBlockAlgorithm(";
+ out << ")";
+}
+
+
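+// BloomFilterAlgorithm behaves as a tagged union with a single variant,
+// BLOCK (1); __isset.BLOCK records whether the variant was present.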
+BloomFilterAlgorithm::~BloomFilterAlgorithm() noexcept {
+}
+
+
+void BloomFilterAlgorithm::__set_BLOCK(const SplitBlockAlgorithm& val) {
+ this->BLOCK = val;
+ __isset.BLOCK = true;
+}
+std::ostream& operator<<(std::ostream& out, const BloomFilterAlgorithm& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BloomFilterAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->BLOCK.read(iprot);
+ this->__isset.BLOCK = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t BloomFilterAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BloomFilterAlgorithm");
+
+ if (this->__isset.BLOCK) {
+ xfer += oprot->writeFieldBegin("BLOCK", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->BLOCK.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BloomFilterAlgorithm &a, BloomFilterAlgorithm &b) {
+ using ::std::swap;
+ swap(a.BLOCK, b.BLOCK);
+ swap(a.__isset, b.__isset);
+}
+
+BloomFilterAlgorithm::BloomFilterAlgorithm(const BloomFilterAlgorithm& other58) {
+ BLOCK = other58.BLOCK;
+ __isset = other58.__isset;
+}
+BloomFilterAlgorithm& BloomFilterAlgorithm::operator=(const BloomFilterAlgorithm& other59) {
+ BLOCK = other59.BLOCK;
+ __isset = other59.__isset;
+ return *this;
+}
+void BloomFilterAlgorithm::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BloomFilterAlgorithm(";
+ out << "BLOCK="; (__isset.BLOCK ? (out << to_string(BLOCK)) : (out << "<null>"));
+ out << ")";
+}
+
+
+XxHash::~XxHash() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const XxHash& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t XxHash::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t XxHash::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("XxHash");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(XxHash &a, XxHash &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+XxHash::XxHash(const XxHash& other60) {
+ (void) other60;
+}
+XxHash& XxHash::operator=(const XxHash& other61) {
+ (void) other61;
+ return *this;
+}
+void XxHash::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "XxHash(";
+ out << ")";
+}
+
+
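+// BloomFilterHash: same single-variant union shape, with XXHASH (1).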
+BloomFilterHash::~BloomFilterHash() noexcept {
+}
+
+
+void BloomFilterHash::__set_XXHASH(const XxHash& val) {
+ this->XXHASH = val;
+ __isset.XXHASH = true;
+}
+std::ostream& operator<<(std::ostream& out, const BloomFilterHash& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BloomFilterHash::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->XXHASH.read(iprot);
+ this->__isset.XXHASH = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t BloomFilterHash::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BloomFilterHash");
+
+ if (this->__isset.XXHASH) {
+ xfer += oprot->writeFieldBegin("XXHASH", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->XXHASH.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BloomFilterHash &a, BloomFilterHash &b) {
+ using ::std::swap;
+ swap(a.XXHASH, b.XXHASH);
+ swap(a.__isset, b.__isset);
+}
+
+BloomFilterHash::BloomFilterHash(const BloomFilterHash& other62) {
+ XXHASH = other62.XXHASH;
+ __isset = other62.__isset;
+}
+BloomFilterHash& BloomFilterHash::operator=(const BloomFilterHash& other63) {
+ XXHASH = other63.XXHASH;
+ __isset = other63.__isset;
+ return *this;
+}
+void BloomFilterHash::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BloomFilterHash(";
+ out << "XXHASH="; (__isset.XXHASH ? (out << to_string(XXHASH)) : (out << "<null>"));
+ out << ")";
+}
+
+
+Uncompressed::~Uncompressed() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const Uncompressed& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t Uncompressed::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t Uncompressed::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("Uncompressed");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(Uncompressed &a, Uncompressed &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+Uncompressed::Uncompressed(const Uncompressed& other64) {
+ (void) other64;
+}
+Uncompressed& Uncompressed::operator=(const Uncompressed& other65) {
+ (void) other65;
+ return *this;
+}
+void Uncompressed::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "Uncompressed(";
+ out << ")";
+}
+
+
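+// BloomFilterCompression: same single-variant union shape, with
+// UNCOMPRESSED (1).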
+BloomFilterCompression::~BloomFilterCompression() noexcept {
+}
+
+
+void BloomFilterCompression::__set_UNCOMPRESSED(const Uncompressed& val) {
+ this->UNCOMPRESSED = val;
+ __isset.UNCOMPRESSED = true;
+}
+std::ostream& operator<<(std::ostream& out, const BloomFilterCompression& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BloomFilterCompression::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->UNCOMPRESSED.read(iprot);
+ this->__isset.UNCOMPRESSED = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t BloomFilterCompression::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BloomFilterCompression");
+
+ if (this->__isset.UNCOMPRESSED) {
+ xfer += oprot->writeFieldBegin("UNCOMPRESSED", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->UNCOMPRESSED.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BloomFilterCompression &a, BloomFilterCompression &b) {
+ using ::std::swap;
+ swap(a.UNCOMPRESSED, b.UNCOMPRESSED);
+ swap(a.__isset, b.__isset);
+}
+
+BloomFilterCompression::BloomFilterCompression(const BloomFilterCompression& other66) {
+ UNCOMPRESSED = other66.UNCOMPRESSED;
+ __isset = other66.__isset;
+}
+BloomFilterCompression& BloomFilterCompression::operator=(const BloomFilterCompression& other67) {
+ UNCOMPRESSED = other67.UNCOMPRESSED;
+ __isset = other67.__isset;
+ return *this;
+}
+void BloomFilterCompression::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BloomFilterCompression(";
+ out << "UNCOMPRESSED="; (__isset.UNCOMPRESSED ? (out << to_string(UNCOMPRESSED)) : (out << "<null>"));
+ out << ")";
+}
+
+
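+// BloomFilterHeader: all four fields (numBytes, algorithm, hash, compression)
+// are required, so there is no __isset state to copy or swap.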
+BloomFilterHeader::~BloomFilterHeader() noexcept {
+}
+
+
+void BloomFilterHeader::__set_numBytes(const int32_t val) {
+ this->numBytes = val;
+}
+
+void BloomFilterHeader::__set_algorithm(const BloomFilterAlgorithm& val) {
+ this->algorithm = val;
+}
+
+void BloomFilterHeader::__set_hash(const BloomFilterHash& val) {
+ this->hash = val;
+}
+
+void BloomFilterHeader::__set_compression(const BloomFilterCompression& val) {
+ this->compression = val;
+}
+std::ostream& operator<<(std::ostream& out, const BloomFilterHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t BloomFilterHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_numBytes = false;
+ bool isset_algorithm = false;
+ bool isset_hash = false;
+ bool isset_compression = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->numBytes);
+ isset_numBytes = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->algorithm.read(iprot);
+ isset_algorithm = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->hash.read(iprot);
+ isset_hash = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->compression.read(iprot);
+ isset_compression = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_numBytes)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_algorithm)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_hash)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_compression)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t BloomFilterHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("BloomFilterHeader");
+
+ xfer += oprot->writeFieldBegin("numBytes", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->numBytes);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("algorithm", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->algorithm.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("hash", ::apache::thrift::protocol::T_STRUCT, 3);
+ xfer += this->hash.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("compression", ::apache::thrift::protocol::T_STRUCT, 4);
+ xfer += this->compression.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(BloomFilterHeader &a, BloomFilterHeader &b) {
+ using ::std::swap;
+ swap(a.numBytes, b.numBytes);
+ swap(a.algorithm, b.algorithm);
+ swap(a.hash, b.hash);
+ swap(a.compression, b.compression);
+}
+
+BloomFilterHeader::BloomFilterHeader(const BloomFilterHeader& other68) {
+ numBytes = other68.numBytes;
+ algorithm = other68.algorithm;
+ hash = other68.hash;
+ compression = other68.compression;
+}
+BloomFilterHeader& BloomFilterHeader::operator=(const BloomFilterHeader& other69) {
+ numBytes = other69.numBytes;
+ algorithm = other69.algorithm;
+ hash = other69.hash;
+ compression = other69.compression;
+ return *this;
+}
+void BloomFilterHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "BloomFilterHeader(";
+ out << "numBytes=" << to_string(numBytes);
+ out << ", " << "algorithm=" << to_string(algorithm);
+ out << ", " << "hash=" << to_string(hash);
+ out << ", " << "compression=" << to_string(compression);
+ out << ")";
+}
+
+
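+// PageHeader: type (1) and the two page sizes (2, 3) are required; crc (4) and
+// the per-page-type headers (5-8) are optional. By convention the variant that
+// is populated matches `type`.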
+PageHeader::~PageHeader() noexcept {
+}
+
+
+void PageHeader::__set_type(const PageType::type val) {
+ this->type = val;
+}
+
+void PageHeader::__set_uncompressed_page_size(const int32_t val) {
+ this->uncompressed_page_size = val;
+}
+
+void PageHeader::__set_compressed_page_size(const int32_t val) {
+ this->compressed_page_size = val;
+}
+
+void PageHeader::__set_crc(const int32_t val) {
+ this->crc = val;
+ __isset.crc = true;
+}
+
+void PageHeader::__set_data_page_header(const DataPageHeader& val) {
+ this->data_page_header = val;
+ __isset.data_page_header = true;
+}
+
+void PageHeader::__set_index_page_header(const IndexPageHeader& val) {
+ this->index_page_header = val;
+ __isset.index_page_header = true;
+}
+
+void PageHeader::__set_dictionary_page_header(const DictionaryPageHeader& val) {
+ this->dictionary_page_header = val;
+ __isset.dictionary_page_header = true;
+}
+
+void PageHeader::__set_data_page_header_v2(const DataPageHeaderV2& val) {
+ this->data_page_header_v2 = val;
+ __isset.data_page_header_v2 = true;
+}
+std::ostream& operator<<(std::ostream& out, const PageHeader& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t PageHeader::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_type = false;
+ bool isset_uncompressed_page_size = false;
+ bool isset_compressed_page_size = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast70;
+ xfer += iprot->readI32(ecast70);
+ this->type = (PageType::type)ecast70;
+ isset_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->uncompressed_page_size);
+ isset_uncompressed_page_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->compressed_page_size);
+ isset_compressed_page_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->crc);
+ this->__isset.crc = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->data_page_header.read(iprot);
+ this->__isset.data_page_header = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->index_page_header.read(iprot);
+ this->__isset.index_page_header = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->dictionary_page_header.read(iprot);
+ this->__isset.dictionary_page_header = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->data_page_header_v2.read(iprot);
+ this->__isset.data_page_header_v2 = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_type)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_uncompressed_page_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_compressed_page_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t PageHeader::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("PageHeader");
+
+ xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32((int32_t)this->type);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("uncompressed_page_size", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->uncompressed_page_size);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("compressed_page_size", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32(this->compressed_page_size);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.crc) {
+ xfer += oprot->writeFieldBegin("crc", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32(this->crc);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.data_page_header) {
+ xfer += oprot->writeFieldBegin("data_page_header", ::apache::thrift::protocol::T_STRUCT, 5);
+ xfer += this->data_page_header.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.index_page_header) {
+ xfer += oprot->writeFieldBegin("index_page_header", ::apache::thrift::protocol::T_STRUCT, 6);
+ xfer += this->index_page_header.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.dictionary_page_header) {
+ xfer += oprot->writeFieldBegin("dictionary_page_header", ::apache::thrift::protocol::T_STRUCT, 7);
+ xfer += this->dictionary_page_header.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.data_page_header_v2) {
+ xfer += oprot->writeFieldBegin("data_page_header_v2", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->data_page_header_v2.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(PageHeader &a, PageHeader &b) {
+ using ::std::swap;
+ swap(a.type, b.type);
+ swap(a.uncompressed_page_size, b.uncompressed_page_size);
+ swap(a.compressed_page_size, b.compressed_page_size);
+ swap(a.crc, b.crc);
+ swap(a.data_page_header, b.data_page_header);
+ swap(a.index_page_header, b.index_page_header);
+ swap(a.dictionary_page_header, b.dictionary_page_header);
+ swap(a.data_page_header_v2, b.data_page_header_v2);
+ swap(a.__isset, b.__isset);
+}
+
+PageHeader::PageHeader(const PageHeader& other71) {
+ type = other71.type;
+ uncompressed_page_size = other71.uncompressed_page_size;
+ compressed_page_size = other71.compressed_page_size;
+ crc = other71.crc;
+ data_page_header = other71.data_page_header;
+ index_page_header = other71.index_page_header;
+ dictionary_page_header = other71.dictionary_page_header;
+ data_page_header_v2 = other71.data_page_header_v2;
+ __isset = other71.__isset;
+}
+PageHeader& PageHeader::operator=(const PageHeader& other72) {
+ type = other72.type;
+ uncompressed_page_size = other72.uncompressed_page_size;
+ compressed_page_size = other72.compressed_page_size;
+ crc = other72.crc;
+ data_page_header = other72.data_page_header;
+ index_page_header = other72.index_page_header;
+ dictionary_page_header = other72.dictionary_page_header;
+ data_page_header_v2 = other72.data_page_header_v2;
+ __isset = other72.__isset;
+ return *this;
+}
+void PageHeader::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "PageHeader(";
+ out << "type=" << to_string(type);
+ out << ", " << "uncompressed_page_size=" << to_string(uncompressed_page_size);
+ out << ", " << "compressed_page_size=" << to_string(compressed_page_size);
+ out << ", " << "crc="; (__isset.crc ? (out << to_string(crc)) : (out << "<null>"));
+ out << ", " << "data_page_header="; (__isset.data_page_header ? (out << to_string(data_page_header)) : (out << "<null>"));
+ out << ", " << "index_page_header="; (__isset.index_page_header ? (out << to_string(index_page_header)) : (out << "<null>"));
+ out << ", " << "dictionary_page_header="; (__isset.dictionary_page_header ? (out << to_string(dictionary_page_header)) : (out << "<null>"));
+ out << ", " << "data_page_header_v2="; (__isset.data_page_header_v2 ? (out << to_string(data_page_header_v2)) : (out << "<null>"));
+ out << ")";
+}
+
+
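+// KeyValue: application metadata pair; key (1) is required, value (2) optional.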
+KeyValue::~KeyValue() noexcept {
+}
+
+
+void KeyValue::__set_key(const std::string& val) {
+ this->key = val;
+}
+
+void KeyValue::__set_value(const std::string& val) {
+ this->value = val;
+ __isset.value = true;
+}
+std::ostream& operator<<(std::ostream& out, const KeyValue& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t KeyValue::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_key = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->key);
+ isset_key = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->value);
+ this->__isset.value = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_key)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t KeyValue::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("KeyValue");
+
+ xfer += oprot->writeFieldBegin("key", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeString(this->key);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.value) {
+ xfer += oprot->writeFieldBegin("value", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeString(this->value);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(KeyValue &a, KeyValue &b) {
+ using ::std::swap;
+ swap(a.key, b.key);
+ swap(a.value, b.value);
+ swap(a.__isset, b.__isset);
+}
+
+KeyValue::KeyValue(const KeyValue& other73) {
+ key = other73.key;
+ value = other73.value;
+ __isset = other73.__isset;
+}
+KeyValue& KeyValue::operator=(const KeyValue& other74) {
+ key = other74.key;
+ value = other74.value;
+ __isset = other74.__isset;
+ return *this;
+}
+void KeyValue::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "KeyValue(";
+ out << "key=" << to_string(key);
+ out << ", " << "value="; (__isset.value ? (out << to_string(value)) : (out << "<null>"));
+ out << ")";
+}
+
+
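+// SortingColumn: all three fields are required; swap() therefore exchanges the
+// members directly with no __isset bitset.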
+SortingColumn::~SortingColumn() noexcept {
+}
+
+
+void SortingColumn::__set_column_idx(const int32_t val) {
+ this->column_idx = val;
+}
+
+void SortingColumn::__set_descending(const bool val) {
+ this->descending = val;
+}
+
+void SortingColumn::__set_nulls_first(const bool val) {
+ this->nulls_first = val;
+}
+std::ostream& operator<<(std::ostream& out, const SortingColumn& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t SortingColumn::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_column_idx = false;
+ bool isset_descending = false;
+ bool isset_nulls_first = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->column_idx);
+ isset_column_idx = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->descending);
+ isset_descending = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->nulls_first);
+ isset_nulls_first = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_column_idx)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_descending)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_nulls_first)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t SortingColumn::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("SortingColumn");
+
+ xfer += oprot->writeFieldBegin("column_idx", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->column_idx);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("descending", ::apache::thrift::protocol::T_BOOL, 2);
+ xfer += oprot->writeBool(this->descending);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("nulls_first", ::apache::thrift::protocol::T_BOOL, 3);
+ xfer += oprot->writeBool(this->nulls_first);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(SortingColumn &a, SortingColumn &b) {
+ using ::std::swap;
+ swap(a.column_idx, b.column_idx);
+ swap(a.descending, b.descending);
+ swap(a.nulls_first, b.nulls_first);
+}
+
+SortingColumn::SortingColumn(const SortingColumn& other75) {
+ column_idx = other75.column_idx;
+ descending = other75.descending;
+ nulls_first = other75.nulls_first;
+}
+SortingColumn& SortingColumn::operator=(const SortingColumn& other76) {
+ column_idx = other76.column_idx;
+ descending = other76.descending;
+ nulls_first = other76.nulls_first;
+ return *this;
+}
+void SortingColumn::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "SortingColumn(";
+ out << "column_idx=" << to_string(column_idx);
+ out << ", " << "descending=" << to_string(descending);
+ out << ", " << "nulls_first=" << to_string(nulls_first);
+ out << ")";
+}
+
+
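+// PageEncodingStats: required (page_type, encoding, count) triple counting
+// pages per encoding.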
+PageEncodingStats::~PageEncodingStats() noexcept {
+}
+
+
+void PageEncodingStats::__set_page_type(const PageType::type val) {
+ this->page_type = val;
+}
+
+void PageEncodingStats::__set_encoding(const Encoding::type val) {
+ this->encoding = val;
+}
+
+void PageEncodingStats::__set_count(const int32_t val) {
+ this->count = val;
+}
+std::ostream& operator<<(std::ostream& out, const PageEncodingStats& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t PageEncodingStats::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_page_type = false;
+ bool isset_encoding = false;
+ bool isset_count = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast77;
+ xfer += iprot->readI32(ecast77);
+ this->page_type = (PageType::type)ecast77;
+ isset_page_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast78;
+ xfer += iprot->readI32(ecast78);
+ this->encoding = (Encoding::type)ecast78;
+ isset_encoding = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->count);
+ isset_count = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_page_type)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encoding)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_count)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t PageEncodingStats::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("PageEncodingStats");
+
+ xfer += oprot->writeFieldBegin("page_type", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32((int32_t)this->page_type);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encoding", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32((int32_t)this->encoding);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("count", ::apache::thrift::protocol::T_I32, 3);
+ xfer += oprot->writeI32(this->count);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(PageEncodingStats &a, PageEncodingStats &b) {
+ using ::std::swap;
+ swap(a.page_type, b.page_type);
+ swap(a.encoding, b.encoding);
+ swap(a.count, b.count);
+}
+
+PageEncodingStats::PageEncodingStats(const PageEncodingStats& other79) {
+ page_type = other79.page_type;
+ encoding = other79.encoding;
+ count = other79.count;
+}
+PageEncodingStats& PageEncodingStats::operator=(const PageEncodingStats& other80) {
+ page_type = other80.page_type;
+ encoding = other80.encoding;
+ count = other80.count;
+ return *this;
+}
+void PageEncodingStats::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "PageEncodingStats(";
+ out << "page_type=" << to_string(page_type);
+ out << ", " << "encoding=" << to_string(encoding);
+ out << ", " << "count=" << to_string(count);
+ out << ")";
+}
+
+
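+// ColumnMetaData: fields 1-7 plus data_page_offset (9) are required; the other
+// offsets, statistics, and list fields are optional. List fields are read with
+// readListBegin/resize/readListEnd and written element by element.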
+ColumnMetaData::~ColumnMetaData() noexcept {
+}
+
+
+void ColumnMetaData::__set_type(const Type::type val) {
+ this->type = val;
+}
+
+void ColumnMetaData::__set_encodings(const std::vector<Encoding::type> & val) {
+ this->encodings = val;
+}
+
+void ColumnMetaData::__set_path_in_schema(const std::vector<std::string> & val) {
+ this->path_in_schema = val;
+}
+
+void ColumnMetaData::__set_codec(const CompressionCodec::type val) {
+ this->codec = val;
+}
+
+void ColumnMetaData::__set_num_values(const int64_t val) {
+ this->num_values = val;
+}
+
+void ColumnMetaData::__set_total_uncompressed_size(const int64_t val) {
+ this->total_uncompressed_size = val;
+}
+
+void ColumnMetaData::__set_total_compressed_size(const int64_t val) {
+ this->total_compressed_size = val;
+}
+
+void ColumnMetaData::__set_key_value_metadata(const std::vector<KeyValue> & val) {
+ this->key_value_metadata = val;
+ __isset.key_value_metadata = true;
+}
+
+void ColumnMetaData::__set_data_page_offset(const int64_t val) {
+ this->data_page_offset = val;
+}
+
+void ColumnMetaData::__set_index_page_offset(const int64_t val) {
+ this->index_page_offset = val;
+ __isset.index_page_offset = true;
+}
+
+void ColumnMetaData::__set_dictionary_page_offset(const int64_t val) {
+ this->dictionary_page_offset = val;
+ __isset.dictionary_page_offset = true;
+}
+
+void ColumnMetaData::__set_statistics(const Statistics& val) {
+ this->statistics = val;
+ __isset.statistics = true;
+}
+
+void ColumnMetaData::__set_encoding_stats(const std::vector<PageEncodingStats> & val) {
+ this->encoding_stats = val;
+ __isset.encoding_stats = true;
+}
+
+void ColumnMetaData::__set_bloom_filter_offset(const int64_t val) {
+ this->bloom_filter_offset = val;
+ __isset.bloom_filter_offset = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnMetaData& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ColumnMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_type = false;
+ bool isset_encodings = false;
+ bool isset_path_in_schema = false;
+ bool isset_codec = false;
+ bool isset_num_values = false;
+ bool isset_total_uncompressed_size = false;
+ bool isset_total_compressed_size = false;
+ bool isset_data_page_offset = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast81;
+ xfer += iprot->readI32(ecast81);
+ this->type = (Type::type)ecast81;
+ isset_type = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->encodings.clear();
+ uint32_t _size82;
+ ::apache::thrift::protocol::TType _etype85;
+ xfer += iprot->readListBegin(_etype85, _size82);
+ this->encodings.resize(_size82);
+ uint32_t _i86;
+ for (_i86 = 0; _i86 < _size82; ++_i86)
+ {
+ int32_t ecast87;
+ xfer += iprot->readI32(ecast87);
+ this->encodings[_i86] = (Encoding::type)ecast87;
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_encodings = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->path_in_schema.clear();
+ uint32_t _size88;
+ ::apache::thrift::protocol::TType _etype91;
+ xfer += iprot->readListBegin(_etype91, _size88);
+ this->path_in_schema.resize(_size88);
+ uint32_t _i92;
+ for (_i92 = 0; _i92 < _size88; ++_i92)
+ {
+ xfer += iprot->readString(this->path_in_schema[_i92]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_path_in_schema = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast93;
+ xfer += iprot->readI32(ecast93);
+ this->codec = (CompressionCodec::type)ecast93;
+ isset_codec = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->num_values);
+ isset_num_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->total_uncompressed_size);
+ isset_total_uncompressed_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->total_compressed_size);
+ isset_total_compressed_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->key_value_metadata.clear();
+ uint32_t _size94;
+ ::apache::thrift::protocol::TType _etype97;
+ xfer += iprot->readListBegin(_etype97, _size94);
+ this->key_value_metadata.resize(_size94);
+ uint32_t _i98;
+ for (_i98 = 0; _i98 < _size94; ++_i98)
+ {
+ xfer += this->key_value_metadata[_i98].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.key_value_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 9:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->data_page_offset);
+ isset_data_page_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 10:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->index_page_offset);
+ this->__isset.index_page_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 11:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->dictionary_page_offset);
+ this->__isset.dictionary_page_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 12:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->statistics.read(iprot);
+ this->__isset.statistics = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 13:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->encoding_stats.clear();
+ uint32_t _size99;
+ ::apache::thrift::protocol::TType _etype102;
+ xfer += iprot->readListBegin(_etype102, _size99);
+ this->encoding_stats.resize(_size99);
+ uint32_t _i103;
+ for (_i103 = 0; _i103 < _size99; ++_i103)
+ {
+ xfer += this->encoding_stats[_i103].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.encoding_stats = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 14:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->bloom_filter_offset);
+ this->__isset.bloom_filter_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_type)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_encodings)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_path_in_schema)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_codec)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_total_uncompressed_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_total_compressed_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_data_page_offset)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t ColumnMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnMetaData");
+
+ xfer += oprot->writeFieldBegin("type", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32((int32_t)this->type);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("encodings", ::apache::thrift::protocol::T_LIST, 2);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I32, static_cast<uint32_t>(this->encodings.size()));
+ std::vector<Encoding::type> ::const_iterator _iter104;
+ for (_iter104 = this->encodings.begin(); _iter104 != this->encodings.end(); ++_iter104)
+ {
+ xfer += oprot->writeI32((int32_t)(*_iter104));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 3);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->path_in_schema.size()));
+ std::vector<std::string> ::const_iterator _iter105;
+ for (_iter105 = this->path_in_schema.begin(); _iter105 != this->path_in_schema.end(); ++_iter105)
+ {
+ xfer += oprot->writeString((*_iter105));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("codec", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32((int32_t)this->codec);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_values", ::apache::thrift::protocol::T_I64, 5);
+ xfer += oprot->writeI64(this->num_values);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("total_uncompressed_size", ::apache::thrift::protocol::T_I64, 6);
+ xfer += oprot->writeI64(this->total_uncompressed_size);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("total_compressed_size", ::apache::thrift::protocol::T_I64, 7);
+ xfer += oprot->writeI64(this->total_compressed_size);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.key_value_metadata) {
+ xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 8);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->key_value_metadata.size()));
+ std::vector<KeyValue> ::const_iterator _iter106;
+ for (_iter106 = this->key_value_metadata.begin(); _iter106 != this->key_value_metadata.end(); ++_iter106)
+ {
+ xfer += (*_iter106).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldBegin("data_page_offset", ::apache::thrift::protocol::T_I64, 9);
+ xfer += oprot->writeI64(this->data_page_offset);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.index_page_offset) {
+ xfer += oprot->writeFieldBegin("index_page_offset", ::apache::thrift::protocol::T_I64, 10);
+ xfer += oprot->writeI64(this->index_page_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.dictionary_page_offset) {
+ xfer += oprot->writeFieldBegin("dictionary_page_offset", ::apache::thrift::protocol::T_I64, 11);
+ xfer += oprot->writeI64(this->dictionary_page_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.statistics) {
+ xfer += oprot->writeFieldBegin("statistics", ::apache::thrift::protocol::T_STRUCT, 12);
+ xfer += this->statistics.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.encoding_stats) {
+ xfer += oprot->writeFieldBegin("encoding_stats", ::apache::thrift::protocol::T_LIST, 13);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->encoding_stats.size()));
+ std::vector<PageEncodingStats> ::const_iterator _iter107;
+ for (_iter107 = this->encoding_stats.begin(); _iter107 != this->encoding_stats.end(); ++_iter107)
+ {
+ xfer += (*_iter107).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.bloom_filter_offset) {
+ xfer += oprot->writeFieldBegin("bloom_filter_offset", ::apache::thrift::protocol::T_I64, 14);
+ xfer += oprot->writeI64(this->bloom_filter_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnMetaData &a, ColumnMetaData &b) {
+ using ::std::swap;
+ swap(a.type, b.type);
+ swap(a.encodings, b.encodings);
+ swap(a.path_in_schema, b.path_in_schema);
+ swap(a.codec, b.codec);
+ swap(a.num_values, b.num_values);
+ swap(a.total_uncompressed_size, b.total_uncompressed_size);
+ swap(a.total_compressed_size, b.total_compressed_size);
+ swap(a.key_value_metadata, b.key_value_metadata);
+ swap(a.data_page_offset, b.data_page_offset);
+ swap(a.index_page_offset, b.index_page_offset);
+ swap(a.dictionary_page_offset, b.dictionary_page_offset);
+ swap(a.statistics, b.statistics);
+ swap(a.encoding_stats, b.encoding_stats);
+ swap(a.bloom_filter_offset, b.bloom_filter_offset);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnMetaData::ColumnMetaData(const ColumnMetaData& other108) {
+ type = other108.type;
+ encodings = other108.encodings;
+ path_in_schema = other108.path_in_schema;
+ codec = other108.codec;
+ num_values = other108.num_values;
+ total_uncompressed_size = other108.total_uncompressed_size;
+ total_compressed_size = other108.total_compressed_size;
+ key_value_metadata = other108.key_value_metadata;
+ data_page_offset = other108.data_page_offset;
+ index_page_offset = other108.index_page_offset;
+ dictionary_page_offset = other108.dictionary_page_offset;
+ statistics = other108.statistics;
+ encoding_stats = other108.encoding_stats;
+ bloom_filter_offset = other108.bloom_filter_offset;
+ __isset = other108.__isset;
+}
+ColumnMetaData& ColumnMetaData::operator=(const ColumnMetaData& other109) {
+ type = other109.type;
+ encodings = other109.encodings;
+ path_in_schema = other109.path_in_schema;
+ codec = other109.codec;
+ num_values = other109.num_values;
+ total_uncompressed_size = other109.total_uncompressed_size;
+ total_compressed_size = other109.total_compressed_size;
+ key_value_metadata = other109.key_value_metadata;
+ data_page_offset = other109.data_page_offset;
+ index_page_offset = other109.index_page_offset;
+ dictionary_page_offset = other109.dictionary_page_offset;
+ statistics = other109.statistics;
+ encoding_stats = other109.encoding_stats;
+ bloom_filter_offset = other109.bloom_filter_offset;
+ __isset = other109.__isset;
+ return *this;
+}
+void ColumnMetaData::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnMetaData(";
+ out << "type=" << to_string(type);
+ out << ", " << "encodings=" << to_string(encodings);
+ out << ", " << "path_in_schema=" << to_string(path_in_schema);
+ out << ", " << "codec=" << to_string(codec);
+ out << ", " << "num_values=" << to_string(num_values);
+ out << ", " << "total_uncompressed_size=" << to_string(total_uncompressed_size);
+ out << ", " << "total_compressed_size=" << to_string(total_compressed_size);
+ out << ", " << "key_value_metadata="; (__isset.key_value_metadata ? (out << to_string(key_value_metadata)) : (out << "<null>"));
+ out << ", " << "data_page_offset=" << to_string(data_page_offset);
+ out << ", " << "index_page_offset="; (__isset.index_page_offset ? (out << to_string(index_page_offset)) : (out << "<null>"));
+ out << ", " << "dictionary_page_offset="; (__isset.dictionary_page_offset ? (out << to_string(dictionary_page_offset)) : (out << "<null>"));
+ out << ", " << "statistics="; (__isset.statistics ? (out << to_string(statistics)) : (out << "<null>"));
+ out << ", " << "encoding_stats="; (__isset.encoding_stats ? (out << to_string(encoding_stats)) : (out << "<null>"));
+ out << ", " << "bloom_filter_offset="; (__isset.bloom_filter_offset ? (out << to_string(bloom_filter_offset)) : (out << "<null>"));
+ out << ")";
+}
+
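// ---------------------------------------------------------------------------
// Editorial sketch (not part of the generated diff): a minimal round-trip for
// the ColumnMetaData struct above, driving the generated write()/read() pair
// through a Thrift compact protocol over an in-memory transport. Assumed
// names: the generated header "parquet_types.h" and the namespace
// parquet::format (as in Arrow's bundled copy); adjust both to your build.
#include "parquet_types.h"
#include <thrift/protocol/TCompactProtocol.h>
#include <thrift/transport/TBufferTransports.h>
#include <cassert>
#include <memory>

inline void column_meta_data_round_trip() {
  using apache::thrift::protocol::TCompactProtocol;
  using apache::thrift::transport::TMemoryBuffer;

  parquet::format::ColumnMetaData md;
  md.__set_type(parquet::format::Type::INT32);             // physical type
  md.__set_encodings({parquet::format::Encoding::PLAIN});
  md.__set_path_in_schema({"a", "b"});
  md.__set_codec(parquet::format::CompressionCodec::SNAPPY);
  md.__set_num_values(100);
  md.__set_total_uncompressed_size(4096);
  md.__set_total_compressed_size(1024);
  md.__set_data_page_offset(0);

  auto buffer = std::make_shared<TMemoryBuffer>();
  TCompactProtocol proto(buffer);
  md.write(&proto);        // optional fields are emitted only if __isset

  parquet::format::ColumnMetaData decoded;
  decoded.read(&proto);    // reads back from the same in-memory buffer
  assert(decoded.num_values == 100);
}
// ---------------------------------------------------------------------------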
+
+EncryptionWithFooterKey::~EncryptionWithFooterKey() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const EncryptionWithFooterKey& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t EncryptionWithFooterKey::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t EncryptionWithFooterKey::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("EncryptionWithFooterKey");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(EncryptionWithFooterKey &a, EncryptionWithFooterKey &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+EncryptionWithFooterKey::EncryptionWithFooterKey(const EncryptionWithFooterKey& other110) {
+ (void) other110;
+}
+EncryptionWithFooterKey& EncryptionWithFooterKey::operator=(const EncryptionWithFooterKey& other111) {
+ (void) other111;
+ return *this;
+}
+void EncryptionWithFooterKey::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "EncryptionWithFooterKey(";
+ out << ")";
+}
+
+
+EncryptionWithColumnKey::~EncryptionWithColumnKey() noexcept {
+}
+
+
+void EncryptionWithColumnKey::__set_path_in_schema(const std::vector<std::string> & val) {
+ this->path_in_schema = val;
+}
+
+void EncryptionWithColumnKey::__set_key_metadata(const std::string& val) {
+ this->key_metadata = val;
+  __isset.key_metadata = true;
+}
+std::ostream& operator<<(std::ostream& out, const EncryptionWithColumnKey& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t EncryptionWithColumnKey::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_path_in_schema = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->path_in_schema.clear();
+ uint32_t _size112;
+ ::apache::thrift::protocol::TType _etype115;
+ xfer += iprot->readListBegin(_etype115, _size112);
+ this->path_in_schema.resize(_size112);
+ uint32_t _i116;
+ for (_i116 = 0; _i116 < _size112; ++_i116)
+ {
+ xfer += iprot->readString(this->path_in_schema[_i116]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_path_in_schema = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->key_metadata);
+ this->__isset.key_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_path_in_schema)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t EncryptionWithColumnKey::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("EncryptionWithColumnKey");
+
+ xfer += oprot->writeFieldBegin("path_in_schema", ::apache::thrift::protocol::T_LIST, 1);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->path_in_schema.size()));
+ std::vector<std::string> ::const_iterator _iter117;
+ for (_iter117 = this->path_in_schema.begin(); _iter117 != this->path_in_schema.end(); ++_iter117)
+ {
+ xfer += oprot->writeString((*_iter117));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.key_metadata) {
+ xfer += oprot->writeFieldBegin("key_metadata", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->key_metadata);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(EncryptionWithColumnKey &a, EncryptionWithColumnKey &b) {
+ using ::std::swap;
+ swap(a.path_in_schema, b.path_in_schema);
+ swap(a.key_metadata, b.key_metadata);
+ swap(a.__isset, b.__isset);
+}
+
+EncryptionWithColumnKey::EncryptionWithColumnKey(const EncryptionWithColumnKey& other118) {
+ path_in_schema = other118.path_in_schema;
+ key_metadata = other118.key_metadata;
+ __isset = other118.__isset;
+}
+EncryptionWithColumnKey& EncryptionWithColumnKey::operator=(const EncryptionWithColumnKey& other119) {
+ path_in_schema = other119.path_in_schema;
+ key_metadata = other119.key_metadata;
+ __isset = other119.__isset;
+ return *this;
+}
+void EncryptionWithColumnKey::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "EncryptionWithColumnKey(";
+ out << "path_in_schema=" << to_string(path_in_schema);
+ out << ", " << "key_metadata="; (__isset.key_metadata ? (out << to_string(key_metadata)) : (out << "<null>"));
+ out << ")";
+}
+
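// ---------------------------------------------------------------------------
// Editorial sketch (not part of the generated diff): optional-field handling.
// key_metadata is serialized only while __isset.key_metadata is true, which
// the generated __set_key_metadata() raises implicitly; assigning the member
// directly would leave the flag false and write() would silently drop the
// field. Namespace parquet::format is assumed, as above.
inline parquet::format::EncryptionWithColumnKey make_column_key_metadata() {
  parquet::format::EncryptionWithColumnKey k;
  k.__set_path_in_schema({"nested", "secret_column"});  // required list field
  k.__set_key_metadata("key-ref-42");  // also sets __isset.key_metadata
  // k.key_metadata = "key-ref-42";    // would NOT mark the field as set
  return k;
}
// ---------------------------------------------------------------------------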
+
+ColumnCryptoMetaData::~ColumnCryptoMetaData() noexcept {
+}
+
+
+void ColumnCryptoMetaData::__set_ENCRYPTION_WITH_FOOTER_KEY(const EncryptionWithFooterKey& val) {
+ this->ENCRYPTION_WITH_FOOTER_KEY = val;
+  __isset.ENCRYPTION_WITH_FOOTER_KEY = true;
+}
+
+void ColumnCryptoMetaData::__set_ENCRYPTION_WITH_COLUMN_KEY(const EncryptionWithColumnKey& val) {
+ this->ENCRYPTION_WITH_COLUMN_KEY = val;
+  __isset.ENCRYPTION_WITH_COLUMN_KEY = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnCryptoMetaData& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ColumnCryptoMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->ENCRYPTION_WITH_FOOTER_KEY.read(iprot);
+ this->__isset.ENCRYPTION_WITH_FOOTER_KEY = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->ENCRYPTION_WITH_COLUMN_KEY.read(iprot);
+ this->__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t ColumnCryptoMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnCryptoMetaData");
+
+ if (this->__isset.ENCRYPTION_WITH_FOOTER_KEY) {
+ xfer += oprot->writeFieldBegin("ENCRYPTION_WITH_FOOTER_KEY", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->ENCRYPTION_WITH_FOOTER_KEY.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.ENCRYPTION_WITH_COLUMN_KEY) {
+ xfer += oprot->writeFieldBegin("ENCRYPTION_WITH_COLUMN_KEY", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->ENCRYPTION_WITH_COLUMN_KEY.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnCryptoMetaData &a, ColumnCryptoMetaData &b) {
+ using ::std::swap;
+ swap(a.ENCRYPTION_WITH_FOOTER_KEY, b.ENCRYPTION_WITH_FOOTER_KEY);
+ swap(a.ENCRYPTION_WITH_COLUMN_KEY, b.ENCRYPTION_WITH_COLUMN_KEY);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnCryptoMetaData::ColumnCryptoMetaData(const ColumnCryptoMetaData& other120) {
+ ENCRYPTION_WITH_FOOTER_KEY = other120.ENCRYPTION_WITH_FOOTER_KEY;
+ ENCRYPTION_WITH_COLUMN_KEY = other120.ENCRYPTION_WITH_COLUMN_KEY;
+ __isset = other120.__isset;
+}
+ColumnCryptoMetaData& ColumnCryptoMetaData::operator=(const ColumnCryptoMetaData& other121) {
+ ENCRYPTION_WITH_FOOTER_KEY = other121.ENCRYPTION_WITH_FOOTER_KEY;
+ ENCRYPTION_WITH_COLUMN_KEY = other121.ENCRYPTION_WITH_COLUMN_KEY;
+ __isset = other121.__isset;
+ return *this;
+}
+void ColumnCryptoMetaData::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnCryptoMetaData(";
+ out << "ENCRYPTION_WITH_FOOTER_KEY="; (__isset.ENCRYPTION_WITH_FOOTER_KEY ? (out << to_string(ENCRYPTION_WITH_FOOTER_KEY)) : (out << "<null>"));
+ out << ", " << "ENCRYPTION_WITH_COLUMN_KEY="; (__isset.ENCRYPTION_WITH_COLUMN_KEY ? (out << to_string(ENCRYPTION_WITH_COLUMN_KEY)) : (out << "<null>"));
+ out << ")";
+}
+
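// ---------------------------------------------------------------------------
// Editorial sketch (not part of the generated diff): ColumnCryptoMetaData is
// a Thrift union, so exactly one member is expected to be set, but the
// generated code does not enforce exclusivity. Callers conventionally probe
// the __isset flags, e.g.:
#include <stdexcept>

inline bool uses_footer_key(const parquet::format::ColumnCryptoMetaData& c) {
  if (c.__isset.ENCRYPTION_WITH_FOOTER_KEY) return true;
  if (c.__isset.ENCRYPTION_WITH_COLUMN_KEY) return false;
  throw std::runtime_error("ColumnCryptoMetaData: no variant set");
}
// ---------------------------------------------------------------------------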
+
+ColumnChunk::~ColumnChunk() noexcept {
+}
+
+
+void ColumnChunk::__set_file_path(const std::string& val) {
+ this->file_path = val;
+  __isset.file_path = true;
+}
+
+void ColumnChunk::__set_file_offset(const int64_t val) {
+ this->file_offset = val;
+}
+
+void ColumnChunk::__set_meta_data(const ColumnMetaData& val) {
+ this->meta_data = val;
+  __isset.meta_data = true;
+}
+
+void ColumnChunk::__set_offset_index_offset(const int64_t val) {
+ this->offset_index_offset = val;
+  __isset.offset_index_offset = true;
+}
+
+void ColumnChunk::__set_offset_index_length(const int32_t val) {
+ this->offset_index_length = val;
+  __isset.offset_index_length = true;
+}
+
+void ColumnChunk::__set_column_index_offset(const int64_t val) {
+ this->column_index_offset = val;
+  __isset.column_index_offset = true;
+}
+
+void ColumnChunk::__set_column_index_length(const int32_t val) {
+ this->column_index_length = val;
+  __isset.column_index_length = true;
+}
+
+void ColumnChunk::__set_crypto_metadata(const ColumnCryptoMetaData& val) {
+ this->crypto_metadata = val;
+  __isset.crypto_metadata = true;
+}
+
+void ColumnChunk::__set_encrypted_column_metadata(const std::string& val) {
+ this->encrypted_column_metadata = val;
+  __isset.encrypted_column_metadata = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnChunk& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ColumnChunk::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_file_offset = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->file_path);
+ this->__isset.file_path = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->file_offset);
+ isset_file_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->meta_data.read(iprot);
+ this->__isset.meta_data = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->offset_index_offset);
+ this->__isset.offset_index_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->offset_index_length);
+ this->__isset.offset_index_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->column_index_offset);
+ this->__isset.column_index_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->column_index_length);
+ this->__isset.column_index_length = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->crypto_metadata.read(iprot);
+ this->__isset.crypto_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 9:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->encrypted_column_metadata);
+ this->__isset.encrypted_column_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_file_offset)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t ColumnChunk::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnChunk");
+
+ if (this->__isset.file_path) {
+ xfer += oprot->writeFieldBegin("file_path", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeString(this->file_path);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldBegin("file_offset", ::apache::thrift::protocol::T_I64, 2);
+ xfer += oprot->writeI64(this->file_offset);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.meta_data) {
+ xfer += oprot->writeFieldBegin("meta_data", ::apache::thrift::protocol::T_STRUCT, 3);
+ xfer += this->meta_data.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.offset_index_offset) {
+ xfer += oprot->writeFieldBegin("offset_index_offset", ::apache::thrift::protocol::T_I64, 4);
+ xfer += oprot->writeI64(this->offset_index_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.offset_index_length) {
+ xfer += oprot->writeFieldBegin("offset_index_length", ::apache::thrift::protocol::T_I32, 5);
+ xfer += oprot->writeI32(this->offset_index_length);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.column_index_offset) {
+ xfer += oprot->writeFieldBegin("column_index_offset", ::apache::thrift::protocol::T_I64, 6);
+ xfer += oprot->writeI64(this->column_index_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.column_index_length) {
+ xfer += oprot->writeFieldBegin("column_index_length", ::apache::thrift::protocol::T_I32, 7);
+ xfer += oprot->writeI32(this->column_index_length);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.crypto_metadata) {
+ xfer += oprot->writeFieldBegin("crypto_metadata", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->crypto_metadata.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.encrypted_column_metadata) {
+ xfer += oprot->writeFieldBegin("encrypted_column_metadata", ::apache::thrift::protocol::T_STRING, 9);
+ xfer += oprot->writeBinary(this->encrypted_column_metadata);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnChunk &a, ColumnChunk &b) {
+ using ::std::swap;
+ swap(a.file_path, b.file_path);
+ swap(a.file_offset, b.file_offset);
+ swap(a.meta_data, b.meta_data);
+ swap(a.offset_index_offset, b.offset_index_offset);
+ swap(a.offset_index_length, b.offset_index_length);
+ swap(a.column_index_offset, b.column_index_offset);
+ swap(a.column_index_length, b.column_index_length);
+ swap(a.crypto_metadata, b.crypto_metadata);
+ swap(a.encrypted_column_metadata, b.encrypted_column_metadata);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnChunk::ColumnChunk(const ColumnChunk& other122) {
+ file_path = other122.file_path;
+ file_offset = other122.file_offset;
+ meta_data = other122.meta_data;
+ offset_index_offset = other122.offset_index_offset;
+ offset_index_length = other122.offset_index_length;
+ column_index_offset = other122.column_index_offset;
+ column_index_length = other122.column_index_length;
+ crypto_metadata = other122.crypto_metadata;
+ encrypted_column_metadata = other122.encrypted_column_metadata;
+ __isset = other122.__isset;
+}
+ColumnChunk& ColumnChunk::operator=(const ColumnChunk& other123) {
+ file_path = other123.file_path;
+ file_offset = other123.file_offset;
+ meta_data = other123.meta_data;
+ offset_index_offset = other123.offset_index_offset;
+ offset_index_length = other123.offset_index_length;
+ column_index_offset = other123.column_index_offset;
+ column_index_length = other123.column_index_length;
+ crypto_metadata = other123.crypto_metadata;
+ encrypted_column_metadata = other123.encrypted_column_metadata;
+ __isset = other123.__isset;
+ return *this;
+}
+void ColumnChunk::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnChunk(";
+ out << "file_path="; (__isset.file_path ? (out << to_string(file_path)) : (out << "<null>"));
+ out << ", " << "file_offset=" << to_string(file_offset);
+ out << ", " << "meta_data="; (__isset.meta_data ? (out << to_string(meta_data)) : (out << "<null>"));
+ out << ", " << "offset_index_offset="; (__isset.offset_index_offset ? (out << to_string(offset_index_offset)) : (out << "<null>"));
+ out << ", " << "offset_index_length="; (__isset.offset_index_length ? (out << to_string(offset_index_length)) : (out << "<null>"));
+ out << ", " << "column_index_offset="; (__isset.column_index_offset ? (out << to_string(column_index_offset)) : (out << "<null>"));
+ out << ", " << "column_index_length="; (__isset.column_index_length ? (out << to_string(column_index_length)) : (out << "<null>"));
+ out << ", " << "crypto_metadata="; (__isset.crypto_metadata ? (out << to_string(crypto_metadata)) : (out << "<null>"));
+ out << ", " << "encrypted_column_metadata="; (__isset.encrypted_column_metadata ? (out << to_string(encrypted_column_metadata)) : (out << "<null>"));
+ out << ")";
+}
+
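// ---------------------------------------------------------------------------
// Editorial sketch (not part of the generated diff): file_offset is the only
// required field of ColumnChunk -- read() above throws INVALID_DATA without
// it -- while every other field is optional and gated on __isset. A minimal
// well-formed chunk therefore needs a single setter:
inline parquet::format::ColumnChunk make_minimal_chunk(int64_t file_offset) {
  parquet::format::ColumnChunk cc;
  cc.__set_file_offset(file_offset);  // required: no __isset flag involved
  return cc;
}
// ---------------------------------------------------------------------------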
+
+RowGroup::~RowGroup() noexcept {
+}
+
+
+void RowGroup::__set_columns(const std::vector<ColumnChunk> & val) {
+ this->columns = val;
+}
+
+void RowGroup::__set_total_byte_size(const int64_t val) {
+ this->total_byte_size = val;
+}
+
+void RowGroup::__set_num_rows(const int64_t val) {
+ this->num_rows = val;
+}
+
+void RowGroup::__set_sorting_columns(const std::vector<SortingColumn> & val) {
+ this->sorting_columns = val;
+  __isset.sorting_columns = true;
+}
+
+void RowGroup::__set_file_offset(const int64_t val) {
+ this->file_offset = val;
+  __isset.file_offset = true;
+}
+
+void RowGroup::__set_total_compressed_size(const int64_t val) {
+ this->total_compressed_size = val;
+  __isset.total_compressed_size = true;
+}
+
+void RowGroup::__set_ordinal(const int16_t val) {
+ this->ordinal = val;
+  __isset.ordinal = true;
+}
+std::ostream& operator<<(std::ostream& out, const RowGroup& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t RowGroup::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_columns = false;
+ bool isset_total_byte_size = false;
+ bool isset_num_rows = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->columns.clear();
+ uint32_t _size124;
+ ::apache::thrift::protocol::TType _etype127;
+ xfer += iprot->readListBegin(_etype127, _size124);
+ this->columns.resize(_size124);
+ uint32_t _i128;
+ for (_i128 = 0; _i128 < _size124; ++_i128)
+ {
+ xfer += this->columns[_i128].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_columns = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->total_byte_size);
+ isset_total_byte_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->num_rows);
+ isset_num_rows = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->sorting_columns.clear();
+ uint32_t _size129;
+ ::apache::thrift::protocol::TType _etype132;
+ xfer += iprot->readListBegin(_etype132, _size129);
+ this->sorting_columns.resize(_size129);
+ uint32_t _i133;
+ for (_i133 = 0; _i133 < _size129; ++_i133)
+ {
+ xfer += this->sorting_columns[_i133].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.sorting_columns = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->file_offset);
+ this->__isset.file_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->total_compressed_size);
+ this->__isset.total_compressed_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_I16) {
+ xfer += iprot->readI16(this->ordinal);
+ this->__isset.ordinal = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_columns)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_total_byte_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_rows)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t RowGroup::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("RowGroup");
+
+ xfer += oprot->writeFieldBegin("columns", ::apache::thrift::protocol::T_LIST, 1);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->columns.size()));
+ std::vector<ColumnChunk> ::const_iterator _iter134;
+ for (_iter134 = this->columns.begin(); _iter134 != this->columns.end(); ++_iter134)
+ {
+ xfer += (*_iter134).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("total_byte_size", ::apache::thrift::protocol::T_I64, 2);
+ xfer += oprot->writeI64(this->total_byte_size);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I64, 3);
+ xfer += oprot->writeI64(this->num_rows);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.sorting_columns) {
+ xfer += oprot->writeFieldBegin("sorting_columns", ::apache::thrift::protocol::T_LIST, 4);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->sorting_columns.size()));
+ std::vector<SortingColumn> ::const_iterator _iter135;
+ for (_iter135 = this->sorting_columns.begin(); _iter135 != this->sorting_columns.end(); ++_iter135)
+ {
+ xfer += (*_iter135).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.file_offset) {
+ xfer += oprot->writeFieldBegin("file_offset", ::apache::thrift::protocol::T_I64, 5);
+ xfer += oprot->writeI64(this->file_offset);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.total_compressed_size) {
+ xfer += oprot->writeFieldBegin("total_compressed_size", ::apache::thrift::protocol::T_I64, 6);
+ xfer += oprot->writeI64(this->total_compressed_size);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.ordinal) {
+ xfer += oprot->writeFieldBegin("ordinal", ::apache::thrift::protocol::T_I16, 7);
+ xfer += oprot->writeI16(this->ordinal);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(RowGroup &a, RowGroup &b) {
+ using ::std::swap;
+ swap(a.columns, b.columns);
+ swap(a.total_byte_size, b.total_byte_size);
+ swap(a.num_rows, b.num_rows);
+ swap(a.sorting_columns, b.sorting_columns);
+ swap(a.file_offset, b.file_offset);
+ swap(a.total_compressed_size, b.total_compressed_size);
+ swap(a.ordinal, b.ordinal);
+ swap(a.__isset, b.__isset);
+}
+
+RowGroup::RowGroup(const RowGroup& other136) {
+ columns = other136.columns;
+ total_byte_size = other136.total_byte_size;
+ num_rows = other136.num_rows;
+ sorting_columns = other136.sorting_columns;
+ file_offset = other136.file_offset;
+ total_compressed_size = other136.total_compressed_size;
+ ordinal = other136.ordinal;
+ __isset = other136.__isset;
+}
+RowGroup& RowGroup::operator=(const RowGroup& other137) {
+ columns = other137.columns;
+ total_byte_size = other137.total_byte_size;
+ num_rows = other137.num_rows;
+ sorting_columns = other137.sorting_columns;
+ file_offset = other137.file_offset;
+ total_compressed_size = other137.total_compressed_size;
+ ordinal = other137.ordinal;
+ __isset = other137.__isset;
+ return *this;
+}
+void RowGroup::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "RowGroup(";
+ out << "columns=" << to_string(columns);
+ out << ", " << "total_byte_size=" << to_string(total_byte_size);
+ out << ", " << "num_rows=" << to_string(num_rows);
+ out << ", " << "sorting_columns="; (__isset.sorting_columns ? (out << to_string(sorting_columns)) : (out << "<null>"));
+ out << ", " << "file_offset="; (__isset.file_offset ? (out << to_string(file_offset)) : (out << "<null>"));
+ out << ", " << "total_compressed_size="; (__isset.total_compressed_size ? (out << to_string(total_compressed_size)) : (out << "<null>"));
+ out << ", " << "ordinal="; (__isset.ordinal ? (out << to_string(ordinal)) : (out << "<null>"));
+ out << ")";
+}
+
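// ---------------------------------------------------------------------------
// Editorial sketch (not part of the generated diff): RowGroup::read() checks
// its three required fields (columns, total_byte_size, num_rows) only after
// the field loop, throwing TProtocolException::INVALID_DATA if any is
// missing. Decoding an empty struct demonstrates this:
#include <thrift/protocol/TCompactProtocol.h>
#include <thrift/transport/TBufferTransports.h>
#include <memory>

inline bool row_group_rejects_empty_struct() {
  using namespace apache::thrift;
  auto buffer = std::make_shared<transport::TMemoryBuffer>();
  protocol::TCompactProtocol proto(buffer);
  proto.writeStructBegin("RowGroup");
  proto.writeFieldStop();   // no fields at all
  proto.writeStructEnd();

  parquet::format::RowGroup rg;
  try {
    rg.read(&proto);
    return false;           // unreachable: required fields are missing
  } catch (const protocol::TProtocolException&) {
    return true;            // INVALID_DATA, as expected
  }
}
// ---------------------------------------------------------------------------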
+
+TypeDefinedOrder::~TypeDefinedOrder() noexcept {
+}
+
+std::ostream& operator<<(std::ostream& out, const TypeDefinedOrder& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t TypeDefinedOrder::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ xfer += iprot->skip(ftype);
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t TypeDefinedOrder::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("TypeDefinedOrder");
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(TypeDefinedOrder &a, TypeDefinedOrder &b) {
+ using ::std::swap;
+ (void) a;
+ (void) b;
+}
+
+TypeDefinedOrder::TypeDefinedOrder(const TypeDefinedOrder& other138) {
+ (void) other138;
+}
+TypeDefinedOrder& TypeDefinedOrder::operator=(const TypeDefinedOrder& other139) {
+ (void) other139;
+ return *this;
+}
+void TypeDefinedOrder::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "TypeDefinedOrder(";
+ out << ")";
+}
+
+
+ColumnOrder::~ColumnOrder() noexcept {
+}
+
+
+void ColumnOrder::__set_TYPE_ORDER(const TypeDefinedOrder& val) {
+ this->TYPE_ORDER = val;
+  __isset.TYPE_ORDER = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnOrder& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ColumnOrder::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->TYPE_ORDER.read(iprot);
+ this->__isset.TYPE_ORDER = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t ColumnOrder::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnOrder");
+
+ if (this->__isset.TYPE_ORDER) {
+ xfer += oprot->writeFieldBegin("TYPE_ORDER", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->TYPE_ORDER.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnOrder &a, ColumnOrder &b) {
+ using ::std::swap;
+ swap(a.TYPE_ORDER, b.TYPE_ORDER);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnOrder::ColumnOrder(const ColumnOrder& other140) {
+ TYPE_ORDER = other140.TYPE_ORDER;
+ __isset = other140.__isset;
+}
+ColumnOrder& ColumnOrder::operator=(const ColumnOrder& other141) {
+ TYPE_ORDER = other141.TYPE_ORDER;
+ __isset = other141.__isset;
+ return *this;
+}
+void ColumnOrder::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnOrder(";
+ out << "TYPE_ORDER="; (__isset.TYPE_ORDER ? (out << to_string(TYPE_ORDER)) : (out << "<null>"));
+ out << ")";
+}
+
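// ---------------------------------------------------------------------------
// Editorial sketch (not part of the generated diff): TypeDefinedOrder carries
// no fields, so inside ColumnOrder its mere presence is the signal. Readers
// test the __isset flag rather than any payload:
inline bool is_type_defined_order(const parquet::format::ColumnOrder& co) {
  return co.__isset.TYPE_ORDER;  // empty struct: presence == meaning
}
// ---------------------------------------------------------------------------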
+
+PageLocation::~PageLocation() noexcept {
+}
+
+
+void PageLocation::__set_offset(const int64_t val) {
+ this->offset = val;
+}
+
+void PageLocation::__set_compressed_page_size(const int32_t val) {
+ this->compressed_page_size = val;
+}
+
+void PageLocation::__set_first_row_index(const int64_t val) {
+ this->first_row_index = val;
+}
+std::ostream& operator<<(std::ostream& out, const PageLocation& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t PageLocation::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_offset = false;
+ bool isset_compressed_page_size = false;
+ bool isset_first_row_index = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->offset);
+ isset_offset = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->compressed_page_size);
+ isset_compressed_page_size = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->first_row_index);
+ isset_first_row_index = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_offset)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_compressed_page_size)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_first_row_index)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t PageLocation::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("PageLocation");
+
+ xfer += oprot->writeFieldBegin("offset", ::apache::thrift::protocol::T_I64, 1);
+ xfer += oprot->writeI64(this->offset);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("compressed_page_size", ::apache::thrift::protocol::T_I32, 2);
+ xfer += oprot->writeI32(this->compressed_page_size);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("first_row_index", ::apache::thrift::protocol::T_I64, 3);
+ xfer += oprot->writeI64(this->first_row_index);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(PageLocation &a, PageLocation &b) {
+ using ::std::swap;
+ swap(a.offset, b.offset);
+ swap(a.compressed_page_size, b.compressed_page_size);
+ swap(a.first_row_index, b.first_row_index);
+}
+
+PageLocation::PageLocation(const PageLocation& other142) {
+ offset = other142.offset;
+ compressed_page_size = other142.compressed_page_size;
+ first_row_index = other142.first_row_index;
+}
+PageLocation& PageLocation::operator=(const PageLocation& other143) {
+ offset = other143.offset;
+ compressed_page_size = other143.compressed_page_size;
+ first_row_index = other143.first_row_index;
+ return *this;
+}
+void PageLocation::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "PageLocation(";
+ out << "offset=" << to_string(offset);
+ out << ", " << "compressed_page_size=" << to_string(compressed_page_size);
+ out << ", " << "first_row_index=" << to_string(first_row_index);
+ out << ")";
+}
+
+
+OffsetIndex::~OffsetIndex() noexcept {
+}
+
+
+void OffsetIndex::__set_page_locations(const std::vector<PageLocation> & val) {
+ this->page_locations = val;
+}
+std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t OffsetIndex::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_page_locations = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->page_locations.clear();
+ uint32_t _size144;
+ ::apache::thrift::protocol::TType _etype147;
+ xfer += iprot->readListBegin(_etype147, _size144);
+ this->page_locations.resize(_size144);
+ uint32_t _i148;
+ for (_i148 = 0; _i148 < _size144; ++_i148)
+ {
+ xfer += this->page_locations[_i148].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_page_locations = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_page_locations)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t OffsetIndex::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("OffsetIndex");
+
+ xfer += oprot->writeFieldBegin("page_locations", ::apache::thrift::protocol::T_LIST, 1);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->page_locations.size()));
+ std::vector<PageLocation> ::const_iterator _iter149;
+ for (_iter149 = this->page_locations.begin(); _iter149 != this->page_locations.end(); ++_iter149)
+ {
+ xfer += (*_iter149).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(OffsetIndex &a, OffsetIndex &b) {
+ using ::std::swap;
+ swap(a.page_locations, b.page_locations);
+}
+
+OffsetIndex::OffsetIndex(const OffsetIndex& other150) {
+ page_locations = other150.page_locations;
+}
+OffsetIndex& OffsetIndex::operator=(const OffsetIndex& other151) {
+ page_locations = other151.page_locations;
+ return *this;
+}
+void OffsetIndex::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "OffsetIndex(";
+ out << "page_locations=" << to_string(page_locations);
+ out << ")";
+}
+
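// ---------------------------------------------------------------------------
// Editorial sketch (not part of the generated diff): a typical consumer-side
// use of OffsetIndex -- binary-search page_locations (sorted by
// first_row_index) for the page that contains a given row.
#include <algorithm>
#include <cstdint>

inline const parquet::format::PageLocation* find_page_for_row(
    const parquet::format::OffsetIndex& index, int64_t row) {
  const auto& pages = index.page_locations;
  auto it = std::upper_bound(
      pages.begin(), pages.end(), row,
      [](int64_t r, const parquet::format::PageLocation& p) {
        return r < p.first_row_index;
      });
  if (it == pages.begin()) return nullptr;  // row precedes the first page
  return &*(it - 1);                        // last page starting at or before row
}
// ---------------------------------------------------------------------------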
+
+ColumnIndex::~ColumnIndex() noexcept {
+}
+
+
+void ColumnIndex::__set_null_pages(const std::vector<bool> & val) {
+ this->null_pages = val;
+}
+
+void ColumnIndex::__set_min_values(const std::vector<std::string> & val) {
+ this->min_values = val;
+}
+
+void ColumnIndex::__set_max_values(const std::vector<std::string> & val) {
+ this->max_values = val;
+}
+
+void ColumnIndex::__set_boundary_order(const BoundaryOrder::type val) {
+ this->boundary_order = val;
+}
+
+void ColumnIndex::__set_null_counts(const std::vector<int64_t> & val) {
+ this->null_counts = val;
+  __isset.null_counts = true;
+}
+std::ostream& operator<<(std::ostream& out, const ColumnIndex& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t ColumnIndex::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_null_pages = false;
+ bool isset_min_values = false;
+ bool isset_max_values = false;
+ bool isset_boundary_order = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->null_pages.clear();
+ uint32_t _size152;
+ ::apache::thrift::protocol::TType _etype155;
+ xfer += iprot->readListBegin(_etype155, _size152);
+ this->null_pages.resize(_size152);
+ uint32_t _i156;
+ for (_i156 = 0; _i156 < _size152; ++_i156)
+ {
+ bool result;
+ xfer += iprot->readBool(result);
+ this->null_pages[_i156] = result;
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_null_pages = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->min_values.clear();
+ uint32_t _size157;
+ ::apache::thrift::protocol::TType _etype160;
+ xfer += iprot->readListBegin(_etype160, _size157);
+ this->min_values.resize(_size157);
+ uint32_t _i161;
+ for (_i161 = 0; _i161 < _size157; ++_i161)
+ {
+ xfer += iprot->readBinary(this->min_values[_i161]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_min_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->max_values.clear();
+ uint32_t _size162;
+ ::apache::thrift::protocol::TType _etype165;
+ xfer += iprot->readListBegin(_etype165, _size162);
+ this->max_values.resize(_size162);
+ uint32_t _i166;
+ for (_i166 = 0; _i166 < _size162; ++_i166)
+ {
+ xfer += iprot->readBinary(this->max_values[_i166]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_max_values = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ int32_t ecast167;
+ xfer += iprot->readI32(ecast167);
+ this->boundary_order = (BoundaryOrder::type)ecast167;
+ isset_boundary_order = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->null_counts.clear();
+ uint32_t _size168;
+ ::apache::thrift::protocol::TType _etype171;
+ xfer += iprot->readListBegin(_etype171, _size168);
+ this->null_counts.resize(_size168);
+ uint32_t _i172;
+ for (_i172 = 0; _i172 < _size168; ++_i172)
+ {
+ xfer += iprot->readI64(this->null_counts[_i172]);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.null_counts = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_null_pages)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_min_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_max_values)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_boundary_order)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t ColumnIndex::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("ColumnIndex");
+
+ xfer += oprot->writeFieldBegin("null_pages", ::apache::thrift::protocol::T_LIST, 1);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_BOOL, static_cast<uint32_t>(this->null_pages.size()));
+ std::vector<bool> ::const_iterator _iter173;
+ for (_iter173 = this->null_pages.begin(); _iter173 != this->null_pages.end(); ++_iter173)
+ {
+ xfer += oprot->writeBool((*_iter173));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("min_values", ::apache::thrift::protocol::T_LIST, 2);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->min_values.size()));
+ std::vector<std::string> ::const_iterator _iter174;
+ for (_iter174 = this->min_values.begin(); _iter174 != this->min_values.end(); ++_iter174)
+ {
+ xfer += oprot->writeBinary((*_iter174));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("max_values", ::apache::thrift::protocol::T_LIST, 3);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRING, static_cast<uint32_t>(this->max_values.size()));
+ std::vector<std::string> ::const_iterator _iter175;
+ for (_iter175 = this->max_values.begin(); _iter175 != this->max_values.end(); ++_iter175)
+ {
+ xfer += oprot->writeBinary((*_iter175));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("boundary_order", ::apache::thrift::protocol::T_I32, 4);
+ xfer += oprot->writeI32((int32_t)this->boundary_order);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.null_counts) {
+ xfer += oprot->writeFieldBegin("null_counts", ::apache::thrift::protocol::T_LIST, 5);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_I64, static_cast<uint32_t>(this->null_counts.size()));
+ std::vector<int64_t> ::const_iterator _iter176;
+ for (_iter176 = this->null_counts.begin(); _iter176 != this->null_counts.end(); ++_iter176)
+ {
+ xfer += oprot->writeI64((*_iter176));
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(ColumnIndex &a, ColumnIndex &b) {
+ using ::std::swap;
+ swap(a.null_pages, b.null_pages);
+ swap(a.min_values, b.min_values);
+ swap(a.max_values, b.max_values);
+ swap(a.boundary_order, b.boundary_order);
+ swap(a.null_counts, b.null_counts);
+ swap(a.__isset, b.__isset);
+}
+
+ColumnIndex::ColumnIndex(const ColumnIndex& other177) {
+ null_pages = other177.null_pages;
+ min_values = other177.min_values;
+ max_values = other177.max_values;
+ boundary_order = other177.boundary_order;
+ null_counts = other177.null_counts;
+ __isset = other177.__isset;
+}
+ColumnIndex& ColumnIndex::operator=(const ColumnIndex& other178) {
+ null_pages = other178.null_pages;
+ min_values = other178.min_values;
+ max_values = other178.max_values;
+ boundary_order = other178.boundary_order;
+ null_counts = other178.null_counts;
+ __isset = other178.__isset;
+ return *this;
+}
+void ColumnIndex::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "ColumnIndex(";
+ out << "null_pages=" << to_string(null_pages);
+ out << ", " << "min_values=" << to_string(min_values);
+ out << ", " << "max_values=" << to_string(max_values);
+ out << ", " << "boundary_order=" << to_string(boundary_order);
+ out << ", " << "null_counts="; (__isset.null_counts ? (out << to_string(null_counts)) : (out << "<null>"));
+ out << ")";
+}
+
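// ---------------------------------------------------------------------------
// Editorial sketch (not part of the generated diff): page pruning with the
// ColumnIndex above. A page can be skipped when it is all-null or when its
// [min, max] range excludes the probe value. The byte-wise std::string
// comparison below only matches the column's boundary_order for
// BYTE_ARRAY/UTF8-style orderings; other physical types need their
// type-defined comparator.
#include <string>

inline bool page_may_contain(const parquet::format::ColumnIndex& index,
                             size_t page, const std::string& value) {
  if (index.null_pages[page]) return false;  // page holds only nulls
  return index.min_values[page] <= value && value <= index.max_values[page];
}
// ---------------------------------------------------------------------------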
+
+AesGcmV1::~AesGcmV1() noexcept {
+}
+
+
+void AesGcmV1::__set_aad_prefix(const std::string& val) {
+ this->aad_prefix = val;
+  __isset.aad_prefix = true;
+}
+
+void AesGcmV1::__set_aad_file_unique(const std::string& val) {
+ this->aad_file_unique = val;
+  __isset.aad_file_unique = true;
+}
+
+void AesGcmV1::__set_supply_aad_prefix(const bool val) {
+ this->supply_aad_prefix = val;
+  __isset.supply_aad_prefix = true;
+}
+std::ostream& operator<<(std::ostream& out, const AesGcmV1& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t AesGcmV1::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->aad_prefix);
+ this->__isset.aad_prefix = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->aad_file_unique);
+ this->__isset.aad_file_unique = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->supply_aad_prefix);
+ this->__isset.supply_aad_prefix = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t AesGcmV1::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("AesGcmV1");
+
+ if (this->__isset.aad_prefix) {
+ xfer += oprot->writeFieldBegin("aad_prefix", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeBinary(this->aad_prefix);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.aad_file_unique) {
+ xfer += oprot->writeFieldBegin("aad_file_unique", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->aad_file_unique);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.supply_aad_prefix) {
+ xfer += oprot->writeFieldBegin("supply_aad_prefix", ::apache::thrift::protocol::T_BOOL, 3);
+ xfer += oprot->writeBool(this->supply_aad_prefix);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(AesGcmV1 &a, AesGcmV1 &b) {
+ using ::std::swap;
+ swap(a.aad_prefix, b.aad_prefix);
+ swap(a.aad_file_unique, b.aad_file_unique);
+ swap(a.supply_aad_prefix, b.supply_aad_prefix);
+ swap(a.__isset, b.__isset);
+}
+
+AesGcmV1::AesGcmV1(const AesGcmV1& other179) {
+ aad_prefix = other179.aad_prefix;
+ aad_file_unique = other179.aad_file_unique;
+ supply_aad_prefix = other179.supply_aad_prefix;
+ __isset = other179.__isset;
+}
+AesGcmV1& AesGcmV1::operator=(const AesGcmV1& other180) {
+ aad_prefix = other180.aad_prefix;
+ aad_file_unique = other180.aad_file_unique;
+ supply_aad_prefix = other180.supply_aad_prefix;
+ __isset = other180.__isset;
+ return *this;
+}
+void AesGcmV1::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "AesGcmV1(";
+ out << "aad_prefix="; (__isset.aad_prefix ? (out << to_string(aad_prefix)) : (out << "<null>"));
+ out << ", " << "aad_file_unique="; (__isset.aad_file_unique ? (out << to_string(aad_file_unique)) : (out << "<null>"));
+ out << ", " << "supply_aad_prefix="; (__isset.supply_aad_prefix ? (out << to_string(supply_aad_prefix)) : (out << "<null>"));
+ out << ")";
+}
+
+
+AesGcmCtrV1::~AesGcmCtrV1() noexcept {
+}
+
+
+void AesGcmCtrV1::__set_aad_prefix(const std::string& val) {
+ this->aad_prefix = val;
+  __isset.aad_prefix = true;
+}
+
+void AesGcmCtrV1::__set_aad_file_unique(const std::string& val) {
+ this->aad_file_unique = val;
+  __isset.aad_file_unique = true;
+}
+
+void AesGcmCtrV1::__set_supply_aad_prefix(const bool val) {
+ this->supply_aad_prefix = val;
+  __isset.supply_aad_prefix = true;
+}
+std::ostream& operator<<(std::ostream& out, const AesGcmCtrV1& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t AesGcmCtrV1::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->aad_prefix);
+ this->__isset.aad_prefix = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->aad_file_unique);
+ this->__isset.aad_file_unique = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_BOOL) {
+ xfer += iprot->readBool(this->supply_aad_prefix);
+ this->__isset.supply_aad_prefix = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t AesGcmCtrV1::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("AesGcmCtrV1");
+
+ if (this->__isset.aad_prefix) {
+ xfer += oprot->writeFieldBegin("aad_prefix", ::apache::thrift::protocol::T_STRING, 1);
+ xfer += oprot->writeBinary(this->aad_prefix);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.aad_file_unique) {
+ xfer += oprot->writeFieldBegin("aad_file_unique", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->aad_file_unique);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.supply_aad_prefix) {
+ xfer += oprot->writeFieldBegin("supply_aad_prefix", ::apache::thrift::protocol::T_BOOL, 3);
+ xfer += oprot->writeBool(this->supply_aad_prefix);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(AesGcmCtrV1 &a, AesGcmCtrV1 &b) {
+ using ::std::swap;
+ swap(a.aad_prefix, b.aad_prefix);
+ swap(a.aad_file_unique, b.aad_file_unique);
+ swap(a.supply_aad_prefix, b.supply_aad_prefix);
+ swap(a.__isset, b.__isset);
+}
+
+AesGcmCtrV1::AesGcmCtrV1(const AesGcmCtrV1& other181) {
+ aad_prefix = other181.aad_prefix;
+ aad_file_unique = other181.aad_file_unique;
+ supply_aad_prefix = other181.supply_aad_prefix;
+ __isset = other181.__isset;
+}
+AesGcmCtrV1& AesGcmCtrV1::operator=(const AesGcmCtrV1& other182) {
+ aad_prefix = other182.aad_prefix;
+ aad_file_unique = other182.aad_file_unique;
+ supply_aad_prefix = other182.supply_aad_prefix;
+ __isset = other182.__isset;
+ return *this;
+}
+void AesGcmCtrV1::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "AesGcmCtrV1(";
+ out << "aad_prefix="; (__isset.aad_prefix ? (out << to_string(aad_prefix)) : (out << "<null>"));
+ out << ", " << "aad_file_unique="; (__isset.aad_file_unique ? (out << to_string(aad_file_unique)) : (out << "<null>"));
+ out << ", " << "supply_aad_prefix="; (__isset.supply_aad_prefix ? (out << to_string(supply_aad_prefix)) : (out << "<null>"));
+ out << ")";
+}
+
+
+EncryptionAlgorithm::~EncryptionAlgorithm() noexcept {
+}
+
+
+void EncryptionAlgorithm::__set_AES_GCM_V1(const AesGcmV1& val) {
+ this->AES_GCM_V1 = val;
+  __isset.AES_GCM_V1 = true;
+}
+
+void EncryptionAlgorithm::__set_AES_GCM_CTR_V1(const AesGcmCtrV1& val) {
+ this->AES_GCM_CTR_V1 = val;
+  __isset.AES_GCM_CTR_V1 = true;
+}
+std::ostream& operator<<(std::ostream& out, const EncryptionAlgorithm& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t EncryptionAlgorithm::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->AES_GCM_V1.read(iprot);
+ this->__isset.AES_GCM_V1 = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->AES_GCM_CTR_V1.read(iprot);
+ this->__isset.AES_GCM_CTR_V1 = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ return xfer;
+}
+
+uint32_t EncryptionAlgorithm::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("EncryptionAlgorithm");
+
+ if (this->__isset.AES_GCM_V1) {
+ xfer += oprot->writeFieldBegin("AES_GCM_V1", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->AES_GCM_V1.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.AES_GCM_CTR_V1) {
+ xfer += oprot->writeFieldBegin("AES_GCM_CTR_V1", ::apache::thrift::protocol::T_STRUCT, 2);
+ xfer += this->AES_GCM_CTR_V1.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(EncryptionAlgorithm &a, EncryptionAlgorithm &b) {
+ using ::std::swap;
+ swap(a.AES_GCM_V1, b.AES_GCM_V1);
+ swap(a.AES_GCM_CTR_V1, b.AES_GCM_CTR_V1);
+ swap(a.__isset, b.__isset);
+}
+
+EncryptionAlgorithm::EncryptionAlgorithm(const EncryptionAlgorithm& other183) {
+ AES_GCM_V1 = other183.AES_GCM_V1;
+ AES_GCM_CTR_V1 = other183.AES_GCM_CTR_V1;
+ __isset = other183.__isset;
+}
+EncryptionAlgorithm& EncryptionAlgorithm::operator=(const EncryptionAlgorithm& other184) {
+ AES_GCM_V1 = other184.AES_GCM_V1;
+ AES_GCM_CTR_V1 = other184.AES_GCM_CTR_V1;
+ __isset = other184.__isset;
+ return *this;
+}
+void EncryptionAlgorithm::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "EncryptionAlgorithm(";
+ out << "AES_GCM_V1="; (__isset.AES_GCM_V1 ? (out << to_string(AES_GCM_V1)) : (out << "<null>"));
+ out << ", " << "AES_GCM_CTR_V1="; (__isset.AES_GCM_CTR_V1 ? (out << to_string(AES_GCM_CTR_V1)) : (out << "<null>"));
+ out << ")";
+}
+
+
+FileMetaData::~FileMetaData() noexcept {
+}
+
+
+void FileMetaData::__set_version(const int32_t val) {
+ this->version = val;
+}
+
+void FileMetaData::__set_schema(const std::vector<SchemaElement> & val) {
+ this->schema = val;
+}
+
+void FileMetaData::__set_num_rows(const int64_t val) {
+ this->num_rows = val;
+}
+
+void FileMetaData::__set_row_groups(const std::vector<RowGroup> & val) {
+ this->row_groups = val;
+}
+
+void FileMetaData::__set_key_value_metadata(const std::vector<KeyValue> & val) {
+  this->key_value_metadata = val;
+  __isset.key_value_metadata = true;
+}
+
+void FileMetaData::__set_created_by(const std::string& val) {
+  this->created_by = val;
+  __isset.created_by = true;
+}
+
+void FileMetaData::__set_column_orders(const std::vector<ColumnOrder> & val) {
+  this->column_orders = val;
+  __isset.column_orders = true;
+}
+
+void FileMetaData::__set_encryption_algorithm(const EncryptionAlgorithm& val) {
+  this->encryption_algorithm = val;
+  __isset.encryption_algorithm = true;
+}
+
+void FileMetaData::__set_footer_signing_key_metadata(const std::string& val) {
+  this->footer_signing_key_metadata = val;
+  __isset.footer_signing_key_metadata = true;
+}
+std::ostream& operator<<(std::ostream& out, const FileMetaData& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t FileMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_version = false;
+ bool isset_schema = false;
+ bool isset_num_rows = false;
+ bool isset_row_groups = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_I32) {
+ xfer += iprot->readI32(this->version);
+ isset_version = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->schema.clear();
+ uint32_t _size185;
+ ::apache::thrift::protocol::TType _etype188;
+ xfer += iprot->readListBegin(_etype188, _size185);
+ this->schema.resize(_size185);
+ uint32_t _i189;
+ for (_i189 = 0; _i189 < _size185; ++_i189)
+ {
+ xfer += this->schema[_i189].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_schema = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 3:
+ if (ftype == ::apache::thrift::protocol::T_I64) {
+ xfer += iprot->readI64(this->num_rows);
+ isset_num_rows = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 4:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->row_groups.clear();
+ uint32_t _size190;
+ ::apache::thrift::protocol::TType _etype193;
+ xfer += iprot->readListBegin(_etype193, _size190);
+ this->row_groups.resize(_size190);
+ uint32_t _i194;
+ for (_i194 = 0; _i194 < _size190; ++_i194)
+ {
+ xfer += this->row_groups[_i194].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ isset_row_groups = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 5:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->key_value_metadata.clear();
+ uint32_t _size195;
+ ::apache::thrift::protocol::TType _etype198;
+ xfer += iprot->readListBegin(_etype198, _size195);
+ this->key_value_metadata.resize(_size195);
+ uint32_t _i199;
+ for (_i199 = 0; _i199 < _size195; ++_i199)
+ {
+ xfer += this->key_value_metadata[_i199].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.key_value_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 6:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readString(this->created_by);
+ this->__isset.created_by = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 7:
+ if (ftype == ::apache::thrift::protocol::T_LIST) {
+ {
+ this->column_orders.clear();
+ uint32_t _size200;
+ ::apache::thrift::protocol::TType _etype203;
+ xfer += iprot->readListBegin(_etype203, _size200);
+ this->column_orders.resize(_size200);
+ uint32_t _i204;
+ for (_i204 = 0; _i204 < _size200; ++_i204)
+ {
+ xfer += this->column_orders[_i204].read(iprot);
+ }
+ xfer += iprot->readListEnd();
+ }
+ this->__isset.column_orders = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 8:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->encryption_algorithm.read(iprot);
+ this->__isset.encryption_algorithm = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 9:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->footer_signing_key_metadata);
+ this->__isset.footer_signing_key_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_version)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_schema)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_num_rows)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ if (!isset_row_groups)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t FileMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("FileMetaData");
+
+ xfer += oprot->writeFieldBegin("version", ::apache::thrift::protocol::T_I32, 1);
+ xfer += oprot->writeI32(this->version);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("schema", ::apache::thrift::protocol::T_LIST, 2);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->schema.size()));
+    std::vector<SchemaElement>::const_iterator _iter205;
+ for (_iter205 = this->schema.begin(); _iter205 != this->schema.end(); ++_iter205)
+ {
+ xfer += (*_iter205).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("num_rows", ::apache::thrift::protocol::T_I64, 3);
+ xfer += oprot->writeI64(this->num_rows);
+ xfer += oprot->writeFieldEnd();
+
+ xfer += oprot->writeFieldBegin("row_groups", ::apache::thrift::protocol::T_LIST, 4);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->row_groups.size()));
+    std::vector<RowGroup>::const_iterator _iter206;
+ for (_iter206 = this->row_groups.begin(); _iter206 != this->row_groups.end(); ++_iter206)
+ {
+ xfer += (*_iter206).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.key_value_metadata) {
+ xfer += oprot->writeFieldBegin("key_value_metadata", ::apache::thrift::protocol::T_LIST, 5);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->key_value_metadata.size()));
+      std::vector<KeyValue>::const_iterator _iter207;
+ for (_iter207 = this->key_value_metadata.begin(); _iter207 != this->key_value_metadata.end(); ++_iter207)
+ {
+ xfer += (*_iter207).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.created_by) {
+ xfer += oprot->writeFieldBegin("created_by", ::apache::thrift::protocol::T_STRING, 6);
+ xfer += oprot->writeString(this->created_by);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.column_orders) {
+ xfer += oprot->writeFieldBegin("column_orders", ::apache::thrift::protocol::T_LIST, 7);
+ {
+ xfer += oprot->writeListBegin(::apache::thrift::protocol::T_STRUCT, static_cast<uint32_t>(this->column_orders.size()));
+      std::vector<ColumnOrder>::const_iterator _iter208;
+ for (_iter208 = this->column_orders.begin(); _iter208 != this->column_orders.end(); ++_iter208)
+ {
+ xfer += (*_iter208).write(oprot);
+ }
+ xfer += oprot->writeListEnd();
+ }
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.encryption_algorithm) {
+ xfer += oprot->writeFieldBegin("encryption_algorithm", ::apache::thrift::protocol::T_STRUCT, 8);
+ xfer += this->encryption_algorithm.write(oprot);
+ xfer += oprot->writeFieldEnd();
+ }
+ if (this->__isset.footer_signing_key_metadata) {
+ xfer += oprot->writeFieldBegin("footer_signing_key_metadata", ::apache::thrift::protocol::T_STRING, 9);
+ xfer += oprot->writeBinary(this->footer_signing_key_metadata);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(FileMetaData &a, FileMetaData &b) {
+ using ::std::swap;
+ swap(a.version, b.version);
+ swap(a.schema, b.schema);
+ swap(a.num_rows, b.num_rows);
+ swap(a.row_groups, b.row_groups);
+ swap(a.key_value_metadata, b.key_value_metadata);
+ swap(a.created_by, b.created_by);
+ swap(a.column_orders, b.column_orders);
+ swap(a.encryption_algorithm, b.encryption_algorithm);
+ swap(a.footer_signing_key_metadata, b.footer_signing_key_metadata);
+ swap(a.__isset, b.__isset);
+}
+
+FileMetaData::FileMetaData(const FileMetaData& other209) {
+ version = other209.version;
+ schema = other209.schema;
+ num_rows = other209.num_rows;
+ row_groups = other209.row_groups;
+ key_value_metadata = other209.key_value_metadata;
+ created_by = other209.created_by;
+ column_orders = other209.column_orders;
+ encryption_algorithm = other209.encryption_algorithm;
+ footer_signing_key_metadata = other209.footer_signing_key_metadata;
+ __isset = other209.__isset;
+}
+FileMetaData& FileMetaData::operator=(const FileMetaData& other210) {
+ version = other210.version;
+ schema = other210.schema;
+ num_rows = other210.num_rows;
+ row_groups = other210.row_groups;
+ key_value_metadata = other210.key_value_metadata;
+ created_by = other210.created_by;
+ column_orders = other210.column_orders;
+ encryption_algorithm = other210.encryption_algorithm;
+ footer_signing_key_metadata = other210.footer_signing_key_metadata;
+ __isset = other210.__isset;
+ return *this;
+}
+void FileMetaData::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "FileMetaData(";
+ out << "version=" << to_string(version);
+ out << ", " << "schema=" << to_string(schema);
+ out << ", " << "num_rows=" << to_string(num_rows);
+ out << ", " << "row_groups=" << to_string(row_groups);
+ out << ", " << "key_value_metadata="; (__isset.key_value_metadata ? (out << to_string(key_value_metadata)) : (out << "<null>"));
+ out << ", " << "created_by="; (__isset.created_by ? (out << to_string(created_by)) : (out << "<null>"));
+ out << ", " << "column_orders="; (__isset.column_orders ? (out << to_string(column_orders)) : (out << "<null>"));
+ out << ", " << "encryption_algorithm="; (__isset.encryption_algorithm ? (out << to_string(encryption_algorithm)) : (out << "<null>"));
+ out << ", " << "footer_signing_key_metadata="; (__isset.footer_signing_key_metadata ? (out << to_string(footer_signing_key_metadata)) : (out << "<null>"));
+ out << ")";
+}
+
+
+FileCryptoMetaData::~FileCryptoMetaData() noexcept {
+}
+
+
+void FileCryptoMetaData::__set_encryption_algorithm(const EncryptionAlgorithm& val) {
+ this->encryption_algorithm = val;
+}
+
+void FileCryptoMetaData::__set_key_metadata(const std::string& val) {
+  this->key_metadata = val;
+  __isset.key_metadata = true;
+}
+std::ostream& operator<<(std::ostream& out, const FileCryptoMetaData& obj)
+{
+ obj.printTo(out);
+ return out;
+}
+
+
+uint32_t FileCryptoMetaData::read(::apache::thrift::protocol::TProtocol* iprot) {
+
+ ::apache::thrift::protocol::TInputRecursionTracker tracker(*iprot);
+ uint32_t xfer = 0;
+ std::string fname;
+ ::apache::thrift::protocol::TType ftype;
+ int16_t fid;
+
+ xfer += iprot->readStructBegin(fname);
+
+ using ::apache::thrift::protocol::TProtocolException;
+
+ bool isset_encryption_algorithm = false;
+
+ while (true)
+ {
+ xfer += iprot->readFieldBegin(fname, ftype, fid);
+ if (ftype == ::apache::thrift::protocol::T_STOP) {
+ break;
+ }
+ switch (fid)
+ {
+ case 1:
+ if (ftype == ::apache::thrift::protocol::T_STRUCT) {
+ xfer += this->encryption_algorithm.read(iprot);
+ isset_encryption_algorithm = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ case 2:
+ if (ftype == ::apache::thrift::protocol::T_STRING) {
+ xfer += iprot->readBinary(this->key_metadata);
+ this->__isset.key_metadata = true;
+ } else {
+ xfer += iprot->skip(ftype);
+ }
+ break;
+ default:
+ xfer += iprot->skip(ftype);
+ break;
+ }
+ xfer += iprot->readFieldEnd();
+ }
+
+ xfer += iprot->readStructEnd();
+
+ if (!isset_encryption_algorithm)
+ throw TProtocolException(TProtocolException::INVALID_DATA);
+ return xfer;
+}
+
+uint32_t FileCryptoMetaData::write(::apache::thrift::protocol::TProtocol* oprot) const {
+ uint32_t xfer = 0;
+ ::apache::thrift::protocol::TOutputRecursionTracker tracker(*oprot);
+ xfer += oprot->writeStructBegin("FileCryptoMetaData");
+
+ xfer += oprot->writeFieldBegin("encryption_algorithm", ::apache::thrift::protocol::T_STRUCT, 1);
+ xfer += this->encryption_algorithm.write(oprot);
+ xfer += oprot->writeFieldEnd();
+
+ if (this->__isset.key_metadata) {
+ xfer += oprot->writeFieldBegin("key_metadata", ::apache::thrift::protocol::T_STRING, 2);
+ xfer += oprot->writeBinary(this->key_metadata);
+ xfer += oprot->writeFieldEnd();
+ }
+ xfer += oprot->writeFieldStop();
+ xfer += oprot->writeStructEnd();
+ return xfer;
+}
+
+void swap(FileCryptoMetaData &a, FileCryptoMetaData &b) {
+ using ::std::swap;
+ swap(a.encryption_algorithm, b.encryption_algorithm);
+ swap(a.key_metadata, b.key_metadata);
+ swap(a.__isset, b.__isset);
+}
+
+FileCryptoMetaData::FileCryptoMetaData(const FileCryptoMetaData& other211) {
+ encryption_algorithm = other211.encryption_algorithm;
+ key_metadata = other211.key_metadata;
+ __isset = other211.__isset;
+}
+FileCryptoMetaData& FileCryptoMetaData::operator=(const FileCryptoMetaData& other212) {
+ encryption_algorithm = other212.encryption_algorithm;
+ key_metadata = other212.key_metadata;
+ __isset = other212.__isset;
+ return *this;
+}
+void FileCryptoMetaData::printTo(std::ostream& out) const {
+ using ::apache::thrift::to_string;
+ out << "FileCryptoMetaData(";
+ out << "encryption_algorithm=" << to_string(encryption_algorithm);
+ out << ", " << "key_metadata="; (__isset.key_metadata ? (out << to_string(key_metadata)) : (out << "<null>"));
+ out << ")";
+}
+
+}} // namespace
diff --git a/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h b/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h
index c48383fa4d5..3d7edd40983 100644
--- a/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h
+++ b/contrib/libs/apache/arrow/cpp/src/generated/parquet_types.h
@@ -1,2917 +1,2917 @@
-/**
- * Autogenerated by Thrift Compiler (0.13.0)
- *
- * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
- * @generated
- */
-#ifndef parquet_TYPES_H
-#define parquet_TYPES_H
-
-#include <iosfwd>
-
-#include <thrift/Thrift.h>
-#include <thrift/TApplicationException.h>
-#include <thrift/TBase.h>
-#include <thrift/protocol/TProtocol.h>
-#include <thrift/transport/TTransport.h>
-
-#include <functional>
-#include <memory>
-
-#include "parquet/windows_compatibility.h"
-
-namespace parquet { namespace format {
-
-struct Type {
- enum type {
- BOOLEAN = 0,
- INT32 = 1,
- INT64 = 2,
- INT96 = 3,
- FLOAT = 4,
- DOUBLE = 5,
- BYTE_ARRAY = 6,
- FIXED_LEN_BYTE_ARRAY = 7
- };
-};
-
-extern const std::map<int, const char*> _Type_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const Type::type& val);
-
-std::string to_string(const Type::type& val);
-
-struct ConvertedType {
- enum type {
- UTF8 = 0,
- MAP = 1,
- MAP_KEY_VALUE = 2,
- LIST = 3,
- ENUM = 4,
- DECIMAL = 5,
- DATE = 6,
- TIME_MILLIS = 7,
- TIME_MICROS = 8,
- TIMESTAMP_MILLIS = 9,
- TIMESTAMP_MICROS = 10,
- UINT_8 = 11,
- UINT_16 = 12,
- UINT_32 = 13,
- UINT_64 = 14,
- INT_8 = 15,
- INT_16 = 16,
- INT_32 = 17,
- INT_64 = 18,
- JSON = 19,
- BSON = 20,
- INTERVAL = 21
- };
-};
-
-extern const std::map<int, const char*> _ConvertedType_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const ConvertedType::type& val);
-
-std::string to_string(const ConvertedType::type& val);
-
-struct FieldRepetitionType {
- enum type {
- REQUIRED = 0,
- OPTIONAL = 1,
- REPEATED = 2
- };
-};
-
-extern const std::map<int, const char*> _FieldRepetitionType_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const FieldRepetitionType::type& val);
-
-std::string to_string(const FieldRepetitionType::type& val);
-
-struct Encoding {
- enum type {
- PLAIN = 0,
- PLAIN_DICTIONARY = 2,
- RLE = 3,
- BIT_PACKED = 4,
- DELTA_BINARY_PACKED = 5,
- DELTA_LENGTH_BYTE_ARRAY = 6,
- DELTA_BYTE_ARRAY = 7,
- RLE_DICTIONARY = 8,
- BYTE_STREAM_SPLIT = 9
- };
-};
-
-extern const std::map<int, const char*> _Encoding_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const Encoding::type& val);
-
-std::string to_string(const Encoding::type& val);
-
-struct CompressionCodec {
- enum type {
- UNCOMPRESSED = 0,
- SNAPPY = 1,
- GZIP = 2,
- LZO = 3,
- BROTLI = 4,
- LZ4 = 5,
- ZSTD = 6,
- LZ4_RAW = 7
- };
-};
-
-extern const std::map<int, const char*> _CompressionCodec_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const CompressionCodec::type& val);
-
-std::string to_string(const CompressionCodec::type& val);
-
-struct PageType {
- enum type {
- DATA_PAGE = 0,
- INDEX_PAGE = 1,
- DICTIONARY_PAGE = 2,
- DATA_PAGE_V2 = 3
- };
-};
-
-extern const std::map<int, const char*> _PageType_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const PageType::type& val);
-
-std::string to_string(const PageType::type& val);
-
-struct BoundaryOrder {
- enum type {
- UNORDERED = 0,
- ASCENDING = 1,
- DESCENDING = 2
- };
-};
-
-extern const std::map<int, const char*> _BoundaryOrder_VALUES_TO_NAMES;
-
-std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val);
-
-std::string to_string(const BoundaryOrder::type& val);
-
-class Statistics;
-
-class StringType;
-
-class UUIDType;
-
-class MapType;
-
-class ListType;
-
-class EnumType;
-
-class DateType;
-
-class NullType;
-
-class DecimalType;
-
-class MilliSeconds;
-
-class MicroSeconds;
-
-class NanoSeconds;
-
-class TimeUnit;
-
-class TimestampType;
-
-class TimeType;
-
-class IntType;
-
-class JsonType;
-
-class BsonType;
-
-class LogicalType;
-
-class SchemaElement;
-
-class DataPageHeader;
-
-class IndexPageHeader;
-
-class DictionaryPageHeader;
-
-class DataPageHeaderV2;
-
-class SplitBlockAlgorithm;
-
-class BloomFilterAlgorithm;
-
-class XxHash;
-
-class BloomFilterHash;
-
-class Uncompressed;
-
-class BloomFilterCompression;
-
-class BloomFilterHeader;
-
-class PageHeader;
-
-class KeyValue;
-
-class SortingColumn;
-
-class PageEncodingStats;
-
-class ColumnMetaData;
-
-class EncryptionWithFooterKey;
-
-class EncryptionWithColumnKey;
-
-class ColumnCryptoMetaData;
-
-class ColumnChunk;
-
-class RowGroup;
-
-class TypeDefinedOrder;
-
-class ColumnOrder;
-
-class PageLocation;
-
-class OffsetIndex;
-
-class ColumnIndex;
-
-class AesGcmV1;
-
-class AesGcmCtrV1;
-
-class EncryptionAlgorithm;
-
-class FileMetaData;
-
-class FileCryptoMetaData;
-
-typedef struct _Statistics__isset {
- _Statistics__isset() : max(false), min(false), null_count(false), distinct_count(false), max_value(false), min_value(false) {}
- bool max :1;
- bool min :1;
- bool null_count :1;
- bool distinct_count :1;
- bool max_value :1;
- bool min_value :1;
-} _Statistics__isset;
-
-class Statistics : public virtual ::apache::thrift::TBase {
- public:
-
- Statistics(const Statistics&);
- Statistics& operator=(const Statistics&);
- Statistics() : max(), min(), null_count(0), distinct_count(0), max_value(), min_value() {
- }
-
- virtual ~Statistics() noexcept;
- std::string max;
- std::string min;
- int64_t null_count;
- int64_t distinct_count;
- std::string max_value;
- std::string min_value;
-
- _Statistics__isset __isset;
-
- void __set_max(const std::string& val);
-
- void __set_min(const std::string& val);
-
- void __set_null_count(const int64_t val);
-
- void __set_distinct_count(const int64_t val);
-
- void __set_max_value(const std::string& val);
-
- void __set_min_value(const std::string& val);
-
- bool operator == (const Statistics & rhs) const
- {
- if (__isset.max != rhs.__isset.max)
- return false;
- else if (__isset.max && !(max == rhs.max))
- return false;
- if (__isset.min != rhs.__isset.min)
- return false;
- else if (__isset.min && !(min == rhs.min))
- return false;
- if (__isset.null_count != rhs.__isset.null_count)
- return false;
- else if (__isset.null_count && !(null_count == rhs.null_count))
- return false;
- if (__isset.distinct_count != rhs.__isset.distinct_count)
- return false;
- else if (__isset.distinct_count && !(distinct_count == rhs.distinct_count))
- return false;
- if (__isset.max_value != rhs.__isset.max_value)
- return false;
- else if (__isset.max_value && !(max_value == rhs.max_value))
- return false;
- if (__isset.min_value != rhs.__isset.min_value)
- return false;
- else if (__isset.min_value && !(min_value == rhs.min_value))
- return false;
- return true;
- }
- bool operator != (const Statistics &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const Statistics & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(Statistics &a, Statistics &b);
-
-std::ostream& operator<<(std::ostream& out, const Statistics& obj);
-
-
-class StringType : public virtual ::apache::thrift::TBase {
- public:
-
- StringType(const StringType&);
- StringType& operator=(const StringType&);
- StringType() {
- }
-
- virtual ~StringType() noexcept;
-
- bool operator == (const StringType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const StringType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const StringType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(StringType &a, StringType &b);
-
-std::ostream& operator<<(std::ostream& out, const StringType& obj);
-
-
-class UUIDType : public virtual ::apache::thrift::TBase {
- public:
-
- UUIDType(const UUIDType&);
- UUIDType& operator=(const UUIDType&);
- UUIDType() {
- }
-
- virtual ~UUIDType() noexcept;
-
- bool operator == (const UUIDType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const UUIDType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const UUIDType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(UUIDType &a, UUIDType &b);
-
-std::ostream& operator<<(std::ostream& out, const UUIDType& obj);
-
-
-class MapType : public virtual ::apache::thrift::TBase {
- public:
-
- MapType(const MapType&);
- MapType& operator=(const MapType&);
- MapType() {
- }
-
- virtual ~MapType() noexcept;
-
- bool operator == (const MapType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const MapType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const MapType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(MapType &a, MapType &b);
-
-std::ostream& operator<<(std::ostream& out, const MapType& obj);
-
-
-class ListType : public virtual ::apache::thrift::TBase {
- public:
-
- ListType(const ListType&);
- ListType& operator=(const ListType&);
- ListType() {
- }
-
- virtual ~ListType() noexcept;
-
- bool operator == (const ListType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const ListType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const ListType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(ListType &a, ListType &b);
-
-std::ostream& operator<<(std::ostream& out, const ListType& obj);
-
-
-class EnumType : public virtual ::apache::thrift::TBase {
- public:
-
- EnumType(const EnumType&);
- EnumType& operator=(const EnumType&);
- EnumType() {
- }
-
- virtual ~EnumType() noexcept;
-
- bool operator == (const EnumType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const EnumType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const EnumType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(EnumType &a, EnumType &b);
-
-std::ostream& operator<<(std::ostream& out, const EnumType& obj);
-
-
-class DateType : public virtual ::apache::thrift::TBase {
- public:
-
- DateType(const DateType&);
- DateType& operator=(const DateType&);
- DateType() {
- }
-
- virtual ~DateType() noexcept;
-
- bool operator == (const DateType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const DateType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const DateType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(DateType &a, DateType &b);
-
-std::ostream& operator<<(std::ostream& out, const DateType& obj);
-
-
-class NullType : public virtual ::apache::thrift::TBase {
- public:
-
- NullType(const NullType&);
- NullType& operator=(const NullType&);
- NullType() {
- }
-
- virtual ~NullType() noexcept;
-
- bool operator == (const NullType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const NullType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const NullType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(NullType &a, NullType &b);
-
-std::ostream& operator<<(std::ostream& out, const NullType& obj);
-
-
-class DecimalType : public virtual ::apache::thrift::TBase {
- public:
-
- DecimalType(const DecimalType&);
- DecimalType& operator=(const DecimalType&);
- DecimalType() : scale(0), precision(0) {
- }
-
- virtual ~DecimalType() noexcept;
- int32_t scale;
- int32_t precision;
-
- void __set_scale(const int32_t val);
-
- void __set_precision(const int32_t val);
-
- bool operator == (const DecimalType & rhs) const
- {
- if (!(scale == rhs.scale))
- return false;
- if (!(precision == rhs.precision))
- return false;
- return true;
- }
- bool operator != (const DecimalType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const DecimalType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(DecimalType &a, DecimalType &b);
-
-std::ostream& operator<<(std::ostream& out, const DecimalType& obj);
-
-
-class MilliSeconds : public virtual ::apache::thrift::TBase {
- public:
-
- MilliSeconds(const MilliSeconds&);
- MilliSeconds& operator=(const MilliSeconds&);
- MilliSeconds() {
- }
-
- virtual ~MilliSeconds() noexcept;
-
- bool operator == (const MilliSeconds & /* rhs */) const
- {
- return true;
- }
- bool operator != (const MilliSeconds &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const MilliSeconds & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(MilliSeconds &a, MilliSeconds &b);
-
-std::ostream& operator<<(std::ostream& out, const MilliSeconds& obj);
-
-
-class MicroSeconds : public virtual ::apache::thrift::TBase {
- public:
-
- MicroSeconds(const MicroSeconds&);
- MicroSeconds& operator=(const MicroSeconds&);
- MicroSeconds() {
- }
-
- virtual ~MicroSeconds() noexcept;
-
- bool operator == (const MicroSeconds & /* rhs */) const
- {
- return true;
- }
- bool operator != (const MicroSeconds &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const MicroSeconds & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(MicroSeconds &a, MicroSeconds &b);
-
-std::ostream& operator<<(std::ostream& out, const MicroSeconds& obj);
-
-
-class NanoSeconds : public virtual ::apache::thrift::TBase {
- public:
-
- NanoSeconds(const NanoSeconds&);
- NanoSeconds& operator=(const NanoSeconds&);
- NanoSeconds() {
- }
-
- virtual ~NanoSeconds() noexcept;
-
- bool operator == (const NanoSeconds & /* rhs */) const
- {
- return true;
- }
- bool operator != (const NanoSeconds &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const NanoSeconds & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(NanoSeconds &a, NanoSeconds &b);
-
-std::ostream& operator<<(std::ostream& out, const NanoSeconds& obj);
-
-typedef struct _TimeUnit__isset {
- _TimeUnit__isset() : MILLIS(false), MICROS(false), NANOS(false) {}
- bool MILLIS :1;
- bool MICROS :1;
- bool NANOS :1;
-} _TimeUnit__isset;
-
-class TimeUnit : public virtual ::apache::thrift::TBase {
- public:
-
- TimeUnit(const TimeUnit&);
- TimeUnit& operator=(const TimeUnit&);
- TimeUnit() {
- }
-
- virtual ~TimeUnit() noexcept;
- MilliSeconds MILLIS;
- MicroSeconds MICROS;
- NanoSeconds NANOS;
-
- _TimeUnit__isset __isset;
-
- void __set_MILLIS(const MilliSeconds& val);
-
- void __set_MICROS(const MicroSeconds& val);
-
- void __set_NANOS(const NanoSeconds& val);
-
- bool operator == (const TimeUnit & rhs) const
- {
- if (__isset.MILLIS != rhs.__isset.MILLIS)
- return false;
- else if (__isset.MILLIS && !(MILLIS == rhs.MILLIS))
- return false;
- if (__isset.MICROS != rhs.__isset.MICROS)
- return false;
- else if (__isset.MICROS && !(MICROS == rhs.MICROS))
- return false;
- if (__isset.NANOS != rhs.__isset.NANOS)
- return false;
- else if (__isset.NANOS && !(NANOS == rhs.NANOS))
- return false;
- return true;
- }
- bool operator != (const TimeUnit &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const TimeUnit & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(TimeUnit &a, TimeUnit &b);
-
-std::ostream& operator<<(std::ostream& out, const TimeUnit& obj);
-
-
-class TimestampType : public virtual ::apache::thrift::TBase {
- public:
-
- TimestampType(const TimestampType&);
- TimestampType& operator=(const TimestampType&);
- TimestampType() : isAdjustedToUTC(0) {
- }
-
- virtual ~TimestampType() noexcept;
- bool isAdjustedToUTC;
- TimeUnit unit;
-
- void __set_isAdjustedToUTC(const bool val);
-
- void __set_unit(const TimeUnit& val);
-
- bool operator == (const TimestampType & rhs) const
- {
- if (!(isAdjustedToUTC == rhs.isAdjustedToUTC))
- return false;
- if (!(unit == rhs.unit))
- return false;
- return true;
- }
- bool operator != (const TimestampType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const TimestampType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(TimestampType &a, TimestampType &b);
-
-std::ostream& operator<<(std::ostream& out, const TimestampType& obj);
-
-
-class TimeType : public virtual ::apache::thrift::TBase {
- public:
-
- TimeType(const TimeType&);
- TimeType& operator=(const TimeType&);
- TimeType() : isAdjustedToUTC(0) {
- }
-
- virtual ~TimeType() noexcept;
- bool isAdjustedToUTC;
- TimeUnit unit;
-
- void __set_isAdjustedToUTC(const bool val);
-
- void __set_unit(const TimeUnit& val);
-
- bool operator == (const TimeType & rhs) const
- {
- if (!(isAdjustedToUTC == rhs.isAdjustedToUTC))
- return false;
- if (!(unit == rhs.unit))
- return false;
- return true;
- }
- bool operator != (const TimeType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const TimeType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(TimeType &a, TimeType &b);
-
-std::ostream& operator<<(std::ostream& out, const TimeType& obj);
-
-
-class IntType : public virtual ::apache::thrift::TBase {
- public:
-
- IntType(const IntType&);
- IntType& operator=(const IntType&);
- IntType() : bitWidth(0), isSigned(0) {
- }
-
- virtual ~IntType() noexcept;
- int8_t bitWidth;
- bool isSigned;
-
- void __set_bitWidth(const int8_t val);
-
- void __set_isSigned(const bool val);
-
- bool operator == (const IntType & rhs) const
- {
- if (!(bitWidth == rhs.bitWidth))
- return false;
- if (!(isSigned == rhs.isSigned))
- return false;
- return true;
- }
- bool operator != (const IntType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const IntType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(IntType &a, IntType &b);
-
-std::ostream& operator<<(std::ostream& out, const IntType& obj);
-
-
-class JsonType : public virtual ::apache::thrift::TBase {
- public:
-
- JsonType(const JsonType&);
- JsonType& operator=(const JsonType&);
- JsonType() {
- }
-
- virtual ~JsonType() noexcept;
-
- bool operator == (const JsonType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const JsonType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const JsonType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(JsonType &a, JsonType &b);
-
-std::ostream& operator<<(std::ostream& out, const JsonType& obj);
-
-
-class BsonType : public virtual ::apache::thrift::TBase {
- public:
-
- BsonType(const BsonType&);
- BsonType& operator=(const BsonType&);
- BsonType() {
- }
-
- virtual ~BsonType() noexcept;
-
- bool operator == (const BsonType & /* rhs */) const
- {
- return true;
- }
- bool operator != (const BsonType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const BsonType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(BsonType &a, BsonType &b);
-
-std::ostream& operator<<(std::ostream& out, const BsonType& obj);
-
-typedef struct _LogicalType__isset {
- _LogicalType__isset() : STRING(false), MAP(false), LIST(false), ENUM(false), DECIMAL(false), DATE(false), TIME(false), TIMESTAMP(false), INTEGER(false), UNKNOWN(false), JSON(false), BSON(false), UUID(false) {}
- bool STRING :1;
- bool MAP :1;
- bool LIST :1;
- bool ENUM :1;
- bool DECIMAL :1;
- bool DATE :1;
- bool TIME :1;
- bool TIMESTAMP :1;
- bool INTEGER :1;
- bool UNKNOWN :1;
- bool JSON :1;
- bool BSON :1;
- bool UUID :1;
-} _LogicalType__isset;
-
-class LogicalType : public virtual ::apache::thrift::TBase {
- public:
-
- LogicalType(const LogicalType&);
- LogicalType& operator=(const LogicalType&);
- LogicalType() {
- }
-
- virtual ~LogicalType() noexcept;
- StringType STRING;
- MapType MAP;
- ListType LIST;
- EnumType ENUM;
- DecimalType DECIMAL;
- DateType DATE;
- TimeType TIME;
- TimestampType TIMESTAMP;
- IntType INTEGER;
- NullType UNKNOWN;
- JsonType JSON;
- BsonType BSON;
- UUIDType UUID;
-
- _LogicalType__isset __isset;
-
- void __set_STRING(const StringType& val);
-
- void __set_MAP(const MapType& val);
-
- void __set_LIST(const ListType& val);
-
- void __set_ENUM(const EnumType& val);
-
- void __set_DECIMAL(const DecimalType& val);
-
- void __set_DATE(const DateType& val);
-
- void __set_TIME(const TimeType& val);
-
- void __set_TIMESTAMP(const TimestampType& val);
-
- void __set_INTEGER(const IntType& val);
-
- void __set_UNKNOWN(const NullType& val);
-
- void __set_JSON(const JsonType& val);
-
- void __set_BSON(const BsonType& val);
-
- void __set_UUID(const UUIDType& val);
-
- bool operator == (const LogicalType & rhs) const
- {
- if (__isset.STRING != rhs.__isset.STRING)
- return false;
- else if (__isset.STRING && !(STRING == rhs.STRING))
- return false;
- if (__isset.MAP != rhs.__isset.MAP)
- return false;
- else if (__isset.MAP && !(MAP == rhs.MAP))
- return false;
- if (__isset.LIST != rhs.__isset.LIST)
- return false;
- else if (__isset.LIST && !(LIST == rhs.LIST))
- return false;
- if (__isset.ENUM != rhs.__isset.ENUM)
- return false;
- else if (__isset.ENUM && !(ENUM == rhs.ENUM))
- return false;
- if (__isset.DECIMAL != rhs.__isset.DECIMAL)
- return false;
- else if (__isset.DECIMAL && !(DECIMAL == rhs.DECIMAL))
- return false;
- if (__isset.DATE != rhs.__isset.DATE)
- return false;
- else if (__isset.DATE && !(DATE == rhs.DATE))
- return false;
- if (__isset.TIME != rhs.__isset.TIME)
- return false;
- else if (__isset.TIME && !(TIME == rhs.TIME))
- return false;
- if (__isset.TIMESTAMP != rhs.__isset.TIMESTAMP)
- return false;
- else if (__isset.TIMESTAMP && !(TIMESTAMP == rhs.TIMESTAMP))
- return false;
- if (__isset.INTEGER != rhs.__isset.INTEGER)
- return false;
- else if (__isset.INTEGER && !(INTEGER == rhs.INTEGER))
- return false;
- if (__isset.UNKNOWN != rhs.__isset.UNKNOWN)
- return false;
- else if (__isset.UNKNOWN && !(UNKNOWN == rhs.UNKNOWN))
- return false;
- if (__isset.JSON != rhs.__isset.JSON)
- return false;
- else if (__isset.JSON && !(JSON == rhs.JSON))
- return false;
- if (__isset.BSON != rhs.__isset.BSON)
- return false;
- else if (__isset.BSON && !(BSON == rhs.BSON))
- return false;
- if (__isset.UUID != rhs.__isset.UUID)
- return false;
- else if (__isset.UUID && !(UUID == rhs.UUID))
- return false;
- return true;
- }
- bool operator != (const LogicalType &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const LogicalType & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(LogicalType &a, LogicalType &b);
-
-std::ostream& operator<<(std::ostream& out, const LogicalType& obj);
-
-typedef struct _SchemaElement__isset {
- _SchemaElement__isset() : type(false), type_length(false), repetition_type(false), num_children(false), converted_type(false), scale(false), precision(false), field_id(false), logicalType(false) {}
- bool type :1;
- bool type_length :1;
- bool repetition_type :1;
- bool num_children :1;
- bool converted_type :1;
- bool scale :1;
- bool precision :1;
- bool field_id :1;
- bool logicalType :1;
-} _SchemaElement__isset;
-
-class SchemaElement : public virtual ::apache::thrift::TBase {
- public:
-
- SchemaElement(const SchemaElement&);
- SchemaElement& operator=(const SchemaElement&);
- SchemaElement() : type((Type::type)0), type_length(0), repetition_type((FieldRepetitionType::type)0), name(), num_children(0), converted_type((ConvertedType::type)0), scale(0), precision(0), field_id(0) {
- }
-
- virtual ~SchemaElement() noexcept;
- Type::type type;
- int32_t type_length;
- FieldRepetitionType::type repetition_type;
- std::string name;
- int32_t num_children;
- ConvertedType::type converted_type;
- int32_t scale;
- int32_t precision;
- int32_t field_id;
- LogicalType logicalType;
-
- _SchemaElement__isset __isset;
-
- void __set_type(const Type::type val);
-
- void __set_type_length(const int32_t val);
-
- void __set_repetition_type(const FieldRepetitionType::type val);
-
- void __set_name(const std::string& val);
-
- void __set_num_children(const int32_t val);
-
- void __set_converted_type(const ConvertedType::type val);
-
- void __set_scale(const int32_t val);
-
- void __set_precision(const int32_t val);
-
- void __set_field_id(const int32_t val);
-
- void __set_logicalType(const LogicalType& val);
-
- bool operator == (const SchemaElement & rhs) const
- {
- if (__isset.type != rhs.__isset.type)
- return false;
- else if (__isset.type && !(type == rhs.type))
- return false;
- if (__isset.type_length != rhs.__isset.type_length)
- return false;
- else if (__isset.type_length && !(type_length == rhs.type_length))
- return false;
- if (__isset.repetition_type != rhs.__isset.repetition_type)
- return false;
- else if (__isset.repetition_type && !(repetition_type == rhs.repetition_type))
- return false;
- if (!(name == rhs.name))
- return false;
- if (__isset.num_children != rhs.__isset.num_children)
- return false;
- else if (__isset.num_children && !(num_children == rhs.num_children))
- return false;
- if (__isset.converted_type != rhs.__isset.converted_type)
- return false;
- else if (__isset.converted_type && !(converted_type == rhs.converted_type))
- return false;
- if (__isset.scale != rhs.__isset.scale)
- return false;
- else if (__isset.scale && !(scale == rhs.scale))
- return false;
- if (__isset.precision != rhs.__isset.precision)
- return false;
- else if (__isset.precision && !(precision == rhs.precision))
- return false;
- if (__isset.field_id != rhs.__isset.field_id)
- return false;
- else if (__isset.field_id && !(field_id == rhs.field_id))
- return false;
- if (__isset.logicalType != rhs.__isset.logicalType)
- return false;
- else if (__isset.logicalType && !(logicalType == rhs.logicalType))
- return false;
- return true;
- }
- bool operator != (const SchemaElement &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const SchemaElement & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(SchemaElement &a, SchemaElement &b);
-
-std::ostream& operator<<(std::ostream& out, const SchemaElement& obj);
-
-typedef struct _DataPageHeader__isset {
- _DataPageHeader__isset() : statistics(false) {}
- bool statistics :1;
-} _DataPageHeader__isset;
-
-class DataPageHeader : public virtual ::apache::thrift::TBase {
- public:
-
- DataPageHeader(const DataPageHeader&);
- DataPageHeader& operator=(const DataPageHeader&);
- DataPageHeader() : num_values(0), encoding((Encoding::type)0), definition_level_encoding((Encoding::type)0), repetition_level_encoding((Encoding::type)0) {
- }
-
- virtual ~DataPageHeader() noexcept;
- int32_t num_values;
- Encoding::type encoding;
- Encoding::type definition_level_encoding;
- Encoding::type repetition_level_encoding;
- Statistics statistics;
-
- _DataPageHeader__isset __isset;
-
- void __set_num_values(const int32_t val);
-
- void __set_encoding(const Encoding::type val);
-
- void __set_definition_level_encoding(const Encoding::type val);
-
- void __set_repetition_level_encoding(const Encoding::type val);
-
- void __set_statistics(const Statistics& val);
-
- bool operator == (const DataPageHeader & rhs) const
- {
- if (!(num_values == rhs.num_values))
- return false;
- if (!(encoding == rhs.encoding))
- return false;
- if (!(definition_level_encoding == rhs.definition_level_encoding))
- return false;
- if (!(repetition_level_encoding == rhs.repetition_level_encoding))
- return false;
- if (__isset.statistics != rhs.__isset.statistics)
- return false;
- else if (__isset.statistics && !(statistics == rhs.statistics))
- return false;
- return true;
- }
- bool operator != (const DataPageHeader &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const DataPageHeader & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(DataPageHeader &a, DataPageHeader &b);
-
-std::ostream& operator<<(std::ostream& out, const DataPageHeader& obj);
-
-
-class IndexPageHeader : public virtual ::apache::thrift::TBase {
- public:
-
- IndexPageHeader(const IndexPageHeader&);
- IndexPageHeader& operator=(const IndexPageHeader&);
- IndexPageHeader() {
- }
-
- virtual ~IndexPageHeader() noexcept;
-
- bool operator == (const IndexPageHeader & /* rhs */) const
- {
- return true;
- }
- bool operator != (const IndexPageHeader &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const IndexPageHeader & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(IndexPageHeader &a, IndexPageHeader &b);
-
-std::ostream& operator<<(std::ostream& out, const IndexPageHeader& obj);
-
-typedef struct _DictionaryPageHeader__isset {
- _DictionaryPageHeader__isset() : is_sorted(false) {}
- bool is_sorted :1;
-} _DictionaryPageHeader__isset;
-
-class DictionaryPageHeader : public virtual ::apache::thrift::TBase {
- public:
-
- DictionaryPageHeader(const DictionaryPageHeader&);
- DictionaryPageHeader& operator=(const DictionaryPageHeader&);
- DictionaryPageHeader() : num_values(0), encoding((Encoding::type)0), is_sorted(0) {
- }
-
- virtual ~DictionaryPageHeader() noexcept;
- int32_t num_values;
- Encoding::type encoding;
- bool is_sorted;
-
- _DictionaryPageHeader__isset __isset;
-
- void __set_num_values(const int32_t val);
-
- void __set_encoding(const Encoding::type val);
-
- void __set_is_sorted(const bool val);
-
- bool operator == (const DictionaryPageHeader & rhs) const
- {
- if (!(num_values == rhs.num_values))
- return false;
- if (!(encoding == rhs.encoding))
- return false;
- if (__isset.is_sorted != rhs.__isset.is_sorted)
- return false;
- else if (__isset.is_sorted && !(is_sorted == rhs.is_sorted))
- return false;
- return true;
- }
- bool operator != (const DictionaryPageHeader &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const DictionaryPageHeader & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(DictionaryPageHeader &a, DictionaryPageHeader &b);
-
-std::ostream& operator<<(std::ostream& out, const DictionaryPageHeader& obj);
-
-typedef struct _DataPageHeaderV2__isset {
- _DataPageHeaderV2__isset() : is_compressed(true), statistics(false) {}
- bool is_compressed :1;
- bool statistics :1;
-} _DataPageHeaderV2__isset;
-
-class DataPageHeaderV2 : public virtual ::apache::thrift::TBase {
- public:
-
- DataPageHeaderV2(const DataPageHeaderV2&);
- DataPageHeaderV2& operator=(const DataPageHeaderV2&);
- DataPageHeaderV2() : num_values(0), num_nulls(0), num_rows(0), encoding((Encoding::type)0), definition_levels_byte_length(0), repetition_levels_byte_length(0), is_compressed(true) {
- }
-
- virtual ~DataPageHeaderV2() noexcept;
- int32_t num_values;
- int32_t num_nulls;
- int32_t num_rows;
- Encoding::type encoding;
- int32_t definition_levels_byte_length;
- int32_t repetition_levels_byte_length;
- bool is_compressed;
- Statistics statistics;
-
- _DataPageHeaderV2__isset __isset;
-
- void __set_num_values(const int32_t val);
-
- void __set_num_nulls(const int32_t val);
-
- void __set_num_rows(const int32_t val);
-
- void __set_encoding(const Encoding::type val);
-
- void __set_definition_levels_byte_length(const int32_t val);
-
- void __set_repetition_levels_byte_length(const int32_t val);
-
- void __set_is_compressed(const bool val);
-
- void __set_statistics(const Statistics& val);
-
- bool operator == (const DataPageHeaderV2 & rhs) const
- {
- if (!(num_values == rhs.num_values))
- return false;
- if (!(num_nulls == rhs.num_nulls))
- return false;
- if (!(num_rows == rhs.num_rows))
- return false;
- if (!(encoding == rhs.encoding))
- return false;
- if (!(definition_levels_byte_length == rhs.definition_levels_byte_length))
- return false;
- if (!(repetition_levels_byte_length == rhs.repetition_levels_byte_length))
- return false;
- if (__isset.is_compressed != rhs.__isset.is_compressed)
- return false;
- else if (__isset.is_compressed && !(is_compressed == rhs.is_compressed))
- return false;
- if (__isset.statistics != rhs.__isset.statistics)
- return false;
- else if (__isset.statistics && !(statistics == rhs.statistics))
- return false;
- return true;
- }
- bool operator != (const DataPageHeaderV2 &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const DataPageHeaderV2 & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b);
-
-std::ostream& operator<<(std::ostream& out, const DataPageHeaderV2& obj);
-
-
-class SplitBlockAlgorithm : public virtual ::apache::thrift::TBase {
- public:
-
- SplitBlockAlgorithm(const SplitBlockAlgorithm&);
- SplitBlockAlgorithm& operator=(const SplitBlockAlgorithm&);
- SplitBlockAlgorithm() {
- }
-
- virtual ~SplitBlockAlgorithm() noexcept;
-
- bool operator == (const SplitBlockAlgorithm & /* rhs */) const
- {
- return true;
- }
- bool operator != (const SplitBlockAlgorithm &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const SplitBlockAlgorithm & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(SplitBlockAlgorithm &a, SplitBlockAlgorithm &b);
-
-std::ostream& operator<<(std::ostream& out, const SplitBlockAlgorithm& obj);
-
-typedef struct _BloomFilterAlgorithm__isset {
- _BloomFilterAlgorithm__isset() : BLOCK(false) {}
- bool BLOCK :1;
-} _BloomFilterAlgorithm__isset;
-
-class BloomFilterAlgorithm : public virtual ::apache::thrift::TBase {
- public:
-
- BloomFilterAlgorithm(const BloomFilterAlgorithm&);
- BloomFilterAlgorithm& operator=(const BloomFilterAlgorithm&);
- BloomFilterAlgorithm() {
- }
-
- virtual ~BloomFilterAlgorithm() noexcept;
- SplitBlockAlgorithm BLOCK;
-
- _BloomFilterAlgorithm__isset __isset;
-
- void __set_BLOCK(const SplitBlockAlgorithm& val);
-
- bool operator == (const BloomFilterAlgorithm & rhs) const
- {
- if (__isset.BLOCK != rhs.__isset.BLOCK)
- return false;
- else if (__isset.BLOCK && !(BLOCK == rhs.BLOCK))
- return false;
- return true;
- }
- bool operator != (const BloomFilterAlgorithm &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const BloomFilterAlgorithm & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(BloomFilterAlgorithm &a, BloomFilterAlgorithm &b);
-
-std::ostream& operator<<(std::ostream& out, const BloomFilterAlgorithm& obj);
-
-
-class XxHash : public virtual ::apache::thrift::TBase {
- public:
-
- XxHash(const XxHash&);
- XxHash& operator=(const XxHash&);
- XxHash() {
- }
-
- virtual ~XxHash() noexcept;
-
- bool operator == (const XxHash & /* rhs */) const
- {
- return true;
- }
- bool operator != (const XxHash &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const XxHash & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(XxHash &a, XxHash &b);
-
-std::ostream& operator<<(std::ostream& out, const XxHash& obj);
-
-typedef struct _BloomFilterHash__isset {
- _BloomFilterHash__isset() : XXHASH(false) {}
- bool XXHASH :1;
-} _BloomFilterHash__isset;
-
-class BloomFilterHash : public virtual ::apache::thrift::TBase {
- public:
-
- BloomFilterHash(const BloomFilterHash&);
- BloomFilterHash& operator=(const BloomFilterHash&);
- BloomFilterHash() {
- }
-
- virtual ~BloomFilterHash() noexcept;
- XxHash XXHASH;
-
- _BloomFilterHash__isset __isset;
-
- void __set_XXHASH(const XxHash& val);
-
- bool operator == (const BloomFilterHash & rhs) const
- {
- if (__isset.XXHASH != rhs.__isset.XXHASH)
- return false;
- else if (__isset.XXHASH && !(XXHASH == rhs.XXHASH))
- return false;
- return true;
- }
- bool operator != (const BloomFilterHash &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const BloomFilterHash & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(BloomFilterHash &a, BloomFilterHash &b);
-
-std::ostream& operator<<(std::ostream& out, const BloomFilterHash& obj);
-
-
-class Uncompressed : public virtual ::apache::thrift::TBase {
- public:
-
- Uncompressed(const Uncompressed&);
- Uncompressed& operator=(const Uncompressed&);
- Uncompressed() {
- }
-
- virtual ~Uncompressed() noexcept;
-
- bool operator == (const Uncompressed & /* rhs */) const
- {
- return true;
- }
- bool operator != (const Uncompressed &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const Uncompressed & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(Uncompressed &a, Uncompressed &b);
-
-std::ostream& operator<<(std::ostream& out, const Uncompressed& obj);
-
-typedef struct _BloomFilterCompression__isset {
- _BloomFilterCompression__isset() : UNCOMPRESSED(false) {}
- bool UNCOMPRESSED :1;
-} _BloomFilterCompression__isset;
-
-class BloomFilterCompression : public virtual ::apache::thrift::TBase {
- public:
-
- BloomFilterCompression(const BloomFilterCompression&);
- BloomFilterCompression& operator=(const BloomFilterCompression&);
- BloomFilterCompression() {
- }
-
- virtual ~BloomFilterCompression() noexcept;
- Uncompressed UNCOMPRESSED;
-
- _BloomFilterCompression__isset __isset;
-
- void __set_UNCOMPRESSED(const Uncompressed& val);
-
- bool operator == (const BloomFilterCompression & rhs) const
- {
- if (__isset.UNCOMPRESSED != rhs.__isset.UNCOMPRESSED)
- return false;
- else if (__isset.UNCOMPRESSED && !(UNCOMPRESSED == rhs.UNCOMPRESSED))
- return false;
- return true;
- }
- bool operator != (const BloomFilterCompression &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const BloomFilterCompression & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(BloomFilterCompression &a, BloomFilterCompression &b);
-
-std::ostream& operator<<(std::ostream& out, const BloomFilterCompression& obj);
-
-
-class BloomFilterHeader : public virtual ::apache::thrift::TBase {
- public:
-
- BloomFilterHeader(const BloomFilterHeader&);
- BloomFilterHeader& operator=(const BloomFilterHeader&);
- BloomFilterHeader() : numBytes(0) {
- }
-
- virtual ~BloomFilterHeader() noexcept;
- int32_t numBytes;
- BloomFilterAlgorithm algorithm;
- BloomFilterHash hash;
- BloomFilterCompression compression;
-
- void __set_numBytes(const int32_t val);
-
- void __set_algorithm(const BloomFilterAlgorithm& val);
-
- void __set_hash(const BloomFilterHash& val);
-
- void __set_compression(const BloomFilterCompression& val);
-
- bool operator == (const BloomFilterHeader & rhs) const
- {
- if (!(numBytes == rhs.numBytes))
- return false;
- if (!(algorithm == rhs.algorithm))
- return false;
- if (!(hash == rhs.hash))
- return false;
- if (!(compression == rhs.compression))
- return false;
- return true;
- }
- bool operator != (const BloomFilterHeader &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const BloomFilterHeader & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(BloomFilterHeader &a, BloomFilterHeader &b);
-
-std::ostream& operator<<(std::ostream& out, const BloomFilterHeader& obj);
-
-typedef struct _PageHeader__isset {
- _PageHeader__isset() : crc(false), data_page_header(false), index_page_header(false), dictionary_page_header(false), data_page_header_v2(false) {}
- bool crc :1;
- bool data_page_header :1;
- bool index_page_header :1;
- bool dictionary_page_header :1;
- bool data_page_header_v2 :1;
-} _PageHeader__isset;
-
-class PageHeader : public virtual ::apache::thrift::TBase {
- public:
-
- PageHeader(const PageHeader&);
- PageHeader& operator=(const PageHeader&);
- PageHeader() : type((PageType::type)0), uncompressed_page_size(0), compressed_page_size(0), crc(0) {
- }
-
- virtual ~PageHeader() noexcept;
- PageType::type type;
- int32_t uncompressed_page_size;
- int32_t compressed_page_size;
- int32_t crc;
- DataPageHeader data_page_header;
- IndexPageHeader index_page_header;
- DictionaryPageHeader dictionary_page_header;
- DataPageHeaderV2 data_page_header_v2;
-
- _PageHeader__isset __isset;
-
- void __set_type(const PageType::type val);
-
- void __set_uncompressed_page_size(const int32_t val);
-
- void __set_compressed_page_size(const int32_t val);
-
- void __set_crc(const int32_t val);
-
- void __set_data_page_header(const DataPageHeader& val);
-
- void __set_index_page_header(const IndexPageHeader& val);
-
- void __set_dictionary_page_header(const DictionaryPageHeader& val);
-
- void __set_data_page_header_v2(const DataPageHeaderV2& val);
-
- bool operator == (const PageHeader & rhs) const
- {
- if (!(type == rhs.type))
- return false;
- if (!(uncompressed_page_size == rhs.uncompressed_page_size))
- return false;
- if (!(compressed_page_size == rhs.compressed_page_size))
- return false;
- if (__isset.crc != rhs.__isset.crc)
- return false;
- else if (__isset.crc && !(crc == rhs.crc))
- return false;
- if (__isset.data_page_header != rhs.__isset.data_page_header)
- return false;
- else if (__isset.data_page_header && !(data_page_header == rhs.data_page_header))
- return false;
- if (__isset.index_page_header != rhs.__isset.index_page_header)
- return false;
- else if (__isset.index_page_header && !(index_page_header == rhs.index_page_header))
- return false;
- if (__isset.dictionary_page_header != rhs.__isset.dictionary_page_header)
- return false;
- else if (__isset.dictionary_page_header && !(dictionary_page_header == rhs.dictionary_page_header))
- return false;
- if (__isset.data_page_header_v2 != rhs.__isset.data_page_header_v2)
- return false;
- else if (__isset.data_page_header_v2 && !(data_page_header_v2 == rhs.data_page_header_v2))
- return false;
- return true;
- }
- bool operator != (const PageHeader &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const PageHeader & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(PageHeader &a, PageHeader &b);
-
-std::ostream& operator<<(std::ostream& out, const PageHeader& obj);
-
-typedef struct _KeyValue__isset {
- _KeyValue__isset() : value(false) {}
- bool value :1;
-} _KeyValue__isset;
-
-class KeyValue : public virtual ::apache::thrift::TBase {
- public:
-
- KeyValue(const KeyValue&);
- KeyValue& operator=(const KeyValue&);
- KeyValue() : key(), value() {
- }
-
- virtual ~KeyValue() noexcept;
- std::string key;
- std::string value;
-
- _KeyValue__isset __isset;
-
- void __set_key(const std::string& val);
-
- void __set_value(const std::string& val);
-
- bool operator == (const KeyValue & rhs) const
- {
- if (!(key == rhs.key))
- return false;
- if (__isset.value != rhs.__isset.value)
- return false;
- else if (__isset.value && !(value == rhs.value))
- return false;
- return true;
- }
- bool operator != (const KeyValue &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const KeyValue & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(KeyValue &a, KeyValue &b);
-
-std::ostream& operator<<(std::ostream& out, const KeyValue& obj);
-
-
-class SortingColumn : public virtual ::apache::thrift::TBase {
- public:
-
- SortingColumn(const SortingColumn&);
- SortingColumn& operator=(const SortingColumn&);
- SortingColumn() : column_idx(0), descending(0), nulls_first(0) {
- }
-
- virtual ~SortingColumn() noexcept;
- int32_t column_idx;
- bool descending;
- bool nulls_first;
-
- void __set_column_idx(const int32_t val);
-
- void __set_descending(const bool val);
-
- void __set_nulls_first(const bool val);
-
- bool operator == (const SortingColumn & rhs) const
- {
- if (!(column_idx == rhs.column_idx))
- return false;
- if (!(descending == rhs.descending))
- return false;
- if (!(nulls_first == rhs.nulls_first))
- return false;
- return true;
- }
- bool operator != (const SortingColumn &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const SortingColumn & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(SortingColumn &a, SortingColumn &b);
-
-std::ostream& operator<<(std::ostream& out, const SortingColumn& obj);
-
-
-class PageEncodingStats : public virtual ::apache::thrift::TBase {
- public:
-
- PageEncodingStats(const PageEncodingStats&);
- PageEncodingStats& operator=(const PageEncodingStats&);
- PageEncodingStats() : page_type((PageType::type)0), encoding((Encoding::type)0), count(0) {
- }
-
- virtual ~PageEncodingStats() noexcept;
- PageType::type page_type;
- Encoding::type encoding;
- int32_t count;
-
- void __set_page_type(const PageType::type val);
-
- void __set_encoding(const Encoding::type val);
-
- void __set_count(const int32_t val);
-
- bool operator == (const PageEncodingStats & rhs) const
- {
- if (!(page_type == rhs.page_type))
- return false;
- if (!(encoding == rhs.encoding))
- return false;
- if (!(count == rhs.count))
- return false;
- return true;
- }
- bool operator != (const PageEncodingStats &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const PageEncodingStats & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(PageEncodingStats &a, PageEncodingStats &b);
-
-std::ostream& operator<<(std::ostream& out, const PageEncodingStats& obj);
-
-typedef struct _ColumnMetaData__isset {
- _ColumnMetaData__isset() : key_value_metadata(false), index_page_offset(false), dictionary_page_offset(false), statistics(false), encoding_stats(false), bloom_filter_offset(false) {}
- bool key_value_metadata :1;
- bool index_page_offset :1;
- bool dictionary_page_offset :1;
- bool statistics :1;
- bool encoding_stats :1;
- bool bloom_filter_offset :1;
-} _ColumnMetaData__isset;
-
-class ColumnMetaData : public virtual ::apache::thrift::TBase {
- public:
-
- ColumnMetaData(const ColumnMetaData&);
- ColumnMetaData& operator=(const ColumnMetaData&);
- ColumnMetaData() : type((Type::type)0), codec((CompressionCodec::type)0), num_values(0), total_uncompressed_size(0), total_compressed_size(0), data_page_offset(0), index_page_offset(0), dictionary_page_offset(0), bloom_filter_offset(0) {
- }
-
- virtual ~ColumnMetaData() noexcept;
- Type::type type;
- std::vector<Encoding::type> encodings;
- std::vector<std::string> path_in_schema;
- CompressionCodec::type codec;
- int64_t num_values;
- int64_t total_uncompressed_size;
- int64_t total_compressed_size;
- std::vector<KeyValue> key_value_metadata;
- int64_t data_page_offset;
- int64_t index_page_offset;
- int64_t dictionary_page_offset;
- Statistics statistics;
- std::vector<PageEncodingStats> encoding_stats;
- int64_t bloom_filter_offset;
-
- _ColumnMetaData__isset __isset;
-
- void __set_type(const Type::type val);
-
- void __set_encodings(const std::vector<Encoding::type> & val);
-
- void __set_path_in_schema(const std::vector<std::string> & val);
-
- void __set_codec(const CompressionCodec::type val);
-
- void __set_num_values(const int64_t val);
-
- void __set_total_uncompressed_size(const int64_t val);
-
- void __set_total_compressed_size(const int64_t val);
-
- void __set_key_value_metadata(const std::vector<KeyValue> & val);
-
- void __set_data_page_offset(const int64_t val);
-
- void __set_index_page_offset(const int64_t val);
-
- void __set_dictionary_page_offset(const int64_t val);
-
- void __set_statistics(const Statistics& val);
-
- void __set_encoding_stats(const std::vector<PageEncodingStats> & val);
-
- void __set_bloom_filter_offset(const int64_t val);
-
- bool operator == (const ColumnMetaData & rhs) const
- {
- if (!(type == rhs.type))
- return false;
- if (!(encodings == rhs.encodings))
- return false;
- if (!(path_in_schema == rhs.path_in_schema))
- return false;
- if (!(codec == rhs.codec))
- return false;
- if (!(num_values == rhs.num_values))
- return false;
- if (!(total_uncompressed_size == rhs.total_uncompressed_size))
- return false;
- if (!(total_compressed_size == rhs.total_compressed_size))
- return false;
- if (__isset.key_value_metadata != rhs.__isset.key_value_metadata)
- return false;
- else if (__isset.key_value_metadata && !(key_value_metadata == rhs.key_value_metadata))
- return false;
- if (!(data_page_offset == rhs.data_page_offset))
- return false;
- if (__isset.index_page_offset != rhs.__isset.index_page_offset)
- return false;
- else if (__isset.index_page_offset && !(index_page_offset == rhs.index_page_offset))
- return false;
- if (__isset.dictionary_page_offset != rhs.__isset.dictionary_page_offset)
- return false;
- else if (__isset.dictionary_page_offset && !(dictionary_page_offset == rhs.dictionary_page_offset))
- return false;
- if (__isset.statistics != rhs.__isset.statistics)
- return false;
- else if (__isset.statistics && !(statistics == rhs.statistics))
- return false;
- if (__isset.encoding_stats != rhs.__isset.encoding_stats)
- return false;
- else if (__isset.encoding_stats && !(encoding_stats == rhs.encoding_stats))
- return false;
- if (__isset.bloom_filter_offset != rhs.__isset.bloom_filter_offset)
- return false;
- else if (__isset.bloom_filter_offset && !(bloom_filter_offset == rhs.bloom_filter_offset))
- return false;
- return true;
- }
- bool operator != (const ColumnMetaData &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const ColumnMetaData & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(ColumnMetaData &a, ColumnMetaData &b);
-
-std::ostream& operator<<(std::ostream& out, const ColumnMetaData& obj);
-
-
-class EncryptionWithFooterKey : public virtual ::apache::thrift::TBase {
- public:
-
- EncryptionWithFooterKey(const EncryptionWithFooterKey&);
- EncryptionWithFooterKey& operator=(const EncryptionWithFooterKey&);
- EncryptionWithFooterKey() {
- }
-
- virtual ~EncryptionWithFooterKey() noexcept;
-
- bool operator == (const EncryptionWithFooterKey & /* rhs */) const
- {
- return true;
- }
- bool operator != (const EncryptionWithFooterKey &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const EncryptionWithFooterKey & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(EncryptionWithFooterKey &a, EncryptionWithFooterKey &b);
-
-std::ostream& operator<<(std::ostream& out, const EncryptionWithFooterKey& obj);
-
-typedef struct _EncryptionWithColumnKey__isset {
- _EncryptionWithColumnKey__isset() : key_metadata(false) {}
- bool key_metadata :1;
-} _EncryptionWithColumnKey__isset;
-
-class EncryptionWithColumnKey : public virtual ::apache::thrift::TBase {
- public:
-
- EncryptionWithColumnKey(const EncryptionWithColumnKey&);
- EncryptionWithColumnKey& operator=(const EncryptionWithColumnKey&);
- EncryptionWithColumnKey() : key_metadata() {
- }
-
- virtual ~EncryptionWithColumnKey() noexcept;
- std::vector<std::string> path_in_schema;
- std::string key_metadata;
-
- _EncryptionWithColumnKey__isset __isset;
-
- void __set_path_in_schema(const std::vector<std::string> & val);
-
- void __set_key_metadata(const std::string& val);
-
- bool operator == (const EncryptionWithColumnKey & rhs) const
- {
- if (!(path_in_schema == rhs.path_in_schema))
- return false;
- if (__isset.key_metadata != rhs.__isset.key_metadata)
- return false;
- else if (__isset.key_metadata && !(key_metadata == rhs.key_metadata))
- return false;
- return true;
- }
- bool operator != (const EncryptionWithColumnKey &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const EncryptionWithColumnKey & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(EncryptionWithColumnKey &a, EncryptionWithColumnKey &b);
-
-std::ostream& operator<<(std::ostream& out, const EncryptionWithColumnKey& obj);
-
-typedef struct _ColumnCryptoMetaData__isset {
- _ColumnCryptoMetaData__isset() : ENCRYPTION_WITH_FOOTER_KEY(false), ENCRYPTION_WITH_COLUMN_KEY(false) {}
- bool ENCRYPTION_WITH_FOOTER_KEY :1;
- bool ENCRYPTION_WITH_COLUMN_KEY :1;
-} _ColumnCryptoMetaData__isset;
-
-class ColumnCryptoMetaData : public virtual ::apache::thrift::TBase {
- public:
-
- ColumnCryptoMetaData(const ColumnCryptoMetaData&);
- ColumnCryptoMetaData& operator=(const ColumnCryptoMetaData&);
- ColumnCryptoMetaData() {
- }
-
- virtual ~ColumnCryptoMetaData() noexcept;
- EncryptionWithFooterKey ENCRYPTION_WITH_FOOTER_KEY;
- EncryptionWithColumnKey ENCRYPTION_WITH_COLUMN_KEY;
-
- _ColumnCryptoMetaData__isset __isset;
-
- void __set_ENCRYPTION_WITH_FOOTER_KEY(const EncryptionWithFooterKey& val);
-
- void __set_ENCRYPTION_WITH_COLUMN_KEY(const EncryptionWithColumnKey& val);
-
- bool operator == (const ColumnCryptoMetaData & rhs) const
- {
- if (__isset.ENCRYPTION_WITH_FOOTER_KEY != rhs.__isset.ENCRYPTION_WITH_FOOTER_KEY)
- return false;
- else if (__isset.ENCRYPTION_WITH_FOOTER_KEY && !(ENCRYPTION_WITH_FOOTER_KEY == rhs.ENCRYPTION_WITH_FOOTER_KEY))
- return false;
- if (__isset.ENCRYPTION_WITH_COLUMN_KEY != rhs.__isset.ENCRYPTION_WITH_COLUMN_KEY)
- return false;
- else if (__isset.ENCRYPTION_WITH_COLUMN_KEY && !(ENCRYPTION_WITH_COLUMN_KEY == rhs.ENCRYPTION_WITH_COLUMN_KEY))
- return false;
- return true;
- }
- bool operator != (const ColumnCryptoMetaData &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const ColumnCryptoMetaData & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(ColumnCryptoMetaData &a, ColumnCryptoMetaData &b);
-
-std::ostream& operator<<(std::ostream& out, const ColumnCryptoMetaData& obj);
-
-typedef struct _ColumnChunk__isset {
- _ColumnChunk__isset() : file_path(false), meta_data(false), offset_index_offset(false), offset_index_length(false), column_index_offset(false), column_index_length(false), crypto_metadata(false), encrypted_column_metadata(false) {}
- bool file_path :1;
- bool meta_data :1;
- bool offset_index_offset :1;
- bool offset_index_length :1;
- bool column_index_offset :1;
- bool column_index_length :1;
- bool crypto_metadata :1;
- bool encrypted_column_metadata :1;
-} _ColumnChunk__isset;
-
-class ColumnChunk : public virtual ::apache::thrift::TBase {
- public:
-
- ColumnChunk(const ColumnChunk&);
- ColumnChunk& operator=(const ColumnChunk&);
- ColumnChunk() : file_path(), file_offset(0), offset_index_offset(0), offset_index_length(0), column_index_offset(0), column_index_length(0), encrypted_column_metadata() {
- }
-
- virtual ~ColumnChunk() noexcept;
- std::string file_path;
- int64_t file_offset;
- ColumnMetaData meta_data;
- int64_t offset_index_offset;
- int32_t offset_index_length;
- int64_t column_index_offset;
- int32_t column_index_length;
- ColumnCryptoMetaData crypto_metadata;
- std::string encrypted_column_metadata;
-
- _ColumnChunk__isset __isset;
-
- void __set_file_path(const std::string& val);
-
- void __set_file_offset(const int64_t val);
-
- void __set_meta_data(const ColumnMetaData& val);
-
- void __set_offset_index_offset(const int64_t val);
-
- void __set_offset_index_length(const int32_t val);
-
- void __set_column_index_offset(const int64_t val);
-
- void __set_column_index_length(const int32_t val);
-
- void __set_crypto_metadata(const ColumnCryptoMetaData& val);
-
- void __set_encrypted_column_metadata(const std::string& val);
-
- bool operator == (const ColumnChunk & rhs) const
- {
- if (__isset.file_path != rhs.__isset.file_path)
- return false;
- else if (__isset.file_path && !(file_path == rhs.file_path))
- return false;
- if (!(file_offset == rhs.file_offset))
- return false;
- if (__isset.meta_data != rhs.__isset.meta_data)
- return false;
- else if (__isset.meta_data && !(meta_data == rhs.meta_data))
- return false;
- if (__isset.offset_index_offset != rhs.__isset.offset_index_offset)
- return false;
- else if (__isset.offset_index_offset && !(offset_index_offset == rhs.offset_index_offset))
- return false;
- if (__isset.offset_index_length != rhs.__isset.offset_index_length)
- return false;
- else if (__isset.offset_index_length && !(offset_index_length == rhs.offset_index_length))
- return false;
- if (__isset.column_index_offset != rhs.__isset.column_index_offset)
- return false;
- else if (__isset.column_index_offset && !(column_index_offset == rhs.column_index_offset))
- return false;
- if (__isset.column_index_length != rhs.__isset.column_index_length)
- return false;
- else if (__isset.column_index_length && !(column_index_length == rhs.column_index_length))
- return false;
- if (__isset.crypto_metadata != rhs.__isset.crypto_metadata)
- return false;
- else if (__isset.crypto_metadata && !(crypto_metadata == rhs.crypto_metadata))
- return false;
- if (__isset.encrypted_column_metadata != rhs.__isset.encrypted_column_metadata)
- return false;
- else if (__isset.encrypted_column_metadata && !(encrypted_column_metadata == rhs.encrypted_column_metadata))
- return false;
- return true;
- }
- bool operator != (const ColumnChunk &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const ColumnChunk & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(ColumnChunk &a, ColumnChunk &b);
-
-std::ostream& operator<<(std::ostream& out, const ColumnChunk& obj);
-
-typedef struct _RowGroup__isset {
- _RowGroup__isset() : sorting_columns(false), file_offset(false), total_compressed_size(false), ordinal(false) {}
- bool sorting_columns :1;
- bool file_offset :1;
- bool total_compressed_size :1;
- bool ordinal :1;
-} _RowGroup__isset;
-
-class RowGroup : public virtual ::apache::thrift::TBase {
- public:
-
- RowGroup(const RowGroup&);
- RowGroup& operator=(const RowGroup&);
- RowGroup() : total_byte_size(0), num_rows(0), file_offset(0), total_compressed_size(0), ordinal(0) {
- }
-
- virtual ~RowGroup() noexcept;
- std::vector<ColumnChunk> columns;
- int64_t total_byte_size;
- int64_t num_rows;
- std::vector<SortingColumn> sorting_columns;
- int64_t file_offset;
- int64_t total_compressed_size;
- int16_t ordinal;
-
- _RowGroup__isset __isset;
-
- void __set_columns(const std::vector<ColumnChunk> & val);
-
- void __set_total_byte_size(const int64_t val);
-
- void __set_num_rows(const int64_t val);
-
- void __set_sorting_columns(const std::vector<SortingColumn> & val);
-
- void __set_file_offset(const int64_t val);
-
- void __set_total_compressed_size(const int64_t val);
-
- void __set_ordinal(const int16_t val);
-
- bool operator == (const RowGroup & rhs) const
- {
- if (!(columns == rhs.columns))
- return false;
- if (!(total_byte_size == rhs.total_byte_size))
- return false;
- if (!(num_rows == rhs.num_rows))
- return false;
- if (__isset.sorting_columns != rhs.__isset.sorting_columns)
- return false;
- else if (__isset.sorting_columns && !(sorting_columns == rhs.sorting_columns))
- return false;
- if (__isset.file_offset != rhs.__isset.file_offset)
- return false;
- else if (__isset.file_offset && !(file_offset == rhs.file_offset))
- return false;
- if (__isset.total_compressed_size != rhs.__isset.total_compressed_size)
- return false;
- else if (__isset.total_compressed_size && !(total_compressed_size == rhs.total_compressed_size))
- return false;
- if (__isset.ordinal != rhs.__isset.ordinal)
- return false;
- else if (__isset.ordinal && !(ordinal == rhs.ordinal))
- return false;
- return true;
- }
- bool operator != (const RowGroup &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const RowGroup & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(RowGroup &a, RowGroup &b);
-
-std::ostream& operator<<(std::ostream& out, const RowGroup& obj);
-
-
-class TypeDefinedOrder : public virtual ::apache::thrift::TBase {
- public:
-
- TypeDefinedOrder(const TypeDefinedOrder&);
- TypeDefinedOrder& operator=(const TypeDefinedOrder&);
- TypeDefinedOrder() {
- }
-
- virtual ~TypeDefinedOrder() noexcept;
-
- bool operator == (const TypeDefinedOrder & /* rhs */) const
- {
- return true;
- }
- bool operator != (const TypeDefinedOrder &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const TypeDefinedOrder & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(TypeDefinedOrder &a, TypeDefinedOrder &b);
-
-std::ostream& operator<<(std::ostream& out, const TypeDefinedOrder& obj);
-
-typedef struct _ColumnOrder__isset {
- _ColumnOrder__isset() : TYPE_ORDER(false) {}
- bool TYPE_ORDER :1;
-} _ColumnOrder__isset;
-
-class ColumnOrder : public virtual ::apache::thrift::TBase {
- public:
-
- ColumnOrder(const ColumnOrder&);
- ColumnOrder& operator=(const ColumnOrder&);
- ColumnOrder() {
- }
-
- virtual ~ColumnOrder() noexcept;
- TypeDefinedOrder TYPE_ORDER;
-
- _ColumnOrder__isset __isset;
-
- void __set_TYPE_ORDER(const TypeDefinedOrder& val);
-
- bool operator == (const ColumnOrder & rhs) const
- {
- if (__isset.TYPE_ORDER != rhs.__isset.TYPE_ORDER)
- return false;
- else if (__isset.TYPE_ORDER && !(TYPE_ORDER == rhs.TYPE_ORDER))
- return false;
- return true;
- }
- bool operator != (const ColumnOrder &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const ColumnOrder & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(ColumnOrder &a, ColumnOrder &b);
-
-std::ostream& operator<<(std::ostream& out, const ColumnOrder& obj);
-
-
-class PageLocation : public virtual ::apache::thrift::TBase {
- public:
-
- PageLocation(const PageLocation&);
- PageLocation& operator=(const PageLocation&);
- PageLocation() : offset(0), compressed_page_size(0), first_row_index(0) {
- }
-
- virtual ~PageLocation() noexcept;
- int64_t offset;
- int32_t compressed_page_size;
- int64_t first_row_index;
-
- void __set_offset(const int64_t val);
-
- void __set_compressed_page_size(const int32_t val);
-
- void __set_first_row_index(const int64_t val);
-
- bool operator == (const PageLocation & rhs) const
- {
- if (!(offset == rhs.offset))
- return false;
- if (!(compressed_page_size == rhs.compressed_page_size))
- return false;
- if (!(first_row_index == rhs.first_row_index))
- return false;
- return true;
- }
- bool operator != (const PageLocation &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const PageLocation & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(PageLocation &a, PageLocation &b);
-
-std::ostream& operator<<(std::ostream& out, const PageLocation& obj);
-
-
-class OffsetIndex : public virtual ::apache::thrift::TBase {
- public:
-
- OffsetIndex(const OffsetIndex&);
- OffsetIndex& operator=(const OffsetIndex&);
- OffsetIndex() {
- }
-
- virtual ~OffsetIndex() noexcept;
- std::vector<PageLocation> page_locations;
-
- void __set_page_locations(const std::vector<PageLocation> & val);
-
- bool operator == (const OffsetIndex & rhs) const
- {
- if (!(page_locations == rhs.page_locations))
- return false;
- return true;
- }
- bool operator != (const OffsetIndex &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const OffsetIndex & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(OffsetIndex &a, OffsetIndex &b);
-
-std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj);
-
-typedef struct _ColumnIndex__isset {
- _ColumnIndex__isset() : null_counts(false) {}
- bool null_counts :1;
-} _ColumnIndex__isset;
-
-class ColumnIndex : public virtual ::apache::thrift::TBase {
- public:
-
- ColumnIndex(const ColumnIndex&);
- ColumnIndex& operator=(const ColumnIndex&);
- ColumnIndex() : boundary_order((BoundaryOrder::type)0) {
- }
-
- virtual ~ColumnIndex() noexcept;
- std::vector<bool> null_pages;
- std::vector<std::string> min_values;
- std::vector<std::string> max_values;
- BoundaryOrder::type boundary_order;
- std::vector<int64_t> null_counts;
-
- _ColumnIndex__isset __isset;
-
- void __set_null_pages(const std::vector<bool> & val);
-
- void __set_min_values(const std::vector<std::string> & val);
-
- void __set_max_values(const std::vector<std::string> & val);
-
- void __set_boundary_order(const BoundaryOrder::type val);
-
- void __set_null_counts(const std::vector<int64_t> & val);
-
- bool operator == (const ColumnIndex & rhs) const
- {
- if (!(null_pages == rhs.null_pages))
- return false;
- if (!(min_values == rhs.min_values))
- return false;
- if (!(max_values == rhs.max_values))
- return false;
- if (!(boundary_order == rhs.boundary_order))
- return false;
- if (__isset.null_counts != rhs.__isset.null_counts)
- return false;
- else if (__isset.null_counts && !(null_counts == rhs.null_counts))
- return false;
- return true;
- }
- bool operator != (const ColumnIndex &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const ColumnIndex & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(ColumnIndex &a, ColumnIndex &b);
-
-std::ostream& operator<<(std::ostream& out, const ColumnIndex& obj);
-
-typedef struct _AesGcmV1__isset {
- _AesGcmV1__isset() : aad_prefix(false), aad_file_unique(false), supply_aad_prefix(false) {}
- bool aad_prefix :1;
- bool aad_file_unique :1;
- bool supply_aad_prefix :1;
-} _AesGcmV1__isset;
-
-class AesGcmV1 : public virtual ::apache::thrift::TBase {
- public:
-
- AesGcmV1(const AesGcmV1&);
- AesGcmV1& operator=(const AesGcmV1&);
- AesGcmV1() : aad_prefix(), aad_file_unique(), supply_aad_prefix(0) {
- }
-
- virtual ~AesGcmV1() noexcept;
- std::string aad_prefix;
- std::string aad_file_unique;
- bool supply_aad_prefix;
-
- _AesGcmV1__isset __isset;
-
- void __set_aad_prefix(const std::string& val);
-
- void __set_aad_file_unique(const std::string& val);
-
- void __set_supply_aad_prefix(const bool val);
-
- bool operator == (const AesGcmV1 & rhs) const
- {
- if (__isset.aad_prefix != rhs.__isset.aad_prefix)
- return false;
- else if (__isset.aad_prefix && !(aad_prefix == rhs.aad_prefix))
- return false;
- if (__isset.aad_file_unique != rhs.__isset.aad_file_unique)
- return false;
- else if (__isset.aad_file_unique && !(aad_file_unique == rhs.aad_file_unique))
- return false;
- if (__isset.supply_aad_prefix != rhs.__isset.supply_aad_prefix)
- return false;
- else if (__isset.supply_aad_prefix && !(supply_aad_prefix == rhs.supply_aad_prefix))
- return false;
- return true;
- }
- bool operator != (const AesGcmV1 &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const AesGcmV1 & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(AesGcmV1 &a, AesGcmV1 &b);
-
-std::ostream& operator<<(std::ostream& out, const AesGcmV1& obj);
-
-typedef struct _AesGcmCtrV1__isset {
- _AesGcmCtrV1__isset() : aad_prefix(false), aad_file_unique(false), supply_aad_prefix(false) {}
- bool aad_prefix :1;
- bool aad_file_unique :1;
- bool supply_aad_prefix :1;
-} _AesGcmCtrV1__isset;
-
-class AesGcmCtrV1 : public virtual ::apache::thrift::TBase {
- public:
-
- AesGcmCtrV1(const AesGcmCtrV1&);
- AesGcmCtrV1& operator=(const AesGcmCtrV1&);
- AesGcmCtrV1() : aad_prefix(), aad_file_unique(), supply_aad_prefix(0) {
- }
-
- virtual ~AesGcmCtrV1() noexcept;
- std::string aad_prefix;
- std::string aad_file_unique;
- bool supply_aad_prefix;
-
- _AesGcmCtrV1__isset __isset;
-
- void __set_aad_prefix(const std::string& val);
-
- void __set_aad_file_unique(const std::string& val);
-
- void __set_supply_aad_prefix(const bool val);
-
- bool operator == (const AesGcmCtrV1 & rhs) const
- {
- if (__isset.aad_prefix != rhs.__isset.aad_prefix)
- return false;
- else if (__isset.aad_prefix && !(aad_prefix == rhs.aad_prefix))
- return false;
- if (__isset.aad_file_unique != rhs.__isset.aad_file_unique)
- return false;
- else if (__isset.aad_file_unique && !(aad_file_unique == rhs.aad_file_unique))
- return false;
- if (__isset.supply_aad_prefix != rhs.__isset.supply_aad_prefix)
- return false;
- else if (__isset.supply_aad_prefix && !(supply_aad_prefix == rhs.supply_aad_prefix))
- return false;
- return true;
- }
- bool operator != (const AesGcmCtrV1 &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const AesGcmCtrV1 & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(AesGcmCtrV1 &a, AesGcmCtrV1 &b);
-
-std::ostream& operator<<(std::ostream& out, const AesGcmCtrV1& obj);
-
-typedef struct _EncryptionAlgorithm__isset {
- _EncryptionAlgorithm__isset() : AES_GCM_V1(false), AES_GCM_CTR_V1(false) {}
- bool AES_GCM_V1 :1;
- bool AES_GCM_CTR_V1 :1;
-} _EncryptionAlgorithm__isset;
-
-class EncryptionAlgorithm : public virtual ::apache::thrift::TBase {
- public:
-
- EncryptionAlgorithm(const EncryptionAlgorithm&);
- EncryptionAlgorithm& operator=(const EncryptionAlgorithm&);
- EncryptionAlgorithm() {
- }
-
- virtual ~EncryptionAlgorithm() noexcept;
- AesGcmV1 AES_GCM_V1;
- AesGcmCtrV1 AES_GCM_CTR_V1;
-
- _EncryptionAlgorithm__isset __isset;
-
- void __set_AES_GCM_V1(const AesGcmV1& val);
-
- void __set_AES_GCM_CTR_V1(const AesGcmCtrV1& val);
-
- bool operator == (const EncryptionAlgorithm & rhs) const
- {
- if (__isset.AES_GCM_V1 != rhs.__isset.AES_GCM_V1)
- return false;
- else if (__isset.AES_GCM_V1 && !(AES_GCM_V1 == rhs.AES_GCM_V1))
- return false;
- if (__isset.AES_GCM_CTR_V1 != rhs.__isset.AES_GCM_CTR_V1)
- return false;
- else if (__isset.AES_GCM_CTR_V1 && !(AES_GCM_CTR_V1 == rhs.AES_GCM_CTR_V1))
- return false;
- return true;
- }
- bool operator != (const EncryptionAlgorithm &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const EncryptionAlgorithm & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(EncryptionAlgorithm &a, EncryptionAlgorithm &b);
-
-std::ostream& operator<<(std::ostream& out, const EncryptionAlgorithm& obj);
-
-typedef struct _FileMetaData__isset {
- _FileMetaData__isset() : key_value_metadata(false), created_by(false), column_orders(false), encryption_algorithm(false), footer_signing_key_metadata(false) {}
- bool key_value_metadata :1;
- bool created_by :1;
- bool column_orders :1;
- bool encryption_algorithm :1;
- bool footer_signing_key_metadata :1;
-} _FileMetaData__isset;
-
-class FileMetaData : public virtual ::apache::thrift::TBase {
- public:
-
- FileMetaData(const FileMetaData&);
- FileMetaData& operator=(const FileMetaData&);
- FileMetaData() : version(0), num_rows(0), created_by(), footer_signing_key_metadata() {
- }
-
- virtual ~FileMetaData() noexcept;
- int32_t version;
- std::vector<SchemaElement> schema;
- int64_t num_rows;
- std::vector<RowGroup> row_groups;
- std::vector<KeyValue> key_value_metadata;
- std::string created_by;
- std::vector<ColumnOrder> column_orders;
- EncryptionAlgorithm encryption_algorithm;
- std::string footer_signing_key_metadata;
-
- _FileMetaData__isset __isset;
-
- void __set_version(const int32_t val);
-
- void __set_schema(const std::vector<SchemaElement> & val);
-
- void __set_num_rows(const int64_t val);
-
- void __set_row_groups(const std::vector<RowGroup> & val);
-
- void __set_key_value_metadata(const std::vector<KeyValue> & val);
-
- void __set_created_by(const std::string& val);
-
- void __set_column_orders(const std::vector<ColumnOrder> & val);
-
- void __set_encryption_algorithm(const EncryptionAlgorithm& val);
-
- void __set_footer_signing_key_metadata(const std::string& val);
-
- bool operator == (const FileMetaData & rhs) const
- {
- if (!(version == rhs.version))
- return false;
- if (!(schema == rhs.schema))
- return false;
- if (!(num_rows == rhs.num_rows))
- return false;
- if (!(row_groups == rhs.row_groups))
- return false;
- if (__isset.key_value_metadata != rhs.__isset.key_value_metadata)
- return false;
- else if (__isset.key_value_metadata && !(key_value_metadata == rhs.key_value_metadata))
- return false;
- if (__isset.created_by != rhs.__isset.created_by)
- return false;
- else if (__isset.created_by && !(created_by == rhs.created_by))
- return false;
- if (__isset.column_orders != rhs.__isset.column_orders)
- return false;
- else if (__isset.column_orders && !(column_orders == rhs.column_orders))
- return false;
- if (__isset.encryption_algorithm != rhs.__isset.encryption_algorithm)
- return false;
- else if (__isset.encryption_algorithm && !(encryption_algorithm == rhs.encryption_algorithm))
- return false;
- if (__isset.footer_signing_key_metadata != rhs.__isset.footer_signing_key_metadata)
- return false;
- else if (__isset.footer_signing_key_metadata && !(footer_signing_key_metadata == rhs.footer_signing_key_metadata))
- return false;
- return true;
- }
- bool operator != (const FileMetaData &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const FileMetaData & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(FileMetaData &a, FileMetaData &b);
-
-std::ostream& operator<<(std::ostream& out, const FileMetaData& obj);
-
-typedef struct _FileCryptoMetaData__isset {
- _FileCryptoMetaData__isset() : key_metadata(false) {}
- bool key_metadata :1;
-} _FileCryptoMetaData__isset;
-
-class FileCryptoMetaData : public virtual ::apache::thrift::TBase {
- public:
-
- FileCryptoMetaData(const FileCryptoMetaData&);
- FileCryptoMetaData& operator=(const FileCryptoMetaData&);
- FileCryptoMetaData() : key_metadata() {
- }
-
- virtual ~FileCryptoMetaData() noexcept;
- EncryptionAlgorithm encryption_algorithm;
- std::string key_metadata;
-
- _FileCryptoMetaData__isset __isset;
-
- void __set_encryption_algorithm(const EncryptionAlgorithm& val);
-
- void __set_key_metadata(const std::string& val);
-
- bool operator == (const FileCryptoMetaData & rhs) const
- {
- if (!(encryption_algorithm == rhs.encryption_algorithm))
- return false;
- if (__isset.key_metadata != rhs.__isset.key_metadata)
- return false;
- else if (__isset.key_metadata && !(key_metadata == rhs.key_metadata))
- return false;
- return true;
- }
- bool operator != (const FileCryptoMetaData &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator < (const FileCryptoMetaData & ) const;
-
- uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
- uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
-
- virtual void printTo(std::ostream& out) const;
-};
-
-void swap(FileCryptoMetaData &a, FileCryptoMetaData &b);
-
-std::ostream& operator<<(std::ostream& out, const FileCryptoMetaData& obj);
-
-}} // namespace
-
-#endif
+/**
+ * Autogenerated by Thrift Compiler (0.13.0)
+ *
+ * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
+ * @generated
+ */
+#ifndef parquet_TYPES_H
+#define parquet_TYPES_H
+
+#include <iosfwd>
+
+#include <thrift/Thrift.h>
+#include <thrift/TApplicationException.h>
+#include <thrift/TBase.h>
+#include <thrift/protocol/TProtocol.h>
+#include <thrift/transport/TTransport.h>
+
+#include <functional>
+#include <memory>
+
+#include "parquet/windows_compatibility.h"
+
+namespace parquet { namespace format {
+
+struct Type {
+ enum type {
+ BOOLEAN = 0,
+ INT32 = 1,
+ INT64 = 2,
+ INT96 = 3,
+ FLOAT = 4,
+ DOUBLE = 5,
+ BYTE_ARRAY = 6,
+ FIXED_LEN_BYTE_ARRAY = 7
+ };
+};
+
+extern const std::map<int, const char*> _Type_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const Type::type& val);
+
+std::string to_string(const Type::type& val);
+
+struct ConvertedType {
+ enum type {
+ UTF8 = 0,
+ MAP = 1,
+ MAP_KEY_VALUE = 2,
+ LIST = 3,
+ ENUM = 4,
+ DECIMAL = 5,
+ DATE = 6,
+ TIME_MILLIS = 7,
+ TIME_MICROS = 8,
+ TIMESTAMP_MILLIS = 9,
+ TIMESTAMP_MICROS = 10,
+ UINT_8 = 11,
+ UINT_16 = 12,
+ UINT_32 = 13,
+ UINT_64 = 14,
+ INT_8 = 15,
+ INT_16 = 16,
+ INT_32 = 17,
+ INT_64 = 18,
+ JSON = 19,
+ BSON = 20,
+ INTERVAL = 21
+ };
+};
+
+extern const std::map<int, const char*> _ConvertedType_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const ConvertedType::type& val);
+
+std::string to_string(const ConvertedType::type& val);
+
+struct FieldRepetitionType {
+ enum type {
+ REQUIRED = 0,
+ OPTIONAL = 1,
+ REPEATED = 2
+ };
+};
+
+extern const std::map<int, const char*> _FieldRepetitionType_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const FieldRepetitionType::type& val);
+
+std::string to_string(const FieldRepetitionType::type& val);
+
+struct Encoding {
+ enum type {
+ PLAIN = 0,
+ PLAIN_DICTIONARY = 2,
+ RLE = 3,
+ BIT_PACKED = 4,
+ DELTA_BINARY_PACKED = 5,
+ DELTA_LENGTH_BYTE_ARRAY = 6,
+ DELTA_BYTE_ARRAY = 7,
+ RLE_DICTIONARY = 8,
+ BYTE_STREAM_SPLIT = 9
+ };
+};
+
+extern const std::map<int, const char*> _Encoding_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const Encoding::type& val);
+
+std::string to_string(const Encoding::type& val);
+
+struct CompressionCodec {
+ enum type {
+ UNCOMPRESSED = 0,
+ SNAPPY = 1,
+ GZIP = 2,
+ LZO = 3,
+ BROTLI = 4,
+ LZ4 = 5,
+ ZSTD = 6,
+ LZ4_RAW = 7
+ };
+};
+
+extern const std::map<int, const char*> _CompressionCodec_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const CompressionCodec::type& val);
+
+std::string to_string(const CompressionCodec::type& val);
+
+struct PageType {
+ enum type {
+ DATA_PAGE = 0,
+ INDEX_PAGE = 1,
+ DICTIONARY_PAGE = 2,
+ DATA_PAGE_V2 = 3
+ };
+};
+
+extern const std::map<int, const char*> _PageType_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const PageType::type& val);
+
+std::string to_string(const PageType::type& val);
+
+struct BoundaryOrder {
+ enum type {
+ UNORDERED = 0,
+ ASCENDING = 1,
+ DESCENDING = 2
+ };
+};
+
+extern const std::map<int, const char*> _BoundaryOrder_VALUES_TO_NAMES;
+
+std::ostream& operator<<(std::ostream& out, const BoundaryOrder::type& val);
+
+std::string to_string(const BoundaryOrder::type& val);
+
+class Statistics;
+
+class StringType;
+
+class UUIDType;
+
+class MapType;
+
+class ListType;
+
+class EnumType;
+
+class DateType;
+
+class NullType;
+
+class DecimalType;
+
+class MilliSeconds;
+
+class MicroSeconds;
+
+class NanoSeconds;
+
+class TimeUnit;
+
+class TimestampType;
+
+class TimeType;
+
+class IntType;
+
+class JsonType;
+
+class BsonType;
+
+class LogicalType;
+
+class SchemaElement;
+
+class DataPageHeader;
+
+class IndexPageHeader;
+
+class DictionaryPageHeader;
+
+class DataPageHeaderV2;
+
+class SplitBlockAlgorithm;
+
+class BloomFilterAlgorithm;
+
+class XxHash;
+
+class BloomFilterHash;
+
+class Uncompressed;
+
+class BloomFilterCompression;
+
+class BloomFilterHeader;
+
+class PageHeader;
+
+class KeyValue;
+
+class SortingColumn;
+
+class PageEncodingStats;
+
+class ColumnMetaData;
+
+class EncryptionWithFooterKey;
+
+class EncryptionWithColumnKey;
+
+class ColumnCryptoMetaData;
+
+class ColumnChunk;
+
+class RowGroup;
+
+class TypeDefinedOrder;
+
+class ColumnOrder;
+
+class PageLocation;
+
+class OffsetIndex;
+
+class ColumnIndex;
+
+class AesGcmV1;
+
+class AesGcmCtrV1;
+
+class EncryptionAlgorithm;
+
+class FileMetaData;
+
+class FileCryptoMetaData;
+
+typedef struct _Statistics__isset {
+ _Statistics__isset() : max(false), min(false), null_count(false), distinct_count(false), max_value(false), min_value(false) {}
+ bool max :1;
+ bool min :1;
+ bool null_count :1;
+ bool distinct_count :1;
+ bool max_value :1;
+ bool min_value :1;
+} _Statistics__isset;
+
+class Statistics : public virtual ::apache::thrift::TBase {
+ public:
+
+ Statistics(const Statistics&);
+ Statistics& operator=(const Statistics&);
+ Statistics() : max(), min(), null_count(0), distinct_count(0), max_value(), min_value() {
+ }
+
+ virtual ~Statistics() noexcept;
+ std::string max;
+ std::string min;
+ int64_t null_count;
+ int64_t distinct_count;
+ std::string max_value;
+ std::string min_value;
+
+ _Statistics__isset __isset;
+
+ void __set_max(const std::string& val);
+
+ void __set_min(const std::string& val);
+
+ void __set_null_count(const int64_t val);
+
+ void __set_distinct_count(const int64_t val);
+
+ void __set_max_value(const std::string& val);
+
+ void __set_min_value(const std::string& val);
+
+ bool operator == (const Statistics & rhs) const
+ {
+ if (__isset.max != rhs.__isset.max)
+ return false;
+ else if (__isset.max && !(max == rhs.max))
+ return false;
+ if (__isset.min != rhs.__isset.min)
+ return false;
+ else if (__isset.min && !(min == rhs.min))
+ return false;
+ if (__isset.null_count != rhs.__isset.null_count)
+ return false;
+ else if (__isset.null_count && !(null_count == rhs.null_count))
+ return false;
+ if (__isset.distinct_count != rhs.__isset.distinct_count)
+ return false;
+ else if (__isset.distinct_count && !(distinct_count == rhs.distinct_count))
+ return false;
+ if (__isset.max_value != rhs.__isset.max_value)
+ return false;
+ else if (__isset.max_value && !(max_value == rhs.max_value))
+ return false;
+ if (__isset.min_value != rhs.__isset.min_value)
+ return false;
+ else if (__isset.min_value && !(min_value == rhs.min_value))
+ return false;
+ return true;
+ }
+ bool operator != (const Statistics &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const Statistics & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(Statistics &a, Statistics &b);
+
+std::ostream& operator<<(std::ostream& out, const Statistics& obj);
+
+
+class StringType : public virtual ::apache::thrift::TBase {
+ public:
+
+ StringType(const StringType&);
+ StringType& operator=(const StringType&);
+ StringType() {
+ }
+
+ virtual ~StringType() noexcept;
+
+ bool operator == (const StringType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const StringType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const StringType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(StringType &a, StringType &b);
+
+std::ostream& operator<<(std::ostream& out, const StringType& obj);
+
+
+class UUIDType : public virtual ::apache::thrift::TBase {
+ public:
+
+ UUIDType(const UUIDType&);
+ UUIDType& operator=(const UUIDType&);
+ UUIDType() {
+ }
+
+ virtual ~UUIDType() noexcept;
+
+ bool operator == (const UUIDType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const UUIDType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const UUIDType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(UUIDType &a, UUIDType &b);
+
+std::ostream& operator<<(std::ostream& out, const UUIDType& obj);
+
+
+class MapType : public virtual ::apache::thrift::TBase {
+ public:
+
+ MapType(const MapType&);
+ MapType& operator=(const MapType&);
+ MapType() {
+ }
+
+ virtual ~MapType() noexcept;
+
+ bool operator == (const MapType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const MapType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const MapType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(MapType &a, MapType &b);
+
+std::ostream& operator<<(std::ostream& out, const MapType& obj);
+
+
+class ListType : public virtual ::apache::thrift::TBase {
+ public:
+
+ ListType(const ListType&);
+ ListType& operator=(const ListType&);
+ ListType() {
+ }
+
+ virtual ~ListType() noexcept;
+
+ bool operator == (const ListType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const ListType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ListType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ListType &a, ListType &b);
+
+std::ostream& operator<<(std::ostream& out, const ListType& obj);
+
+
+class EnumType : public virtual ::apache::thrift::TBase {
+ public:
+
+ EnumType(const EnumType&);
+ EnumType& operator=(const EnumType&);
+ EnumType() {
+ }
+
+ virtual ~EnumType() noexcept;
+
+ bool operator == (const EnumType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const EnumType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const EnumType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(EnumType &a, EnumType &b);
+
+std::ostream& operator<<(std::ostream& out, const EnumType& obj);
+
+
+class DateType : public virtual ::apache::thrift::TBase {
+ public:
+
+ DateType(const DateType&);
+ DateType& operator=(const DateType&);
+ DateType() {
+ }
+
+ virtual ~DateType() noexcept;
+
+ bool operator == (const DateType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const DateType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DateType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DateType &a, DateType &b);
+
+std::ostream& operator<<(std::ostream& out, const DateType& obj);
+
+
+class NullType : public virtual ::apache::thrift::TBase {
+ public:
+
+ NullType(const NullType&);
+ NullType& operator=(const NullType&);
+ NullType() {
+ }
+
+ virtual ~NullType() noexcept;
+
+ bool operator == (const NullType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const NullType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const NullType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(NullType &a, NullType &b);
+
+std::ostream& operator<<(std::ostream& out, const NullType& obj);
+
+
+class DecimalType : public virtual ::apache::thrift::TBase {
+ public:
+
+ DecimalType(const DecimalType&);
+ DecimalType& operator=(const DecimalType&);
+ DecimalType() : scale(0), precision(0) {
+ }
+
+ virtual ~DecimalType() noexcept;
+ int32_t scale;
+ int32_t precision;
+
+ void __set_scale(const int32_t val);
+
+ void __set_precision(const int32_t val);
+
+ bool operator == (const DecimalType & rhs) const
+ {
+ if (!(scale == rhs.scale))
+ return false;
+ if (!(precision == rhs.precision))
+ return false;
+ return true;
+ }
+ bool operator != (const DecimalType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DecimalType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DecimalType &a, DecimalType &b);
+
+std::ostream& operator<<(std::ostream& out, const DecimalType& obj);
+
+
+class MilliSeconds : public virtual ::apache::thrift::TBase {
+ public:
+
+ MilliSeconds(const MilliSeconds&);
+ MilliSeconds& operator=(const MilliSeconds&);
+ MilliSeconds() {
+ }
+
+ virtual ~MilliSeconds() noexcept;
+
+ bool operator == (const MilliSeconds & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const MilliSeconds &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const MilliSeconds & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(MilliSeconds &a, MilliSeconds &b);
+
+std::ostream& operator<<(std::ostream& out, const MilliSeconds& obj);
+
+
+class MicroSeconds : public virtual ::apache::thrift::TBase {
+ public:
+
+ MicroSeconds(const MicroSeconds&);
+ MicroSeconds& operator=(const MicroSeconds&);
+ MicroSeconds() {
+ }
+
+ virtual ~MicroSeconds() noexcept;
+
+ bool operator == (const MicroSeconds & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const MicroSeconds &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const MicroSeconds & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(MicroSeconds &a, MicroSeconds &b);
+
+std::ostream& operator<<(std::ostream& out, const MicroSeconds& obj);
+
+
+class NanoSeconds : public virtual ::apache::thrift::TBase {
+ public:
+
+ NanoSeconds(const NanoSeconds&);
+ NanoSeconds& operator=(const NanoSeconds&);
+ NanoSeconds() {
+ }
+
+ virtual ~NanoSeconds() noexcept;
+
+ bool operator == (const NanoSeconds & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const NanoSeconds &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const NanoSeconds & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(NanoSeconds &a, NanoSeconds &b);
+
+std::ostream& operator<<(std::ostream& out, const NanoSeconds& obj);
+
+typedef struct _TimeUnit__isset {
+ _TimeUnit__isset() : MILLIS(false), MICROS(false), NANOS(false) {}
+ bool MILLIS :1;
+ bool MICROS :1;
+ bool NANOS :1;
+} _TimeUnit__isset;
+
+class TimeUnit : public virtual ::apache::thrift::TBase {
+ public:
+
+ TimeUnit(const TimeUnit&);
+ TimeUnit& operator=(const TimeUnit&);
+ TimeUnit() {
+ }
+
+ virtual ~TimeUnit() noexcept;
+ MilliSeconds MILLIS;
+ MicroSeconds MICROS;
+ NanoSeconds NANOS;
+
+ _TimeUnit__isset __isset;
+
+ void __set_MILLIS(const MilliSeconds& val);
+
+ void __set_MICROS(const MicroSeconds& val);
+
+ void __set_NANOS(const NanoSeconds& val);
+
+ bool operator == (const TimeUnit & rhs) const
+ {
+ if (__isset.MILLIS != rhs.__isset.MILLIS)
+ return false;
+ else if (__isset.MILLIS && !(MILLIS == rhs.MILLIS))
+ return false;
+ if (__isset.MICROS != rhs.__isset.MICROS)
+ return false;
+ else if (__isset.MICROS && !(MICROS == rhs.MICROS))
+ return false;
+ if (__isset.NANOS != rhs.__isset.NANOS)
+ return false;
+ else if (__isset.NANOS && !(NANOS == rhs.NANOS))
+ return false;
+ return true;
+ }
+ bool operator != (const TimeUnit &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const TimeUnit & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(TimeUnit &a, TimeUnit &b);
+
+std::ostream& operator<<(std::ostream& out, const TimeUnit& obj);
+
+
+class TimestampType : public virtual ::apache::thrift::TBase {
+ public:
+
+ TimestampType(const TimestampType&);
+ TimestampType& operator=(const TimestampType&);
+ TimestampType() : isAdjustedToUTC(0) {
+ }
+
+ virtual ~TimestampType() noexcept;
+ bool isAdjustedToUTC;
+ TimeUnit unit;
+
+ void __set_isAdjustedToUTC(const bool val);
+
+ void __set_unit(const TimeUnit& val);
+
+ bool operator == (const TimestampType & rhs) const
+ {
+ if (!(isAdjustedToUTC == rhs.isAdjustedToUTC))
+ return false;
+ if (!(unit == rhs.unit))
+ return false;
+ return true;
+ }
+ bool operator != (const TimestampType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const TimestampType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(TimestampType &a, TimestampType &b);
+
+std::ostream& operator<<(std::ostream& out, const TimestampType& obj);
+
+
+class TimeType : public virtual ::apache::thrift::TBase {
+ public:
+
+ TimeType(const TimeType&);
+ TimeType& operator=(const TimeType&);
+ TimeType() : isAdjustedToUTC(0) {
+ }
+
+ virtual ~TimeType() noexcept;
+ bool isAdjustedToUTC;
+ TimeUnit unit;
+
+ void __set_isAdjustedToUTC(const bool val);
+
+ void __set_unit(const TimeUnit& val);
+
+ bool operator == (const TimeType & rhs) const
+ {
+ if (!(isAdjustedToUTC == rhs.isAdjustedToUTC))
+ return false;
+ if (!(unit == rhs.unit))
+ return false;
+ return true;
+ }
+ bool operator != (const TimeType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const TimeType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(TimeType &a, TimeType &b);
+
+std::ostream& operator<<(std::ostream& out, const TimeType& obj);
+
+
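+// Integer logical type annotation: bit width (8, 16, 32 or 64) plus signedness.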
+class IntType : public virtual ::apache::thrift::TBase {
+ public:
+
+ IntType(const IntType&);
+ IntType& operator=(const IntType&);
+ IntType() : bitWidth(0), isSigned(0) {
+ }
+
+ virtual ~IntType() noexcept;
+ int8_t bitWidth;
+ bool isSigned;
+
+ void __set_bitWidth(const int8_t val);
+
+ void __set_isSigned(const bool val);
+
+ bool operator == (const IntType & rhs) const
+ {
+ if (!(bitWidth == rhs.bitWidth))
+ return false;
+ if (!(isSigned == rhs.isSigned))
+ return false;
+ return true;
+ }
+ bool operator != (const IntType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const IntType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(IntType &a, IntType &b);
+
+std::ostream& operator<<(std::ostream& out, const IntType& obj);
+
+
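+// JsonType and BsonType (below) are empty marker structs annotating
+// BYTE_ARRAY columns that hold embedded JSON or BSON documents.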
+class JsonType : public virtual ::apache::thrift::TBase {
+ public:
+
+ JsonType(const JsonType&);
+ JsonType& operator=(const JsonType&);
+ JsonType() {
+ }
+
+ virtual ~JsonType() noexcept;
+
+ bool operator == (const JsonType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const JsonType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const JsonType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(JsonType &a, JsonType &b);
+
+std::ostream& operator<<(std::ostream& out, const JsonType& obj);
+
+
+class BsonType : public virtual ::apache::thrift::TBase {
+ public:
+
+ BsonType(const BsonType&);
+ BsonType& operator=(const BsonType&);
+ BsonType() {
+ }
+
+ virtual ~BsonType() noexcept;
+
+ bool operator == (const BsonType & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const BsonType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BsonType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BsonType &a, BsonType &b);
+
+std::ostream& operator<<(std::ostream& out, const BsonType& obj);
+
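+// LogicalType is a Thrift union over all logical type annotations; it
+// supersedes the older ConvertedType enum, which writers still populate
+// where possible for backward compatibility.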
+typedef struct _LogicalType__isset {
+ _LogicalType__isset() : STRING(false), MAP(false), LIST(false), ENUM(false), DECIMAL(false), DATE(false), TIME(false), TIMESTAMP(false), INTEGER(false), UNKNOWN(false), JSON(false), BSON(false), UUID(false) {}
+ bool STRING :1;
+ bool MAP :1;
+ bool LIST :1;
+ bool ENUM :1;
+ bool DECIMAL :1;
+ bool DATE :1;
+ bool TIME :1;
+ bool TIMESTAMP :1;
+ bool INTEGER :1;
+ bool UNKNOWN :1;
+ bool JSON :1;
+ bool BSON :1;
+ bool UUID :1;
+} _LogicalType__isset;
+
+class LogicalType : public virtual ::apache::thrift::TBase {
+ public:
+
+ LogicalType(const LogicalType&);
+ LogicalType& operator=(const LogicalType&);
+ LogicalType() {
+ }
+
+ virtual ~LogicalType() noexcept;
+ StringType STRING;
+ MapType MAP;
+ ListType LIST;
+ EnumType ENUM;
+ DecimalType DECIMAL;
+ DateType DATE;
+ TimeType TIME;
+ TimestampType TIMESTAMP;
+ IntType INTEGER;
+ NullType UNKNOWN;
+ JsonType JSON;
+ BsonType BSON;
+ UUIDType UUID;
+
+ _LogicalType__isset __isset;
+
+ void __set_STRING(const StringType& val);
+
+ void __set_MAP(const MapType& val);
+
+ void __set_LIST(const ListType& val);
+
+ void __set_ENUM(const EnumType& val);
+
+ void __set_DECIMAL(const DecimalType& val);
+
+ void __set_DATE(const DateType& val);
+
+ void __set_TIME(const TimeType& val);
+
+ void __set_TIMESTAMP(const TimestampType& val);
+
+ void __set_INTEGER(const IntType& val);
+
+ void __set_UNKNOWN(const NullType& val);
+
+ void __set_JSON(const JsonType& val);
+
+ void __set_BSON(const BsonType& val);
+
+ void __set_UUID(const UUIDType& val);
+
+ bool operator == (const LogicalType & rhs) const
+ {
+ if (__isset.STRING != rhs.__isset.STRING)
+ return false;
+ else if (__isset.STRING && !(STRING == rhs.STRING))
+ return false;
+ if (__isset.MAP != rhs.__isset.MAP)
+ return false;
+ else if (__isset.MAP && !(MAP == rhs.MAP))
+ return false;
+ if (__isset.LIST != rhs.__isset.LIST)
+ return false;
+ else if (__isset.LIST && !(LIST == rhs.LIST))
+ return false;
+ if (__isset.ENUM != rhs.__isset.ENUM)
+ return false;
+ else if (__isset.ENUM && !(ENUM == rhs.ENUM))
+ return false;
+ if (__isset.DECIMAL != rhs.__isset.DECIMAL)
+ return false;
+ else if (__isset.DECIMAL && !(DECIMAL == rhs.DECIMAL))
+ return false;
+ if (__isset.DATE != rhs.__isset.DATE)
+ return false;
+ else if (__isset.DATE && !(DATE == rhs.DATE))
+ return false;
+ if (__isset.TIME != rhs.__isset.TIME)
+ return false;
+ else if (__isset.TIME && !(TIME == rhs.TIME))
+ return false;
+ if (__isset.TIMESTAMP != rhs.__isset.TIMESTAMP)
+ return false;
+ else if (__isset.TIMESTAMP && !(TIMESTAMP == rhs.TIMESTAMP))
+ return false;
+ if (__isset.INTEGER != rhs.__isset.INTEGER)
+ return false;
+ else if (__isset.INTEGER && !(INTEGER == rhs.INTEGER))
+ return false;
+ if (__isset.UNKNOWN != rhs.__isset.UNKNOWN)
+ return false;
+ else if (__isset.UNKNOWN && !(UNKNOWN == rhs.UNKNOWN))
+ return false;
+ if (__isset.JSON != rhs.__isset.JSON)
+ return false;
+ else if (__isset.JSON && !(JSON == rhs.JSON))
+ return false;
+ if (__isset.BSON != rhs.__isset.BSON)
+ return false;
+ else if (__isset.BSON && !(BSON == rhs.BSON))
+ return false;
+ if (__isset.UUID != rhs.__isset.UUID)
+ return false;
+ else if (__isset.UUID && !(UUID == rhs.UUID))
+ return false;
+ return true;
+ }
+ bool operator != (const LogicalType &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const LogicalType & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(LogicalType &a, LogicalType &b);
+
+std::ostream& operator<<(std::ostream& out, const LogicalType& obj);
+
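+// SchemaElement is one node of the schema tree, stored flattened in
+// depth-first order in FileMetaData::schema. Group nodes set num_children;
+// leaf nodes set the physical type instead.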
+typedef struct _SchemaElement__isset {
+ _SchemaElement__isset() : type(false), type_length(false), repetition_type(false), num_children(false), converted_type(false), scale(false), precision(false), field_id(false), logicalType(false) {}
+ bool type :1;
+ bool type_length :1;
+ bool repetition_type :1;
+ bool num_children :1;
+ bool converted_type :1;
+ bool scale :1;
+ bool precision :1;
+ bool field_id :1;
+ bool logicalType :1;
+} _SchemaElement__isset;
+
+class SchemaElement : public virtual ::apache::thrift::TBase {
+ public:
+
+ SchemaElement(const SchemaElement&);
+ SchemaElement& operator=(const SchemaElement&);
+ SchemaElement() : type((Type::type)0), type_length(0), repetition_type((FieldRepetitionType::type)0), name(), num_children(0), converted_type((ConvertedType::type)0), scale(0), precision(0), field_id(0) {
+ }
+
+ virtual ~SchemaElement() noexcept;
+ Type::type type;
+ int32_t type_length;
+ FieldRepetitionType::type repetition_type;
+ std::string name;
+ int32_t num_children;
+ ConvertedType::type converted_type;
+ int32_t scale;
+ int32_t precision;
+ int32_t field_id;
+ LogicalType logicalType;
+
+ _SchemaElement__isset __isset;
+
+ void __set_type(const Type::type val);
+
+ void __set_type_length(const int32_t val);
+
+ void __set_repetition_type(const FieldRepetitionType::type val);
+
+ void __set_name(const std::string& val);
+
+ void __set_num_children(const int32_t val);
+
+ void __set_converted_type(const ConvertedType::type val);
+
+ void __set_scale(const int32_t val);
+
+ void __set_precision(const int32_t val);
+
+ void __set_field_id(const int32_t val);
+
+ void __set_logicalType(const LogicalType& val);
+
+ bool operator == (const SchemaElement & rhs) const
+ {
+ if (__isset.type != rhs.__isset.type)
+ return false;
+ else if (__isset.type && !(type == rhs.type))
+ return false;
+ if (__isset.type_length != rhs.__isset.type_length)
+ return false;
+ else if (__isset.type_length && !(type_length == rhs.type_length))
+ return false;
+ if (__isset.repetition_type != rhs.__isset.repetition_type)
+ return false;
+ else if (__isset.repetition_type && !(repetition_type == rhs.repetition_type))
+ return false;
+ if (!(name == rhs.name))
+ return false;
+ if (__isset.num_children != rhs.__isset.num_children)
+ return false;
+ else if (__isset.num_children && !(num_children == rhs.num_children))
+ return false;
+ if (__isset.converted_type != rhs.__isset.converted_type)
+ return false;
+ else if (__isset.converted_type && !(converted_type == rhs.converted_type))
+ return false;
+ if (__isset.scale != rhs.__isset.scale)
+ return false;
+ else if (__isset.scale && !(scale == rhs.scale))
+ return false;
+ if (__isset.precision != rhs.__isset.precision)
+ return false;
+ else if (__isset.precision && !(precision == rhs.precision))
+ return false;
+ if (__isset.field_id != rhs.__isset.field_id)
+ return false;
+ else if (__isset.field_id && !(field_id == rhs.field_id))
+ return false;
+ if (__isset.logicalType != rhs.__isset.logicalType)
+ return false;
+ else if (__isset.logicalType && !(logicalType == rhs.logicalType))
+ return false;
+ return true;
+ }
+ bool operator != (const SchemaElement &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const SchemaElement & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(SchemaElement &a, SchemaElement &b);
+
+std::ostream& operator<<(std::ostream& out, const SchemaElement& obj);
+
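+// Header for a V1 data page: the value count and the encodings used for
+// values and for definition/repetition levels, plus optional statistics.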
+typedef struct _DataPageHeader__isset {
+ _DataPageHeader__isset() : statistics(false) {}
+ bool statistics :1;
+} _DataPageHeader__isset;
+
+class DataPageHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ DataPageHeader(const DataPageHeader&);
+ DataPageHeader& operator=(const DataPageHeader&);
+ DataPageHeader() : num_values(0), encoding((Encoding::type)0), definition_level_encoding((Encoding::type)0), repetition_level_encoding((Encoding::type)0) {
+ }
+
+ virtual ~DataPageHeader() noexcept;
+ int32_t num_values;
+ Encoding::type encoding;
+ Encoding::type definition_level_encoding;
+ Encoding::type repetition_level_encoding;
+ Statistics statistics;
+
+ _DataPageHeader__isset __isset;
+
+ void __set_num_values(const int32_t val);
+
+ void __set_encoding(const Encoding::type val);
+
+ void __set_definition_level_encoding(const Encoding::type val);
+
+ void __set_repetition_level_encoding(const Encoding::type val);
+
+ void __set_statistics(const Statistics& val);
+
+ bool operator == (const DataPageHeader & rhs) const
+ {
+ if (!(num_values == rhs.num_values))
+ return false;
+ if (!(encoding == rhs.encoding))
+ return false;
+ if (!(definition_level_encoding == rhs.definition_level_encoding))
+ return false;
+ if (!(repetition_level_encoding == rhs.repetition_level_encoding))
+ return false;
+ if (__isset.statistics != rhs.__isset.statistics)
+ return false;
+ else if (__isset.statistics && !(statistics == rhs.statistics))
+ return false;
+ return true;
+ }
+ bool operator != (const DataPageHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DataPageHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DataPageHeader &a, DataPageHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const DataPageHeader& obj);
+
+
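+// Index pages are left unspecified by the format, so this header is an
+// empty placeholder.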
+class IndexPageHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ IndexPageHeader(const IndexPageHeader&);
+ IndexPageHeader& operator=(const IndexPageHeader&);
+ IndexPageHeader() {
+ }
+
+ virtual ~IndexPageHeader() noexcept;
+
+ bool operator == (const IndexPageHeader & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const IndexPageHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const IndexPageHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(IndexPageHeader &a, IndexPageHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const IndexPageHeader& obj);
+
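+// Header for a dictionary page: number of dictionary entries, their
+// encoding, and whether the entries are sorted in ascending order.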
+typedef struct _DictionaryPageHeader__isset {
+ _DictionaryPageHeader__isset() : is_sorted(false) {}
+ bool is_sorted :1;
+} _DictionaryPageHeader__isset;
+
+class DictionaryPageHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ DictionaryPageHeader(const DictionaryPageHeader&);
+ DictionaryPageHeader& operator=(const DictionaryPageHeader&);
+ DictionaryPageHeader() : num_values(0), encoding((Encoding::type)0), is_sorted(0) {
+ }
+
+ virtual ~DictionaryPageHeader() noexcept;
+ int32_t num_values;
+ Encoding::type encoding;
+ bool is_sorted;
+
+ _DictionaryPageHeader__isset __isset;
+
+ void __set_num_values(const int32_t val);
+
+ void __set_encoding(const Encoding::type val);
+
+ void __set_is_sorted(const bool val);
+
+ bool operator == (const DictionaryPageHeader & rhs) const
+ {
+ if (!(num_values == rhs.num_values))
+ return false;
+ if (!(encoding == rhs.encoding))
+ return false;
+ if (__isset.is_sorted != rhs.__isset.is_sorted)
+ return false;
+ else if (__isset.is_sorted && !(is_sorted == rhs.is_sorted))
+ return false;
+ return true;
+ }
+ bool operator != (const DictionaryPageHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DictionaryPageHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DictionaryPageHeader &a, DictionaryPageHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const DictionaryPageHeader& obj);
+
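+// Header for a V2 data page. Unlike V1 it records null and row counts,
+// stores the byte lengths of the (always RLE-encoded) level data, and lets
+// compression be toggled per page; is_compressed defaults to true, which is
+// why its __isset flag below starts out set.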
+typedef struct _DataPageHeaderV2__isset {
+ _DataPageHeaderV2__isset() : is_compressed(true), statistics(false) {}
+ bool is_compressed :1;
+ bool statistics :1;
+} _DataPageHeaderV2__isset;
+
+class DataPageHeaderV2 : public virtual ::apache::thrift::TBase {
+ public:
+
+ DataPageHeaderV2(const DataPageHeaderV2&);
+ DataPageHeaderV2& operator=(const DataPageHeaderV2&);
+ DataPageHeaderV2() : num_values(0), num_nulls(0), num_rows(0), encoding((Encoding::type)0), definition_levels_byte_length(0), repetition_levels_byte_length(0), is_compressed(true) {
+ }
+
+ virtual ~DataPageHeaderV2() noexcept;
+ int32_t num_values;
+ int32_t num_nulls;
+ int32_t num_rows;
+ Encoding::type encoding;
+ int32_t definition_levels_byte_length;
+ int32_t repetition_levels_byte_length;
+ bool is_compressed;
+ Statistics statistics;
+
+ _DataPageHeaderV2__isset __isset;
+
+ void __set_num_values(const int32_t val);
+
+ void __set_num_nulls(const int32_t val);
+
+ void __set_num_rows(const int32_t val);
+
+ void __set_encoding(const Encoding::type val);
+
+ void __set_definition_levels_byte_length(const int32_t val);
+
+ void __set_repetition_levels_byte_length(const int32_t val);
+
+ void __set_is_compressed(const bool val);
+
+ void __set_statistics(const Statistics& val);
+
+ bool operator == (const DataPageHeaderV2 & rhs) const
+ {
+ if (!(num_values == rhs.num_values))
+ return false;
+ if (!(num_nulls == rhs.num_nulls))
+ return false;
+ if (!(num_rows == rhs.num_rows))
+ return false;
+ if (!(encoding == rhs.encoding))
+ return false;
+ if (!(definition_levels_byte_length == rhs.definition_levels_byte_length))
+ return false;
+ if (!(repetition_levels_byte_length == rhs.repetition_levels_byte_length))
+ return false;
+ if (__isset.is_compressed != rhs.__isset.is_compressed)
+ return false;
+ else if (__isset.is_compressed && !(is_compressed == rhs.is_compressed))
+ return false;
+ if (__isset.statistics != rhs.__isset.statistics)
+ return false;
+ else if (__isset.statistics && !(statistics == rhs.statistics))
+ return false;
+ return true;
+ }
+ bool operator != (const DataPageHeaderV2 &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const DataPageHeaderV2 & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(DataPageHeaderV2 &a, DataPageHeaderV2 &b);
+
+std::ostream& operator<<(std::ostream& out, const DataPageHeaderV2& obj);
+
+
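+// Bloom filter metadata. The format currently defines one algorithm
+// (split-block Bloom filter), one hash (xxHash) and one compression scheme
+// (uncompressed); each is wrapped in a single-alternative union, presumably
+// so new variants can be added without breaking compatibility.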
+class SplitBlockAlgorithm : public virtual ::apache::thrift::TBase {
+ public:
+
+ SplitBlockAlgorithm(const SplitBlockAlgorithm&);
+ SplitBlockAlgorithm& operator=(const SplitBlockAlgorithm&);
+ SplitBlockAlgorithm() {
+ }
+
+ virtual ~SplitBlockAlgorithm() noexcept;
+
+ bool operator == (const SplitBlockAlgorithm & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const SplitBlockAlgorithm &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const SplitBlockAlgorithm & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(SplitBlockAlgorithm &a, SplitBlockAlgorithm &b);
+
+std::ostream& operator<<(std::ostream& out, const SplitBlockAlgorithm& obj);
+
+typedef struct _BloomFilterAlgorithm__isset {
+ _BloomFilterAlgorithm__isset() : BLOCK(false) {}
+ bool BLOCK :1;
+} _BloomFilterAlgorithm__isset;
+
+class BloomFilterAlgorithm : public virtual ::apache::thrift::TBase {
+ public:
+
+ BloomFilterAlgorithm(const BloomFilterAlgorithm&);
+ BloomFilterAlgorithm& operator=(const BloomFilterAlgorithm&);
+ BloomFilterAlgorithm() {
+ }
+
+ virtual ~BloomFilterAlgorithm() noexcept;
+ SplitBlockAlgorithm BLOCK;
+
+ _BloomFilterAlgorithm__isset __isset;
+
+ void __set_BLOCK(const SplitBlockAlgorithm& val);
+
+ bool operator == (const BloomFilterAlgorithm & rhs) const
+ {
+ if (__isset.BLOCK != rhs.__isset.BLOCK)
+ return false;
+ else if (__isset.BLOCK && !(BLOCK == rhs.BLOCK))
+ return false;
+ return true;
+ }
+ bool operator != (const BloomFilterAlgorithm &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BloomFilterAlgorithm & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BloomFilterAlgorithm &a, BloomFilterAlgorithm &b);
+
+std::ostream& operator<<(std::ostream& out, const BloomFilterAlgorithm& obj);
+
+
+class XxHash : public virtual ::apache::thrift::TBase {
+ public:
+
+ XxHash(const XxHash&);
+ XxHash& operator=(const XxHash&);
+ XxHash() {
+ }
+
+ virtual ~XxHash() noexcept;
+
+ bool operator == (const XxHash & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const XxHash &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const XxHash & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(XxHash &a, XxHash &b);
+
+std::ostream& operator<<(std::ostream& out, const XxHash& obj);
+
+typedef struct _BloomFilterHash__isset {
+ _BloomFilterHash__isset() : XXHASH(false) {}
+ bool XXHASH :1;
+} _BloomFilterHash__isset;
+
+class BloomFilterHash : public virtual ::apache::thrift::TBase {
+ public:
+
+ BloomFilterHash(const BloomFilterHash&);
+ BloomFilterHash& operator=(const BloomFilterHash&);
+ BloomFilterHash() {
+ }
+
+ virtual ~BloomFilterHash() noexcept;
+ XxHash XXHASH;
+
+ _BloomFilterHash__isset __isset;
+
+ void __set_XXHASH(const XxHash& val);
+
+ bool operator == (const BloomFilterHash & rhs) const
+ {
+ if (__isset.XXHASH != rhs.__isset.XXHASH)
+ return false;
+ else if (__isset.XXHASH && !(XXHASH == rhs.XXHASH))
+ return false;
+ return true;
+ }
+ bool operator != (const BloomFilterHash &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BloomFilterHash & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BloomFilterHash &a, BloomFilterHash &b);
+
+std::ostream& operator<<(std::ostream& out, const BloomFilterHash& obj);
+
+
+class Uncompressed : public virtual ::apache::thrift::TBase {
+ public:
+
+ Uncompressed(const Uncompressed&);
+ Uncompressed& operator=(const Uncompressed&);
+ Uncompressed() {
+ }
+
+ virtual ~Uncompressed() noexcept;
+
+ bool operator == (const Uncompressed & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const Uncompressed &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const Uncompressed & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(Uncompressed &a, Uncompressed &b);
+
+std::ostream& operator<<(std::ostream& out, const Uncompressed& obj);
+
+typedef struct _BloomFilterCompression__isset {
+ _BloomFilterCompression__isset() : UNCOMPRESSED(false) {}
+ bool UNCOMPRESSED :1;
+} _BloomFilterCompression__isset;
+
+class BloomFilterCompression : public virtual ::apache::thrift::TBase {
+ public:
+
+ BloomFilterCompression(const BloomFilterCompression&);
+ BloomFilterCompression& operator=(const BloomFilterCompression&);
+ BloomFilterCompression() {
+ }
+
+ virtual ~BloomFilterCompression() noexcept;
+ Uncompressed UNCOMPRESSED;
+
+ _BloomFilterCompression__isset __isset;
+
+ void __set_UNCOMPRESSED(const Uncompressed& val);
+
+ bool operator == (const BloomFilterCompression & rhs) const
+ {
+ if (__isset.UNCOMPRESSED != rhs.__isset.UNCOMPRESSED)
+ return false;
+ else if (__isset.UNCOMPRESSED && !(UNCOMPRESSED == rhs.UNCOMPRESSED))
+ return false;
+ return true;
+ }
+ bool operator != (const BloomFilterCompression &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BloomFilterCompression & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BloomFilterCompression &a, BloomFilterCompression &b);
+
+std::ostream& operator<<(std::ostream& out, const BloomFilterCompression& obj);
+
+
+class BloomFilterHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ BloomFilterHeader(const BloomFilterHeader&);
+ BloomFilterHeader& operator=(const BloomFilterHeader&);
+ BloomFilterHeader() : numBytes(0) {
+ }
+
+ virtual ~BloomFilterHeader() noexcept;
+ int32_t numBytes;
+ BloomFilterAlgorithm algorithm;
+ BloomFilterHash hash;
+ BloomFilterCompression compression;
+
+ void __set_numBytes(const int32_t val);
+
+ void __set_algorithm(const BloomFilterAlgorithm& val);
+
+ void __set_hash(const BloomFilterHash& val);
+
+ void __set_compression(const BloomFilterCompression& val);
+
+ bool operator == (const BloomFilterHeader & rhs) const
+ {
+ if (!(numBytes == rhs.numBytes))
+ return false;
+ if (!(algorithm == rhs.algorithm))
+ return false;
+ if (!(hash == rhs.hash))
+ return false;
+ if (!(compression == rhs.compression))
+ return false;
+ return true;
+ }
+ bool operator != (const BloomFilterHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const BloomFilterHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(BloomFilterHeader &a, BloomFilterHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const BloomFilterHeader& obj);
+
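+// PageHeader precedes every page in a column chunk: page type, the
+// uncompressed/compressed sizes, an optional CRC of the page data, and
+// exactly one type-specific header matching `type`.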
+typedef struct _PageHeader__isset {
+ _PageHeader__isset() : crc(false), data_page_header(false), index_page_header(false), dictionary_page_header(false), data_page_header_v2(false) {}
+ bool crc :1;
+ bool data_page_header :1;
+ bool index_page_header :1;
+ bool dictionary_page_header :1;
+ bool data_page_header_v2 :1;
+} _PageHeader__isset;
+
+class PageHeader : public virtual ::apache::thrift::TBase {
+ public:
+
+ PageHeader(const PageHeader&);
+ PageHeader& operator=(const PageHeader&);
+ PageHeader() : type((PageType::type)0), uncompressed_page_size(0), compressed_page_size(0), crc(0) {
+ }
+
+ virtual ~PageHeader() noexcept;
+ PageType::type type;
+ int32_t uncompressed_page_size;
+ int32_t compressed_page_size;
+ int32_t crc;
+ DataPageHeader data_page_header;
+ IndexPageHeader index_page_header;
+ DictionaryPageHeader dictionary_page_header;
+ DataPageHeaderV2 data_page_header_v2;
+
+ _PageHeader__isset __isset;
+
+ void __set_type(const PageType::type val);
+
+ void __set_uncompressed_page_size(const int32_t val);
+
+ void __set_compressed_page_size(const int32_t val);
+
+ void __set_crc(const int32_t val);
+
+ void __set_data_page_header(const DataPageHeader& val);
+
+ void __set_index_page_header(const IndexPageHeader& val);
+
+ void __set_dictionary_page_header(const DictionaryPageHeader& val);
+
+ void __set_data_page_header_v2(const DataPageHeaderV2& val);
+
+ bool operator == (const PageHeader & rhs) const
+ {
+ if (!(type == rhs.type))
+ return false;
+ if (!(uncompressed_page_size == rhs.uncompressed_page_size))
+ return false;
+ if (!(compressed_page_size == rhs.compressed_page_size))
+ return false;
+ if (__isset.crc != rhs.__isset.crc)
+ return false;
+ else if (__isset.crc && !(crc == rhs.crc))
+ return false;
+ if (__isset.data_page_header != rhs.__isset.data_page_header)
+ return false;
+ else if (__isset.data_page_header && !(data_page_header == rhs.data_page_header))
+ return false;
+ if (__isset.index_page_header != rhs.__isset.index_page_header)
+ return false;
+ else if (__isset.index_page_header && !(index_page_header == rhs.index_page_header))
+ return false;
+ if (__isset.dictionary_page_header != rhs.__isset.dictionary_page_header)
+ return false;
+ else if (__isset.dictionary_page_header && !(dictionary_page_header == rhs.dictionary_page_header))
+ return false;
+ if (__isset.data_page_header_v2 != rhs.__isset.data_page_header_v2)
+ return false;
+ else if (__isset.data_page_header_v2 && !(data_page_header_v2 == rhs.data_page_header_v2))
+ return false;
+ return true;
+ }
+ bool operator != (const PageHeader &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const PageHeader & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(PageHeader &a, PageHeader &b);
+
+std::ostream& operator<<(std::ostream& out, const PageHeader& obj);
+
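+// Arbitrary application-defined key/value metadata; the value is optional.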
+typedef struct _KeyValue__isset {
+ _KeyValue__isset() : value(false) {}
+ bool value :1;
+} _KeyValue__isset;
+
+class KeyValue : public virtual ::apache::thrift::TBase {
+ public:
+
+ KeyValue(const KeyValue&);
+ KeyValue& operator=(const KeyValue&);
+ KeyValue() : key(), value() {
+ }
+
+ virtual ~KeyValue() noexcept;
+ std::string key;
+ std::string value;
+
+ _KeyValue__isset __isset;
+
+ void __set_key(const std::string& val);
+
+ void __set_value(const std::string& val);
+
+ bool operator == (const KeyValue & rhs) const
+ {
+ if (!(key == rhs.key))
+ return false;
+ if (__isset.value != rhs.__isset.value)
+ return false;
+ else if (__isset.value && !(value == rhs.value))
+ return false;
+ return true;
+ }
+ bool operator != (const KeyValue &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const KeyValue & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(KeyValue &a, KeyValue &b);
+
+std::ostream& operator<<(std::ostream& out, const KeyValue& obj);
+
+
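+// Describes one sort key of a row group: the column's index within the row
+// group, the sort direction, and where nulls are placed.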
+class SortingColumn : public virtual ::apache::thrift::TBase {
+ public:
+
+ SortingColumn(const SortingColumn&);
+ SortingColumn& operator=(const SortingColumn&);
+ SortingColumn() : column_idx(0), descending(0), nulls_first(0) {
+ }
+
+ virtual ~SortingColumn() noexcept;
+ int32_t column_idx;
+ bool descending;
+ bool nulls_first;
+
+ void __set_column_idx(const int32_t val);
+
+ void __set_descending(const bool val);
+
+ void __set_nulls_first(const bool val);
+
+ bool operator == (const SortingColumn & rhs) const
+ {
+ if (!(column_idx == rhs.column_idx))
+ return false;
+ if (!(descending == rhs.descending))
+ return false;
+ if (!(nulls_first == rhs.nulls_first))
+ return false;
+ return true;
+ }
+ bool operator != (const SortingColumn &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const SortingColumn & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(SortingColumn &a, SortingColumn &b);
+
+std::ostream& operator<<(std::ostream& out, const SortingColumn& obj);
+
+
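+// Counts the pages of a given (page type, encoding) pair in a column chunk,
+// e.g. to tell whether a chunk is entirely dictionary-encoded.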
+class PageEncodingStats : public virtual ::apache::thrift::TBase {
+ public:
+
+ PageEncodingStats(const PageEncodingStats&);
+ PageEncodingStats& operator=(const PageEncodingStats&);
+ PageEncodingStats() : page_type((PageType::type)0), encoding((Encoding::type)0), count(0) {
+ }
+
+ virtual ~PageEncodingStats() noexcept;
+ PageType::type page_type;
+ Encoding::type encoding;
+ int32_t count;
+
+ void __set_page_type(const PageType::type val);
+
+ void __set_encoding(const Encoding::type val);
+
+ void __set_count(const int32_t val);
+
+ bool operator == (const PageEncodingStats & rhs) const
+ {
+ if (!(page_type == rhs.page_type))
+ return false;
+ if (!(encoding == rhs.encoding))
+ return false;
+ if (!(count == rhs.count))
+ return false;
+ return true;
+ }
+ bool operator != (const PageEncodingStats &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const PageEncodingStats & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(PageEncodingStats &a, PageEncodingStats &b);
+
+std::ostream& operator<<(std::ostream& out, const PageEncodingStats& obj);
+
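+// Per column-chunk metadata: physical type, encodings and codec used, value
+// count and (un)compressed sizes, page offsets within the file, and optional
+// statistics, per-page encoding stats and a Bloom filter offset.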
+typedef struct _ColumnMetaData__isset {
+ _ColumnMetaData__isset() : key_value_metadata(false), index_page_offset(false), dictionary_page_offset(false), statistics(false), encoding_stats(false), bloom_filter_offset(false) {}
+ bool key_value_metadata :1;
+ bool index_page_offset :1;
+ bool dictionary_page_offset :1;
+ bool statistics :1;
+ bool encoding_stats :1;
+ bool bloom_filter_offset :1;
+} _ColumnMetaData__isset;
+
+class ColumnMetaData : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnMetaData(const ColumnMetaData&);
+ ColumnMetaData& operator=(const ColumnMetaData&);
+ ColumnMetaData() : type((Type::type)0), codec((CompressionCodec::type)0), num_values(0), total_uncompressed_size(0), total_compressed_size(0), data_page_offset(0), index_page_offset(0), dictionary_page_offset(0), bloom_filter_offset(0) {
+ }
+
+ virtual ~ColumnMetaData() noexcept;
+ Type::type type;
+ std::vector<Encoding::type> encodings;
+ std::vector<std::string> path_in_schema;
+ CompressionCodec::type codec;
+ int64_t num_values;
+ int64_t total_uncompressed_size;
+ int64_t total_compressed_size;
+ std::vector<KeyValue> key_value_metadata;
+ int64_t data_page_offset;
+ int64_t index_page_offset;
+ int64_t dictionary_page_offset;
+ Statistics statistics;
+ std::vector<PageEncodingStats> encoding_stats;
+ int64_t bloom_filter_offset;
+
+ _ColumnMetaData__isset __isset;
+
+ void __set_type(const Type::type val);
+
+ void __set_encodings(const std::vector<Encoding::type> & val);
+
+ void __set_path_in_schema(const std::vector<std::string> & val);
+
+ void __set_codec(const CompressionCodec::type val);
+
+ void __set_num_values(const int64_t val);
+
+ void __set_total_uncompressed_size(const int64_t val);
+
+ void __set_total_compressed_size(const int64_t val);
+
+ void __set_key_value_metadata(const std::vector<KeyValue> & val);
+
+ void __set_data_page_offset(const int64_t val);
+
+ void __set_index_page_offset(const int64_t val);
+
+ void __set_dictionary_page_offset(const int64_t val);
+
+ void __set_statistics(const Statistics& val);
+
+ void __set_encoding_stats(const std::vector<PageEncodingStats> & val);
+
+ void __set_bloom_filter_offset(const int64_t val);
+
+ bool operator == (const ColumnMetaData & rhs) const
+ {
+ if (!(type == rhs.type))
+ return false;
+ if (!(encodings == rhs.encodings))
+ return false;
+ if (!(path_in_schema == rhs.path_in_schema))
+ return false;
+ if (!(codec == rhs.codec))
+ return false;
+ if (!(num_values == rhs.num_values))
+ return false;
+ if (!(total_uncompressed_size == rhs.total_uncompressed_size))
+ return false;
+ if (!(total_compressed_size == rhs.total_compressed_size))
+ return false;
+ if (__isset.key_value_metadata != rhs.__isset.key_value_metadata)
+ return false;
+ else if (__isset.key_value_metadata && !(key_value_metadata == rhs.key_value_metadata))
+ return false;
+ if (!(data_page_offset == rhs.data_page_offset))
+ return false;
+ if (__isset.index_page_offset != rhs.__isset.index_page_offset)
+ return false;
+ else if (__isset.index_page_offset && !(index_page_offset == rhs.index_page_offset))
+ return false;
+ if (__isset.dictionary_page_offset != rhs.__isset.dictionary_page_offset)
+ return false;
+ else if (__isset.dictionary_page_offset && !(dictionary_page_offset == rhs.dictionary_page_offset))
+ return false;
+ if (__isset.statistics != rhs.__isset.statistics)
+ return false;
+ else if (__isset.statistics && !(statistics == rhs.statistics))
+ return false;
+ if (__isset.encoding_stats != rhs.__isset.encoding_stats)
+ return false;
+ else if (__isset.encoding_stats && !(encoding_stats == rhs.encoding_stats))
+ return false;
+ if (__isset.bloom_filter_offset != rhs.__isset.bloom_filter_offset)
+ return false;
+ else if (__isset.bloom_filter_offset && !(bloom_filter_offset == rhs.bloom_filter_offset))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnMetaData &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnMetaData & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnMetaData &a, ColumnMetaData &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnMetaData& obj);
+
+
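+// Modular-encryption metadata: a column is protected either with the file's
+// footer key (EncryptionWithFooterKey) or with its own key
+// (EncryptionWithColumnKey); ColumnCryptoMetaData is the union of the two.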
+class EncryptionWithFooterKey : public virtual ::apache::thrift::TBase {
+ public:
+
+ EncryptionWithFooterKey(const EncryptionWithFooterKey&);
+ EncryptionWithFooterKey& operator=(const EncryptionWithFooterKey&);
+ EncryptionWithFooterKey() {
+ }
+
+ virtual ~EncryptionWithFooterKey() noexcept;
+
+ bool operator == (const EncryptionWithFooterKey & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const EncryptionWithFooterKey &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const EncryptionWithFooterKey & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(EncryptionWithFooterKey &a, EncryptionWithFooterKey &b);
+
+std::ostream& operator<<(std::ostream& out, const EncryptionWithFooterKey& obj);
+
+typedef struct _EncryptionWithColumnKey__isset {
+ _EncryptionWithColumnKey__isset() : key_metadata(false) {}
+ bool key_metadata :1;
+} _EncryptionWithColumnKey__isset;
+
+class EncryptionWithColumnKey : public virtual ::apache::thrift::TBase {
+ public:
+
+ EncryptionWithColumnKey(const EncryptionWithColumnKey&);
+ EncryptionWithColumnKey& operator=(const EncryptionWithColumnKey&);
+ EncryptionWithColumnKey() : key_metadata() {
+ }
+
+ virtual ~EncryptionWithColumnKey() noexcept;
+ std::vector<std::string> path_in_schema;
+ std::string key_metadata;
+
+ _EncryptionWithColumnKey__isset __isset;
+
+ void __set_path_in_schema(const std::vector<std::string> & val);
+
+ void __set_key_metadata(const std::string& val);
+
+ bool operator == (const EncryptionWithColumnKey & rhs) const
+ {
+ if (!(path_in_schema == rhs.path_in_schema))
+ return false;
+ if (__isset.key_metadata != rhs.__isset.key_metadata)
+ return false;
+ else if (__isset.key_metadata && !(key_metadata == rhs.key_metadata))
+ return false;
+ return true;
+ }
+ bool operator != (const EncryptionWithColumnKey &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const EncryptionWithColumnKey & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(EncryptionWithColumnKey &a, EncryptionWithColumnKey &b);
+
+std::ostream& operator<<(std::ostream& out, const EncryptionWithColumnKey& obj);
+
+typedef struct _ColumnCryptoMetaData__isset {
+ _ColumnCryptoMetaData__isset() : ENCRYPTION_WITH_FOOTER_KEY(false), ENCRYPTION_WITH_COLUMN_KEY(false) {}
+ bool ENCRYPTION_WITH_FOOTER_KEY :1;
+ bool ENCRYPTION_WITH_COLUMN_KEY :1;
+} _ColumnCryptoMetaData__isset;
+
+class ColumnCryptoMetaData : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnCryptoMetaData(const ColumnCryptoMetaData&);
+ ColumnCryptoMetaData& operator=(const ColumnCryptoMetaData&);
+ ColumnCryptoMetaData() {
+ }
+
+ virtual ~ColumnCryptoMetaData() noexcept;
+ EncryptionWithFooterKey ENCRYPTION_WITH_FOOTER_KEY;
+ EncryptionWithColumnKey ENCRYPTION_WITH_COLUMN_KEY;
+
+ _ColumnCryptoMetaData__isset __isset;
+
+ void __set_ENCRYPTION_WITH_FOOTER_KEY(const EncryptionWithFooterKey& val);
+
+ void __set_ENCRYPTION_WITH_COLUMN_KEY(const EncryptionWithColumnKey& val);
+
+ bool operator == (const ColumnCryptoMetaData & rhs) const
+ {
+ if (__isset.ENCRYPTION_WITH_FOOTER_KEY != rhs.__isset.ENCRYPTION_WITH_FOOTER_KEY)
+ return false;
+ else if (__isset.ENCRYPTION_WITH_FOOTER_KEY && !(ENCRYPTION_WITH_FOOTER_KEY == rhs.ENCRYPTION_WITH_FOOTER_KEY))
+ return false;
+ if (__isset.ENCRYPTION_WITH_COLUMN_KEY != rhs.__isset.ENCRYPTION_WITH_COLUMN_KEY)
+ return false;
+ else if (__isset.ENCRYPTION_WITH_COLUMN_KEY && !(ENCRYPTION_WITH_COLUMN_KEY == rhs.ENCRYPTION_WITH_COLUMN_KEY))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnCryptoMetaData &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnCryptoMetaData & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnCryptoMetaData &a, ColumnCryptoMetaData &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnCryptoMetaData& obj);
+
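+// A column chunk within a row group: its file location (file_path is set
+// only when the chunk lives in a separate file), the ColumnMetaData, page
+// index offsets/lengths, and optional encryption metadata.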
+typedef struct _ColumnChunk__isset {
+ _ColumnChunk__isset() : file_path(false), meta_data(false), offset_index_offset(false), offset_index_length(false), column_index_offset(false), column_index_length(false), crypto_metadata(false), encrypted_column_metadata(false) {}
+ bool file_path :1;
+ bool meta_data :1;
+ bool offset_index_offset :1;
+ bool offset_index_length :1;
+ bool column_index_offset :1;
+ bool column_index_length :1;
+ bool crypto_metadata :1;
+ bool encrypted_column_metadata :1;
+} _ColumnChunk__isset;
+
+class ColumnChunk : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnChunk(const ColumnChunk&);
+ ColumnChunk& operator=(const ColumnChunk&);
+ ColumnChunk() : file_path(), file_offset(0), offset_index_offset(0), offset_index_length(0), column_index_offset(0), column_index_length(0), encrypted_column_metadata() {
+ }
+
+ virtual ~ColumnChunk() noexcept;
+ std::string file_path;
+ int64_t file_offset;
+ ColumnMetaData meta_data;
+ int64_t offset_index_offset;
+ int32_t offset_index_length;
+ int64_t column_index_offset;
+ int32_t column_index_length;
+ ColumnCryptoMetaData crypto_metadata;
+ std::string encrypted_column_metadata;
+
+ _ColumnChunk__isset __isset;
+
+ void __set_file_path(const std::string& val);
+
+ void __set_file_offset(const int64_t val);
+
+ void __set_meta_data(const ColumnMetaData& val);
+
+ void __set_offset_index_offset(const int64_t val);
+
+ void __set_offset_index_length(const int32_t val);
+
+ void __set_column_index_offset(const int64_t val);
+
+ void __set_column_index_length(const int32_t val);
+
+ void __set_crypto_metadata(const ColumnCryptoMetaData& val);
+
+ void __set_encrypted_column_metadata(const std::string& val);
+
+ bool operator == (const ColumnChunk & rhs) const
+ {
+ if (__isset.file_path != rhs.__isset.file_path)
+ return false;
+ else if (__isset.file_path && !(file_path == rhs.file_path))
+ return false;
+ if (!(file_offset == rhs.file_offset))
+ return false;
+ if (__isset.meta_data != rhs.__isset.meta_data)
+ return false;
+ else if (__isset.meta_data && !(meta_data == rhs.meta_data))
+ return false;
+ if (__isset.offset_index_offset != rhs.__isset.offset_index_offset)
+ return false;
+ else if (__isset.offset_index_offset && !(offset_index_offset == rhs.offset_index_offset))
+ return false;
+ if (__isset.offset_index_length != rhs.__isset.offset_index_length)
+ return false;
+ else if (__isset.offset_index_length && !(offset_index_length == rhs.offset_index_length))
+ return false;
+ if (__isset.column_index_offset != rhs.__isset.column_index_offset)
+ return false;
+ else if (__isset.column_index_offset && !(column_index_offset == rhs.column_index_offset))
+ return false;
+ if (__isset.column_index_length != rhs.__isset.column_index_length)
+ return false;
+ else if (__isset.column_index_length && !(column_index_length == rhs.column_index_length))
+ return false;
+ if (__isset.crypto_metadata != rhs.__isset.crypto_metadata)
+ return false;
+ else if (__isset.crypto_metadata && !(crypto_metadata == rhs.crypto_metadata))
+ return false;
+ if (__isset.encrypted_column_metadata != rhs.__isset.encrypted_column_metadata)
+ return false;
+ else if (__isset.encrypted_column_metadata && !(encrypted_column_metadata == rhs.encrypted_column_metadata))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnChunk &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnChunk & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnChunk &a, ColumnChunk &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnChunk& obj);
+
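+// A row group: one ColumnChunk per leaf column, aggregate byte and row
+// counts, an optional sort description, and the group's ordinal position
+// in the file.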
+typedef struct _RowGroup__isset {
+ _RowGroup__isset() : sorting_columns(false), file_offset(false), total_compressed_size(false), ordinal(false) {}
+ bool sorting_columns :1;
+ bool file_offset :1;
+ bool total_compressed_size :1;
+ bool ordinal :1;
+} _RowGroup__isset;
+
+class RowGroup : public virtual ::apache::thrift::TBase {
+ public:
+
+ RowGroup(const RowGroup&);
+ RowGroup& operator=(const RowGroup&);
+ RowGroup() : total_byte_size(0), num_rows(0), file_offset(0), total_compressed_size(0), ordinal(0) {
+ }
+
+ virtual ~RowGroup() noexcept;
+ std::vector<ColumnChunk> columns;
+ int64_t total_byte_size;
+ int64_t num_rows;
+ std::vector<SortingColumn> sorting_columns;
+ int64_t file_offset;
+ int64_t total_compressed_size;
+ int16_t ordinal;
+
+ _RowGroup__isset __isset;
+
+ void __set_columns(const std::vector<ColumnChunk> & val);
+
+ void __set_total_byte_size(const int64_t val);
+
+ void __set_num_rows(const int64_t val);
+
+ void __set_sorting_columns(const std::vector<SortingColumn> & val);
+
+ void __set_file_offset(const int64_t val);
+
+ void __set_total_compressed_size(const int64_t val);
+
+ void __set_ordinal(const int16_t val);
+
+ bool operator == (const RowGroup & rhs) const
+ {
+ if (!(columns == rhs.columns))
+ return false;
+ if (!(total_byte_size == rhs.total_byte_size))
+ return false;
+ if (!(num_rows == rhs.num_rows))
+ return false;
+ if (__isset.sorting_columns != rhs.__isset.sorting_columns)
+ return false;
+ else if (__isset.sorting_columns && !(sorting_columns == rhs.sorting_columns))
+ return false;
+ if (__isset.file_offset != rhs.__isset.file_offset)
+ return false;
+ else if (__isset.file_offset && !(file_offset == rhs.file_offset))
+ return false;
+ if (__isset.total_compressed_size != rhs.__isset.total_compressed_size)
+ return false;
+ else if (__isset.total_compressed_size && !(total_compressed_size == rhs.total_compressed_size))
+ return false;
+ if (__isset.ordinal != rhs.__isset.ordinal)
+ return false;
+ else if (__isset.ordinal && !(ordinal == rhs.ordinal))
+ return false;
+ return true;
+ }
+ bool operator != (const RowGroup &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const RowGroup & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(RowGroup &a, RowGroup &b);
+
+std::ostream& operator<<(std::ostream& out, const RowGroup& obj);
+
+
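+// ColumnOrder is a union describing the ordering used for min/max
+// statistics; its only defined alternative, TypeDefinedOrder, means values
+// are compared according to the column's type.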
+class TypeDefinedOrder : public virtual ::apache::thrift::TBase {
+ public:
+
+ TypeDefinedOrder(const TypeDefinedOrder&);
+ TypeDefinedOrder& operator=(const TypeDefinedOrder&);
+ TypeDefinedOrder() {
+ }
+
+ virtual ~TypeDefinedOrder() noexcept;
+
+ bool operator == (const TypeDefinedOrder & /* rhs */) const
+ {
+ return true;
+ }
+ bool operator != (const TypeDefinedOrder &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const TypeDefinedOrder & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(TypeDefinedOrder &a, TypeDefinedOrder &b);
+
+std::ostream& operator<<(std::ostream& out, const TypeDefinedOrder& obj);
+
+typedef struct _ColumnOrder__isset {
+ _ColumnOrder__isset() : TYPE_ORDER(false) {}
+ bool TYPE_ORDER :1;
+} _ColumnOrder__isset;
+
+class ColumnOrder : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnOrder(const ColumnOrder&);
+ ColumnOrder& operator=(const ColumnOrder&);
+ ColumnOrder() {
+ }
+
+ virtual ~ColumnOrder() noexcept;
+ TypeDefinedOrder TYPE_ORDER;
+
+ _ColumnOrder__isset __isset;
+
+ void __set_TYPE_ORDER(const TypeDefinedOrder& val);
+
+ bool operator == (const ColumnOrder & rhs) const
+ {
+ if (__isset.TYPE_ORDER != rhs.__isset.TYPE_ORDER)
+ return false;
+ else if (__isset.TYPE_ORDER && !(TYPE_ORDER == rhs.TYPE_ORDER))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnOrder &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnOrder & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnOrder &a, ColumnOrder &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnOrder& obj);
+
+
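+// Page-index structures: OffsetIndex lists each page's location in the file
+// (offset, compressed size, index of its first row), while ColumnIndex keeps
+// per-page null flags, min/max bounds and optional null counts so readers
+// can skip pages without decoding page headers.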
+class PageLocation : public virtual ::apache::thrift::TBase {
+ public:
+
+ PageLocation(const PageLocation&);
+ PageLocation& operator=(const PageLocation&);
+ PageLocation() : offset(0), compressed_page_size(0), first_row_index(0) {
+ }
+
+ virtual ~PageLocation() noexcept;
+ int64_t offset;
+ int32_t compressed_page_size;
+ int64_t first_row_index;
+
+ void __set_offset(const int64_t val);
+
+ void __set_compressed_page_size(const int32_t val);
+
+ void __set_first_row_index(const int64_t val);
+
+ bool operator == (const PageLocation & rhs) const
+ {
+ if (!(offset == rhs.offset))
+ return false;
+ if (!(compressed_page_size == rhs.compressed_page_size))
+ return false;
+ if (!(first_row_index == rhs.first_row_index))
+ return false;
+ return true;
+ }
+ bool operator != (const PageLocation &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const PageLocation & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(PageLocation &a, PageLocation &b);
+
+std::ostream& operator<<(std::ostream& out, const PageLocation& obj);
+
+
+class OffsetIndex : public virtual ::apache::thrift::TBase {
+ public:
+
+ OffsetIndex(const OffsetIndex&);
+ OffsetIndex& operator=(const OffsetIndex&);
+ OffsetIndex() {
+ }
+
+ virtual ~OffsetIndex() noexcept;
+ std::vector<PageLocation> page_locations;
+
+ void __set_page_locations(const std::vector<PageLocation> & val);
+
+ bool operator == (const OffsetIndex & rhs) const
+ {
+ if (!(page_locations == rhs.page_locations))
+ return false;
+ return true;
+ }
+ bool operator != (const OffsetIndex &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const OffsetIndex & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(OffsetIndex &a, OffsetIndex &b);
+
+std::ostream& operator<<(std::ostream& out, const OffsetIndex& obj);
+
+typedef struct _ColumnIndex__isset {
+ _ColumnIndex__isset() : null_counts(false) {}
+ bool null_counts :1;
+} _ColumnIndex__isset;
+
+class ColumnIndex : public virtual ::apache::thrift::TBase {
+ public:
+
+ ColumnIndex(const ColumnIndex&);
+ ColumnIndex& operator=(const ColumnIndex&);
+ ColumnIndex() : boundary_order((BoundaryOrder::type)0) {
+ }
+
+ virtual ~ColumnIndex() noexcept;
+ std::vector<bool> null_pages;
+ std::vector<std::string> min_values;
+ std::vector<std::string> max_values;
+ BoundaryOrder::type boundary_order;
+ std::vector<int64_t> null_counts;
+
+ _ColumnIndex__isset __isset;
+
+ void __set_null_pages(const std::vector<bool> & val);
+
+ void __set_min_values(const std::vector<std::string> & val);
+
+ void __set_max_values(const std::vector<std::string> & val);
+
+ void __set_boundary_order(const BoundaryOrder::type val);
+
+ void __set_null_counts(const std::vector<int64_t> & val);
+
+ bool operator == (const ColumnIndex & rhs) const
+ {
+ if (!(null_pages == rhs.null_pages))
+ return false;
+ if (!(min_values == rhs.min_values))
+ return false;
+ if (!(max_values == rhs.max_values))
+ return false;
+ if (!(boundary_order == rhs.boundary_order))
+ return false;
+ if (__isset.null_counts != rhs.__isset.null_counts)
+ return false;
+ else if (__isset.null_counts && !(null_counts == rhs.null_counts))
+ return false;
+ return true;
+ }
+ bool operator != (const ColumnIndex &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const ColumnIndex & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(ColumnIndex &a, ColumnIndex &b);
+
+std::ostream& operator<<(std::ostream& out, const ColumnIndex& obj);
+
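+// The two encryption algorithms defined by the format: AES_GCM_V1 encrypts
+// all modules with AES-GCM, while AES_GCM_CTR_V1 uses GCM for metadata and
+// the cheaper CTR mode for page data; both carry optional AAD-prefix settings.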
+typedef struct _AesGcmV1__isset {
+ _AesGcmV1__isset() : aad_prefix(false), aad_file_unique(false), supply_aad_prefix(false) {}
+ bool aad_prefix :1;
+ bool aad_file_unique :1;
+ bool supply_aad_prefix :1;
+} _AesGcmV1__isset;
+
+class AesGcmV1 : public virtual ::apache::thrift::TBase {
+ public:
+
+ AesGcmV1(const AesGcmV1&);
+ AesGcmV1& operator=(const AesGcmV1&);
+ AesGcmV1() : aad_prefix(), aad_file_unique(), supply_aad_prefix(0) {
+ }
+
+ virtual ~AesGcmV1() noexcept;
+ std::string aad_prefix;
+ std::string aad_file_unique;
+ bool supply_aad_prefix;
+
+ _AesGcmV1__isset __isset;
+
+ void __set_aad_prefix(const std::string& val);
+
+ void __set_aad_file_unique(const std::string& val);
+
+ void __set_supply_aad_prefix(const bool val);
+
+ bool operator == (const AesGcmV1 & rhs) const
+ {
+ if (__isset.aad_prefix != rhs.__isset.aad_prefix)
+ return false;
+ else if (__isset.aad_prefix && !(aad_prefix == rhs.aad_prefix))
+ return false;
+ if (__isset.aad_file_unique != rhs.__isset.aad_file_unique)
+ return false;
+ else if (__isset.aad_file_unique && !(aad_file_unique == rhs.aad_file_unique))
+ return false;
+ if (__isset.supply_aad_prefix != rhs.__isset.supply_aad_prefix)
+ return false;
+ else if (__isset.supply_aad_prefix && !(supply_aad_prefix == rhs.supply_aad_prefix))
+ return false;
+ return true;
+ }
+ bool operator != (const AesGcmV1 &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const AesGcmV1 & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(AesGcmV1 &a, AesGcmV1 &b);
+
+std::ostream& operator<<(std::ostream& out, const AesGcmV1& obj);
+
+typedef struct _AesGcmCtrV1__isset {
+ _AesGcmCtrV1__isset() : aad_prefix(false), aad_file_unique(false), supply_aad_prefix(false) {}
+ bool aad_prefix :1;
+ bool aad_file_unique :1;
+ bool supply_aad_prefix :1;
+} _AesGcmCtrV1__isset;
+
+class AesGcmCtrV1 : public virtual ::apache::thrift::TBase {
+ public:
+
+ AesGcmCtrV1(const AesGcmCtrV1&);
+ AesGcmCtrV1& operator=(const AesGcmCtrV1&);
+ AesGcmCtrV1() : aad_prefix(), aad_file_unique(), supply_aad_prefix(0) {
+ }
+
+ virtual ~AesGcmCtrV1() noexcept;
+ std::string aad_prefix;
+ std::string aad_file_unique;
+ bool supply_aad_prefix;
+
+ _AesGcmCtrV1__isset __isset;
+
+ void __set_aad_prefix(const std::string& val);
+
+ void __set_aad_file_unique(const std::string& val);
+
+ void __set_supply_aad_prefix(const bool val);
+
+ bool operator == (const AesGcmCtrV1 & rhs) const
+ {
+ if (__isset.aad_prefix != rhs.__isset.aad_prefix)
+ return false;
+ else if (__isset.aad_prefix && !(aad_prefix == rhs.aad_prefix))
+ return false;
+ if (__isset.aad_file_unique != rhs.__isset.aad_file_unique)
+ return false;
+ else if (__isset.aad_file_unique && !(aad_file_unique == rhs.aad_file_unique))
+ return false;
+ if (__isset.supply_aad_prefix != rhs.__isset.supply_aad_prefix)
+ return false;
+ else if (__isset.supply_aad_prefix && !(supply_aad_prefix == rhs.supply_aad_prefix))
+ return false;
+ return true;
+ }
+ bool operator != (const AesGcmCtrV1 &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const AesGcmCtrV1 & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(AesGcmCtrV1 &a, AesGcmCtrV1 &b);
+
+std::ostream& operator<<(std::ostream& out, const AesGcmCtrV1& obj);
+
+typedef struct _EncryptionAlgorithm__isset {
+ _EncryptionAlgorithm__isset() : AES_GCM_V1(false), AES_GCM_CTR_V1(false) {}
+ bool AES_GCM_V1 :1;
+ bool AES_GCM_CTR_V1 :1;
+} _EncryptionAlgorithm__isset;
+
+class EncryptionAlgorithm : public virtual ::apache::thrift::TBase {
+ public:
+
+ EncryptionAlgorithm(const EncryptionAlgorithm&);
+ EncryptionAlgorithm& operator=(const EncryptionAlgorithm&);
+ EncryptionAlgorithm() {
+ }
+
+ virtual ~EncryptionAlgorithm() noexcept;
+ AesGcmV1 AES_GCM_V1;
+ AesGcmCtrV1 AES_GCM_CTR_V1;
+
+ _EncryptionAlgorithm__isset __isset;
+
+ void __set_AES_GCM_V1(const AesGcmV1& val);
+
+ void __set_AES_GCM_CTR_V1(const AesGcmCtrV1& val);
+
+ bool operator == (const EncryptionAlgorithm & rhs) const
+ {
+ if (__isset.AES_GCM_V1 != rhs.__isset.AES_GCM_V1)
+ return false;
+ else if (__isset.AES_GCM_V1 && !(AES_GCM_V1 == rhs.AES_GCM_V1))
+ return false;
+ if (__isset.AES_GCM_CTR_V1 != rhs.__isset.AES_GCM_CTR_V1)
+ return false;
+ else if (__isset.AES_GCM_CTR_V1 && !(AES_GCM_CTR_V1 == rhs.AES_GCM_CTR_V1))
+ return false;
+ return true;
+ }
+ bool operator != (const EncryptionAlgorithm &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const EncryptionAlgorithm & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(EncryptionAlgorithm &a, EncryptionAlgorithm &b);
+
+std::ostream& operator<<(std::ostream& out, const EncryptionAlgorithm& obj);
+
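+// FileMetaData is the Parquet footer: format version, the flattened schema,
+// total row count, all row groups, optional key/value metadata, the writer
+// identification string, per-column statistics orders and, for encrypted
+// files with plaintext footers, the algorithm and footer-signing key
+// metadata. Readers locate it via the trailing "PAR1" magic and the 4-byte
+// little-endian footer length at the end of the file.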
+typedef struct _FileMetaData__isset {
+ _FileMetaData__isset() : key_value_metadata(false), created_by(false), column_orders(false), encryption_algorithm(false), footer_signing_key_metadata(false) {}
+ bool key_value_metadata :1;
+ bool created_by :1;
+ bool column_orders :1;
+ bool encryption_algorithm :1;
+ bool footer_signing_key_metadata :1;
+} _FileMetaData__isset;
+
+class FileMetaData : public virtual ::apache::thrift::TBase {
+ public:
+
+ FileMetaData(const FileMetaData&);
+ FileMetaData& operator=(const FileMetaData&);
+ FileMetaData() : version(0), num_rows(0), created_by(), footer_signing_key_metadata() {
+ }
+
+ virtual ~FileMetaData() noexcept;
+ int32_t version;
+ std::vector<SchemaElement> schema;
+ int64_t num_rows;
+ std::vector<RowGroup> row_groups;
+ std::vector<KeyValue> key_value_metadata;
+ std::string created_by;
+ std::vector<ColumnOrder> column_orders;
+ EncryptionAlgorithm encryption_algorithm;
+ std::string footer_signing_key_metadata;
+
+ _FileMetaData__isset __isset;
+
+ void __set_version(const int32_t val);
+
+ void __set_schema(const std::vector<SchemaElement> & val);
+
+ void __set_num_rows(const int64_t val);
+
+ void __set_row_groups(const std::vector<RowGroup> & val);
+
+ void __set_key_value_metadata(const std::vector<KeyValue> & val);
+
+ void __set_created_by(const std::string& val);
+
+ void __set_column_orders(const std::vector<ColumnOrder> & val);
+
+ void __set_encryption_algorithm(const EncryptionAlgorithm& val);
+
+ void __set_footer_signing_key_metadata(const std::string& val);
+
+ bool operator == (const FileMetaData & rhs) const
+ {
+ if (!(version == rhs.version))
+ return false;
+ if (!(schema == rhs.schema))
+ return false;
+ if (!(num_rows == rhs.num_rows))
+ return false;
+ if (!(row_groups == rhs.row_groups))
+ return false;
+ if (__isset.key_value_metadata != rhs.__isset.key_value_metadata)
+ return false;
+ else if (__isset.key_value_metadata && !(key_value_metadata == rhs.key_value_metadata))
+ return false;
+ if (__isset.created_by != rhs.__isset.created_by)
+ return false;
+ else if (__isset.created_by && !(created_by == rhs.created_by))
+ return false;
+ if (__isset.column_orders != rhs.__isset.column_orders)
+ return false;
+ else if (__isset.column_orders && !(column_orders == rhs.column_orders))
+ return false;
+ if (__isset.encryption_algorithm != rhs.__isset.encryption_algorithm)
+ return false;
+ else if (__isset.encryption_algorithm && !(encryption_algorithm == rhs.encryption_algorithm))
+ return false;
+ if (__isset.footer_signing_key_metadata != rhs.__isset.footer_signing_key_metadata)
+ return false;
+ else if (__isset.footer_signing_key_metadata && !(footer_signing_key_metadata == rhs.footer_signing_key_metadata))
+ return false;
+ return true;
+ }
+ bool operator != (const FileMetaData &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const FileMetaData & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(FileMetaData &a, FileMetaData &b);
+
+std::ostream& operator<<(std::ostream& out, const FileMetaData& obj);
+
+typedef struct _FileCryptoMetaData__isset {
+ _FileCryptoMetaData__isset() : key_metadata(false) {}
+ bool key_metadata :1;
+} _FileCryptoMetaData__isset;
+
+class FileCryptoMetaData : public virtual ::apache::thrift::TBase {
+ public:
+
+ FileCryptoMetaData(const FileCryptoMetaData&);
+ FileCryptoMetaData& operator=(const FileCryptoMetaData&);
+ FileCryptoMetaData() : key_metadata() {
+ }
+
+ virtual ~FileCryptoMetaData() noexcept;
+ EncryptionAlgorithm encryption_algorithm;
+ std::string key_metadata;
+
+ _FileCryptoMetaData__isset __isset;
+
+ void __set_encryption_algorithm(const EncryptionAlgorithm& val);
+
+ void __set_key_metadata(const std::string& val);
+
+ bool operator == (const FileCryptoMetaData & rhs) const
+ {
+ if (!(encryption_algorithm == rhs.encryption_algorithm))
+ return false;
+ if (__isset.key_metadata != rhs.__isset.key_metadata)
+ return false;
+ else if (__isset.key_metadata && !(key_metadata == rhs.key_metadata))
+ return false;
+ return true;
+ }
+ bool operator != (const FileCryptoMetaData &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator < (const FileCryptoMetaData & ) const;
+
+ uint32_t read(::apache::thrift::protocol::TProtocol* iprot);
+ uint32_t write(::apache::thrift::protocol::TProtocol* oprot) const;
+
+ virtual void printTo(std::ostream& out) const;
+};
+
+void swap(FileCryptoMetaData &a, FileCryptoMetaData &b);
+
+std::ostream& operator<<(std::ostream& out, const FileCryptoMetaData& obj);
+
+}} // namespace
+
+#endif
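A minimal usage sketch for the generated types above: it assumes the parquet::format namespace and include path used by Arrow's generated Parquet Thrift code, plus the standard Apache Thrift TMemoryBuffer transport and compact protocol (with std::shared_ptr, as in Thrift >= 0.11); the field values are illustrative only.

#include <cassert>
#include <memory>

#include <thrift/protocol/TCompactProtocol.h>
#include <thrift/transport/TBufferTransports.h>

#include "generated/parquet_types.h"  // include path is an assumption

int main() {
  using apache::thrift::protocol::TCompactProtocol;
  using apache::thrift::transport::TMemoryBuffer;

  // __set_* assigns the field and flips the matching __isset bit; operator==
  // compares the __isset bits first, so unset optional fields compare equal
  // regardless of their default-constructed contents.
  parquet::format::AesGcmCtrV1 aes;
  aes.__set_aad_prefix("example-prefix");  // illustrative value
  aes.__set_supply_aad_prefix(true);

  parquet::format::EncryptionAlgorithm algo;
  algo.__set_AES_GCM_CTR_V1(aes);

  // Round-trip through the compact protocol (the encoding Parquet uses for
  // its footer): write() serializes into the transport, read() parses back.
  auto buffer = std::make_shared<TMemoryBuffer>();
  TCompactProtocol proto(buffer);
  algo.write(&proto);

  parquet::format::EncryptionAlgorithm decoded;
  decoded.read(&proto);
  assert(decoded == algo);
  return 0;
}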
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/README b/contrib/libs/apache/arrow/cpp/src/parquet/README
index 326bd7253f4..fc16a46ca08 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/README
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/README
@@ -1,10 +1,10 @@
-The CompatibilityTest of bloom_filter-test.cc is used to test the cross-compatibility
-of Bloom filters between parquet-mr and parquet-cpp. It reads the Bloom filter binary
-generated by the Bloom filter class in the parquet-mr project and tests whether the
-values inserted earlier are reported as present by the filter.
-
-The Bloom filter binary is generated in three steps by parquet-mr:
-Step 1: Construct a Bloom filter with 1024 bytes of bitset.
-Step 2: Insert hashes of the "hello", "parquet", "bloom", "filter" strings into the
-Bloom filter by calling the hash and insert APIs.
-Step 3: Call the writeTo API to write the filter to a file.
+The CompatibilityTest of bloom_filter-test.cc is used to test the cross-compatibility
+of Bloom filters between parquet-mr and parquet-cpp. It reads the Bloom filter binary
+generated by the Bloom filter class in the parquet-mr project and tests whether the
+values inserted earlier are reported as present by the filter.
+
+The Bloom filter binary is generated in three steps by parquet-mr:
+Step 1: Construct a Bloom filter with 1024 bytes of bitset.
+Step 2: Insert hashes of the "hello", "parquet", "bloom", "filter" strings into the
+Bloom filter by calling the hash and insert APIs.
+Step 3: Call the writeTo API to write the filter to a file.
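The parquet-cpp side of this round-trip can be sketched as follows. This is a sketch only, assuming parquet-cpp's BlockSplitBloomFilter API (Init/Hash/InsertHash/WriteTo) and Arrow's FileOutputStream; it is not the actual code of bloom_filter-test.cc.

#include <string>

#include "arrow/io/file.h"
#include "parquet/bloom_filter.h"
#include "parquet/types.h"

// Mirrors the three parquet-mr steps described above.
void WriteCompatibilityFilter(const std::string& path) {
  parquet::BlockSplitBloomFilter filter;
  filter.Init(1024);  // Step 1: 1024 bytes of bitset.

  // Step 2: hash and insert the four test strings.
  for (const std::string& word : {"hello", "parquet", "bloom", "filter"}) {
    parquet::ByteArray value(static_cast<uint32_t>(word.size()),
                             reinterpret_cast<const uint8_t*>(word.data()));
    filter.InsertHash(filter.Hash(&value));
  }

  // Step 3: write the serialized filter out to a file.
  auto sink = arrow::io::FileOutputStream::Open(path).ValueOrDie();
  filter.WriteTo(sink.get());
}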
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc
index 62cbee22a18..a51773c44d3 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.cc
@@ -1,900 +1,900 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Overview.
-//
-// The strategy used for this code for repetition/definition
-// is to dissect the top level array into a list of paths
-// from the top level array to the final primitive (possibly
-// dictionary encoded array). It then evaluates each one of
-// those paths to produce results for the callback iteratively.
-//
-// This approach was taken to reduce the aggregate memory required if we were
-// to build all def/rep levels in parallel as part of a tree traversal. It
-// also allows for straightforward parallelization at the path level if that is
-// desired in the future.
-//
-// The main downside to this approach is it duplicates effort for nodes
-// that share common ancestors. This can be mitigated to some degree
-// by adding in optimizations that detect leaf arrays that share
-// the same common list ancestor and reuse the repetition levels
-// from the first leaf encountered (only definition levels greater than
-// the list ancestor need to be re-evaluated). This is left for future
-// work.
-//
-// Algorithm.
-//
-// As mentioned above this code dissects arrays into constituent parts:
-// nullability data, and list offset data. It tries to optimize for
-// some special cases, where it is known ahead of time that a step
-// can be skipped (e.g. a nullable array happens to have all of its
-// values present) or batch filled (a nullable array has all null values).
-// One further optimization that is not implemented but could be done
-// in the future is special handling for nested list arrays that
-// have some intermediate data which indicates the final array contains only
-// nulls.
-//
-// In general, the algorithm attempts to batch work at each node as much
-// as possible. For nullability nodes this means finding runs of null
-// values and batch filling those interspersed with finding runs of non-null values
-// to process in batch at the next column.
-//
-// Similarly, list runs of empty lists are all processed in one batch
-// followed by either:
-// - A single list entry for non-terminal lists (i.e. the upper part of a nested list)
-// - Runs of non-empty lists for the terminal list (i.e. the lowest part of a nested
-// list).
-//
-// This makes use of the following observations.
-// 1. Null values at any node on the path are terminal (repetition and definition
-// level can be set directly when a Null value is encountered).
-// 2. Empty lists share this eager termination property with Null values.
-// 3. In order to keep repetition/definition level populated the algorithm is lazy
-// in assigning repetition levels. The algorithm tracks whether it is currently
-// in the middle of a list by comparing the lengths of repetition/definition levels.
-//    If it is currently in the middle of a list the number of repetition levels
-// populated will be greater than definition levels (the start of a List requires
-// adding the first element). If there are equal numbers of definition and repetition
-// levels populated this indicates a list is waiting to be started and the next list
-// encountered will have its repetition level signify the beginning of the list.
-//
-// Other implementation notes.
-//
-// This code hasn't been benchmarked (or assembly analyzed) but applies the
-// following optimizations (yes, premature optimization is the root of all evil).
-// - This code does not use recursion, instead it constructs its own stack and manages
-// updating elements accordingly.
-// - It tries to avoid using Status for common return states.
-// - Avoids virtual dispatch in favor of if/else statements on a set of well known
-// classes.
-
-#include "parquet/arrow/path_internal.h"
-
-#include <atomic>
-#include <cstddef>
-#include <memory>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/buffer.h"
-#include "arrow/buffer_builder.h"
-#include "arrow/extension_type.h"
-#include "arrow/memory_pool.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_visit.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/macros.h"
-#include "arrow/util/make_unique.h"
-#include "arrow/util/variant.h"
-#include "arrow/visitor_inline.h"
-#include "parquet/properties.h"
-
-namespace parquet {
-namespace arrow {
-
-namespace {
-
-using ::arrow::Array;
-using ::arrow::Status;
-using ::arrow::TypedBufferBuilder;
-
-constexpr static int16_t kLevelNotSet = -1;
-
-/// \brief Simple result of iterating over a column to determine values.
-enum IterationResult {
- /// Processing is done at this node. Move back up the path
- /// to continue processing.
- kDone = -1,
- /// Move down towards the leaf for processing.
- kNext = 1,
- /// An error occurred while processing.
- kError = 2
-};
-
-#define RETURN_IF_ERROR(iteration_result) \
- do { \
- if (ARROW_PREDICT_FALSE(iteration_result == kError)) { \
- return iteration_result; \
- } \
- } while (false)
-
-int64_t LazyNullCount(const Array& array) { return array.data()->null_count.load(); }
-
-bool LazyNoNulls(const Array& array) {
- int64_t null_count = LazyNullCount(array);
- return null_count == 0 ||
-//           The kUnknownNullCount comparison is needed to account
- // for null arrays.
- (null_count == ::arrow::kUnknownNullCount &&
- array.null_bitmap_data() == nullptr);
-}
-
-struct PathWriteContext {
- PathWriteContext(::arrow::MemoryPool* pool,
- std::shared_ptr<::arrow::ResizableBuffer> def_levels_buffer)
- : rep_levels(pool), def_levels(std::move(def_levels_buffer), pool) {}
- IterationResult ReserveDefLevels(int64_t elements) {
- last_status = def_levels.Reserve(elements);
- if (ARROW_PREDICT_TRUE(last_status.ok())) {
- return kDone;
- }
- return kError;
- }
-
- IterationResult AppendDefLevel(int16_t def_level) {
- last_status = def_levels.Append(def_level);
- if (ARROW_PREDICT_TRUE(last_status.ok())) {
- return kDone;
- }
- return kError;
- }
-
- IterationResult AppendDefLevels(int64_t count, int16_t def_level) {
- last_status = def_levels.Append(count, def_level);
- if (ARROW_PREDICT_TRUE(last_status.ok())) {
- return kDone;
- }
- return kError;
- }
-
- void UnsafeAppendDefLevel(int16_t def_level) { def_levels.UnsafeAppend(def_level); }
-
- IterationResult AppendRepLevel(int16_t rep_level) {
- last_status = rep_levels.Append(rep_level);
-
- if (ARROW_PREDICT_TRUE(last_status.ok())) {
- return kDone;
- }
- return kError;
- }
-
- IterationResult AppendRepLevels(int64_t count, int16_t rep_level) {
- last_status = rep_levels.Append(count, rep_level);
- if (ARROW_PREDICT_TRUE(last_status.ok())) {
- return kDone;
- }
- return kError;
- }
-
- bool EqualRepDefLevelsLengths() const {
- return rep_levels.length() == def_levels.length();
- }
-
- // Incorporates |range| into visited elements. If the |range| is contiguous
- // with the last range, extend the last range, otherwise add |range| separately
-  // to the list.
- void RecordPostListVisit(const ElementRange& range) {
- if (!visited_elements.empty() && range.start == visited_elements.back().end) {
- visited_elements.back().end = range.end;
- return;
- }
- visited_elements.push_back(range);
- }
-
- Status last_status;
- TypedBufferBuilder<int16_t> rep_levels;
- TypedBufferBuilder<int16_t> def_levels;
- std::vector<ElementRange> visited_elements;
-};
-
-IterationResult FillRepLevels(int64_t count, int16_t rep_level,
- PathWriteContext* context) {
- if (rep_level == kLevelNotSet) {
- return kDone;
- }
- int64_t fill_count = count;
-  // This condition (equal rep and def level lengths) occurs in one of
-  // a few cases:
- // 1. Before any list is encountered.
- // 2. After rep-level has been filled in due to null/empty
- // values above it.
- // 3. After finishing a list.
- if (!context->EqualRepDefLevelsLengths()) {
- fill_count--;
- }
- return context->AppendRepLevels(fill_count, rep_level);
-}
-
-// A node for handling an array that is discovered to have all
-// null elements. It is referred to as a TerminalNode because
-// traversal of nodes will not continue past it when generating
-// rep/def levels. However, there could be many nested children
-// elements beyond it in the Array that is being processed.
-class AllNullsTerminalNode {
- public:
- explicit AllNullsTerminalNode(int16_t def_level, int16_t rep_level = kLevelNotSet)
- : def_level_(def_level), rep_level_(rep_level) {}
- void SetRepLevelIfNull(int16_t rep_level) { rep_level_ = rep_level; }
- IterationResult Run(const ElementRange& range, PathWriteContext* context) {
- int64_t size = range.Size();
- RETURN_IF_ERROR(FillRepLevels(size, rep_level_, context));
- return context->AppendDefLevels(size, def_level_);
- }
-
- private:
- int16_t def_level_;
- int16_t rep_level_;
-};
-
-// Handles the case where all remaining arrays until the leaf have no nulls
-// (and are not interrupted by lists). Unlike AllNullsTerminalNode this is
-// always the last node in a path. We don't need an analogue to the AllNullsTerminalNode
-// because if all values are present at an intermediate array no node is added for it
-// (the def-level for the next nullable node is incremented).
-struct AllPresentTerminalNode {
- IterationResult Run(const ElementRange& range, PathWriteContext* context) {
- return context->AppendDefLevels(range.end - range.start, def_level);
- // No need to worry about rep levels, because this state should
-    // only be applicable after all list/repeated values
- // have been evaluated in the path.
- }
- int16_t def_level;
-};
-
-/// Node for handling the case when the leaf-array is nullable
-/// and contains null elements.
-struct NullableTerminalNode {
- NullableTerminalNode() = default;
-
- NullableTerminalNode(const uint8_t* bitmap, int64_t element_offset,
- int16_t def_level_if_present)
- : bitmap_(bitmap),
- element_offset_(element_offset),
- def_level_if_present_(def_level_if_present),
- def_level_if_null_(def_level_if_present - 1) {}
-
- IterationResult Run(const ElementRange& range, PathWriteContext* context) {
- int64_t elements = range.Size();
- RETURN_IF_ERROR(context->ReserveDefLevels(elements));
-
- DCHECK_GT(elements, 0);
-
- auto bit_visitor = [&](bool is_set) {
- context->UnsafeAppendDefLevel(is_set ? def_level_if_present_ : def_level_if_null_);
- };
-
- if (elements > 16) { // 16 guarantees at least one unrolled loop.
- ::arrow::internal::VisitBitsUnrolled(bitmap_, range.start + element_offset_,
- elements, bit_visitor);
- } else {
- ::arrow::internal::VisitBits(bitmap_, range.start + element_offset_, elements,
- bit_visitor);
- }
- return kDone;
- }
- const uint8_t* bitmap_;
- int64_t element_offset_;
- int16_t def_level_if_present_;
- int16_t def_level_if_null_;
-};
-
-// List nodes handle populating rep_level for Arrow Lists and def-level for empty lists.
-// Nullability (both list and children) is handled by other Nodes. By
-// construction all list nodes will be intermediate nodes (they will always be followed by
-// at least one other node).
-//
-// Type parameters:
-//   |RangeSelector| - A strategy for determining the range of the child node to
-//   process. This varies depending on the type of list (int32_t* offsets,
-//   int64_t* offsets, or fixed size).
-template <typename RangeSelector>
-class ListPathNode {
- public:
- ListPathNode(RangeSelector selector, int16_t rep_lev, int16_t def_level_if_empty)
- : selector_(std::move(selector)),
- prev_rep_level_(rep_lev - 1),
- rep_level_(rep_lev),
- def_level_if_empty_(def_level_if_empty) {}
-
- int16_t rep_level() const { return rep_level_; }
-
- IterationResult Run(ElementRange* range, ElementRange* child_range,
- PathWriteContext* context) {
- if (range->Empty()) {
- return kDone;
- }
-
- // Find the first non-empty list (skipping a run of empties).
- int64_t start = range->start;
- // Retrieves the range of elements that this list contains.
- // Uses the strategy pattern to distinguish between the different
- // lists that are supported in Arrow (fixed size, normal and "large").
- *child_range = selector_.GetRange(range->start);
- while (child_range->Empty() && !range->Empty()) {
- ++range->start;
- *child_range = selector_.GetRange(range->start);
- }
-    // Loop post-conditions:
-    // * range is either empty (we are done processing at this node)
-    //   or start corresponds to a non-empty list.
-    // * If range is non-empty, child_range contains
-    //   the bounds of a non-empty list.
-
- // Handle any skipped over empty lists.
- int64_t empty_elements = range->start - start;
- if (empty_elements > 0) {
- RETURN_IF_ERROR(FillRepLevels(empty_elements, prev_rep_level_, context));
- RETURN_IF_ERROR(context->AppendDefLevels(empty_elements, def_level_if_empty_));
- }
- // Start of a new list. Note that for nested lists adding the element
- // here effectively suppresses this code until we either encounter null
- // elements or empty lists between here and the innermost list (since
-    // we make the repetition and definition level counts unequal).
- // Similarly when we are backtracking up the stack the repetition and
- // definition levels are again equal so if we encounter an intermediate list
- // with more elements this will detect it as a new list.
- if (context->EqualRepDefLevelsLengths() && !range->Empty()) {
- RETURN_IF_ERROR(context->AppendRepLevel(prev_rep_level_));
- }
-
- if (range->Empty()) {
- return kDone;
- }
-
- ++range->start;
- if (is_last_) {
-      // If this is the last repeated node, we can try to extend
-      // the child range as wide as possible before
- // continuing to the next node.
- return FillForLast(range, child_range, context);
- }
- return kNext;
- }
-
- void SetLast() { is_last_ = true; }
-
- private:
- IterationResult FillForLast(ElementRange* range, ElementRange* child_range,
- PathWriteContext* context) {
-    // First fill in the remainder of the list.
- RETURN_IF_ERROR(FillRepLevels(child_range->Size(), rep_level_, context));
- // Once we've reached this point the following preconditions should hold:
- // 1. There are no more repeated path nodes to deal with.
- // 2. All elements in |range| represent contiguous elements in the
- // child array (Null values would have shortened the range to ensure
- // all remaining list elements are present (though they may be empty lists)).
- // 3. No element of range spans a parent list (intermediate
- // list nodes only handle one list entry at a time).
- //
- // Given these preconditions it should be safe to fill runs on non-empty
- // lists here and expand the range in the child node accordingly.
-
- while (!range->Empty()) {
- ElementRange size_check = selector_.GetRange(range->start);
- if (size_check.Empty()) {
- // The empty range will need to be handled after we pass down the accumulated
- // range because it affects def_level placement and we need to get the children
- // def_levels entered first.
- break;
- }
- // This is the start of a new list. We can be sure it only applies
- // to the previous list (and doesn't jump to the start of any list
- // further up in nesting due to the constraints mentioned at the start
- // of the function).
- RETURN_IF_ERROR(context->AppendRepLevel(prev_rep_level_));
- RETURN_IF_ERROR(context->AppendRepLevels(size_check.Size() - 1, rep_level_));
- DCHECK_EQ(size_check.start, child_range->end);
- child_range->end = size_check.end;
- ++range->start;
- }
-
- // Do book-keeping to track the elements of the arrays that are actually visited
- // beyond this point. This is necessary to identify "gaps" in values that should
- // not be processed (written out to parquet).
- context->RecordPostListVisit(*child_range);
- return kNext;
- }
-
- RangeSelector selector_;
- int16_t prev_rep_level_;
- int16_t rep_level_;
- int16_t def_level_if_empty_;
- bool is_last_ = false;
-};
-
-template <typename OffsetType>
-struct VarRangeSelector {
- ElementRange GetRange(int64_t index) const {
- return ElementRange{offsets[index], offsets[index + 1]};
- }
-
- // Either int32_t* or int64_t*.
- const OffsetType* offsets;
-};
-
-struct FixedSizedRangeSelector {
- ElementRange GetRange(int64_t index) const {
- int64_t start = index * list_size;
- return ElementRange{start, start + list_size};
- }
- int list_size;
-};
-
-// An intermediate node that handles null values.
-class NullableNode {
- public:
- NullableNode(const uint8_t* null_bitmap, int64_t entry_offset,
- int16_t def_level_if_null, int16_t rep_level_if_null = kLevelNotSet)
- : null_bitmap_(null_bitmap),
- entry_offset_(entry_offset),
- valid_bits_reader_(MakeReader(ElementRange{0, 0})),
- def_level_if_null_(def_level_if_null),
- rep_level_if_null_(rep_level_if_null),
- new_range_(true) {}
-
- void SetRepLevelIfNull(int16_t rep_level) { rep_level_if_null_ = rep_level; }
-
- ::arrow::internal::BitRunReader MakeReader(const ElementRange& range) {
- return ::arrow::internal::BitRunReader(null_bitmap_, entry_offset_ + range.start,
- range.Size());
- }
-
- IterationResult Run(ElementRange* range, ElementRange* child_range,
- PathWriteContext* context) {
- if (new_range_) {
- // Reset the reader each time we are starting fresh on a range.
- // We can't rely on continuity because nulls above can
- // cause discontinuities.
- valid_bits_reader_ = MakeReader(*range);
- }
- child_range->start = range->start;
- ::arrow::internal::BitRun run = valid_bits_reader_.NextRun();
- if (!run.set) {
- range->start += run.length;
- RETURN_IF_ERROR(FillRepLevels(run.length, rep_level_if_null_, context));
- RETURN_IF_ERROR(context->AppendDefLevels(run.length, def_level_if_null_));
- run = valid_bits_reader_.NextRun();
- }
- if (range->Empty()) {
- new_range_ = true;
- return kDone;
- }
- child_range->end = child_range->start = range->start;
- child_range->end += run.length;
-
- DCHECK(!child_range->Empty());
- range->start += child_range->Size();
- new_range_ = false;
- return kNext;
- }
-
- const uint8_t* null_bitmap_;
- int64_t entry_offset_;
- ::arrow::internal::BitRunReader valid_bits_reader_;
- int16_t def_level_if_null_;
- int16_t rep_level_if_null_;
-
- // Whether the next invocation will be a new range.
- bool new_range_ = true;
-};
-
-using ListNode = ListPathNode<VarRangeSelector<int32_t>>;
-using LargeListNode = ListPathNode<VarRangeSelector<int64_t>>;
-using FixedSizeListNode = ListPathNode<FixedSizedRangeSelector>;
-
-// Contains static information derived from traversing the schema.
-struct PathInfo {
-  // The vectors are expected to be the same length.
-
- // Note index order matters here.
- using Node = ::arrow::util::Variant<NullableTerminalNode, ListNode, LargeListNode,
- FixedSizeListNode, NullableNode,
- AllPresentTerminalNode, AllNullsTerminalNode>;
-
- std::vector<Node> path;
- std::shared_ptr<Array> primitive_array;
- int16_t max_def_level = 0;
- int16_t max_rep_level = 0;
- bool has_dictionary = false;
- bool leaf_is_nullable = false;
-};
-
-/// Contains logic for writing a single leaf node to parquet.
-/// This tracks the path from root to leaf.
-///
-/// |writer| will be called after all of the definition/repetition
-/// values have been calculated for root_range with the calculated
-/// values. It is intended to abstract the complexity of writing
-/// the levels and values to parquet.
-Status WritePath(ElementRange root_range, PathInfo* path_info,
- ArrowWriteContext* arrow_context,
- MultipathLevelBuilder::CallbackFunction writer) {
- std::vector<ElementRange> stack(path_info->path.size());
- MultipathLevelBuilderResult builder_result;
- builder_result.leaf_array = path_info->primitive_array;
- builder_result.leaf_is_nullable = path_info->leaf_is_nullable;
-
- if (path_info->max_def_level == 0) {
- // This case only occurs when there are no nullable or repeated
- // columns in the path from the root to leaf.
- int64_t leaf_length = builder_result.leaf_array->length();
- builder_result.def_rep_level_count = leaf_length;
- builder_result.post_list_visited_elements.push_back({0, leaf_length});
- return writer(builder_result);
- }
- stack[0] = root_range;
- RETURN_NOT_OK(
- arrow_context->def_levels_buffer->Resize(/*new_size=*/0, /*shrink_to_fit*/ false));
- PathWriteContext context(arrow_context->memory_pool, arrow_context->def_levels_buffer);
- // We should need at least this many entries so reserve the space ahead of time.
- RETURN_NOT_OK(context.def_levels.Reserve(root_range.Size()));
- if (path_info->max_rep_level > 0) {
- RETURN_NOT_OK(context.rep_levels.Reserve(root_range.Size()));
- }
-
- auto stack_base = &stack[0];
- auto stack_position = stack_base;
-  // This is the main loop for calculating rep/def levels. The nodes
- // in the path implement a chain-of-responsibility like pattern
- // where each node can add some number of repetition/definition
- // levels to PathWriteContext and also delegate to the next node
- // in the path to add values. The values are added through each Run(...)
- // call and the choice to delegate to the next node (or return to the
- // previous node) is communicated by the return value of Run(...).
- // The loop terminates after the first node indicates all values in
- // |root_range| are processed.
- while (stack_position >= stack_base) {
- PathInfo::Node& node = path_info->path[stack_position - stack_base];
- struct {
- IterationResult operator()(NullableNode* node) {
- return node->Run(stack_position, stack_position + 1, context);
- }
- IterationResult operator()(ListNode* node) {
- return node->Run(stack_position, stack_position + 1, context);
- }
- IterationResult operator()(NullableTerminalNode* node) {
- return node->Run(*stack_position, context);
- }
- IterationResult operator()(FixedSizeListNode* node) {
- return node->Run(stack_position, stack_position + 1, context);
- }
- IterationResult operator()(AllPresentTerminalNode* node) {
- return node->Run(*stack_position, context);
- }
- IterationResult operator()(AllNullsTerminalNode* node) {
- return node->Run(*stack_position, context);
- }
- IterationResult operator()(LargeListNode* node) {
- return node->Run(stack_position, stack_position + 1, context);
- }
- ElementRange* stack_position;
- PathWriteContext* context;
- } visitor = {stack_position, &context};
-
- IterationResult result = ::arrow::util::visit(visitor, &node);
-
- if (ARROW_PREDICT_FALSE(result == kError)) {
- DCHECK(!context.last_status.ok());
- return context.last_status;
- }
- stack_position += static_cast<int>(result);
- }
- RETURN_NOT_OK(context.last_status);
- builder_result.def_rep_level_count = context.def_levels.length();
-
- if (context.rep_levels.length() > 0) {
- // This case only occurs when there was a repeated element that needs to be
- // processed.
- builder_result.rep_levels = context.rep_levels.data();
- std::swap(builder_result.post_list_visited_elements, context.visited_elements);
-    // It is possible when processing lists that all lists were empty. In this
-    // case no elements would have been added to post_list_visited_elements. By
-    // adding an empty element we avoid special casing in downstream consumers.
- if (builder_result.post_list_visited_elements.empty()) {
- builder_result.post_list_visited_elements.push_back({0, 0});
- }
- } else {
- builder_result.post_list_visited_elements.push_back(
- {0, builder_result.leaf_array->length()});
- builder_result.rep_levels = nullptr;
- }
-
- builder_result.def_levels = context.def_levels.data();
- return writer(builder_result);
-}
-
-struct FixupVisitor {
- int max_rep_level = -1;
- int16_t rep_level_if_null = kLevelNotSet;
-
- template <typename T>
- void HandleListNode(T* arg) {
- if (arg->rep_level() == max_rep_level) {
- arg->SetLast();
- // after the last list node we don't need to fill
- // rep levels on null.
- rep_level_if_null = kLevelNotSet;
- } else {
- rep_level_if_null = arg->rep_level();
- }
- }
- void operator()(ListNode* node) { HandleListNode(node); }
- void operator()(LargeListNode* node) { HandleListNode(node); }
- void operator()(FixedSizeListNode* node) { HandleListNode(node); }
-
- // For non-list intermediate nodes.
- template <typename T>
- void HandleIntermediateNode(T* arg) {
- if (rep_level_if_null != kLevelNotSet) {
- arg->SetRepLevelIfNull(rep_level_if_null);
- }
- }
-
- void operator()(NullableNode* arg) { HandleIntermediateNode(arg); }
-
- void operator()(AllNullsTerminalNode* arg) {
- // Even though no processing happens past this point we
- // still need to adjust it if a list occurred after an
- // all null array.
- HandleIntermediateNode(arg);
- }
-
- void operator()(NullableTerminalNode*) {}
- void operator()(AllPresentTerminalNode*) {}
-};
-
-PathInfo Fixup(PathInfo info) {
- // We only need to fixup the path if there were repeated
- // elements on it.
- if (info.max_rep_level == 0) {
- return info;
- }
- FixupVisitor visitor;
- visitor.max_rep_level = info.max_rep_level;
- if (visitor.max_rep_level > 0) {
- visitor.rep_level_if_null = 0;
- }
- for (size_t x = 0; x < info.path.size(); x++) {
- ::arrow::util::visit(visitor, &info.path[x]);
- }
- return info;
-}
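-
-// For example, with two nested list nodes on a path, only the inner one
-// (rep_level() == max_rep_level) is marked with SetLast(); nullable nodes
-// between the two lists are told, via SetRepLevelIfNull, to fill repetition
-// levels with the outer list's rep level when they hit null runs.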
-
-class PathBuilder {
- public:
- explicit PathBuilder(bool start_nullable) : nullable_in_parent_(start_nullable) {}
- template <typename T>
- void AddTerminalInfo(const T& array) {
- info_.leaf_is_nullable = nullable_in_parent_;
- if (nullable_in_parent_) {
- info_.max_def_level++;
- }
- // We don't use null_count() because if the null_count isn't known
- // and the array does in fact contain nulls, we will end up
- // traversing the null bitmap twice (once here and once when calculating
- // rep/def levels).
- if (LazyNoNulls(array)) {
- info_.path.emplace_back(AllPresentTerminalNode{info_.max_def_level});
- } else if (LazyNullCount(array) == array.length()) {
- info_.path.emplace_back(AllNullsTerminalNode(info_.max_def_level - 1));
- } else {
- info_.path.emplace_back(NullableTerminalNode(array.null_bitmap_data(),
- array.offset(), info_.max_def_level));
- }
- info_.primitive_array = std::make_shared<T>(array.data());
- paths_.push_back(Fixup(info_));
- }
-
- template <typename T>
- ::arrow::enable_if_t<std::is_base_of<::arrow::FlatArray, T>::value, Status> Visit(
- const T& array) {
- AddTerminalInfo(array);
- return Status::OK();
- }
-
- template <typename T>
- ::arrow::enable_if_t<std::is_same<::arrow::ListArray, T>::value ||
- std::is_same<::arrow::LargeListArray, T>::value,
- Status>
- Visit(const T& array) {
- MaybeAddNullable(array);
- // Increment necessary due to empty lists.
- info_.max_def_level++;
- info_.max_rep_level++;
- // raw_value_offsets() accounts for any slice offset.
- ListPathNode<VarRangeSelector<typename T::offset_type>> node(
- VarRangeSelector<typename T::offset_type>{array.raw_value_offsets()},
- info_.max_rep_level, info_.max_def_level - 1);
- info_.path.emplace_back(std::move(node));
- nullable_in_parent_ = array.list_type()->value_field()->nullable();
- return VisitInline(*array.values());
- }
-
- Status Visit(const ::arrow::DictionaryArray& array) {
- // Only currently handle DictionaryArray where the dictionary is a
- // primitive type
- if (array.dict_type()->value_type()->num_fields() > 0) {
- return Status::NotImplemented(
- "Writing DictionaryArray with nested dictionary "
- "type not yet supported");
- }
- if (array.dictionary()->null_count() > 0) {
- return Status::NotImplemented(
- "Writing DictionaryArray with null encoded in dictionary "
- "type not yet supported");
- }
- AddTerminalInfo(array);
- return Status::OK();
- }
-
- void MaybeAddNullable(const Array& array) {
- if (!nullable_in_parent_) {
- return;
- }
- info_.max_def_level++;
- // We don't use null_count() because if the null_count isn't known
- // and the array does in fact contain nulls, we will end up
- // traversing the null bitmap twice (once here and once when calculating
- // rep/def levels). Because this isn't terminal this might not be
- // the right decision for structs that share the same nullable
- // parents.
- if (LazyNoNulls(array)) {
- // Don't add anything because there won't be any point checking
- // null values for the array. There will always be at least
- // one more array to handle nullability.
- return;
- }
- if (LazyNullCount(array) == array.length()) {
- info_.path.emplace_back(AllNullsTerminalNode(info_.max_def_level - 1));
- return;
- }
- info_.path.emplace_back(
- NullableNode(array.null_bitmap_data(), array.offset(),
- /* def_level_if_null = */ info_.max_def_level - 1));
- }
-
- Status VisitInline(const Array& array);
-
- Status Visit(const ::arrow::MapArray& array) {
- return Visit(static_cast<const ::arrow::ListArray&>(array));
- }
-
- Status Visit(const ::arrow::StructArray& array) {
- MaybeAddNullable(array);
- PathInfo info_backup = info_;
- for (int x = 0; x < array.num_fields(); x++) {
- nullable_in_parent_ = array.type()->field(x)->nullable();
- RETURN_NOT_OK(VisitInline(*array.field(x)));
- info_ = info_backup;
- }
- return Status::OK();
- }
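-
-  // For example, a struct<a: int32, b: list<int32>> column produces two
-  // entries in paths_: both share the struct's prefix nodes (info_ is
-  // restored from info_backup between children), one path ending at a's
-  // terminal node and the other passing through b's list node.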
-
- Status Visit(const ::arrow::FixedSizeListArray& array) {
- MaybeAddNullable(array);
- int32_t list_size = array.list_type()->list_size();
- // Technically we could encode fixed size lists with two level encodings
- // but since we always use 3 level encoding we increment def levels as
- // well.
- info_.max_def_level++;
- info_.max_rep_level++;
- info_.path.emplace_back(FixedSizeListNode(FixedSizedRangeSelector{list_size},
- info_.max_rep_level, info_.max_def_level));
- nullable_in_parent_ = array.list_type()->value_field()->nullable();
- if (array.offset() > 0) {
- return VisitInline(*array.values()->Slice(array.value_offset(0)));
- }
- return VisitInline(*array.values());
- }
-
- Status Visit(const ::arrow::ExtensionArray& array) {
- return VisitInline(*array.storage());
- }
-
-#define NOT_IMPLEMENTED_VISIT(ArrowTypePrefix) \
- Status Visit(const ::arrow::ArrowTypePrefix##Array& array) { \
- return Status::NotImplemented("Level generation for " #ArrowTypePrefix \
- " not supported yet"); \
- }
-
- // Union types aren't supported in Parquet.
- NOT_IMPLEMENTED_VISIT(Union)
-
-#undef NOT_IMPLEMENTED_VISIT
- std::vector<PathInfo>& paths() { return paths_; }
-
- private:
- PathInfo info_;
- std::vector<PathInfo> paths_;
- bool nullable_in_parent_;
-};
-
-Status PathBuilder::VisitInline(const Array& array) {
- return ::arrow::VisitArrayInline(array, this);
-}
-
-#undef RETURN_IF_ERROR
-} // namespace
-
-class MultipathLevelBuilderImpl : public MultipathLevelBuilder {
- public:
- MultipathLevelBuilderImpl(std::shared_ptr<::arrow::ArrayData> data,
- std::unique_ptr<PathBuilder> path_builder)
- : root_range_{0, data->length},
- data_(std::move(data)),
- path_builder_(std::move(path_builder)) {}
-
- int GetLeafCount() const override {
- return static_cast<int>(path_builder_->paths().size());
- }
-
- ::arrow::Status Write(int leaf_index, ArrowWriteContext* context,
- CallbackFunction write_leaf_callback) override {
- DCHECK_GE(leaf_index, 0);
- DCHECK_LT(leaf_index, GetLeafCount());
- return WritePath(root_range_, &path_builder_->paths()[leaf_index], context,
- std::move(write_leaf_callback));
- }
-
- private:
- ElementRange root_range_;
- // Reference holder to ensure the data stays valid.
- std::shared_ptr<::arrow::ArrayData> data_;
- std::unique_ptr<PathBuilder> path_builder_;
-};
-
-// static
-::arrow::Result<std::unique_ptr<MultipathLevelBuilder>> MultipathLevelBuilder::Make(
- const ::arrow::Array& array, bool array_field_nullable) {
- auto constructor = ::arrow::internal::make_unique<PathBuilder>(array_field_nullable);
- RETURN_NOT_OK(VisitArrayInline(array, constructor.get()));
- return ::arrow::internal::make_unique<MultipathLevelBuilderImpl>(
- array.data(), std::move(constructor));
-}
-
-// static
-Status MultipathLevelBuilder::Write(const Array& array, bool array_field_nullable,
- ArrowWriteContext* context,
- MultipathLevelBuilder::CallbackFunction callback) {
- ARROW_ASSIGN_OR_RAISE(std::unique_ptr<MultipathLevelBuilder> builder,
- MultipathLevelBuilder::Make(array, array_field_nullable));
- PathBuilder constructor(array_field_nullable);
- RETURN_NOT_OK(VisitArrayInline(array, &constructor));
- for (int leaf_idx = 0; leaf_idx < builder->GetLeafCount(); leaf_idx++) {
- RETURN_NOT_OK(builder->Write(leaf_idx, context, callback));
- }
- return Status::OK();
-}
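-
-// A minimal caller sketch (assuming CallbackFunction is
-// std::function<Status(const MultipathLevelBuilderResult&)>, as declared in
-// path_internal.h):
-//
-//   RETURN_NOT_OK(MultipathLevelBuilder::Write(
-//       *array, /*array_field_nullable=*/true, &arrow_write_context,
-//       [](const MultipathLevelBuilderResult& result) {
-//         // result.def_levels / result.rep_levels hold the computed levels
-//         // for result.leaf_array; a writer would emit them here.
-//         return Status::OK();
-//       }));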
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Overview.
+//
+// The strategy used for this code for repetition/definition
+// is to dissect the top level array into a list of paths
+// from the top level array to the final primitive (possibly
+// dictionary encoded array). It then evaluates each one of
+// those paths to produce results for the callback iteratively.
+//
+// This approach was taken to reduce the aggregate memory required if we were
+// to build all def/rep levels in parallel as part of a tree traversal. It
+// also allows for straightforward parallelization at the path level if that is
+// desired in the future.
+//
+// The main downside to this approach is it duplicates effort for nodes
+// that share common ancestors. This can be mitigated to some degree
+// by adding in optimizations that detect leaf arrays that share
+// the same common list ancestor and reuse the repetition levels
+// from the first leaf encountered (only definition levels greater than
+// the list ancestor need to be re-evaluated). This is left for future
+// work.
+//
+// Algorithm.
+//
+// As mentioned above this code dissects arrays into constituent parts:
+// nullability data, and list offset data. It tries to optimize for
+// some special cases, where it is known ahead of time that a step
+// can be skipped (e.g. a nullable array happens to have all of its
+// values present) or batch filled (a nullable array has all null values).
+// One further optimization that is not implemented but could be done
+// in the future is special handling for nested list arrays that
+// have some intermediate data which indicates the final array contains only
+// nulls.
+//
+// In general, the algorithm attempts to batch work at each node as much
+// as possible. For nullability nodes this means finding runs of null
+// values and batch filling those interspersed with finding runs of non-null values
+// to process in batch at the next column.
+//
+// Similarly, list runs of empty lists are all processed in one batch
+// followed by either:
+// - A single list entry for non-terminal lists (i.e. the upper part of a nested list)
+// - Runs of non-empty lists for the terminal list (i.e. the lowest part of a nested
+// list).
+//
+// This makes use of the following observations.
+// 1. Null values at any node on the path are terminal (repetition and definition
+// level can be set directly when a Null value is encountered).
+// 2. Empty lists share this eager termination property with Null values.
+// 3. In order to keep repetition/definition level populated the algorithm is lazy
+// in assigning repetition levels. The algorithm tracks whether it is currently
+// in the middle of a list by comparing the lengths of repetition/definition levels.
+//    If it is currently in the middle of a list the number of repetition levels
+// populated will be greater than definition levels (the start of a List requires
+// adding the first element). If there are equal numbers of definition and repetition
+// levels populated this indicates a list is waiting to be started and the next list
+// encountered will have its repetition level signify the beginning of the list.
+//
+// Other implementation notes.
+//
+// This code hasn't been benchmarked (or assembly analyzed) but applies the
+// following optimizations (yes, premature optimization is the root of all evil).
+// - This code does not use recursion, instead it constructs its own stack and manages
+// updating elements accordingly.
+// - It tries to avoid using Status for common return states.
+// - Avoids virtual dispatch in favor of if/else statements on a set of well known
+// classes.
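+//
+// As a concrete illustration of observation 3: when a new list is started,
+// AppendRepLevel() leaves rep_levels one entry longer than def_levels; once
+// the def level for the list's first element is appended the lengths match
+// again, so the next non-empty list encountered is treated as a new list
+// rather than a continuation of the current one.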
+
+#include "parquet/arrow/path_internal.h"
+
+#include <atomic>
+#include <cstddef>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/extension_type.h"
+#include "arrow/memory_pool.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_visit.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/variant.h"
+#include "arrow/visitor_inline.h"
+#include "parquet/properties.h"
+
+namespace parquet {
+namespace arrow {
+
+namespace {
+
+using ::arrow::Array;
+using ::arrow::Status;
+using ::arrow::TypedBufferBuilder;
+
+constexpr static int16_t kLevelNotSet = -1;
+
+/// \brief Simple result of iterating over a column to determine values.
+enum IterationResult {
+ /// Processing is done at this node. Move back up the path
+ /// to continue processing.
+ kDone = -1,
+ /// Move down towards the leaf for processing.
+ kNext = 1,
+ /// An error occurred while processing.
+ kError = 2
+};
+
+#define RETURN_IF_ERROR(iteration_result) \
+ do { \
+ if (ARROW_PREDICT_FALSE(iteration_result == kError)) { \
+ return iteration_result; \
+ } \
+ } while (false)
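+
+// Note: kDone (-1) and kNext (+1) double as stack deltas in WritePath's main
+// loop (stack_position += static_cast<int>(result)), moving processing one
+// node up or down the path; kError is checked separately before the move.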
+
+int64_t LazyNullCount(const Array& array) { return array.data()->null_count.load(); }
+
+bool LazyNoNulls(const Array& array) {
+ int64_t null_count = LazyNullCount(array);
+ return null_count == 0 ||
+//           The kUnknownNullCount comparison is needed to account
+ // for null arrays.
+ (null_count == ::arrow::kUnknownNullCount &&
+ array.null_bitmap_data() == nullptr);
+}
+
+struct PathWriteContext {
+ PathWriteContext(::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::ResizableBuffer> def_levels_buffer)
+ : rep_levels(pool), def_levels(std::move(def_levels_buffer), pool) {}
+ IterationResult ReserveDefLevels(int64_t elements) {
+ last_status = def_levels.Reserve(elements);
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ IterationResult AppendDefLevel(int16_t def_level) {
+ last_status = def_levels.Append(def_level);
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ IterationResult AppendDefLevels(int64_t count, int16_t def_level) {
+ last_status = def_levels.Append(count, def_level);
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ void UnsafeAppendDefLevel(int16_t def_level) { def_levels.UnsafeAppend(def_level); }
+
+ IterationResult AppendRepLevel(int16_t rep_level) {
+ last_status = rep_levels.Append(rep_level);
+
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ IterationResult AppendRepLevels(int64_t count, int16_t rep_level) {
+ last_status = rep_levels.Append(count, rep_level);
+ if (ARROW_PREDICT_TRUE(last_status.ok())) {
+ return kDone;
+ }
+ return kError;
+ }
+
+ bool EqualRepDefLevelsLengths() const {
+ return rep_levels.length() == def_levels.length();
+ }
+
+ // Incorporates |range| into visited elements. If the |range| is contiguous
+ // with the last range, extend the last range, otherwise add |range| separately
+  // to the list.
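+  // For example, {0,3} followed by {3,5} collapses into a single {0,5}
+  // entry, while {0,3} followed by {4,5} is recorded as two separate ranges.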
+ void RecordPostListVisit(const ElementRange& range) {
+ if (!visited_elements.empty() && range.start == visited_elements.back().end) {
+ visited_elements.back().end = range.end;
+ return;
+ }
+ visited_elements.push_back(range);
+ }
+
+ Status last_status;
+ TypedBufferBuilder<int16_t> rep_levels;
+ TypedBufferBuilder<int16_t> def_levels;
+ std::vector<ElementRange> visited_elements;
+};
+
+IterationResult FillRepLevels(int64_t count, int16_t rep_level,
+ PathWriteContext* context) {
+ if (rep_level == kLevelNotSet) {
+ return kDone;
+ }
+ int64_t fill_count = count;
+  // This condition (equal rep and def level lengths) occurs in one of
+  // a few cases:
+ // 1. Before any list is encountered.
+ // 2. After rep-level has been filled in due to null/empty
+ // values above it.
+ // 3. After finishing a list.
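+  // For example, if a list has just been started (rep_levels is one entry
+  // longer than def_levels), a request to fill 4 rep levels appends only 3,
+  // because the list's opening element already received its rep level.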
+ if (!context->EqualRepDefLevelsLengths()) {
+ fill_count--;
+ }
+ return context->AppendRepLevels(fill_count, rep_level);
+}
+
+// A node for handling an array that is discovered to have all
+// null elements. It is referred to as a TerminalNode because
+// traversal of nodes will not continue past it when generating
+// rep/def levels. However, there could be many nested children
+// elements beyond it in the Array that is being processed.
+class AllNullsTerminalNode {
+ public:
+ explicit AllNullsTerminalNode(int16_t def_level, int16_t rep_level = kLevelNotSet)
+ : def_level_(def_level), rep_level_(rep_level) {}
+ void SetRepLevelIfNull(int16_t rep_level) { rep_level_ = rep_level; }
+ IterationResult Run(const ElementRange& range, PathWriteContext* context) {
+ int64_t size = range.Size();
+ RETURN_IF_ERROR(FillRepLevels(size, rep_level_, context));
+ return context->AppendDefLevels(size, def_level_);
+ }
+
+ private:
+ int16_t def_level_;
+ int16_t rep_level_;
+};
+
+// Handles the case where all remaining arrays until the leaf have no nulls
+// (and are not interrupted by lists). Unlike AllNullsTerminalNode this is
+// always the last node in a path. We don't need an analogue to the AllNullsTerminalNode
+// because if all values are present at an intermediate array no node is added for it
+// (the def-level for the next nullable node is incremented).
+struct AllPresentTerminalNode {
+ IterationResult Run(const ElementRange& range, PathWriteContext* context) {
+ return context->AppendDefLevels(range.end - range.start, def_level);
+ // No need to worry about rep levels, because this state should
+    // only be applicable after all list/repeated values
+ // have been evaluated in the path.
+ }
+ int16_t def_level;
+};
+
+/// Node for handling the case when the leaf-array is nullable
+/// and contains null elements.
+struct NullableTerminalNode {
+ NullableTerminalNode() = default;
+
+ NullableTerminalNode(const uint8_t* bitmap, int64_t element_offset,
+ int16_t def_level_if_present)
+ : bitmap_(bitmap),
+ element_offset_(element_offset),
+ def_level_if_present_(def_level_if_present),
+ def_level_if_null_(def_level_if_present - 1) {}
+
+ IterationResult Run(const ElementRange& range, PathWriteContext* context) {
+ int64_t elements = range.Size();
+ RETURN_IF_ERROR(context->ReserveDefLevels(elements));
+
+ DCHECK_GT(elements, 0);
+
+ auto bit_visitor = [&](bool is_set) {
+ context->UnsafeAppendDefLevel(is_set ? def_level_if_present_ : def_level_if_null_);
+ };
+
+ if (elements > 16) { // 16 guarantees at least one unrolled loop.
+ ::arrow::internal::VisitBitsUnrolled(bitmap_, range.start + element_offset_,
+ elements, bit_visitor);
+ } else {
+ ::arrow::internal::VisitBits(bitmap_, range.start + element_offset_, elements,
+ bit_visitor);
+ }
+ return kDone;
+ }
+ const uint8_t* bitmap_;
+ int64_t element_offset_;
+ int16_t def_level_if_present_;
+ int16_t def_level_if_null_;
+};
+
+// List nodes handle populating rep_level for Arrow Lists and def-level for empty lists.
+// Nullability (both list and children) is handled by other Nodes. By
+// construction all list nodes will be intermediate nodes (they will always be followed by
+// at least one other node).
+//
+// Type parameters:
+//   |RangeSelector| - A strategy for determining the range of the child node to
+//   process. This varies depending on the type of list (int32_t* offsets,
+//   int64_t* offsets, or fixed size).
+template <typename RangeSelector>
+class ListPathNode {
+ public:
+ ListPathNode(RangeSelector selector, int16_t rep_lev, int16_t def_level_if_empty)
+ : selector_(std::move(selector)),
+ prev_rep_level_(rep_lev - 1),
+ rep_level_(rep_lev),
+ def_level_if_empty_(def_level_if_empty) {}
+
+ int16_t rep_level() const { return rep_level_; }
+
+ IterationResult Run(ElementRange* range, ElementRange* child_range,
+ PathWriteContext* context) {
+ if (range->Empty()) {
+ return kDone;
+ }
+
+ // Find the first non-empty list (skipping a run of empties).
+ int64_t start = range->start;
+ // Retrieves the range of elements that this list contains.
+ // Uses the strategy pattern to distinguish between the different
+ // lists that are supported in Arrow (fixed size, normal and "large").
+ *child_range = selector_.GetRange(range->start);
+ while (child_range->Empty() && !range->Empty()) {
+ ++range->start;
+ *child_range = selector_.GetRange(range->start);
+ }
+    // Loop post-conditions:
+    // * range is either empty (we are done processing at this node)
+    //   or start corresponds to a non-empty list.
+    // * If range is non-empty, child_range contains
+    //   the bounds of a non-empty list.
+
+ // Handle any skipped over empty lists.
+ int64_t empty_elements = range->start - start;
+ if (empty_elements > 0) {
+ RETURN_IF_ERROR(FillRepLevels(empty_elements, prev_rep_level_, context));
+ RETURN_IF_ERROR(context->AppendDefLevels(empty_elements, def_level_if_empty_));
+ }
+ // Start of a new list. Note that for nested lists adding the element
+ // here effectively suppresses this code until we either encounter null
+ // elements or empty lists between here and the innermost list (since
+    // we make the repetition and definition level counts unequal).
+ // Similarly when we are backtracking up the stack the repetition and
+ // definition levels are again equal so if we encounter an intermediate list
+ // with more elements this will detect it as a new list.
+ if (context->EqualRepDefLevelsLengths() && !range->Empty()) {
+ RETURN_IF_ERROR(context->AppendRepLevel(prev_rep_level_));
+ }
+
+ if (range->Empty()) {
+ return kDone;
+ }
+
+ ++range->start;
+ if (is_last_) {
+      // If this is the last repeated node, we can try to extend
+      // the child range as wide as possible before
+ // continuing to the next node.
+ return FillForLast(range, child_range, context);
+ }
+ return kNext;
+ }
+
+ void SetLast() { is_last_ = true; }
+
+ private:
+ IterationResult FillForLast(ElementRange* range, ElementRange* child_range,
+ PathWriteContext* context) {
+    // First fill in the remainder of the list.
+ RETURN_IF_ERROR(FillRepLevels(child_range->Size(), rep_level_, context));
+ // Once we've reached this point the following preconditions should hold:
+ // 1. There are no more repeated path nodes to deal with.
+ // 2. All elements in |range| represent contiguous elements in the
+ // child array (Null values would have shortened the range to ensure
+ // all remaining list elements are present (though they may be empty lists)).
+ // 3. No element of range spans a parent list (intermediate
+ // list nodes only handle one list entry at a time).
+ //
+ // Given these preconditions it should be safe to fill runs on non-empty
+ // lists here and expand the range in the child node accordingly.
+
+ while (!range->Empty()) {
+ ElementRange size_check = selector_.GetRange(range->start);
+ if (size_check.Empty()) {
+ // The empty range will need to be handled after we pass down the accumulated
+ // range because it affects def_level placement and we need to get the children
+ // def_levels entered first.
+ break;
+ }
+ // This is the start of a new list. We can be sure it only applies
+ // to the previous list (and doesn't jump to the start of any list
+ // further up in nesting due to the constraints mentioned at the start
+ // of the function).
+ RETURN_IF_ERROR(context->AppendRepLevel(prev_rep_level_));
+ RETURN_IF_ERROR(context->AppendRepLevels(size_check.Size() - 1, rep_level_));
+ DCHECK_EQ(size_check.start, child_range->end);
+ child_range->end = size_check.end;
+ ++range->start;
+ }
+
+ // Do book-keeping to track the elements of the arrays that are actually visited
+ // beyond this point. This is necessary to identify "gaps" in values that should
+ // not be processed (written out to parquet).
+ context->RecordPostListVisit(*child_range);
+ return kNext;
+ }
+
+ RangeSelector selector_;
+ int16_t prev_rep_level_;
+ int16_t rep_level_;
+ int16_t def_level_if_empty_;
+ bool is_last_ = false;
+};
+
+template <typename OffsetType>
+struct VarRangeSelector {
+ ElementRange GetRange(int64_t index) const {
+ return ElementRange{offsets[index], offsets[index + 1]};
+ }
+
+ // Either int32_t* or int64_t*.
+ const OffsetType* offsets;
+};
+
+struct FixedSizedRangeSelector {
+ ElementRange GetRange(int64_t index) const {
+ int64_t start = index * list_size;
+ return ElementRange{start, start + list_size};
+ }
+ int list_size;
+};
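+
+// For example, with offsets = {0, 2, 2, 5} a VarRangeSelector yields
+// GetRange(0) == {0, 2}, GetRange(1) == {2, 2} (an empty list), and
+// GetRange(2) == {2, 5}; a FixedSizedRangeSelector with list_size == 3
+// always yields ranges of size 3, e.g. GetRange(1) == {3, 6}.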
+
+// An intermediate node that handles null values.
+class NullableNode {
+ public:
+ NullableNode(const uint8_t* null_bitmap, int64_t entry_offset,
+ int16_t def_level_if_null, int16_t rep_level_if_null = kLevelNotSet)
+ : null_bitmap_(null_bitmap),
+ entry_offset_(entry_offset),
+ valid_bits_reader_(MakeReader(ElementRange{0, 0})),
+ def_level_if_null_(def_level_if_null),
+ rep_level_if_null_(rep_level_if_null),
+ new_range_(true) {}
+
+ void SetRepLevelIfNull(int16_t rep_level) { rep_level_if_null_ = rep_level; }
+
+ ::arrow::internal::BitRunReader MakeReader(const ElementRange& range) {
+ return ::arrow::internal::BitRunReader(null_bitmap_, entry_offset_ + range.start,
+ range.Size());
+ }
+
+ IterationResult Run(ElementRange* range, ElementRange* child_range,
+ PathWriteContext* context) {
+ if (new_range_) {
+ // Reset the reader each time we are starting fresh on a range.
+ // We can't rely on continuity because nulls above can
+ // cause discontinuities.
+ valid_bits_reader_ = MakeReader(*range);
+ }
+ child_range->start = range->start;
+ ::arrow::internal::BitRun run = valid_bits_reader_.NextRun();
+ if (!run.set) {
+ range->start += run.length;
+ RETURN_IF_ERROR(FillRepLevels(run.length, rep_level_if_null_, context));
+ RETURN_IF_ERROR(context->AppendDefLevels(run.length, def_level_if_null_));
+ run = valid_bits_reader_.NextRun();
+ }
+ if (range->Empty()) {
+ new_range_ = true;
+ return kDone;
+ }
+ child_range->end = child_range->start = range->start;
+ child_range->end += run.length;
+
+ DCHECK(!child_range->Empty());
+ range->start += child_range->Size();
+ new_range_ = false;
+ return kNext;
+ }
+
+ const uint8_t* null_bitmap_;
+ int64_t entry_offset_;
+ ::arrow::internal::BitRunReader valid_bits_reader_;
+ int16_t def_level_if_null_;
+ int16_t rep_level_if_null_;
+
+ // Whether the next invocation will be a new range.
+ bool new_range_ = true;
+};
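+
+// Illustrative walk-through of NullableNode::Run: given validity bits
+// {0, 0, 1, 1, 1}, the first call appends two null def levels (plus rep
+// levels when rep_level_if_null_ is set), hands the three valid entries to
+// the child via |child_range| and returns kNext; the following call finds
+// the range exhausted and returns kDone.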
+
+using ListNode = ListPathNode<VarRangeSelector<int32_t>>;
+using LargeListNode = ListPathNode<VarRangeSelector<int64_t>>;
+using FixedSizeListNode = ListPathNode<FixedSizedRangeSelector>;
+
+// Contains static information derived from traversing the schema.
+struct PathInfo {
+ // The vectors are expected to be the same length.
+
+ // Note index order matters here.
+ using Node = ::arrow::util::Variant<NullableTerminalNode, ListNode, LargeListNode,
+ FixedSizeListNode, NullableNode,
+ AllPresentTerminalNode, AllNullsTerminalNode>;
+
+ std::vector<Node> path;
+ std::shared_ptr<Array> primitive_array;
+ int16_t max_def_level = 0;
+ int16_t max_rep_level = 0;
+ bool has_dictionary = false;
+ bool leaf_is_nullable = false;
+};
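+
+// Illustrative example: for a nullable list<nullable int32> column whose
+// arrays contain a mix of nulls and values, |path| holds
+// {NullableNode, ListNode, NullableTerminalNode} with max_rep_level == 1 and
+// max_def_level == 3 (0 = null list, 1 = empty list, 2 = null element,
+// 3 = present element).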
+
+/// Contains logic for writing a single leaf node to parquet.
+/// This tracks the path from root to leaf.
+///
+/// |writer| will be called with the calculated definition/repetition
+/// levels once they have all been computed for root_range. It is
+/// intended to abstract away the complexity of writing the levels
+/// and values to parquet.
+Status WritePath(ElementRange root_range, PathInfo* path_info,
+ ArrowWriteContext* arrow_context,
+ MultipathLevelBuilder::CallbackFunction writer) {
+ std::vector<ElementRange> stack(path_info->path.size());
+ MultipathLevelBuilderResult builder_result;
+ builder_result.leaf_array = path_info->primitive_array;
+ builder_result.leaf_is_nullable = path_info->leaf_is_nullable;
+
+ if (path_info->max_def_level == 0) {
+ // This case only occurs when there are no nullable or repeated
+ // columns in the path from the root to leaf.
+ int64_t leaf_length = builder_result.leaf_array->length();
+ builder_result.def_rep_level_count = leaf_length;
+ builder_result.post_list_visited_elements.push_back({0, leaf_length});
+ return writer(builder_result);
+ }
+ stack[0] = root_range;
+ RETURN_NOT_OK(
+ arrow_context->def_levels_buffer->Resize(/*new_size=*/0, /*shrink_to_fit*/ false));
+ PathWriteContext context(arrow_context->memory_pool, arrow_context->def_levels_buffer);
+ // We will need at least this many entries, so reserve the space ahead of time.
+ RETURN_NOT_OK(context.def_levels.Reserve(root_range.Size()));
+ if (path_info->max_rep_level > 0) {
+ RETURN_NOT_OK(context.rep_levels.Reserve(root_range.Size()));
+ }
+
+ auto stack_base = &stack[0];
+ auto stack_position = stack_base;
+ // This is the main loop for calculating rep/def levels. The nodes
+ // in the path implement a chain-of-responsibility like pattern
+ // where each node can add some number of repetition/definition
+ // levels to PathWriteContext and also delegate to the next node
+ // in the path to add values. The values are added through each Run(...)
+ // call and the choice to delegate to the next node (or return to the
+ // previous node) is communicated by the return value of Run(...).
+ // The loop terminates after the first node indicates all values in
+ // |root_range| are processed.
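+ // Since IterationResult doubles as a signed stack offset below, kNext
+ // descends one node toward the leaf and kDone pops back to the parent;
+ // kError aborts the loop before the offset is applied.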
+ while (stack_position >= stack_base) {
+ PathInfo::Node& node = path_info->path[stack_position - stack_base];
+ struct {
+ IterationResult operator()(NullableNode* node) {
+ return node->Run(stack_position, stack_position + 1, context);
+ }
+ IterationResult operator()(ListNode* node) {
+ return node->Run(stack_position, stack_position + 1, context);
+ }
+ IterationResult operator()(NullableTerminalNode* node) {
+ return node->Run(*stack_position, context);
+ }
+ IterationResult operator()(FixedSizeListNode* node) {
+ return node->Run(stack_position, stack_position + 1, context);
+ }
+ IterationResult operator()(AllPresentTerminalNode* node) {
+ return node->Run(*stack_position, context);
+ }
+ IterationResult operator()(AllNullsTerminalNode* node) {
+ return node->Run(*stack_position, context);
+ }
+ IterationResult operator()(LargeListNode* node) {
+ return node->Run(stack_position, stack_position + 1, context);
+ }
+ ElementRange* stack_position;
+ PathWriteContext* context;
+ } visitor = {stack_position, &context};
+
+ IterationResult result = ::arrow::util::visit(visitor, &node);
+
+ if (ARROW_PREDICT_FALSE(result == kError)) {
+ DCHECK(!context.last_status.ok());
+ return context.last_status;
+ }
+ stack_position += static_cast<int>(result);
+ }
+ RETURN_NOT_OK(context.last_status);
+ builder_result.def_rep_level_count = context.def_levels.length();
+
+ if (context.rep_levels.length() > 0) {
+ // This case only occurs when there was a repeated element that needs to be
+ // processed.
+ builder_result.rep_levels = context.rep_levels.data();
+ std::swap(builder_result.post_list_visited_elements, context.visited_elements);
+ // It is possible when processing lists that all lists were empty. In that
+ // case no elements would have been added to post_list_visited_elements. By
+ // adding an empty element we avoid special-casing in downstream consumers.
+ if (builder_result.post_list_visited_elements.empty()) {
+ builder_result.post_list_visited_elements.push_back({0, 0});
+ }
+ } else {
+ builder_result.post_list_visited_elements.push_back(
+ {0, builder_result.leaf_array->length()});
+ builder_result.rep_levels = nullptr;
+ }
+
+ builder_result.def_levels = context.def_levels.data();
+ return writer(builder_result);
+}
+
+struct FixupVisitor {
+ int max_rep_level = -1;
+ int16_t rep_level_if_null = kLevelNotSet;
+
+ template <typename T>
+ void HandleListNode(T* arg) {
+ if (arg->rep_level() == max_rep_level) {
+ arg->SetLast();
+ // after the last list node we don't need to fill
+ // rep levels on null.
+ rep_level_if_null = kLevelNotSet;
+ } else {
+ rep_level_if_null = arg->rep_level();
+ }
+ }
+ void operator()(ListNode* node) { HandleListNode(node); }
+ void operator()(LargeListNode* node) { HandleListNode(node); }
+ void operator()(FixedSizeListNode* node) { HandleListNode(node); }
+
+ // For non-list intermediate nodes.
+ template <typename T>
+ void HandleIntermediateNode(T* arg) {
+ if (rep_level_if_null != kLevelNotSet) {
+ arg->SetRepLevelIfNull(rep_level_if_null);
+ }
+ }
+
+ void operator()(NullableNode* arg) { HandleIntermediateNode(arg); }
+
+ void operator()(AllNullsTerminalNode* arg) {
+ // Even though no processing happens past this point we
+ // still need to adjust it if a list occurred earlier in
+ // the path (i.e. the all-null array is nested inside a list).
+ HandleIntermediateNode(arg);
+ }
+
+ void operator()(NullableTerminalNode*) {}
+ void operator()(AllPresentTerminalNode*) {}
+};
+
+PathInfo Fixup(PathInfo info) {
+ // We only need to fixup the path if there were repeated
+ // elements on it.
+ if (info.max_rep_level == 0) {
+ return info;
+ }
+ FixupVisitor visitor;
+ visitor.max_rep_level = info.max_rep_level;
+ if (visitor.max_rep_level > 0) {
+ visitor.rep_level_if_null = 0;
+ }
+ for (size_t x = 0; x < info.path.size(); x++) {
+ ::arrow::util::visit(visitor, &info.path[x]);
+ }
+ return info;
+}
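+
+// Illustrative example: for a path {NullableNode, ListNode(rep=1),
+// ListNode(rep=2), NullableTerminalNode} with max_rep_level == 2, Fixup gives
+// the NullableNode rep_level_if_null == 0, marks the rep=2 ListNode as the
+// last list and clears rep_level_if_null for everything after it.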
+
+class PathBuilder {
+ public:
+ explicit PathBuilder(bool start_nullable) : nullable_in_parent_(start_nullable) {}
+ template <typename T>
+ void AddTerminalInfo(const T& array) {
+ info_.leaf_is_nullable = nullable_in_parent_;
+ if (nullable_in_parent_) {
+ info_.max_def_level++;
+ }
+ // We don't use null_count() because if the null_count isn't known
+ // and the array does in fact contain nulls, we will end up
+ // traversing the null bitmap twice (once here and once when calculating
+ // rep/def levels).
+ if (LazyNoNulls(array)) {
+ info_.path.emplace_back(AllPresentTerminalNode{info_.max_def_level});
+ } else if (LazyNullCount(array) == array.length()) {
+ info_.path.emplace_back(AllNullsTerminalNode(info_.max_def_level - 1));
+ } else {
+ info_.path.emplace_back(NullableTerminalNode(array.null_bitmap_data(),
+ array.offset(), info_.max_def_level));
+ }
+ info_.primitive_array = std::make_shared<T>(array.data());
+ paths_.push_back(Fixup(info_));
+ }
+
+ template <typename T>
+ ::arrow::enable_if_t<std::is_base_of<::arrow::FlatArray, T>::value, Status> Visit(
+ const T& array) {
+ AddTerminalInfo(array);
+ return Status::OK();
+ }
+
+ template <typename T>
+ ::arrow::enable_if_t<std::is_same<::arrow::ListArray, T>::value ||
+ std::is_same<::arrow::LargeListArray, T>::value,
+ Status>
+ Visit(const T& array) {
+ MaybeAddNullable(array);
+ // The extra def level is needed to represent empty lists.
+ info_.max_def_level++;
+ info_.max_rep_level++;
+ // raw_value_offsets() accounts for any slice offset.
+ ListPathNode<VarRangeSelector<typename T::offset_type>> node(
+ VarRangeSelector<typename T::offset_type>{array.raw_value_offsets()},
+ info_.max_rep_level, info_.max_def_level - 1);
+ info_.path.emplace_back(std::move(node));
+ nullable_in_parent_ = array.list_type()->value_field()->nullable();
+ return VisitInline(*array.values());
+ }
+
+ Status Visit(const ::arrow::DictionaryArray& array) {
+ // We currently only handle DictionaryArray where the dictionary is a
+ // primitive type.
+ if (array.dict_type()->value_type()->num_fields() > 0) {
+ return Status::NotImplemented(
+ "Writing DictionaryArray with nested dictionary "
+ "type not yet supported");
+ }
+ if (array.dictionary()->null_count() > 0) {
+ return Status::NotImplemented(
+ "Writing DictionaryArray with null encoded in dictionary "
+ "type not yet supported");
+ }
+ AddTerminalInfo(array);
+ return Status::OK();
+ }
+
+ void MaybeAddNullable(const Array& array) {
+ if (!nullable_in_parent_) {
+ return;
+ }
+ info_.max_def_level++;
+ // We don't use null_count() because if the null_count isn't known
+ // and the array does in fact contain nulls, we will end up
+ // traversing the null bitmap twice (once here and once when calculating
+ // rep/def levels). Because this isn't terminal this might not be
+ // the right decision for structs that share the same nullable
+ // parents.
+ if (LazyNoNulls(array)) {
+ // Don't add anything because there won't be any point checking
+ // null values for the array. There will always be at least
+ // one more array to handle nullability.
+ return;
+ }
+ if (LazyNullCount(array) == array.length()) {
+ info_.path.emplace_back(AllNullsTerminalNode(info_.max_def_level - 1));
+ return;
+ }
+ info_.path.emplace_back(
+ NullableNode(array.null_bitmap_data(), array.offset(),
+ /* def_level_if_null = */ info_.max_def_level - 1));
+ }
+
+ Status VisitInline(const Array& array);
+
+ Status Visit(const ::arrow::MapArray& array) {
+ return Visit(static_cast<const ::arrow::ListArray&>(array));
+ }
+
+ Status Visit(const ::arrow::StructArray& array) {
+ MaybeAddNullable(array);
+ PathInfo info_backup = info_;
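+ // Every child leaf shares the path prefix accumulated so far, so the
+ // backup is restored after visiting each child before descending into
+ // the next one.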
+ for (int x = 0; x < array.num_fields(); x++) {
+ nullable_in_parent_ = array.type()->field(x)->nullable();
+ RETURN_NOT_OK(VisitInline(*array.field(x)));
+ info_ = info_backup;
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const ::arrow::FixedSizeListArray& array) {
+ MaybeAddNullable(array);
+ int32_t list_size = array.list_type()->list_size();
+ // Technically we could encode fixed-size lists with a two-level encoding,
+ // but since we always use three-level encoding we increment def levels as
+ // well.
+ info_.max_def_level++;
+ info_.max_rep_level++;
+ info_.path.emplace_back(FixedSizeListNode(FixedSizedRangeSelector{list_size},
+ info_.max_rep_level, info_.max_def_level));
+ nullable_in_parent_ = array.list_type()->value_field()->nullable();
+ if (array.offset() > 0) {
+ return VisitInline(*array.values()->Slice(array.value_offset(0)));
+ }
+ return VisitInline(*array.values());
+ }
+
+ Status Visit(const ::arrow::ExtensionArray& array) {
+ return VisitInline(*array.storage());
+ }
+
+#define NOT_IMPLEMENTED_VISIT(ArrowTypePrefix) \
+ Status Visit(const ::arrow::ArrowTypePrefix##Array& array) { \
+ return Status::NotImplemented("Level generation for " #ArrowTypePrefix \
+ " not supported yet"); \
+ }
+
+ // Union types aren't supported in Parquet.
+ NOT_IMPLEMENTED_VISIT(Union)
+
+#undef NOT_IMPLEMENTED_VISIT
+ std::vector<PathInfo>& paths() { return paths_; }
+
+ private:
+ PathInfo info_;
+ std::vector<PathInfo> paths_;
+ bool nullable_in_parent_;
+};
+
+Status PathBuilder::VisitInline(const Array& array) {
+ return ::arrow::VisitArrayInline(array, this);
+}
+
+#undef RETURN_IF_ERROR
+} // namespace
+
+class MultipathLevelBuilderImpl : public MultipathLevelBuilder {
+ public:
+ MultipathLevelBuilderImpl(std::shared_ptr<::arrow::ArrayData> data,
+ std::unique_ptr<PathBuilder> path_builder)
+ : root_range_{0, data->length},
+ data_(std::move(data)),
+ path_builder_(std::move(path_builder)) {}
+
+ int GetLeafCount() const override {
+ return static_cast<int>(path_builder_->paths().size());
+ }
+
+ ::arrow::Status Write(int leaf_index, ArrowWriteContext* context,
+ CallbackFunction write_leaf_callback) override {
+ DCHECK_GE(leaf_index, 0);
+ DCHECK_LT(leaf_index, GetLeafCount());
+ return WritePath(root_range_, &path_builder_->paths()[leaf_index], context,
+ std::move(write_leaf_callback));
+ }
+
+ private:
+ ElementRange root_range_;
+ // Reference holder to ensure the data stays valid.
+ std::shared_ptr<::arrow::ArrayData> data_;
+ std::unique_ptr<PathBuilder> path_builder_;
+};
+
+// static
+::arrow::Result<std::unique_ptr<MultipathLevelBuilder>> MultipathLevelBuilder::Make(
+ const ::arrow::Array& array, bool array_field_nullable) {
+ auto constructor = ::arrow::internal::make_unique<PathBuilder>(array_field_nullable);
+ RETURN_NOT_OK(VisitArrayInline(array, constructor.get()));
+ return ::arrow::internal::make_unique<MultipathLevelBuilderImpl>(
+ array.data(), std::move(constructor));
+}
+
+// static
+Status MultipathLevelBuilder::Write(const Array& array, bool array_field_nullable,
+ ArrowWriteContext* context,
+ MultipathLevelBuilder::CallbackFunction callback) {
+ ARROW_ASSIGN_OR_RAISE(std::unique_ptr<MultipathLevelBuilder> builder,
+ MultipathLevelBuilder::Make(array, array_field_nullable));
+ for (int leaf_idx = 0; leaf_idx < builder->GetLeafCount(); leaf_idx++) {
+ RETURN_NOT_OK(builder->Write(leaf_idx, context, callback));
+ }
+ return Status::OK();
+}
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h
index e5af186dc4f..c5b7fdfdac3 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/path_internal.h
@@ -1,155 +1,155 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <functional>
-#include <memory>
-#include <vector>
-
-#include "arrow/result.h"
-#include "arrow/status.h"
-
-#include "parquet/platform.h"
-
-namespace arrow {
-
-class Array;
-
-} // namespace arrow
-
-namespace parquet {
-
-struct ArrowWriteContext;
-
-namespace arrow {
-
-// This file contains internal implementation details and should not be considered
-// part of the public API.
-
-// The MultipathLevelBuilder is intended to fully support all Arrow nested types that
-// map to parquet types (i.e. everything but unions).
-
-/// \brief Half open range of elements in an array.
-struct ElementRange {
- /// Lower bound of range (inclusive)
- int64_t start;
- /// Upper bound of range (exclusive)
- int64_t end;
-
- bool Empty() const { return start == end; }
-
- int64_t Size() const { return end - start; }
-};
-
-/// \brief Result for a single leaf array when running the builder on
-/// its root.
-struct MultipathLevelBuilderResult {
- /// \brief The Array containing only the values to write (after all nesting has
- /// been processed).
- ///
- /// No additional processing is done on this array (it is copied as is when
- /// visited via a DFS).
- std::shared_ptr<::arrow::Array> leaf_array;
-
- /// \brief Might be null.
- const int16_t* def_levels = nullptr;
-
- /// \brief Might be null.
- const int16_t* rep_levels = nullptr;
-
- /// \brief Number of items (int16_t) contained in def/rep_levels when present.
- int64_t def_rep_level_count = 0;
-
- /// \brief Contains the element ranges that must be visited on the
- /// descendants of the final list ancestor for any leaf node.
- ///
- /// The algorithm will attempt to consolidate visited ranges into
- /// the smallest number possible.
- ///
- /// This data is necessary to pass along because after producing
- /// def-rep levels for each leaf array it is impossible to determine
- /// which values have to be sent to parquet when a null list value
- /// in a nullable ListArray is non-empty.
- ///
- /// This allows the parquet writer to determine which values ultimately
- /// need to be written.
- std::vector<ElementRange> post_list_visited_elements;
-
- /// Whether the leaf array is nullable.
- bool leaf_is_nullable;
-};
-
-/// \brief Logic for being able to write out nesting (rep/def level) data that is
-/// needed for writing to parquet.
-class PARQUET_EXPORT MultipathLevelBuilder {
- public:
- /// \brief A callback function that will receive results from the call to
- /// Write(...) below. The MultipathLevelBuilderResult passed in will
- /// only remain valid for the duration of the function call (i.e. storing
- /// it and relying on its data to be consistent afterwards will result in
- /// undefined behavior).
- using CallbackFunction =
- std::function<::arrow::Status(const MultipathLevelBuilderResult&)>;
-
- /// \brief Determine rep/def level information for the array.
- ///
- /// The callback will be invoked for each leaf Array that is a
- /// descendant of array. Each leaf array is processed in
- /// depth-first traversal order.
- ///
- /// \param[in] array The array to process.
- /// \param[in] array_field_nullable Whether the algorithm should consider
- /// the array column as nullable (as determined by its type's parent
- /// field).
- /// \param[in, out] context for use when allocating memory, etc.
- /// \param[out] write_leaf_callback Callback to receive results.
- /// There will be one call to the write_leaf_callback for each leaf node.
- static ::arrow::Status Write(const ::arrow::Array& array, bool array_field_nullable,
- ArrowWriteContext* context,
- CallbackFunction write_leaf_callback);
-
- /// \brief Construct a new instance of the builder.
- ///
- /// \param[in] array The array to process.
- /// \param[in] array_field_nullable Whether the algorithm should consider
- /// the array column as nullable (as determined by its type's parent
- /// field).
- static ::arrow::Result<std::unique_ptr<MultipathLevelBuilder>> Make(
- const ::arrow::Array& array, bool array_field_nullable);
-
- virtual ~MultipathLevelBuilder() = default;
-
- /// \brief Returns the number of leaf columns that need to be written
- /// to Parquet.
- virtual int GetLeafCount() const = 0;
-
- /// \brief Calls write_leaf_callback with the MultipathLevelBuilderResult corresponding
- /// to |leaf_index|.
- ///
- /// \param[in] leaf_index The index of the leaf column to write. Must be in the range
- /// [0, GetLeafCount()).
- /// \param[in, out] context for use when allocating memory, etc.
- /// \param[out] write_leaf_callback Callback to receive the result.
- virtual ::arrow::Status Write(int leaf_index, ArrowWriteContext* context,
- CallbackFunction write_leaf_callback) = 0;
-};
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+#include "parquet/platform.h"
+
+namespace arrow {
+
+class Array;
+
+} // namespace arrow
+
+namespace parquet {
+
+struct ArrowWriteContext;
+
+namespace arrow {
+
+// This file contains internal implementation details and should not be considered
+// part of the public API.
+
+// The MultipathLevelBuilder is intended to fully support all Arrow nested types that
+// map to parquet types (i.e. everything but unions).
+
+/// \brief Half open range of elements in an array.
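+///
+/// For example, ElementRange{3, 7} covers elements 3, 4, 5 and 6
+/// (Size() == 4), while ElementRange{3, 3} is empty.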
+struct ElementRange {
+ /// Lower bound of range (inclusive)
+ int64_t start;
+ /// Upper bound of range (exclusive)
+ int64_t end;
+
+ bool Empty() const { return start == end; }
+
+ int64_t Size() const { return end - start; }
+};
+
+/// \brief Result for a single leaf array when running the builder on
+/// its root.
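+///
+/// As a qualitative example, for a nullable list column the rep_levels mark
+/// where each list starts, the def_levels distinguish null lists, empty lists
+/// and present values, and leaf_array holds only the values themselves.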
+struct MultipathLevelBuilderResult {
+ /// \brief The Array containing only the values to write (after all nesting has
+ /// been processed).
+ ///
+ /// No additional processing is done on this array (it is copied as is when
+ /// visited via a DFS).
+ std::shared_ptr<::arrow::Array> leaf_array;
+
+ /// \brief Might be null.
+ const int16_t* def_levels = nullptr;
+
+ /// \brief Might be null.
+ const int16_t* rep_levels = nullptr;
+
+ /// \brief Number of items (int16_t) contained in def/rep_levels when present.
+ int64_t def_rep_level_count = 0;
+
+ /// \brief Contains the element ranges that must be visited on the
+ /// descendants of the final list ancestor for any leaf node.
+ ///
+ /// The algorithm will attempt to consolidate visited ranges into
+ /// the smallest number possible.
+ ///
+ /// This data is necessary to pass along because after producing
+ /// def-rep levels for each leaf array it is impossible to determine
+ /// which values have to be sent to parquet when a null list value
+ /// in a nullable ListArray is non-empty.
+ ///
+ /// This allows the parquet writer to determine which values ultimately
+ /// need to be written.
+ std::vector<ElementRange> post_list_visited_elements;
+
+ /// Whether the leaf array is nullable.
+ bool leaf_is_nullable;
+};
+
+/// \brief Logic for being able to write out nesting (rep/def level) data that is
+/// needed for writing to parquet.
+class PARQUET_EXPORT MultipathLevelBuilder {
+ public:
+ /// \brief A callback function that will receive results from the call to
+ /// Write(...) below. The MultipathLevelBuilderResult passed in will
+ /// only remain valid for the duration of the function call (i.e. storing
+ /// it and relying on its data to be consistent afterwards will result in
+ /// undefined behavior).
+ using CallbackFunction =
+ std::function<::arrow::Status(const MultipathLevelBuilderResult&)>;
+
+ /// \brief Determine rep/def level information for the array.
+ ///
+ /// The callback will be invoked for each leaf Array that is a
+ /// descendant of array. Each leaf array is processed in
+ /// depth-first traversal order.
+ ///
+ /// \param[in] array The array to process.
+ /// \param[in] array_field_nullable Whether the algorithm should consider
+ /// the array column as nullable (as determined by its type's parent
+ /// field).
+ /// \param[in, out] context for use when allocating memory, etc.
+ /// \param[out] write_leaf_callback Callback to receive results.
+ /// There will be one call to the write_leaf_callback for each leaf node.
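+ ///
+ /// A minimal usage sketch (WriteLeaf is a hypothetical caller-supplied
+ /// function):
+ ///
+ /// RETURN_NOT_OK(MultipathLevelBuilder::Write(
+ /// array, /*array_field_nullable=*/true, context,
+ /// [&](const MultipathLevelBuilderResult& result) {
+ /// // |result| is only valid for the duration of this call.
+ /// return WriteLeaf(result);
+ /// }));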
+ static ::arrow::Status Write(const ::arrow::Array& array, bool array_field_nullable,
+ ArrowWriteContext* context,
+ CallbackFunction write_leaf_callback);
+
+ /// \brief Construct a new instance of the builder.
+ ///
+ /// \param[in] array The array to process.
+ /// \param[in] array_field_nullable Whether the algorithm should consider
+ /// the array column as nullable (as determined by its type's parent
+ /// field).
+ static ::arrow::Result<std::unique_ptr<MultipathLevelBuilder>> Make(
+ const ::arrow::Array& array, bool array_field_nullable);
+
+ virtual ~MultipathLevelBuilder() = default;
+
+ /// \brief Returns the number of leaf columns that need to be written
+ /// to Parquet.
+ virtual int GetLeafCount() const = 0;
+
+ /// \brief Calls write_leaf_callback with the MultipathLevelBuilderResult corresponding
+ /// to |leaf_index|.
+ ///
+ /// \param[in] leaf_index The index of the leaf column to write. Must be in the range
+ /// [0, GetLeafCount()).
+ /// \param[in, out] context for use when allocating memory, etc.
+ /// \param[out] write_leaf_callback Callback to receive the result.
+ virtual ::arrow::Status Write(int leaf_index, ArrowWriteContext* context,
+ CallbackFunction write_leaf_callback) = 0;
+};
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc
index 7f284abdee0..4f5f79c964a 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.cc
@@ -1,1248 +1,1248 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/arrow/reader.h"
-
-#include <algorithm>
-#include <cstring>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/buffer.h"
-#include "arrow/extension_type.h"
-#include "arrow/io/memory.h"
-#include "arrow/record_batch.h"
-#include "arrow/table.h"
-#include "arrow/type.h"
-#include "arrow/util/async_generator.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/future.h"
-#include "arrow/util/iterator.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/make_unique.h"
-#include "arrow/util/parallel.h"
-#include "arrow/util/range.h"
-#include "parquet/arrow/reader_internal.h"
-#include "parquet/column_reader.h"
-#include "parquet/exception.h"
-#include "parquet/file_reader.h"
-#include "parquet/metadata.h"
-#include "parquet/properties.h"
-#include "parquet/schema.h"
-
-using arrow::Array;
-using arrow::ArrayData;
-using arrow::BooleanArray;
-using arrow::ChunkedArray;
-using arrow::DataType;
-using arrow::ExtensionType;
-using arrow::Field;
-using arrow::Future;
-using arrow::Int32Array;
-using arrow::ListArray;
-using arrow::MemoryPool;
-using arrow::RecordBatchReader;
-using arrow::ResizableBuffer;
-using arrow::Status;
-using arrow::StructArray;
-using arrow::Table;
-using arrow::TimestampArray;
-
-using arrow::internal::checked_cast;
-using arrow::internal::Iota;
-
-// Help reduce verbosity
-using ParquetReader = parquet::ParquetFileReader;
-
-using parquet::internal::RecordReader;
-
-namespace BitUtil = arrow::BitUtil;
-
-namespace parquet {
-namespace arrow {
-namespace {
-
-::arrow::Result<std::shared_ptr<ArrayData>> ChunksToSingle(const ChunkedArray& chunked) {
- switch (chunked.num_chunks()) {
- case 0: {
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> array,
- ::arrow::MakeArrayOfNull(chunked.type(), 0));
- return array->data();
- }
- case 1:
- return chunked.chunk(0)->data();
- default:
- // ARROW-3762(wesm): If item reader yields a chunked array, we reject as
- // this is not yet implemented
- return Status::NotImplemented(
- "Nested data conversions not implemented for chunked array outputs");
- }
-}
-
-} // namespace
-
-class ColumnReaderImpl : public ColumnReader {
- public:
- virtual Status GetDefLevels(const int16_t** data, int64_t* length) = 0;
- virtual Status GetRepLevels(const int16_t** data, int64_t* length) = 0;
- virtual const std::shared_ptr<Field> field() = 0;
-
- ::arrow::Status NextBatch(int64_t batch_size,
- std::shared_ptr<::arrow::ChunkedArray>* out) final {
- RETURN_NOT_OK(LoadBatch(batch_size));
- RETURN_NOT_OK(BuildArray(batch_size, out));
- for (int x = 0; x < (*out)->num_chunks(); x++) {
- RETURN_NOT_OK((*out)->chunk(x)->Validate());
- }
- return Status::OK();
- }
-
- virtual ::arrow::Status LoadBatch(int64_t num_records) = 0;
-
- virtual ::arrow::Status BuildArray(int64_t length_upper_bound,
- std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
- virtual bool IsOrHasRepeatedChild() const = 0;
-};
-
-namespace {
-
-std::shared_ptr<std::unordered_set<int>> VectorToSharedSet(
- const std::vector<int>& values) {
- std::shared_ptr<std::unordered_set<int>> result(new std::unordered_set<int>());
- result->insert(values.begin(), values.end());
- return result;
-}
-
-// Forward declaration
-Status GetReader(const SchemaField& field, const std::shared_ptr<ReaderContext>& context,
- std::unique_ptr<ColumnReaderImpl>* out);
-
-// ----------------------------------------------------------------------
-// FileReaderImpl forward declaration
-
-class FileReaderImpl : public FileReader {
- public:
- FileReaderImpl(MemoryPool* pool, std::unique_ptr<ParquetFileReader> reader,
- ArrowReaderProperties properties)
- : pool_(pool),
- reader_(std::move(reader)),
- reader_properties_(std::move(properties)) {}
-
- Status Init() {
- return SchemaManifest::Make(reader_->metadata()->schema(),
- reader_->metadata()->key_value_metadata(),
- reader_properties_, &manifest_);
- }
-
- FileColumnIteratorFactory SomeRowGroupsFactory(std::vector<int> row_groups) {
- return [row_groups](int i, ParquetFileReader* reader) {
- return new FileColumnIterator(i, reader, row_groups);
- };
- }
-
- FileColumnIteratorFactory AllRowGroupsFactory() {
- return SomeRowGroupsFactory(Iota(reader_->metadata()->num_row_groups()));
- }
-
- Status BoundsCheckColumn(int column) {
- if (column < 0 || column >= this->num_columns()) {
- return Status::Invalid("Column index out of bounds (got ", column,
- ", should be "
- "between 0 and ",
- this->num_columns() - 1, ")");
- }
- return Status::OK();
- }
-
- Status BoundsCheckRowGroup(int row_group) {
- // row group indices check
- if (row_group < 0 || row_group >= num_row_groups()) {
- return Status::Invalid("Some index in row_group_indices is ", row_group,
- ", which is either < 0 or >= num_row_groups(",
- num_row_groups(), ")");
- }
- return Status::OK();
- }
-
- Status BoundsCheck(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices) {
- for (int i : row_groups) {
- RETURN_NOT_OK(BoundsCheckRowGroup(i));
- }
- for (int i : column_indices) {
- RETURN_NOT_OK(BoundsCheckColumn(i));
- }
- return Status::OK();
- }
-
- std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) override;
-
- Status ReadTable(const std::vector<int>& indices,
- std::shared_ptr<Table>* out) override {
- return ReadRowGroups(Iota(reader_->metadata()->num_row_groups()), indices, out);
- }
-
- Status GetFieldReader(int i,
- const std::shared_ptr<std::unordered_set<int>>& included_leaves,
- const std::vector<int>& row_groups,
- std::unique_ptr<ColumnReaderImpl>* out) {
- auto ctx = std::make_shared<ReaderContext>();
- ctx->reader = reader_.get();
- ctx->pool = pool_;
- ctx->iterator_factory = SomeRowGroupsFactory(row_groups);
- ctx->filter_leaves = true;
- ctx->included_leaves = included_leaves;
- return GetReader(manifest_.schema_fields[i], ctx, out);
- }
-
- Status GetFieldReaders(const std::vector<int>& column_indices,
- const std::vector<int>& row_groups,
- std::vector<std::shared_ptr<ColumnReaderImpl>>* out,
- std::shared_ptr<::arrow::Schema>* out_schema) {
- // We only need to read schema fields which have columns indicated
- // in the indices vector
- ARROW_ASSIGN_OR_RAISE(std::vector<int> field_indices,
- manifest_.GetFieldIndices(column_indices));
-
- auto included_leaves = VectorToSharedSet(column_indices);
-
- out->resize(field_indices.size());
- ::arrow::FieldVector out_fields(field_indices.size());
- for (size_t i = 0; i < out->size(); ++i) {
- std::unique_ptr<ColumnReaderImpl> reader;
- RETURN_NOT_OK(
- GetFieldReader(field_indices[i], included_leaves, row_groups, &reader));
-
- out_fields[i] = reader->field();
- out->at(i) = std::move(reader);
- }
-
- *out_schema = ::arrow::schema(std::move(out_fields), manifest_.schema_metadata);
- return Status::OK();
- }
-
- Status GetColumn(int i, FileColumnIteratorFactory iterator_factory,
- std::unique_ptr<ColumnReader>* out);
-
- Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) override {
- return GetColumn(i, AllRowGroupsFactory(), out);
- }
-
- Status GetSchema(std::shared_ptr<::arrow::Schema>* out) override {
- return FromParquetSchema(reader_->metadata()->schema(), reader_properties_,
- reader_->metadata()->key_value_metadata(), out);
- }
-
- Status ReadSchemaField(int i, std::shared_ptr<ChunkedArray>* out) override {
- auto included_leaves = VectorToSharedSet(Iota(reader_->metadata()->num_columns()));
- std::vector<int> row_groups = Iota(reader_->metadata()->num_row_groups());
-
- std::unique_ptr<ColumnReaderImpl> reader;
- RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, &reader));
-
- return ReadColumn(i, row_groups, reader.get(), out);
- }
-
- Status ReadColumn(int i, const std::vector<int>& row_groups, ColumnReader* reader,
- std::shared_ptr<ChunkedArray>* out) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- // TODO(wesm): This calculation doesn't make much sense when we have repeated
- // schema nodes
- int64_t records_to_read = 0;
- for (auto row_group : row_groups) {
- // Can throw exception
- records_to_read +=
- reader_->metadata()->RowGroup(row_group)->ColumnChunk(i)->num_values();
- }
- return reader->NextBatch(records_to_read, out);
- END_PARQUET_CATCH_EXCEPTIONS
- }
-
- Status ReadColumn(int i, const std::vector<int>& row_groups,
- std::shared_ptr<ChunkedArray>* out) {
- std::unique_ptr<ColumnReader> flat_column_reader;
- RETURN_NOT_OK(GetColumn(i, SomeRowGroupsFactory(row_groups), &flat_column_reader));
- return ReadColumn(i, row_groups, flat_column_reader.get(), out);
- }
-
- Status ReadColumn(int i, std::shared_ptr<ChunkedArray>* out) override {
- return ReadColumn(i, Iota(reader_->metadata()->num_row_groups()), out);
- }
-
- Status ReadTable(std::shared_ptr<Table>* table) override {
- return ReadTable(Iota(reader_->metadata()->num_columns()), table);
- }
-
- Status ReadRowGroups(const std::vector<int>& row_groups,
- const std::vector<int>& indices,
- std::shared_ptr<Table>* table) override;
-
- // Helper method used by ReadRowGroups - read the given row groups/columns, skipping
- // bounds checks and pre-buffering. Takes a shared_ptr to self to keep the reader
- // alive in async contexts.
- Future<std::shared_ptr<Table>> DecodeRowGroups(
- std::shared_ptr<FileReaderImpl> self, const std::vector<int>& row_groups,
- const std::vector<int>& column_indices, ::arrow::internal::Executor* cpu_executor);
-
- Status ReadRowGroups(const std::vector<int>& row_groups,
- std::shared_ptr<Table>* table) override {
- return ReadRowGroups(row_groups, Iota(reader_->metadata()->num_columns()), table);
- }
-
- Status ReadRowGroup(int row_group_index, const std::vector<int>& column_indices,
- std::shared_ptr<Table>* out) override {
- return ReadRowGroups({row_group_index}, column_indices, out);
- }
-
- Status ReadRowGroup(int i, std::shared_ptr<Table>* table) override {
- return ReadRowGroup(i, Iota(reader_->metadata()->num_columns()), table);
- }
-
- Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
- const std::vector<int>& column_indices,
- std::unique_ptr<RecordBatchReader>* out) override;
-
- Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
- std::unique_ptr<RecordBatchReader>* out) override {
- return GetRecordBatchReader(row_group_indices,
- Iota(reader_->metadata()->num_columns()), out);
- }
-
- ::arrow::Result<::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>>
- GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
- const std::vector<int> row_group_indices,
- const std::vector<int> column_indices,
- ::arrow::internal::Executor* cpu_executor) override;
-
- int num_columns() const { return reader_->metadata()->num_columns(); }
-
- ParquetFileReader* parquet_reader() const override { return reader_.get(); }
-
- int num_row_groups() const override { return reader_->metadata()->num_row_groups(); }
-
- void set_use_threads(bool use_threads) override {
- reader_properties_.set_use_threads(use_threads);
- }
-
- void set_batch_size(int64_t batch_size) override {
- reader_properties_.set_batch_size(batch_size);
- }
-
- const ArrowReaderProperties& properties() const override { return reader_properties_; }
-
- const SchemaManifest& manifest() const override { return manifest_; }
-
- Status ScanContents(std::vector<int> columns, const int32_t column_batch_size,
- int64_t* num_rows) override {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- *num_rows = ScanFileContents(columns, column_batch_size, reader_.get());
- return Status::OK();
- END_PARQUET_CATCH_EXCEPTIONS
- }
-
- MemoryPool* pool_;
- std::unique_ptr<ParquetFileReader> reader_;
- ArrowReaderProperties reader_properties_;
-
- SchemaManifest manifest_;
-};
-
-class RowGroupRecordBatchReader : public ::arrow::RecordBatchReader {
- public:
- RowGroupRecordBatchReader(::arrow::RecordBatchIterator batches,
- std::shared_ptr<::arrow::Schema> schema)
- : batches_(std::move(batches)), schema_(std::move(schema)) {}
-
- ~RowGroupRecordBatchReader() override {}
-
- Status ReadNext(std::shared_ptr<::arrow::RecordBatch>* out) override {
- return batches_.Next().Value(out);
- }
-
- std::shared_ptr<::arrow::Schema> schema() const override { return schema_; }
-
- private:
- ::arrow::Iterator<std::shared_ptr<::arrow::RecordBatch>> batches_;
- std::shared_ptr<::arrow::Schema> schema_;
-};
-
-class ColumnChunkReaderImpl : public ColumnChunkReader {
- public:
- ColumnChunkReaderImpl(FileReaderImpl* impl, int row_group_index, int column_index)
- : impl_(impl), column_index_(column_index), row_group_index_(row_group_index) {}
-
- Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) override {
- return impl_->ReadColumn(column_index_, {row_group_index_}, out);
- }
-
- private:
- FileReaderImpl* impl_;
- int column_index_;
- int row_group_index_;
-};
-
-class RowGroupReaderImpl : public RowGroupReader {
- public:
- RowGroupReaderImpl(FileReaderImpl* impl, int row_group_index)
- : impl_(impl), row_group_index_(row_group_index) {}
-
- std::shared_ptr<ColumnChunkReader> Column(int column_index) override {
- return std::shared_ptr<ColumnChunkReader>(
- new ColumnChunkReaderImpl(impl_, row_group_index_, column_index));
- }
-
- Status ReadTable(const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Table>* out) override {
- return impl_->ReadRowGroup(row_group_index_, column_indices, out);
- }
-
- Status ReadTable(std::shared_ptr<::arrow::Table>* out) override {
- return impl_->ReadRowGroup(row_group_index_, out);
- }
-
- private:
- FileReaderImpl* impl_;
- int row_group_index_;
-};
-
-// ----------------------------------------------------------------------
-// Column reader implementations
-
-// Leaf reader is for primitive arrays and primitive children of nested arrays
-class LeafReader : public ColumnReaderImpl {
- public:
- LeafReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
- std::unique_ptr<FileColumnIterator> input,
- ::parquet::internal::LevelInfo leaf_info)
- : ctx_(std::move(ctx)),
- field_(std::move(field)),
- input_(std::move(input)),
- descr_(input_->descr()) {
- record_reader_ = RecordReader::Make(
- descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY);
- NextRowGroup();
- }
-
- Status GetDefLevels(const int16_t** data, int64_t* length) final {
- *data = record_reader_->def_levels();
- *length = record_reader_->levels_position();
- return Status::OK();
- }
-
- Status GetRepLevels(const int16_t** data, int64_t* length) final {
- *data = record_reader_->rep_levels();
- *length = record_reader_->levels_position();
- return Status::OK();
- }
-
- bool IsOrHasRepeatedChild() const final { return false; }
-
- Status LoadBatch(int64_t records_to_read) final {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- out_ = nullptr;
- record_reader_->Reset();
- // Pre-allocation gives much better performance for flat columns
- record_reader_->Reserve(records_to_read);
- while (records_to_read > 0) {
- if (!record_reader_->HasMoreData()) {
- break;
- }
- int64_t records_read = record_reader_->ReadRecords(records_to_read);
- records_to_read -= records_read;
- if (records_read == 0) {
- NextRowGroup();
- }
- }
- RETURN_NOT_OK(TransferColumnData(record_reader_.get(), field_->type(), descr_,
- ctx_->pool, &out_));
- return Status::OK();
- END_PARQUET_CATCH_EXCEPTIONS
- }
-
- ::arrow::Status BuildArray(int64_t length_upper_bound,
- std::shared_ptr<::arrow::ChunkedArray>* out) final {
- *out = out_;
- return Status::OK();
- }
-
- const std::shared_ptr<Field> field() override { return field_; }
-
- private:
- std::shared_ptr<ChunkedArray> out_;
- void NextRowGroup() {
- std::unique_ptr<PageReader> page_reader = input_->NextChunk();
- record_reader_->SetPageReader(std::move(page_reader));
- }
-
- std::shared_ptr<ReaderContext> ctx_;
- std::shared_ptr<Field> field_;
- std::unique_ptr<FileColumnIterator> input_;
- const ColumnDescriptor* descr_;
- std::shared_ptr<RecordReader> record_reader_;
-};
-
-// Column reader for extension arrays
-class ExtensionReader : public ColumnReaderImpl {
- public:
- ExtensionReader(std::shared_ptr<Field> field,
- std::unique_ptr<ColumnReaderImpl> storage_reader)
- : field_(std::move(field)), storage_reader_(std::move(storage_reader)) {}
-
- Status GetDefLevels(const int16_t** data, int64_t* length) override {
- return storage_reader_->GetDefLevels(data, length);
- }
-
- Status GetRepLevels(const int16_t** data, int64_t* length) override {
- return storage_reader_->GetRepLevels(data, length);
- }
-
- Status LoadBatch(int64_t number_of_records) final {
- return storage_reader_->LoadBatch(number_of_records);
- }
-
- Status BuildArray(int64_t length_upper_bound,
- std::shared_ptr<ChunkedArray>* out) override {
- std::shared_ptr<ChunkedArray> storage;
- RETURN_NOT_OK(storage_reader_->BuildArray(length_upper_bound, &storage));
- *out = ExtensionType::WrapArray(field_->type(), storage);
- return Status::OK();
- }
-
- bool IsOrHasRepeatedChild() const final {
- return storage_reader_->IsOrHasRepeatedChild();
- }
-
- const std::shared_ptr<Field> field() override { return field_; }
-
- private:
- std::shared_ptr<Field> field_;
- std::unique_ptr<ColumnReaderImpl> storage_reader_;
-};
-
-template <typename IndexType>
-class ListReader : public ColumnReaderImpl {
- public:
- ListReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
- ::parquet::internal::LevelInfo level_info,
- std::unique_ptr<ColumnReaderImpl> child_reader)
- : ctx_(std::move(ctx)),
- field_(std::move(field)),
- level_info_(level_info),
- item_reader_(std::move(child_reader)) {}
-
- Status GetDefLevels(const int16_t** data, int64_t* length) override {
- return item_reader_->GetDefLevels(data, length);
- }
-
- Status GetRepLevels(const int16_t** data, int64_t* length) override {
- return item_reader_->GetRepLevels(data, length);
- }
-
- bool IsOrHasRepeatedChild() const final { return true; }
-
- Status LoadBatch(int64_t number_of_records) final {
- return item_reader_->LoadBatch(number_of_records);
- }
-
- virtual ::arrow::Result<std::shared_ptr<ChunkedArray>> AssembleArray(
- std::shared_ptr<ArrayData> data) {
- if (field_->type()->id() == ::arrow::Type::MAP) {
- // Error out if data is not map-compliant instead of aborting in MakeArray below
- RETURN_NOT_OK(::arrow::MapArray::ValidateChildData(data->child_data));
- }
- std::shared_ptr<Array> result = ::arrow::MakeArray(data);
- return std::make_shared<ChunkedArray>(result);
- }
-
- Status BuildArray(int64_t length_upper_bound,
- std::shared_ptr<ChunkedArray>* out) override {
- const int16_t* def_levels;
- const int16_t* rep_levels;
- int64_t num_levels;
- RETURN_NOT_OK(item_reader_->GetDefLevels(&def_levels, &num_levels));
- RETURN_NOT_OK(item_reader_->GetRepLevels(&rep_levels, &num_levels));
-
- std::shared_ptr<ResizableBuffer> validity_buffer;
- ::parquet::internal::ValidityBitmapInputOutput validity_io;
- validity_io.values_read_upper_bound = length_upper_bound;
- if (field_->nullable()) {
- ARROW_ASSIGN_OR_RAISE(
- validity_buffer,
- AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
- validity_io.valid_bits = validity_buffer->mutable_data();
- }
- ARROW_ASSIGN_OR_RAISE(
- std::shared_ptr<ResizableBuffer> offsets_buffer,
- AllocateResizableBuffer(
- sizeof(IndexType) * std::max(int64_t{1}, length_upper_bound + 1),
- ctx_->pool));
- // Ensure zero initialization in case we have reached a zero length list (and
- // because first entry is always zero).
- IndexType* offset_data = reinterpret_cast<IndexType*>(offsets_buffer->mutable_data());
- offset_data[0] = 0;
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- ::parquet::internal::DefRepLevelsToList(def_levels, rep_levels, num_levels,
- level_info_, &validity_io, offset_data);
- END_PARQUET_CATCH_EXCEPTIONS
-
- RETURN_NOT_OK(item_reader_->BuildArray(offset_data[validity_io.values_read], out));
-
- // Resize to actual number of elements returned.
- RETURN_NOT_OK(
- offsets_buffer->Resize((validity_io.values_read + 1) * sizeof(IndexType)));
- if (validity_buffer != nullptr) {
- RETURN_NOT_OK(
- validity_buffer->Resize(BitUtil::BytesForBits(validity_io.values_read)));
- validity_buffer->ZeroPadding();
- }
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> item_chunk, ChunksToSingle(**out));
-
- std::vector<std::shared_ptr<Buffer>> buffers{
- validity_io.null_count > 0 ? validity_buffer : nullptr, offsets_buffer};
- auto data = std::make_shared<ArrayData>(
- field_->type(),
- /*length=*/validity_io.values_read, std::move(buffers),
- std::vector<std::shared_ptr<ArrayData>>{item_chunk}, validity_io.null_count);
-
- ARROW_ASSIGN_OR_RAISE(*out, AssembleArray(std::move(data)));
- return Status::OK();
- }
-
- const std::shared_ptr<Field> field() override { return field_; }
-
- private:
- std::shared_ptr<ReaderContext> ctx_;
- std::shared_ptr<Field> field_;
- ::parquet::internal::LevelInfo level_info_;
- std::unique_ptr<ColumnReaderImpl> item_reader_;
-};
-
-class PARQUET_NO_EXPORT FixedSizeListReader : public ListReader<int32_t> {
- public:
- FixedSizeListReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
- ::parquet::internal::LevelInfo level_info,
- std::unique_ptr<ColumnReaderImpl> child_reader)
- : ListReader(std::move(ctx), std::move(field), level_info,
- std::move(child_reader)) {}
- ::arrow::Result<std::shared_ptr<ChunkedArray>> AssembleArray(
- std::shared_ptr<ArrayData> data) final {
- DCHECK_EQ(data->buffers.size(), 2);
- DCHECK_EQ(field()->type()->id(), ::arrow::Type::FIXED_SIZE_LIST);
- const auto& type = checked_cast<::arrow::FixedSizeListType&>(*field()->type());
- const int32_t* offsets = reinterpret_cast<const int32_t*>(data->buffers[1]->data());
- for (int x = 1; x <= data->length; x++) {
- int32_t size = offsets[x] - offsets[x - 1];
- if (size != type.list_size()) {
- return Status::Invalid("Expected all lists to be of size=", type.list_size(),
- " but index ", x, " had size=", size);
- }
- }
- data->buffers.resize(1);
- std::shared_ptr<Array> result = ::arrow::MakeArray(data);
- return std::make_shared<ChunkedArray>(result);
- }
-};
-
-class PARQUET_NO_EXPORT StructReader : public ColumnReaderImpl {
- public:
- explicit StructReader(std::shared_ptr<ReaderContext> ctx,
- std::shared_ptr<Field> filtered_field,
- ::parquet::internal::LevelInfo level_info,
- std::vector<std::unique_ptr<ColumnReaderImpl>> children)
- : ctx_(std::move(ctx)),
- filtered_field_(std::move(filtered_field)),
- level_info_(level_info),
- children_(std::move(children)) {
- // There could be a mix of children: some might be repeated, some might not be.
- // If possible use one that isn't since that will be guaranteed to have the least
- // number of levels to reconstruct a nullable bitmap.
- auto result = std::find_if(children_.begin(), children_.end(),
- [](const std::unique_ptr<ColumnReaderImpl>& child) {
- return !child->IsOrHasRepeatedChild();
- });
- if (result != children_.end()) {
- def_rep_level_child_ = result->get();
- has_repeated_child_ = false;
- } else if (!children_.empty()) {
- def_rep_level_child_ = children_.front().get();
- has_repeated_child_ = true;
- }
- }
-
- bool IsOrHasRepeatedChild() const final { return has_repeated_child_; }
-
- Status LoadBatch(int64_t records_to_read) override {
- for (const std::unique_ptr<ColumnReaderImpl>& reader : children_) {
- RETURN_NOT_OK(reader->LoadBatch(records_to_read));
- }
- return Status::OK();
- }
- Status BuildArray(int64_t length_upper_bound,
- std::shared_ptr<ChunkedArray>* out) override;
- Status GetDefLevels(const int16_t** data, int64_t* length) override;
- Status GetRepLevels(const int16_t** data, int64_t* length) override;
- const std::shared_ptr<Field> field() override { return filtered_field_; }
-
- private:
- const std::shared_ptr<ReaderContext> ctx_;
- const std::shared_ptr<Field> filtered_field_;
- const ::parquet::internal::LevelInfo level_info_;
- const std::vector<std::unique_ptr<ColumnReaderImpl>> children_;
- ColumnReaderImpl* def_rep_level_child_ = nullptr;
- bool has_repeated_child_;
-};
-
-Status StructReader::GetDefLevels(const int16_t** data, int64_t* length) {
- *data = nullptr;
- if (children_.size() == 0) {
- *length = 0;
- return Status::Invalid("StructReader had no children");
- }
-
- // This method should only be called when this struct or one of its parents
- // is optional/repeated, or it has a repeated child.
- // Meaning all children must have rep/def levels associated
- // with them.
- RETURN_NOT_OK(def_rep_level_child_->GetDefLevels(data, length));
- return Status::OK();
-}
-
-Status StructReader::GetRepLevels(const int16_t** data, int64_t* length) {
- *data = nullptr;
- if (children_.size() == 0) {
- *length = 0;
- return Status::Invalid("StructReader had no children");
- }
-
- // This method should only be called when this struct or one of its parents
- // is optional/repeated, or it has a repeated child.
- // Meaning all children must have rep/def levels associated
- // with them.
- RETURN_NOT_OK(def_rep_level_child_->GetRepLevels(data, length));
- return Status::OK();
-}
-
-Status StructReader::BuildArray(int64_t length_upper_bound,
- std::shared_ptr<ChunkedArray>* out) {
- std::vector<std::shared_ptr<ArrayData>> children_array_data;
- std::shared_ptr<ResizableBuffer> null_bitmap;
-
- ::parquet::internal::ValidityBitmapInputOutput validity_io;
- validity_io.values_read_upper_bound = length_upper_bound;
- // This simplifies accounting below.
- validity_io.values_read = length_upper_bound;
-
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- const int16_t* def_levels;
- const int16_t* rep_levels;
- int64_t num_levels;
-
- if (has_repeated_child_) {
- ARROW_ASSIGN_OR_RAISE(
- null_bitmap,
- AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
- validity_io.valid_bits = null_bitmap->mutable_data();
- RETURN_NOT_OK(GetDefLevels(&def_levels, &num_levels));
- RETURN_NOT_OK(GetRepLevels(&rep_levels, &num_levels));
- DefRepLevelsToBitmap(def_levels, rep_levels, num_levels, level_info_, &validity_io);
- } else if (filtered_field_->nullable()) {
- ARROW_ASSIGN_OR_RAISE(
- null_bitmap,
- AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
- validity_io.valid_bits = null_bitmap->mutable_data();
- RETURN_NOT_OK(GetDefLevels(&def_levels, &num_levels));
- DefLevelsToBitmap(def_levels, num_levels, level_info_, &validity_io);
- }
-
- // Ensure all values are initialized.
- if (null_bitmap) {
- RETURN_NOT_OK(null_bitmap->Resize(BitUtil::BytesForBits(validity_io.values_read)));
- null_bitmap->ZeroPadding();
- }
-
- END_PARQUET_CATCH_EXCEPTIONS
- // Gather children arrays and def levels
- for (auto& child : children_) {
- std::shared_ptr<ChunkedArray> field;
- RETURN_NOT_OK(child->BuildArray(validity_io.values_read, &field));
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> array_data, ChunksToSingle(*field));
- children_array_data.push_back(std::move(array_data));
- }
-
- if (!filtered_field_->nullable() && !has_repeated_child_) {
- validity_io.values_read = children_array_data.front()->length;
- }
-
- std::vector<std::shared_ptr<Buffer>> buffers{validity_io.null_count > 0 ? null_bitmap
- : nullptr};
- auto data =
- std::make_shared<ArrayData>(filtered_field_->type(),
- /*length=*/validity_io.values_read, std::move(buffers),
- std::move(children_array_data));
- std::shared_ptr<Array> result = ::arrow::MakeArray(data);
-
- *out = std::make_shared<ChunkedArray>(result);
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// File reader implementation
-
-Status GetReader(const SchemaField& field, const std::shared_ptr<Field>& arrow_field,
- const std::shared_ptr<ReaderContext>& ctx,
- std::unique_ptr<ColumnReaderImpl>* out) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
-
- auto type_id = arrow_field->type()->id();
-
- if (type_id == ::arrow::Type::EXTENSION) {
- auto storage_field = arrow_field->WithType(
- checked_cast<const ExtensionType&>(*arrow_field->type()).storage_type());
- RETURN_NOT_OK(GetReader(field, storage_field, ctx, out));
- out->reset(new ExtensionReader(arrow_field, std::move(*out)));
- return Status::OK();
- }
-
- if (field.children.size() == 0) {
- if (!field.is_leaf()) {
- return Status::Invalid("Parquet non-leaf node has no children");
- }
- if (!ctx->IncludesLeaf(field.column_index)) {
- *out = nullptr;
- return Status::OK();
- }
- std::unique_ptr<FileColumnIterator> input(
- ctx->iterator_factory(field.column_index, ctx->reader));
- out->reset(new LeafReader(ctx, arrow_field, std::move(input), field.level_info));
- } else if (type_id == ::arrow::Type::LIST || type_id == ::arrow::Type::MAP ||
- type_id == ::arrow::Type::FIXED_SIZE_LIST ||
- type_id == ::arrow::Type::LARGE_LIST) {
- auto list_field = arrow_field;
- auto child = &field.children[0];
- std::unique_ptr<ColumnReaderImpl> child_reader;
- RETURN_NOT_OK(GetReader(*child, ctx, &child_reader));
- if (child_reader == nullptr) {
- *out = nullptr;
- return Status::OK();
- }
- if (type_id == ::arrow::Type::LIST ||
- type_id == ::arrow::Type::MAP) { // Map can be reconstructed as list of structs.
- if (type_id == ::arrow::Type::MAP &&
- child_reader->field()->type()->num_fields() != 2) {
- // This case applies if either key or value is filtered.
- list_field = list_field->WithType(::arrow::list(child_reader->field()));
- }
- out->reset(new ListReader<int32_t>(ctx, list_field, field.level_info,
- std::move(child_reader)));
- } else if (type_id == ::arrow::Type::LARGE_LIST) {
- out->reset(new ListReader<int64_t>(ctx, list_field, field.level_info,
- std::move(child_reader)));
-
- } else if (type_id == ::arrow::Type::FIXED_SIZE_LIST) {
- out->reset(new FixedSizeListReader(ctx, list_field, field.level_info,
- std::move(child_reader)));
- } else {
- return Status::UnknownError("Unknown list type: ", field.field->ToString());
- }
- } else if (type_id == ::arrow::Type::STRUCT) {
- std::vector<std::shared_ptr<Field>> child_fields;
- std::vector<std::unique_ptr<ColumnReaderImpl>> child_readers;
- for (const auto& child : field.children) {
- std::unique_ptr<ColumnReaderImpl> child_reader;
- RETURN_NOT_OK(GetReader(child, ctx, &child_reader));
- if (!child_reader) {
- // If all children were pruned, then we do not try to read this field
- continue;
- }
- child_fields.push_back(child.field);
- child_readers.emplace_back(std::move(child_reader));
- }
- if (child_fields.size() == 0) {
- *out = nullptr;
- return Status::OK();
- }
- auto filtered_field =
- ::arrow::field(arrow_field->name(), ::arrow::struct_(child_fields),
- arrow_field->nullable(), arrow_field->metadata());
- out->reset(new StructReader(ctx, filtered_field, field.level_info,
- std::move(child_readers)));
- } else {
- return Status::Invalid("Unsupported nested type: ", arrow_field->ToString());
- }
- return Status::OK();
-
- END_PARQUET_CATCH_EXCEPTIONS
-}
-
-Status GetReader(const SchemaField& field, const std::shared_ptr<ReaderContext>& ctx,
- std::unique_ptr<ColumnReaderImpl>* out) {
- return GetReader(field, field.field, ctx, out);
-}
-
-} // namespace
-
-Status FileReaderImpl::GetRecordBatchReader(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices,
- std::unique_ptr<RecordBatchReader>* out) {
- RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
-
- if (reader_properties_.pre_buffer()) {
- // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- reader_->PreBuffer(row_groups, column_indices, reader_properties_.io_context(),
- reader_properties_.cache_options());
- END_PARQUET_CATCH_EXCEPTIONS
- }
-
- std::vector<std::shared_ptr<ColumnReaderImpl>> readers;
- std::shared_ptr<::arrow::Schema> batch_schema;
- RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &batch_schema));
-
- if (readers.empty()) {
- // Just generate all batches right now; they're cheap since they have no columns.
- int64_t batch_size = properties().batch_size();
- auto max_sized_batch =
- ::arrow::RecordBatch::Make(batch_schema, batch_size, ::arrow::ArrayVector{});
-
- ::arrow::RecordBatchVector batches;
-
- for (int row_group : row_groups) {
- int64_t num_rows = parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
-
- batches.insert(batches.end(), num_rows / batch_size, max_sized_batch);
-
- if (int64_t trailing_rows = num_rows % batch_size) {
- batches.push_back(max_sized_batch->Slice(0, trailing_rows));
- }
- }
-
- *out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
- ::arrow::MakeVectorIterator(std::move(batches)), std::move(batch_schema));
-
- return Status::OK();
- }
-
- int64_t num_rows = 0;
- for (int row_group : row_groups) {
- num_rows += parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
- }
-
- using ::arrow::RecordBatchIterator;
-
- // NB: This lambda will be invoked outside the scope of this call to
- // `GetRecordBatchReader()`, so it must capture `readers` and `batch_schema` by value.
- // `this` is a non-owning pointer so we are relying on the parent FileReader outliving
- // this RecordBatchReader.
- ::arrow::Iterator<RecordBatchIterator> batches = ::arrow::MakeFunctionIterator(
- [readers, batch_schema, num_rows,
- this]() mutable -> ::arrow::Result<RecordBatchIterator> {
- ::arrow::ChunkedArrayVector columns(readers.size());
-
- // don't reserve more rows than necessary
- int64_t batch_size = std::min(properties().batch_size(), num_rows);
- num_rows -= batch_size;
-
- RETURN_NOT_OK(::arrow::internal::OptionalParallelFor(
- reader_properties_.use_threads(), static_cast<int>(readers.size()),
- [&](int i) { return readers[i]->NextBatch(batch_size, &columns[i]); }));
-
- for (const auto& column : columns) {
- if (column == nullptr || column->length() == 0) {
- return ::arrow::IterationTraits<RecordBatchIterator>::End();
- }
- }
-
- auto table = ::arrow::Table::Make(batch_schema, std::move(columns));
- auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table);
-
- // NB: explicitly preserve table so that table_reader doesn't outlive it
- return ::arrow::MakeFunctionIterator(
- [table, table_reader] { return table_reader->Next(); });
- });
-
- *out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
- ::arrow::MakeFlattenIterator(std::move(batches)), std::move(batch_schema));
-
- return Status::OK();
-}
-
-/// Given a file reader and a list of row groups, this is a generator of record
-/// batch generators (where each sub-generator is the contents of a single row group).
-class RowGroupGenerator {
- public:
- using RecordBatchGenerator =
- ::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>;
-
- explicit RowGroupGenerator(std::shared_ptr<FileReaderImpl> arrow_reader,
- ::arrow::internal::Executor* cpu_executor,
- std::vector<int> row_groups, std::vector<int> column_indices)
- : arrow_reader_(std::move(arrow_reader)),
- cpu_executor_(cpu_executor),
- row_groups_(std::move(row_groups)),
- column_indices_(std::move(column_indices)),
- index_(0) {}
-
- ::arrow::Future<RecordBatchGenerator> operator()() {
- if (index_ >= row_groups_.size()) {
- return ::arrow::AsyncGeneratorEnd<RecordBatchGenerator>();
- }
- int row_group = row_groups_[index_++];
- std::vector<int> column_indices = column_indices_;
- auto reader = arrow_reader_;
- if (!reader->properties().pre_buffer()) {
- return SubmitRead(cpu_executor_, reader, row_group, column_indices);
- }
- auto ready = reader->parquet_reader()->WhenBuffered({row_group}, column_indices);
- if (cpu_executor_) ready = cpu_executor_->TransferAlways(ready);
- return ready.Then([=]() -> ::arrow::Future<RecordBatchGenerator> {
- return ReadOneRowGroup(cpu_executor_, reader, row_group, column_indices);
- });
- }
-
- private:
- // Synchronous fallback for when pre-buffer isn't enabled.
- //
- // Making the Parquet reader truly asynchronous requires heavy refactoring, so the
- // generator piggybacks on ReadRangeCache. The lazy ReadRangeCache can be used for
- // async I/O without forcing readahead.
- static ::arrow::Future<RecordBatchGenerator> SubmitRead(
- ::arrow::internal::Executor* cpu_executor, std::shared_ptr<FileReaderImpl> self,
- const int row_group, const std::vector<int>& column_indices) {
- if (!cpu_executor) {
- return ReadOneRowGroup(cpu_executor, self, row_group, column_indices);
- }
- // If we have an executor, then force transfer (even if I/O was complete)
- return ::arrow::DeferNotOk(cpu_executor->Submit(ReadOneRowGroup, cpu_executor, self,
- row_group, column_indices));
- }
-
- static ::arrow::Future<RecordBatchGenerator> ReadOneRowGroup(
- ::arrow::internal::Executor* cpu_executor, std::shared_ptr<FileReaderImpl> self,
- const int row_group, const std::vector<int>& column_indices) {
-    // Skips bounds checks/pre-buffering, since we've done that already
- return self->DecodeRowGroups(self, {row_group}, column_indices, cpu_executor)
- .Then([](const std::shared_ptr<Table>& table)
- -> ::arrow::Result<RecordBatchGenerator> {
- ::arrow::TableBatchReader table_reader(*table);
- ::arrow::RecordBatchVector batches;
- RETURN_NOT_OK(table_reader.ReadAll(&batches));
- return ::arrow::MakeVectorGenerator(std::move(batches));
- });
- }
-
- std::shared_ptr<FileReaderImpl> arrow_reader_;
- ::arrow::internal::Executor* cpu_executor_;
- std::vector<int> row_groups_;
- std::vector<int> column_indices_;
- size_t index_;
-};
-
-::arrow::Result<::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>>
-FileReaderImpl::GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
- const std::vector<int> row_group_indices,
- const std::vector<int> column_indices,
- ::arrow::internal::Executor* cpu_executor) {
- RETURN_NOT_OK(BoundsCheck(row_group_indices, column_indices));
- if (reader_properties_.pre_buffer()) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- reader_->PreBuffer(row_group_indices, column_indices, reader_properties_.io_context(),
- reader_properties_.cache_options());
- END_PARQUET_CATCH_EXCEPTIONS
- }
- ::arrow::AsyncGenerator<RowGroupGenerator::RecordBatchGenerator> row_group_generator =
- RowGroupGenerator(::arrow::internal::checked_pointer_cast<FileReaderImpl>(reader),
- cpu_executor, row_group_indices, column_indices);
- return ::arrow::MakeConcatenatedGenerator(std::move(row_group_generator));
-}
-
-Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_factory,
- std::unique_ptr<ColumnReader>* out) {
- RETURN_NOT_OK(BoundsCheckColumn(i));
- auto ctx = std::make_shared<ReaderContext>();
- ctx->reader = reader_.get();
- ctx->pool = pool_;
- ctx->iterator_factory = iterator_factory;
- ctx->filter_leaves = false;
- std::unique_ptr<ColumnReaderImpl> result;
- RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result));
- out->reset(result.release());
- return Status::OK();
-}
-
-Status FileReaderImpl::ReadRowGroups(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices,
- std::shared_ptr<Table>* out) {
- RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
-
- // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
- if (reader_properties_.pre_buffer()) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- parquet_reader()->PreBuffer(row_groups, column_indices,
- reader_properties_.io_context(),
- reader_properties_.cache_options());
- END_PARQUET_CATCH_EXCEPTIONS
- }
-
- auto fut = DecodeRowGroups(/*self=*/nullptr, row_groups, column_indices,
- /*cpu_executor=*/nullptr);
- ARROW_ASSIGN_OR_RAISE(*out, fut.MoveResult());
- return Status::OK();
-}
-
-Future<std::shared_ptr<Table>> FileReaderImpl::DecodeRowGroups(
- std::shared_ptr<FileReaderImpl> self, const std::vector<int>& row_groups,
- const std::vector<int>& column_indices, ::arrow::internal::Executor* cpu_executor) {
-  // `self` is used solely to keep `this` alive in an async context - but since
-  // this method is also used in a sync context, we use `this` rather than `self`.
- std::vector<std::shared_ptr<ColumnReaderImpl>> readers;
- std::shared_ptr<::arrow::Schema> result_schema;
- RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &result_schema));
- // OptionalParallelForAsync requires an executor
- if (!cpu_executor) cpu_executor = ::arrow::internal::GetCpuThreadPool();
-
- auto read_column = [row_groups, self, this](size_t i,
- std::shared_ptr<ColumnReaderImpl> reader)
- -> ::arrow::Result<std::shared_ptr<::arrow::ChunkedArray>> {
- std::shared_ptr<::arrow::ChunkedArray> column;
- RETURN_NOT_OK(ReadColumn(static_cast<int>(i), row_groups, reader.get(), &column));
- return column;
- };
- auto make_table = [result_schema, row_groups, self,
- this](const ::arrow::ChunkedArrayVector& columns)
- -> ::arrow::Result<std::shared_ptr<Table>> {
- int64_t num_rows = 0;
- if (!columns.empty()) {
- num_rows = columns[0]->length();
- } else {
- for (int i : row_groups) {
- num_rows += parquet_reader()->metadata()->RowGroup(i)->num_rows();
- }
- }
- auto table = Table::Make(std::move(result_schema), columns, num_rows);
- RETURN_NOT_OK(table->Validate());
- return table;
- };
- return ::arrow::internal::OptionalParallelForAsync(reader_properties_.use_threads(),
- std::move(readers), read_column,
- cpu_executor)
- .Then(std::move(make_table));
-}
-
-std::shared_ptr<RowGroupReader> FileReaderImpl::RowGroup(int row_group_index) {
- return std::make_shared<RowGroupReaderImpl>(this, row_group_index);
-}
-
-// ----------------------------------------------------------------------
-// Public factory functions
-
-Status FileReader::GetRecordBatchReader(const std::vector<int>& row_group_indices,
- std::shared_ptr<RecordBatchReader>* out) {
- std::unique_ptr<RecordBatchReader> tmp;
- ARROW_RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, &tmp));
- out->reset(tmp.release());
- return Status::OK();
-}
-
-Status FileReader::GetRecordBatchReader(const std::vector<int>& row_group_indices,
- const std::vector<int>& column_indices,
- std::shared_ptr<RecordBatchReader>* out) {
- std::unique_ptr<RecordBatchReader> tmp;
- ARROW_RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, column_indices, &tmp));
- out->reset(tmp.release());
- return Status::OK();
-}
-
-Status FileReader::Make(::arrow::MemoryPool* pool,
- std::unique_ptr<ParquetFileReader> reader,
- const ArrowReaderProperties& properties,
- std::unique_ptr<FileReader>* out) {
- out->reset(new FileReaderImpl(pool, std::move(reader), properties));
- return static_cast<FileReaderImpl*>(out->get())->Init();
-}
-
-Status FileReader::Make(::arrow::MemoryPool* pool,
- std::unique_ptr<ParquetFileReader> reader,
- std::unique_ptr<FileReader>* out) {
- return Make(pool, std::move(reader), default_arrow_reader_properties(), out);
-}
-
-FileReaderBuilder::FileReaderBuilder()
- : pool_(::arrow::default_memory_pool()),
- properties_(default_arrow_reader_properties()) {}
-
-Status FileReaderBuilder::Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
- const ReaderProperties& properties,
- std::shared_ptr<FileMetaData> metadata) {
- PARQUET_CATCH_NOT_OK(raw_reader_ = ParquetReader::Open(std::move(file), properties,
- std::move(metadata)));
- return Status::OK();
-}
-
-FileReaderBuilder* FileReaderBuilder::memory_pool(::arrow::MemoryPool* pool) {
- pool_ = pool;
- return this;
-}
-
-FileReaderBuilder* FileReaderBuilder::properties(
- const ArrowReaderProperties& arg_properties) {
- properties_ = arg_properties;
- return this;
-}
-
-Status FileReaderBuilder::Build(std::unique_ptr<FileReader>* out) {
- return FileReader::Make(pool_, std::move(raw_reader_), properties_, out);
-}
-
-Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool,
- std::unique_ptr<FileReader>* reader) {
- FileReaderBuilder builder;
- RETURN_NOT_OK(builder.Open(std::move(file)));
- return builder.memory_pool(pool)->Build(reader);
-}
-
-namespace internal {
-
-Status FuzzReader(std::unique_ptr<FileReader> reader) {
- auto st = Status::OK();
- for (int i = 0; i < reader->num_row_groups(); ++i) {
- std::shared_ptr<Table> table;
- auto row_group_status = reader->ReadRowGroup(i, &table);
- if (row_group_status.ok()) {
- row_group_status &= table->ValidateFull();
- }
- st &= row_group_status;
- }
- return st;
-}
-
-Status FuzzReader(const uint8_t* data, int64_t size) {
- auto buffer = std::make_shared<::arrow::Buffer>(data, size);
- auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
- FileReaderBuilder builder;
- RETURN_NOT_OK(builder.Open(std::move(file)));
-
- std::unique_ptr<FileReader> reader;
- RETURN_NOT_OK(builder.Build(&reader));
- return FuzzReader(std::move(reader));
-}
-
-} // namespace internal
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/reader.h"
+
+#include <algorithm>
+#include <cstring>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/extension_type.h"
+#include "arrow/io/memory.h"
+#include "arrow/record_batch.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/util/async_generator.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/future.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/util/parallel.h"
+#include "arrow/util/range.h"
+#include "parquet/arrow/reader_internal.h"
+#include "parquet/column_reader.h"
+#include "parquet/exception.h"
+#include "parquet/file_reader.h"
+#include "parquet/metadata.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+
+using arrow::Array;
+using arrow::ArrayData;
+using arrow::BooleanArray;
+using arrow::ChunkedArray;
+using arrow::DataType;
+using arrow::ExtensionType;
+using arrow::Field;
+using arrow::Future;
+using arrow::Int32Array;
+using arrow::ListArray;
+using arrow::MemoryPool;
+using arrow::RecordBatchReader;
+using arrow::ResizableBuffer;
+using arrow::Status;
+using arrow::StructArray;
+using arrow::Table;
+using arrow::TimestampArray;
+
+using arrow::internal::checked_cast;
+using arrow::internal::Iota;
+
+// Help reduce verbosity
+using ParquetReader = parquet::ParquetFileReader;
+
+using parquet::internal::RecordReader;
+
+namespace BitUtil = arrow::BitUtil;
+
+namespace parquet {
+namespace arrow {
+namespace {
+
+::arrow::Result<std::shared_ptr<ArrayData>> ChunksToSingle(const ChunkedArray& chunked) {
+ switch (chunked.num_chunks()) {
+ case 0: {
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> array,
+ ::arrow::MakeArrayOfNull(chunked.type(), 0));
+ return array->data();
+ }
+ case 1:
+ return chunked.chunk(0)->data();
+ default:
+      // ARROW-3762(wesm): If the item reader yields a chunked array, we reject
+      // it, as this is not yet implemented.
+ return Status::NotImplemented(
+ "Nested data conversions not implemented for chunked array outputs");
+ }
+}
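+
+// A minimal sketch of the contract above (hedged; `chunked` is a hypothetical
+// input): zero chunks yield an empty ArrayData of the same type, exactly one
+// chunk is passed through, and multiple chunks are rejected:
+//
+//   ::arrow::ChunkedArray chunked(::arrow::ArrayVector{}, ::arrow::int32());
+//   auto maybe_data = ChunksToSingle(chunked);  // -> ArrayData with length 0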
+
+} // namespace
+
+class ColumnReaderImpl : public ColumnReader {
+ public:
+ virtual Status GetDefLevels(const int16_t** data, int64_t* length) = 0;
+ virtual Status GetRepLevels(const int16_t** data, int64_t* length) = 0;
+ virtual const std::shared_ptr<Field> field() = 0;
+
+ ::arrow::Status NextBatch(int64_t batch_size,
+ std::shared_ptr<::arrow::ChunkedArray>* out) final {
+ RETURN_NOT_OK(LoadBatch(batch_size));
+ RETURN_NOT_OK(BuildArray(batch_size, out));
+ for (int x = 0; x < (*out)->num_chunks(); x++) {
+ RETURN_NOT_OK((*out)->chunk(x)->Validate());
+ }
+ return Status::OK();
+ }
+
+ virtual ::arrow::Status LoadBatch(int64_t num_records) = 0;
+
+ virtual ::arrow::Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+ virtual bool IsOrHasRepeatedChild() const = 0;
+};
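+
+// A hedged usage sketch for the interface above: callers typically drive it
+// via NextBatch(), which is LoadBatch() followed by BuildArray() plus
+// per-chunk validation (`reader` here is a hypothetical ColumnReaderImpl*):
+//
+//   std::shared_ptr<::arrow::ChunkedArray> chunked;
+//   RETURN_NOT_OK(reader->NextBatch(/*batch_size=*/4096, &chunked));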
+
+namespace {
+
+std::shared_ptr<std::unordered_set<int>> VectorToSharedSet(
+ const std::vector<int>& values) {
+ std::shared_ptr<std::unordered_set<int>> result(new std::unordered_set<int>());
+ result->insert(values.begin(), values.end());
+ return result;
+}
+
+// Forward declaration
+Status GetReader(const SchemaField& field, const std::shared_ptr<ReaderContext>& context,
+ std::unique_ptr<ColumnReaderImpl>* out);
+
+// ----------------------------------------------------------------------
+// FileReaderImpl forward declaration
+
+class FileReaderImpl : public FileReader {
+ public:
+ FileReaderImpl(MemoryPool* pool, std::unique_ptr<ParquetFileReader> reader,
+ ArrowReaderProperties properties)
+ : pool_(pool),
+ reader_(std::move(reader)),
+ reader_properties_(std::move(properties)) {}
+
+ Status Init() {
+ return SchemaManifest::Make(reader_->metadata()->schema(),
+ reader_->metadata()->key_value_metadata(),
+ reader_properties_, &manifest_);
+ }
+
+ FileColumnIteratorFactory SomeRowGroupsFactory(std::vector<int> row_groups) {
+ return [row_groups](int i, ParquetFileReader* reader) {
+ return new FileColumnIterator(i, reader, row_groups);
+ };
+ }
+
+ FileColumnIteratorFactory AllRowGroupsFactory() {
+ return SomeRowGroupsFactory(Iota(reader_->metadata()->num_row_groups()));
+ }
+
+ Status BoundsCheckColumn(int column) {
+ if (column < 0 || column >= this->num_columns()) {
+ return Status::Invalid("Column index out of bounds (got ", column,
+ ", should be "
+ "between 0 and ",
+ this->num_columns() - 1, ")");
+ }
+ return Status::OK();
+ }
+
+ Status BoundsCheckRowGroup(int row_group) {
+ // row group indices check
+ if (row_group < 0 || row_group >= num_row_groups()) {
+ return Status::Invalid("Some index in row_group_indices is ", row_group,
+ ", which is either < 0 or >= num_row_groups(",
+ num_row_groups(), ")");
+ }
+ return Status::OK();
+ }
+
+ Status BoundsCheck(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices) {
+ for (int i : row_groups) {
+ RETURN_NOT_OK(BoundsCheckRowGroup(i));
+ }
+ for (int i : column_indices) {
+ RETURN_NOT_OK(BoundsCheckColumn(i));
+ }
+ return Status::OK();
+ }
+
+ std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) override;
+
+ Status ReadTable(const std::vector<int>& indices,
+ std::shared_ptr<Table>* out) override {
+ return ReadRowGroups(Iota(reader_->metadata()->num_row_groups()), indices, out);
+ }
+
+ Status GetFieldReader(int i,
+ const std::shared_ptr<std::unordered_set<int>>& included_leaves,
+ const std::vector<int>& row_groups,
+ std::unique_ptr<ColumnReaderImpl>* out) {
+ auto ctx = std::make_shared<ReaderContext>();
+ ctx->reader = reader_.get();
+ ctx->pool = pool_;
+ ctx->iterator_factory = SomeRowGroupsFactory(row_groups);
+ ctx->filter_leaves = true;
+ ctx->included_leaves = included_leaves;
+ return GetReader(manifest_.schema_fields[i], ctx, out);
+ }
+
+ Status GetFieldReaders(const std::vector<int>& column_indices,
+ const std::vector<int>& row_groups,
+ std::vector<std::shared_ptr<ColumnReaderImpl>>* out,
+ std::shared_ptr<::arrow::Schema>* out_schema) {
+ // We only need to read schema fields which have columns indicated
+ // in the indices vector
+ ARROW_ASSIGN_OR_RAISE(std::vector<int> field_indices,
+ manifest_.GetFieldIndices(column_indices));
+
+ auto included_leaves = VectorToSharedSet(column_indices);
+
+ out->resize(field_indices.size());
+ ::arrow::FieldVector out_fields(field_indices.size());
+ for (size_t i = 0; i < out->size(); ++i) {
+ std::unique_ptr<ColumnReaderImpl> reader;
+ RETURN_NOT_OK(
+ GetFieldReader(field_indices[i], included_leaves, row_groups, &reader));
+
+ out_fields[i] = reader->field();
+ out->at(i) = std::move(reader);
+ }
+
+ *out_schema = ::arrow::schema(std::move(out_fields), manifest_.schema_metadata);
+ return Status::OK();
+ }
+
+ Status GetColumn(int i, FileColumnIteratorFactory iterator_factory,
+ std::unique_ptr<ColumnReader>* out);
+
+ Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) override {
+ return GetColumn(i, AllRowGroupsFactory(), out);
+ }
+
+ Status GetSchema(std::shared_ptr<::arrow::Schema>* out) override {
+ return FromParquetSchema(reader_->metadata()->schema(), reader_properties_,
+ reader_->metadata()->key_value_metadata(), out);
+ }
+
+ Status ReadSchemaField(int i, std::shared_ptr<ChunkedArray>* out) override {
+ auto included_leaves = VectorToSharedSet(Iota(reader_->metadata()->num_columns()));
+ std::vector<int> row_groups = Iota(reader_->metadata()->num_row_groups());
+
+ std::unique_ptr<ColumnReaderImpl> reader;
+ RETURN_NOT_OK(GetFieldReader(i, included_leaves, row_groups, &reader));
+
+ return ReadColumn(i, row_groups, reader.get(), out);
+ }
+
+ Status ReadColumn(int i, const std::vector<int>& row_groups, ColumnReader* reader,
+ std::shared_ptr<ChunkedArray>* out) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ // TODO(wesm): This calculation doesn't make much sense when we have repeated
+ // schema nodes
+ int64_t records_to_read = 0;
+ for (auto row_group : row_groups) {
+ // Can throw exception
+ records_to_read +=
+ reader_->metadata()->RowGroup(row_group)->ColumnChunk(i)->num_values();
+ }
+ return reader->NextBatch(records_to_read, out);
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ Status ReadColumn(int i, const std::vector<int>& row_groups,
+ std::shared_ptr<ChunkedArray>* out) {
+ std::unique_ptr<ColumnReader> flat_column_reader;
+ RETURN_NOT_OK(GetColumn(i, SomeRowGroupsFactory(row_groups), &flat_column_reader));
+ return ReadColumn(i, row_groups, flat_column_reader.get(), out);
+ }
+
+ Status ReadColumn(int i, std::shared_ptr<ChunkedArray>* out) override {
+ return ReadColumn(i, Iota(reader_->metadata()->num_row_groups()), out);
+ }
+
+ Status ReadTable(std::shared_ptr<Table>* table) override {
+ return ReadTable(Iota(reader_->metadata()->num_columns()), table);
+ }
+
+ Status ReadRowGroups(const std::vector<int>& row_groups,
+ const std::vector<int>& indices,
+ std::shared_ptr<Table>* table) override;
+
+ // Helper method used by ReadRowGroups - read the given row groups/columns, skipping
+ // bounds checks and pre-buffering. Takes a shared_ptr to self to keep the reader
+ // alive in async contexts.
+ Future<std::shared_ptr<Table>> DecodeRowGroups(
+ std::shared_ptr<FileReaderImpl> self, const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices, ::arrow::internal::Executor* cpu_executor);
+
+ Status ReadRowGroups(const std::vector<int>& row_groups,
+ std::shared_ptr<Table>* table) override {
+ return ReadRowGroups(row_groups, Iota(reader_->metadata()->num_columns()), table);
+ }
+
+ Status ReadRowGroup(int row_group_index, const std::vector<int>& column_indices,
+ std::shared_ptr<Table>* out) override {
+ return ReadRowGroups({row_group_index}, column_indices, out);
+ }
+
+ Status ReadRowGroup(int i, std::shared_ptr<Table>* table) override {
+ return ReadRowGroup(i, Iota(reader_->metadata()->num_columns()), table);
+ }
+
+ Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ const std::vector<int>& column_indices,
+ std::unique_ptr<RecordBatchReader>* out) override;
+
+ Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ std::unique_ptr<RecordBatchReader>* out) override {
+ return GetRecordBatchReader(row_group_indices,
+ Iota(reader_->metadata()->num_columns()), out);
+ }
+
+ ::arrow::Result<::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>>
+ GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
+ const std::vector<int> row_group_indices,
+ const std::vector<int> column_indices,
+ ::arrow::internal::Executor* cpu_executor) override;
+
+ int num_columns() const { return reader_->metadata()->num_columns(); }
+
+ ParquetFileReader* parquet_reader() const override { return reader_.get(); }
+
+ int num_row_groups() const override { return reader_->metadata()->num_row_groups(); }
+
+ void set_use_threads(bool use_threads) override {
+ reader_properties_.set_use_threads(use_threads);
+ }
+
+ void set_batch_size(int64_t batch_size) override {
+ reader_properties_.set_batch_size(batch_size);
+ }
+
+ const ArrowReaderProperties& properties() const override { return reader_properties_; }
+
+ const SchemaManifest& manifest() const override { return manifest_; }
+
+ Status ScanContents(std::vector<int> columns, const int32_t column_batch_size,
+ int64_t* num_rows) override {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ *num_rows = ScanFileContents(columns, column_batch_size, reader_.get());
+ return Status::OK();
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ MemoryPool* pool_;
+ std::unique_ptr<ParquetFileReader> reader_;
+ ArrowReaderProperties reader_properties_;
+
+ SchemaManifest manifest_;
+};
+
+class RowGroupRecordBatchReader : public ::arrow::RecordBatchReader {
+ public:
+ RowGroupRecordBatchReader(::arrow::RecordBatchIterator batches,
+ std::shared_ptr<::arrow::Schema> schema)
+ : batches_(std::move(batches)), schema_(std::move(schema)) {}
+
+ ~RowGroupRecordBatchReader() override {}
+
+ Status ReadNext(std::shared_ptr<::arrow::RecordBatch>* out) override {
+ return batches_.Next().Value(out);
+ }
+
+ std::shared_ptr<::arrow::Schema> schema() const override { return schema_; }
+
+ private:
+ ::arrow::Iterator<std::shared_ptr<::arrow::RecordBatch>> batches_;
+ std::shared_ptr<::arrow::Schema> schema_;
+};
+
+class ColumnChunkReaderImpl : public ColumnChunkReader {
+ public:
+ ColumnChunkReaderImpl(FileReaderImpl* impl, int row_group_index, int column_index)
+ : impl_(impl), column_index_(column_index), row_group_index_(row_group_index) {}
+
+ Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) override {
+ return impl_->ReadColumn(column_index_, {row_group_index_}, out);
+ }
+
+ private:
+ FileReaderImpl* impl_;
+ int column_index_;
+ int row_group_index_;
+};
+
+class RowGroupReaderImpl : public RowGroupReader {
+ public:
+ RowGroupReaderImpl(FileReaderImpl* impl, int row_group_index)
+ : impl_(impl), row_group_index_(row_group_index) {}
+
+ std::shared_ptr<ColumnChunkReader> Column(int column_index) override {
+ return std::shared_ptr<ColumnChunkReader>(
+ new ColumnChunkReaderImpl(impl_, row_group_index_, column_index));
+ }
+
+ Status ReadTable(const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) override {
+ return impl_->ReadRowGroup(row_group_index_, column_indices, out);
+ }
+
+ Status ReadTable(std::shared_ptr<::arrow::Table>* out) override {
+ return impl_->ReadRowGroup(row_group_index_, out);
+ }
+
+ private:
+ FileReaderImpl* impl_;
+ int row_group_index_;
+};
+
+// ----------------------------------------------------------------------
+// Column reader implementations
+
+// Leaf reader is for primitive arrays and primitive children of nested arrays
+class LeafReader : public ColumnReaderImpl {
+ public:
+ LeafReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
+ std::unique_ptr<FileColumnIterator> input,
+ ::parquet::internal::LevelInfo leaf_info)
+ : ctx_(std::move(ctx)),
+ field_(std::move(field)),
+ input_(std::move(input)),
+ descr_(input_->descr()) {
+ record_reader_ = RecordReader::Make(
+ descr_, leaf_info, ctx_->pool, field_->type()->id() == ::arrow::Type::DICTIONARY);
+ NextRowGroup();
+ }
+
+ Status GetDefLevels(const int16_t** data, int64_t* length) final {
+ *data = record_reader_->def_levels();
+ *length = record_reader_->levels_position();
+ return Status::OK();
+ }
+
+ Status GetRepLevels(const int16_t** data, int64_t* length) final {
+ *data = record_reader_->rep_levels();
+ *length = record_reader_->levels_position();
+ return Status::OK();
+ }
+
+ bool IsOrHasRepeatedChild() const final { return false; }
+
+ Status LoadBatch(int64_t records_to_read) final {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ out_ = nullptr;
+ record_reader_->Reset();
+ // Pre-allocation gives much better performance for flat columns
+ record_reader_->Reserve(records_to_read);
+ while (records_to_read > 0) {
+ if (!record_reader_->HasMoreData()) {
+ break;
+ }
+ int64_t records_read = record_reader_->ReadRecords(records_to_read);
+ records_to_read -= records_read;
+ if (records_read == 0) {
+ NextRowGroup();
+ }
+ }
+ RETURN_NOT_OK(TransferColumnData(record_reader_.get(), field_->type(), descr_,
+ ctx_->pool, &out_));
+ return Status::OK();
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ ::arrow::Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<::arrow::ChunkedArray>* out) final {
+ *out = out_;
+ return Status::OK();
+ }
+
+ const std::shared_ptr<Field> field() override { return field_; }
+
+ private:
+ std::shared_ptr<ChunkedArray> out_;
+ void NextRowGroup() {
+ std::unique_ptr<PageReader> page_reader = input_->NextChunk();
+ record_reader_->SetPageReader(std::move(page_reader));
+ }
+
+ std::shared_ptr<ReaderContext> ctx_;
+ std::shared_ptr<Field> field_;
+ std::unique_ptr<FileColumnIterator> input_;
+ const ColumnDescriptor* descr_;
+ std::shared_ptr<RecordReader> record_reader_;
+};
+
+// Column reader for extension arrays
+class ExtensionReader : public ColumnReaderImpl {
+ public:
+ ExtensionReader(std::shared_ptr<Field> field,
+ std::unique_ptr<ColumnReaderImpl> storage_reader)
+ : field_(std::move(field)), storage_reader_(std::move(storage_reader)) {}
+
+ Status GetDefLevels(const int16_t** data, int64_t* length) override {
+ return storage_reader_->GetDefLevels(data, length);
+ }
+
+ Status GetRepLevels(const int16_t** data, int64_t* length) override {
+ return storage_reader_->GetRepLevels(data, length);
+ }
+
+ Status LoadBatch(int64_t number_of_records) final {
+ return storage_reader_->LoadBatch(number_of_records);
+ }
+
+ Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<ChunkedArray>* out) override {
+ std::shared_ptr<ChunkedArray> storage;
+ RETURN_NOT_OK(storage_reader_->BuildArray(length_upper_bound, &storage));
+ *out = ExtensionType::WrapArray(field_->type(), storage);
+ return Status::OK();
+ }
+
+ bool IsOrHasRepeatedChild() const final {
+ return storage_reader_->IsOrHasRepeatedChild();
+ }
+
+ const std::shared_ptr<Field> field() override { return field_; }
+
+ private:
+ std::shared_ptr<Field> field_;
+ std::unique_ptr<ColumnReaderImpl> storage_reader_;
+};
+
+template <typename IndexType>
+class ListReader : public ColumnReaderImpl {
+ public:
+ ListReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
+ ::parquet::internal::LevelInfo level_info,
+ std::unique_ptr<ColumnReaderImpl> child_reader)
+ : ctx_(std::move(ctx)),
+ field_(std::move(field)),
+ level_info_(level_info),
+ item_reader_(std::move(child_reader)) {}
+
+ Status GetDefLevels(const int16_t** data, int64_t* length) override {
+ return item_reader_->GetDefLevels(data, length);
+ }
+
+ Status GetRepLevels(const int16_t** data, int64_t* length) override {
+ return item_reader_->GetRepLevels(data, length);
+ }
+
+ bool IsOrHasRepeatedChild() const final { return true; }
+
+ Status LoadBatch(int64_t number_of_records) final {
+ return item_reader_->LoadBatch(number_of_records);
+ }
+
+ virtual ::arrow::Result<std::shared_ptr<ChunkedArray>> AssembleArray(
+ std::shared_ptr<ArrayData> data) {
+ if (field_->type()->id() == ::arrow::Type::MAP) {
+ // Error out if data is not map-compliant instead of aborting in MakeArray below
+ RETURN_NOT_OK(::arrow::MapArray::ValidateChildData(data->child_data));
+ }
+ std::shared_ptr<Array> result = ::arrow::MakeArray(data);
+ return std::make_shared<ChunkedArray>(result);
+ }
+
+ Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<ChunkedArray>* out) override {
+ const int16_t* def_levels;
+ const int16_t* rep_levels;
+ int64_t num_levels;
+ RETURN_NOT_OK(item_reader_->GetDefLevels(&def_levels, &num_levels));
+ RETURN_NOT_OK(item_reader_->GetRepLevels(&rep_levels, &num_levels));
+
+ std::shared_ptr<ResizableBuffer> validity_buffer;
+ ::parquet::internal::ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = length_upper_bound;
+ if (field_->nullable()) {
+ ARROW_ASSIGN_OR_RAISE(
+ validity_buffer,
+ AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
+ validity_io.valid_bits = validity_buffer->mutable_data();
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ std::shared_ptr<ResizableBuffer> offsets_buffer,
+ AllocateResizableBuffer(
+ sizeof(IndexType) * std::max(int64_t{1}, length_upper_bound + 1),
+ ctx_->pool));
+  // Ensure zero initialization in case we have reached a zero-length list (and
+  // because the first entry is always zero).
+ IndexType* offset_data = reinterpret_cast<IndexType*>(offsets_buffer->mutable_data());
+ offset_data[0] = 0;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ ::parquet::internal::DefRepLevelsToList(def_levels, rep_levels, num_levels,
+ level_info_, &validity_io, offset_data);
+ END_PARQUET_CATCH_EXCEPTIONS
+
+ RETURN_NOT_OK(item_reader_->BuildArray(offset_data[validity_io.values_read], out));
+
+ // Resize to actual number of elements returned.
+ RETURN_NOT_OK(
+ offsets_buffer->Resize((validity_io.values_read + 1) * sizeof(IndexType)));
+ if (validity_buffer != nullptr) {
+ RETURN_NOT_OK(
+ validity_buffer->Resize(BitUtil::BytesForBits(validity_io.values_read)));
+ validity_buffer->ZeroPadding();
+ }
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> item_chunk, ChunksToSingle(**out));
+
+ std::vector<std::shared_ptr<Buffer>> buffers{
+ validity_io.null_count > 0 ? validity_buffer : nullptr, offsets_buffer};
+ auto data = std::make_shared<ArrayData>(
+ field_->type(),
+ /*length=*/validity_io.values_read, std::move(buffers),
+ std::vector<std::shared_ptr<ArrayData>>{item_chunk}, validity_io.null_count);
+
+ ARROW_ASSIGN_OR_RAISE(*out, AssembleArray(std::move(data)));
+ return Status::OK();
+ }
+
+ const std::shared_ptr<Field> field() override { return field_; }
+
+ private:
+ std::shared_ptr<ReaderContext> ctx_;
+ std::shared_ptr<Field> field_;
+ ::parquet::internal::LevelInfo level_info_;
+ std::unique_ptr<ColumnReaderImpl> item_reader_;
+};
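+
+// A worked example of the level decoding performed in ListReader::BuildArray,
+// assuming the usual three-level Parquet encoding of a nullable
+// list<int32 not null> column (def level 0 = null list, 1 = empty list,
+// 2 = element present; rep level 0 starts a new list, 1 continues one):
+//
+//   values      [[1, 2], [], null, [3]]
+//   def levels   2  2    1   0     2
+//   rep levels   0  1    0   0     0
+//   -> offsets  [0, 2, 2, 2, 3], validity bitmap 1 1 0 1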
+
+class PARQUET_NO_EXPORT FixedSizeListReader : public ListReader<int32_t> {
+ public:
+ FixedSizeListReader(std::shared_ptr<ReaderContext> ctx, std::shared_ptr<Field> field,
+ ::parquet::internal::LevelInfo level_info,
+ std::unique_ptr<ColumnReaderImpl> child_reader)
+ : ListReader(std::move(ctx), std::move(field), level_info,
+ std::move(child_reader)) {}
+ ::arrow::Result<std::shared_ptr<ChunkedArray>> AssembleArray(
+ std::shared_ptr<ArrayData> data) final {
+ DCHECK_EQ(data->buffers.size(), 2);
+ DCHECK_EQ(field()->type()->id(), ::arrow::Type::FIXED_SIZE_LIST);
+ const auto& type = checked_cast<::arrow::FixedSizeListType&>(*field()->type());
+ const int32_t* offsets = reinterpret_cast<const int32_t*>(data->buffers[1]->data());
+ for (int x = 1; x <= data->length; x++) {
+ int32_t size = offsets[x] - offsets[x - 1];
+ if (size != type.list_size()) {
+ return Status::Invalid("Expected all lists to be of size=", type.list_size(),
+ " but index ", x, " had size=", size);
+ }
+ }
+ data->buffers.resize(1);
+ std::shared_ptr<Array> result = ::arrow::MakeArray(data);
+ return std::make_shared<ChunkedArray>(result);
+ }
+};
+
+class PARQUET_NO_EXPORT StructReader : public ColumnReaderImpl {
+ public:
+ explicit StructReader(std::shared_ptr<ReaderContext> ctx,
+ std::shared_ptr<Field> filtered_field,
+ ::parquet::internal::LevelInfo level_info,
+ std::vector<std::unique_ptr<ColumnReaderImpl>> children)
+ : ctx_(std::move(ctx)),
+ filtered_field_(std::move(filtered_field)),
+ level_info_(level_info),
+ children_(std::move(children)) {
+    // There could be a mix of children: some might be repeated and some might
+    // not be. If possible, use one that isn't, since it is guaranteed to have
+    // the fewest levels needed to reconstruct a nullable bitmap.
+ auto result = std::find_if(children_.begin(), children_.end(),
+ [](const std::unique_ptr<ColumnReaderImpl>& child) {
+ return !child->IsOrHasRepeatedChild();
+ });
+ if (result != children_.end()) {
+ def_rep_level_child_ = result->get();
+ has_repeated_child_ = false;
+ } else if (!children_.empty()) {
+ def_rep_level_child_ = children_.front().get();
+ has_repeated_child_ = true;
+ }
+ }
+
+ bool IsOrHasRepeatedChild() const final { return has_repeated_child_; }
+
+ Status LoadBatch(int64_t records_to_read) override {
+ for (const std::unique_ptr<ColumnReaderImpl>& reader : children_) {
+ RETURN_NOT_OK(reader->LoadBatch(records_to_read));
+ }
+ return Status::OK();
+ }
+ Status BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<ChunkedArray>* out) override;
+ Status GetDefLevels(const int16_t** data, int64_t* length) override;
+ Status GetRepLevels(const int16_t** data, int64_t* length) override;
+ const std::shared_ptr<Field> field() override { return filtered_field_; }
+
+ private:
+ const std::shared_ptr<ReaderContext> ctx_;
+ const std::shared_ptr<Field> filtered_field_;
+ const ::parquet::internal::LevelInfo level_info_;
+ const std::vector<std::unique_ptr<ColumnReaderImpl>> children_;
+ ColumnReaderImpl* def_rep_level_child_ = nullptr;
+  // Initialized defensively in case this reader is constructed with no children.
+  bool has_repeated_child_ = false;
+};
+
+Status StructReader::GetDefLevels(const int16_t** data, int64_t* length) {
+ *data = nullptr;
+ if (children_.size() == 0) {
+ *length = 0;
+ return Status::Invalid("StructReader had no children");
+ }
+
+  // This method should only be called when this struct, or one of its parents,
+  // is optional/repeated, or when it has a repeated child; this means all
+  // children must have rep/def levels associated with them.
+ RETURN_NOT_OK(def_rep_level_child_->GetDefLevels(data, length));
+ return Status::OK();
+}
+
+Status StructReader::GetRepLevels(const int16_t** data, int64_t* length) {
+ *data = nullptr;
+ if (children_.size() == 0) {
+ *length = 0;
+ return Status::Invalid("StructReader had no childre");
+ }
+
+ // This method should only be called when this struct or one of its parents
+ // are optional/repeated or it has repeated child.
+ // Meaning all children must have rep/def levels associated
+ // with them.
+ RETURN_NOT_OK(def_rep_level_child_->GetRepLevels(data, length));
+ return Status::OK();
+}
+
+Status StructReader::BuildArray(int64_t length_upper_bound,
+ std::shared_ptr<ChunkedArray>* out) {
+ std::vector<std::shared_ptr<ArrayData>> children_array_data;
+ std::shared_ptr<ResizableBuffer> null_bitmap;
+
+ ::parquet::internal::ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = length_upper_bound;
+ // This simplifies accounting below.
+ validity_io.values_read = length_upper_bound;
+
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ const int16_t* def_levels;
+ const int16_t* rep_levels;
+ int64_t num_levels;
+
+ if (has_repeated_child_) {
+ ARROW_ASSIGN_OR_RAISE(
+ null_bitmap,
+ AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
+ validity_io.valid_bits = null_bitmap->mutable_data();
+ RETURN_NOT_OK(GetDefLevels(&def_levels, &num_levels));
+ RETURN_NOT_OK(GetRepLevels(&rep_levels, &num_levels));
+ DefRepLevelsToBitmap(def_levels, rep_levels, num_levels, level_info_, &validity_io);
+ } else if (filtered_field_->nullable()) {
+ ARROW_ASSIGN_OR_RAISE(
+ null_bitmap,
+ AllocateResizableBuffer(BitUtil::BytesForBits(length_upper_bound), ctx_->pool));
+ validity_io.valid_bits = null_bitmap->mutable_data();
+ RETURN_NOT_OK(GetDefLevels(&def_levels, &num_levels));
+ DefLevelsToBitmap(def_levels, num_levels, level_info_, &validity_io);
+ }
+
+ // Ensure all values are initialized.
+ if (null_bitmap) {
+ RETURN_NOT_OK(null_bitmap->Resize(BitUtil::BytesForBits(validity_io.values_read)));
+ null_bitmap->ZeroPadding();
+ }
+
+ END_PARQUET_CATCH_EXCEPTIONS
+  // Gather child arrays and def levels
+ for (auto& child : children_) {
+ std::shared_ptr<ChunkedArray> field;
+ RETURN_NOT_OK(child->BuildArray(validity_io.values_read, &field));
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayData> array_data, ChunksToSingle(*field));
+ children_array_data.push_back(std::move(array_data));
+ }
+
+ if (!filtered_field_->nullable() && !has_repeated_child_) {
+ validity_io.values_read = children_array_data.front()->length;
+ }
+
+ std::vector<std::shared_ptr<Buffer>> buffers{validity_io.null_count > 0 ? null_bitmap
+ : nullptr};
+ auto data =
+ std::make_shared<ArrayData>(filtered_field_->type(),
+ /*length=*/validity_io.values_read, std::move(buffers),
+ std::move(children_array_data));
+ std::shared_ptr<Array> result = ::arrow::MakeArray(data);
+
+ *out = std::make_shared<ChunkedArray>(result);
+ return Status::OK();
+}
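+
+// A minimal worked example for the non-repeated struct path above: for an
+// optional struct<a: optional int32>, def level 0 marks a null struct, so
+// DefLevelsToBitmap turns
+//
+//   values      [{a: 1}, null, {a: null}]
+//   def levels   2       0     1
+//
+// into the struct validity bitmap 1 0 1, while the child reader independently
+// rebuilds `a` as [1, null, null] over the same length.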
+
+// ----------------------------------------------------------------------
+// File reader implementation
+
+Status GetReader(const SchemaField& field, const std::shared_ptr<Field>& arrow_field,
+ const std::shared_ptr<ReaderContext>& ctx,
+ std::unique_ptr<ColumnReaderImpl>* out) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+
+ auto type_id = arrow_field->type()->id();
+
+ if (type_id == ::arrow::Type::EXTENSION) {
+ auto storage_field = arrow_field->WithType(
+ checked_cast<const ExtensionType&>(*arrow_field->type()).storage_type());
+ RETURN_NOT_OK(GetReader(field, storage_field, ctx, out));
+ out->reset(new ExtensionReader(arrow_field, std::move(*out)));
+ return Status::OK();
+ }
+
+ if (field.children.size() == 0) {
+ if (!field.is_leaf()) {
+ return Status::Invalid("Parquet non-leaf node has no children");
+ }
+ if (!ctx->IncludesLeaf(field.column_index)) {
+ *out = nullptr;
+ return Status::OK();
+ }
+ std::unique_ptr<FileColumnIterator> input(
+ ctx->iterator_factory(field.column_index, ctx->reader));
+ out->reset(new LeafReader(ctx, arrow_field, std::move(input), field.level_info));
+ } else if (type_id == ::arrow::Type::LIST || type_id == ::arrow::Type::MAP ||
+ type_id == ::arrow::Type::FIXED_SIZE_LIST ||
+ type_id == ::arrow::Type::LARGE_LIST) {
+ auto list_field = arrow_field;
+ auto child = &field.children[0];
+ std::unique_ptr<ColumnReaderImpl> child_reader;
+ RETURN_NOT_OK(GetReader(*child, ctx, &child_reader));
+ if (child_reader == nullptr) {
+ *out = nullptr;
+ return Status::OK();
+ }
+ if (type_id == ::arrow::Type::LIST ||
+        type_id == ::arrow::Type::MAP) {  // Map can be reconstructed as a list of structs.
+ if (type_id == ::arrow::Type::MAP &&
+ child_reader->field()->type()->num_fields() != 2) {
+ // This case applies if either key or value is filtered.
+ list_field = list_field->WithType(::arrow::list(child_reader->field()));
+ }
+ out->reset(new ListReader<int32_t>(ctx, list_field, field.level_info,
+ std::move(child_reader)));
+ } else if (type_id == ::arrow::Type::LARGE_LIST) {
+ out->reset(new ListReader<int64_t>(ctx, list_field, field.level_info,
+ std::move(child_reader)));
+
+ } else if (type_id == ::arrow::Type::FIXED_SIZE_LIST) {
+ out->reset(new FixedSizeListReader(ctx, list_field, field.level_info,
+ std::move(child_reader)));
+ } else {
+ return Status::UnknownError("Unknown list type: ", field.field->ToString());
+ }
+ } else if (type_id == ::arrow::Type::STRUCT) {
+ std::vector<std::shared_ptr<Field>> child_fields;
+ std::vector<std::unique_ptr<ColumnReaderImpl>> child_readers;
+ for (const auto& child : field.children) {
+ std::unique_ptr<ColumnReaderImpl> child_reader;
+ RETURN_NOT_OK(GetReader(child, ctx, &child_reader));
+ if (!child_reader) {
+ // If all children were pruned, then we do not try to read this field
+ continue;
+ }
+ child_fields.push_back(child.field);
+ child_readers.emplace_back(std::move(child_reader));
+ }
+ if (child_fields.size() == 0) {
+ *out = nullptr;
+ return Status::OK();
+ }
+ auto filtered_field =
+ ::arrow::field(arrow_field->name(), ::arrow::struct_(child_fields),
+ arrow_field->nullable(), arrow_field->metadata());
+ out->reset(new StructReader(ctx, filtered_field, field.level_info,
+ std::move(child_readers)));
+ } else {
+ return Status::Invalid("Unsupported nested type: ", arrow_field->ToString());
+ }
+ return Status::OK();
+
+ END_PARQUET_CATCH_EXCEPTIONS
+}
+
+Status GetReader(const SchemaField& field, const std::shared_ptr<ReaderContext>& ctx,
+ std::unique_ptr<ColumnReaderImpl>* out) {
+ return GetReader(field, field.field, ctx, out);
+}
+
+} // namespace
+
+Status FileReaderImpl::GetRecordBatchReader(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ std::unique_ptr<RecordBatchReader>* out) {
+ RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
+
+ if (reader_properties_.pre_buffer()) {
+ // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ reader_->PreBuffer(row_groups, column_indices, reader_properties_.io_context(),
+ reader_properties_.cache_options());
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ std::vector<std::shared_ptr<ColumnReaderImpl>> readers;
+ std::shared_ptr<::arrow::Schema> batch_schema;
+ RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &batch_schema));
+
+ if (readers.empty()) {
+ // Just generate all batches right now; they're cheap since they have no columns.
+ int64_t batch_size = properties().batch_size();
+ auto max_sized_batch =
+ ::arrow::RecordBatch::Make(batch_schema, batch_size, ::arrow::ArrayVector{});
+
+ ::arrow::RecordBatchVector batches;
+
+ for (int row_group : row_groups) {
+ int64_t num_rows = parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
+
+ batches.insert(batches.end(), num_rows / batch_size, max_sized_batch);
+
+ if (int64_t trailing_rows = num_rows % batch_size) {
+ batches.push_back(max_sized_batch->Slice(0, trailing_rows));
+ }
+ }
+
+ *out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
+ ::arrow::MakeVectorIterator(std::move(batches)), std::move(batch_schema));
+
+ return Status::OK();
+ }
+
+ int64_t num_rows = 0;
+ for (int row_group : row_groups) {
+ num_rows += parquet_reader()->metadata()->RowGroup(row_group)->num_rows();
+ }
+
+ using ::arrow::RecordBatchIterator;
+
+ // NB: This lambda will be invoked outside the scope of this call to
+ // `GetRecordBatchReader()`, so it must capture `readers` and `batch_schema` by value.
+ // `this` is a non-owning pointer so we are relying on the parent FileReader outliving
+ // this RecordBatchReader.
+ ::arrow::Iterator<RecordBatchIterator> batches = ::arrow::MakeFunctionIterator(
+ [readers, batch_schema, num_rows,
+ this]() mutable -> ::arrow::Result<RecordBatchIterator> {
+ ::arrow::ChunkedArrayVector columns(readers.size());
+
+ // don't reserve more rows than necessary
+ int64_t batch_size = std::min(properties().batch_size(), num_rows);
+ num_rows -= batch_size;
+
+ RETURN_NOT_OK(::arrow::internal::OptionalParallelFor(
+ reader_properties_.use_threads(), static_cast<int>(readers.size()),
+ [&](int i) { return readers[i]->NextBatch(batch_size, &columns[i]); }));
+
+ for (const auto& column : columns) {
+ if (column == nullptr || column->length() == 0) {
+ return ::arrow::IterationTraits<RecordBatchIterator>::End();
+ }
+ }
+
+ auto table = ::arrow::Table::Make(batch_schema, std::move(columns));
+ auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table);
+
+ // NB: explicitly preserve table so that table_reader doesn't outlive it
+ return ::arrow::MakeFunctionIterator(
+ [table, table_reader] { return table_reader->Next(); });
+ });
+
+ *out = ::arrow::internal::make_unique<RowGroupRecordBatchReader>(
+ ::arrow::MakeFlattenIterator(std::move(batches)), std::move(batch_schema));
+
+ return Status::OK();
+}
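+
+// A hedged usage sketch for the batch reader built above (`reader` is a
+// hypothetical std::unique_ptr<FileReader>; the indices are examples only):
+//
+//   std::unique_ptr<::arrow::RecordBatchReader> rb_reader;
+//   RETURN_NOT_OK(reader->GetRecordBatchReader(/*row_groups=*/{0, 1},
+//                                              /*column_indices=*/{0, 2},
+//                                              &rb_reader));
+//   std::shared_ptr<::arrow::RecordBatch> batch;
+//   while (true) {
+//     RETURN_NOT_OK(rb_reader->ReadNext(&batch));
+//     if (batch == nullptr) break;  // end of stream
+//     // ... consume batch ...
+//   }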
+
+/// Given a file reader and a list of row groups, this is a generator of record
+/// batch generators (where each sub-generator is the contents of a single row group).
+class RowGroupGenerator {
+ public:
+ using RecordBatchGenerator =
+ ::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>;
+
+ explicit RowGroupGenerator(std::shared_ptr<FileReaderImpl> arrow_reader,
+ ::arrow::internal::Executor* cpu_executor,
+ std::vector<int> row_groups, std::vector<int> column_indices)
+ : arrow_reader_(std::move(arrow_reader)),
+ cpu_executor_(cpu_executor),
+ row_groups_(std::move(row_groups)),
+ column_indices_(std::move(column_indices)),
+ index_(0) {}
+
+ ::arrow::Future<RecordBatchGenerator> operator()() {
+ if (index_ >= row_groups_.size()) {
+ return ::arrow::AsyncGeneratorEnd<RecordBatchGenerator>();
+ }
+ int row_group = row_groups_[index_++];
+ std::vector<int> column_indices = column_indices_;
+ auto reader = arrow_reader_;
+ if (!reader->properties().pre_buffer()) {
+ return SubmitRead(cpu_executor_, reader, row_group, column_indices);
+ }
+ auto ready = reader->parquet_reader()->WhenBuffered({row_group}, column_indices);
+ if (cpu_executor_) ready = cpu_executor_->TransferAlways(ready);
+ return ready.Then([=]() -> ::arrow::Future<RecordBatchGenerator> {
+ return ReadOneRowGroup(cpu_executor_, reader, row_group, column_indices);
+ });
+ }
+
+ private:
+ // Synchronous fallback for when pre-buffer isn't enabled.
+ //
+ // Making the Parquet reader truly asynchronous requires heavy refactoring, so the
+ // generator piggybacks on ReadRangeCache. The lazy ReadRangeCache can be used for
+ // async I/O without forcing readahead.
+ static ::arrow::Future<RecordBatchGenerator> SubmitRead(
+ ::arrow::internal::Executor* cpu_executor, std::shared_ptr<FileReaderImpl> self,
+ const int row_group, const std::vector<int>& column_indices) {
+ if (!cpu_executor) {
+ return ReadOneRowGroup(cpu_executor, self, row_group, column_indices);
+ }
+ // If we have an executor, then force transfer (even if I/O was complete)
+ return ::arrow::DeferNotOk(cpu_executor->Submit(ReadOneRowGroup, cpu_executor, self,
+ row_group, column_indices));
+ }
+
+ static ::arrow::Future<RecordBatchGenerator> ReadOneRowGroup(
+ ::arrow::internal::Executor* cpu_executor, std::shared_ptr<FileReaderImpl> self,
+ const int row_group, const std::vector<int>& column_indices) {
+    // Skips bounds checks/pre-buffering, since we've done that already
+ return self->DecodeRowGroups(self, {row_group}, column_indices, cpu_executor)
+ .Then([](const std::shared_ptr<Table>& table)
+ -> ::arrow::Result<RecordBatchGenerator> {
+ ::arrow::TableBatchReader table_reader(*table);
+ ::arrow::RecordBatchVector batches;
+ RETURN_NOT_OK(table_reader.ReadAll(&batches));
+ return ::arrow::MakeVectorGenerator(std::move(batches));
+ });
+ }
+
+ std::shared_ptr<FileReaderImpl> arrow_reader_;
+ ::arrow::internal::Executor* cpu_executor_;
+ std::vector<int> row_groups_;
+ std::vector<int> column_indices_;
+ size_t index_;
+};
+
+::arrow::Result<::arrow::AsyncGenerator<std::shared_ptr<::arrow::RecordBatch>>>
+FileReaderImpl::GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
+ const std::vector<int> row_group_indices,
+ const std::vector<int> column_indices,
+ ::arrow::internal::Executor* cpu_executor) {
+ RETURN_NOT_OK(BoundsCheck(row_group_indices, column_indices));
+ if (reader_properties_.pre_buffer()) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ reader_->PreBuffer(row_group_indices, column_indices, reader_properties_.io_context(),
+ reader_properties_.cache_options());
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+ ::arrow::AsyncGenerator<RowGroupGenerator::RecordBatchGenerator> row_group_generator =
+ RowGroupGenerator(::arrow::internal::checked_pointer_cast<FileReaderImpl>(reader),
+ cpu_executor, row_group_indices, column_indices);
+ return ::arrow::MakeConcatenatedGenerator(std::move(row_group_generator));
+}
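+
+// A hedged consumption sketch for the generator above, assuming
+// ::arrow::CollectAsyncGenerator from arrow/util/async_generator.h (which
+// drains an AsyncGenerator into a Future of a vector); `reader` here is a
+// hypothetical std::shared_ptr<FileReader>:
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto gen, reader->GetRecordBatchGenerator(reader,
+//                                                 /*row_group_indices=*/{0},
+//                                                 /*column_indices=*/{0},
+//                                                 /*cpu_executor=*/nullptr));
+//   ARROW_ASSIGN_OR_RAISE(
+//       auto batches, ::arrow::CollectAsyncGenerator(std::move(gen)).result());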
+
+Status FileReaderImpl::GetColumn(int i, FileColumnIteratorFactory iterator_factory,
+ std::unique_ptr<ColumnReader>* out) {
+ RETURN_NOT_OK(BoundsCheckColumn(i));
+ auto ctx = std::make_shared<ReaderContext>();
+ ctx->reader = reader_.get();
+ ctx->pool = pool_;
+ ctx->iterator_factory = iterator_factory;
+ ctx->filter_leaves = false;
+ std::unique_ptr<ColumnReaderImpl> result;
+ RETURN_NOT_OK(GetReader(manifest_.schema_fields[i], ctx, &result));
+ out->reset(result.release());
+ return Status::OK();
+}
+
+Status FileReaderImpl::ReadRowGroups(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ std::shared_ptr<Table>* out) {
+ RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
+
+ // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
+ if (reader_properties_.pre_buffer()) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ parquet_reader()->PreBuffer(row_groups, column_indices,
+ reader_properties_.io_context(),
+ reader_properties_.cache_options());
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ auto fut = DecodeRowGroups(/*self=*/nullptr, row_groups, column_indices,
+ /*cpu_executor=*/nullptr);
+ ARROW_ASSIGN_OR_RAISE(*out, fut.MoveResult());
+ return Status::OK();
+}
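+
+// A hedged usage sketch (`reader` is a hypothetical FileReader*):
+//
+//   std::shared_ptr<Table> table;
+//   RETURN_NOT_OK(reader->ReadRowGroups(/*row_groups=*/{0},
+//                                       /*column_indices=*/{0, 1}, &table));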
+
+Future<std::shared_ptr<Table>> FileReaderImpl::DecodeRowGroups(
+ std::shared_ptr<FileReaderImpl> self, const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices, ::arrow::internal::Executor* cpu_executor) {
+  // `self` is used solely to keep `this` alive in an async context - but since
+  // this method is also used in a sync context, we use `this` rather than `self`.
+ std::vector<std::shared_ptr<ColumnReaderImpl>> readers;
+ std::shared_ptr<::arrow::Schema> result_schema;
+ RETURN_NOT_OK(GetFieldReaders(column_indices, row_groups, &readers, &result_schema));
+ // OptionalParallelForAsync requires an executor
+ if (!cpu_executor) cpu_executor = ::arrow::internal::GetCpuThreadPool();
+
+ auto read_column = [row_groups, self, this](size_t i,
+ std::shared_ptr<ColumnReaderImpl> reader)
+ -> ::arrow::Result<std::shared_ptr<::arrow::ChunkedArray>> {
+ std::shared_ptr<::arrow::ChunkedArray> column;
+ RETURN_NOT_OK(ReadColumn(static_cast<int>(i), row_groups, reader.get(), &column));
+ return column;
+ };
+ auto make_table = [result_schema, row_groups, self,
+ this](const ::arrow::ChunkedArrayVector& columns)
+ -> ::arrow::Result<std::shared_ptr<Table>> {
+ int64_t num_rows = 0;
+ if (!columns.empty()) {
+ num_rows = columns[0]->length();
+ } else {
+ for (int i : row_groups) {
+ num_rows += parquet_reader()->metadata()->RowGroup(i)->num_rows();
+ }
+ }
+ auto table = Table::Make(std::move(result_schema), columns, num_rows);
+ RETURN_NOT_OK(table->Validate());
+ return table;
+ };
+ return ::arrow::internal::OptionalParallelForAsync(reader_properties_.use_threads(),
+ std::move(readers), read_column,
+ cpu_executor)
+ .Then(std::move(make_table));
+}
+
+std::shared_ptr<RowGroupReader> FileReaderImpl::RowGroup(int row_group_index) {
+ return std::make_shared<RowGroupReaderImpl>(this, row_group_index);
+}
+
+// ----------------------------------------------------------------------
+// Public factory functions
+
+Status FileReader::GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ std::shared_ptr<RecordBatchReader>* out) {
+ std::unique_ptr<RecordBatchReader> tmp;
+ ARROW_RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, &tmp));
+ out->reset(tmp.release());
+ return Status::OK();
+}
+
+Status FileReader::GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ const std::vector<int>& column_indices,
+ std::shared_ptr<RecordBatchReader>* out) {
+ std::unique_ptr<RecordBatchReader> tmp;
+ ARROW_RETURN_NOT_OK(GetRecordBatchReader(row_group_indices, column_indices, &tmp));
+ out->reset(tmp.release());
+ return Status::OK();
+}
+
+Status FileReader::Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileReader> reader,
+ const ArrowReaderProperties& properties,
+ std::unique_ptr<FileReader>* out) {
+ out->reset(new FileReaderImpl(pool, std::move(reader), properties));
+ return static_cast<FileReaderImpl*>(out->get())->Init();
+}
+
+Status FileReader::Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileReader> reader,
+ std::unique_ptr<FileReader>* out) {
+ return Make(pool, std::move(reader), default_arrow_reader_properties(), out);
+}
+
+FileReaderBuilder::FileReaderBuilder()
+ : pool_(::arrow::default_memory_pool()),
+ properties_(default_arrow_reader_properties()) {}
+
+Status FileReaderBuilder::Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
+ const ReaderProperties& properties,
+ std::shared_ptr<FileMetaData> metadata) {
+ PARQUET_CATCH_NOT_OK(raw_reader_ = ParquetReader::Open(std::move(file), properties,
+ std::move(metadata)));
+ return Status::OK();
+}
+
+FileReaderBuilder* FileReaderBuilder::memory_pool(::arrow::MemoryPool* pool) {
+ pool_ = pool;
+ return this;
+}
+
+FileReaderBuilder* FileReaderBuilder::properties(
+ const ArrowReaderProperties& arg_properties) {
+ properties_ = arg_properties;
+ return this;
+}
+
+Status FileReaderBuilder::Build(std::unique_ptr<FileReader>* out) {
+ return FileReader::Make(pool_, std::move(raw_reader_), properties_, out);
+}
+
+Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile> file, MemoryPool* pool,
+ std::unique_ptr<FileReader>* reader) {
+ FileReaderBuilder builder;
+ RETURN_NOT_OK(builder.Open(std::move(file)));
+ return builder.memory_pool(pool)->Build(reader);
+}
+
+namespace internal {
+
+Status FuzzReader(std::unique_ptr<FileReader> reader) {
+ auto st = Status::OK();
+ for (int i = 0; i < reader->num_row_groups(); ++i) {
+ std::shared_ptr<Table> table;
+ auto row_group_status = reader->ReadRowGroup(i, &table);
+ if (row_group_status.ok()) {
+ row_group_status &= table->ValidateFull();
+ }
+ st &= row_group_status;
+ }
+ return st;
+}
+
+Status FuzzReader(const uint8_t* data, int64_t size) {
+ auto buffer = std::make_shared<::arrow::Buffer>(data, size);
+ auto file = std::make_shared<::arrow::io::BufferReader>(buffer);
+ FileReaderBuilder builder;
+ RETURN_NOT_OK(builder.Open(std::move(file)));
+
+ std::unique_ptr<FileReader> reader;
+ RETURN_NOT_OK(builder.Build(&reader));
+ return FuzzReader(std::move(reader));
+}
+
+} // namespace internal
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h
index e8a2dd889da..2d6a5ef2c3e 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader.h
@@ -1,343 +1,343 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-// N.B. we don't include async_generator.h as it's relatively heavy
-#include <functional>
-#include <memory>
-#include <vector>
-
-#include "parquet/file_reader.h"
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-
-namespace arrow {
-
-class ChunkedArray;
-class KeyValueMetadata;
-class RecordBatchReader;
-struct Scalar;
-class Schema;
-class Table;
-class RecordBatch;
-
-} // namespace arrow
-
-namespace parquet {
-
-class FileMetaData;
-class SchemaDescriptor;
-
-namespace arrow {
-
-class ColumnChunkReader;
-class ColumnReader;
-struct SchemaManifest;
-class RowGroupReader;
-
-/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
-///
-/// This interface caters for different use cases and thus provides different
-/// entry points. In its simplest form, it caters for a user who wants to
-/// read the whole Parquet file at once with the `FileReader::ReadTable` method.
-///
-/// More advanced users who also want to implement parallelism on top of each
-/// single Parquet file should do this on the RowGroup level. For this, they can
-/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
-/// RowGroup as a table.
-///
-/// In the most advanced situation, where a consumer wants to independently read
-/// RowGroups in parallel and consume each column individually, they can call
-/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an
-/// `arrow::ChunkedArray` instance.
-///
-/// The parquet format supports an optional integer field_id which can be assigned
-/// to a field. Arrow will convert these field IDs to a metadata key named
-/// PARQUET:field_id on the appropriate field.
-// TODO(wesm): nested data does not always make sense with this user
-// interface unless you are only reading a single leaf node from a branch of
-// a table. For example:
-//
-// repeated group data {
-// optional group record {
-// optional int32 val1;
-// optional byte_array val2;
-// optional bool val3;
-// }
-// optional int32 val4;
-// }
-//
-// In the Parquet file, there are 4 leaf nodes:
-//
-// * data.record.val1
-// * data.record.val2
-// * data.record.val3
-// * data.val4
-//
-// When materializing this data in an Arrow array, we would have:
-//
-// data: list<struct<
-// record: struct<
-// val1: int32,
-// val2: string (= list<uint8>),
-// val3: bool,
-// >,
-// val4: int32
-// >>
-//
-// However, in the Parquet format, each leaf node has its own repetition and
-// definition levels describing the structure of the intermediate nodes in
-// this array structure. Thus, we will need to scan the leaf data for a group
-// of leaf nodes part of the same type tree to create a single result Arrow
-// nested array structure.
-//
-// This is additionally complicated by "chunky" repeated fields or very large
-// byte arrays
-class PARQUET_EXPORT FileReader {
- public:
- /// Factory function to create a FileReader from a ParquetFileReader and properties
- static ::arrow::Status Make(::arrow::MemoryPool* pool,
- std::unique_ptr<ParquetFileReader> reader,
- const ArrowReaderProperties& properties,
- std::unique_ptr<FileReader>* out);
-
- /// Factory function to create a FileReader from a ParquetFileReader
- static ::arrow::Status Make(::arrow::MemoryPool* pool,
- std::unique_ptr<ParquetFileReader> reader,
- std::unique_ptr<FileReader>* out);
-
- // Since the distribution of columns amongst a Parquet file's row groups may
- // be uneven (the number of values in each column chunk can be different), we
- // provide a column-oriented read interface. The ColumnReader hides the
- // details of paging through the file's row groups and yielding
- // fully-materialized arrow::Array instances
- //
- // Returns error status if the column of interest is not flat.
- virtual ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) = 0;
-
- /// \brief Return arrow schema for all the columns.
- virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;
-
- /// \brief Read column as a whole into a chunked array.
- ///
- /// The indicated column index is relative to the schema
- virtual ::arrow::Status ReadColumn(int i,
- std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
-
- // NOTE: Experimental API
- // Reads a specific top level schema field into an Array
-  // The index i refers to the index of the top level schema field, which may
- // be nested or flat - e.g.
- //
- // 0 foo.bar
- // foo.bar.baz
- // foo.qux
- // 1 foo2
- // 2 foo3
- //
- // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc
- virtual ::arrow::Status ReadSchemaField(
- int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
-
- /// \brief Return a RecordBatchReader of row groups selected from row_group_indices.
- ///
- /// Note that the ordering in row_group_indices matters. FileReaders must outlive
- /// their RecordBatchReaders.
- ///
- /// \returns error Status if row_group_indices contains an invalid index
- virtual ::arrow::Status GetRecordBatchReader(
- const std::vector<int>& row_group_indices,
- std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;
-
- ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
- std::shared_ptr<::arrow::RecordBatchReader>* out);
-
- /// \brief Return a RecordBatchReader of row groups selected from
- /// row_group_indices, whose columns are selected by column_indices.
- ///
-  /// Note that the ordering in row_group_indices and column_indices
-  /// matters. FileReaders must outlive their RecordBatchReaders.
- ///
- /// \returns error Status if either row_group_indices or column_indices
- /// contains an invalid index
- virtual ::arrow::Status GetRecordBatchReader(
- const std::vector<int>& row_group_indices, const std::vector<int>& column_indices,
- std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;
-
- /// \brief Return a generator of record batches.
- ///
- /// The FileReader must outlive the generator, so this requires that you pass in a
- /// shared_ptr.
- ///
- /// \returns error Result if either row_group_indices or column_indices contains an
- /// invalid index
- virtual ::arrow::Result<
- std::function<::arrow::Future<std::shared_ptr<::arrow::RecordBatch>>()>>
- GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
- const std::vector<int> row_group_indices,
- const std::vector<int> column_indices,
- ::arrow::internal::Executor* cpu_executor = NULLPTR) = 0;
-
- ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
- const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::RecordBatchReader>* out);
-
- /// Read all columns into a Table
- virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
-
- /// \brief Read the given columns into a Table
- ///
- /// The indicated column indices are relative to the schema
- virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Table>* out) = 0;
-
- virtual ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Table>* out) = 0;
-
- virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;
-
- virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Table>* out) = 0;
-
- virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
- std::shared_ptr<::arrow::Table>* out) = 0;
-
- /// \brief Scan file contents with one thread, return number of rows
- virtual ::arrow::Status ScanContents(std::vector<int> columns,
- const int32_t column_batch_size,
- int64_t* num_rows) = 0;
-
-  /// \brief Return a reader for the RowGroup; this object must not outlive the
-  /// FileReader.
- virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;
-
- /// \brief The number of row groups in the file
- virtual int num_row_groups() const = 0;
-
- virtual ParquetFileReader* parquet_reader() const = 0;
-
- /// Set whether to use multiple threads during reads of multiple columns.
- /// By default only one thread is used.
- virtual void set_use_threads(bool use_threads) = 0;
-
- /// Set number of records to read per batch for the RecordBatchReader.
- virtual void set_batch_size(int64_t batch_size) = 0;
-
- virtual const ArrowReaderProperties& properties() const = 0;
-
- virtual const SchemaManifest& manifest() const = 0;
-
- virtual ~FileReader() = default;
-};
-
-class RowGroupReader {
- public:
- virtual ~RowGroupReader() = default;
- virtual std::shared_ptr<ColumnChunkReader> Column(int column_index) = 0;
- virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
- std::shared_ptr<::arrow::Table>* out) = 0;
- virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
-
- private:
- struct Iterator;
-};
-
-class ColumnChunkReader {
- public:
- virtual ~ColumnChunkReader() = default;
- virtual ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
-};
-
-// At this point, the column reader is a stream iterator. It only knows how to
-// read the next batch of values for a particular column from the file until it
-// runs out.
-//
-// We also do not expose any internal Parquet details, such as row groups. This
-// might change in the future.
-class PARQUET_EXPORT ColumnReader {
- public:
- virtual ~ColumnReader() = default;
-
-  // Scan the next array of the indicated size. The actual size of the
-  // returned array may be less than the passed size depending on how much
-  // data is available in the file.
- //
- // When all the data in the file has been exhausted, the result is set to
- // nullptr.
- //
- // Returns Status::OK on a successful read, including if you have exhausted
- // the data available in the file.
- virtual ::arrow::Status NextBatch(int64_t batch_size,
- std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
-};
-
-/// \brief Experimental helper class for bindings (like Python) that struggle
-/// with either std::move or C++ exceptions
-class PARQUET_EXPORT FileReaderBuilder {
- public:
- FileReaderBuilder();
-
- /// Create FileReaderBuilder from Arrow file and optional properties / metadata
- ::arrow::Status Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
- const ReaderProperties& properties = default_reader_properties(),
- std::shared_ptr<FileMetaData> metadata = NULLPTR);
-
- ParquetFileReader* raw_reader() { return raw_reader_.get(); }
-
- /// Set Arrow MemoryPool for memory allocation
- FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
- /// Set Arrow reader properties
- FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
- /// Build FileReader instance
- ::arrow::Status Build(std::unique_ptr<FileReader>* out);
-
- private:
- ::arrow::MemoryPool* pool_;
- ArrowReaderProperties properties_;
- std::unique_ptr<ParquetFileReader> raw_reader_;
-};
-
-/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
-///
-/// @{
-
-/// \brief Build FileReader from Arrow file and MemoryPool
-///
-/// Advanced settings are supported through the FileReaderBuilder class.
-PARQUET_EXPORT
-::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>,
- ::arrow::MemoryPool* allocator,
- std::unique_ptr<FileReader>* reader);
-
-/// @}
-
-PARQUET_EXPORT
-::arrow::Status StatisticsAsScalars(const Statistics& Statistics,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max);
-
-namespace internal {
-
-PARQUET_EXPORT
-::arrow::Status FuzzReader(const uint8_t* data, int64_t size);
-
-} // namespace internal
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+// N.B. we don't include async_generator.h as it's relatively heavy
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "parquet/file_reader.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+
+namespace arrow {
+
+class ChunkedArray;
+class KeyValueMetadata;
+class RecordBatchReader;
+struct Scalar;
+class Schema;
+class Table;
+class RecordBatch;
+
+} // namespace arrow
+
+namespace parquet {
+
+class FileMetaData;
+class SchemaDescriptor;
+
+namespace arrow {
+
+class ColumnChunkReader;
+class ColumnReader;
+struct SchemaManifest;
+class RowGroupReader;
+
+/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
+///
+/// This interface caters for different use cases and thus provides different
+/// entry points. In its simplest form, it caters for a user who wants to
+/// read the whole Parquet file at once with the `FileReader::ReadTable` method.
+///
+/// More advanced users who also want to implement parallelism on top of each
+/// single Parquet file should do this on the RowGroup level. For this, they can
+/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
+/// RowGroup as a table.
+///
+/// In the most advanced situation, where a consumer wants to independently read
+/// RowGroups in parallel and consume each column individually, they can call
+/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an
+/// `arrow::ChunkedArray` instance.
+///
+/// The parquet format supports an optional integer field_id which can be assigned
+/// to a field. Arrow will convert these field IDs to a metadata key named
+/// PARQUET:field_id on the appropriate field.
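+///
+/// A minimal usage sketch of the three access patterns above (a hedged
+/// example: it assumes an already-constructed `reader` of type
+/// std::unique_ptr<FileReader> and a file with at least one row group):
+///
+/// \code
+/// std::shared_ptr<::arrow::Table> whole_file;
+/// ARROW_RETURN_NOT_OK(reader->ReadTable(&whole_file));
+///
+/// std::shared_ptr<::arrow::Table> one_group;
+/// ARROW_RETURN_NOT_OK(reader->RowGroup(0)->ReadTable(&one_group));
+///
+/// std::shared_ptr<::arrow::ChunkedArray> one_column;
+/// ARROW_RETURN_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&one_column));
+/// \endcode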
+// TODO(wesm): nested data does not always make sense with this user
+// interface unless you are only reading a single leaf node from a branch of
+// a table. For example:
+//
+// repeated group data {
+// optional group record {
+// optional int32 val1;
+// optional byte_array val2;
+// optional bool val3;
+// }
+// optional int32 val4;
+// }
+//
+// In the Parquet file, there are 4 leaf nodes:
+//
+// * data.record.val1
+// * data.record.val2
+// * data.record.val3
+// * data.val4
+//
+// When materializing this data in an Arrow array, we would have:
+//
+// data: list<struct<
+// record: struct<
+// val1: int32,
+// val2: string (= list<uint8>),
+// val3: bool,
+// >,
+// val4: int32
+// >>
+//
+// However, in the Parquet format, each leaf node has its own repetition and
+// definition levels describing the structure of the intermediate nodes in
+// this array structure. Thus, we will need to scan the leaf data for a group
+// of leaf nodes part of the same type tree to create a single result Arrow
+// nested array structure.
+//
+// This is additionally complicated by "chunky" repeated fields or very large
+// byte arrays
+class PARQUET_EXPORT FileReader {
+ public:
+ /// Factory function to create a FileReader from a ParquetFileReader and properties
+ static ::arrow::Status Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileReader> reader,
+ const ArrowReaderProperties& properties,
+ std::unique_ptr<FileReader>* out);
+
+ /// Factory function to create a FileReader from a ParquetFileReader
+ static ::arrow::Status Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileReader> reader,
+ std::unique_ptr<FileReader>* out);
+
+ // Since the distribution of columns amongst a Parquet file's row groups may
+ // be uneven (the number of values in each column chunk can be different), we
+ // provide a column-oriented read interface. The ColumnReader hides the
+ // details of paging through the file's row groups and yielding
+ // fully-materialized arrow::Array instances
+ //
+ // Returns error status if the column of interest is not flat.
+ virtual ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) = 0;
+
+ /// \brief Return arrow schema for all the columns.
+ virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;
+
+ /// \brief Read column as a whole into a chunked array.
+ ///
+ /// The indicated column index is relative to the schema
+ virtual ::arrow::Status ReadColumn(int i,
+ std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+
+ // NOTE: Experimental API
+ // Reads a specific top level schema field into an Array
+  // The index i refers to the index of the top level schema field, which may
+ // be nested or flat - e.g.
+ //
+ // 0 foo.bar
+ // foo.bar.baz
+ // foo.qux
+ // 1 foo2
+ // 2 foo3
+ //
+ // i=0 will read the entire foo struct, i=1 the foo2 primitive column etc
+ virtual ::arrow::Status ReadSchemaField(
+ int i, std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+
+ /// \brief Return a RecordBatchReader of row groups selected from row_group_indices.
+ ///
+ /// Note that the ordering in row_group_indices matters. FileReaders must outlive
+ /// their RecordBatchReaders.
+ ///
+ /// \returns error Status if row_group_indices contains an invalid index
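+  ///
+  /// For example (a minimal sketch; assumes a valid `reader` and at least
+  /// two row groups):
+  ///
+  /// \code
+  /// std::unique_ptr<::arrow::RecordBatchReader> batch_reader;
+  /// ARROW_RETURN_NOT_OK(reader->GetRecordBatchReader({0, 1}, &batch_reader));
+  /// std::shared_ptr<::arrow::RecordBatch> batch;
+  /// do {
+  ///   ARROW_RETURN_NOT_OK(batch_reader->ReadNext(&batch));
+  /// } while (batch != nullptr);  // nullptr marks the end of the stream
+  /// \endcode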
+ virtual ::arrow::Status GetRecordBatchReader(
+ const std::vector<int>& row_group_indices,
+ std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;
+
+ ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ std::shared_ptr<::arrow::RecordBatchReader>* out);
+
+ /// \brief Return a RecordBatchReader of row groups selected from
+ /// row_group_indices, whose columns are selected by column_indices.
+ ///
+  /// Note that the ordering in row_group_indices and column_indices
+  /// matters. FileReaders must outlive their RecordBatchReaders.
+ ///
+ /// \returns error Status if either row_group_indices or column_indices
+ /// contains an invalid index
+ virtual ::arrow::Status GetRecordBatchReader(
+ const std::vector<int>& row_group_indices, const std::vector<int>& column_indices,
+ std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;
+
+ /// \brief Return a generator of record batches.
+ ///
+ /// The FileReader must outlive the generator, so this requires that you pass in a
+ /// shared_ptr.
+ ///
+ /// \returns error Result if either row_group_indices or column_indices contains an
+ /// invalid index
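+  ///
+  /// A sketch under those assumptions (`reader` is held in a
+  /// std::shared_ptr<FileReader> so the generator can keep it alive):
+  ///
+  /// \code
+  /// ARROW_ASSIGN_OR_RAISE(
+  ///     auto generator, reader->GetRecordBatchGenerator(reader, {0}, {0, 1}));
+  /// auto fut = generator();  // a Future yielding the next RecordBatch
+  /// \endcode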
+ virtual ::arrow::Result<
+ std::function<::arrow::Future<std::shared_ptr<::arrow::RecordBatch>>()>>
+ GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
+ const std::vector<int> row_group_indices,
+ const std::vector<int> column_indices,
+ ::arrow::internal::Executor* cpu_executor = NULLPTR) = 0;
+
+ ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+ const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::RecordBatchReader>* out);
+
+ /// Read all columns into a Table
+ virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
+
+ /// \brief Read the given columns into a Table
+ ///
+ /// The indicated column indices are relative to the schema
+ virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+
+ virtual ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+
+ virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;
+
+ virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+
+ virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+
+ /// \brief Scan file contents with one thread, return number of rows
+ virtual ::arrow::Status ScanContents(std::vector<int> columns,
+ const int32_t column_batch_size,
+ int64_t* num_rows) = 0;
+
+  /// \brief Return a reader for the RowGroup; this object must not outlive the
+  /// FileReader.
+ virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;
+
+ /// \brief The number of row groups in the file
+ virtual int num_row_groups() const = 0;
+
+ virtual ParquetFileReader* parquet_reader() const = 0;
+
+ /// Set whether to use multiple threads during reads of multiple columns.
+ /// By default only one thread is used.
+ virtual void set_use_threads(bool use_threads) = 0;
+
+ /// Set number of records to read per batch for the RecordBatchReader.
+ virtual void set_batch_size(int64_t batch_size) = 0;
+
+ virtual const ArrowReaderProperties& properties() const = 0;
+
+ virtual const SchemaManifest& manifest() const = 0;
+
+ virtual ~FileReader() = default;
+};
+
+class RowGroupReader {
+ public:
+ virtual ~RowGroupReader() = default;
+ virtual std::shared_ptr<ColumnChunkReader> Column(int column_index) = 0;
+ virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
+ std::shared_ptr<::arrow::Table>* out) = 0;
+ virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
+
+ private:
+ struct Iterator;
+};
+
+class ColumnChunkReader {
+ public:
+ virtual ~ColumnChunkReader() = default;
+ virtual ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+};
+
+// At this point, the column reader is a stream iterator. It only knows how to
+// read the next batch of values for a particular column from the file until it
+// runs out.
+//
+// We also do not expose any internal Parquet details, such as row groups. This
+// might change in the future.
+class PARQUET_EXPORT ColumnReader {
+ public:
+ virtual ~ColumnReader() = default;
+
+  // Scan the next array of the indicated size. The actual size of the
+  // returned array may be less than the passed size depending on how much
+  // data is available in the file.
+ //
+ // When all the data in the file has been exhausted, the result is set to
+ // nullptr.
+ //
+ // Returns Status::OK on a successful read, including if you have exhausted
+ // the data available in the file.
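+  //
+  // For example (a minimal sketch; `column` is assumed to come from
+  // FileReader::GetColumn):
+  //
+  //   std::shared_ptr<::arrow::ChunkedArray> chunk;
+  //   do {
+  //     ARROW_RETURN_NOT_OK(column->NextBatch(4096, &chunk));
+  //   } while (chunk != nullptr);  // nullptr signals exhausted data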
+ virtual ::arrow::Status NextBatch(int64_t batch_size,
+ std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+};
+
+/// \brief Experimental helper class for bindings (like Python) that struggle
+/// with either std::move or C++ exceptions
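+///
+/// A minimal sketch (assuming `file` is an open
+/// ::arrow::io::RandomAccessFile):
+///
+/// \code
+/// FileReaderBuilder builder;
+/// ARROW_RETURN_NOT_OK(builder.Open(file));
+/// std::unique_ptr<FileReader> reader;
+/// ARROW_RETURN_NOT_OK(builder.memory_pool(::arrow::default_memory_pool())
+///                         ->properties(default_arrow_reader_properties())
+///                         ->Build(&reader));
+/// \endcode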
+class PARQUET_EXPORT FileReaderBuilder {
+ public:
+ FileReaderBuilder();
+
+ /// Create FileReaderBuilder from Arrow file and optional properties / metadata
+ ::arrow::Status Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
+ const ReaderProperties& properties = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ ParquetFileReader* raw_reader() { return raw_reader_.get(); }
+
+ /// Set Arrow MemoryPool for memory allocation
+ FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
+ /// Set Arrow reader properties
+ FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
+ /// Build FileReader instance
+ ::arrow::Status Build(std::unique_ptr<FileReader>* out);
+
+ private:
+ ::arrow::MemoryPool* pool_;
+ ArrowReaderProperties properties_;
+ std::unique_ptr<ParquetFileReader> raw_reader_;
+};
+
+/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
+///
+/// @{
+
+/// \brief Build FileReader from Arrow file and MemoryPool
+///
+/// Advanced settings are supported through the FileReaderBuilder class.
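+///
+/// For example (a sketch; `file` is an open ::arrow::io::RandomAccessFile):
+///
+/// \code
+/// std::unique_ptr<FileReader> reader;
+/// ARROW_RETURN_NOT_OK(
+///     OpenFile(file, ::arrow::default_memory_pool(), &reader));
+/// \endcode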
+PARQUET_EXPORT
+::arrow::Status OpenFile(std::shared_ptr<::arrow::io::RandomAccessFile>,
+ ::arrow::MemoryPool* allocator,
+ std::unique_ptr<FileReader>* reader);
+
+/// @}
+
+PARQUET_EXPORT
+::arrow::Status StatisticsAsScalars(const Statistics& Statistics,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max);
+
+namespace internal {
+
+PARQUET_EXPORT
+::arrow::Status FuzzReader(const uint8_t* data, int64_t size);
+
+} // namespace internal
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc
index 3fbbfa8da26..f13687079d4 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.cc
@@ -1,791 +1,791 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/arrow/reader_internal.h"
-
-#include <algorithm>
-#include <climits>
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <string>
-#include <type_traits>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/compute/api.h"
-#include "arrow/datum.h"
-#include "arrow/io/memory.h"
-#include "arrow/ipc/reader.h"
-#include "arrow/ipc/writer.h"
-#include "arrow/scalar.h"
-#include "arrow/status.h"
-#include "arrow/table.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/base64.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/endian.h"
-#include "arrow/util/int_util_internal.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/string_view.h"
-#include "arrow/util/ubsan.h"
-#include "arrow/visitor_inline.h"
-#include "parquet/arrow/reader.h"
-#include "parquet/arrow/schema.h"
-#include "parquet/arrow/schema_internal.h"
-#include "parquet/column_reader.h"
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-#include "parquet/schema.h"
-#include "parquet/statistics.h"
-#include "parquet/types.h"
-// Required after "arrow/util/int_util_internal.h" (for OPTIONAL)
-#include "parquet/windows_compatibility.h"
-
-using arrow::Array;
-using arrow::BooleanArray;
-using arrow::ChunkedArray;
-using arrow::DataType;
-using arrow::Datum;
-using arrow::Decimal128;
-using arrow::Decimal128Array;
-using arrow::Decimal128Type;
-using arrow::Decimal256;
-using arrow::Decimal256Array;
-using arrow::Decimal256Type;
-using arrow::Field;
-using arrow::Int32Array;
-using arrow::ListArray;
-using arrow::MemoryPool;
-using arrow::ResizableBuffer;
-using arrow::Status;
-using arrow::StructArray;
-using arrow::Table;
-using arrow::TimestampArray;
-
-using ::arrow::BitUtil::FromBigEndian;
-using ::arrow::internal::checked_cast;
-using ::arrow::internal::checked_pointer_cast;
-using ::arrow::internal::SafeLeftShift;
-using ::arrow::util::SafeLoadAs;
-
-using parquet::internal::BinaryRecordReader;
-using parquet::internal::DictionaryRecordReader;
-using parquet::internal::RecordReader;
-using parquet::schema::GroupNode;
-using parquet::schema::Node;
-using parquet::schema::PrimitiveNode;
-using ParquetType = parquet::Type;
-
-namespace BitUtil = arrow::BitUtil;
-
-namespace parquet {
-namespace arrow {
-namespace {
-
-template <typename ArrowType>
-using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
-
-template <typename CType, typename StatisticsType>
-Status MakeMinMaxScalar(const StatisticsType& statistics,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- *min = ::arrow::MakeScalar(static_cast<CType>(statistics.min()));
- *max = ::arrow::MakeScalar(static_cast<CType>(statistics.max()));
- return Status::OK();
-}
-
-template <typename CType, typename StatisticsType>
-Status MakeMinMaxTypedScalar(const StatisticsType& statistics,
- std::shared_ptr<DataType> type,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min()));
- ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max()));
- return Status::OK();
-}
-
-template <typename StatisticsType>
-Status MakeMinMaxIntegralScalar(const StatisticsType& statistics,
- const ::arrow::DataType& arrow_type,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- const auto column_desc = statistics.descr();
- const auto& logical_type = column_desc->logical_type();
- const auto& integer = checked_pointer_cast<const IntLogicalType>(logical_type);
- const bool is_signed = integer->is_signed();
-
- switch (integer->bit_width()) {
- case 8:
- return is_signed ? MakeMinMaxScalar<int8_t>(statistics, min, max)
- : MakeMinMaxScalar<uint8_t>(statistics, min, max);
- case 16:
- return is_signed ? MakeMinMaxScalar<int16_t>(statistics, min, max)
- : MakeMinMaxScalar<uint16_t>(statistics, min, max);
- case 32:
- return is_signed ? MakeMinMaxScalar<int32_t>(statistics, min, max)
- : MakeMinMaxScalar<uint32_t>(statistics, min, max);
- case 64:
- return is_signed ? MakeMinMaxScalar<int64_t>(statistics, min, max)
- : MakeMinMaxScalar<uint64_t>(statistics, min, max);
- }
-
- return Status::OK();
-}
-
-static Status FromInt32Statistics(const Int32Statistics& statistics,
- const LogicalType& logical_type,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- ARROW_ASSIGN_OR_RAISE(auto type, FromInt32(logical_type));
-
- switch (logical_type.type()) {
- case LogicalType::Type::INT:
- return MakeMinMaxIntegralScalar(statistics, *type, min, max);
- break;
- case LogicalType::Type::DATE:
- case LogicalType::Type::TIME:
- case LogicalType::Type::NONE:
- return MakeMinMaxTypedScalar<int32_t>(statistics, type, min, max);
- break;
- default:
- break;
- }
-
- return Status::NotImplemented("Cannot extract statistics for type ");
-}
-
-static Status FromInt64Statistics(const Int64Statistics& statistics,
- const LogicalType& logical_type,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- ARROW_ASSIGN_OR_RAISE(auto type, FromInt64(logical_type));
-
- switch (logical_type.type()) {
- case LogicalType::Type::INT:
- return MakeMinMaxIntegralScalar(statistics, *type, min, max);
- break;
- case LogicalType::Type::TIME:
- case LogicalType::Type::TIMESTAMP:
- case LogicalType::Type::NONE:
- return MakeMinMaxTypedScalar<int64_t>(statistics, type, min, max);
- break;
- default:
- break;
- }
-
- return Status::NotImplemented("Cannot extract statistics for type ");
-}
-
-template <typename DecimalType>
-Result<std::shared_ptr<::arrow::Scalar>> FromBigEndianString(
- const std::string& data, std::shared_ptr<DataType> arrow_type) {
- ARROW_ASSIGN_OR_RAISE(
- DecimalType decimal,
- DecimalType::FromBigEndian(reinterpret_cast<const uint8_t*>(data.data()),
- static_cast<int32_t>(data.size())));
- return ::arrow::MakeScalar(std::move(arrow_type), decimal);
-}
-
-// Extracts Min and Max scalars from byte-like types (i.e. types where the
-// decimal is encoded as big-endian bytes).
-Status ExtractDecimalMinMaxFromBytesType(const Statistics& statistics,
- const LogicalType& logical_type,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- const DecimalLogicalType& decimal_type =
- checked_cast<const DecimalLogicalType&>(logical_type);
-
- Result<std::shared_ptr<DataType>> maybe_type =
- Decimal128Type::Make(decimal_type.precision(), decimal_type.scale());
- std::shared_ptr<DataType> arrow_type;
- if (maybe_type.ok()) {
- arrow_type = maybe_type.ValueOrDie();
- ARROW_ASSIGN_OR_RAISE(
- *min, FromBigEndianString<Decimal128>(statistics.EncodeMin(), arrow_type));
- ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString<Decimal128>(statistics.EncodeMax(),
- std::move(arrow_type)));
- return Status::OK();
- }
- // Fallback to see if Decimal256 can represent the type.
- ARROW_ASSIGN_OR_RAISE(
- arrow_type, Decimal256Type::Make(decimal_type.precision(), decimal_type.scale()));
- ARROW_ASSIGN_OR_RAISE(
- *min, FromBigEndianString<Decimal256>(statistics.EncodeMin(), arrow_type));
- ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString<Decimal256>(statistics.EncodeMax(),
- std::move(arrow_type)));
-
- return Status::OK();
-}
-
-Status ByteArrayStatisticsAsScalars(const Statistics& statistics,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- auto logical_type = statistics.descr()->logical_type();
- if (logical_type->type() == LogicalType::Type::DECIMAL) {
- return ExtractDecimalMinMaxFromBytesType(statistics, *logical_type, min, max);
- }
- std::shared_ptr<::arrow::DataType> type;
- if (statistics.descr()->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) {
- type = ::arrow::fixed_size_binary(statistics.descr()->type_length());
- } else {
- type = logical_type->type() == LogicalType::Type::STRING ? ::arrow::utf8()
- : ::arrow::binary();
- }
- ARROW_ASSIGN_OR_RAISE(
- *min, ::arrow::MakeScalar(type, Buffer::FromString(statistics.EncodeMin())));
- ARROW_ASSIGN_OR_RAISE(
- *max, ::arrow::MakeScalar(type, Buffer::FromString(statistics.EncodeMax())));
-
- return Status::OK();
-}
-
-} // namespace
-
-Status StatisticsAsScalars(const Statistics& statistics,
- std::shared_ptr<::arrow::Scalar>* min,
- std::shared_ptr<::arrow::Scalar>* max) {
- if (!statistics.HasMinMax()) {
- return Status::Invalid("Statistics has no min max.");
- }
-
- auto column_desc = statistics.descr();
- if (column_desc == nullptr) {
- return Status::Invalid("Statistics carries no descriptor, can't infer arrow type.");
- }
-
- auto physical_type = column_desc->physical_type();
- auto logical_type = column_desc->logical_type();
- switch (physical_type) {
- case Type::BOOLEAN:
- return MakeMinMaxScalar<bool, BoolStatistics>(
- checked_cast<const BoolStatistics&>(statistics), min, max);
- case Type::FLOAT:
- return MakeMinMaxScalar<float, FloatStatistics>(
- checked_cast<const FloatStatistics&>(statistics), min, max);
- case Type::DOUBLE:
- return MakeMinMaxScalar<double, DoubleStatistics>(
- checked_cast<const DoubleStatistics&>(statistics), min, max);
- case Type::INT32:
- return FromInt32Statistics(checked_cast<const Int32Statistics&>(statistics),
- *logical_type, min, max);
- case Type::INT64:
- return FromInt64Statistics(checked_cast<const Int64Statistics&>(statistics),
- *logical_type, min, max);
- case Type::BYTE_ARRAY:
- case Type::FIXED_LEN_BYTE_ARRAY:
- return ByteArrayStatisticsAsScalars(statistics, min, max);
- default:
- return Status::NotImplemented("Extract statistics unsupported for physical_type ",
- physical_type, " unsupported.");
- }
-
- return Status::OK();
-}
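-
-// A usage sketch (illustrative only; assumes `metadata` is the file's
-// FileMetaData and column chunk 0 of row group 0 carries statistics):
-//
-//   auto stats = metadata->RowGroup(0)->ColumnChunk(0)->statistics();
-//   std::shared_ptr<::arrow::Scalar> min, max;
-//   RETURN_NOT_OK(StatisticsAsScalars(*stats, &min, &max));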
-
-// ----------------------------------------------------------------------
-// Primitive types
-
-namespace {
-
-template <typename ArrowType, typename ParquetType>
-Status TransferInt(RecordReader* reader, MemoryPool* pool,
- const std::shared_ptr<DataType>& type, Datum* out) {
- using ArrowCType = typename ArrowType::c_type;
- using ParquetCType = typename ParquetType::c_type;
- int64_t length = reader->values_written();
- ARROW_ASSIGN_OR_RAISE(auto data,
- ::arrow::AllocateBuffer(length * sizeof(ArrowCType), pool));
-
- auto values = reinterpret_cast<const ParquetCType*>(reader->values());
- auto out_ptr = reinterpret_cast<ArrowCType*>(data->mutable_data());
- std::copy(values, values + length, out_ptr);
- *out = std::make_shared<ArrayType<ArrowType>>(
- type, length, std::move(data), reader->ReleaseIsValid(), reader->null_count());
- return Status::OK();
-}
-
-std::shared_ptr<Array> TransferZeroCopy(RecordReader* reader,
- const std::shared_ptr<DataType>& type) {
- std::vector<std::shared_ptr<Buffer>> buffers = {reader->ReleaseIsValid(),
- reader->ReleaseValues()};
- auto data = std::make_shared<::arrow::ArrayData>(type, reader->values_written(),
- buffers, reader->null_count());
- return ::arrow::MakeArray(data);
-}
-
-Status TransferBool(RecordReader* reader, MemoryPool* pool, Datum* out) {
- int64_t length = reader->values_written();
-
- const int64_t buffer_size = BitUtil::BytesForBits(length);
- ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(buffer_size, pool));
-
- // Transfer boolean values to packed bitmap
- auto values = reinterpret_cast<const bool*>(reader->values());
- uint8_t* data_ptr = data->mutable_data();
- memset(data_ptr, 0, buffer_size);
-
- for (int64_t i = 0; i < length; i++) {
- if (values[i]) {
- ::arrow::BitUtil::SetBit(data_ptr, i);
- }
- }
-
- *out = std::make_shared<BooleanArray>(length, std::move(data), reader->ReleaseIsValid(),
- reader->null_count());
- return Status::OK();
-}
-
-Status TransferInt96(RecordReader* reader, MemoryPool* pool,
- const std::shared_ptr<DataType>& type, Datum* out,
- const ::arrow::TimeUnit::type int96_arrow_time_unit) {
- int64_t length = reader->values_written();
- auto values = reinterpret_cast<const Int96*>(reader->values());
- ARROW_ASSIGN_OR_RAISE(auto data,
- ::arrow::AllocateBuffer(length * sizeof(int64_t), pool));
- auto data_ptr = reinterpret_cast<int64_t*>(data->mutable_data());
- for (int64_t i = 0; i < length; i++) {
- if (values[i].value[2] == 0) {
- // Happens for null entries: avoid triggering UBSAN as that Int96 timestamp
- // isn't representable as a 64-bit Unix timestamp.
- *data_ptr++ = 0;
- } else {
- switch (int96_arrow_time_unit) {
- case ::arrow::TimeUnit::NANO:
- *data_ptr++ = Int96GetNanoSeconds(values[i]);
- break;
- case ::arrow::TimeUnit::MICRO:
- *data_ptr++ = Int96GetMicroSeconds(values[i]);
- break;
- case ::arrow::TimeUnit::MILLI:
- *data_ptr++ = Int96GetMilliSeconds(values[i]);
- break;
- case ::arrow::TimeUnit::SECOND:
- *data_ptr++ = Int96GetSeconds(values[i]);
- break;
- }
- }
- }
- *out = std::make_shared<TimestampArray>(type, length, std::move(data),
- reader->ReleaseIsValid(), reader->null_count());
- return Status::OK();
-}
-
-Status TransferDate64(RecordReader* reader, MemoryPool* pool,
- const std::shared_ptr<DataType>& type, Datum* out) {
- int64_t length = reader->values_written();
- auto values = reinterpret_cast<const int32_t*>(reader->values());
-
- ARROW_ASSIGN_OR_RAISE(auto data,
- ::arrow::AllocateBuffer(length * sizeof(int64_t), pool));
- auto out_ptr = reinterpret_cast<int64_t*>(data->mutable_data());
-
- for (int64_t i = 0; i < length; i++) {
- *out_ptr++ = static_cast<int64_t>(values[i]) * kMillisecondsPerDay;
- }
-
- *out = std::make_shared<::arrow::Date64Array>(
- type, length, std::move(data), reader->ReleaseIsValid(), reader->null_count());
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// Binary, direct to dictionary-encoded
-
-Status TransferDictionary(RecordReader* reader,
- const std::shared_ptr<DataType>& logical_value_type,
- std::shared_ptr<ChunkedArray>* out) {
- auto dict_reader = dynamic_cast<DictionaryRecordReader*>(reader);
- DCHECK(dict_reader);
- *out = dict_reader->GetResult();
- if (!logical_value_type->Equals(*(*out)->type())) {
- ARROW_ASSIGN_OR_RAISE(*out, (*out)->View(logical_value_type));
- }
- return Status::OK();
-}
-
-Status TransferBinary(RecordReader* reader, MemoryPool* pool,
- const std::shared_ptr<DataType>& logical_value_type,
- std::shared_ptr<ChunkedArray>* out) {
- if (reader->read_dictionary()) {
- return TransferDictionary(
- reader, ::arrow::dictionary(::arrow::int32(), logical_value_type), out);
- }
- ::arrow::compute::ExecContext ctx(pool);
- ::arrow::compute::CastOptions cast_options;
- cast_options.allow_invalid_utf8 = true; // avoid spending time validating UTF8 data
-
- auto binary_reader = dynamic_cast<BinaryRecordReader*>(reader);
- DCHECK(binary_reader);
- auto chunks = binary_reader->GetBuilderChunks();
- for (auto& chunk : chunks) {
- if (!chunk->type()->Equals(*logical_value_type)) {
- // XXX: if a LargeBinary chunk is larger than 2GB, the MSBs of offsets
- // will be lost because they are first created as int32 and then cast to int64.
- ARROW_ASSIGN_OR_RAISE(
- chunk, ::arrow::compute::Cast(*chunk, logical_value_type, cast_options, &ctx));
- }
- }
- *out = std::make_shared<ChunkedArray>(chunks, logical_value_type);
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// INT32 / INT64 / BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY -> Decimal128 || Decimal256
-
-template <typename DecimalType>
-Status RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_width,
- uint8_t* out_buf) {
- ARROW_ASSIGN_OR_RAISE(DecimalType t, DecimalType::FromBigEndian(value, byte_width));
- t.ToBytes(out_buf);
- return ::arrow::Status::OK();
-}
-
-template <typename DecimalArrayType>
-struct DecimalTypeTrait;
-
-template <>
-struct DecimalTypeTrait<::arrow::Decimal128Array> {
- using value = ::arrow::Decimal128;
-};
-
-template <>
-struct DecimalTypeTrait<::arrow::Decimal256Array> {
- using value = ::arrow::Decimal256;
-};
-
-template <typename DecimalArrayType, typename ParquetType>
-struct DecimalConverter {
- static inline Status ConvertToDecimal(const Array& array,
- const std::shared_ptr<DataType>&,
- MemoryPool* pool, std::shared_ptr<Array>*) {
- return Status::NotImplemented("not implemented");
- }
-};
-
-template <typename DecimalArrayType>
-struct DecimalConverter<DecimalArrayType, FLBAType> {
- static inline Status ConvertToDecimal(const Array& array,
- const std::shared_ptr<DataType>& type,
- MemoryPool* pool, std::shared_ptr<Array>* out) {
- const auto& fixed_size_binary_array =
- checked_cast<const ::arrow::FixedSizeBinaryArray&>(array);
-
- // The byte width of each decimal value
- const int32_t type_length =
- checked_cast<const ::arrow::DecimalType&>(*type).byte_width();
-
- // number of elements in the entire array
- const int64_t length = fixed_size_binary_array.length();
-
- // Get the byte width of the values in the FixedSizeBinaryArray. Most of the time
- // this will be different from the decimal array width because we write the minimum
- // number of bytes necessary to represent a given precision
- const int32_t byte_width =
- checked_cast<const ::arrow::FixedSizeBinaryType&>(*fixed_size_binary_array.type())
- .byte_width();
- // allocate memory for the decimal array
- ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
-
- // raw bytes that we can write to
- uint8_t* out_ptr = data->mutable_data();
-
- // convert each FixedSizeBinary value to valid decimal bytes
- const int64_t null_count = fixed_size_binary_array.null_count();
-
- using DecimalType = typename DecimalTypeTrait<DecimalArrayType>::value;
- if (null_count > 0) {
- for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
- if (!fixed_size_binary_array.IsNull(i)) {
- RETURN_NOT_OK(RawBytesToDecimalBytes<DecimalType>(
- fixed_size_binary_array.GetValue(i), byte_width, out_ptr));
- } else {
- std::memset(out_ptr, 0, type_length);
- }
- }
- } else {
- for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
- RETURN_NOT_OK(RawBytesToDecimalBytes<DecimalType>(
- fixed_size_binary_array.GetValue(i), byte_width, out_ptr));
- }
- }
-
- *out = std::make_shared<DecimalArrayType>(
- type, length, std::move(data), fixed_size_binary_array.null_bitmap(), null_count);
-
- return Status::OK();
- }
-};
-
-template <typename DecimalArrayType>
-struct DecimalConverter<DecimalArrayType, ByteArrayType> {
- static inline Status ConvertToDecimal(const Array& array,
- const std::shared_ptr<DataType>& type,
- MemoryPool* pool, std::shared_ptr<Array>* out) {
- const auto& binary_array = checked_cast<const ::arrow::BinaryArray&>(array);
- const int64_t length = binary_array.length();
-
- const auto& decimal_type = checked_cast<const ::arrow::DecimalType&>(*type);
- const int64_t type_length = decimal_type.byte_width();
-
- ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
-
- // raw bytes that we can write to
- uint8_t* out_ptr = data->mutable_data();
-
- const int64_t null_count = binary_array.null_count();
-
- // convert each BinaryArray value to valid decimal bytes
- for (int64_t i = 0; i < length; i++, out_ptr += type_length) {
- int32_t record_len = 0;
- const uint8_t* record_loc = binary_array.GetValue(i, &record_len);
-
- if (record_len < 0 || record_len > type_length) {
- return Status::Invalid("Invalid BYTE_ARRAY length for ", type->ToString());
- }
-
- auto out_ptr_view = reinterpret_cast<uint64_t*>(out_ptr);
- out_ptr_view[0] = 0;
- out_ptr_view[1] = 0;
-
-      // convert only the non-null rows when nulls are present;
-      // otherwise convert every row
- if ((null_count > 0 && !binary_array.IsNull(i)) || null_count <= 0) {
- using DecimalType = typename DecimalTypeTrait<DecimalArrayType>::value;
- RETURN_NOT_OK(
- RawBytesToDecimalBytes<DecimalType>(record_loc, record_len, out_ptr));
- }
- }
- *out = std::make_shared<DecimalArrayType>(type, length, std::move(data),
- binary_array.null_bitmap(), null_count);
- return Status::OK();
- }
-};
-
-/// \brief Convert an Int32 or Int64 array into a Decimal128Array
-/// The parquet spec allows systems to write decimals as int32 or int64 if the values
-/// are small enough to fit in 4 bytes or 8 bytes, respectively.
-/// This function implements the conversion from int32 and int64 arrays to decimal arrays.
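-///
-/// For example (illustrative only): a column declared DECIMAL(9, 2) may be
-/// stored as INT32, so the physical value 12345 denotes the decimal 123.45;
-/// below, each such integer is sign-extended into a 16-byte Decimal128.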
-template <
- typename ParquetIntegerType,
- typename = ::arrow::enable_if_t<std::is_same<ParquetIntegerType, Int32Type>::value ||
- std::is_same<ParquetIntegerType, Int64Type>::value>>
-static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool,
- const std::shared_ptr<DataType>& type, Datum* out) {
- // Decimal128 and Decimal256 are only Arrow constructs. Parquet does not
- // specifically distinguish between decimal byte widths.
- // Decimal256 isn't relevant here because the Arrow-Parquet C++ bindings never
- // write Decimal values as integers and if the decimal value can fit in an
- // integer it is wasteful to use Decimal256. Put another way, the only
- // way an integer column could be construed as Decimal256 is if an arrow
- // schema was stored as metadata in the file indicating the column was
- // Decimal256. The current Arrow-Parquet C++ bindings will never do this.
- DCHECK(type->id() == ::arrow::Type::DECIMAL128);
-
- const int64_t length = reader->values_written();
-
- using ElementType = typename ParquetIntegerType::c_type;
- static_assert(std::is_same<ElementType, int32_t>::value ||
- std::is_same<ElementType, int64_t>::value,
- "ElementType must be int32_t or int64_t");
-
- const auto values = reinterpret_cast<const ElementType*>(reader->values());
-
- const auto& decimal_type = checked_cast<const ::arrow::DecimalType&>(*type);
- const int64_t type_length = decimal_type.byte_width();
-
- ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
- uint8_t* out_ptr = data->mutable_data();
-
- using ::arrow::BitUtil::FromLittleEndian;
-
- for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
- // sign/zero extend int32_t values, otherwise a no-op
- const auto value = static_cast<int64_t>(values[i]);
-
- ::arrow::Decimal128 decimal(value);
- decimal.ToBytes(out_ptr);
- }
-
- if (reader->nullable_values()) {
- std::shared_ptr<ResizableBuffer> is_valid = reader->ReleaseIsValid();
- *out = std::make_shared<Decimal128Array>(type, length, std::move(data), is_valid,
- reader->null_count());
- } else {
- *out = std::make_shared<Decimal128Array>(type, length, std::move(data));
- }
- return Status::OK();
-}
-
-/// \brief Convert an arrow::BinaryArray to an arrow::Decimal{128,256}Array
-/// We do this by:
-/// 1. Creating an arrow::BinaryArray from the RecordReader's builder
-/// 2. Allocating a buffer for the arrow::Decimal{128,256}Array
-/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers
-/// representing the high and low bits of each decimal value.
-template <typename DecimalArrayType, typename ParquetType>
-Status TransferDecimal(RecordReader* reader, MemoryPool* pool,
- const std::shared_ptr<DataType>& type, Datum* out) {
- auto binary_reader = dynamic_cast<BinaryRecordReader*>(reader);
- DCHECK(binary_reader);
- ::arrow::ArrayVector chunks = binary_reader->GetBuilderChunks();
- for (size_t i = 0; i < chunks.size(); ++i) {
- std::shared_ptr<Array> chunk_as_decimal;
- auto fn = &DecimalConverter<DecimalArrayType, ParquetType>::ConvertToDecimal;
- RETURN_NOT_OK(fn(*chunks[i], type, pool, &chunk_as_decimal));
- // Replace the chunk, which will hopefully also free memory as we go
- chunks[i] = chunk_as_decimal;
- }
- *out = std::make_shared<ChunkedArray>(chunks, type);
- return Status::OK();
-}
-
-} // namespace
-
-#define TRANSFER_INT32(ENUM, ArrowType) \
- case ::arrow::Type::ENUM: { \
- Status s = TransferInt<ArrowType, Int32Type>(reader, pool, value_type, &result); \
- RETURN_NOT_OK(s); \
- } break;
-
-#define TRANSFER_INT64(ENUM, ArrowType) \
- case ::arrow::Type::ENUM: { \
- Status s = TransferInt<ArrowType, Int64Type>(reader, pool, value_type, &result); \
- RETURN_NOT_OK(s); \
- } break;
-
-Status TransferColumnData(RecordReader* reader, std::shared_ptr<DataType> value_type,
- const ColumnDescriptor* descr, MemoryPool* pool,
- std::shared_ptr<ChunkedArray>* out) {
- Datum result;
- std::shared_ptr<ChunkedArray> chunked_result;
- switch (value_type->id()) {
- case ::arrow::Type::DICTIONARY: {
- RETURN_NOT_OK(TransferDictionary(reader, value_type, &chunked_result));
- result = chunked_result;
- } break;
- case ::arrow::Type::NA: {
- result = std::make_shared<::arrow::NullArray>(reader->values_written());
- break;
- }
- case ::arrow::Type::INT32:
- case ::arrow::Type::INT64:
- case ::arrow::Type::FLOAT:
- case ::arrow::Type::DOUBLE:
- result = TransferZeroCopy(reader, value_type);
- break;
- case ::arrow::Type::BOOL:
- RETURN_NOT_OK(TransferBool(reader, pool, &result));
- break;
- TRANSFER_INT32(UINT8, ::arrow::UInt8Type);
- TRANSFER_INT32(INT8, ::arrow::Int8Type);
- TRANSFER_INT32(UINT16, ::arrow::UInt16Type);
- TRANSFER_INT32(INT16, ::arrow::Int16Type);
- TRANSFER_INT32(UINT32, ::arrow::UInt32Type);
- TRANSFER_INT64(UINT64, ::arrow::UInt64Type);
- TRANSFER_INT32(DATE32, ::arrow::Date32Type);
- TRANSFER_INT32(TIME32, ::arrow::Time32Type);
- TRANSFER_INT64(TIME64, ::arrow::Time64Type);
- case ::arrow::Type::DATE64:
- RETURN_NOT_OK(TransferDate64(reader, pool, value_type, &result));
- break;
- case ::arrow::Type::FIXED_SIZE_BINARY:
- case ::arrow::Type::BINARY:
- case ::arrow::Type::STRING:
- case ::arrow::Type::LARGE_BINARY:
- case ::arrow::Type::LARGE_STRING: {
- RETURN_NOT_OK(TransferBinary(reader, pool, value_type, &chunked_result));
- result = chunked_result;
- } break;
- case ::arrow::Type::DECIMAL128: {
- switch (descr->physical_type()) {
- case ::parquet::Type::INT32: {
- auto fn = DecimalIntegerTransfer<Int32Type>;
- RETURN_NOT_OK(fn(reader, pool, value_type, &result));
- } break;
- case ::parquet::Type::INT64: {
- auto fn = &DecimalIntegerTransfer<Int64Type>;
- RETURN_NOT_OK(fn(reader, pool, value_type, &result));
- } break;
- case ::parquet::Type::BYTE_ARRAY: {
- auto fn = &TransferDecimal<Decimal128Array, ByteArrayType>;
- RETURN_NOT_OK(fn(reader, pool, value_type, &result));
- } break;
- case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
- auto fn = &TransferDecimal<Decimal128Array, FLBAType>;
- RETURN_NOT_OK(fn(reader, pool, value_type, &result));
- } break;
- default:
- return Status::Invalid(
- "Physical type for decimal128 must be int32, int64, byte array, or fixed "
- "length binary");
- }
- } break;
- case ::arrow::Type::DECIMAL256:
- switch (descr->physical_type()) {
- case ::parquet::Type::BYTE_ARRAY: {
- auto fn = &TransferDecimal<Decimal256Array, ByteArrayType>;
- RETURN_NOT_OK(fn(reader, pool, value_type, &result));
- } break;
- case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
- auto fn = &TransferDecimal<Decimal256Array, FLBAType>;
- RETURN_NOT_OK(fn(reader, pool, value_type, &result));
- } break;
- default:
- return Status::Invalid(
- "Physical type for decimal256 must be fixed length binary");
- }
- break;
-
- case ::arrow::Type::TIMESTAMP: {
- const ::arrow::TimestampType& timestamp_type =
- checked_cast<::arrow::TimestampType&>(*value_type);
- if (descr->physical_type() == ::parquet::Type::INT96) {
- RETURN_NOT_OK(
- TransferInt96(reader, pool, value_type, &result, timestamp_type.unit()));
- } else {
- switch (timestamp_type.unit()) {
- case ::arrow::TimeUnit::MILLI:
- case ::arrow::TimeUnit::MICRO:
- case ::arrow::TimeUnit::NANO:
- result = TransferZeroCopy(reader, value_type);
- break;
- default:
- return Status::NotImplemented("TimeUnit not supported");
- }
- }
- } break;
- default:
- return Status::NotImplemented("No support for reading columns of type ",
- value_type->ToString());
- }
-
- if (result.kind() == Datum::ARRAY) {
- *out = std::make_shared<ChunkedArray>(result.make_array());
- } else if (result.kind() == Datum::CHUNKED_ARRAY) {
- *out = result.chunked_array();
- } else {
- DCHECK(false) << "Should be impossible, result was " << result.ToString();
- }
-
- return Status::OK();
-}
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/reader_internal.h"
+
+#include <algorithm>
+#include <climits>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/compute/api.h"
+#include "arrow/datum.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/base64.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string_view.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/visitor_inline.h"
+#include "parquet/arrow/reader.h"
+#include "parquet/arrow/schema.h"
+#include "parquet/arrow/schema_internal.h"
+#include "parquet/column_reader.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/statistics.h"
+#include "parquet/types.h"
+// Required after "arrow/util/int_util_internal.h" (for OPTIONAL)
+#include "parquet/windows_compatibility.h"
+
+using arrow::Array;
+using arrow::BooleanArray;
+using arrow::ChunkedArray;
+using arrow::DataType;
+using arrow::Datum;
+using arrow::Decimal128;
+using arrow::Decimal128Array;
+using arrow::Decimal128Type;
+using arrow::Decimal256;
+using arrow::Decimal256Array;
+using arrow::Decimal256Type;
+using arrow::Field;
+using arrow::Int32Array;
+using arrow::ListArray;
+using arrow::MemoryPool;
+using arrow::ResizableBuffer;
+using arrow::Status;
+using arrow::StructArray;
+using arrow::Table;
+using arrow::TimestampArray;
+
+using ::arrow::BitUtil::FromBigEndian;
+using ::arrow::internal::checked_cast;
+using ::arrow::internal::checked_pointer_cast;
+using ::arrow::internal::SafeLeftShift;
+using ::arrow::util::SafeLoadAs;
+
+using parquet::internal::BinaryRecordReader;
+using parquet::internal::DictionaryRecordReader;
+using parquet::internal::RecordReader;
+using parquet::schema::GroupNode;
+using parquet::schema::Node;
+using parquet::schema::PrimitiveNode;
+using ParquetType = parquet::Type;
+
+namespace BitUtil = arrow::BitUtil;
+
+namespace parquet {
+namespace arrow {
+namespace {
+
+template <typename ArrowType>
+using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
+
+template <typename CType, typename StatisticsType>
+Status MakeMinMaxScalar(const StatisticsType& statistics,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ *min = ::arrow::MakeScalar(static_cast<CType>(statistics.min()));
+ *max = ::arrow::MakeScalar(static_cast<CType>(statistics.max()));
+ return Status::OK();
+}
+
+template <typename CType, typename StatisticsType>
+Status MakeMinMaxTypedScalar(const StatisticsType& statistics,
+ std::shared_ptr<DataType> type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ ARROW_ASSIGN_OR_RAISE(*min, ::arrow::MakeScalar(type, statistics.min()));
+ ARROW_ASSIGN_OR_RAISE(*max, ::arrow::MakeScalar(type, statistics.max()));
+ return Status::OK();
+}
+
+template <typename StatisticsType>
+Status MakeMinMaxIntegralScalar(const StatisticsType& statistics,
+ const ::arrow::DataType& arrow_type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ const auto column_desc = statistics.descr();
+ const auto& logical_type = column_desc->logical_type();
+ const auto& integer = checked_pointer_cast<const IntLogicalType>(logical_type);
+ const bool is_signed = integer->is_signed();
+
+ switch (integer->bit_width()) {
+ case 8:
+ return is_signed ? MakeMinMaxScalar<int8_t>(statistics, min, max)
+ : MakeMinMaxScalar<uint8_t>(statistics, min, max);
+ case 16:
+ return is_signed ? MakeMinMaxScalar<int16_t>(statistics, min, max)
+ : MakeMinMaxScalar<uint16_t>(statistics, min, max);
+ case 32:
+ return is_signed ? MakeMinMaxScalar<int32_t>(statistics, min, max)
+ : MakeMinMaxScalar<uint32_t>(statistics, min, max);
+ case 64:
+ return is_signed ? MakeMinMaxScalar<int64_t>(statistics, min, max)
+ : MakeMinMaxScalar<uint64_t>(statistics, min, max);
+ }
+
+ return Status::OK();
+}
+
+static Status FromInt32Statistics(const Int32Statistics& statistics,
+ const LogicalType& logical_type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ ARROW_ASSIGN_OR_RAISE(auto type, FromInt32(logical_type));
+
+ switch (logical_type.type()) {
+ case LogicalType::Type::INT:
+ return MakeMinMaxIntegralScalar(statistics, *type, min, max);
+ break;
+ case LogicalType::Type::DATE:
+ case LogicalType::Type::TIME:
+ case LogicalType::Type::NONE:
+ return MakeMinMaxTypedScalar<int32_t>(statistics, type, min, max);
+ break;
+ default:
+ break;
+ }
+
+ return Status::NotImplemented("Cannot extract statistics for type ");
+}
+
+static Status FromInt64Statistics(const Int64Statistics& statistics,
+ const LogicalType& logical_type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ ARROW_ASSIGN_OR_RAISE(auto type, FromInt64(logical_type));
+
+ switch (logical_type.type()) {
+ case LogicalType::Type::INT:
+ return MakeMinMaxIntegralScalar(statistics, *type, min, max);
+ break;
+ case LogicalType::Type::TIME:
+ case LogicalType::Type::TIMESTAMP:
+ case LogicalType::Type::NONE:
+ return MakeMinMaxTypedScalar<int64_t>(statistics, type, min, max);
+ break;
+ default:
+ break;
+ }
+
+ return Status::NotImplemented("Cannot extract statistics for type ");
+}
+
+template <typename DecimalType>
+Result<std::shared_ptr<::arrow::Scalar>> FromBigEndianString(
+ const std::string& data, std::shared_ptr<DataType> arrow_type) {
+ ARROW_ASSIGN_OR_RAISE(
+ DecimalType decimal,
+ DecimalType::FromBigEndian(reinterpret_cast<const uint8_t*>(data.data()),
+ static_cast<int32_t>(data.size())));
+ return ::arrow::MakeScalar(std::move(arrow_type), decimal);
+}
+
+// Extracts Min and Max scalars from bytes-like types (i.e. types where the
+// decimal is encoded as a big-endian byte string).
+Status ExtractDecimalMinMaxFromBytesType(const Statistics& statistics,
+ const LogicalType& logical_type,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ const DecimalLogicalType& decimal_type =
+ checked_cast<const DecimalLogicalType&>(logical_type);
+
+ Result<std::shared_ptr<DataType>> maybe_type =
+ Decimal128Type::Make(decimal_type.precision(), decimal_type.scale());
+ std::shared_ptr<DataType> arrow_type;
+ if (maybe_type.ok()) {
+ arrow_type = maybe_type.ValueOrDie();
+ ARROW_ASSIGN_OR_RAISE(
+ *min, FromBigEndianString<Decimal128>(statistics.EncodeMin(), arrow_type));
+ ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString<Decimal128>(statistics.EncodeMax(),
+ std::move(arrow_type)));
+ return Status::OK();
+ }
+ // Fallback to see if Decimal256 can represent the type.
+ ARROW_ASSIGN_OR_RAISE(
+ arrow_type, Decimal256Type::Make(decimal_type.precision(), decimal_type.scale()));
+ ARROW_ASSIGN_OR_RAISE(
+ *min, FromBigEndianString<Decimal256>(statistics.EncodeMin(), arrow_type));
+ ARROW_ASSIGN_OR_RAISE(*max, FromBigEndianString<Decimal256>(statistics.EncodeMax(),
+ std::move(arrow_type)));
+
+ return Status::OK();
+}
+
+Status ByteArrayStatisticsAsScalars(const Statistics& statistics,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ auto logical_type = statistics.descr()->logical_type();
+ if (logical_type->type() == LogicalType::Type::DECIMAL) {
+ return ExtractDecimalMinMaxFromBytesType(statistics, *logical_type, min, max);
+ }
+ std::shared_ptr<::arrow::DataType> type;
+ if (statistics.descr()->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) {
+ type = ::arrow::fixed_size_binary(statistics.descr()->type_length());
+ } else {
+ type = logical_type->type() == LogicalType::Type::STRING ? ::arrow::utf8()
+ : ::arrow::binary();
+ }
+ ARROW_ASSIGN_OR_RAISE(
+ *min, ::arrow::MakeScalar(type, Buffer::FromString(statistics.EncodeMin())));
+ ARROW_ASSIGN_OR_RAISE(
+ *max, ::arrow::MakeScalar(type, Buffer::FromString(statistics.EncodeMax())));
+
+ return Status::OK();
+}
+
+} // namespace
+
+Status StatisticsAsScalars(const Statistics& statistics,
+ std::shared_ptr<::arrow::Scalar>* min,
+ std::shared_ptr<::arrow::Scalar>* max) {
+ if (!statistics.HasMinMax()) {
+ return Status::Invalid("Statistics has no min max.");
+ }
+
+ auto column_desc = statistics.descr();
+ if (column_desc == nullptr) {
+ return Status::Invalid("Statistics carries no descriptor, can't infer arrow type.");
+ }
+
+ auto physical_type = column_desc->physical_type();
+ auto logical_type = column_desc->logical_type();
+ switch (physical_type) {
+ case Type::BOOLEAN:
+ return MakeMinMaxScalar<bool, BoolStatistics>(
+ checked_cast<const BoolStatistics&>(statistics), min, max);
+ case Type::FLOAT:
+ return MakeMinMaxScalar<float, FloatStatistics>(
+ checked_cast<const FloatStatistics&>(statistics), min, max);
+ case Type::DOUBLE:
+ return MakeMinMaxScalar<double, DoubleStatistics>(
+ checked_cast<const DoubleStatistics&>(statistics), min, max);
+ case Type::INT32:
+ return FromInt32Statistics(checked_cast<const Int32Statistics&>(statistics),
+ *logical_type, min, max);
+ case Type::INT64:
+ return FromInt64Statistics(checked_cast<const Int64Statistics&>(statistics),
+ *logical_type, min, max);
+ case Type::BYTE_ARRAY:
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return ByteArrayStatisticsAsScalars(statistics, min, max);
+ default:
+ return Status::NotImplemented("Extract statistics unsupported for physical_type ",
+ physical_type, " unsupported.");
+ }
+
+ return Status::OK();
+}
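+
+// A minimal usage sketch (hypothetical caller code; `chunk_metadata` stands in
+// for a ColumnChunkMetaData obtained from a row group's metadata):
+//
+//   std::shared_ptr<::arrow::Scalar> min, max;
+//   RETURN_NOT_OK(
+//       StatisticsAsScalars(*chunk_metadata->statistics(), &min, &max));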
+
+// ----------------------------------------------------------------------
+// Primitive types
+
+namespace {
+
+template <typename ArrowType, typename ParquetType>
+Status TransferInt(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out) {
+ using ArrowCType = typename ArrowType::c_type;
+ using ParquetCType = typename ParquetType::c_type;
+ int64_t length = reader->values_written();
+ ARROW_ASSIGN_OR_RAISE(auto data,
+ ::arrow::AllocateBuffer(length * sizeof(ArrowCType), pool));
+
+ auto values = reinterpret_cast<const ParquetCType*>(reader->values());
+ auto out_ptr = reinterpret_cast<ArrowCType*>(data->mutable_data());
+ std::copy(values, values + length, out_ptr);
+ *out = std::make_shared<ArrayType<ArrowType>>(
+ type, length, std::move(data), reader->ReleaseIsValid(), reader->null_count());
+ return Status::OK();
+}
+
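+// For columns whose Parquet physical layout already matches the Arrow layout
+// (INT32, INT64, FLOAT, DOUBLE, and non-INT96 timestamps), the record
+// reader's validity and value buffers are handed to the ArrayData directly,
+// with no copy.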
+std::shared_ptr<Array> TransferZeroCopy(RecordReader* reader,
+ const std::shared_ptr<DataType>& type) {
+ std::vector<std::shared_ptr<Buffer>> buffers = {reader->ReleaseIsValid(),
+ reader->ReleaseValues()};
+ auto data = std::make_shared<::arrow::ArrayData>(type, reader->values_written(),
+ buffers, reader->null_count());
+ return ::arrow::MakeArray(data);
+}
+
+Status TransferBool(RecordReader* reader, MemoryPool* pool, Datum* out) {
+ int64_t length = reader->values_written();
+
+ const int64_t buffer_size = BitUtil::BytesForBits(length);
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(buffer_size, pool));
+
+ // Transfer boolean values to packed bitmap
+ auto values = reinterpret_cast<const bool*>(reader->values());
+ uint8_t* data_ptr = data->mutable_data();
+ memset(data_ptr, 0, buffer_size);
+
+ for (int64_t i = 0; i < length; i++) {
+ if (values[i]) {
+ ::arrow::BitUtil::SetBit(data_ptr, i);
+ }
+ }
+
+ *out = std::make_shared<BooleanArray>(length, std::move(data), reader->ReleaseIsValid(),
+ reader->null_count());
+ return Status::OK();
+}
+
+Status TransferInt96(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out,
+ const ::arrow::TimeUnit::type int96_arrow_time_unit) {
+ int64_t length = reader->values_written();
+ auto values = reinterpret_cast<const Int96*>(reader->values());
+ ARROW_ASSIGN_OR_RAISE(auto data,
+ ::arrow::AllocateBuffer(length * sizeof(int64_t), pool));
+ auto data_ptr = reinterpret_cast<int64_t*>(data->mutable_data());
+ for (int64_t i = 0; i < length; i++) {
+ if (values[i].value[2] == 0) {
+ // Happens for null entries: avoid triggering UBSAN as that Int96 timestamp
+ // isn't representable as a 64-bit Unix timestamp.
+ *data_ptr++ = 0;
+ } else {
+ switch (int96_arrow_time_unit) {
+ case ::arrow::TimeUnit::NANO:
+ *data_ptr++ = Int96GetNanoSeconds(values[i]);
+ break;
+ case ::arrow::TimeUnit::MICRO:
+ *data_ptr++ = Int96GetMicroSeconds(values[i]);
+ break;
+ case ::arrow::TimeUnit::MILLI:
+ *data_ptr++ = Int96GetMilliSeconds(values[i]);
+ break;
+ case ::arrow::TimeUnit::SECOND:
+ *data_ptr++ = Int96GetSeconds(values[i]);
+ break;
+ }
+ }
+ }
+ *out = std::make_shared<TimestampArray>(type, length, std::move(data),
+ reader->ReleaseIsValid(), reader->null_count());
+ return Status::OK();
+}
+
+Status TransferDate64(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out) {
+ int64_t length = reader->values_written();
+ auto values = reinterpret_cast<const int32_t*>(reader->values());
+
+ ARROW_ASSIGN_OR_RAISE(auto data,
+ ::arrow::AllocateBuffer(length * sizeof(int64_t), pool));
+ auto out_ptr = reinterpret_cast<int64_t*>(data->mutable_data());
+
+ for (int64_t i = 0; i < length; i++) {
+ *out_ptr++ = static_cast<int64_t>(values[i]) * kMillisecondsPerDay;
+ }
+
+ *out = std::make_shared<::arrow::Date64Array>(
+ type, length, std::move(data), reader->ReleaseIsValid(), reader->null_count());
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Binary, direct to dictionary-encoded
+
+Status TransferDictionary(RecordReader* reader,
+ const std::shared_ptr<DataType>& logical_value_type,
+ std::shared_ptr<ChunkedArray>* out) {
+ auto dict_reader = dynamic_cast<DictionaryRecordReader*>(reader);
+ DCHECK(dict_reader);
+ *out = dict_reader->GetResult();
+ if (!logical_value_type->Equals(*(*out)->type())) {
+ ARROW_ASSIGN_OR_RAISE(*out, (*out)->View(logical_value_type));
+ }
+ return Status::OK();
+}
+
+Status TransferBinary(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& logical_value_type,
+ std::shared_ptr<ChunkedArray>* out) {
+ if (reader->read_dictionary()) {
+ return TransferDictionary(
+ reader, ::arrow::dictionary(::arrow::int32(), logical_value_type), out);
+ }
+ ::arrow::compute::ExecContext ctx(pool);
+ ::arrow::compute::CastOptions cast_options;
+ cast_options.allow_invalid_utf8 = true; // avoid spending time validating UTF8 data
+
+ auto binary_reader = dynamic_cast<BinaryRecordReader*>(reader);
+ DCHECK(binary_reader);
+ auto chunks = binary_reader->GetBuilderChunks();
+ for (auto& chunk : chunks) {
+ if (!chunk->type()->Equals(*logical_value_type)) {
+ // XXX: if a LargeBinary chunk is larger than 2GB, the MSBs of offsets
+ // will be lost because they are first created as int32 and then cast to int64.
+ ARROW_ASSIGN_OR_RAISE(
+ chunk, ::arrow::compute::Cast(*chunk, logical_value_type, cast_options, &ctx));
+ }
+ }
+ *out = std::make_shared<ChunkedArray>(chunks, logical_value_type);
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// INT32 / INT64 / BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY -> Decimal128 || Decimal256
+
+template <typename DecimalType>
+Status RawBytesToDecimalBytes(const uint8_t* value, int32_t byte_width,
+ uint8_t* out_buf) {
+ ARROW_ASSIGN_OR_RAISE(DecimalType t, DecimalType::FromBigEndian(value, byte_width));
+ t.ToBytes(out_buf);
+ return ::arrow::Status::OK();
+}
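+
+// For example, the 2-byte big-endian input {0x01, 0x2C} parses as the integer
+// 300, which ToBytes() then writes out in the decimal type's full-width
+// little-endian two's-complement representation.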
+
+template <typename DecimalArrayType>
+struct DecimalTypeTrait;
+
+template <>
+struct DecimalTypeTrait<::arrow::Decimal128Array> {
+ using value = ::arrow::Decimal128;
+};
+
+template <>
+struct DecimalTypeTrait<::arrow::Decimal256Array> {
+ using value = ::arrow::Decimal256;
+};
+
+template <typename DecimalArrayType, typename ParquetType>
+struct DecimalConverter {
+ static inline Status ConvertToDecimal(const Array& array,
+ const std::shared_ptr<DataType>&,
+ MemoryPool* pool, std::shared_ptr<Array>*) {
+ return Status::NotImplemented("not implemented");
+ }
+};
+
+template <typename DecimalArrayType>
+struct DecimalConverter<DecimalArrayType, FLBAType> {
+ static inline Status ConvertToDecimal(const Array& array,
+ const std::shared_ptr<DataType>& type,
+ MemoryPool* pool, std::shared_ptr<Array>* out) {
+ const auto& fixed_size_binary_array =
+ checked_cast<const ::arrow::FixedSizeBinaryArray&>(array);
+
+ // The byte width of each decimal value
+ const int32_t type_length =
+ checked_cast<const ::arrow::DecimalType&>(*type).byte_width();
+
+ // number of elements in the entire array
+ const int64_t length = fixed_size_binary_array.length();
+
+ // Get the byte width of the values in the FixedSizeBinaryArray. Most of the time
+ // this will be different from the decimal array width because we write the minimum
+ // number of bytes necessary to represent a given precision
+ const int32_t byte_width =
+ checked_cast<const ::arrow::FixedSizeBinaryType&>(*fixed_size_binary_array.type())
+ .byte_width();
+ // allocate memory for the decimal array
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
+
+ // raw bytes that we can write to
+ uint8_t* out_ptr = data->mutable_data();
+
+ // convert each FixedSizeBinary value to valid decimal bytes
+ const int64_t null_count = fixed_size_binary_array.null_count();
+
+ using DecimalType = typename DecimalTypeTrait<DecimalArrayType>::value;
+ if (null_count > 0) {
+ for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
+ if (!fixed_size_binary_array.IsNull(i)) {
+ RETURN_NOT_OK(RawBytesToDecimalBytes<DecimalType>(
+ fixed_size_binary_array.GetValue(i), byte_width, out_ptr));
+ } else {
+ std::memset(out_ptr, 0, type_length);
+ }
+ }
+ } else {
+ for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
+ RETURN_NOT_OK(RawBytesToDecimalBytes<DecimalType>(
+ fixed_size_binary_array.GetValue(i), byte_width, out_ptr));
+ }
+ }
+
+ *out = std::make_shared<DecimalArrayType>(
+ type, length, std::move(data), fixed_size_binary_array.null_bitmap(), null_count);
+
+ return Status::OK();
+ }
+};
+
+template <typename DecimalArrayType>
+struct DecimalConverter<DecimalArrayType, ByteArrayType> {
+ static inline Status ConvertToDecimal(const Array& array,
+ const std::shared_ptr<DataType>& type,
+ MemoryPool* pool, std::shared_ptr<Array>* out) {
+ const auto& binary_array = checked_cast<const ::arrow::BinaryArray&>(array);
+ const int64_t length = binary_array.length();
+
+ const auto& decimal_type = checked_cast<const ::arrow::DecimalType&>(*type);
+ const int64_t type_length = decimal_type.byte_width();
+
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
+
+ // raw bytes that we can write to
+ uint8_t* out_ptr = data->mutable_data();
+
+ const int64_t null_count = binary_array.null_count();
+
+ // convert each BinaryArray value to valid decimal bytes
+ for (int64_t i = 0; i < length; i++, out_ptr += type_length) {
+ int32_t record_len = 0;
+ const uint8_t* record_loc = binary_array.GetValue(i, &record_len);
+
+ if (record_len < 0 || record_len > type_length) {
+ return Status::Invalid("Invalid BYTE_ARRAY length for ", type->ToString());
+ }
+
+      // Zero the full output width so that null slots hold well-defined bytes
+      // (two uint64 stores would cover only 16 of a Decimal256's 32 bytes).
+      std::memset(out_ptr, 0, type_length);
+
+      // Convert only the rows that are not null when nulls are present;
+      // otherwise convert every row.
+ if ((null_count > 0 && !binary_array.IsNull(i)) || null_count <= 0) {
+ using DecimalType = typename DecimalTypeTrait<DecimalArrayType>::value;
+ RETURN_NOT_OK(
+ RawBytesToDecimalBytes<DecimalType>(record_loc, record_len, out_ptr));
+ }
+ }
+ *out = std::make_shared<DecimalArrayType>(type, length, std::move(data),
+ binary_array.null_bitmap(), null_count);
+ return Status::OK();
+ }
+};
+
+/// \brief Convert an Int32 or Int64 array into a Decimal128Array
+/// The Parquet spec allows systems to write decimals as int32 or int64 when the
+/// values are small enough to fit in 4 bytes or 8 bytes, respectively.
+/// This function implements the conversion from int32 and int64 arrays to decimal arrays.
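+/// (Per the Parquet format spec, INT32 can back decimals of precision <= 9 and
+/// INT64 decimals of precision <= 18.)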
+template <
+ typename ParquetIntegerType,
+ typename = ::arrow::enable_if_t<std::is_same<ParquetIntegerType, Int32Type>::value ||
+ std::is_same<ParquetIntegerType, Int64Type>::value>>
+static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out) {
+ // Decimal128 and Decimal256 are only Arrow constructs. Parquet does not
+ // specifically distinguish between decimal byte widths.
+ // Decimal256 isn't relevant here because the Arrow-Parquet C++ bindings never
+ // write Decimal values as integers and if the decimal value can fit in an
+ // integer it is wasteful to use Decimal256. Put another way, the only
+ // way an integer column could be construed as Decimal256 is if an arrow
+ // schema was stored as metadata in the file indicating the column was
+ // Decimal256. The current Arrow-Parquet C++ bindings will never do this.
+ DCHECK(type->id() == ::arrow::Type::DECIMAL128);
+
+ const int64_t length = reader->values_written();
+
+ using ElementType = typename ParquetIntegerType::c_type;
+ static_assert(std::is_same<ElementType, int32_t>::value ||
+ std::is_same<ElementType, int64_t>::value,
+ "ElementType must be int32_t or int64_t");
+
+ const auto values = reinterpret_cast<const ElementType*>(reader->values());
+
+ const auto& decimal_type = checked_cast<const ::arrow::DecimalType&>(*type);
+ const int64_t type_length = decimal_type.byte_width();
+
+ ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(length * type_length, pool));
+ uint8_t* out_ptr = data->mutable_data();
+
+ using ::arrow::BitUtil::FromLittleEndian;
+
+ for (int64_t i = 0; i < length; ++i, out_ptr += type_length) {
+ // sign/zero extend int32_t values, otherwise a no-op
+ const auto value = static_cast<int64_t>(values[i]);
+
+ ::arrow::Decimal128 decimal(value);
+ decimal.ToBytes(out_ptr);
+ }
+
+ if (reader->nullable_values()) {
+ std::shared_ptr<ResizableBuffer> is_valid = reader->ReleaseIsValid();
+ *out = std::make_shared<Decimal128Array>(type, length, std::move(data), is_valid,
+ reader->null_count());
+ } else {
+ *out = std::make_shared<Decimal128Array>(type, length, std::move(data));
+ }
+ return Status::OK();
+}
+
+/// \brief Convert an arrow::BinaryArray to an arrow::Decimal{128,256}Array
+/// We do this by:
+/// 1. Creating an arrow::BinaryArray from the RecordReader's builder
+/// 2. Allocating a buffer for the arrow::Decimal{128,256}Array
+/// 3. Converting the big-endian bytes in each BinaryArray entry to two integers
+/// representing the high and low bits of each decimal value.
+template <typename DecimalArrayType, typename ParquetType>
+Status TransferDecimal(RecordReader* reader, MemoryPool* pool,
+ const std::shared_ptr<DataType>& type, Datum* out) {
+ auto binary_reader = dynamic_cast<BinaryRecordReader*>(reader);
+ DCHECK(binary_reader);
+ ::arrow::ArrayVector chunks = binary_reader->GetBuilderChunks();
+ for (size_t i = 0; i < chunks.size(); ++i) {
+ std::shared_ptr<Array> chunk_as_decimal;
+ auto fn = &DecimalConverter<DecimalArrayType, ParquetType>::ConvertToDecimal;
+ RETURN_NOT_OK(fn(*chunks[i], type, pool, &chunk_as_decimal));
+ // Replace the chunk, which will hopefully also free memory as we go
+ chunks[i] = chunk_as_decimal;
+ }
+ *out = std::make_shared<ChunkedArray>(chunks, type);
+ return Status::OK();
+}
+
+} // namespace
+
+#define TRANSFER_INT32(ENUM, ArrowType) \
+ case ::arrow::Type::ENUM: { \
+ Status s = TransferInt<ArrowType, Int32Type>(reader, pool, value_type, &result); \
+ RETURN_NOT_OK(s); \
+ } break;
+
+#define TRANSFER_INT64(ENUM, ArrowType) \
+ case ::arrow::Type::ENUM: { \
+ Status s = TransferInt<ArrowType, Int64Type>(reader, pool, value_type, &result); \
+ RETURN_NOT_OK(s); \
+ } break;
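+
+// Each TRANSFER_INT32 / TRANSFER_INT64 use below expands to a `case` label in
+// TransferColumnData's switch, copying from the Parquet storage integer
+// (int32/int64) into the requested Arrow integer type.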
+
+Status TransferColumnData(RecordReader* reader, std::shared_ptr<DataType> value_type,
+ const ColumnDescriptor* descr, MemoryPool* pool,
+ std::shared_ptr<ChunkedArray>* out) {
+ Datum result;
+ std::shared_ptr<ChunkedArray> chunked_result;
+ switch (value_type->id()) {
+ case ::arrow::Type::DICTIONARY: {
+ RETURN_NOT_OK(TransferDictionary(reader, value_type, &chunked_result));
+ result = chunked_result;
+ } break;
+ case ::arrow::Type::NA: {
+ result = std::make_shared<::arrow::NullArray>(reader->values_written());
+ break;
+ }
+ case ::arrow::Type::INT32:
+ case ::arrow::Type::INT64:
+ case ::arrow::Type::FLOAT:
+ case ::arrow::Type::DOUBLE:
+ result = TransferZeroCopy(reader, value_type);
+ break;
+ case ::arrow::Type::BOOL:
+ RETURN_NOT_OK(TransferBool(reader, pool, &result));
+ break;
+ TRANSFER_INT32(UINT8, ::arrow::UInt8Type);
+ TRANSFER_INT32(INT8, ::arrow::Int8Type);
+ TRANSFER_INT32(UINT16, ::arrow::UInt16Type);
+ TRANSFER_INT32(INT16, ::arrow::Int16Type);
+ TRANSFER_INT32(UINT32, ::arrow::UInt32Type);
+ TRANSFER_INT64(UINT64, ::arrow::UInt64Type);
+ TRANSFER_INT32(DATE32, ::arrow::Date32Type);
+ TRANSFER_INT32(TIME32, ::arrow::Time32Type);
+ TRANSFER_INT64(TIME64, ::arrow::Time64Type);
+ case ::arrow::Type::DATE64:
+ RETURN_NOT_OK(TransferDate64(reader, pool, value_type, &result));
+ break;
+ case ::arrow::Type::FIXED_SIZE_BINARY:
+ case ::arrow::Type::BINARY:
+ case ::arrow::Type::STRING:
+ case ::arrow::Type::LARGE_BINARY:
+ case ::arrow::Type::LARGE_STRING: {
+ RETURN_NOT_OK(TransferBinary(reader, pool, value_type, &chunked_result));
+ result = chunked_result;
+ } break;
+ case ::arrow::Type::DECIMAL128: {
+ switch (descr->physical_type()) {
+ case ::parquet::Type::INT32: {
+ auto fn = DecimalIntegerTransfer<Int32Type>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ case ::parquet::Type::INT64: {
+ auto fn = &DecimalIntegerTransfer<Int64Type>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ case ::parquet::Type::BYTE_ARRAY: {
+ auto fn = &TransferDecimal<Decimal128Array, ByteArrayType>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
+ auto fn = &TransferDecimal<Decimal128Array, FLBAType>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ default:
+ return Status::Invalid(
+ "Physical type for decimal128 must be int32, int64, byte array, or fixed "
+ "length binary");
+ }
+ } break;
+ case ::arrow::Type::DECIMAL256:
+ switch (descr->physical_type()) {
+ case ::parquet::Type::BYTE_ARRAY: {
+ auto fn = &TransferDecimal<Decimal256Array, ByteArrayType>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: {
+ auto fn = &TransferDecimal<Decimal256Array, FLBAType>;
+ RETURN_NOT_OK(fn(reader, pool, value_type, &result));
+ } break;
+ default:
+ return Status::Invalid(
+ "Physical type for decimal256 must be fixed length binary");
+ }
+ break;
+
+ case ::arrow::Type::TIMESTAMP: {
+ const ::arrow::TimestampType& timestamp_type =
+ checked_cast<::arrow::TimestampType&>(*value_type);
+ if (descr->physical_type() == ::parquet::Type::INT96) {
+ RETURN_NOT_OK(
+ TransferInt96(reader, pool, value_type, &result, timestamp_type.unit()));
+ } else {
+ switch (timestamp_type.unit()) {
+ case ::arrow::TimeUnit::MILLI:
+ case ::arrow::TimeUnit::MICRO:
+ case ::arrow::TimeUnit::NANO:
+ result = TransferZeroCopy(reader, value_type);
+ break;
+ default:
+ return Status::NotImplemented("TimeUnit not supported");
+ }
+ }
+ } break;
+ default:
+ return Status::NotImplemented("No support for reading columns of type ",
+ value_type->ToString());
+ }
+
+ if (result.kind() == Datum::ARRAY) {
+ *out = std::make_shared<ChunkedArray>(result.make_array());
+ } else if (result.kind() == Datum::CHUNKED_ARRAY) {
+ *out = result.chunked_array();
+ } else {
+ DCHECK(false) << "Should be impossible, result was " << result.ToString();
+ }
+
+ return Status::OK();
+}
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h
index cd54e499aa5..ad0b781576f 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/reader_internal.h
@@ -1,122 +1,122 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <cstdint>
-#include <deque>
-#include <functional>
-#include <memory>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "parquet/arrow/schema.h"
-#include "parquet/column_reader.h"
-#include "parquet/file_reader.h"
-#include "parquet/metadata.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-
-namespace arrow {
-
-class Array;
-class ChunkedArray;
-class DataType;
-class Field;
-class KeyValueMetadata;
-class Schema;
-
-} // namespace arrow
-
-using arrow::Status;
-
-namespace parquet {
-
-class ArrowReaderProperties;
-
-namespace arrow {
-
-class ColumnReaderImpl;
-
-// ----------------------------------------------------------------------
-// Iteration utilities
-
-// Abstraction to decouple row group iteration details from the ColumnReader,
-// so we can read only a single row group if we want
-class FileColumnIterator {
- public:
- explicit FileColumnIterator(int column_index, ParquetFileReader* reader,
- std::vector<int> row_groups)
- : column_index_(column_index),
- reader_(reader),
- schema_(reader->metadata()->schema()),
- row_groups_(row_groups.begin(), row_groups.end()) {}
-
- virtual ~FileColumnIterator() {}
-
- std::unique_ptr<::parquet::PageReader> NextChunk() {
- if (row_groups_.empty()) {
- return nullptr;
- }
-
- auto row_group_reader = reader_->RowGroup(row_groups_.front());
- row_groups_.pop_front();
- return row_group_reader->GetColumnPageReader(column_index_);
- }
-
- const SchemaDescriptor* schema() const { return schema_; }
-
- const ColumnDescriptor* descr() const { return schema_->Column(column_index_); }
-
- std::shared_ptr<FileMetaData> metadata() const { return reader_->metadata(); }
-
- int column_index() const { return column_index_; }
-
- protected:
- int column_index_;
- ParquetFileReader* reader_;
- const SchemaDescriptor* schema_;
- std::deque<int> row_groups_;
-};
-
-using FileColumnIteratorFactory =
- std::function<FileColumnIterator*(int, ParquetFileReader*)>;
-
-Status TransferColumnData(::parquet::internal::RecordReader* reader,
- std::shared_ptr<::arrow::DataType> value_type,
- const ColumnDescriptor* descr, ::arrow::MemoryPool* pool,
- std::shared_ptr<::arrow::ChunkedArray>* out);
-
-struct ReaderContext {
- ParquetFileReader* reader;
- ::arrow::MemoryPool* pool;
- FileColumnIteratorFactory iterator_factory;
- bool filter_leaves;
- std::shared_ptr<std::unordered_set<int>> included_leaves;
-
- bool IncludesLeaf(int leaf_index) const {
- if (this->filter_leaves) {
- return this->included_leaves->find(leaf_index) != this->included_leaves->end();
- }
- return true;
- }
-};
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "parquet/arrow/schema.h"
+#include "parquet/column_reader.h"
+#include "parquet/file_reader.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class DataType;
+class Field;
+class KeyValueMetadata;
+class Schema;
+
+} // namespace arrow
+
+using arrow::Status;
+
+namespace parquet {
+
+class ArrowReaderProperties;
+
+namespace arrow {
+
+class ColumnReaderImpl;
+
+// ----------------------------------------------------------------------
+// Iteration utilities
+
+// Abstraction to decouple row group iteration details from the ColumnReader,
+// so we can read only a single row group if we want
+class FileColumnIterator {
+ public:
+ explicit FileColumnIterator(int column_index, ParquetFileReader* reader,
+ std::vector<int> row_groups)
+ : column_index_(column_index),
+ reader_(reader),
+ schema_(reader->metadata()->schema()),
+ row_groups_(row_groups.begin(), row_groups.end()) {}
+
+ virtual ~FileColumnIterator() {}
+
+ std::unique_ptr<::parquet::PageReader> NextChunk() {
+ if (row_groups_.empty()) {
+ return nullptr;
+ }
+
+ auto row_group_reader = reader_->RowGroup(row_groups_.front());
+ row_groups_.pop_front();
+ return row_group_reader->GetColumnPageReader(column_index_);
+ }
+
+ const SchemaDescriptor* schema() const { return schema_; }
+
+ const ColumnDescriptor* descr() const { return schema_->Column(column_index_); }
+
+ std::shared_ptr<FileMetaData> metadata() const { return reader_->metadata(); }
+
+ int column_index() const { return column_index_; }
+
+ protected:
+ int column_index_;
+ ParquetFileReader* reader_;
+ const SchemaDescriptor* schema_;
+ std::deque<int> row_groups_;
+};
+
+using FileColumnIteratorFactory =
+ std::function<FileColumnIterator*(int, ParquetFileReader*)>;
+
+Status TransferColumnData(::parquet::internal::RecordReader* reader,
+ std::shared_ptr<::arrow::DataType> value_type,
+ const ColumnDescriptor* descr, ::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::ChunkedArray>* out);
+
+struct ReaderContext {
+ ParquetFileReader* reader;
+ ::arrow::MemoryPool* pool;
+ FileColumnIteratorFactory iterator_factory;
+ bool filter_leaves;
+ std::shared_ptr<std::unordered_set<int>> included_leaves;
+
+ bool IncludesLeaf(int leaf_index) const {
+ if (this->filter_leaves) {
+ return this->included_leaves->find(leaf_index) != this->included_leaves->end();
+ }
+ return true;
+ }
+};
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc
index 454b0e2289a..eb7fd628dfc 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.cc
@@ -1,1087 +1,1087 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/arrow/schema.h"
-
-#include <functional>
-#include <string>
-#include <vector>
-
-#include "arrow/extension_type.h"
-#include "arrow/io/memory.h"
-#include "arrow/ipc/api.h"
-#include "arrow/result_internal.h"
-#include "arrow/type.h"
-#include "arrow/util/base64.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/value_parsing.h"
-
-#include "parquet/arrow/schema_internal.h"
-#include "parquet/exception.h"
-#include "parquet/properties.h"
-#include "parquet/types.h"
-
-using arrow::DecimalType;
-using arrow::Field;
-using arrow::FieldVector;
-using arrow::KeyValueMetadata;
-using arrow::Status;
-using arrow::internal::checked_cast;
-
-using ArrowType = arrow::DataType;
-using ArrowTypeId = arrow::Type;
-
-using parquet::Repetition;
-using parquet::schema::GroupNode;
-using parquet::schema::Node;
-using parquet::schema::NodePtr;
-using parquet::schema::PrimitiveNode;
-
-using ParquetType = parquet::Type;
-using parquet::ConvertedType;
-using parquet::LogicalType;
-
-using parquet::internal::LevelInfo;
-
-namespace parquet {
-
-namespace arrow {
-
-// ----------------------------------------------------------------------
-// Parquet to Arrow schema conversion
-
-namespace {
-
-Repetition::type RepetitionFromNullable(bool is_nullable) {
- return is_nullable ? Repetition::OPTIONAL : Repetition::REQUIRED;
-}
-
-Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties, NodePtr* out);
-
-Status ListToNode(const std::shared_ptr<::arrow::BaseListType>& type,
- const std::string& name, bool nullable,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties, NodePtr* out) {
- NodePtr element;
- std::string value_name =
- arrow_properties.compliant_nested_types() ? "element" : type->value_field()->name();
- RETURN_NOT_OK(FieldToNode(value_name, type->value_field(), properties, arrow_properties,
- &element));
-
- NodePtr list = GroupNode::Make("list", Repetition::REPEATED, {element});
- *out = GroupNode::Make(name, RepetitionFromNullable(nullable), {list},
- LogicalType::List());
- return Status::OK();
-}
-
-Status MapToNode(const std::shared_ptr<::arrow::MapType>& type, const std::string& name,
- bool nullable, const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties, NodePtr* out) {
- // TODO: Should we offer a non-compliant mode that forwards the type names?
- NodePtr key_node;
- RETURN_NOT_OK(
- FieldToNode("key", type->key_field(), properties, arrow_properties, &key_node));
-
- NodePtr value_node;
- RETURN_NOT_OK(FieldToNode("value", type->item_field(), properties, arrow_properties,
- &value_node));
-
- NodePtr key_value =
- GroupNode::Make("key_value", Repetition::REPEATED, {key_node, value_node});
- *out = GroupNode::Make(name, RepetitionFromNullable(nullable), {key_value},
- LogicalType::Map());
- return Status::OK();
-}
-
-Status StructToNode(const std::shared_ptr<::arrow::StructType>& type,
- const std::string& name, bool nullable,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties, NodePtr* out) {
- std::vector<NodePtr> children(type->num_fields());
- if (type->num_fields() != 0) {
- for (int i = 0; i < type->num_fields(); i++) {
- RETURN_NOT_OK(FieldToNode(type->field(i)->name(), type->field(i), properties,
- arrow_properties, &children[i]));
- }
- } else {
- // XXX (ARROW-10928) We could add a dummy primitive node but that would
- // require special handling when writing and reading, to avoid column index
- // mismatches.
- return Status::NotImplemented("Cannot write struct type '", name,
- "' with no child field to Parquet. "
- "Consider adding a dummy child field.");
- }
-
- *out = GroupNode::Make(name, RepetitionFromNullable(nullable), std::move(children));
- return Status::OK();
-}
-
-static std::shared_ptr<const LogicalType> TimestampLogicalTypeFromArrowTimestamp(
- const ::arrow::TimestampType& timestamp_type, ::arrow::TimeUnit::type time_unit) {
- const bool utc = !(timestamp_type.timezone().empty());
- // ARROW-5878(wesm): for forward compatibility reasons, and because
- // there's no other way to signal to old readers that values are
- // timestamps, we force the ConvertedType field to be set to the
- // corresponding TIMESTAMP_* value. This does cause some ambiguity
- // as Parquet readers have not been consistent about the
- // interpretation of TIMESTAMP_* values as being UTC-normalized.
- switch (time_unit) {
- case ::arrow::TimeUnit::MILLI:
- return LogicalType::Timestamp(utc, LogicalType::TimeUnit::MILLIS,
- /*is_from_converted_type=*/false,
- /*force_set_converted_type=*/true);
- case ::arrow::TimeUnit::MICRO:
- return LogicalType::Timestamp(utc, LogicalType::TimeUnit::MICROS,
- /*is_from_converted_type=*/false,
- /*force_set_converted_type=*/true);
- case ::arrow::TimeUnit::NANO:
- return LogicalType::Timestamp(utc, LogicalType::TimeUnit::NANOS);
- case ::arrow::TimeUnit::SECOND:
- // No equivalent parquet logical type.
- break;
- }
- return LogicalType::None();
-}
-
-static Status GetTimestampMetadata(const ::arrow::TimestampType& type,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties,
- ParquetType::type* physical_type,
- std::shared_ptr<const LogicalType>* logical_type) {
- const bool coerce = arrow_properties.coerce_timestamps_enabled();
- const auto target_unit =
- coerce ? arrow_properties.coerce_timestamps_unit() : type.unit();
-
- // The user is explicitly asking for Impala int96 encoding, there is no
- // logical type.
- if (arrow_properties.support_deprecated_int96_timestamps()) {
- *physical_type = ParquetType::INT96;
- return Status::OK();
- }
-
- *physical_type = ParquetType::INT64;
- *logical_type = TimestampLogicalTypeFromArrowTimestamp(type, target_unit);
-
- // The user is explicitly asking for timestamp data to be converted to the
- // specified units (target_unit).
- if (coerce) {
- if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) {
- switch (target_unit) {
- case ::arrow::TimeUnit::MILLI:
- case ::arrow::TimeUnit::MICRO:
- break;
- case ::arrow::TimeUnit::NANO:
- case ::arrow::TimeUnit::SECOND:
- return Status::NotImplemented(
- "For Parquet version 1.0 files, can only coerce Arrow timestamps to "
- "milliseconds or microseconds");
- }
- } else {
- switch (target_unit) {
- case ::arrow::TimeUnit::MILLI:
- case ::arrow::TimeUnit::MICRO:
- case ::arrow::TimeUnit::NANO:
- break;
- case ::arrow::TimeUnit::SECOND:
- return Status::NotImplemented(
- "For Parquet files, can only coerce Arrow timestamps to milliseconds, "
- "microseconds, or nanoseconds");
- }
- }
- return Status::OK();
- }
-
- // The user implicitly wants timestamp data to retain its original time units,
- // however the ConvertedType field used to indicate logical types for Parquet
- // version 1.0 fields does not allow for nanosecond time units and so nanoseconds
- // must be coerced to microseconds.
- if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0 &&
- type.unit() == ::arrow::TimeUnit::NANO) {
- *logical_type =
- TimestampLogicalTypeFromArrowTimestamp(type, ::arrow::TimeUnit::MICRO);
- return Status::OK();
- }
-
- // The user implicitly wants timestamp data to retain its original time units,
- // however the Arrow seconds time unit can not be represented (annotated) in
- // any version of Parquet and so must be coerced to milliseconds.
- if (type.unit() == ::arrow::TimeUnit::SECOND) {
- *logical_type =
- TimestampLogicalTypeFromArrowTimestamp(type, ::arrow::TimeUnit::MILLI);
- return Status::OK();
- }
-
- return Status::OK();
-}
-
-static constexpr char FIELD_ID_KEY[] = "PARQUET:field_id";
-
-std::shared_ptr<::arrow::KeyValueMetadata> FieldIdMetadata(int field_id) {
- if (field_id >= 0) {
- return ::arrow::key_value_metadata({FIELD_ID_KEY}, {std::to_string(field_id)});
- } else {
- return nullptr;
- }
-}
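-
-// For example, FieldIdMetadata(7) yields {"PARQUET:field_id": "7"}, while a
-// negative id (meaning "unset") yields no metadata at all.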
-
-int FieldIdFromMetadata(
- const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
- if (!metadata) {
- return -1;
- }
- int key = metadata->FindKey(FIELD_ID_KEY);
- if (key < 0) {
- return -1;
- }
- std::string field_id_str = metadata->value(key);
- int field_id;
- if (::arrow::internal::ParseValue<::arrow::Int32Type>(
- field_id_str.c_str(), field_id_str.length(), &field_id)) {
- if (field_id < 0) {
-      // Thrift should convert any negative value to null, but normalize to -1
-      // here in case downstream logic checks the value.
- return -1;
- }
- return field_id;
- } else {
- return -1;
- }
-}
-
-Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties, NodePtr* out) {
- std::shared_ptr<const LogicalType> logical_type = LogicalType::None();
- ParquetType::type type;
- Repetition::type repetition = RepetitionFromNullable(field->nullable());
-
- int length = -1;
- int precision = -1;
- int scale = -1;
-
- switch (field->type()->id()) {
- case ArrowTypeId::NA: {
- type = ParquetType::INT32;
- logical_type = LogicalType::Null();
- if (repetition != Repetition::OPTIONAL) {
- return Status::Invalid("NullType Arrow field must be nullable");
- }
- } break;
- case ArrowTypeId::BOOL:
- type = ParquetType::BOOLEAN;
- break;
- case ArrowTypeId::UINT8:
- type = ParquetType::INT32;
- logical_type = LogicalType::Int(8, false);
- break;
- case ArrowTypeId::INT8:
- type = ParquetType::INT32;
- logical_type = LogicalType::Int(8, true);
- break;
- case ArrowTypeId::UINT16:
- type = ParquetType::INT32;
- logical_type = LogicalType::Int(16, false);
- break;
- case ArrowTypeId::INT16:
- type = ParquetType::INT32;
- logical_type = LogicalType::Int(16, true);
- break;
- case ArrowTypeId::UINT32:
- if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) {
- type = ParquetType::INT64;
- } else {
- type = ParquetType::INT32;
- logical_type = LogicalType::Int(32, false);
- }
- break;
- case ArrowTypeId::INT32:
- type = ParquetType::INT32;
- break;
- case ArrowTypeId::UINT64:
- type = ParquetType::INT64;
- logical_type = LogicalType::Int(64, false);
- break;
- case ArrowTypeId::INT64:
- type = ParquetType::INT64;
- break;
- case ArrowTypeId::FLOAT:
- type = ParquetType::FLOAT;
- break;
- case ArrowTypeId::DOUBLE:
- type = ParquetType::DOUBLE;
- break;
- case ArrowTypeId::LARGE_STRING:
- case ArrowTypeId::STRING:
- type = ParquetType::BYTE_ARRAY;
- logical_type = LogicalType::String();
- break;
- case ArrowTypeId::LARGE_BINARY:
- case ArrowTypeId::BINARY:
- type = ParquetType::BYTE_ARRAY;
- break;
- case ArrowTypeId::FIXED_SIZE_BINARY: {
- type = ParquetType::FIXED_LEN_BYTE_ARRAY;
- const auto& fixed_size_binary_type =
- static_cast<const ::arrow::FixedSizeBinaryType&>(*field->type());
- length = fixed_size_binary_type.byte_width();
- } break;
- case ArrowTypeId::DECIMAL128:
- case ArrowTypeId::DECIMAL256: {
- type = ParquetType::FIXED_LEN_BYTE_ARRAY;
- const auto& decimal_type = static_cast<const ::arrow::DecimalType&>(*field->type());
- precision = decimal_type.precision();
- scale = decimal_type.scale();
- length = DecimalType::DecimalSize(precision);
- PARQUET_CATCH_NOT_OK(logical_type = LogicalType::Decimal(precision, scale));
- } break;
- case ArrowTypeId::DATE32:
- type = ParquetType::INT32;
- logical_type = LogicalType::Date();
- break;
- case ArrowTypeId::DATE64:
- type = ParquetType::INT32;
- logical_type = LogicalType::Date();
- break;
- case ArrowTypeId::TIMESTAMP:
- RETURN_NOT_OK(
- GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()),
- properties, arrow_properties, &type, &logical_type));
- break;
- case ArrowTypeId::TIME32:
- type = ParquetType::INT32;
- logical_type =
- LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::MILLIS);
- break;
- case ArrowTypeId::TIME64: {
- type = ParquetType::INT64;
- auto time_type = static_cast<::arrow::Time64Type*>(field->type().get());
- if (time_type->unit() == ::arrow::TimeUnit::NANO) {
- logical_type =
- LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::NANOS);
- } else {
- logical_type =
- LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::MICROS);
- }
- } break;
- case ArrowTypeId::STRUCT: {
- auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type());
- return StructToNode(struct_type, name, field->nullable(), properties,
- arrow_properties, out);
- }
- case ArrowTypeId::FIXED_SIZE_LIST:
- case ArrowTypeId::LARGE_LIST:
- case ArrowTypeId::LIST: {
- auto list_type = std::static_pointer_cast<::arrow::BaseListType>(field->type());
- return ListToNode(list_type, name, field->nullable(), properties, arrow_properties,
- out);
- }
- case ArrowTypeId::DICTIONARY: {
- // Parquet has no Dictionary type, dictionary-encoded is handled on
- // the encoding, not the schema level.
- const ::arrow::DictionaryType& dict_type =
- static_cast<const ::arrow::DictionaryType&>(*field->type());
- std::shared_ptr<::arrow::Field> unpacked_field = ::arrow::field(
- name, dict_type.value_type(), field->nullable(), field->metadata());
- return FieldToNode(name, unpacked_field, properties, arrow_properties, out);
- }
- case ArrowTypeId::EXTENSION: {
- auto ext_type = std::static_pointer_cast<::arrow::ExtensionType>(field->type());
- std::shared_ptr<::arrow::Field> storage_field = ::arrow::field(
- name, ext_type->storage_type(), field->nullable(), field->metadata());
- return FieldToNode(name, storage_field, properties, arrow_properties, out);
- }
- case ArrowTypeId::MAP: {
- auto map_type = std::static_pointer_cast<::arrow::MapType>(field->type());
- return MapToNode(map_type, name, field->nullable(), properties, arrow_properties,
- out);
- }
-
- default: {
-      // TODO: DENSE_UNION, SPARSE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR
- return Status::NotImplemented(
- "Unhandled type for Arrow to Parquet schema conversion: ",
- field->type()->ToString());
- }
- }
-
- int field_id = FieldIdFromMetadata(field->metadata());
- PARQUET_CATCH_NOT_OK(*out = PrimitiveNode::Make(name, repetition, logical_type, type,
- length, field_id));
-
- return Status::OK();
-}
-
-struct SchemaTreeContext {
- SchemaManifest* manifest;
- ArrowReaderProperties properties;
- const SchemaDescriptor* schema;
-
- void LinkParent(const SchemaField* child, const SchemaField* parent) {
- manifest->child_to_parent[child] = parent;
- }
-
- void RecordLeaf(const SchemaField* leaf) {
- manifest->column_index_to_field[leaf->column_index] = leaf;
- }
-};
-
-bool IsDictionaryReadSupported(const ArrowType& type) {
- // Only supported currently for BYTE_ARRAY types
- return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING;
-}
-
-// ----------------------------------------------------------------------
-// Schema logic
-
-::arrow::Result<std::shared_ptr<ArrowType>> GetTypeForNode(
- int column_index, const schema::PrimitiveNode& primitive_node,
- SchemaTreeContext* ctx) {
- ASSIGN_OR_RAISE(
- std::shared_ptr<ArrowType> storage_type,
- GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit()));
- if (ctx->properties.read_dictionary(column_index) &&
- IsDictionaryReadSupported(*storage_type)) {
- return ::arrow::dictionary(::arrow::int32(), storage_type);
- }
- return storage_type;
-}
-
-Status NodeToSchemaField(const Node& node, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out);
-
-Status GroupToSchemaField(const GroupNode& node, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out);
-
-Status PopulateLeaf(int column_index, const std::shared_ptr<Field>& field,
- LevelInfo current_levels, SchemaTreeContext* ctx,
- const SchemaField* parent, SchemaField* out) {
- out->field = field;
- out->column_index = column_index;
- out->level_info = current_levels;
- ctx->RecordLeaf(out);
- ctx->LinkParent(out, parent);
- return Status::OK();
-}
-
-// Special case mentioned in the format spec:
-// If the name is array or ends in _tuple, this should be a list of struct
-// even for single child elements.
-bool HasStructListName(const GroupNode& node) {
- ::arrow::util::string_view name{node.name()};
- return name == "array" || name.ends_with("_tuple");
-}
-
-Status GroupToStruct(const GroupNode& node, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out) {
- std::vector<std::shared_ptr<Field>> arrow_fields;
- out->children.resize(node.field_count());
- // All level increments for the node are expected to happen by callers.
-  // This is required because repeated elements need to have their own
- // SchemaField.
-
- for (int i = 0; i < node.field_count(); i++) {
- RETURN_NOT_OK(
- NodeToSchemaField(*node.field(i), current_levels, ctx, out, &out->children[i]));
- arrow_fields.push_back(out->children[i].field);
- }
- auto struct_type = ::arrow::struct_(arrow_fields);
- out->field = ::arrow::field(node.name(), struct_type, node.is_optional(),
- FieldIdMetadata(node.field_id()));
- out->level_info = current_levels;
- return Status::OK();
-}
-
-Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out);
-
-Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out) {
- if (group.field_count() != 1) {
- return Status::Invalid("MAP-annotated groups must have a single child.");
- }
- if (group.is_repeated()) {
- return Status::Invalid("MAP-annotated groups must not be repeated.");
- }
-
- const Node& key_value_node = *group.field(0);
-
- if (!key_value_node.is_repeated()) {
- return Status::Invalid(
- "Non-repeated key value in a MAP-annotated group are not supported.");
- }
-
- if (!key_value_node.is_group()) {
- return Status::Invalid("Key-value node must be a group.");
- }
-
- const GroupNode& key_value = checked_cast<const GroupNode&>(key_value_node);
- if (key_value.field_count() != 1 && key_value.field_count() != 2) {
- return Status::Invalid("Key-value map node must have 1 or 2 child elements. Found: ",
- key_value.field_count());
- }
- const Node& key_node = *key_value.field(0);
- if (!key_node.is_required()) {
- return Status::Invalid("Map keys must be annotated as required.");
- }
-  // Arrow doesn't support one-column maps (i.e. sets). The options are either
-  // to make the values column nullable or to process the map as a list. We
-  // choose the latter as it is simpler.
- if (key_value.field_count() == 1) {
- return ListToSchemaField(group, current_levels, ctx, parent, out);
- }
-
- current_levels.Increment(group);
- int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
-
- out->children.resize(1);
- SchemaField* key_value_field = &out->children[0];
-
- key_value_field->children.resize(2);
- SchemaField* key_field = &key_value_field->children[0];
- SchemaField* value_field = &key_value_field->children[1];
-
- ctx->LinkParent(out, parent);
- ctx->LinkParent(key_value_field, out);
- ctx->LinkParent(key_field, key_value_field);
- ctx->LinkParent(value_field, key_value_field);
-
- // required/optional group name=whatever {
-  //   repeated group name=key_values {
- // required TYPE key;
- // required/optional TYPE value;
- // }
- // }
- //
-
- RETURN_NOT_OK(NodeToSchemaField(*key_value.field(0), current_levels, ctx,
- key_value_field, key_field));
- RETURN_NOT_OK(NodeToSchemaField(*key_value.field(1), current_levels, ctx,
- key_value_field, value_field));
-
- key_value_field->field = ::arrow::field(
- group.name(), ::arrow::struct_({key_field->field, value_field->field}),
- /*nullable=*/false, FieldIdMetadata(key_value.field_id()));
- key_value_field->level_info = current_levels;
-
- out->field = ::arrow::field(group.name(),
- ::arrow::map(key_field->field->type(), value_field->field),
- group.is_optional(), FieldIdMetadata(group.field_id()));
- out->level_info = current_levels;
-  // At this point current_levels contains the def level for this list,
- // we need to reset to the prior parent.
- out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
- return Status::OK();
-}
-
-Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out) {
- if (group.field_count() != 1) {
- return Status::Invalid("LIST-annotated groups must have a single child.");
- }
- if (group.is_repeated()) {
- return Status::Invalid("LIST-annotated groups must not be repeated.");
- }
- current_levels.Increment(group);
-
- out->children.resize(group.field_count());
- SchemaField* child_field = &out->children[0];
-
- ctx->LinkParent(out, parent);
- ctx->LinkParent(child_field, out);
-
- const Node& list_node = *group.field(0);
-
- if (!list_node.is_repeated()) {
- return Status::Invalid(
- "Non-repeated nodes in a LIST-annotated group are not supported.");
- }
-
- int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
- if (list_node.is_group()) {
- // Resolve 3-level encoding
- //
- // required/optional group name=whatever {
- // repeated group name=list {
- // required/optional TYPE item;
- // }
- // }
- //
- // yields list<item: TYPE ?nullable> ?nullable
- //
- // We distinguish the special case that we have
- //
- // required/optional group name=whatever {
- // repeated group name=array or $SOMETHING_tuple {
- // required/optional TYPE item;
- // }
- // }
- //
- // In this latter case, the inner type of the list should be a struct
- // rather than a primitive value
- //
- // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
- const auto& list_group = static_cast<const GroupNode&>(list_node);
- // Special case mentioned in the format spec:
- // If the name is array or ends in _tuple, this should be a list of struct
- // even for single child elements.
- if (list_group.field_count() == 1 && !HasStructListName(list_group)) {
- // List of primitive type
- RETURN_NOT_OK(
- NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, child_field));
- } else {
- RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field));
- }
- } else {
- // Two-level list encoding
- //
- // required/optional group LIST {
- // repeated TYPE;
- // }
- const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
- int column_index = ctx->schema->GetColumnIndex(primitive_node);
- ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
- GetTypeForNode(column_index, primitive_node, ctx));
- auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false,
- FieldIdMetadata(list_node.field_id()));
- RETURN_NOT_OK(
- PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field));
- }
- out->field = ::arrow::field(group.name(), ::arrow::list(child_field->field),
- group.is_optional(), FieldIdMetadata(group.field_id()));
- out->level_info = current_levels;
-  // At this point current_levels contains the def level for this list,
- // we need to reset to the prior parent.
- out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
- return Status::OK();
-}
-
-Status GroupToSchemaField(const GroupNode& node, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out) {
- if (node.logical_type()->is_list()) {
- return ListToSchemaField(node, current_levels, ctx, parent, out);
- } else if (node.logical_type()->is_map()) {
- return MapToSchemaField(node, current_levels, ctx, parent, out);
- }
- std::shared_ptr<ArrowType> type;
- if (node.is_repeated()) {
- // Simple repeated struct
- //
- // repeated group $NAME {
- // r/o TYPE[0] f0
- // r/o TYPE[1] f1
- // }
- out->children.resize(1);
-
- int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
- RETURN_NOT_OK(GroupToStruct(node, current_levels, ctx, out, &out->children[0]));
- out->field = ::arrow::field(node.name(), ::arrow::list(out->children[0].field),
- /*nullable=*/false, FieldIdMetadata(node.field_id()));
-
- ctx->LinkParent(&out->children[0], out);
- out->level_info = current_levels;
-    // At this point current_levels contains this list as the def level, so we
-    // need to use the previous ancestor of this list.
- out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
- return Status::OK();
- } else {
- current_levels.Increment(node);
- return GroupToStruct(node, current_levels, ctx, parent, out);
- }
-}
-
-Status NodeToSchemaField(const Node& node, LevelInfo current_levels,
- SchemaTreeContext* ctx, const SchemaField* parent,
- SchemaField* out) {
- // Workhorse function for converting a Parquet schema node to an Arrow
- // type. Handles different conventions for nested data.
-
- ctx->LinkParent(out, parent);
-
-  // Now, walk the schema and create a SchemaField for each leaf node
- if (node.is_group()) {
- // A nested field, but we don't know what kind yet
- return GroupToSchemaField(static_cast<const GroupNode&>(node), current_levels, ctx,
- parent, out);
- } else {
- // Either a normal flat primitive type, or a list type encoded with 1-level
- // list encoding. Note that the 3-level encoding is the form recommended by
-    // the Parquet specification, but technically we can have either
- //
- // required/optional $TYPE $FIELD_NAME
- //
- // or
- //
- // repeated $TYPE $FIELD_NAME
- const auto& primitive_node = static_cast<const PrimitiveNode&>(node);
- int column_index = ctx->schema->GetColumnIndex(primitive_node);
- ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
- GetTypeForNode(column_index, primitive_node, ctx));
- if (node.is_repeated()) {
- // One-level list encoding, e.g.
- // a: repeated int32;
- int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
- out->children.resize(1);
- auto child_field = ::arrow::field(node.name(), type, /*nullable=*/false);
- RETURN_NOT_OK(PopulateLeaf(column_index, child_field, current_levels, ctx, out,
- &out->children[0]));
-
- out->field = ::arrow::field(node.name(), ::arrow::list(child_field),
- /*nullable=*/false, FieldIdMetadata(node.field_id()));
- out->level_info = current_levels;
-      // At this point current_levels has considered this list the ancestor, so
-      // restore the actual ancestor.
- out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
- return Status::OK();
- } else {
- current_levels.Increment(node);
- // A normal (required/optional) primitive node
- return PopulateLeaf(column_index,
- ::arrow::field(node.name(), type, node.is_optional(),
- FieldIdMetadata(node.field_id())),
- current_levels, ctx, parent, out);
- }
- }
-}
-
-// Get the original Arrow schema, as serialized in the Parquet metadata
-Status GetOriginSchema(const std::shared_ptr<const KeyValueMetadata>& metadata,
- std::shared_ptr<const KeyValueMetadata>* clean_metadata,
- std::shared_ptr<::arrow::Schema>* out) {
- if (metadata == nullptr) {
- *out = nullptr;
- *clean_metadata = nullptr;
- return Status::OK();
- }
-
- static const std::string kArrowSchemaKey = "ARROW:schema";
- int schema_index = metadata->FindKey(kArrowSchemaKey);
- if (schema_index == -1) {
- *out = nullptr;
- *clean_metadata = metadata;
- return Status::OK();
- }
-
- // The original Arrow schema was serialized using the store_schema option.
- // We deserialize it here and use it to inform read options such as
- // dictionary-encoded fields.
- auto decoded = ::arrow::util::base64_decode(metadata->value(schema_index));
- auto schema_buf = std::make_shared<Buffer>(decoded);
-
- ::arrow::ipc::DictionaryMemo dict_memo;
- ::arrow::io::BufferReader input(schema_buf);
-
- ARROW_ASSIGN_OR_RAISE(*out, ::arrow::ipc::ReadSchema(&input, &dict_memo));
-
- if (metadata->size() > 1) {
- // Copy the metadata without the schema key
- auto new_metadata = ::arrow::key_value_metadata({}, {});
- new_metadata->reserve(metadata->size() - 1);
- for (int64_t i = 0; i < metadata->size(); ++i) {
- if (i == schema_index) continue;
- new_metadata->Append(metadata->key(i), metadata->value(i));
- }
- *clean_metadata = new_metadata;
- } else {
- // No other keys, let metadata be null
- *clean_metadata = nullptr;
- }
- return Status::OK();
-}
-
-// Restore original Arrow field information that was serialized as Parquet metadata
-// but that is not necessarily present in the field reconstituted from Parquet
-// data (for example, Parquet timestamp types don't carry timezone information).
-
-Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred);
-
-std::function<std::shared_ptr<::arrow::DataType>(FieldVector)> GetNestedFactory(
- const ArrowType& origin_type, const ArrowType& inferred_type) {
- switch (inferred_type.id()) {
- case ::arrow::Type::STRUCT:
- if (origin_type.id() == ::arrow::Type::STRUCT) {
- return ::arrow::struct_;
- }
- break;
- case ::arrow::Type::LIST:
- if (origin_type.id() == ::arrow::Type::LIST) {
- return [](FieldVector fields) {
- DCHECK_EQ(fields.size(), 1);
- return ::arrow::list(std::move(fields[0]));
- };
- }
- if (origin_type.id() == ::arrow::Type::LARGE_LIST) {
- return [](FieldVector fields) {
- DCHECK_EQ(fields.size(), 1);
- return ::arrow::large_list(std::move(fields[0]));
- };
- }
- if (origin_type.id() == ::arrow::Type::FIXED_SIZE_LIST) {
- const auto list_size =
- checked_cast<const ::arrow::FixedSizeListType&>(origin_type).list_size();
- return [list_size](FieldVector fields) {
- DCHECK_EQ(fields.size(), 1);
- return ::arrow::fixed_size_list(std::move(fields[0]), list_size);
- };
- }
- break;
- default:
- break;
- }
- return {};
-}
-
-Result<bool> ApplyOriginalStorageMetadata(const Field& origin_field,
- SchemaField* inferred) {
- bool modified = false;
-
- auto origin_type = origin_field.type();
- auto inferred_type = inferred->field->type();
-
- const int num_children = inferred_type->num_fields();
-
- if (num_children > 0 && origin_type->num_fields() == num_children) {
- DCHECK_EQ(static_cast<int>(inferred->children.size()), num_children);
- const auto factory = GetNestedFactory(*origin_type, *inferred_type);
- if (factory) {
- // The type may be modified (e.g. LargeList) while the children stay the same
- modified |= origin_type->id() != inferred_type->id();
-
- // Apply original metadata recursively to children
- for (int i = 0; i < inferred_type->num_fields(); ++i) {
- ARROW_ASSIGN_OR_RAISE(
- const bool child_modified,
- ApplyOriginalMetadata(*origin_type->field(i), &inferred->children[i]));
- modified |= child_modified;
- }
- if (modified) {
- // Recreate this field using the modified child fields
- ::arrow::FieldVector modified_children(inferred_type->num_fields());
- for (int i = 0; i < inferred_type->num_fields(); ++i) {
- modified_children[i] = inferred->children[i].field;
- }
- inferred->field =
- inferred->field->WithType(factory(std::move(modified_children)));
- }
- }
- }
-
- if (origin_type->id() == ::arrow::Type::TIMESTAMP &&
- inferred_type->id() == ::arrow::Type::TIMESTAMP) {
- // Restore time zone, if any
- const auto& ts_type = checked_cast<const ::arrow::TimestampType&>(*inferred_type);
- const auto& ts_origin_type =
- checked_cast<const ::arrow::TimestampType&>(*origin_type);
-
- // If the data is tz-aware, then set the original time zone, since Parquet
- // has no native storage for timezones
- if (ts_type.timezone() == "UTC" && ts_origin_type.timezone() != "") {
- if (ts_type.unit() == ts_origin_type.unit()) {
- inferred->field = inferred->field->WithType(origin_type);
- } else {
- auto ts_type_new = ::arrow::timestamp(ts_type.unit(), ts_origin_type.timezone());
- inferred->field = inferred->field->WithType(ts_type_new);
- }
- }
- modified = true;
- }
-
- if (origin_type->id() == ::arrow::Type::DICTIONARY &&
- inferred_type->id() != ::arrow::Type::DICTIONARY &&
- IsDictionaryReadSupported(*inferred_type)) {
-    // Direct dictionary reads are only supported for a couple of primitive types,
- // so no need to recurse on value types.
- const auto& dict_origin_type =
- checked_cast<const ::arrow::DictionaryType&>(*origin_type);
- inferred->field = inferred->field->WithType(
- ::arrow::dictionary(::arrow::int32(), inferred_type, dict_origin_type.ordered()));
- modified = true;
- }
-
- if ((origin_type->id() == ::arrow::Type::LARGE_BINARY &&
- inferred_type->id() == ::arrow::Type::BINARY) ||
- (origin_type->id() == ::arrow::Type::LARGE_STRING &&
- inferred_type->id() == ::arrow::Type::STRING)) {
- // Read back binary-like arrays with the intended offset width.
- inferred->field = inferred->field->WithType(origin_type);
- modified = true;
- }
-
- if (origin_type->id() == ::arrow::Type::DECIMAL256 &&
- inferred_type->id() == ::arrow::Type::DECIMAL128) {
- inferred->field = inferred->field->WithType(origin_type);
- modified = true;
- }
-
- // Restore field metadata
- std::shared_ptr<const KeyValueMetadata> field_metadata = origin_field.metadata();
- if (field_metadata != nullptr) {
- if (inferred->field->metadata()) {
- // Prefer the metadata keys (like field_id) from the current metadata
- field_metadata = field_metadata->Merge(*inferred->field->metadata());
- }
- inferred->field = inferred->field->WithMetadata(field_metadata);
- modified = true;
- }
-
- return modified;
-}
-
-Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred) {
- bool modified = false;
-
- auto origin_type = origin_field.type();
- auto inferred_type = inferred->field->type();
-
- if (origin_type->id() == ::arrow::Type::EXTENSION) {
- const auto& ex_type = checked_cast<const ::arrow::ExtensionType&>(*origin_type);
- auto origin_storage_field = origin_field.WithType(ex_type.storage_type());
-
- // Apply metadata recursively to storage type
- RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred));
-
- // Restore extension type, if the storage type is the same as inferred
- // from the Parquet type
- if (ex_type.storage_type()->Equals(*inferred->field->type())) {
- inferred->field = inferred->field->WithType(origin_type);
- }
- modified = true;
- } else {
- ARROW_ASSIGN_OR_RAISE(modified, ApplyOriginalStorageMetadata(origin_field, inferred));
- }
-
- return modified;
-}
-
-} // namespace
-
-Status FieldToNode(const std::shared_ptr<Field>& field,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties, NodePtr* out) {
- return FieldToNode(field->name(), field, properties, arrow_properties, out);
-}
-
-Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties,
- std::shared_ptr<SchemaDescriptor>* out) {
- std::vector<NodePtr> nodes(arrow_schema->num_fields());
- for (int i = 0; i < arrow_schema->num_fields(); i++) {
- RETURN_NOT_OK(
- FieldToNode(arrow_schema->field(i), properties, arrow_properties, &nodes[i]));
- }
-
- NodePtr schema = GroupNode::Make("schema", Repetition::REQUIRED, nodes);
- *out = std::make_shared<::parquet::SchemaDescriptor>();
- PARQUET_CATCH_NOT_OK((*out)->Init(schema));
-
- return Status::OK();
-}
-
-Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
- const WriterProperties& properties,
- std::shared_ptr<SchemaDescriptor>* out) {
- return ToParquetSchema(arrow_schema, properties, *default_arrow_writer_properties(),
- out);
-}
-
-Status FromParquetSchema(
- const SchemaDescriptor* schema, const ArrowReaderProperties& properties,
- const std::shared_ptr<const KeyValueMetadata>& key_value_metadata,
- std::shared_ptr<::arrow::Schema>* out) {
- SchemaManifest manifest;
- RETURN_NOT_OK(SchemaManifest::Make(schema, key_value_metadata, properties, &manifest));
- std::vector<std::shared_ptr<Field>> fields(manifest.schema_fields.size());
-
- for (int i = 0; i < static_cast<int>(fields.size()); i++) {
- const auto& schema_field = manifest.schema_fields[i];
- fields[i] = schema_field.field;
- }
- if (manifest.origin_schema) {
- // ARROW-8980: If the ARROW:schema was in the input metadata, then
- // manifest.origin_schema will have it scrubbed out
- *out = ::arrow::schema(fields, manifest.origin_schema->metadata());
- } else {
- *out = ::arrow::schema(fields, key_value_metadata);
- }
- return Status::OK();
-}
-
-Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
- const ArrowReaderProperties& properties,
- std::shared_ptr<::arrow::Schema>* out) {
- return FromParquetSchema(parquet_schema, properties, nullptr, out);
-}
-
-Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
- std::shared_ptr<::arrow::Schema>* out) {
- ArrowReaderProperties properties;
- return FromParquetSchema(parquet_schema, properties, nullptr, out);
-}
-
-Status SchemaManifest::Make(const SchemaDescriptor* schema,
- const std::shared_ptr<const KeyValueMetadata>& metadata,
- const ArrowReaderProperties& properties,
- SchemaManifest* manifest) {
- SchemaTreeContext ctx;
- ctx.manifest = manifest;
- ctx.properties = properties;
- ctx.schema = schema;
- const GroupNode& schema_node = *schema->group_node();
- manifest->descr = schema;
- manifest->schema_fields.resize(schema_node.field_count());
-
- // Try to deserialize original Arrow schema
- RETURN_NOT_OK(
- GetOriginSchema(metadata, &manifest->schema_metadata, &manifest->origin_schema));
- // Ignore original schema if it's not compatible with the Parquet schema
- if (manifest->origin_schema != nullptr &&
- manifest->origin_schema->num_fields() != schema_node.field_count()) {
- manifest->origin_schema = nullptr;
- }
-
- for (int i = 0; i < static_cast<int>(schema_node.field_count()); ++i) {
- SchemaField* out_field = &manifest->schema_fields[i];
- RETURN_NOT_OK(NodeToSchemaField(*schema_node.field(i), LevelInfo(), &ctx,
- /*parent=*/nullptr, out_field));
-
- // TODO(wesm): as follow up to ARROW-3246, we should really pass the origin
- // schema (if any) through all functions in the schema reconstruction, but
- // I'm being lazy and just setting dictionary fields at the top level for
- // now
- if (manifest->origin_schema == nullptr) {
- continue;
- }
-
- auto origin_field = manifest->origin_schema->field(i);
- RETURN_NOT_OK(ApplyOriginalMetadata(*origin_field, out_field));
- }
- return Status::OK();
-}
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/schema.h"
+
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "arrow/extension_type.h"
+#include "arrow/io/memory.h"
+#include "arrow/ipc/api.h"
+#include "arrow/result_internal.h"
+#include "arrow/type.h"
+#include "arrow/util/base64.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/value_parsing.h"
+
+#include "parquet/arrow/schema_internal.h"
+#include "parquet/exception.h"
+#include "parquet/properties.h"
+#include "parquet/types.h"
+
+using arrow::DecimalType;
+using arrow::Field;
+using arrow::FieldVector;
+using arrow::KeyValueMetadata;
+using arrow::Status;
+using arrow::internal::checked_cast;
+
+using ArrowType = arrow::DataType;
+using ArrowTypeId = arrow::Type;
+
+using parquet::Repetition;
+using parquet::schema::GroupNode;
+using parquet::schema::Node;
+using parquet::schema::NodePtr;
+using parquet::schema::PrimitiveNode;
+
+using ParquetType = parquet::Type;
+using parquet::ConvertedType;
+using parquet::LogicalType;
+
+using parquet::internal::LevelInfo;
+
+namespace parquet {
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Parquet to Arrow schema conversion
+
+namespace {
+
+Repetition::type RepetitionFromNullable(bool is_nullable) {
+ return is_nullable ? Repetition::OPTIONAL : Repetition::REQUIRED;
+}
+
+Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out);
+
+Status ListToNode(const std::shared_ptr<::arrow::BaseListType>& type,
+ const std::string& name, bool nullable,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ NodePtr element;
+ std::string value_name =
+ arrow_properties.compliant_nested_types() ? "element" : type->value_field()->name();
+ RETURN_NOT_OK(FieldToNode(value_name, type->value_field(), properties, arrow_properties,
+ &element));
+
+ NodePtr list = GroupNode::Make("list", Repetition::REPEATED, {element});
+ *out = GroupNode::Make(name, RepetitionFromNullable(nullable), {list},
+ LogicalType::List());
+ return Status::OK();
+}
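+
+// For example, with compliant_nested_types() enabled, an Arrow field
+// `f: list<int32>` (nullable, nullable values) maps to the 3-level form
+// (illustrative sketch):
+//
+//   optional group f (LIST) {
+//     repeated group list {
+//       optional int32 element;
+//     }
+//   }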
+
+Status MapToNode(const std::shared_ptr<::arrow::MapType>& type, const std::string& name,
+ bool nullable, const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ // TODO: Should we offer a non-compliant mode that forwards the type names?
+ NodePtr key_node;
+ RETURN_NOT_OK(
+ FieldToNode("key", type->key_field(), properties, arrow_properties, &key_node));
+
+ NodePtr value_node;
+ RETURN_NOT_OK(FieldToNode("value", type->item_field(), properties, arrow_properties,
+ &value_node));
+
+ NodePtr key_value =
+ GroupNode::Make("key_value", Repetition::REPEATED, {key_node, value_node});
+ *out = GroupNode::Make(name, RepetitionFromNullable(nullable), {key_value},
+ LogicalType::Map());
+ return Status::OK();
+}
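+
+// For example, an Arrow field `m: map<string, int32>` (nullable) maps to
+// (illustrative sketch):
+//
+//   optional group m (MAP) {
+//     repeated group key_value {
+//       required binary key (STRING);
+//       optional int32 value;
+//     }
+//   }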
+
+Status StructToNode(const std::shared_ptr<::arrow::StructType>& type,
+ const std::string& name, bool nullable,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ std::vector<NodePtr> children(type->num_fields());
+ if (type->num_fields() != 0) {
+ for (int i = 0; i < type->num_fields(); i++) {
+ RETURN_NOT_OK(FieldToNode(type->field(i)->name(), type->field(i), properties,
+ arrow_properties, &children[i]));
+ }
+ } else {
+ // XXX (ARROW-10928) We could add a dummy primitive node but that would
+ // require special handling when writing and reading, to avoid column index
+ // mismatches.
+ return Status::NotImplemented("Cannot write struct type '", name,
+ "' with no child field to Parquet. "
+ "Consider adding a dummy child field.");
+ }
+
+ *out = GroupNode::Make(name, RepetitionFromNullable(nullable), std::move(children));
+ return Status::OK();
+}
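+
+// For example, a non-nullable Arrow field `s: struct<a: int32, b: string>`
+// maps to (illustrative sketch):
+//
+//   required group s {
+//     optional int32 a;
+//     optional binary b (STRING);
+//   }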
+
+static std::shared_ptr<const LogicalType> TimestampLogicalTypeFromArrowTimestamp(
+ const ::arrow::TimestampType& timestamp_type, ::arrow::TimeUnit::type time_unit) {
+ const bool utc = !(timestamp_type.timezone().empty());
+ // ARROW-5878(wesm): for forward compatibility reasons, and because
+ // there's no other way to signal to old readers that values are
+ // timestamps, we force the ConvertedType field to be set to the
+ // corresponding TIMESTAMP_* value. This does cause some ambiguity
+ // as Parquet readers have not been consistent about the
+ // interpretation of TIMESTAMP_* values as being UTC-normalized.
+ switch (time_unit) {
+ case ::arrow::TimeUnit::MILLI:
+ return LogicalType::Timestamp(utc, LogicalType::TimeUnit::MILLIS,
+ /*is_from_converted_type=*/false,
+ /*force_set_converted_type=*/true);
+ case ::arrow::TimeUnit::MICRO:
+ return LogicalType::Timestamp(utc, LogicalType::TimeUnit::MICROS,
+ /*is_from_converted_type=*/false,
+ /*force_set_converted_type=*/true);
+ case ::arrow::TimeUnit::NANO:
+ return LogicalType::Timestamp(utc, LogicalType::TimeUnit::NANOS);
+ case ::arrow::TimeUnit::SECOND:
+ // No equivalent parquet logical type.
+ break;
+ }
+ return LogicalType::None();
+}
+
+static Status GetTimestampMetadata(const ::arrow::TimestampType& type,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties,
+ ParquetType::type* physical_type,
+ std::shared_ptr<const LogicalType>* logical_type) {
+ const bool coerce = arrow_properties.coerce_timestamps_enabled();
+ const auto target_unit =
+ coerce ? arrow_properties.coerce_timestamps_unit() : type.unit();
+
+  // The user is explicitly asking for Impala int96 encoding; there is no
+  // corresponding logical type.
+ if (arrow_properties.support_deprecated_int96_timestamps()) {
+ *physical_type = ParquetType::INT96;
+ return Status::OK();
+ }
+
+ *physical_type = ParquetType::INT64;
+ *logical_type = TimestampLogicalTypeFromArrowTimestamp(type, target_unit);
+
+ // The user is explicitly asking for timestamp data to be converted to the
+ // specified units (target_unit).
+ if (coerce) {
+ if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) {
+ switch (target_unit) {
+ case ::arrow::TimeUnit::MILLI:
+ case ::arrow::TimeUnit::MICRO:
+ break;
+ case ::arrow::TimeUnit::NANO:
+ case ::arrow::TimeUnit::SECOND:
+ return Status::NotImplemented(
+ "For Parquet version 1.0 files, can only coerce Arrow timestamps to "
+ "milliseconds or microseconds");
+ }
+ } else {
+ switch (target_unit) {
+ case ::arrow::TimeUnit::MILLI:
+ case ::arrow::TimeUnit::MICRO:
+ case ::arrow::TimeUnit::NANO:
+ break;
+ case ::arrow::TimeUnit::SECOND:
+ return Status::NotImplemented(
+ "For Parquet files, can only coerce Arrow timestamps to milliseconds, "
+ "microseconds, or nanoseconds");
+ }
+ }
+ return Status::OK();
+ }
+
+ // The user implicitly wants timestamp data to retain its original time units,
+ // however the ConvertedType field used to indicate logical types for Parquet
+ // version 1.0 fields does not allow for nanosecond time units and so nanoseconds
+ // must be coerced to microseconds.
+ if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0 &&
+ type.unit() == ::arrow::TimeUnit::NANO) {
+ *logical_type =
+ TimestampLogicalTypeFromArrowTimestamp(type, ::arrow::TimeUnit::MICRO);
+ return Status::OK();
+ }
+
+ // The user implicitly wants timestamp data to retain its original time units,
+ // however the Arrow seconds time unit can not be represented (annotated) in
+ // any version of Parquet and so must be coerced to milliseconds.
+ if (type.unit() == ::arrow::TimeUnit::SECOND) {
+ *logical_type =
+ TimestampLogicalTypeFromArrowTimestamp(type, ::arrow::TimeUnit::MILLI);
+ return Status::OK();
+ }
+
+ return Status::OK();
+}
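+
+// For example (no coercion requested): a timestamp(SECOND) column is annotated
+// as Timestamp(MILLIS) because Parquet has no seconds unit, and under
+// PARQUET_1_0 a timestamp(NANO) column is annotated as Timestamp(MICROS).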
+
+static constexpr char FIELD_ID_KEY[] = "PARQUET:field_id";
+
+std::shared_ptr<::arrow::KeyValueMetadata> FieldIdMetadata(int field_id) {
+ if (field_id >= 0) {
+ return ::arrow::key_value_metadata({FIELD_ID_KEY}, {std::to_string(field_id)});
+ } else {
+ return nullptr;
+ }
+}
+
+int FieldIdFromMetadata(
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+ if (!metadata) {
+ return -1;
+ }
+ int key = metadata->FindKey(FIELD_ID_KEY);
+ if (key < 0) {
+ return -1;
+ }
+ std::string field_id_str = metadata->value(key);
+ int field_id;
+ if (::arrow::internal::ParseValue<::arrow::Int32Type>(
+ field_id_str.c_str(), field_id_str.length(), &field_id)) {
+ if (field_id < 0) {
+      // Thrift should convert any negative value to null, but normalize to -1
+      // here in case later logic checks this value.
+ return -1;
+ }
+ return field_id;
+ } else {
+ return -1;
+ }
+}
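+
+// For example, a field whose metadata contains {"PARQUET:field_id": "42"}
+// round-trips as Parquet field_id 42; a missing key, an unparsable value, or
+// a negative value all normalize to -1 (meaning "no field id").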
+
+Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ std::shared_ptr<const LogicalType> logical_type = LogicalType::None();
+ ParquetType::type type;
+ Repetition::type repetition = RepetitionFromNullable(field->nullable());
+
+ int length = -1;
+ int precision = -1;
+ int scale = -1;
+
+ switch (field->type()->id()) {
+ case ArrowTypeId::NA: {
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Null();
+ if (repetition != Repetition::OPTIONAL) {
+ return Status::Invalid("NullType Arrow field must be nullable");
+ }
+ } break;
+ case ArrowTypeId::BOOL:
+ type = ParquetType::BOOLEAN;
+ break;
+ case ArrowTypeId::UINT8:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(8, false);
+ break;
+ case ArrowTypeId::INT8:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(8, true);
+ break;
+ case ArrowTypeId::UINT16:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(16, false);
+ break;
+ case ArrowTypeId::INT16:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(16, true);
+ break;
+ case ArrowTypeId::UINT32:
+ if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) {
+ type = ParquetType::INT64;
+ } else {
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Int(32, false);
+ }
+ break;
+ case ArrowTypeId::INT32:
+ type = ParquetType::INT32;
+ break;
+ case ArrowTypeId::UINT64:
+ type = ParquetType::INT64;
+ logical_type = LogicalType::Int(64, false);
+ break;
+ case ArrowTypeId::INT64:
+ type = ParquetType::INT64;
+ break;
+ case ArrowTypeId::FLOAT:
+ type = ParquetType::FLOAT;
+ break;
+ case ArrowTypeId::DOUBLE:
+ type = ParquetType::DOUBLE;
+ break;
+ case ArrowTypeId::LARGE_STRING:
+ case ArrowTypeId::STRING:
+ type = ParquetType::BYTE_ARRAY;
+ logical_type = LogicalType::String();
+ break;
+ case ArrowTypeId::LARGE_BINARY:
+ case ArrowTypeId::BINARY:
+ type = ParquetType::BYTE_ARRAY;
+ break;
+ case ArrowTypeId::FIXED_SIZE_BINARY: {
+ type = ParquetType::FIXED_LEN_BYTE_ARRAY;
+ const auto& fixed_size_binary_type =
+ static_cast<const ::arrow::FixedSizeBinaryType&>(*field->type());
+ length = fixed_size_binary_type.byte_width();
+ } break;
+ case ArrowTypeId::DECIMAL128:
+ case ArrowTypeId::DECIMAL256: {
+ type = ParquetType::FIXED_LEN_BYTE_ARRAY;
+ const auto& decimal_type = static_cast<const ::arrow::DecimalType&>(*field->type());
+ precision = decimal_type.precision();
+ scale = decimal_type.scale();
+ length = DecimalType::DecimalSize(precision);
+ PARQUET_CATCH_NOT_OK(logical_type = LogicalType::Decimal(precision, scale));
+ } break;
+ case ArrowTypeId::DATE32:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Date();
+ break;
+ case ArrowTypeId::DATE64:
+ type = ParquetType::INT32;
+ logical_type = LogicalType::Date();
+ break;
+ case ArrowTypeId::TIMESTAMP:
+ RETURN_NOT_OK(
+ GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()),
+ properties, arrow_properties, &type, &logical_type));
+ break;
+ case ArrowTypeId::TIME32:
+ type = ParquetType::INT32;
+ logical_type =
+ LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::MILLIS);
+ break;
+ case ArrowTypeId::TIME64: {
+ type = ParquetType::INT64;
+ auto time_type = static_cast<::arrow::Time64Type*>(field->type().get());
+ if (time_type->unit() == ::arrow::TimeUnit::NANO) {
+ logical_type =
+ LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::NANOS);
+ } else {
+ logical_type =
+ LogicalType::Time(/*is_adjusted_to_utc=*/true, LogicalType::TimeUnit::MICROS);
+ }
+ } break;
+ case ArrowTypeId::STRUCT: {
+ auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type());
+ return StructToNode(struct_type, name, field->nullable(), properties,
+ arrow_properties, out);
+ }
+ case ArrowTypeId::FIXED_SIZE_LIST:
+ case ArrowTypeId::LARGE_LIST:
+ case ArrowTypeId::LIST: {
+ auto list_type = std::static_pointer_cast<::arrow::BaseListType>(field->type());
+ return ListToNode(list_type, name, field->nullable(), properties, arrow_properties,
+ out);
+ }
+ case ArrowTypeId::DICTIONARY: {
+      // Parquet has no Dictionary type; dictionary encoding is handled at the
+      // encoding level, not the schema level.
+ const ::arrow::DictionaryType& dict_type =
+ static_cast<const ::arrow::DictionaryType&>(*field->type());
+ std::shared_ptr<::arrow::Field> unpacked_field = ::arrow::field(
+ name, dict_type.value_type(), field->nullable(), field->metadata());
+ return FieldToNode(name, unpacked_field, properties, arrow_properties, out);
+ }
+ case ArrowTypeId::EXTENSION: {
+ auto ext_type = std::static_pointer_cast<::arrow::ExtensionType>(field->type());
+ std::shared_ptr<::arrow::Field> storage_field = ::arrow::field(
+ name, ext_type->storage_type(), field->nullable(), field->metadata());
+ return FieldToNode(name, storage_field, properties, arrow_properties, out);
+ }
+ case ArrowTypeId::MAP: {
+ auto map_type = std::static_pointer_cast<::arrow::MapType>(field->type());
+ return MapToNode(map_type, name, field->nullable(), properties, arrow_properties,
+ out);
+ }
+
+ default: {
+      // TODO: DENSE_UNION, SPARSE_UNION, JSON_SCALAR, DECIMAL_TEXT, VARCHAR
+ return Status::NotImplemented(
+ "Unhandled type for Arrow to Parquet schema conversion: ",
+ field->type()->ToString());
+ }
+ }
+
+ int field_id = FieldIdFromMetadata(field->metadata());
+ PARQUET_CATCH_NOT_OK(*out = PrimitiveNode::Make(name, repetition, logical_type, type,
+ length, field_id));
+
+ return Status::OK();
+}
+
+struct SchemaTreeContext {
+ SchemaManifest* manifest;
+ ArrowReaderProperties properties;
+ const SchemaDescriptor* schema;
+
+ void LinkParent(const SchemaField* child, const SchemaField* parent) {
+ manifest->child_to_parent[child] = parent;
+ }
+
+ void RecordLeaf(const SchemaField* leaf) {
+ manifest->column_index_to_field[leaf->column_index] = leaf;
+ }
+};
+
+bool IsDictionaryReadSupported(const ArrowType& type) {
+ // Only supported currently for BYTE_ARRAY types
+ return type.id() == ::arrow::Type::BINARY || type.id() == ::arrow::Type::STRING;
+}
+
+// ----------------------------------------------------------------------
+// Schema logic
+
+::arrow::Result<std::shared_ptr<ArrowType>> GetTypeForNode(
+ int column_index, const schema::PrimitiveNode& primitive_node,
+ SchemaTreeContext* ctx) {
+ ASSIGN_OR_RAISE(
+ std::shared_ptr<ArrowType> storage_type,
+ GetArrowType(primitive_node, ctx->properties.coerce_int96_timestamp_unit()));
+ if (ctx->properties.read_dictionary(column_index) &&
+ IsDictionaryReadSupported(*storage_type)) {
+ return ::arrow::dictionary(::arrow::int32(), storage_type);
+ }
+ return storage_type;
+}
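+
+// For example, if ArrowReaderProperties::set_read_dictionary(i, true) was
+// called for a BYTE_ARRAY (STRING) leaf at column i, the node maps to
+// dictionary<values=utf8, indices=int32> instead of plain utf8.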
+
+Status NodeToSchemaField(const Node& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out);
+
+Status GroupToSchemaField(const GroupNode& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out);
+
+Status PopulateLeaf(int column_index, const std::shared_ptr<Field>& field,
+ LevelInfo current_levels, SchemaTreeContext* ctx,
+ const SchemaField* parent, SchemaField* out) {
+ out->field = field;
+ out->column_index = column_index;
+ out->level_info = current_levels;
+ ctx->RecordLeaf(out);
+ ctx->LinkParent(out, parent);
+ return Status::OK();
+}
+
+// Special case mentioned in the format spec:
+// If the name is array or ends in _tuple, this should be a list of struct
+// even for single child elements.
+bool HasStructListName(const GroupNode& node) {
+ ::arrow::util::string_view name{node.name()};
+ return name == "array" || name.ends_with("_tuple");
+}
+
+Status GroupToStruct(const GroupNode& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ std::vector<std::shared_ptr<Field>> arrow_fields;
+ out->children.resize(node.field_count());
+ // All level increments for the node are expected to happen by callers.
+  // This is required because repeated elements need to have their own
+ // SchemaField.
+
+ for (int i = 0; i < node.field_count(); i++) {
+ RETURN_NOT_OK(
+ NodeToSchemaField(*node.field(i), current_levels, ctx, out, &out->children[i]));
+ arrow_fields.push_back(out->children[i].field);
+ }
+ auto struct_type = ::arrow::struct_(arrow_fields);
+ out->field = ::arrow::field(node.name(), struct_type, node.is_optional(),
+ FieldIdMetadata(node.field_id()));
+ out->level_info = current_levels;
+ return Status::OK();
+}
+
+Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out);
+
+Status MapToSchemaField(const GroupNode& group, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ if (group.field_count() != 1) {
+ return Status::Invalid("MAP-annotated groups must have a single child.");
+ }
+ if (group.is_repeated()) {
+ return Status::Invalid("MAP-annotated groups must not be repeated.");
+ }
+
+ const Node& key_value_node = *group.field(0);
+
+ if (!key_value_node.is_repeated()) {
+ return Status::Invalid(
+ "Non-repeated key value in a MAP-annotated group are not supported.");
+ }
+
+ if (!key_value_node.is_group()) {
+ return Status::Invalid("Key-value node must be a group.");
+ }
+
+ const GroupNode& key_value = checked_cast<const GroupNode&>(key_value_node);
+ if (key_value.field_count() != 1 && key_value.field_count() != 2) {
+ return Status::Invalid("Key-value map node must have 1 or 2 child elements. Found: ",
+ key_value.field_count());
+ }
+ const Node& key_node = *key_value.field(0);
+ if (!key_node.is_required()) {
+ return Status::Invalid("Map keys must be annotated as required.");
+ }
+  // Arrow doesn't support one-column maps (i.e. sets). The options are either
+  // to make the values column nullable or to process the map as a list. We
+  // choose the latter as it is simpler.
+ if (key_value.field_count() == 1) {
+ return ListToSchemaField(group, current_levels, ctx, parent, out);
+ }
+
+ current_levels.Increment(group);
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+
+ out->children.resize(1);
+ SchemaField* key_value_field = &out->children[0];
+
+ key_value_field->children.resize(2);
+ SchemaField* key_field = &key_value_field->children[0];
+ SchemaField* value_field = &key_value_field->children[1];
+
+ ctx->LinkParent(out, parent);
+ ctx->LinkParent(key_value_field, out);
+ ctx->LinkParent(key_field, key_value_field);
+ ctx->LinkParent(value_field, key_value_field);
+
+ // required/optional group name=whatever {
+  //   repeated group name=key_values {
+ // required TYPE key;
+ // required/optional TYPE value;
+ // }
+ // }
+ //
+
+ RETURN_NOT_OK(NodeToSchemaField(*key_value.field(0), current_levels, ctx,
+ key_value_field, key_field));
+ RETURN_NOT_OK(NodeToSchemaField(*key_value.field(1), current_levels, ctx,
+ key_value_field, value_field));
+
+ key_value_field->field = ::arrow::field(
+ group.name(), ::arrow::struct_({key_field->field, value_field->field}),
+ /*nullable=*/false, FieldIdMetadata(key_value.field_id()));
+ key_value_field->level_info = current_levels;
+
+ out->field = ::arrow::field(group.name(),
+ ::arrow::map(key_field->field->type(), value_field->field),
+ group.is_optional(), FieldIdMetadata(group.field_id()));
+ out->level_info = current_levels;
+  // At this point current_levels contains the def level for this list,
+ // we need to reset to the prior parent.
+ out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
+ return Status::OK();
+}
+
+Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ if (group.field_count() != 1) {
+ return Status::Invalid("LIST-annotated groups must have a single child.");
+ }
+ if (group.is_repeated()) {
+ return Status::Invalid("LIST-annotated groups must not be repeated.");
+ }
+ current_levels.Increment(group);
+
+ out->children.resize(group.field_count());
+ SchemaField* child_field = &out->children[0];
+
+ ctx->LinkParent(out, parent);
+ ctx->LinkParent(child_field, out);
+
+ const Node& list_node = *group.field(0);
+
+ if (!list_node.is_repeated()) {
+ return Status::Invalid(
+ "Non-repeated nodes in a LIST-annotated group are not supported.");
+ }
+
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+ if (list_node.is_group()) {
+ // Resolve 3-level encoding
+ //
+ // required/optional group name=whatever {
+ // repeated group name=list {
+ // required/optional TYPE item;
+ // }
+ // }
+ //
+ // yields list<item: TYPE ?nullable> ?nullable
+ //
+ // We distinguish the special case that we have
+ //
+ // required/optional group name=whatever {
+ // repeated group name=array or $SOMETHING_tuple {
+ // required/optional TYPE item;
+ // }
+ // }
+ //
+ // In this latter case, the inner type of the list should be a struct
+ // rather than a primitive value
+ //
+ // yields list<item: struct<item: TYPE ?nullable> not null> ?nullable
+ const auto& list_group = static_cast<const GroupNode&>(list_node);
+ // Special case mentioned in the format spec:
+ // If the name is array or ends in _tuple, this should be a list of struct
+ // even for single child elements.
+ if (list_group.field_count() == 1 && !HasStructListName(list_group)) {
+ // List of primitive type
+ RETURN_NOT_OK(
+ NodeToSchemaField(*list_group.field(0), current_levels, ctx, out, child_field));
+ } else {
+ RETURN_NOT_OK(GroupToStruct(list_group, current_levels, ctx, out, child_field));
+ }
+ } else {
+ // Two-level list encoding
+ //
+ // required/optional group LIST {
+ // repeated TYPE;
+ // }
+ const auto& primitive_node = static_cast<const PrimitiveNode&>(list_node);
+ int column_index = ctx->schema->GetColumnIndex(primitive_node);
+ ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
+ GetTypeForNode(column_index, primitive_node, ctx));
+ auto item_field = ::arrow::field(list_node.name(), type, /*nullable=*/false,
+ FieldIdMetadata(list_node.field_id()));
+ RETURN_NOT_OK(
+ PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field));
+ }
+ out->field = ::arrow::field(group.name(), ::arrow::list(child_field->field),
+ group.is_optional(), FieldIdMetadata(group.field_id()));
+ out->level_info = current_levels;
+  // At this point current_levels contains the def level for this list,
+ // we need to reset to the prior parent.
+ out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
+ return Status::OK();
+}
+
+Status GroupToSchemaField(const GroupNode& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ if (node.logical_type()->is_list()) {
+ return ListToSchemaField(node, current_levels, ctx, parent, out);
+ } else if (node.logical_type()->is_map()) {
+ return MapToSchemaField(node, current_levels, ctx, parent, out);
+ }
+ std::shared_ptr<ArrowType> type;
+ if (node.is_repeated()) {
+ // Simple repeated struct
+ //
+ // repeated group $NAME {
+ // r/o TYPE[0] f0
+ // r/o TYPE[1] f1
+ // }
+ out->children.resize(1);
+
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+ RETURN_NOT_OK(GroupToStruct(node, current_levels, ctx, out, &out->children[0]));
+ out->field = ::arrow::field(node.name(), ::arrow::list(out->children[0].field),
+ /*nullable=*/false, FieldIdMetadata(node.field_id()));
+
+ ctx->LinkParent(&out->children[0], out);
+ out->level_info = current_levels;
+    // At this point current_levels contains this list as the def level, so we
+    // need to use the previous ancestor of this list.
+ out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
+ return Status::OK();
+ } else {
+ current_levels.Increment(node);
+ return GroupToStruct(node, current_levels, ctx, parent, out);
+ }
+}
+
+Status NodeToSchemaField(const Node& node, LevelInfo current_levels,
+ SchemaTreeContext* ctx, const SchemaField* parent,
+ SchemaField* out) {
+ // Workhorse function for converting a Parquet schema node to an Arrow
+ // type. Handles different conventions for nested data.
+
+ ctx->LinkParent(out, parent);
+
+  // Now, walk the schema and create a SchemaField for each leaf node
+ if (node.is_group()) {
+ // A nested field, but we don't know what kind yet
+ return GroupToSchemaField(static_cast<const GroupNode&>(node), current_levels, ctx,
+ parent, out);
+ } else {
+ // Either a normal flat primitive type, or a list type encoded with 1-level
+ // list encoding. Note that the 3-level encoding is the form recommended by
+    // the Parquet specification, but technically we can have either
+ //
+ // required/optional $TYPE $FIELD_NAME
+ //
+ // or
+ //
+ // repeated $TYPE $FIELD_NAME
+ const auto& primitive_node = static_cast<const PrimitiveNode&>(node);
+ int column_index = ctx->schema->GetColumnIndex(primitive_node);
+ ASSIGN_OR_RAISE(std::shared_ptr<ArrowType> type,
+ GetTypeForNode(column_index, primitive_node, ctx));
+ if (node.is_repeated()) {
+ // One-level list encoding, e.g.
+ // a: repeated int32;
+ int16_t repeated_ancestor_def_level = current_levels.IncrementRepeated();
+ out->children.resize(1);
+ auto child_field = ::arrow::field(node.name(), type, /*nullable=*/false);
+ RETURN_NOT_OK(PopulateLeaf(column_index, child_field, current_levels, ctx, out,
+ &out->children[0]));
+
+ out->field = ::arrow::field(node.name(), ::arrow::list(child_field),
+ /*nullable=*/false, FieldIdMetadata(node.field_id()));
+ out->level_info = current_levels;
+      // At this point current_levels has considered this list the ancestor, so
+      // restore the actual ancestor.
+ out->level_info.repeated_ancestor_def_level = repeated_ancestor_def_level;
+ return Status::OK();
+ } else {
+ current_levels.Increment(node);
+ // A normal (required/optional) primitive node
+ return PopulateLeaf(column_index,
+ ::arrow::field(node.name(), type, node.is_optional(),
+ FieldIdMetadata(node.field_id())),
+ current_levels, ctx, parent, out);
+ }
+ }
+}
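+
+// For example, the one-level field `repeated int32 a;` reads back as
+// `a: list<a: int32 not null> not null` (illustrative).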
+
+// Get the original Arrow schema, as serialized in the Parquet metadata
+Status GetOriginSchema(const std::shared_ptr<const KeyValueMetadata>& metadata,
+ std::shared_ptr<const KeyValueMetadata>* clean_metadata,
+ std::shared_ptr<::arrow::Schema>* out) {
+ if (metadata == nullptr) {
+ *out = nullptr;
+ *clean_metadata = nullptr;
+ return Status::OK();
+ }
+
+ static const std::string kArrowSchemaKey = "ARROW:schema";
+ int schema_index = metadata->FindKey(kArrowSchemaKey);
+ if (schema_index == -1) {
+ *out = nullptr;
+ *clean_metadata = metadata;
+ return Status::OK();
+ }
+
+ // The original Arrow schema was serialized using the store_schema option.
+ // We deserialize it here and use it to inform read options such as
+ // dictionary-encoded fields.
+ auto decoded = ::arrow::util::base64_decode(metadata->value(schema_index));
+ auto schema_buf = std::make_shared<Buffer>(decoded);
+
+ ::arrow::ipc::DictionaryMemo dict_memo;
+ ::arrow::io::BufferReader input(schema_buf);
+
+ ARROW_ASSIGN_OR_RAISE(*out, ::arrow::ipc::ReadSchema(&input, &dict_memo));
+
+ if (metadata->size() > 1) {
+ // Copy the metadata without the schema key
+ auto new_metadata = ::arrow::key_value_metadata({}, {});
+ new_metadata->reserve(metadata->size() - 1);
+ for (int64_t i = 0; i < metadata->size(); ++i) {
+ if (i == schema_index) continue;
+ new_metadata->Append(metadata->key(i), metadata->value(i));
+ }
+ *clean_metadata = new_metadata;
+ } else {
+ // No other keys, let metadata be null
+ *clean_metadata = nullptr;
+ }
+ return Status::OK();
+}
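+
+// For example, metadata {"ARROW:schema": <base64 IPC schema>, "foo": "bar"}
+// yields the deserialized origin schema plus clean metadata {"foo": "bar"};
+// if "ARROW:schema" was the only key, *clean_metadata is set to null.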
+
+// Restore original Arrow field information that was serialized as Parquet metadata
+// but that is not necessarily present in the field reconstituted from Parquet
+// data (for example, Parquet timestamp types don't carry timezone information).
+
+Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred);
+
+std::function<std::shared_ptr<::arrow::DataType>(FieldVector)> GetNestedFactory(
+ const ArrowType& origin_type, const ArrowType& inferred_type) {
+ switch (inferred_type.id()) {
+ case ::arrow::Type::STRUCT:
+ if (origin_type.id() == ::arrow::Type::STRUCT) {
+ return ::arrow::struct_;
+ }
+ break;
+ case ::arrow::Type::LIST:
+ if (origin_type.id() == ::arrow::Type::LIST) {
+ return [](FieldVector fields) {
+ DCHECK_EQ(fields.size(), 1);
+ return ::arrow::list(std::move(fields[0]));
+ };
+ }
+ if (origin_type.id() == ::arrow::Type::LARGE_LIST) {
+ return [](FieldVector fields) {
+ DCHECK_EQ(fields.size(), 1);
+ return ::arrow::large_list(std::move(fields[0]));
+ };
+ }
+ if (origin_type.id() == ::arrow::Type::FIXED_SIZE_LIST) {
+ const auto list_size =
+ checked_cast<const ::arrow::FixedSizeListType&>(origin_type).list_size();
+ return [list_size](FieldVector fields) {
+ DCHECK_EQ(fields.size(), 1);
+ return ::arrow::fixed_size_list(std::move(fields[0]), list_size);
+ };
+ }
+ break;
+ default:
+ break;
+ }
+ return {};
+}
+
+Result<bool> ApplyOriginalStorageMetadata(const Field& origin_field,
+ SchemaField* inferred) {
+ bool modified = false;
+
+ auto origin_type = origin_field.type();
+ auto inferred_type = inferred->field->type();
+
+ const int num_children = inferred_type->num_fields();
+
+ if (num_children > 0 && origin_type->num_fields() == num_children) {
+ DCHECK_EQ(static_cast<int>(inferred->children.size()), num_children);
+ const auto factory = GetNestedFactory(*origin_type, *inferred_type);
+ if (factory) {
+ // The type may be modified (e.g. LargeList) while the children stay the same
+ modified |= origin_type->id() != inferred_type->id();
+
+ // Apply original metadata recursively to children
+ for (int i = 0; i < inferred_type->num_fields(); ++i) {
+ ARROW_ASSIGN_OR_RAISE(
+ const bool child_modified,
+ ApplyOriginalMetadata(*origin_type->field(i), &inferred->children[i]));
+ modified |= child_modified;
+ }
+ if (modified) {
+ // Recreate this field using the modified child fields
+ ::arrow::FieldVector modified_children(inferred_type->num_fields());
+ for (int i = 0; i < inferred_type->num_fields(); ++i) {
+ modified_children[i] = inferred->children[i].field;
+ }
+ inferred->field =
+ inferred->field->WithType(factory(std::move(modified_children)));
+ }
+ }
+ }
+
+ if (origin_type->id() == ::arrow::Type::TIMESTAMP &&
+ inferred_type->id() == ::arrow::Type::TIMESTAMP) {
+ // Restore time zone, if any
+ const auto& ts_type = checked_cast<const ::arrow::TimestampType&>(*inferred_type);
+ const auto& ts_origin_type =
+ checked_cast<const ::arrow::TimestampType&>(*origin_type);
+
+ // If the data is tz-aware, then set the original time zone, since Parquet
+ // has no native storage for timezones
+ if (ts_type.timezone() == "UTC" && ts_origin_type.timezone() != "") {
+ if (ts_type.unit() == ts_origin_type.unit()) {
+ inferred->field = inferred->field->WithType(origin_type);
+ } else {
+ auto ts_type_new = ::arrow::timestamp(ts_type.unit(), ts_origin_type.timezone());
+ inferred->field = inferred->field->WithType(ts_type_new);
+ }
+ }
+ modified = true;
+ }
+
+ if (origin_type->id() == ::arrow::Type::DICTIONARY &&
+ inferred_type->id() != ::arrow::Type::DICTIONARY &&
+ IsDictionaryReadSupported(*inferred_type)) {
+    // Direct dictionary reads are only supported for a couple of primitive types,
+ // so no need to recurse on value types.
+ const auto& dict_origin_type =
+ checked_cast<const ::arrow::DictionaryType&>(*origin_type);
+ inferred->field = inferred->field->WithType(
+ ::arrow::dictionary(::arrow::int32(), inferred_type, dict_origin_type.ordered()));
+ modified = true;
+ }
+
+ if ((origin_type->id() == ::arrow::Type::LARGE_BINARY &&
+ inferred_type->id() == ::arrow::Type::BINARY) ||
+ (origin_type->id() == ::arrow::Type::LARGE_STRING &&
+ inferred_type->id() == ::arrow::Type::STRING)) {
+ // Read back binary-like arrays with the intended offset width.
+ inferred->field = inferred->field->WithType(origin_type);
+ modified = true;
+ }
+
+ if (origin_type->id() == ::arrow::Type::DECIMAL256 &&
+ inferred_type->id() == ::arrow::Type::DECIMAL128) {
+ inferred->field = inferred->field->WithType(origin_type);
+ modified = true;
+ }
+
+ // Restore field metadata
+ std::shared_ptr<const KeyValueMetadata> field_metadata = origin_field.metadata();
+ if (field_metadata != nullptr) {
+ if (inferred->field->metadata()) {
+ // Prefer the metadata keys (like field_id) from the current metadata
+ field_metadata = field_metadata->Merge(*inferred->field->metadata());
+ }
+ inferred->field = inferred->field->WithMetadata(field_metadata);
+ modified = true;
+ }
+
+ return modified;
+}
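+
+// For example, an origin field timestamp(NANO, "America/New_York") stored in
+// Parquet as Timestamp(MICROS, adjustedToUTC=true) is inferred back as
+// timestamp(MICRO, "UTC") and restored as timestamp(MICRO, "America/New_York"):
+// the unit follows the file, the time zone follows the original schema.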
+
+Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred) {
+ bool modified = false;
+
+ auto origin_type = origin_field.type();
+ auto inferred_type = inferred->field->type();
+
+ if (origin_type->id() == ::arrow::Type::EXTENSION) {
+ const auto& ex_type = checked_cast<const ::arrow::ExtensionType&>(*origin_type);
+ auto origin_storage_field = origin_field.WithType(ex_type.storage_type());
+
+ // Apply metadata recursively to storage type
+ RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred));
+
+ // Restore extension type, if the storage type is the same as inferred
+ // from the Parquet type
+ if (ex_type.storage_type()->Equals(*inferred->field->type())) {
+ inferred->field = inferred->field->WithType(origin_type);
+ }
+ modified = true;
+ } else {
+ ARROW_ASSIGN_OR_RAISE(modified, ApplyOriginalStorageMetadata(origin_field, inferred));
+ }
+
+ return modified;
+}
+
+} // namespace
+
+Status FieldToNode(const std::shared_ptr<Field>& field,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties, NodePtr* out) {
+ return FieldToNode(field->name(), field, properties, arrow_properties, out);
+}
+
+Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties,
+ std::shared_ptr<SchemaDescriptor>* out) {
+ std::vector<NodePtr> nodes(arrow_schema->num_fields());
+ for (int i = 0; i < arrow_schema->num_fields(); i++) {
+ RETURN_NOT_OK(
+ FieldToNode(arrow_schema->field(i), properties, arrow_properties, &nodes[i]));
+ }
+
+ NodePtr schema = GroupNode::Make("schema", Repetition::REQUIRED, nodes);
+ *out = std::make_shared<::parquet::SchemaDescriptor>();
+ PARQUET_CATCH_NOT_OK((*out)->Init(schema));
+
+ return Status::OK();
+}
+
+Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+ const WriterProperties& properties,
+ std::shared_ptr<SchemaDescriptor>* out) {
+ return ToParquetSchema(arrow_schema, properties, *default_arrow_writer_properties(),
+ out);
+}
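+
+// A minimal usage sketch (variable names hypothetical):
+//
+//   std::shared_ptr<SchemaDescriptor> descr;
+//   RETURN_NOT_OK(ToParquetSchema(arrow_schema.get(),
+//                                 *default_writer_properties(), &descr));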
+
+Status FromParquetSchema(
+ const SchemaDescriptor* schema, const ArrowReaderProperties& properties,
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata,
+ std::shared_ptr<::arrow::Schema>* out) {
+ SchemaManifest manifest;
+ RETURN_NOT_OK(SchemaManifest::Make(schema, key_value_metadata, properties, &manifest));
+ std::vector<std::shared_ptr<Field>> fields(manifest.schema_fields.size());
+
+ for (int i = 0; i < static_cast<int>(fields.size()); i++) {
+ const auto& schema_field = manifest.schema_fields[i];
+ fields[i] = schema_field.field;
+ }
+ if (manifest.origin_schema) {
+ // ARROW-8980: If the ARROW:schema was in the input metadata, then
+ // manifest.origin_schema will have it scrubbed out
+ *out = ::arrow::schema(fields, manifest.origin_schema->metadata());
+ } else {
+ *out = ::arrow::schema(fields, key_value_metadata);
+ }
+ return Status::OK();
+}
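+
+// Minimal usage sketch (illustrative; assumes `descr` points at the
+// SchemaDescriptor of an already-opened Parquet file):
+//
+//   const SchemaDescriptor* descr = ...;
+//   ArrowReaderProperties props;
+//   std::shared_ptr<::arrow::Schema> arrow_schema;
+//   RETURN_NOT_OK(FromParquetSchema(descr, props,
+//                                   /*key_value_metadata=*/nullptr, &arrow_schema));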
+
+Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+ const ArrowReaderProperties& properties,
+ std::shared_ptr<::arrow::Schema>* out) {
+ return FromParquetSchema(parquet_schema, properties, nullptr, out);
+}
+
+Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+ std::shared_ptr<::arrow::Schema>* out) {
+ ArrowReaderProperties properties;
+ return FromParquetSchema(parquet_schema, properties, nullptr, out);
+}
+
+Status SchemaManifest::Make(const SchemaDescriptor* schema,
+ const std::shared_ptr<const KeyValueMetadata>& metadata,
+ const ArrowReaderProperties& properties,
+ SchemaManifest* manifest) {
+ SchemaTreeContext ctx;
+ ctx.manifest = manifest;
+ ctx.properties = properties;
+ ctx.schema = schema;
+ const GroupNode& schema_node = *schema->group_node();
+ manifest->descr = schema;
+ manifest->schema_fields.resize(schema_node.field_count());
+
+ // Try to deserialize original Arrow schema
+ RETURN_NOT_OK(
+ GetOriginSchema(metadata, &manifest->schema_metadata, &manifest->origin_schema));
+ // Ignore original schema if it's not compatible with the Parquet schema
+ if (manifest->origin_schema != nullptr &&
+ manifest->origin_schema->num_fields() != schema_node.field_count()) {
+ manifest->origin_schema = nullptr;
+ }
+
+ for (int i = 0; i < static_cast<int>(schema_node.field_count()); ++i) {
+ SchemaField* out_field = &manifest->schema_fields[i];
+ RETURN_NOT_OK(NodeToSchemaField(*schema_node.field(i), LevelInfo(), &ctx,
+ /*parent=*/nullptr, out_field));
+
+ // TODO(wesm): as a follow-up to ARROW-3246, we should really pass the origin
+ // schema (if any) through all functions in the schema reconstruction, but
+ // I'm being lazy and just setting dictionary fields at the top level for
+ // now
+ if (manifest->origin_schema == nullptr) {
+ continue;
+ }
+
+ auto origin_field = manifest->origin_schema->field(i);
+ RETURN_NOT_OK(ApplyOriginalMetadata(*origin_field, out_field));
+ }
+ return Status::OK();
+}
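+
+// Usage sketch (illustrative): build a manifest, then resolve the leaf field
+// for a given Parquet column index.
+//
+//   SchemaManifest manifest;
+//   RETURN_NOT_OK(SchemaManifest::Make(descr, key_value_metadata, props, &manifest));
+//   const SchemaField* leaf = nullptr;
+//   RETURN_NOT_OK(manifest.GetColumnField(0, &leaf));
+//   // leaf->field is the reconstructed ::arrow::Field for column 0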
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h
index a5c3a58176d..dd60fde4342 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema.h
@@ -1,184 +1,184 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cassert>
-#include <memory>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-#include "arrow/result.h"
-#include "arrow/status.h"
-#include "arrow/type.h"
-#include "arrow/type_fwd.h"
-
-#include "parquet/level_conversion.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-
-namespace parquet {
-
-class ArrowReaderProperties;
-class ArrowWriterProperties;
-class WriterProperties;
-
-namespace arrow {
-
-/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
-/// schema into a Parquet schema.
-///
-/// @{
-
-PARQUET_EXPORT
-::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties,
- schema::NodePtr* out);
-
-PARQUET_EXPORT
-::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
- const WriterProperties& properties,
- const ArrowWriterProperties& arrow_properties,
- std::shared_ptr<SchemaDescriptor>* out);
-
-PARQUET_EXPORT
-::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
- const WriterProperties& properties,
- std::shared_ptr<SchemaDescriptor>* out);
-
-/// @}
-
-/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
-/// schema into an Arrow schema.
-///
-/// @{
-
-PARQUET_EXPORT
-::arrow::Status FromParquetSchema(
- const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
- const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
- std::shared_ptr<::arrow::Schema>* out);
-
-PARQUET_EXPORT
-::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
- const ArrowReaderProperties& properties,
- std::shared_ptr<::arrow::Schema>* out);
-
-PARQUET_EXPORT
-::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
- std::shared_ptr<::arrow::Schema>* out);
-
-/// @}
-
-/// \brief Bridge between an arrow::Field and parquet column indices.
-struct PARQUET_EXPORT SchemaField {
- std::shared_ptr<::arrow::Field> field;
- std::vector<SchemaField> children;
-
- // Only set for leaf nodes
- int column_index = -1;
-
- parquet::internal::LevelInfo level_info;
-
- bool is_leaf() const { return column_index != -1; }
-};
-
-/// \brief Bridge between a parquet Schema and an arrow Schema.
-///
-/// Exposes parquet columns as a tree structure. Useful to traverse and link
-/// between arrow's Schema and parquet's Schema.
-struct PARQUET_EXPORT SchemaManifest {
- static ::arrow::Status Make(
- const SchemaDescriptor* schema,
- const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
- const ArrowReaderProperties& properties, SchemaManifest* manifest);
-
- const SchemaDescriptor* descr;
- std::shared_ptr<::arrow::Schema> origin_schema;
- std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
- std::vector<SchemaField> schema_fields;
-
- std::unordered_map<int, const SchemaField*> column_index_to_field;
- std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;
-
- ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
- auto it = column_index_to_field.find(column_index);
- if (it == column_index_to_field.end()) {
- return ::arrow::Status::KeyError("Column index ", column_index,
- " not found in schema manifest, may be malformed");
- }
- *out = it->second;
- return ::arrow::Status::OK();
- }
-
- const SchemaField* GetParent(const SchemaField* field) const {
- // Also returns nullptr if the field is not found
- auto it = child_to_parent.find(field);
- if (it == child_to_parent.end()) {
- return NULLPTR;
- }
- return it->second;
- }
-
- /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
- /// correspond to the column root (first node below the parquet schema's root group) of
- /// each leaf referenced in column_indices.
- ///
- /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
- /// the roots are `a` and `i` (return=[0,2]).
- ///
- /// root
- /// -- a <------
- /// -- -- b | |
- /// -- -- -- c |
- /// -- -- -- d |
- /// -- -- -- -- e
- /// -- f
- /// -- -- g
- /// -- -- -- h
- /// -- i <---
- /// -- -- j |
- /// -- -- -- k
- ::arrow::Result<std::vector<int>> GetFieldIndices(
- const std::vector<int>& column_indices) const {
- const schema::GroupNode* group = descr->group_node();
- std::unordered_set<int> already_added;
-
- std::vector<int> out;
- for (int column_idx : column_indices) {
- if (column_idx < 0 || column_idx >= descr->num_columns()) {
- return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
- }
-
- auto field_node = descr->GetColumnRoot(column_idx);
- auto field_idx = group->FieldIndex(*field_node);
- if (field_idx == -1) {
- return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
- }
-
- if (already_added.insert(field_idx).second) {
- out.push_back(field_idx);
- }
- }
- return out;
- }
-};
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+
+#include "parquet/level_conversion.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+
+class ArrowReaderProperties;
+class ArrowWriterProperties;
+class WriterProperties;
+
+namespace arrow {
+
+/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
+/// schema into a Parquet schema.
+///
+/// @{
+
+PARQUET_EXPORT
+::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties,
+ schema::NodePtr* out);
+
+PARQUET_EXPORT
+::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+ const WriterProperties& properties,
+ const ArrowWriterProperties& arrow_properties,
+ std::shared_ptr<SchemaDescriptor>* out);
+
+PARQUET_EXPORT
+::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+ const WriterProperties& properties,
+ std::shared_ptr<SchemaDescriptor>* out);
+
+/// @}
+
+/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
+/// schema into an Arrow schema.
+///
+/// @{
+
+PARQUET_EXPORT
+::arrow::Status FromParquetSchema(
+ const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
+ std::shared_ptr<::arrow::Schema>* out);
+
+PARQUET_EXPORT
+::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+ const ArrowReaderProperties& properties,
+ std::shared_ptr<::arrow::Schema>* out);
+
+PARQUET_EXPORT
+::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+ std::shared_ptr<::arrow::Schema>* out);
+
+/// @}
+
+/// \brief Bridge between an arrow::Field and parquet column indices.
+struct PARQUET_EXPORT SchemaField {
+ std::shared_ptr<::arrow::Field> field;
+ std::vector<SchemaField> children;
+
+ // Only set for leaf nodes
+ int column_index = -1;
+
+ parquet::internal::LevelInfo level_info;
+
+ bool is_leaf() const { return column_index != -1; }
+};
+
+/// \brief Bridge between a parquet Schema and an arrow Schema.
+///
+/// Exposes parquet columns as a tree structure. Useful to traverse and link
+/// between arrow's Schema and parquet's Schema.
+struct PARQUET_EXPORT SchemaManifest {
+ static ::arrow::Status Make(
+ const SchemaDescriptor* schema,
+ const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
+ const ArrowReaderProperties& properties, SchemaManifest* manifest);
+
+ const SchemaDescriptor* descr;
+ std::shared_ptr<::arrow::Schema> origin_schema;
+ std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
+ std::vector<SchemaField> schema_fields;
+
+ std::unordered_map<int, const SchemaField*> column_index_to_field;
+ std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;
+
+ ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
+ auto it = column_index_to_field.find(column_index);
+ if (it == column_index_to_field.end()) {
+ return ::arrow::Status::KeyError("Column index ", column_index,
+ " not found in schema manifest, may be malformed");
+ }
+ *out = it->second;
+ return ::arrow::Status::OK();
+ }
+
+ const SchemaField* GetParent(const SchemaField* field) const {
+ // Also returns nullptr if the field is not found
+ auto it = child_to_parent.find(field);
+ if (it == child_to_parent.end()) {
+ return NULLPTR;
+ }
+ return it->second;
+ }
+
+ /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
+ /// correspond to the column root (first node below the parquet schema's root group) of
+ /// each leaf referenced in column_indices.
+ ///
+ /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
+ /// the roots are `a` and `i` (return=[0,2]).
+ ///
+ /// root
+ /// -- a <------
+ /// -- -- b | |
+ /// -- -- -- c |
+ /// -- -- -- d |
+ /// -- -- -- -- e
+ /// -- f
+ /// -- -- g
+ /// -- -- -- h
+ /// -- i <---
+ /// -- -- j |
+ /// -- -- -- k
+ ::arrow::Result<std::vector<int>> GetFieldIndices(
+ const std::vector<int>& column_indices) const {
+ const schema::GroupNode* group = descr->group_node();
+ std::unordered_set<int> already_added;
+
+ std::vector<int> out;
+ for (int column_idx : column_indices) {
+ if (column_idx < 0 || column_idx >= descr->num_columns()) {
+ return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
+ }
+
+ auto field_node = descr->GetColumnRoot(column_idx);
+ auto field_idx = group->FieldIndex(*field_node);
+ if (field_idx == -1) {
+ return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
+ }
+
+ if (already_added.insert(field_idx).second) {
+ out.push_back(field_idx);
+ }
+ }
+ return out;
+ }
+};
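+
+// Usage sketch for GetFieldIndices, using the tree drawn in the doc comment
+// above (leaves a.b.c, a.b.d.e and i.j.k):
+//
+//   ARROW_ASSIGN_OR_RAISE(std::vector<int> roots,
+//                         manifest.GetFieldIndices({0, 1, 3}));
+//   // roots == {0, 2}, i.e. the top-level fields `a` and `i`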
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc
index 13acbb3d555..064bf4f55cc 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.cc
@@ -1,222 +1,222 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/arrow/schema_internal.h"
-
-#include "arrow/type.h"
-
-using ArrowType = ::arrow::DataType;
-using ArrowTypeId = ::arrow::Type;
-using ParquetType = parquet::Type;
-
-namespace parquet {
-
-namespace arrow {
-
-using ::arrow::Result;
-using ::arrow::Status;
-using ::arrow::internal::checked_cast;
-
-Result<std::shared_ptr<ArrowType>> MakeArrowDecimal(const LogicalType& logical_type) {
- const auto& decimal = checked_cast<const DecimalLogicalType&>(logical_type);
- if (decimal.precision() <= ::arrow::Decimal128Type::kMaxPrecision) {
- return ::arrow::Decimal128Type::Make(decimal.precision(), decimal.scale());
- }
- return ::arrow::Decimal256Type::Make(decimal.precision(), decimal.scale());
-}
-
-Result<std::shared_ptr<ArrowType>> MakeArrowInt(const LogicalType& logical_type) {
- const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
- switch (integer.bit_width()) {
- case 8:
- return integer.is_signed() ? ::arrow::int8() : ::arrow::uint8();
- case 16:
- return integer.is_signed() ? ::arrow::int16() : ::arrow::uint16();
- case 32:
- return integer.is_signed() ? ::arrow::int32() : ::arrow::uint32();
- default:
- return Status::TypeError(logical_type.ToString(),
- " can not annotate physical type Int32");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> MakeArrowInt64(const LogicalType& logical_type) {
- const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
- switch (integer.bit_width()) {
- case 64:
- return integer.is_signed() ? ::arrow::int64() : ::arrow::uint64();
- default:
- return Status::TypeError(logical_type.ToString(),
- " can not annotate physical type Int64");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> MakeArrowTime32(const LogicalType& logical_type) {
- const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
- switch (time.time_unit()) {
- case LogicalType::TimeUnit::MILLIS:
- return ::arrow::time32(::arrow::TimeUnit::MILLI);
- default:
- return Status::TypeError(logical_type.ToString(),
- " can not annotate physical type Time32");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> MakeArrowTime64(const LogicalType& logical_type) {
- const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
- switch (time.time_unit()) {
- case LogicalType::TimeUnit::MICROS:
- return ::arrow::time64(::arrow::TimeUnit::MICRO);
- case LogicalType::TimeUnit::NANOS:
- return ::arrow::time64(::arrow::TimeUnit::NANO);
- default:
- return Status::TypeError(logical_type.ToString(),
- " can not annotate physical type Time64");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> MakeArrowTimestamp(const LogicalType& logical_type) {
- const auto& timestamp = checked_cast<const TimestampLogicalType&>(logical_type);
- const bool utc_normalized =
- timestamp.is_from_converted_type() ? false : timestamp.is_adjusted_to_utc();
- static const char* utc_timezone = "UTC";
- switch (timestamp.time_unit()) {
- case LogicalType::TimeUnit::MILLIS:
- return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc_timezone)
- : ::arrow::timestamp(::arrow::TimeUnit::MILLI));
- case LogicalType::TimeUnit::MICROS:
- return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc_timezone)
- : ::arrow::timestamp(::arrow::TimeUnit::MICRO));
- case LogicalType::TimeUnit::NANOS:
- return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc_timezone)
- : ::arrow::timestamp(::arrow::TimeUnit::NANO));
- default:
- return Status::TypeError("Unrecognized time unit in timestamp logical_type: ",
- logical_type.ToString());
- }
-}
-
-Result<std::shared_ptr<ArrowType>> FromByteArray(const LogicalType& logical_type) {
- switch (logical_type.type()) {
- case LogicalType::Type::STRING:
- return ::arrow::utf8();
- case LogicalType::Type::DECIMAL:
- return MakeArrowDecimal(logical_type);
- case LogicalType::Type::NONE:
- case LogicalType::Type::ENUM:
- case LogicalType::Type::JSON:
- case LogicalType::Type::BSON:
- return ::arrow::binary();
- default:
- return Status::NotImplemented("Unhandled logical logical_type ",
- logical_type.ToString(), " for binary array");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type,
- int32_t physical_length) {
- switch (logical_type.type()) {
- case LogicalType::Type::DECIMAL:
- return MakeArrowDecimal(logical_type);
- case LogicalType::Type::NONE:
- case LogicalType::Type::INTERVAL:
- case LogicalType::Type::UUID:
- return ::arrow::fixed_size_binary(physical_length);
- default:
- return Status::NotImplemented("Unhandled logical logical_type ",
- logical_type.ToString(),
- " for fixed-length binary array");
- }
-}
-
-::arrow::Result<std::shared_ptr<ArrowType>> FromInt32(const LogicalType& logical_type) {
- switch (logical_type.type()) {
- case LogicalType::Type::INT:
- return MakeArrowInt(logical_type);
- case LogicalType::Type::DATE:
- return ::arrow::date32();
- case LogicalType::Type::TIME:
- return MakeArrowTime32(logical_type);
- case LogicalType::Type::DECIMAL:
- return MakeArrowDecimal(logical_type);
- case LogicalType::Type::NONE:
- return ::arrow::int32();
- default:
- return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
- " for INT32");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> FromInt64(const LogicalType& logical_type) {
- switch (logical_type.type()) {
- case LogicalType::Type::INT:
- return MakeArrowInt64(logical_type);
- case LogicalType::Type::DECIMAL:
- return MakeArrowDecimal(logical_type);
- case LogicalType::Type::TIMESTAMP:
- return MakeArrowTimestamp(logical_type);
- case LogicalType::Type::TIME:
- return MakeArrowTime64(logical_type);
- case LogicalType::Type::NONE:
- return ::arrow::int64();
- default:
- return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
- " for INT64");
- }
-}
-
-Result<std::shared_ptr<ArrowType>> GetArrowType(
- Type::type physical_type, const LogicalType& logical_type, int type_length,
- const ::arrow::TimeUnit::type int96_arrow_time_unit) {
- if (logical_type.is_invalid() || logical_type.is_null()) {
- return ::arrow::null();
- }
-
- switch (physical_type) {
- case ParquetType::BOOLEAN:
- return ::arrow::boolean();
- case ParquetType::INT32:
- return FromInt32(logical_type);
- case ParquetType::INT64:
- return FromInt64(logical_type);
- case ParquetType::INT96:
- return ::arrow::timestamp(int96_arrow_time_unit);
- case ParquetType::FLOAT:
- return ::arrow::float32();
- case ParquetType::DOUBLE:
- return ::arrow::float64();
- case ParquetType::BYTE_ARRAY:
- return FromByteArray(logical_type);
- case ParquetType::FIXED_LEN_BYTE_ARRAY:
- return FromFLBA(logical_type, type_length);
- default: {
- // PARQUET-1565: This can occur if the file is corrupt
- return Status::IOError("Invalid physical column type: ",
- TypeToString(physical_type));
- }
- }
-}
-
-Result<std::shared_ptr<ArrowType>> GetArrowType(
- const schema::PrimitiveNode& primitive,
- const ::arrow::TimeUnit::type int96_arrow_time_unit) {
- return GetArrowType(primitive.physical_type(), *primitive.logical_type(),
- primitive.type_length(), int96_arrow_time_unit);
-}
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/schema_internal.h"
+
+#include "arrow/type.h"
+
+using ArrowType = ::arrow::DataType;
+using ArrowTypeId = ::arrow::Type;
+using ParquetType = parquet::Type;
+
+namespace parquet {
+
+namespace arrow {
+
+using ::arrow::Result;
+using ::arrow::Status;
+using ::arrow::internal::checked_cast;
+
+Result<std::shared_ptr<ArrowType>> MakeArrowDecimal(const LogicalType& logical_type) {
+ const auto& decimal = checked_cast<const DecimalLogicalType&>(logical_type);
+ if (decimal.precision() <= ::arrow::Decimal128Type::kMaxPrecision) {
+ return ::arrow::Decimal128Type::Make(decimal.precision(), decimal.scale());
+ }
+ return ::arrow::Decimal256Type::Make(decimal.precision(), decimal.scale());
+}
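+
+// For example (sketch): DECIMAL(38, 10) is within Decimal128Type::kMaxPrecision
+// (38) and maps to ::arrow::decimal128(38, 10); DECIMAL(39, 10) exceeds it and
+// maps to ::arrow::decimal256(39, 10).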
+
+Result<std::shared_ptr<ArrowType>> MakeArrowInt(const LogicalType& logical_type) {
+ const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
+ switch (integer.bit_width()) {
+ case 8:
+ return integer.is_signed() ? ::arrow::int8() : ::arrow::uint8();
+ case 16:
+ return integer.is_signed() ? ::arrow::int16() : ::arrow::uint16();
+ case 32:
+ return integer.is_signed() ? ::arrow::int32() : ::arrow::uint32();
+ default:
+ return Status::TypeError(logical_type.ToString(),
+ " can not annotate physical type Int32");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowInt64(const LogicalType& logical_type) {
+ const auto& integer = checked_cast<const IntLogicalType&>(logical_type);
+ switch (integer.bit_width()) {
+ case 64:
+ return integer.is_signed() ? ::arrow::int64() : ::arrow::uint64();
+ default:
+ return Status::TypeError(logical_type.ToString(),
+ " can not annotate physical type Int64");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowTime32(const LogicalType& logical_type) {
+ const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
+ switch (time.time_unit()) {
+ case LogicalType::TimeUnit::MILLIS:
+ return ::arrow::time32(::arrow::TimeUnit::MILLI);
+ default:
+ return Status::TypeError(logical_type.ToString(),
+ " can not annotate physical type Time32");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowTime64(const LogicalType& logical_type) {
+ const auto& time = checked_cast<const TimeLogicalType&>(logical_type);
+ switch (time.time_unit()) {
+ case LogicalType::TimeUnit::MICROS:
+ return ::arrow::time64(::arrow::TimeUnit::MICRO);
+ case LogicalType::TimeUnit::NANOS:
+ return ::arrow::time64(::arrow::TimeUnit::NANO);
+ default:
+ return Status::TypeError(logical_type.ToString(),
+ " can not annotate physical type Time64");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> MakeArrowTimestamp(const LogicalType& logical_type) {
+ const auto& timestamp = checked_cast<const TimestampLogicalType&>(logical_type);
+ const bool utc_normalized =
+ timestamp.is_from_converted_type() ? false : timestamp.is_adjusted_to_utc();
+ static const char* utc_timezone = "UTC";
+ switch (timestamp.time_unit()) {
+ case LogicalType::TimeUnit::MILLIS:
+ return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MILLI, utc_timezone)
+ : ::arrow::timestamp(::arrow::TimeUnit::MILLI));
+ case LogicalType::TimeUnit::MICROS:
+ return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::MICRO, utc_timezone)
+ : ::arrow::timestamp(::arrow::TimeUnit::MICRO));
+ case LogicalType::TimeUnit::NANOS:
+ return (utc_normalized ? ::arrow::timestamp(::arrow::TimeUnit::NANO, utc_timezone)
+ : ::arrow::timestamp(::arrow::TimeUnit::NANO));
+ default:
+ return Status::TypeError("Unrecognized time unit in timestamp logical_type: ",
+ logical_type.ToString());
+ }
+}
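+
+// Mapping sketch: a MICROS timestamp with isAdjustedToUTC=true becomes
+// timestamp(MICRO, "UTC"); without the flag, or when the annotation was
+// inferred from a legacy ConvertedType, it becomes the zone-naive
+// timestamp(MICRO).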
+
+Result<std::shared_ptr<ArrowType>> FromByteArray(const LogicalType& logical_type) {
+ switch (logical_type.type()) {
+ case LogicalType::Type::STRING:
+ return ::arrow::utf8();
+ case LogicalType::Type::DECIMAL:
+ return MakeArrowDecimal(logical_type);
+ case LogicalType::Type::NONE:
+ case LogicalType::Type::ENUM:
+ case LogicalType::Type::JSON:
+ case LogicalType::Type::BSON:
+ return ::arrow::binary();
+ default:
+ return Status::NotImplemented("Unhandled logical logical_type ",
+ logical_type.ToString(), " for binary array");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type,
+ int32_t physical_length) {
+ switch (logical_type.type()) {
+ case LogicalType::Type::DECIMAL:
+ return MakeArrowDecimal(logical_type);
+ case LogicalType::Type::NONE:
+ case LogicalType::Type::INTERVAL:
+ case LogicalType::Type::UUID:
+ return ::arrow::fixed_size_binary(physical_length);
+ default:
+ return Status::NotImplemented("Unhandled logical logical_type ",
+ logical_type.ToString(),
+ " for fixed-length binary array");
+ }
+}
+
+::arrow::Result<std::shared_ptr<ArrowType>> FromInt32(const LogicalType& logical_type) {
+ switch (logical_type.type()) {
+ case LogicalType::Type::INT:
+ return MakeArrowInt(logical_type);
+ case LogicalType::Type::DATE:
+ return ::arrow::date32();
+ case LogicalType::Type::TIME:
+ return MakeArrowTime32(logical_type);
+ case LogicalType::Type::DECIMAL:
+ return MakeArrowDecimal(logical_type);
+ case LogicalType::Type::NONE:
+ return ::arrow::int32();
+ default:
+ return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
+ " for INT32");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> FromInt64(const LogicalType& logical_type) {
+ switch (logical_type.type()) {
+ case LogicalType::Type::INT:
+ return MakeArrowInt64(logical_type);
+ case LogicalType::Type::DECIMAL:
+ return MakeArrowDecimal(logical_type);
+ case LogicalType::Type::TIMESTAMP:
+ return MakeArrowTimestamp(logical_type);
+ case LogicalType::Type::TIME:
+ return MakeArrowTime64(logical_type);
+ case LogicalType::Type::NONE:
+ return ::arrow::int64();
+ default:
+ return Status::NotImplemented("Unhandled logical type ", logical_type.ToString(),
+ " for INT64");
+ }
+}
+
+Result<std::shared_ptr<ArrowType>> GetArrowType(
+ Type::type physical_type, const LogicalType& logical_type, int type_length,
+ const ::arrow::TimeUnit::type int96_arrow_time_unit) {
+ if (logical_type.is_invalid() || logical_type.is_null()) {
+ return ::arrow::null();
+ }
+
+ switch (physical_type) {
+ case ParquetType::BOOLEAN:
+ return ::arrow::boolean();
+ case ParquetType::INT32:
+ return FromInt32(logical_type);
+ case ParquetType::INT64:
+ return FromInt64(logical_type);
+ case ParquetType::INT96:
+ return ::arrow::timestamp(int96_arrow_time_unit);
+ case ParquetType::FLOAT:
+ return ::arrow::float32();
+ case ParquetType::DOUBLE:
+ return ::arrow::float64();
+ case ParquetType::BYTE_ARRAY:
+ return FromByteArray(logical_type);
+ case ParquetType::FIXED_LEN_BYTE_ARRAY:
+ return FromFLBA(logical_type, type_length);
+ default: {
+ // PARQUET-1565: This can occur if the file is corrupt
+ return Status::IOError("Invalid physical column type: ",
+ TypeToString(physical_type));
+ }
+ }
+}
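+
+// Dispatch examples (sketch): INT32 annotated DECIMAL(9, 2) yields
+// decimal128(9, 2); BYTE_ARRAY annotated STRING yields utf8(); INT96 always
+// yields a timestamp in the requested int96_arrow_time_unit.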
+
+Result<std::shared_ptr<ArrowType>> GetArrowType(
+ const schema::PrimitiveNode& primitive,
+ const ::arrow::TimeUnit::type int96_arrow_time_unit) {
+ return GetArrowType(primitive.physical_type(), *primitive.logical_type(),
+ primitive.type_length(), int96_arrow_time_unit);
+}
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h
index c48fd7c938a..fb837c3ee6c 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/schema_internal.h
@@ -1,51 +1,51 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/result.h"
-#include "parquet/schema.h"
-
-namespace arrow {
-class DataType;
-}
-
-namespace parquet {
-namespace arrow {
-
-using ::arrow::Result;
-
-Result<std::shared_ptr<::arrow::DataType>> FromByteArray(const LogicalType& logical_type);
-Result<std::shared_ptr<::arrow::DataType>> FromFLBA(const LogicalType& logical_type,
- int32_t physical_length);
-Result<std::shared_ptr<::arrow::DataType>> FromInt32(const LogicalType& logical_type);
-Result<std::shared_ptr<::arrow::DataType>> FromInt64(const LogicalType& logical_type);
-
-Result<std::shared_ptr<::arrow::DataType>> GetArrowType(Type::type physical_type,
- const LogicalType& logical_type,
- int type_length);
-
-Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
- Type::type physical_type, const LogicalType& logical_type, int type_length,
- ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
-
-Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
- const schema::PrimitiveNode& primitive,
- ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/result.h"
+#include "parquet/schema.h"
+
+namespace arrow {
+class DataType;
+}
+
+namespace parquet {
+namespace arrow {
+
+using ::arrow::Result;
+
+Result<std::shared_ptr<::arrow::DataType>> FromByteArray(const LogicalType& logical_type);
+Result<std::shared_ptr<::arrow::DataType>> FromFLBA(const LogicalType& logical_type,
+ int32_t physical_length);
+Result<std::shared_ptr<::arrow::DataType>> FromInt32(const LogicalType& logical_type);
+Result<std::shared_ptr<::arrow::DataType>> FromInt64(const LogicalType& logical_type);
+
+Result<std::shared_ptr<::arrow::DataType>> GetArrowType(Type::type physical_type,
+ const LogicalType& logical_type,
+ int type_length);
+
+Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
+ Type::type physical_type, const LogicalType& logical_type, int type_length,
+ ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
+
+Result<std::shared_ptr<::arrow::DataType>> GetArrowType(
+ const schema::PrimitiveNode& primitive,
+ ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO);
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc
index 797069eb327..2fbebf27fce 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.cc
@@ -1,482 +1,482 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/arrow/writer.h"
-
-#include <algorithm>
-#include <deque>
-#include <string>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/extension_type.h"
-#include "arrow/ipc/writer.h"
-#include "arrow/table.h"
-#include "arrow/type.h"
-#include "arrow/util/base64.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/make_unique.h"
-#include "arrow/visitor_inline.h"
-
-#include "parquet/arrow/path_internal.h"
-#include "parquet/arrow/reader_internal.h"
-#include "parquet/arrow/schema.h"
-#include "parquet/column_writer.h"
-#include "parquet/exception.h"
-#include "parquet/file_writer.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-
-using arrow::Array;
-using arrow::BinaryArray;
-using arrow::BooleanArray;
-using arrow::ChunkedArray;
-using arrow::DataType;
-using arrow::DictionaryArray;
-using arrow::ExtensionArray;
-using arrow::ExtensionType;
-using arrow::Field;
-using arrow::FixedSizeBinaryArray;
-using arrow::ListArray;
-using arrow::MemoryPool;
-using arrow::NumericArray;
-using arrow::PrimitiveArray;
-using arrow::ResizableBuffer;
-using arrow::Status;
-using arrow::Table;
-using arrow::TimeUnit;
-
-using arrow::internal::checked_cast;
-
-using parquet::ParquetFileWriter;
-using parquet::ParquetVersion;
-using parquet::schema::GroupNode;
-
-namespace parquet {
-namespace arrow {
-
-namespace {
-
-int CalculateLeafCount(const DataType* type) {
- if (type->id() == ::arrow::Type::EXTENSION) {
- type = checked_cast<const ExtensionType&>(*type).storage_type().get();
- }
- // Note num_fields() can be 0 for an empty struct type
- if (!::arrow::is_nested(type->id())) {
- // Primitive type.
- return 1;
- }
-
- int num_leaves = 0;
- for (const auto& field : type->fields()) {
- num_leaves += CalculateLeafCount(field->type().get());
- }
- return num_leaves;
-}
-
-// Determines if the |schema_field|'s root ancestor is nullable.
-bool HasNullableRoot(const SchemaManifest& schema_manifest,
- const SchemaField* schema_field) {
- DCHECK(schema_field != nullptr);
- const SchemaField* current_field = schema_field;
- bool nullable = schema_field->field->nullable();
- while (current_field != nullptr) {
- nullable = current_field->field->nullable();
- current_field = schema_manifest.GetParent(current_field);
- }
- return nullable;
-}
-
-// Manages writing nested parquet columns with support for all nested types
-// supported by parquet.
-class ArrowColumnWriterV2 {
- public:
- // Constructs a new object (use the Make() method below to construct from
- // a ChunkedArray).
- // level_builders should contain one MultipathLevelBuilder per chunk of the
- // Arrow-column to write.
- ArrowColumnWriterV2(std::vector<std::unique_ptr<MultipathLevelBuilder>> level_builders,
- int leaf_count, RowGroupWriter* row_group_writer)
- : level_builders_(std::move(level_builders)),
- leaf_count_(leaf_count),
- row_group_writer_(row_group_writer) {}
-
- // Writes out all leaf parquet columns to the RowGroupWriter that this
- // object was constructed with. Each leaf column is written fully before
- // the next column is written (i.e. no buffering is assumed).
- //
- // Columns are written in DFS order.
- Status Write(ArrowWriteContext* ctx) {
- for (int leaf_idx = 0; leaf_idx < leaf_count_; leaf_idx++) {
- ColumnWriter* column_writer;
- PARQUET_CATCH_NOT_OK(column_writer = row_group_writer_->NextColumn());
- for (auto& level_builder : level_builders_) {
- RETURN_NOT_OK(level_builder->Write(
- leaf_idx, ctx, [&](const MultipathLevelBuilderResult& result) {
- size_t visited_component_size = result.post_list_visited_elements.size();
- DCHECK_GT(visited_component_size, 0);
- if (visited_component_size != 1) {
- return Status::NotImplemented(
- "Lists with non-zero length null components are not supported");
- }
- const ElementRange& range = result.post_list_visited_elements[0];
- std::shared_ptr<Array> values_array =
- result.leaf_array->Slice(range.start, range.Size());
-
- return column_writer->WriteArrow(result.def_levels, result.rep_levels,
- result.def_rep_level_count, *values_array,
- ctx, result.leaf_is_nullable);
- }));
- }
-
- PARQUET_CATCH_NOT_OK(column_writer->Close());
- }
- return Status::OK();
- }
-
- // Make a new object by converting each chunk in |data| to a MultipathLevelBuilder.
- //
- // It is necessary to create a new builder per array because the MultipathLevelBuilder
- // extracts the data necessary for writing each leaf column at construction time
- // (it optimizes based on null count), and slicing via |offset| creates ephemeral
- // chunks which need to be tracked across each leaf column-write.
- // This decision could potentially be revisited if we wanted to use "buffered"
- // RowGroupWriters (we could construct each builder on demand in that case).
- static ::arrow::Result<std::unique_ptr<ArrowColumnWriterV2>> Make(
- const ChunkedArray& data, int64_t offset, const int64_t size,
- const SchemaManifest& schema_manifest, RowGroupWriter* row_group_writer) {
- int64_t absolute_position = 0;
- int chunk_index = 0;
- int64_t chunk_offset = 0;
- if (data.length() == 0) {
- return ::arrow::internal::make_unique<ArrowColumnWriterV2>(
- std::vector<std::unique_ptr<MultipathLevelBuilder>>{},
- CalculateLeafCount(data.type().get()), row_group_writer);
- }
- while (chunk_index < data.num_chunks() && absolute_position < offset) {
- const int64_t chunk_length = data.chunk(chunk_index)->length();
- if (absolute_position + chunk_length > offset) {
- // Relative offset into the chunk to reach the desired start offset for
- // writing
- chunk_offset = offset - absolute_position;
- break;
- } else {
- ++chunk_index;
- absolute_position += chunk_length;
- }
- }
-
- if (absolute_position >= data.length()) {
- return Status::Invalid("Cannot write data at offset past end of chunked array");
- }
-
- int64_t values_written = 0;
- std::vector<std::unique_ptr<MultipathLevelBuilder>> builders;
- const int leaf_count = CalculateLeafCount(data.type().get());
- bool is_nullable = false;
- // The row_group_writer hasn't been advanced yet, so add 1 to the current
- // column to get the one this instance will start writing to.
- int column_index = row_group_writer->current_column() + 1;
- for (int leaf_offset = 0; leaf_offset < leaf_count; ++leaf_offset) {
- const SchemaField* schema_field = nullptr;
- RETURN_NOT_OK(
- schema_manifest.GetColumnField(column_index + leaf_offset, &schema_field));
- bool nullable_root = HasNullableRoot(schema_manifest, schema_field);
- if (leaf_offset == 0) {
- is_nullable = nullable_root;
- }
-
-// Don't validate common ancestry for all leaves if not in debug.
-#ifdef NDEBUG
- break;
-#else
- if (is_nullable != nullable_root) {
- return Status::UnknownError(
- "Unexpected mismatched nullability between column index ",
- column_index + leaf_offset, " and ", column_index);
- }
-#endif
- }
- while (values_written < size) {
- const Array& chunk = *data.chunk(chunk_index);
- const int64_t available_values = chunk.length() - chunk_offset;
- const int64_t chunk_write_size = std::min(size - values_written, available_values);
-
- // The chunk offset here will be 0 except for possibly the first chunk
- // because of the advancing logic above
- std::shared_ptr<Array> array_to_write = chunk.Slice(chunk_offset, chunk_write_size);
-
- if (array_to_write->length() > 0) {
- ARROW_ASSIGN_OR_RAISE(std::unique_ptr<MultipathLevelBuilder> builder,
- MultipathLevelBuilder::Make(*array_to_write, is_nullable));
- if (leaf_count != builder->GetLeafCount()) {
- return Status::UnknownError("data type leaf_count != builder_leaf_count",
- leaf_count, " ", builder->GetLeafCount());
- }
- builders.emplace_back(std::move(builder));
- }
-
- if (chunk_write_size == available_values) {
- chunk_offset = 0;
- ++chunk_index;
- }
- values_written += chunk_write_size;
- }
- return ::arrow::internal::make_unique<ArrowColumnWriterV2>(
- std::move(builders), leaf_count, row_group_writer);
- }
-
- private:
- // One builder per column-chunk.
- std::vector<std::unique_ptr<MultipathLevelBuilder>> level_builders_;
- int leaf_count_;
- RowGroupWriter* row_group_writer_;
-};
-
-} // namespace
-
-// ----------------------------------------------------------------------
-// FileWriter implementation
-
-class FileWriterImpl : public FileWriter {
- public:
- FileWriterImpl(std::shared_ptr<::arrow::Schema> schema, MemoryPool* pool,
- std::unique_ptr<ParquetFileWriter> writer,
- std::shared_ptr<ArrowWriterProperties> arrow_properties)
- : schema_(std::move(schema)),
- writer_(std::move(writer)),
- row_group_writer_(nullptr),
- column_write_context_(pool, arrow_properties.get()),
- arrow_properties_(std::move(arrow_properties)),
- closed_(false) {}
-
- Status Init() {
- return SchemaManifest::Make(writer_->schema(), /*schema_metadata=*/nullptr,
- default_arrow_reader_properties(), &schema_manifest_);
- }
-
- Status NewRowGroup(int64_t chunk_size) override {
- if (row_group_writer_ != nullptr) {
- PARQUET_CATCH_NOT_OK(row_group_writer_->Close());
- }
- PARQUET_CATCH_NOT_OK(row_group_writer_ = writer_->AppendRowGroup());
- return Status::OK();
- }
-
- Status Close() override {
- if (!closed_) {
- // Make idempotent
- closed_ = true;
- if (row_group_writer_ != nullptr) {
- PARQUET_CATCH_NOT_OK(row_group_writer_->Close());
- }
- PARQUET_CATCH_NOT_OK(writer_->Close());
- }
- return Status::OK();
- }
-
- Status WriteColumnChunk(const Array& data) override {
- // A bit awkward here since cannot instantiate ChunkedArray from const Array&
- auto chunk = ::arrow::MakeArray(data.data());
- auto chunked_array = std::make_shared<::arrow::ChunkedArray>(chunk);
- return WriteColumnChunk(chunked_array, 0, data.length());
- }
-
- Status WriteColumnChunk(const std::shared_ptr<ChunkedArray>& data, int64_t offset,
- int64_t size) override {
- if (arrow_properties_->engine_version() == ArrowWriterProperties::V2 ||
- arrow_properties_->engine_version() == ArrowWriterProperties::V1) {
- ARROW_ASSIGN_OR_RAISE(
- std::unique_ptr<ArrowColumnWriterV2> writer,
- ArrowColumnWriterV2::Make(*data, offset, size, schema_manifest_,
- row_group_writer_));
- return writer->Write(&column_write_context_);
- }
- return Status::NotImplemented("Unknown engine version.");
- }
-
- Status WriteColumnChunk(const std::shared_ptr<::arrow::ChunkedArray>& data) override {
- return WriteColumnChunk(data, 0, data->length());
- }
-
- std::shared_ptr<::arrow::Schema> schema() const override { return schema_; }
-
- Status WriteTable(const Table& table, int64_t chunk_size) override {
- RETURN_NOT_OK(table.Validate());
-
- if (chunk_size <= 0 && table.num_rows() > 0) {
- return Status::Invalid("chunk size per row_group must be greater than 0");
- } else if (!table.schema()->Equals(*schema_, false)) {
- return Status::Invalid("table schema does not match this writer's. table:'",
- table.schema()->ToString(), "' this:'", schema_->ToString(),
- "'");
- } else if (chunk_size > this->properties().max_row_group_length()) {
- chunk_size = this->properties().max_row_group_length();
- }
-
- auto WriteRowGroup = [&](int64_t offset, int64_t size) {
- RETURN_NOT_OK(NewRowGroup(size));
- for (int i = 0; i < table.num_columns(); i++) {
- RETURN_NOT_OK(WriteColumnChunk(table.column(i), offset, size));
- }
- return Status::OK();
- };
-
- if (table.num_rows() == 0) {
- // Append a row group with 0 rows
- RETURN_NOT_OK_ELSE(WriteRowGroup(0, 0), PARQUET_IGNORE_NOT_OK(Close()));
- return Status::OK();
- }
-
- for (int chunk = 0; chunk * chunk_size < table.num_rows(); chunk++) {
- int64_t offset = chunk * chunk_size;
- RETURN_NOT_OK_ELSE(
- WriteRowGroup(offset, std::min(chunk_size, table.num_rows() - offset)),
- PARQUET_IGNORE_NOT_OK(Close()));
- }
- return Status::OK();
- }
-
- const WriterProperties& properties() const { return *writer_->properties(); }
-
- ::arrow::MemoryPool* memory_pool() const override {
- return column_write_context_.memory_pool;
- }
-
- const std::shared_ptr<FileMetaData> metadata() const override {
- return writer_->metadata();
- }
-
- private:
- friend class FileWriter;
-
- std::shared_ptr<::arrow::Schema> schema_;
-
- SchemaManifest schema_manifest_;
-
- std::unique_ptr<ParquetFileWriter> writer_;
- RowGroupWriter* row_group_writer_;
- ArrowWriteContext column_write_context_;
- std::shared_ptr<ArrowWriterProperties> arrow_properties_;
- bool closed_;
-};
-
-FileWriter::~FileWriter() {}
-
-Status FileWriter::Make(::arrow::MemoryPool* pool,
- std::unique_ptr<ParquetFileWriter> writer,
- std::shared_ptr<::arrow::Schema> schema,
- std::shared_ptr<ArrowWriterProperties> arrow_properties,
- std::unique_ptr<FileWriter>* out) {
- std::unique_ptr<FileWriterImpl> impl(new FileWriterImpl(
- std::move(schema), pool, std::move(writer), std::move(arrow_properties)));
- RETURN_NOT_OK(impl->Init());
- *out = std::move(impl);
- return Status::OK();
-}
-
-Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
- std::shared_ptr<::arrow::io::OutputStream> sink,
- std::shared_ptr<WriterProperties> properties,
- std::unique_ptr<FileWriter>* writer) {
- return Open(std::move(schema), pool, std::move(sink), std::move(properties),
- default_arrow_writer_properties(), writer);
-}
-
-Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
- const ArrowWriterProperties& properties,
- std::shared_ptr<const KeyValueMetadata>* out) {
- if (!properties.store_schema()) {
- *out = nullptr;
- return Status::OK();
- }
-
- static const std::string kArrowSchemaKey = "ARROW:schema";
- std::shared_ptr<KeyValueMetadata> result;
- if (schema.metadata()) {
- result = schema.metadata()->Copy();
- } else {
- result = ::arrow::key_value_metadata({}, {});
- }
-
- ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> serialized,
- ::arrow::ipc::SerializeSchema(schema, pool));
-
- // The serialized schema is not UTF-8, which is required for Thrift
- std::string schema_as_string = serialized->ToString();
- std::string schema_base64 = ::arrow::util::base64_encode(
- reinterpret_cast<const unsigned char*>(schema_as_string.data()),
- static_cast<unsigned int>(schema_as_string.size()));
- result->Append(kArrowSchemaKey, schema_base64);
- *out = result;
- return Status::OK();
-}
-
-Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
- std::shared_ptr<::arrow::io::OutputStream> sink,
- std::shared_ptr<WriterProperties> properties,
- std::shared_ptr<ArrowWriterProperties> arrow_properties,
- std::unique_ptr<FileWriter>* writer) {
- std::shared_ptr<SchemaDescriptor> parquet_schema;
- RETURN_NOT_OK(
- ToParquetSchema(&schema, *properties, *arrow_properties, &parquet_schema));
-
- auto schema_node = std::static_pointer_cast<GroupNode>(parquet_schema->schema_root());
-
- std::shared_ptr<const KeyValueMetadata> metadata;
- RETURN_NOT_OK(GetSchemaMetadata(schema, pool, *arrow_properties, &metadata));
-
- std::unique_ptr<ParquetFileWriter> base_writer;
- PARQUET_CATCH_NOT_OK(base_writer = ParquetFileWriter::Open(std::move(sink), schema_node,
- std::move(properties),
- std::move(metadata)));
-
- auto schema_ptr = std::make_shared<::arrow::Schema>(schema);
- return Make(pool, std::move(base_writer), std::move(schema_ptr),
- std::move(arrow_properties), writer);
-}
-
-Status WriteFileMetaData(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink) {
- PARQUET_CATCH_NOT_OK(::parquet::WriteFileMetaData(file_metadata, sink));
- return Status::OK();
-}
-
-Status WriteMetaDataFile(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink) {
- PARQUET_CATCH_NOT_OK(::parquet::WriteMetaDataFile(file_metadata, sink));
- return Status::OK();
-}
-
-Status WriteTable(const ::arrow::Table& table, ::arrow::MemoryPool* pool,
- std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size,
- std::shared_ptr<WriterProperties> properties,
- std::shared_ptr<ArrowWriterProperties> arrow_properties) {
- std::unique_ptr<FileWriter> writer;
- RETURN_NOT_OK(FileWriter::Open(*table.schema(), pool, std::move(sink),
- std::move(properties), std::move(arrow_properties),
- &writer));
- RETURN_NOT_OK(writer->WriteTable(table, chunk_size));
- return writer->Close();
-}
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/arrow/writer.h"
+
+#include <algorithm>
+#include <deque>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/extension_type.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/util/base64.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/make_unique.h"
+#include "arrow/visitor_inline.h"
+
+#include "parquet/arrow/path_internal.h"
+#include "parquet/arrow/reader_internal.h"
+#include "parquet/arrow/schema.h"
+#include "parquet/column_writer.h"
+#include "parquet/exception.h"
+#include "parquet/file_writer.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+using arrow::Array;
+using arrow::BinaryArray;
+using arrow::BooleanArray;
+using arrow::ChunkedArray;
+using arrow::DataType;
+using arrow::DictionaryArray;
+using arrow::ExtensionArray;
+using arrow::ExtensionType;
+using arrow::Field;
+using arrow::FixedSizeBinaryArray;
+using arrow::ListArray;
+using arrow::MemoryPool;
+using arrow::NumericArray;
+using arrow::PrimitiveArray;
+using arrow::ResizableBuffer;
+using arrow::Status;
+using arrow::Table;
+using arrow::TimeUnit;
+
+using arrow::internal::checked_cast;
+
+using parquet::ParquetFileWriter;
+using parquet::ParquetVersion;
+using parquet::schema::GroupNode;
+
+namespace parquet {
+namespace arrow {
+
+namespace {
+
+int CalculateLeafCount(const DataType* type) {
+ if (type->id() == ::arrow::Type::EXTENSION) {
+ type = checked_cast<const ExtensionType&>(*type).storage_type().get();
+ }
+ // Note num_fields() can be 0 for an empty struct type
+ if (!::arrow::is_nested(type->id())) {
+ // Primitive type.
+ return 1;
+ }
+
+ int num_leaves = 0;
+ for (const auto& field : type->fields()) {
+ num_leaves += CalculateLeafCount(field->type().get());
+ }
+ return num_leaves;
+}
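+
+// For example (sketch): struct<a: int32, b: list<item: utf8>> contributes two
+// leaf Parquet columns (`a` and `b.list.item`), so CalculateLeafCount returns 2.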
+
+// Determines if the |schema_field|'s root ancestor is nullable.
+bool HasNullableRoot(const SchemaManifest& schema_manifest,
+ const SchemaField* schema_field) {
+ DCHECK(schema_field != nullptr);
+ const SchemaField* current_field = schema_field;
+ bool nullable = schema_field->field->nullable();
+ while (current_field != nullptr) {
+ nullable = current_field->field->nullable();
+ current_field = schema_manifest.GetParent(current_field);
+ }
+ return nullable;
+}
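+
+// Example (sketch): for a leaf `c` under `a: struct<b: struct<c: int32 not null>>`
+// where only `a` is nullable, the walk above terminates at `a`, so
+// HasNullableRoot returns true even though the leaf itself is non-nullable.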
+
+// Manages writing nested parquet columns with support for all nested types
+// supported by parquet.
+class ArrowColumnWriterV2 {
+ public:
+ // Constructs a new object (use the Make() method below to construct from
+ // a ChunkedArray).
+ // level_builders should contain one MultipathLevelBuilder per chunk of the
+ // Arrow-column to write.
+ ArrowColumnWriterV2(std::vector<std::unique_ptr<MultipathLevelBuilder>> level_builders,
+ int leaf_count, RowGroupWriter* row_group_writer)
+ : level_builders_(std::move(level_builders)),
+ leaf_count_(leaf_count),
+ row_group_writer_(row_group_writer) {}
+
+ // Writes out all leaf parquet columns to the RowGroupWriter that this
+ // object was constructed with. Each leaf column is written fully before
+ // the next column is written (i.e. no buffering is assumed).
+ //
+ // Columns are written in DFS order.
+ Status Write(ArrowWriteContext* ctx) {
+ for (int leaf_idx = 0; leaf_idx < leaf_count_; leaf_idx++) {
+ ColumnWriter* column_writer;
+ PARQUET_CATCH_NOT_OK(column_writer = row_group_writer_->NextColumn());
+ for (auto& level_builder : level_builders_) {
+ RETURN_NOT_OK(level_builder->Write(
+ leaf_idx, ctx, [&](const MultipathLevelBuilderResult& result) {
+ size_t visited_component_size = result.post_list_visited_elements.size();
+ DCHECK_GT(visited_component_size, 0);
+ if (visited_component_size != 1) {
+ return Status::NotImplemented(
+ "Lists with non-zero length null components are not supported");
+ }
+ const ElementRange& range = result.post_list_visited_elements[0];
+ std::shared_ptr<Array> values_array =
+ result.leaf_array->Slice(range.start, range.Size());
+
+ return column_writer->WriteArrow(result.def_levels, result.rep_levels,
+ result.def_rep_level_count, *values_array,
+ ctx, result.leaf_is_nullable);
+ }));
+ }
+
+ PARQUET_CATCH_NOT_OK(column_writer->Close());
+ }
+ return Status::OK();
+ }
+
+ // Make a new object by converting each chunk in |data| to a MultipathLevelBuilder.
+ //
+  // It is necessary to create a new builder per array because the
+  // MultipathLevelBuilder extracts the data necessary for writing each leaf
+  // column at construction time (it optimizes based on null count), and
+  // slicing via |offset| creates ephemeral chunks that need to be tracked
+  // across each leaf-column write.
+ // This decision could potentially be revisited if we wanted to use "buffered"
+ // RowGroupWriters (we could construct each builder on demand in that case).
+ static ::arrow::Result<std::unique_ptr<ArrowColumnWriterV2>> Make(
+ const ChunkedArray& data, int64_t offset, const int64_t size,
+ const SchemaManifest& schema_manifest, RowGroupWriter* row_group_writer) {
+ int64_t absolute_position = 0;
+ int chunk_index = 0;
+ int64_t chunk_offset = 0;
+ if (data.length() == 0) {
+ return ::arrow::internal::make_unique<ArrowColumnWriterV2>(
+ std::vector<std::unique_ptr<MultipathLevelBuilder>>{},
+ CalculateLeafCount(data.type().get()), row_group_writer);
+ }
+ while (chunk_index < data.num_chunks() && absolute_position < offset) {
+ const int64_t chunk_length = data.chunk(chunk_index)->length();
+ if (absolute_position + chunk_length > offset) {
+ // Relative offset into the chunk to reach the desired start offset for
+ // writing
+ chunk_offset = offset - absolute_position;
+ break;
+ } else {
+ ++chunk_index;
+ absolute_position += chunk_length;
+ }
+ }
+
+ if (absolute_position >= data.length()) {
+ return Status::Invalid("Cannot write data at offset past end of chunked array");
+ }
+
+ int64_t values_written = 0;
+ std::vector<std::unique_ptr<MultipathLevelBuilder>> builders;
+ const int leaf_count = CalculateLeafCount(data.type().get());
+ bool is_nullable = false;
+    // The row_group_writer hasn't been advanced yet, so add 1 to the current
+    // column to get the one this instance will start writing to.
+ int column_index = row_group_writer->current_column() + 1;
+ for (int leaf_offset = 0; leaf_offset < leaf_count; ++leaf_offset) {
+ const SchemaField* schema_field = nullptr;
+ RETURN_NOT_OK(
+ schema_manifest.GetColumnField(column_index + leaf_offset, &schema_field));
+ bool nullable_root = HasNullableRoot(schema_manifest, schema_field);
+ if (leaf_offset == 0) {
+ is_nullable = nullable_root;
+ }
+
+// Don't validate common ancestry for all leaves if not in debug.
+#ifdef NDEBUG
+      break;
+#else
+      if (is_nullable != nullable_root) {
+        return Status::UnknownError(
+            "Unexpected mismatched nullability between column index ",
+            column_index + leaf_offset, " and ", column_index);
+      }
+#endif
+ }
+ while (values_written < size) {
+ const Array& chunk = *data.chunk(chunk_index);
+ const int64_t available_values = chunk.length() - chunk_offset;
+ const int64_t chunk_write_size = std::min(size - values_written, available_values);
+
+      // The chunk offset here will be 0 except possibly for the first chunk,
+      // because of the advancing logic above.
+ std::shared_ptr<Array> array_to_write = chunk.Slice(chunk_offset, chunk_write_size);
+
+ if (array_to_write->length() > 0) {
+ ARROW_ASSIGN_OR_RAISE(std::unique_ptr<MultipathLevelBuilder> builder,
+ MultipathLevelBuilder::Make(*array_to_write, is_nullable));
+ if (leaf_count != builder->GetLeafCount()) {
+ return Status::UnknownError("data type leaf_count != builder_leaf_count",
+ leaf_count, " ", builder->GetLeafCount());
+ }
+ builders.emplace_back(std::move(builder));
+ }
+
+ if (chunk_write_size == available_values) {
+ chunk_offset = 0;
+ ++chunk_index;
+ }
+ values_written += chunk_write_size;
+ }
+ return ::arrow::internal::make_unique<ArrowColumnWriterV2>(
+ std::move(builders), leaf_count, row_group_writer);
+ }
+
+ private:
+ // One builder per column-chunk.
+ std::vector<std::unique_ptr<MultipathLevelBuilder>> level_builders_;
+ int leaf_count_;
+ RowGroupWriter* row_group_writer_;
+};
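+
+// Usage sketch (illustrative), mirroring how FileWriterImpl drives this class:
+//
+//   ARROW_ASSIGN_OR_RAISE(
+//       std::unique_ptr<ArrowColumnWriterV2> writer,
+//       ArrowColumnWriterV2::Make(*chunked_array, /*offset=*/0,
+//                                 chunked_array->length(), schema_manifest,
+//                                 row_group_writer));
+//   RETURN_NOT_OK(writer->Write(&write_context));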
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// FileWriter implementation
+
+class FileWriterImpl : public FileWriter {
+ public:
+ FileWriterImpl(std::shared_ptr<::arrow::Schema> schema, MemoryPool* pool,
+ std::unique_ptr<ParquetFileWriter> writer,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties)
+ : schema_(std::move(schema)),
+ writer_(std::move(writer)),
+ row_group_writer_(nullptr),
+ column_write_context_(pool, arrow_properties.get()),
+ arrow_properties_(std::move(arrow_properties)),
+ closed_(false) {}
+
+ Status Init() {
+ return SchemaManifest::Make(writer_->schema(), /*schema_metadata=*/nullptr,
+ default_arrow_reader_properties(), &schema_manifest_);
+ }
+
+ Status NewRowGroup(int64_t chunk_size) override {
+ if (row_group_writer_ != nullptr) {
+ PARQUET_CATCH_NOT_OK(row_group_writer_->Close());
+ }
+ PARQUET_CATCH_NOT_OK(row_group_writer_ = writer_->AppendRowGroup());
+ return Status::OK();
+ }
+
+ Status Close() override {
+ if (!closed_) {
+ // Make idempotent
+ closed_ = true;
+ if (row_group_writer_ != nullptr) {
+ PARQUET_CATCH_NOT_OK(row_group_writer_->Close());
+ }
+ PARQUET_CATCH_NOT_OK(writer_->Close());
+ }
+ return Status::OK();
+ }
+
+ Status WriteColumnChunk(const Array& data) override {
+    // A bit awkward here since we cannot instantiate a ChunkedArray from a
+    // const Array&.
+ auto chunk = ::arrow::MakeArray(data.data());
+ auto chunked_array = std::make_shared<::arrow::ChunkedArray>(chunk);
+ return WriteColumnChunk(chunked_array, 0, data.length());
+ }
+
+ Status WriteColumnChunk(const std::shared_ptr<ChunkedArray>& data, int64_t offset,
+ int64_t size) override {
+ if (arrow_properties_->engine_version() == ArrowWriterProperties::V2 ||
+ arrow_properties_->engine_version() == ArrowWriterProperties::V1) {
+ ARROW_ASSIGN_OR_RAISE(
+ std::unique_ptr<ArrowColumnWriterV2> writer,
+ ArrowColumnWriterV2::Make(*data, offset, size, schema_manifest_,
+ row_group_writer_));
+ return writer->Write(&column_write_context_);
+ }
+ return Status::NotImplemented("Unknown engine version.");
+ }
+
+ Status WriteColumnChunk(const std::shared_ptr<::arrow::ChunkedArray>& data) override {
+ return WriteColumnChunk(data, 0, data->length());
+ }
+
+ std::shared_ptr<::arrow::Schema> schema() const override { return schema_; }
+
+ Status WriteTable(const Table& table, int64_t chunk_size) override {
+ RETURN_NOT_OK(table.Validate());
+
+ if (chunk_size <= 0 && table.num_rows() > 0) {
+ return Status::Invalid("chunk size per row_group must be greater than 0");
+ } else if (!table.schema()->Equals(*schema_, false)) {
+ return Status::Invalid("table schema does not match this writer's. table:'",
+ table.schema()->ToString(), "' this:'", schema_->ToString(),
+ "'");
+ } else if (chunk_size > this->properties().max_row_group_length()) {
+ chunk_size = this->properties().max_row_group_length();
+ }
+
+ auto WriteRowGroup = [&](int64_t offset, int64_t size) {
+ RETURN_NOT_OK(NewRowGroup(size));
+ for (int i = 0; i < table.num_columns(); i++) {
+ RETURN_NOT_OK(WriteColumnChunk(table.column(i), offset, size));
+ }
+ return Status::OK();
+ };
+
+ if (table.num_rows() == 0) {
+ // Append a row group with 0 rows
+ RETURN_NOT_OK_ELSE(WriteRowGroup(0, 0), PARQUET_IGNORE_NOT_OK(Close()));
+ return Status::OK();
+ }
+
+ for (int chunk = 0; chunk * chunk_size < table.num_rows(); chunk++) {
+ int64_t offset = chunk * chunk_size;
+ RETURN_NOT_OK_ELSE(
+ WriteRowGroup(offset, std::min(chunk_size, table.num_rows() - offset)),
+ PARQUET_IGNORE_NOT_OK(Close()));
+ }
+ return Status::OK();
+ }
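+
+  // Example (illustrative): with chunk_size = 4096, a 10000-row table is
+  // written as three row groups of 4096, 4096 and 1808 rows.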
+
+ const WriterProperties& properties() const { return *writer_->properties(); }
+
+ ::arrow::MemoryPool* memory_pool() const override {
+ return column_write_context_.memory_pool;
+ }
+
+ const std::shared_ptr<FileMetaData> metadata() const override {
+ return writer_->metadata();
+ }
+
+ private:
+ friend class FileWriter;
+
+ std::shared_ptr<::arrow::Schema> schema_;
+
+ SchemaManifest schema_manifest_;
+
+ std::unique_ptr<ParquetFileWriter> writer_;
+ RowGroupWriter* row_group_writer_;
+ ArrowWriteContext column_write_context_;
+ std::shared_ptr<ArrowWriterProperties> arrow_properties_;
+ bool closed_;
+};
+
+FileWriter::~FileWriter() {}
+
+Status FileWriter::Make(::arrow::MemoryPool* pool,
+ std::unique_ptr<ParquetFileWriter> writer,
+ std::shared_ptr<::arrow::Schema> schema,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties,
+ std::unique_ptr<FileWriter>* out) {
+ std::unique_ptr<FileWriterImpl> impl(new FileWriterImpl(
+ std::move(schema), pool, std::move(writer), std::move(arrow_properties)));
+ RETURN_NOT_OK(impl->Init());
+ *out = std::move(impl);
+ return Status::OK();
+}
+
+Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<WriterProperties> properties,
+ std::unique_ptr<FileWriter>* writer) {
+ return Open(std::move(schema), pool, std::move(sink), std::move(properties),
+ default_arrow_writer_properties(), writer);
+}
+
+Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
+ const ArrowWriterProperties& properties,
+ std::shared_ptr<const KeyValueMetadata>* out) {
+ if (!properties.store_schema()) {
+ *out = nullptr;
+ return Status::OK();
+ }
+
+ static const std::string kArrowSchemaKey = "ARROW:schema";
+ std::shared_ptr<KeyValueMetadata> result;
+ if (schema.metadata()) {
+ result = schema.metadata()->Copy();
+ } else {
+ result = ::arrow::key_value_metadata({}, {});
+ }
+
+ ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> serialized,
+ ::arrow::ipc::SerializeSchema(schema, pool));
+
+ // The serialized schema is not UTF-8, which is required for Thrift
+ std::string schema_as_string = serialized->ToString();
+ std::string schema_base64 = ::arrow::util::base64_encode(
+ reinterpret_cast<const unsigned char*>(schema_as_string.data()),
+ static_cast<unsigned int>(schema_as_string.size()));
+ result->Append(kArrowSchemaKey, schema_base64);
+ *out = result;
+ return Status::OK();
+}
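+
+// Note: a reader that finds the "ARROW:schema" key can base64-decode the value
+// and recover the original Arrow schema from the IPC payload (e.g. with
+// ::arrow::ipc::ReadSchema); this is how store_schema() round-trips schemas.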
+
+Status FileWriter::Open(const ::arrow::Schema& schema, ::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties,
+ std::unique_ptr<FileWriter>* writer) {
+ std::shared_ptr<SchemaDescriptor> parquet_schema;
+ RETURN_NOT_OK(
+ ToParquetSchema(&schema, *properties, *arrow_properties, &parquet_schema));
+
+ auto schema_node = std::static_pointer_cast<GroupNode>(parquet_schema->schema_root());
+
+ std::shared_ptr<const KeyValueMetadata> metadata;
+ RETURN_NOT_OK(GetSchemaMetadata(schema, pool, *arrow_properties, &metadata));
+
+ std::unique_ptr<ParquetFileWriter> base_writer;
+ PARQUET_CATCH_NOT_OK(base_writer = ParquetFileWriter::Open(std::move(sink), schema_node,
+ std::move(properties),
+ std::move(metadata)));
+
+ auto schema_ptr = std::make_shared<::arrow::Schema>(schema);
+ return Make(pool, std::move(base_writer), std::move(schema_ptr),
+ std::move(arrow_properties), writer);
+}
+
+Status WriteFileMetaData(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink) {
+ PARQUET_CATCH_NOT_OK(::parquet::WriteFileMetaData(file_metadata, sink));
+ return Status::OK();
+}
+
+Status WriteMetaDataFile(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink) {
+ PARQUET_CATCH_NOT_OK(::parquet::WriteMetaDataFile(file_metadata, sink));
+ return Status::OK();
+}
+
+Status WriteTable(const ::arrow::Table& table, ::arrow::MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties) {
+ std::unique_ptr<FileWriter> writer;
+ RETURN_NOT_OK(FileWriter::Open(*table.schema(), pool, std::move(sink),
+ std::move(properties), std::move(arrow_properties),
+ &writer));
+ RETURN_NOT_OK(writer->WriteTable(table, chunk_size));
+ return writer->Close();
+}
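+
+// Usage sketch (illustrative; assumes a table and an already-opened sink):
+//
+//   std::shared_ptr<::arrow::io::FileOutputStream> sink;
+//   PARQUET_ASSIGN_OR_THROW(
+//       sink, ::arrow::io::FileOutputStream::Open("/tmp/out.parquet"));
+//   PARQUET_THROW_NOT_OK(WriteTable(*table, ::arrow::default_memory_pool(),
+//                                   sink, /*chunk_size=*/65536));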
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h
index 43c5ede1ab5..f31f3d03def 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/arrow/writer.h
@@ -1,109 +1,109 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-
-namespace arrow {
-
-class Array;
-class ChunkedArray;
-class Schema;
-class Table;
-
-} // namespace arrow
-
-namespace parquet {
-
-class FileMetaData;
-class ParquetFileWriter;
-
-namespace arrow {
-
-/// \brief Iterative FileWriter class
-///
-/// Start a new RowGroup or Chunk with NewRowGroup.
-/// Write column-by-column the whole column chunk.
-///
-/// If PARQUET:field_id is present as a metadata key on a field, and the corresponding
-/// value is a nonnegative integer, then it will be used as the field_id in the parquet
-/// file.
-class PARQUET_EXPORT FileWriter {
- public:
- static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr<ParquetFileWriter> writer,
- std::shared_ptr<::arrow::Schema> schema,
- std::shared_ptr<ArrowWriterProperties> arrow_properties,
- std::unique_ptr<FileWriter>* out);
-
- static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
- std::shared_ptr<::arrow::io::OutputStream> sink,
- std::shared_ptr<WriterProperties> properties,
- std::unique_ptr<FileWriter>* writer);
-
- static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
- std::shared_ptr<::arrow::io::OutputStream> sink,
- std::shared_ptr<WriterProperties> properties,
- std::shared_ptr<ArrowWriterProperties> arrow_properties,
- std::unique_ptr<FileWriter>* writer);
-
- virtual std::shared_ptr<::arrow::Schema> schema() const = 0;
-
- /// \brief Write a Table to Parquet.
- virtual ::arrow::Status WriteTable(const ::arrow::Table& table, int64_t chunk_size) = 0;
-
- virtual ::arrow::Status NewRowGroup(int64_t chunk_size) = 0;
- virtual ::arrow::Status WriteColumnChunk(const ::arrow::Array& data) = 0;
-
- /// \brief Write ColumnChunk in row group using slice of a ChunkedArray
- virtual ::arrow::Status WriteColumnChunk(
- const std::shared_ptr<::arrow::ChunkedArray>& data, int64_t offset,
- int64_t size) = 0;
-
- virtual ::arrow::Status WriteColumnChunk(
- const std::shared_ptr<::arrow::ChunkedArray>& data) = 0;
- virtual ::arrow::Status Close() = 0;
- virtual ~FileWriter();
-
- virtual MemoryPool* memory_pool() const = 0;
- virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
-};
-
-/// \brief Write Parquet file metadata only to indicated Arrow OutputStream
-PARQUET_EXPORT
-::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink);
-
-/// \brief Write metadata-only Parquet file to indicated Arrow OutputStream
-PARQUET_EXPORT
-::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink);
-
-/// \brief Write a Table to Parquet.
-::arrow::Status PARQUET_EXPORT
-WriteTable(const ::arrow::Table& table, MemoryPool* pool,
- std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size,
- std::shared_ptr<WriterProperties> properties = default_writer_properties(),
- std::shared_ptr<ArrowWriterProperties> arrow_properties =
- default_arrow_writer_properties());
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class Schema;
+class Table;
+
+} // namespace arrow
+
+namespace parquet {
+
+class FileMetaData;
+class ParquetFileWriter;
+
+namespace arrow {
+
+/// \brief Iterative FileWriter class
+///
+/// Start a new row group (or chunk) with NewRowGroup, then write each column
+/// chunk in full, one column at a time.
+///
+/// If PARQUET:field_id is present as a metadata key on a field, and the corresponding
+/// value is a nonnegative integer, then it will be used as the field_id in the parquet
+/// file.
+class PARQUET_EXPORT FileWriter {
+ public:
+ static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr<ParquetFileWriter> writer,
+ std::shared_ptr<::arrow::Schema> schema,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties,
+ std::unique_ptr<FileWriter>* out);
+
+ static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<WriterProperties> properties,
+ std::unique_ptr<FileWriter>* writer);
+
+ static ::arrow::Status Open(const ::arrow::Schema& schema, MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<ArrowWriterProperties> arrow_properties,
+ std::unique_ptr<FileWriter>* writer);
+
+ virtual std::shared_ptr<::arrow::Schema> schema() const = 0;
+
+ /// \brief Write a Table to Parquet.
+ virtual ::arrow::Status WriteTable(const ::arrow::Table& table, int64_t chunk_size) = 0;
+
+ virtual ::arrow::Status NewRowGroup(int64_t chunk_size) = 0;
+ virtual ::arrow::Status WriteColumnChunk(const ::arrow::Array& data) = 0;
+
+ /// \brief Write ColumnChunk in row group using slice of a ChunkedArray
+ virtual ::arrow::Status WriteColumnChunk(
+ const std::shared_ptr<::arrow::ChunkedArray>& data, int64_t offset,
+ int64_t size) = 0;
+
+ virtual ::arrow::Status WriteColumnChunk(
+ const std::shared_ptr<::arrow::ChunkedArray>& data) = 0;
+ virtual ::arrow::Status Close() = 0;
+ virtual ~FileWriter();
+
+ virtual MemoryPool* memory_pool() const = 0;
+ virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
+};
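+
+// Usage sketch (illustrative): writing a table one row group at a time.
+//
+//   std::unique_ptr<FileWriter> writer;
+//   RETURN_NOT_OK(FileWriter::Open(*schema, pool, sink, properties,
+//                                  arrow_properties, &writer));
+//   RETURN_NOT_OK(writer->NewRowGroup(table->num_rows()));
+//   for (int i = 0; i < table->num_columns(); ++i) {
+//     RETURN_NOT_OK(writer->WriteColumnChunk(table->column(i)));
+//   }
+//   RETURN_NOT_OK(writer->Close());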
+
+/// \brief Write Parquet file metadata only to indicated Arrow OutputStream
+PARQUET_EXPORT
+::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink);
+
+/// \brief Write metadata-only Parquet file to indicated Arrow OutputStream
+PARQUET_EXPORT
+::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink);
+
+/// \brief Write a Table to Parquet.
+::arrow::Status PARQUET_EXPORT
+WriteTable(const ::arrow::Table& table, MemoryPool* pool,
+ std::shared_ptr<::arrow::io::OutputStream> sink, int64_t chunk_size,
+ std::shared_ptr<WriterProperties> properties = default_writer_properties(),
+ std::shared_ptr<ArrowWriterProperties> arrow_properties =
+ default_arrow_writer_properties());
+
+} // namespace arrow
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc b/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc
index e56449060ef..f6f6d327d06 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.cc
@@ -1,162 +1,162 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <cstdint>
-#include <cstring>
-
-#include "arrow/result.h"
-#include "arrow/util/logging.h"
-#include "parquet/bloom_filter.h"
-#include "parquet/exception.h"
-#include "parquet/murmur3.h"
-
-namespace parquet {
-constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
-
-BlockSplitBloomFilter::BlockSplitBloomFilter()
- : pool_(::arrow::default_memory_pool()),
- hash_strategy_(HashStrategy::MURMUR3_X64_128),
- algorithm_(Algorithm::BLOCK) {}
-
-void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
- if (num_bytes < kMinimumBloomFilterBytes) {
- num_bytes = kMinimumBloomFilterBytes;
- }
-
- // Get next power of 2 if it is not power of 2.
- if ((num_bytes & (num_bytes - 1)) != 0) {
- num_bytes = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bytes));
- }
-
- if (num_bytes > kMaximumBloomFilterBytes) {
- num_bytes = kMaximumBloomFilterBytes;
- }
-
- num_bytes_ = num_bytes;
- PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_));
- memset(data_->mutable_data(), 0, num_bytes_);
-
- this->hasher_.reset(new MurmurHash3());
-}
-
-void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
- DCHECK(bitset != nullptr);
-
- if (num_bytes < kMinimumBloomFilterBytes || num_bytes > kMaximumBloomFilterBytes ||
- (num_bytes & (num_bytes - 1)) != 0) {
- throw ParquetException("Given length of bitset is illegal");
- }
-
- num_bytes_ = num_bytes;
- PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_));
- memcpy(data_->mutable_data(), bitset, num_bytes_);
-
- this->hasher_.reset(new MurmurHash3());
-}
-
-BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(ArrowInputStream* input) {
- uint32_t len, hash, algorithm;
- int64_t bytes_available;
-
- PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &len));
- if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
- throw ParquetException("Failed to deserialize from input stream");
- }
-
- PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &hash));
- if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
- throw ParquetException("Failed to deserialize from input stream");
- }
- if (static_cast<HashStrategy>(hash) != HashStrategy::MURMUR3_X64_128) {
- throw ParquetException("Unsupported hash strategy");
- }
-
- PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &algorithm));
- if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
- throw ParquetException("Failed to deserialize from input stream");
- }
- if (static_cast<Algorithm>(algorithm) != BloomFilter::Algorithm::BLOCK) {
- throw ParquetException("Unsupported Bloom filter algorithm");
- }
-
- BlockSplitBloomFilter bloom_filter;
-
- PARQUET_ASSIGN_OR_THROW(auto buffer, input->Read(len));
- bloom_filter.Init(buffer->data(), len);
- return bloom_filter;
-}
-
-void BlockSplitBloomFilter::WriteTo(ArrowOutputStream* sink) const {
- DCHECK(sink != nullptr);
-
- PARQUET_THROW_NOT_OK(
- sink->Write(reinterpret_cast<const uint8_t*>(&num_bytes_), sizeof(num_bytes_)));
- PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<const uint8_t*>(&hash_strategy_),
- sizeof(hash_strategy_)));
- PARQUET_THROW_NOT_OK(
- sink->Write(reinterpret_cast<const uint8_t*>(&algorithm_), sizeof(algorithm_)));
- PARQUET_THROW_NOT_OK(sink->Write(data_->mutable_data(), num_bytes_));
-}
-
-void BlockSplitBloomFilter::SetMask(uint32_t key, BlockMask& block_mask) const {
- for (int i = 0; i < kBitsSetPerBlock; ++i) {
- block_mask.item[i] = key * SALT[i];
- }
-
- for (int i = 0; i < kBitsSetPerBlock; ++i) {
- block_mask.item[i] = block_mask.item[i] >> 27;
- }
-
- for (int i = 0; i < kBitsSetPerBlock; ++i) {
- block_mask.item[i] = UINT32_C(0x1) << block_mask.item[i];
- }
-}
-
-bool BlockSplitBloomFilter::FindHash(uint64_t hash) const {
- const uint32_t bucket_index =
- static_cast<uint32_t>((hash >> 32) & (num_bytes_ / kBytesPerFilterBlock - 1));
- uint32_t key = static_cast<uint32_t>(hash);
- uint32_t* bitset32 = reinterpret_cast<uint32_t*>(data_->mutable_data());
-
- // Calculate mask for bucket.
- BlockMask block_mask;
- SetMask(key, block_mask);
-
- for (int i = 0; i < kBitsSetPerBlock; ++i) {
- if (0 == (bitset32[kBitsSetPerBlock * bucket_index + i] & block_mask.item[i])) {
- return false;
- }
- }
- return true;
-}
-
-void BlockSplitBloomFilter::InsertHash(uint64_t hash) {
- const uint32_t bucket_index =
- static_cast<uint32_t>(hash >> 32) & (num_bytes_ / kBytesPerFilterBlock - 1);
- uint32_t key = static_cast<uint32_t>(hash);
- uint32_t* bitset32 = reinterpret_cast<uint32_t*>(data_->mutable_data());
-
- // Calculate mask for bucket.
- BlockMask block_mask;
- SetMask(key, block_mask);
-
- for (int i = 0; i < kBitsSetPerBlock; i++) {
- bitset32[bucket_index * kBitsSetPerBlock + i] |= block_mask.item[i];
- }
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/result.h"
+#include "arrow/util/logging.h"
+#include "parquet/bloom_filter.h"
+#include "parquet/exception.h"
+#include "parquet/murmur3.h"
+
+namespace parquet {
+constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
+
+BlockSplitBloomFilter::BlockSplitBloomFilter()
+ : pool_(::arrow::default_memory_pool()),
+ hash_strategy_(HashStrategy::MURMUR3_X64_128),
+ algorithm_(Algorithm::BLOCK) {}
+
+void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
+ if (num_bytes < kMinimumBloomFilterBytes) {
+ num_bytes = kMinimumBloomFilterBytes;
+ }
+
+  // Round up to the next power of 2 if num_bytes is not a power of 2.
+ if ((num_bytes & (num_bytes - 1)) != 0) {
+ num_bytes = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bytes));
+ }
+
+ if (num_bytes > kMaximumBloomFilterBytes) {
+ num_bytes = kMaximumBloomFilterBytes;
+ }
+
+ num_bytes_ = num_bytes;
+ PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_));
+ memset(data_->mutable_data(), 0, num_bytes_);
+
+ this->hasher_.reset(new MurmurHash3());
+}
+
+void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
+ DCHECK(bitset != nullptr);
+
+ if (num_bytes < kMinimumBloomFilterBytes || num_bytes > kMaximumBloomFilterBytes ||
+ (num_bytes & (num_bytes - 1)) != 0) {
+ throw ParquetException("Given length of bitset is illegal");
+ }
+
+ num_bytes_ = num_bytes;
+ PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_));
+ memcpy(data_->mutable_data(), bitset, num_bytes_);
+
+ this->hasher_.reset(new MurmurHash3());
+}
+
+BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(ArrowInputStream* input) {
+ uint32_t len, hash, algorithm;
+ int64_t bytes_available;
+
+ PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &len));
+ if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
+ throw ParquetException("Failed to deserialize from input stream");
+ }
+
+ PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &hash));
+ if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
+ throw ParquetException("Failed to deserialize from input stream");
+ }
+ if (static_cast<HashStrategy>(hash) != HashStrategy::MURMUR3_X64_128) {
+ throw ParquetException("Unsupported hash strategy");
+ }
+
+ PARQUET_ASSIGN_OR_THROW(bytes_available, input->Read(sizeof(uint32_t), &algorithm));
+ if (static_cast<uint32_t>(bytes_available) != sizeof(uint32_t)) {
+ throw ParquetException("Failed to deserialize from input stream");
+ }
+ if (static_cast<Algorithm>(algorithm) != BloomFilter::Algorithm::BLOCK) {
+ throw ParquetException("Unsupported Bloom filter algorithm");
+ }
+
+ BlockSplitBloomFilter bloom_filter;
+
+ PARQUET_ASSIGN_OR_THROW(auto buffer, input->Read(len));
+ bloom_filter.Init(buffer->data(), len);
+ return bloom_filter;
+}
+
+void BlockSplitBloomFilter::WriteTo(ArrowOutputStream* sink) const {
+ DCHECK(sink != nullptr);
+
+ PARQUET_THROW_NOT_OK(
+ sink->Write(reinterpret_cast<const uint8_t*>(&num_bytes_), sizeof(num_bytes_)));
+ PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<const uint8_t*>(&hash_strategy_),
+ sizeof(hash_strategy_)));
+ PARQUET_THROW_NOT_OK(
+ sink->Write(reinterpret_cast<const uint8_t*>(&algorithm_), sizeof(algorithm_)));
+ PARQUET_THROW_NOT_OK(sink->Write(data_->mutable_data(), num_bytes_));
+}
+
+void BlockSplitBloomFilter::SetMask(uint32_t key, BlockMask& block_mask) const {
+ for (int i = 0; i < kBitsSetPerBlock; ++i) {
+ block_mask.item[i] = key * SALT[i];
+ }
+
+ for (int i = 0; i < kBitsSetPerBlock; ++i) {
+ block_mask.item[i] = block_mask.item[i] >> 27;
+ }
+
+ for (int i = 0; i < kBitsSetPerBlock; ++i) {
+ block_mask.item[i] = UINT32_C(0x1) << block_mask.item[i];
+ }
+}
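+
+// Example (illustrative): for each of the eight 32-bit words, key * SALT[i] is
+// reduced to its top five bits (a value in [0, 31]) by the >> 27 shift, and
+// that value selects the single bit set in word i, so each block gets exactly
+// eight candidate bits.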
+
+bool BlockSplitBloomFilter::FindHash(uint64_t hash) const {
+ const uint32_t bucket_index =
+ static_cast<uint32_t>((hash >> 32) & (num_bytes_ / kBytesPerFilterBlock - 1));
+ uint32_t key = static_cast<uint32_t>(hash);
+ uint32_t* bitset32 = reinterpret_cast<uint32_t*>(data_->mutable_data());
+
+ // Calculate mask for bucket.
+ BlockMask block_mask;
+ SetMask(key, block_mask);
+
+ for (int i = 0; i < kBitsSetPerBlock; ++i) {
+ if (0 == (bitset32[kBitsSetPerBlock * bucket_index + i] & block_mask.item[i])) {
+ return false;
+ }
+ }
+ return true;
+}
+
+void BlockSplitBloomFilter::InsertHash(uint64_t hash) {
+ const uint32_t bucket_index =
+ static_cast<uint32_t>(hash >> 32) & (num_bytes_ / kBytesPerFilterBlock - 1);
+ uint32_t key = static_cast<uint32_t>(hash);
+ uint32_t* bitset32 = reinterpret_cast<uint32_t*>(data_->mutable_data());
+
+ // Calculate mask for bucket.
+ BlockMask block_mask;
+ SetMask(key, block_mask);
+
+ for (int i = 0; i < kBitsSetPerBlock; i++) {
+ bitset32[bucket_index * kBitsSetPerBlock + i] |= block_mask.item[i];
+ }
+}
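+
+// Usage sketch (illustrative): insert a value and probe for it.
+//
+//   BlockSplitBloomFilter filter;
+//   filter.Init(BlockSplitBloomFilter::OptimalNumOfBits(/*ndv=*/1000,
+//                                                       /*fpp=*/0.01) / 8);
+//   const uint64_t h = filter.Hash(int64_t{42});
+//   filter.InsertHash(h);
+//   const bool maybe_present = filter.FindHash(h);  // always true: no false
+//                                                   // negatives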
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h b/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h
index 218a1162674..39f9561ae5b 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/bloom_filter.h
@@ -1,247 +1,247 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cmath>
-#include <cstdint>
-#include <memory>
-
-#include "arrow/util/bit_util.h"
-#include "arrow/util/logging.h"
-#include "parquet/hasher.h"
-#include "parquet/platform.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-// A Bloom filter is a compact structure to indicate whether an item is not in a set or
-// probably in a set. The Bloom filter usually consists of a bit set that represents a
-// set of elements, a hash strategy and a Bloom filter algorithm.
-class PARQUET_EXPORT BloomFilter {
- public:
- // Maximum Bloom filter size, it sets to HDFS default block size 128MB
- // This value will be reconsidered when implementing Bloom filter producer.
- static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
-
- /// Determine whether an element exist in set or not.
- ///
- /// @param hash the element to contain.
- /// @return false if value is definitely not in set, and true means PROBABLY
- /// in set.
- virtual bool FindHash(uint64_t hash) const = 0;
-
- /// Insert element to set represented by Bloom filter bitset.
- /// @param hash the hash of value to insert into Bloom filter.
- virtual void InsertHash(uint64_t hash) = 0;
-
- /// Write this Bloom filter to an output stream. A Bloom filter structure should
- /// include bitset length, hash strategy, algorithm, and bitset.
- ///
- /// @param sink the output stream to write
- virtual void WriteTo(ArrowOutputStream* sink) const = 0;
-
- /// Get the number of bytes of bitset
- virtual uint32_t GetBitsetSize() const = 0;
-
- /// Compute hash for 32 bits value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(int32_t value) const = 0;
-
- /// Compute hash for 64 bits value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(int64_t value) const = 0;
-
- /// Compute hash for float value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(float value) const = 0;
-
- /// Compute hash for double value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(double value) const = 0;
-
- /// Compute hash for Int96 value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(const Int96* value) const = 0;
-
- /// Compute hash for ByteArray value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(const ByteArray* value) const = 0;
-
- /// Compute hash for fixed byte array value by using its plain encoding result.
- ///
- /// @param value the value address.
- /// @param len the value length.
- /// @return hash result.
- virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
-
- virtual ~BloomFilter() {}
-
- protected:
- // Hash strategy available for Bloom filter.
- enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 };
-
- // Bloom filter algorithm.
- enum class Algorithm : uint32_t { BLOCK = 0 };
-};
-
-// The BlockSplitBloomFilter is implemented using block-based Bloom filters from
-// Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to
-// hash the item to a tiny Bloom filter which size fit a single cache line or smaller.
-//
-// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
-// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
-class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
- public:
- /// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function.
- BlockSplitBloomFilter();
-
- /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within
- /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
- /// rounded up/down to lower/upper bound if num_bytes is out of range and also
- /// will be rounded up to a power of 2.
- ///
- /// @param num_bytes The number of bytes to store Bloom filter bitset.
- void Init(uint32_t num_bytes);
-
- /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
- /// bitset because the given bitset may not satisfy the 32-byte alignment requirement
- /// which may lead to segfault when performing SIMD instructions. It is the caller's
- /// responsibility to free the bitset passed in. This is used when reconstructing
- /// a Bloom filter from a parquet file.
- ///
- /// @param bitset The given bitset to initialize the Bloom filter.
- /// @param num_bytes The number of bytes of given bitset.
- void Init(const uint8_t* bitset, uint32_t num_bytes);
-
- // Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter.
- static constexpr uint32_t kMinimumBloomFilterBytes = 32;
-
- /// Calculate optimal size according to the number of distinct values and false
- /// positive probability.
- ///
- /// @param ndv The number of distinct values.
- /// @param fpp The false positive probability.
- /// @return it always return a value between kMinimumBloomFilterBytes and
- /// kMaximumBloomFilterBytes, and the return value is always a power of 2
- static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
- DCHECK(fpp > 0.0 && fpp < 1.0);
- const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
- uint32_t num_bits;
-
- // Handle overflow.
- if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
- num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
- } else {
- num_bits = static_cast<uint32_t>(m);
- }
-
- // Round up to lower bound
- if (num_bits < kMinimumBloomFilterBytes << 3) {
- num_bits = kMinimumBloomFilterBytes << 3;
- }
-
- // Get next power of 2 if bits is not power of 2.
- if ((num_bits & (num_bits - 1)) != 0) {
- num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits));
- }
-
- // Round down to upper bound
- if (num_bits > kMaximumBloomFilterBytes << 3) {
- num_bits = kMaximumBloomFilterBytes << 3;
- }
-
- return num_bits;
- }
-
- bool FindHash(uint64_t hash) const override;
- void InsertHash(uint64_t hash) override;
- void WriteTo(ArrowOutputStream* sink) const override;
- uint32_t GetBitsetSize() const override { return num_bytes_; }
-
- uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
- uint64_t Hash(float value) const override { return hasher_->Hash(value); }
- uint64_t Hash(double value) const override { return hasher_->Hash(value); }
- uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
- uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
- uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
- uint64_t Hash(const FLBA* value, uint32_t len) const override {
- return hasher_->Hash(value, len);
- }
-
- /// Deserialize the Bloom filter from an input stream. It is used when reconstructing
- /// a Bloom filter from a parquet filter.
- ///
- /// @param input_stream The input stream from which to construct the Bloom filter
- /// @return The BlockSplitBloomFilter.
- static BlockSplitBloomFilter Deserialize(ArrowInputStream* input_stream);
-
- private:
- // Bytes in a tiny Bloom filter block.
- static constexpr int kBytesPerFilterBlock = 32;
-
- // The number of bits to be set in each tiny Bloom filter
- static constexpr int kBitsSetPerBlock = 8;
-
- // A mask structure used to set bits in each tiny Bloom filter.
- struct BlockMask {
- uint32_t item[kBitsSetPerBlock];
- };
-
- // The block-based algorithm needs eight odd SALT values to calculate eight indexes
- // of bit to set, one bit in each 32-bit word.
- static constexpr uint32_t SALT[kBitsSetPerBlock] = {
- 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
- 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};
-
- /// Set bits in mask array according to input key.
- /// @param key the value to calculate mask values.
- /// @param mask the mask array is used to set inside a block
- void SetMask(uint32_t key, BlockMask& mask) const;
-
- // Memory pool to allocate aligned buffer for bitset
- ::arrow::MemoryPool* pool_;
-
- // The underlying buffer of bitset.
- std::shared_ptr<Buffer> data_;
-
- // The number of bytes of Bloom filter bitset.
- uint32_t num_bytes_;
-
- // Hash strategy used in this Bloom filter.
- HashStrategy hash_strategy_;
-
- // Algorithm used in this Bloom filter.
- Algorithm algorithm_;
-
- // The hash pointer points to actual hash class used.
- std::unique_ptr<Hasher> hasher_;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+#include <memory>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/logging.h"
+#include "parquet/hasher.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+// A Bloom filter is a compact structure to indicate whether an item is not in a set or
+// probably in a set. The Bloom filter usually consists of a bit set that represents a
+// set of elements, a hash strategy and a Bloom filter algorithm.
+class PARQUET_EXPORT BloomFilter {
+ public:
+  // Maximum Bloom filter size; it is set to the HDFS default block size
+  // (128 MB). This value will be reconsidered when implementing a Bloom
+  // filter producer.
+ static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
+
+  /// Determine whether an element exists in the set or not.
+  ///
+  /// @param hash the hash of the element to look up.
+  /// @return false if the value is definitely not in the set; true means the
+  /// value is PROBABLY in the set.
+ virtual bool FindHash(uint64_t hash) const = 0;
+
+  /// Insert an element into the set represented by the Bloom filter bitset.
+  /// @param hash the hash of the value to insert into the Bloom filter.
+ virtual void InsertHash(uint64_t hash) = 0;
+
+  /// Write this Bloom filter to an output stream. A serialized Bloom filter
+  /// structure includes the bitset length, hash strategy, algorithm, and the
+  /// bitset itself.
+  ///
+  /// @param sink the output stream to write to
+ virtual void WriteTo(ArrowOutputStream* sink) const = 0;
+
+  /// Get the number of bytes in the bitset.
+ virtual uint32_t GetBitsetSize() const = 0;
+
+  /// Compute hash for a 32-bit value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(int32_t value) const = 0;
+
+  /// Compute hash for a 64-bit value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(int64_t value) const = 0;
+
+ /// Compute hash for float value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(float value) const = 0;
+
+ /// Compute hash for double value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(double value) const = 0;
+
+ /// Compute hash for Int96 value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(const Int96* value) const = 0;
+
+ /// Compute hash for ByteArray value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(const ByteArray* value) const = 0;
+
+ /// Compute hash for fixed byte array value by using its plain encoding result.
+ ///
+ /// @param value the value address.
+ /// @param len the value length.
+ /// @return hash result.
+ virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
+
+ virtual ~BloomFilter() {}
+
+ protected:
+ // Hash strategy available for Bloom filter.
+ enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 };
+
+ // Bloom filter algorithm.
+ enum class Algorithm : uint32_t { BLOCK = 0 };
+};
+
+// The BlockSplitBloomFilter is implemented using the block-based Bloom filters
+// from Putze et al.'s "Cache-, Hash- and Space-Efficient Bloom Filters". The
+// basic idea is to hash each item to a tiny Bloom filter whose size fits in a
+// single cache line or smaller.
+//
+// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
+// filter is 32 bytes, to take advantage of 32-byte SIMD instructions.
+class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
+ public:
+ /// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function.
+ BlockSplitBloomFilter();
+
+  /// Initialize the BlockSplitBloomFilter. num_bytes should be within
+  /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes]; out-of-range values
+  /// are clamped to those bounds, and the result is rounded up to a power
+  /// of 2.
+  ///
+  /// @param num_bytes The number of bytes used to store the Bloom filter bitset.
+ void Init(uint32_t num_bytes);
+
+ /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
+ /// bitset because the given bitset may not satisfy the 32-byte alignment requirement
+ /// which may lead to segfault when performing SIMD instructions. It is the caller's
+ /// responsibility to free the bitset passed in. This is used when reconstructing
+ /// a Bloom filter from a parquet file.
+ ///
+ /// @param bitset The given bitset to initialize the Bloom filter.
+ /// @param num_bytes The number of bytes of given bitset.
+ void Init(const uint8_t* bitset, uint32_t num_bytes);
+
+  // Minimum Bloom filter size; it is set to 32 bytes to fit a single tiny
+  // Bloom filter.
+ static constexpr uint32_t kMinimumBloomFilterBytes = 32;
+
+ /// Calculate optimal size according to the number of distinct values and false
+ /// positive probability.
+ ///
+ /// @param ndv The number of distinct values.
+ /// @param fpp The false positive probability.
+  /// @return a bit count between kMinimumBloomFilterBytes << 3 and
+  /// kMaximumBloomFilterBytes << 3; the return value is always a power of 2.
+ static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
+ DCHECK(fpp > 0.0 && fpp < 1.0);
+ const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
+ uint32_t num_bits;
+
+ // Handle overflow.
+ if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
+ num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
+ } else {
+ num_bits = static_cast<uint32_t>(m);
+ }
+
+    // Clamp to the lower bound.
+ if (num_bits < kMinimumBloomFilterBytes << 3) {
+ num_bits = kMinimumBloomFilterBytes << 3;
+ }
+
+    // Round up to the next power of 2 if num_bits is not a power of 2.
+ if ((num_bits & (num_bits - 1)) != 0) {
+ num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits));
+ }
+
+    // Clamp to the upper bound.
+ if (num_bits > kMaximumBloomFilterBytes << 3) {
+ num_bits = kMaximumBloomFilterBytes << 3;
+ }
+
+ return num_bits;
+ }
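+
+  // Example (illustrative): fpp = 0.01 gives roughly 9.7 bits per distinct
+  // value before rounding, so ndv = 1,000,000 needs about 9.7M bits, which is
+  // then rounded up to the next power of 2 (16,777,216 bits, i.e. 2 MB).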
+
+ bool FindHash(uint64_t hash) const override;
+ void InsertHash(uint64_t hash) override;
+ void WriteTo(ArrowOutputStream* sink) const override;
+ uint32_t GetBitsetSize() const override { return num_bytes_; }
+
+ uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(float value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(double value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
+ uint64_t Hash(const FLBA* value, uint32_t len) const override {
+ return hasher_->Hash(value, len);
+ }
+
+  /// Deserialize the Bloom filter from an input stream. It is used when
+  /// reconstructing a Bloom filter from a parquet file.
+ ///
+ /// @param input_stream The input stream from which to construct the Bloom filter
+ /// @return The BlockSplitBloomFilter.
+ static BlockSplitBloomFilter Deserialize(ArrowInputStream* input_stream);
+
+ private:
+ // Bytes in a tiny Bloom filter block.
+ static constexpr int kBytesPerFilterBlock = 32;
+
+ // The number of bits to be set in each tiny Bloom filter
+ static constexpr int kBitsSetPerBlock = 8;
+
+ // A mask structure used to set bits in each tiny Bloom filter.
+ struct BlockMask {
+ uint32_t item[kBitsSetPerBlock];
+ };
+
+ // The block-based algorithm needs eight odd SALT values to calculate eight indexes
+ // of bit to set, one bit in each 32-bit word.
+ static constexpr uint32_t SALT[kBitsSetPerBlock] = {
+ 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
+ 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};
+
+ /// Set bits in mask array according to input key.
+ /// @param key the value to calculate mask values.
+ /// @param mask the mask array is used to set inside a block
+ void SetMask(uint32_t key, BlockMask& mask) const;
+
+ // Memory pool to allocate aligned buffer for bitset
+ ::arrow::MemoryPool* pool_;
+
+ // The underlying buffer of bitset.
+ std::shared_ptr<Buffer> data_;
+
+ // The number of bytes of Bloom filter bitset.
+ uint32_t num_bytes_;
+
+ // Hash strategy used in this Bloom filter.
+ HashStrategy hash_strategy_;
+
+ // Algorithm used in this Bloom filter.
+ Algorithm algorithm_;
+
+ // The hash pointer points to actual hash class used.
+ std::unique_ptr<Hasher> hasher_;
+};
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_page.h b/contrib/libs/apache/arrow/cpp/src/parquet/column_page.h
index 242f16b2e67..2fab77ed01a 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_page.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_page.h
@@ -1,160 +1,160 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// This module defines an abstract interface for iterating through pages in a
-// Parquet column chunk within a row group. It could be extended in the future
-// to iterate through all data pages in all chunks in a file.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <string>
-
-#include "parquet/statistics.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-// TODO: Parallel processing is not yet safe because of memory-ownership
-// semantics (the PageReader may or may not own the memory referenced by a
-// page)
-//
-// TODO(wesm): In the future Parquet implementations may store the crc code
-// in format::PageHeader. parquet-mr currently does not, so we also skip it
-// here, both on the read and write path
-class Page {
- public:
- Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
- : buffer_(buffer), type_(type) {}
-
- PageType::type type() const { return type_; }
-
- std::shared_ptr<Buffer> buffer() const { return buffer_; }
-
- // @returns: a pointer to the page's data
- const uint8_t* data() const { return buffer_->data(); }
-
- // @returns: the total size in bytes of the page's data buffer
- int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
-
- private:
- std::shared_ptr<Buffer> buffer_;
- PageType::type type_;
-};
-
-/// \brief Base type for DataPageV1 and DataPageV2 including common attributes
-class DataPage : public Page {
- public:
- int32_t num_values() const { return num_values_; }
- Encoding::type encoding() const { return encoding_; }
- int64_t uncompressed_size() const { return uncompressed_size_; }
- const EncodedStatistics& statistics() const { return statistics_; }
-
- virtual ~DataPage() = default;
-
- protected:
- DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
- Encoding::type encoding, int64_t uncompressed_size,
- const EncodedStatistics& statistics = EncodedStatistics())
- : Page(buffer, type),
- num_values_(num_values),
- encoding_(encoding),
- uncompressed_size_(uncompressed_size),
- statistics_(statistics) {}
-
- int32_t num_values_;
- Encoding::type encoding_;
- int64_t uncompressed_size_;
- EncodedStatistics statistics_;
-};
-
-class DataPageV1 : public DataPage {
- public:
- DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
- Encoding::type encoding, Encoding::type definition_level_encoding,
- Encoding::type repetition_level_encoding, int64_t uncompressed_size,
- const EncodedStatistics& statistics = EncodedStatistics())
- : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
- statistics),
- definition_level_encoding_(definition_level_encoding),
- repetition_level_encoding_(repetition_level_encoding) {}
-
- Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
-
- Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
-
- private:
- Encoding::type definition_level_encoding_;
- Encoding::type repetition_level_encoding_;
-};
-
-class DataPageV2 : public DataPage {
- public:
- DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
- int32_t num_rows, Encoding::type encoding,
- int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
- int64_t uncompressed_size, bool is_compressed = false,
- const EncodedStatistics& statistics = EncodedStatistics())
- : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
- statistics),
- num_nulls_(num_nulls),
- num_rows_(num_rows),
- definition_levels_byte_length_(definition_levels_byte_length),
- repetition_levels_byte_length_(repetition_levels_byte_length),
- is_compressed_(is_compressed) {}
-
- int32_t num_nulls() const { return num_nulls_; }
-
- int32_t num_rows() const { return num_rows_; }
-
- int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
-
- int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
-
- bool is_compressed() const { return is_compressed_; }
-
- private:
- int32_t num_nulls_;
- int32_t num_rows_;
- int32_t definition_levels_byte_length_;
- int32_t repetition_levels_byte_length_;
- bool is_compressed_;
-};
-
-class DictionaryPage : public Page {
- public:
- DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
- Encoding::type encoding, bool is_sorted = false)
- : Page(buffer, PageType::DICTIONARY_PAGE),
- num_values_(num_values),
- encoding_(encoding),
- is_sorted_(is_sorted) {}
-
- int32_t num_values() const { return num_values_; }
-
- Encoding::type encoding() const { return encoding_; }
-
- bool is_sorted() const { return is_sorted_; }
-
- private:
- int32_t num_values_;
- Encoding::type encoding_;
- bool is_sorted_;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module defines an abstract interface for iterating through pages in a
+// Parquet column chunk within a row group. It could be extended in the future
+// to iterate through all data pages in all chunks in a file.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "parquet/statistics.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+// TODO: Parallel processing is not yet safe because of memory-ownership
+// semantics (the PageReader may or may not own the memory referenced by a
+// page)
+//
+// TODO(wesm): In the future Parquet implementations may store the crc code
+// in format::PageHeader. parquet-mr currently does not, so we also skip it
+// here, both on the read and write path
+class Page {
+ public:
+ Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
+ : buffer_(buffer), type_(type) {}
+
+ PageType::type type() const { return type_; }
+
+ std::shared_ptr<Buffer> buffer() const { return buffer_; }
+
+ // @returns: a pointer to the page's data
+ const uint8_t* data() const { return buffer_->data(); }
+
+ // @returns: the total size in bytes of the page's data buffer
+ int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
+
+ private:
+ std::shared_ptr<Buffer> buffer_;
+ PageType::type type_;
+};
+
+/// \brief Base type for DataPageV1 and DataPageV2 including common attributes
+class DataPage : public Page {
+ public:
+ int32_t num_values() const { return num_values_; }
+ Encoding::type encoding() const { return encoding_; }
+ int64_t uncompressed_size() const { return uncompressed_size_; }
+ const EncodedStatistics& statistics() const { return statistics_; }
+
+ virtual ~DataPage() = default;
+
+ protected:
+ DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
+ Encoding::type encoding, int64_t uncompressed_size,
+ const EncodedStatistics& statistics = EncodedStatistics())
+ : Page(buffer, type),
+ num_values_(num_values),
+ encoding_(encoding),
+ uncompressed_size_(uncompressed_size),
+ statistics_(statistics) {}
+
+ int32_t num_values_;
+ Encoding::type encoding_;
+ int64_t uncompressed_size_;
+ EncodedStatistics statistics_;
+};
+
+class DataPageV1 : public DataPage {
+ public:
+ DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
+ Encoding::type encoding, Encoding::type definition_level_encoding,
+ Encoding::type repetition_level_encoding, int64_t uncompressed_size,
+ const EncodedStatistics& statistics = EncodedStatistics())
+ : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
+ statistics),
+ definition_level_encoding_(definition_level_encoding),
+ repetition_level_encoding_(repetition_level_encoding) {}
+
+ Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
+
+ Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
+
+ private:
+ Encoding::type definition_level_encoding_;
+ Encoding::type repetition_level_encoding_;
+};
+
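+// Note: in the V2 data page format, repetition and definition levels are
+// always RLE-encoded and stored uncompressed at the front of the page
+// buffer; the is_compressed flag refers only to the values section that
+// follows the levels.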
+class DataPageV2 : public DataPage {
+ public:
+ DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
+ int32_t num_rows, Encoding::type encoding,
+ int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
+ int64_t uncompressed_size, bool is_compressed = false,
+ const EncodedStatistics& statistics = EncodedStatistics())
+ : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
+ statistics),
+ num_nulls_(num_nulls),
+ num_rows_(num_rows),
+ definition_levels_byte_length_(definition_levels_byte_length),
+ repetition_levels_byte_length_(repetition_levels_byte_length),
+ is_compressed_(is_compressed) {}
+
+ int32_t num_nulls() const { return num_nulls_; }
+
+ int32_t num_rows() const { return num_rows_; }
+
+ int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
+
+ int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
+
+ bool is_compressed() const { return is_compressed_; }
+
+ private:
+ int32_t num_nulls_;
+ int32_t num_rows_;
+ int32_t definition_levels_byte_length_;
+ int32_t repetition_levels_byte_length_;
+ bool is_compressed_;
+};
+
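+// A dictionary page holds the encoded dictionary values for a
+// dictionary-encoded column chunk. A column chunk may contain at most one
+// dictionary page, and it must precede the data pages that reference it.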
+class DictionaryPage : public Page {
+ public:
+ DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
+ Encoding::type encoding, bool is_sorted = false)
+ : Page(buffer, PageType::DICTIONARY_PAGE),
+ num_values_(num_values),
+ encoding_(encoding),
+ is_sorted_(is_sorted) {}
+
+ int32_t num_values() const { return num_values_; }
+
+ Encoding::type encoding() const { return encoding_; }
+
+ bool is_sorted() const { return is_sorted_; }
+
+ private:
+ int32_t num_values_;
+ Encoding::type encoding_;
+ bool is_sorted_;
+};
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc b/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc
index 713205e98dd..047d99fed9a 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.cc
@@ -1,1802 +1,1802 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/column_reader.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <cstring>
-#include <exception>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/array/builder_binary.h"
-#include "arrow/array/builder_dict.h"
-#include "arrow/array/builder_primitive.h"
-#include "arrow/chunked_array.h"
-#include "arrow/type.h"
-#include "arrow/util/bit_stream_utils.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/compression.h"
-#include "arrow/util/int_util_internal.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/rle_encoding.h"
-#include "parquet/column_page.h"
-#include "parquet/encoding.h"
-#include "parquet/encryption/encryption_internal.h"
-#include "parquet/encryption/internal_file_decryptor.h"
-#include "parquet/level_comparison.h"
-#include "parquet/level_conversion.h"
-#include "parquet/properties.h"
-#include "parquet/statistics.h"
-#include "parquet/thrift_internal.h" // IWYU pragma: keep
-// Required after "arrow/util/int_util_internal.h" (for OPTIONAL)
-#include "parquet/windows_compatibility.h"
-
-using arrow::MemoryPool;
-using arrow::internal::AddWithOverflow;
-using arrow::internal::checked_cast;
-using arrow::internal::MultiplyWithOverflow;
-
-namespace BitUtil = arrow::BitUtil;
-
-namespace parquet {
-namespace {
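-// Returns true if values may need to be read "spaced", i.e. with gaps left
-// in the output buffer for null entries, based on the column's schema path.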
-inline bool HasSpacedValues(const ColumnDescriptor* descr) {
- if (descr->max_repetition_level() > 0) {
- // repeated+flat case
- return !descr->schema_node()->is_required();
- } else {
- // non-repeated+nested case
- // Find if a node forces nulls in the lowest level along the hierarchy
- const schema::Node* node = descr->schema_node().get();
- while (node) {
- if (node->is_optional()) {
- return true;
- }
- node = node->parent();
- }
- return false;
- }
-}
-} // namespace
-
-LevelDecoder::LevelDecoder() : num_values_remaining_(0) {}
-
-LevelDecoder::~LevelDecoder() {}
-
-int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
- int num_buffered_values, const uint8_t* data,
- int32_t data_size) {
- max_level_ = max_level;
- int32_t num_bytes = 0;
- encoding_ = encoding;
- num_values_remaining_ = num_buffered_values;
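- // Levels lie in the range [0, max_level], so bit_width_ is the number
- // of bits needed to represent max_level.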
- bit_width_ = BitUtil::Log2(max_level + 1);
- switch (encoding) {
- case Encoding::RLE: {
- if (data_size < 4) {
- throw ParquetException("Received invalid levels (corrupt data page?)");
- }
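- // RLE-encoded levels are preceded by a 4-byte length prefix giving the
- // size of the encoded levels in bytes.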
- num_bytes = ::arrow::util::SafeLoadAs<int32_t>(data);
- if (num_bytes < 0 || num_bytes > data_size - 4) {
- throw ParquetException("Received invalid number of bytes (corrupt data page?)");
- }
- const uint8_t* decoder_data = data + 4;
- if (!rle_decoder_) {
- rle_decoder_.reset(
- new ::arrow::util::RleDecoder(decoder_data, num_bytes, bit_width_));
- } else {
- rle_decoder_->Reset(decoder_data, num_bytes, bit_width_);
- }
- return 4 + num_bytes;
- }
- case Encoding::BIT_PACKED: {
- int num_bits = 0;
- if (MultiplyWithOverflow(num_buffered_values, bit_width_, &num_bits)) {
- throw ParquetException(
- "Number of buffered values too large (corrupt data page?)");
- }
- num_bytes = static_cast<int32_t>(BitUtil::BytesForBits(num_bits));
- if (num_bytes < 0 || num_bytes > data_size - 4) {
- throw ParquetException("Received invalid number of bytes (corrupt data page?)");
- }
- if (!bit_packed_decoder_) {
- bit_packed_decoder_.reset(new ::arrow::BitUtil::BitReader(data, num_bytes));
- } else {
- bit_packed_decoder_->Reset(data, num_bytes);
- }
- return num_bytes;
- }
- default:
- throw ParquetException("Unknown encoding type for levels.");
- }
- return -1;
-}
-
-void LevelDecoder::SetDataV2(int32_t num_bytes, int16_t max_level,
- int num_buffered_values, const uint8_t* data) {
- max_level_ = max_level;
- // Repetition and definition levels always use RLE encoding
- // in the DataPageV2 format.
- if (num_bytes < 0) {
- throw ParquetException("Invalid page header (corrupt data page?)");
- }
- encoding_ = Encoding::RLE;
- num_values_remaining_ = num_buffered_values;
- bit_width_ = BitUtil::Log2(max_level + 1);
-
- if (!rle_decoder_) {
- rle_decoder_.reset(new ::arrow::util::RleDecoder(data, num_bytes, bit_width_));
- } else {
- rle_decoder_->Reset(data, num_bytes, bit_width_);
- }
-}
-
-int LevelDecoder::Decode(int batch_size, int16_t* levels) {
- int num_decoded = 0;
-
- int num_values = std::min(num_values_remaining_, batch_size);
- if (encoding_ == Encoding::RLE) {
- num_decoded = rle_decoder_->GetBatch(levels, num_values);
- } else {
- num_decoded = bit_packed_decoder_->GetBatch(bit_width_, levels, num_values);
- }
- if (num_decoded > 0) {
- internal::MinMax min_max = internal::FindMinMax(levels, num_decoded);
- if (ARROW_PREDICT_FALSE(min_max.min < 0 || min_max.max > max_level_)) {
- std::stringstream ss;
- ss << "Malformed levels. min: " << min_max.min << " max: " << min_max.max
- << " out of range. Max Level: " << max_level_;
- throw ParquetException(ss.str());
- }
- }
- num_values_remaining_ -= num_decoded;
- return num_decoded;
-}
-
-ReaderProperties default_reader_properties() {
- static ReaderProperties default_reader_properties;
- return default_reader_properties;
-}
-
-namespace {
-
-// Extracts encoded statistics from V1 and V2 data page headers
-template <typename H>
-EncodedStatistics ExtractStatsFromHeader(const H& header) {
- EncodedStatistics page_statistics;
- if (!header.__isset.statistics) {
- return page_statistics;
- }
- const format::Statistics& stats = header.statistics;
- if (stats.__isset.max) {
- page_statistics.set_max(stats.max);
- }
- if (stats.__isset.min) {
- page_statistics.set_min(stats.min);
- }
- if (stats.__isset.null_count) {
- page_statistics.set_null_count(stats.null_count);
- }
- if (stats.__isset.distinct_count) {
- page_statistics.set_distinct_count(stats.distinct_count);
- }
- return page_statistics;
-}
-
-// ----------------------------------------------------------------------
-// SerializedPageReader deserializes Thrift metadata and pages that have been
-// assembled in a serialized stream for storage in a Parquet file
-
-// This subclass delimits pages appearing in a serialized stream, each preceded
-// by a serialized Thrift format::PageHeader that indicates the page type and
-// the page metadata.
-class SerializedPageReader : public PageReader {
- public:
- SerializedPageReader(std::shared_ptr<ArrowInputStream> stream, int64_t total_num_rows,
- Compression::type codec, ::arrow::MemoryPool* pool,
- const CryptoContext* crypto_ctx)
- : stream_(std::move(stream)),
- decompression_buffer_(AllocateBuffer(pool, 0)),
- page_ordinal_(0),
- seen_num_rows_(0),
- total_num_rows_(total_num_rows),
- decryption_buffer_(AllocateBuffer(pool, 0)) {
- if (crypto_ctx != nullptr) {
- crypto_ctx_ = *crypto_ctx;
- InitDecryption();
- }
- max_page_header_size_ = kDefaultMaxPageHeaderSize;
- decompressor_ = GetCodec(codec);
- }
-
- // Implement the PageReader interface
- std::shared_ptr<Page> NextPage() override;
-
- void set_max_page_header_size(uint32_t size) override { max_page_header_size_ = size; }
-
- private:
- void UpdateDecryption(const std::shared_ptr<Decryptor>& decryptor, int8_t module_type,
- const std::string& page_aad);
-
- void InitDecryption();
-
- std::shared_ptr<Buffer> DecompressIfNeeded(std::shared_ptr<Buffer> page_buffer,
- int compressed_len, int uncompressed_len,
- int levels_byte_len = 0);
-
- std::shared_ptr<ArrowInputStream> stream_;
-
- format::PageHeader current_page_header_;
- std::shared_ptr<Page> current_page_;
-
- // Compression codec to use.
- std::unique_ptr<::arrow::util::Codec> decompressor_;
- std::shared_ptr<ResizableBuffer> decompression_buffer_;
-
- // The fields below are used for calculation of AAD (additional authenticated data)
- // suffix which is part of the Parquet Modular Encryption.
- // The AAD suffix for a parquet module is built internally by
- // concatenating several parts, including the row group ordinal,
- // column ordinal and page ordinal.
- // Please refer to the encryption specification for more details:
- // https://github.com/apache/parquet-format/blob/encryption/Encryption.md#44-additional-authenticated-data
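- //
- // Roughly, the AAD for a data page is composed as
- //   file_aad || module_type || row_group_ordinal || column_ordinal || page_ordinal
- // so moving between the pages of one column only requires updating the
- // trailing page ordinal.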
-
- // The ordinal fields in the context below are used for AAD suffix calculation.
- CryptoContext crypto_ctx_;
- int16_t page_ordinal_; // page ordinal does not count the dictionary page
-
- // Maximum allowed page header size
- uint32_t max_page_header_size_;
-
- // Number of rows read in data pages so far
- int64_t seen_num_rows_;
-
- // Number of rows in all the data pages
- int64_t total_num_rows_;
-
- // data_page_aad_ and data_page_header_aad_ contain the AAD for the data pages
- // and data page headers of a single column, respectively.
- // When calculating the AAD for different pages within a column, only the
- // page ordinal portion of the AAD is updated.
- std::string data_page_aad_;
- std::string data_page_header_aad_;
- // Encryption
- std::shared_ptr<ResizableBuffer> decryption_buffer_;
-};
-
-void SerializedPageReader::InitDecryption() {
- // Prepare the AAD for quick update later.
- if (crypto_ctx_.data_decryptor != nullptr) {
- DCHECK(!crypto_ctx_.data_decryptor->file_aad().empty());
- data_page_aad_ = encryption::CreateModuleAad(
- crypto_ctx_.data_decryptor->file_aad(), encryption::kDataPage,
- crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, kNonPageOrdinal);
- }
- if (crypto_ctx_.meta_decryptor != nullptr) {
- DCHECK(!crypto_ctx_.meta_decryptor->file_aad().empty());
- data_page_header_aad_ = encryption::CreateModuleAad(
- crypto_ctx_.meta_decryptor->file_aad(), encryption::kDataPageHeader,
- crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, kNonPageOrdinal);
- }
-}
-
-void SerializedPageReader::UpdateDecryption(const std::shared_ptr<Decryptor>& decryptor,
- int8_t module_type,
- const std::string& page_aad) {
- DCHECK(decryptor != nullptr);
- if (crypto_ctx_.start_decrypt_with_dictionary_page) {
- std::string aad = encryption::CreateModuleAad(
- decryptor->file_aad(), module_type, crypto_ctx_.row_group_ordinal,
- crypto_ctx_.column_ordinal, kNonPageOrdinal);
- decryptor->UpdateAad(aad);
- } else {
- encryption::QuickUpdatePageAad(page_aad, page_ordinal_);
- decryptor->UpdateAad(page_aad);
- }
-}
-
-std::shared_ptr<Page> SerializedPageReader::NextPage() {
- // Loop here because there may be unhandled page types that we skip over
- // until we find a page type that we know how to handle
-
- while (seen_num_rows_ < total_num_rows_) {
- uint32_t header_size = 0;
- uint32_t allowed_page_size = kDefaultPageHeaderSize;
-
- // Page headers can be very large because of page statistics.
- // We progressively try larger buffer sizes until we hit the
- // maximum allowed header size.
- while (true) {
- PARQUET_ASSIGN_OR_THROW(auto view, stream_->Peek(allowed_page_size));
- if (view.size() == 0) {
- return std::shared_ptr<Page>(nullptr);
- }
-
- // header_size is passed in as the number of available bytes and is
- // updated by DeserializeThriftMsg to the actual serialized header size
- header_size = static_cast<uint32_t>(view.size());
- try {
- if (crypto_ctx_.meta_decryptor != nullptr) {
- UpdateDecryption(crypto_ctx_.meta_decryptor, encryption::kDictionaryPageHeader,
- data_page_header_aad_);
- }
- DeserializeThriftMsg(reinterpret_cast<const uint8_t*>(view.data()), &header_size,
- &current_page_header_, crypto_ctx_.meta_decryptor);
- break;
- } catch (std::exception& e) {
- // Failed to deserialize. Double the allowed page header size and try again
- std::stringstream ss;
- ss << e.what();
- allowed_page_size *= 2;
- if (allowed_page_size > max_page_header_size_) {
- ss << "Deserializing page header failed.\n";
- throw ParquetException(ss.str());
- }
- }
- }
- // Advance the stream offset
- PARQUET_THROW_NOT_OK(stream_->Advance(header_size));
-
- int compressed_len = current_page_header_.compressed_page_size;
- int uncompressed_len = current_page_header_.uncompressed_page_size;
- if (compressed_len < 0 || uncompressed_len < 0) {
- throw ParquetException("Invalid page header");
- }
-
- if (crypto_ctx_.data_decryptor != nullptr) {
- UpdateDecryption(crypto_ctx_.data_decryptor, encryption::kDictionaryPage,
- data_page_aad_);
- }
-
- // Read the compressed data page.
- PARQUET_ASSIGN_OR_THROW(auto page_buffer, stream_->Read(compressed_len));
- if (page_buffer->size() != compressed_len) {
- std::stringstream ss;
- ss << "Page was smaller (" << page_buffer->size() << ") than expected ("
- << compressed_len << ")";
- ParquetException::EofException(ss.str());
- }
-
- // Decrypt it if we need to
- if (crypto_ctx_.data_decryptor != nullptr) {
- PARQUET_THROW_NOT_OK(decryption_buffer_->Resize(
- compressed_len - crypto_ctx_.data_decryptor->CiphertextSizeDelta(), false));
- compressed_len = crypto_ctx_.data_decryptor->Decrypt(
- page_buffer->data(), compressed_len, decryption_buffer_->mutable_data());
-
- page_buffer = decryption_buffer_;
- }
-
- const PageType::type page_type = LoadEnumSafe(&current_page_header_.type);
-
- if (page_type == PageType::DICTIONARY_PAGE) {
- crypto_ctx_.start_decrypt_with_dictionary_page = false;
- const format::DictionaryPageHeader& dict_header =
- current_page_header_.dictionary_page_header;
-
- bool is_sorted = dict_header.__isset.is_sorted ? dict_header.is_sorted : false;
- if (dict_header.num_values < 0) {
- throw ParquetException("Invalid page header (negative number of values)");
- }
-
- // Uncompress if needed
- page_buffer =
- DecompressIfNeeded(std::move(page_buffer), compressed_len, uncompressed_len);
-
- return std::make_shared<DictionaryPage>(page_buffer, dict_header.num_values,
- LoadEnumSafe(&dict_header.encoding),
- is_sorted);
- } else if (page_type == PageType::DATA_PAGE) {
- ++page_ordinal_;
- const format::DataPageHeader& header = current_page_header_.data_page_header;
-
- if (header.num_values < 0) {
- throw ParquetException("Invalid page header (negative number of values)");
- }
- EncodedStatistics page_statistics = ExtractStatsFromHeader(header);
- seen_num_rows_ += header.num_values;
-
- // Uncompress if needed
- page_buffer =
- DecompressIfNeeded(std::move(page_buffer), compressed_len, uncompressed_len);
-
- return std::make_shared<DataPageV1>(page_buffer, header.num_values,
- LoadEnumSafe(&header.encoding),
- LoadEnumSafe(&header.definition_level_encoding),
- LoadEnumSafe(&header.repetition_level_encoding),
- uncompressed_len, page_statistics);
- } else if (page_type == PageType::DATA_PAGE_V2) {
- ++page_ordinal_;
- const format::DataPageHeaderV2& header = current_page_header_.data_page_header_v2;
-
- if (header.num_values < 0) {
- throw ParquetException("Invalid page header (negative number of values)");
- }
- if (header.definition_levels_byte_length < 0 ||
- header.repetition_levels_byte_length < 0) {
- throw ParquetException("Invalid page header (negative levels byte length)");
- }
- bool is_compressed = header.__isset.is_compressed ? header.is_compressed : false;
- EncodedStatistics page_statistics = ExtractStatsFromHeader(header);
- seen_num_rows_ += header.num_values;
-
- // Uncompress if needed
- int levels_byte_len;
- if (AddWithOverflow(header.definition_levels_byte_length,
- header.repetition_levels_byte_length, &levels_byte_len)) {
- throw ParquetException("Levels size too large (corrupt file?)");
- }
- // DecompressIfNeeded doesn't take `is_compressed` into account as
- // it's page type-agnostic.
- if (is_compressed) {
- page_buffer = DecompressIfNeeded(std::move(page_buffer), compressed_len,
- uncompressed_len, levels_byte_len);
- }
-
- return std::make_shared<DataPageV2>(
- page_buffer, header.num_values, header.num_nulls, header.num_rows,
- LoadEnumSafe(&header.encoding), header.definition_levels_byte_length,
- header.repetition_levels_byte_length, uncompressed_len, is_compressed,
- page_statistics);
- } else {
- // We don't know what this page type is. We're allowed to skip non-data
- // pages.
- continue;
- }
- }
- return std::shared_ptr<Page>(nullptr);
-}
-
-std::shared_ptr<Buffer> SerializedPageReader::DecompressIfNeeded(
- std::shared_ptr<Buffer> page_buffer, int compressed_len, int uncompressed_len,
- int levels_byte_len) {
- if (decompressor_ == nullptr) {
- return page_buffer;
- }
- if (compressed_len < levels_byte_len || uncompressed_len < levels_byte_len) {
- throw ParquetException("Invalid page header");
- }
-
- // Grow the uncompressed buffer if we need to.
- if (uncompressed_len > static_cast<int>(decompression_buffer_->size())) {
- PARQUET_THROW_NOT_OK(decompression_buffer_->Resize(uncompressed_len, false));
- }
-
- if (levels_byte_len > 0) {
- // First copy the levels as-is
- uint8_t* decompressed = decompression_buffer_->mutable_data();
- memcpy(decompressed, page_buffer->data(), levels_byte_len);
- }
-
- // Decompress the values
- PARQUET_THROW_NOT_OK(decompressor_->Decompress(
- compressed_len - levels_byte_len, page_buffer->data() + levels_byte_len,
- uncompressed_len - levels_byte_len,
- decompression_buffer_->mutable_data() + levels_byte_len));
-
- return decompression_buffer_;
-}
-
-} // namespace
-
-std::unique_ptr<PageReader> PageReader::Open(std::shared_ptr<ArrowInputStream> stream,
- int64_t total_num_rows,
- Compression::type codec,
- ::arrow::MemoryPool* pool,
- const CryptoContext* ctx) {
- return std::unique_ptr<PageReader>(
- new SerializedPageReader(std::move(stream), total_num_rows, codec, pool, ctx));
-}
-
-namespace {
-
-// ----------------------------------------------------------------------
-// Impl base class for TypedColumnReader and RecordReader
-
-// PLAIN_DICTIONARY is deprecated but was formerly used as a dictionary index
-// encoding.
-static bool IsDictionaryIndexEncoding(const Encoding::type& e) {
- return e == Encoding::RLE_DICTIONARY || e == Encoding::PLAIN_DICTIONARY;
-}
-
-template <typename DType>
-class ColumnReaderImplBase {
- public:
- using T = typename DType::c_type;
-
- ColumnReaderImplBase(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool)
- : descr_(descr),
- max_def_level_(descr->max_definition_level()),
- max_rep_level_(descr->max_repetition_level()),
- num_buffered_values_(0),
- num_decoded_values_(0),
- pool_(pool),
- current_decoder_(nullptr),
- current_encoding_(Encoding::UNKNOWN) {}
-
- virtual ~ColumnReaderImplBase() = default;
-
- protected:
- // Read up to batch_size values from the current data page into the
- // pre-allocated memory T*
- //
- // @returns: the number of values read into the out buffer
- int64_t ReadValues(int64_t batch_size, T* out) {
- int64_t num_decoded = current_decoder_->Decode(out, static_cast<int>(batch_size));
- return num_decoded;
- }
-
- // Read up to batch_size values from the current data page into the
- // pre-allocated memory T*, leaving spaces for null entries according
- // to the def_levels.
- //
- // @returns: the number of values read into the out buffer
- int64_t ReadValuesSpaced(int64_t batch_size, T* out, int64_t null_count,
- uint8_t* valid_bits, int64_t valid_bits_offset) {
- return current_decoder_->DecodeSpaced(out, static_cast<int>(batch_size),
- static_cast<int>(null_count), valid_bits,
- valid_bits_offset);
- }
-
- // Read multiple definition levels into preallocated memory
- //
- // Returns the number of decoded definition levels
- int64_t ReadDefinitionLevels(int64_t batch_size, int16_t* levels) {
- if (max_def_level_ == 0) {
- return 0;
- }
- return definition_level_decoder_.Decode(static_cast<int>(batch_size), levels);
- }
-
- bool HasNextInternal() {
- // Either there is no data page available yet, or the data page has been
- // exhausted
- if (num_buffered_values_ == 0 || num_decoded_values_ == num_buffered_values_) {
- if (!ReadNewPage() || num_buffered_values_ == 0) {
- return false;
- }
- }
- return true;
- }
-
- // Read multiple repetition levels into preallocated memory
- // Returns the number of decoded repetition levels
- int64_t ReadRepetitionLevels(int64_t batch_size, int16_t* levels) {
- if (max_rep_level_ == 0) {
- return 0;
- }
- return repetition_level_decoder_.Decode(static_cast<int>(batch_size), levels);
- }
-
- // Advance to the next data page
- bool ReadNewPage() {
- // Loop until we find the next data page.
- while (true) {
- current_page_ = pager_->NextPage();
- if (!current_page_) {
- // EOS
- return false;
- }
-
- if (current_page_->type() == PageType::DICTIONARY_PAGE) {
- ConfigureDictionary(static_cast<const DictionaryPage*>(current_page_.get()));
- continue;
- } else if (current_page_->type() == PageType::DATA_PAGE) {
- const auto page = std::static_pointer_cast<DataPageV1>(current_page_);
- const int64_t levels_byte_size = InitializeLevelDecoders(
- *page, page->repetition_level_encoding(), page->definition_level_encoding());
- InitializeDataDecoder(*page, levels_byte_size);
- return true;
- } else if (current_page_->type() == PageType::DATA_PAGE_V2) {
- const auto page = std::static_pointer_cast<DataPageV2>(current_page_);
- int64_t levels_byte_size = InitializeLevelDecodersV2(*page);
- InitializeDataDecoder(*page, levels_byte_size);
- return true;
- } else {
- // We don't know what this page type is. We're allowed to skip non-data
- // pages.
- continue;
- }
- }
- return true;
- }
-
- void ConfigureDictionary(const DictionaryPage* page) {
- int encoding = static_cast<int>(page->encoding());
- if (page->encoding() == Encoding::PLAIN_DICTIONARY ||
- page->encoding() == Encoding::PLAIN) {
- encoding = static_cast<int>(Encoding::RLE_DICTIONARY);
- }
-
- auto it = decoders_.find(encoding);
- if (it != decoders_.end()) {
- throw ParquetException("Column cannot have more than one dictionary.");
- }
-
- if (page->encoding() == Encoding::PLAIN_DICTIONARY ||
- page->encoding() == Encoding::PLAIN) {
- auto dictionary = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
- dictionary->SetData(page->num_values(), page->data(), page->size());
-
- // The dictionary is fully decoded during DictionaryDecoder::Init, so the
- // DictionaryPage buffer is no longer required after this step
- //
- // TODO(wesm): investigate whether this all-or-nothing decoding of the
- // dictionary makes sense and whether performance can be improved
-
- std::unique_ptr<DictDecoder<DType>> decoder = MakeDictDecoder<DType>(descr_, pool_);
- decoder->SetDict(dictionary.get());
- decoders_[encoding] =
- std::unique_ptr<DecoderType>(dynamic_cast<DecoderType*>(decoder.release()));
- } else {
- ParquetException::NYI("only plain dictionary encoding has been implemented");
- }
-
- new_dictionary_ = true;
- current_decoder_ = decoders_[encoding].get();
- DCHECK(current_decoder_);
- }
-
- // Initialize repetition and definition level decoders on the next data page.
- //
- // If the data page includes repetition and definition levels, we
- // initialize the level decoders and return the number of encoded level bytes.
- // The caller uses the return value to locate the start of the encoded values.
- int64_t InitializeLevelDecoders(const DataPage& page,
- Encoding::type repetition_level_encoding,
- Encoding::type definition_level_encoding) {
- // Read a data page.
- num_buffered_values_ = page.num_values();
-
- // Have not decoded any values from the data page yet
- num_decoded_values_ = 0;
-
- const uint8_t* buffer = page.data();
- int32_t levels_byte_size = 0;
- int32_t max_size = page.size();
-
- // Data page layout: repetition levels - definition levels - encoded values.
- // Levels are encoded as RLE or bit-packed.
- // Init repetition levels
- if (max_rep_level_ > 0) {
- int32_t rep_levels_bytes = repetition_level_decoder_.SetData(
- repetition_level_encoding, max_rep_level_,
- static_cast<int>(num_buffered_values_), buffer, max_size);
- buffer += rep_levels_bytes;
- levels_byte_size += rep_levels_bytes;
- max_size -= rep_levels_bytes;
- }
- // TODO: figure out a way to set max_def_level_ to 0
- // if the initial value is invalid
-
- // Init definition levels
- if (max_def_level_ > 0) {
- int32_t def_levels_bytes = definition_level_decoder_.SetData(
- definition_level_encoding, max_def_level_,
- static_cast<int>(num_buffered_values_), buffer, max_size);
- levels_byte_size += def_levels_bytes;
- max_size -= def_levels_bytes;
- }
-
- return levels_byte_size;
- }
-
- int64_t InitializeLevelDecodersV2(const DataPageV2& page) {
- // Read a data page.
- num_buffered_values_ = page.num_values();
-
- // Have not decoded any values from the data page yet
- num_decoded_values_ = 0;
- const uint8_t* buffer = page.data();
-
- const int64_t total_levels_length =
- static_cast<int64_t>(page.repetition_levels_byte_length()) +
- page.definition_levels_byte_length();
-
- if (total_levels_length > page.size()) {
- throw ParquetException("Data page too small for levels (corrupt header?)");
- }
-
- if (max_rep_level_ > 0) {
- repetition_level_decoder_.SetDataV2(page.repetition_levels_byte_length(),
- max_rep_level_,
- static_cast<int>(num_buffered_values_), buffer);
- buffer += page.repetition_levels_byte_length();
- }
-
- if (max_def_level_ > 0) {
- definition_level_decoder_.SetDataV2(page.definition_levels_byte_length(),
- max_def_level_,
- static_cast<int>(num_buffered_values_), buffer);
- }
-
- return total_levels_length;
- }
-
- // Get a decoder object for this page or create a new decoder if this is the
- // first page with this encoding.
- void InitializeDataDecoder(const DataPage& page, int64_t levels_byte_size) {
- const uint8_t* buffer = page.data() + levels_byte_size;
- const int64_t data_size = page.size() - levels_byte_size;
-
- if (data_size < 0) {
- throw ParquetException("Page smaller than size of encoded levels");
- }
-
- Encoding::type encoding = page.encoding();
-
- if (IsDictionaryIndexEncoding(encoding)) {
- encoding = Encoding::RLE_DICTIONARY;
- }
-
- auto it = decoders_.find(static_cast<int>(encoding));
- if (it != decoders_.end()) {
- DCHECK(it->second.get() != nullptr);
- if (encoding == Encoding::RLE_DICTIONARY) {
- DCHECK(current_decoder_->encoding() == Encoding::RLE_DICTIONARY);
- }
- current_decoder_ = it->second.get();
- } else {
- switch (encoding) {
- case Encoding::PLAIN: {
- auto decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
- current_decoder_ = decoder.get();
- decoders_[static_cast<int>(encoding)] = std::move(decoder);
- break;
- }
- case Encoding::BYTE_STREAM_SPLIT: {
- auto decoder = MakeTypedDecoder<DType>(Encoding::BYTE_STREAM_SPLIT, descr_);
- current_decoder_ = decoder.get();
- decoders_[static_cast<int>(encoding)] = std::move(decoder);
- break;
- }
- case Encoding::RLE_DICTIONARY:
- throw ParquetException("Dictionary page must be before data page.");
-
- case Encoding::DELTA_BINARY_PACKED:
- case Encoding::DELTA_LENGTH_BYTE_ARRAY:
- case Encoding::DELTA_BYTE_ARRAY:
- ParquetException::NYI("Unsupported encoding");
-
- default:
- throw ParquetException("Unknown encoding type.");
- }
- }
- current_encoding_ = encoding;
- current_decoder_->SetData(static_cast<int>(num_buffered_values_), buffer,
- static_cast<int>(data_size));
- }
-
- const ColumnDescriptor* descr_;
- const int16_t max_def_level_;
- const int16_t max_rep_level_;
-
- std::unique_ptr<PageReader> pager_;
- std::shared_ptr<Page> current_page_;
-
- // Not set if the full schema for this field has no optional or repeated elements
- LevelDecoder definition_level_decoder_;
-
- // Not set for flat schemas.
- LevelDecoder repetition_level_decoder_;
-
- // The total number of values stored in the data page. This is the maximum of
- // the number of encoded definition levels and the number of encoded values.
- // For non-repeated, required columns, this is equal to the number of encoded
- // values. For repeated or optional columns, there may be fewer data values
- // than levels, and this tells you how many encoded levels there are in that
- // case.
- int64_t num_buffered_values_;
-
- // The number of values from the current data page that have been decoded
- // into memory
- int64_t num_decoded_values_;
-
- ::arrow::MemoryPool* pool_;
-
- using DecoderType = TypedDecoder<DType>;
- DecoderType* current_decoder_;
- Encoding::type current_encoding_;
-
- /// Flag to signal when a new dictionary has been set, for the benefit of
- /// DictionaryRecordReader
- bool new_dictionary_;
-
- // The exposed encoding
- ExposedEncoding exposed_encoding_ = ExposedEncoding::NO_ENCODING;
-
- // Map of encoding type to the respective decoder object. For example, a
- // column chunk's data pages may include both dictionary-encoded and
- // plain-encoded data.
- std::unordered_map<int, std::unique_ptr<DecoderType>> decoders_;
-
- void ConsumeBufferedValues(int64_t num_values) { num_decoded_values_ += num_values; }
-};
-
-// ----------------------------------------------------------------------
-// TypedColumnReader implementations
-
-template <typename DType>
-class TypedColumnReaderImpl : public TypedColumnReader<DType>,
- public ColumnReaderImplBase<DType> {
- public:
- using T = typename DType::c_type;
-
- TypedColumnReaderImpl(const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
- ::arrow::MemoryPool* pool)
- : ColumnReaderImplBase<DType>(descr, pool) {
- this->pager_ = std::move(pager);
- }
-
- bool HasNext() override { return this->HasNextInternal(); }
-
- int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
- T* values, int64_t* values_read) override;
-
- int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
- T* values, uint8_t* valid_bits, int64_t valid_bits_offset,
- int64_t* levels_read, int64_t* values_read,
- int64_t* null_count) override;
-
- int64_t Skip(int64_t num_rows_to_skip) override;
-
- Type::type type() const override { return this->descr_->physical_type(); }
-
- const ColumnDescriptor* descr() const override { return this->descr_; }
-
- ExposedEncoding GetExposedEncoding() override { return this->exposed_encoding_; }
-
- int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
- int16_t* rep_levels, int32_t* indices,
- int64_t* indices_read, const T** dict,
- int32_t* dict_len) override;
-
- protected:
- void SetExposedEncoding(ExposedEncoding encoding) override {
- this->exposed_encoding_ = encoding;
- }
-
- private:
- // Read dictionary indices. Similar to ReadValues but decode data to dictionary indices.
- // This function is called only by ReadBatchWithDictionary().
- int64_t ReadDictionaryIndices(int64_t indices_to_read, int32_t* indices) {
- auto decoder = dynamic_cast<DictDecoder<DType>*>(this->current_decoder_);
- return decoder->DecodeIndices(static_cast<int>(indices_to_read), indices);
- }
-
- // Get dictionary. The dictionary should have been set by SetDict(). The dictionary is
- // owned by the internal decoder and is destroyed when the reader is destroyed. This
- // function is called only by ReadBatchWithDictionary() after dictionary is configured.
- void GetDictionary(const T** dictionary, int32_t* dictionary_length) {
- auto decoder = dynamic_cast<DictDecoder<DType>*>(this->current_decoder_);
- decoder->GetDictionary(dictionary, dictionary_length);
- }
-
- // Read definition and repetition levels. Also return the number of definition levels
- // and number of values to read. This function is called before reading values.
- void ReadLevels(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
- int64_t* num_def_levels, int64_t* values_to_read) {
- batch_size =
- std::min(batch_size, this->num_buffered_values_ - this->num_decoded_values_);
-
- // If the field is required and non-repeated, there are no definition levels
- if (this->max_def_level_ > 0 && def_levels != nullptr) {
- *num_def_levels = this->ReadDefinitionLevels(batch_size, def_levels);
- // TODO(wesm): this tallying of values-to-decode can be performed with better
- // cache-efficiency if fused with the level decoding.
- for (int64_t i = 0; i < *num_def_levels; ++i) {
- if (def_levels[i] == this->max_def_level_) {
- ++(*values_to_read);
- }
- }
- } else {
- // Required field, read all values
- *values_to_read = batch_size;
- }
-
- // Not present for non-repeated fields
- if (this->max_rep_level_ > 0 && rep_levels != nullptr) {
- int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels);
- if (def_levels != nullptr && *num_def_levels != num_rep_levels) {
- throw ParquetException("Number of decoded rep / def levels did not match");
- }
- }
- }
-};
-
-template <typename DType>
-int64_t TypedColumnReaderImpl<DType>::ReadBatchWithDictionary(
- int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, int32_t* indices,
- int64_t* indices_read, const T** dict, int32_t* dict_len) {
- bool has_dict_output = dict != nullptr && dict_len != nullptr;
- // Similar logic as ReadValues to get pages.
- if (!HasNext()) {
- *indices_read = 0;
- if (has_dict_output) {
- *dict = nullptr;
- *dict_len = 0;
- }
- return 0;
- }
-
- // Verify the current data page is dictionary encoded.
- if (this->current_encoding_ != Encoding::RLE_DICTIONARY) {
- std::stringstream ss;
- ss << "Data page is not dictionary encoded. Encoding: "
- << EncodingToString(this->current_encoding_);
- throw ParquetException(ss.str());
- }
-
- // Get dictionary pointer and length.
- if (has_dict_output) {
- GetDictionary(dict, dict_len);
- }
-
- // Similar logic as ReadValues to get def levels and rep levels.
- int64_t num_def_levels = 0;
- int64_t indices_to_read = 0;
- ReadLevels(batch_size, def_levels, rep_levels, &num_def_levels, &indices_to_read);
-
- // Read dictionary indices.
- *indices_read = ReadDictionaryIndices(indices_to_read, indices);
- int64_t total_indices = std::max(num_def_levels, *indices_read);
- this->ConsumeBufferedValues(total_indices);
-
- return total_indices;
-}
-
-template <typename DType>
-int64_t TypedColumnReaderImpl<DType>::ReadBatch(int64_t batch_size, int16_t* def_levels,
- int16_t* rep_levels, T* values,
- int64_t* values_read) {
- // HasNext invokes ReadNewPage
- if (!HasNext()) {
- *values_read = 0;
- return 0;
- }
-
- // TODO(wesm): keep reading data pages until batch_size is reached, or the
- // row group is finished
- int64_t num_def_levels = 0;
- int64_t values_to_read = 0;
- ReadLevels(batch_size, def_levels, rep_levels, &num_def_levels, &values_to_read);
-
- *values_read = this->ReadValues(values_to_read, values);
- int64_t total_values = std::max(num_def_levels, *values_read);
- this->ConsumeBufferedValues(total_values);
-
- return total_values;
-}
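-
-// A minimal usage sketch (not part of the original source): assuming `reader`
-// was obtained from ColumnReader::Make and cast to TypedColumnReader<Int64Type>*,
-// a column chunk can be drained as follows:
-//
-//   std::vector<int16_t> def_levels(1024), rep_levels(1024);
-//   std::vector<int64_t> values(1024);
-//   while (reader->HasNext()) {
-//     int64_t values_read = 0;
-//     reader->ReadBatch(1024, def_levels.data(), rep_levels.data(),
-//                       values.data(), &values_read);
-//     // ... consume values_read values ...
-//   }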
-
-template <typename DType>
-int64_t TypedColumnReaderImpl<DType>::ReadBatchSpaced(
- int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, T* values,
- uint8_t* valid_bits, int64_t valid_bits_offset, int64_t* levels_read,
- int64_t* values_read, int64_t* null_count_out) {
- // HasNext invokes ReadNewPage
- if (!HasNext()) {
- *levels_read = 0;
- *values_read = 0;
- *null_count_out = 0;
- return 0;
- }
-
- int64_t total_values;
- // TODO(wesm): keep reading data pages until batch_size is reached, or the
- // row group is finished
- batch_size =
- std::min(batch_size, this->num_buffered_values_ - this->num_decoded_values_);
-
- // If the field is required and non-repeated, there are no definition levels
- if (this->max_def_level_ > 0) {
- int64_t num_def_levels = this->ReadDefinitionLevels(batch_size, def_levels);
-
- // Not present for non-repeated fields
- if (this->max_rep_level_ > 0) {
- int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels);
- if (num_def_levels != num_rep_levels) {
- throw ParquetException("Number of decoded rep / def levels did not match");
- }
- }
-
- const bool has_spaced_values = HasSpacedValues(this->descr_);
- int64_t null_count = 0;
- if (!has_spaced_values) {
- int values_to_read = 0;
- for (int64_t i = 0; i < num_def_levels; ++i) {
- if (def_levels[i] == this->max_def_level_) {
- ++values_to_read;
- }
- }
- total_values = this->ReadValues(values_to_read, values);
- ::arrow::BitUtil::SetBitsTo(valid_bits, valid_bits_offset,
- /*length=*/total_values,
- /*bits_are_set=*/true);
- *values_read = total_values;
- } else {
- internal::LevelInfo info;
- info.repeated_ancestor_def_level = this->max_def_level_ - 1;
- info.def_level = this->max_def_level_;
- info.rep_level = this->max_rep_level_;
- internal::ValidityBitmapInputOutput validity_io;
- validity_io.values_read_upper_bound = num_def_levels;
- validity_io.valid_bits = valid_bits;
- validity_io.valid_bits_offset = valid_bits_offset;
- validity_io.null_count = null_count;
- validity_io.values_read = *values_read;
-
- internal::DefLevelsToBitmap(def_levels, num_def_levels, info, &validity_io);
- null_count = validity_io.null_count;
- *values_read = validity_io.values_read;
-
- total_values =
- this->ReadValuesSpaced(*values_read, values, static_cast<int>(null_count),
- valid_bits, valid_bits_offset);
- }
- *levels_read = num_def_levels;
- *null_count_out = null_count;
-
- } else {
- // Required field, read all values
- total_values = this->ReadValues(batch_size, values);
- ::arrow::BitUtil::SetBitsTo(valid_bits, valid_bits_offset,
- /*length=*/total_values,
- /*bits_are_set=*/true);
- *null_count_out = 0;
- *values_read = total_values;
- *levels_read = total_values;
- }
-
- this->ConsumeBufferedValues(*levels_read);
- return total_values;
-}
-
-template <typename DType>
-int64_t TypedColumnReaderImpl<DType>::Skip(int64_t num_rows_to_skip) {
- int64_t rows_to_skip = num_rows_to_skip;
- while (HasNext() && rows_to_skip > 0) {
- // If the number of rows to skip is more than the number of undecoded values,
- // skip the page.
- if (rows_to_skip > (this->num_buffered_values_ - this->num_decoded_values_)) {
- rows_to_skip -= this->num_buffered_values_ - this->num_decoded_values_;
- this->num_decoded_values_ = this->num_buffered_values_;
- } else {
- // We need to read this page
- // Jump to the right offset in the page
- int64_t batch_size = 1024; // ReadBatch with a smaller memory footprint
- int64_t values_read = 0;
-
- // This will be enough scratch space to accommodate 16-bit levels or any
- // value type
- std::shared_ptr<ResizableBuffer> scratch = AllocateBuffer(
- this->pool_, batch_size * type_traits<DType::type_num>::value_byte_size);
-
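- // The same scratch buffer is reused below for def levels, rep levels
- // and values, since the decoded data is immediately discarded.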
- do {
- batch_size = std::min(batch_size, rows_to_skip);
- values_read =
- ReadBatch(static_cast<int>(batch_size),
- reinterpret_cast<int16_t*>(scratch->mutable_data()),
- reinterpret_cast<int16_t*>(scratch->mutable_data()),
- reinterpret_cast<T*>(scratch->mutable_data()), &values_read);
- rows_to_skip -= values_read;
- } while (values_read > 0 && rows_to_skip > 0);
- }
- }
- return num_rows_to_skip - rows_to_skip;
-}
-
-} // namespace
-
-// ----------------------------------------------------------------------
-// Dynamic column reader constructor
-
-std::shared_ptr<ColumnReader> ColumnReader::Make(const ColumnDescriptor* descr,
- std::unique_ptr<PageReader> pager,
- MemoryPool* pool) {
- switch (descr->physical_type()) {
- case Type::BOOLEAN:
- return std::make_shared<TypedColumnReaderImpl<BooleanType>>(descr, std::move(pager),
- pool);
- case Type::INT32:
- return std::make_shared<TypedColumnReaderImpl<Int32Type>>(descr, std::move(pager),
- pool);
- case Type::INT64:
- return std::make_shared<TypedColumnReaderImpl<Int64Type>>(descr, std::move(pager),
- pool);
- case Type::INT96:
- return std::make_shared<TypedColumnReaderImpl<Int96Type>>(descr, std::move(pager),
- pool);
- case Type::FLOAT:
- return std::make_shared<TypedColumnReaderImpl<FloatType>>(descr, std::move(pager),
- pool);
- case Type::DOUBLE:
- return std::make_shared<TypedColumnReaderImpl<DoubleType>>(descr, std::move(pager),
- pool);
- case Type::BYTE_ARRAY:
- return std::make_shared<TypedColumnReaderImpl<ByteArrayType>>(
- descr, std::move(pager), pool);
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<TypedColumnReaderImpl<FLBAType>>(descr, std::move(pager),
- pool);
- default:
- ParquetException::NYI("type reader not implemented");
- }
- // Unreachable code, but suppress compiler warning
- return std::shared_ptr<ColumnReader>(nullptr);
-}
-
-// ----------------------------------------------------------------------
-// RecordReader
-
-namespace internal {
-namespace {
-
-// The minimum number of repetition/definition levels to decode at a time, for
-// better vectorized performance when doing many smaller record reads
-constexpr int64_t kMinLevelBatchSize = 1024;
-
-template <typename DType>
-class TypedRecordReader : public ColumnReaderImplBase<DType>,
- virtual public RecordReader {
- public:
- using T = typename DType::c_type;
- using BASE = ColumnReaderImplBase<DType>;
- TypedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, MemoryPool* pool)
- : BASE(descr, pool) {
- leaf_info_ = leaf_info;
- nullable_values_ = leaf_info.HasNullableValues();
- at_record_start_ = true;
- records_read_ = 0;
- values_written_ = 0;
- values_capacity_ = 0;
- null_count_ = 0;
- levels_written_ = 0;
- levels_position_ = 0;
- levels_capacity_ = 0;
- uses_values_ = !(descr->physical_type() == Type::BYTE_ARRAY);
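- // BYTE_ARRAY values are accumulated directly in binary builders rather
- // than in the values_ buffer (see the note in ReserveValues).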
-
- if (uses_values_) {
- values_ = AllocateBuffer(pool);
- }
- valid_bits_ = AllocateBuffer(pool);
- def_levels_ = AllocateBuffer(pool);
- rep_levels_ = AllocateBuffer(pool);
- Reset();
- }
-
- int64_t available_values_current_page() const {
- return this->num_buffered_values_ - this->num_decoded_values_;
- }
-
- // Compute the values capacity in bytes for the given number of elements
- int64_t bytes_for_values(int64_t nitems) const {
- int64_t type_size = GetTypeByteSize(this->descr_->physical_type());
- int64_t bytes_for_values = -1;
- if (MultiplyWithOverflow(nitems, type_size, &bytes_for_values)) {
- throw ParquetException("Total size of items too large");
- }
- return bytes_for_values;
- }
-
- int64_t ReadRecords(int64_t num_records) override {
- // Delimit records, then read values at the end
- int64_t records_read = 0;
-
- if (levels_position_ < levels_written_) {
- records_read += ReadRecordData(num_records);
- }
-
- int64_t level_batch_size = std::max(kMinLevelBatchSize, num_records);
-
- // Keep reading until we reach the desired number of records; if that
- // count is reached in the middle of a record, continue until the end
- // of the current record
- while (!at_record_start_ || records_read < num_records) {
- // Is there more data to read in this row group?
- if (!this->HasNextInternal()) {
- if (!at_record_start_) {
- // We ended the row group while inside a record that we haven't seen
- // the end of yet. So increment the record count for the last record in
- // the row group
- ++records_read;
- at_record_start_ = true;
- }
- break;
- }
-
- // We perform multiple batch reads until we either exhaust the row group
- // or observe the desired number of records
- int64_t batch_size = std::min(level_batch_size, available_values_current_page());
-
- // No more data in column
- if (batch_size == 0) {
- break;
- }
-
- if (this->max_def_level_ > 0) {
- ReserveLevels(batch_size);
-
- int16_t* def_levels = this->def_levels() + levels_written_;
- int16_t* rep_levels = this->rep_levels() + levels_written_;
-
- // Not present for non-repeated fields
- int64_t levels_read = 0;
- if (this->max_rep_level_ > 0) {
- levels_read = this->ReadDefinitionLevels(batch_size, def_levels);
- if (this->ReadRepetitionLevels(batch_size, rep_levels) != levels_read) {
- throw ParquetException("Number of decoded rep / def levels did not match");
- }
- } else if (this->max_def_level_ > 0) {
- levels_read = this->ReadDefinitionLevels(batch_size, def_levels);
- }
-
- // Exhausted column chunk
- if (levels_read == 0) {
- break;
- }
-
- levels_written_ += levels_read;
- records_read += ReadRecordData(num_records - records_read);
- } else {
- // No repetition or definition levels
- batch_size = std::min(num_records - records_read, batch_size);
- records_read += ReadRecordData(batch_size);
- }
- }
-
- return records_read;
- }
-
- // We may appear to have exhausted the column chunk when in fact we are
- // still in the middle of processing the last batch
- bool has_values_to_process() const { return levels_position_ < levels_written_; }
-
- std::shared_ptr<ResizableBuffer> ReleaseValues() override {
- if (uses_values_) {
- auto result = values_;
- PARQUET_THROW_NOT_OK(result->Resize(bytes_for_values(values_written_), true));
- values_ = AllocateBuffer(this->pool_);
- values_capacity_ = 0;
- return result;
- } else {
- return nullptr;
- }
- }
-
- std::shared_ptr<ResizableBuffer> ReleaseIsValid() override {
- if (leaf_info_.HasNullableValues()) {
- auto result = valid_bits_;
- PARQUET_THROW_NOT_OK(result->Resize(BitUtil::BytesForBits(values_written_), true));
- valid_bits_ = AllocateBuffer(this->pool_);
- return result;
- } else {
- return nullptr;
- }
- }
-
- // Process written repetition/definition levels to reach the end of
- // records. Process no more levels than necessary to delimit the indicated
- // number of logical records. Updates internal state of RecordReader
- //
- // \return Number of records delimited
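- //
- // For example, with max_rep_level_ == 1, the repetition levels
- //   0 1 1 0 1 0
- // delimit three records: a new record starts at each rep_level == 0.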
- int64_t DelimitRecords(int64_t num_records, int64_t* values_seen) {
- int64_t values_to_read = 0;
- int64_t records_read = 0;
-
- const int16_t* def_levels = this->def_levels() + levels_position_;
- const int16_t* rep_levels = this->rep_levels() + levels_position_;
-
- DCHECK_GT(this->max_rep_level_, 0);
-
- // Count logical records and number of values to read
- while (levels_position_ < levels_written_) {
- const int16_t rep_level = *rep_levels++;
- if (rep_level == 0) {
- // If at_record_start_ is true, we are seeing the start of a record
- // for the second time, such as after repeated calls to
- // DelimitRecords. In this case we must continue until we find
- // another record start or exhausting the ColumnChunk
- if (!at_record_start_) {
- // We've reached the end of a record; increment the record count.
- ++records_read;
- if (records_read == num_records) {
- // We've found the number of records we were looking for. Set
- // at_record_start_ to true and break
- at_record_start_ = true;
- break;
- }
- }
- }
- // We have decided to consume the level at this position; therefore we
- // must advance until we find another record boundary
- at_record_start_ = false;
-
- const int16_t def_level = *def_levels++;
- if (def_level == this->max_def_level_) {
- ++values_to_read;
- }
- ++levels_position_;
- }
- *values_seen = values_to_read;
- return records_read;
- }
-
- void Reserve(int64_t capacity) override {
- ReserveLevels(capacity);
- ReserveValues(capacity);
- }
-
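- // Computes a new capacity of at least size + extra_size, rounded up to
- // the next power of two, with overflow checks along the way.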
- int64_t UpdateCapacity(int64_t capacity, int64_t size, int64_t extra_size) {
- if (extra_size < 0) {
- throw ParquetException("Negative size (corrupt file?)");
- }
- int64_t target_size = -1;
- if (AddWithOverflow(size, extra_size, &target_size)) {
- throw ParquetException("Allocation size too large (corrupt file?)");
- }
- if (target_size >= (1LL << 62)) {
- throw ParquetException("Allocation size too large (corrupt file?)");
- }
- if (capacity >= target_size) {
- return capacity;
- }
- return BitUtil::NextPower2(target_size);
- }
-
- void ReserveLevels(int64_t extra_levels) {
- if (this->max_def_level_ > 0) {
- const int64_t new_levels_capacity =
- UpdateCapacity(levels_capacity_, levels_written_, extra_levels);
- if (new_levels_capacity > levels_capacity_) {
- constexpr auto kItemSize = static_cast<int64_t>(sizeof(int16_t));
- int64_t capacity_in_bytes = -1;
- if (MultiplyWithOverflow(new_levels_capacity, kItemSize, &capacity_in_bytes)) {
- throw ParquetException("Allocation size too large (corrupt file?)");
- }
- PARQUET_THROW_NOT_OK(def_levels_->Resize(capacity_in_bytes, false));
- if (this->max_rep_level_ > 0) {
- PARQUET_THROW_NOT_OK(rep_levels_->Resize(capacity_in_bytes, false));
- }
- levels_capacity_ = new_levels_capacity;
- }
- }
- }
-
- void ReserveValues(int64_t extra_values) {
- const int64_t new_values_capacity =
- UpdateCapacity(values_capacity_, values_written_, extra_values);
- if (new_values_capacity > values_capacity_) {
- // XXX(wesm): A hack to avoid memory allocation when reading directly
- // into builder classes
- if (uses_values_) {
- PARQUET_THROW_NOT_OK(
- values_->Resize(bytes_for_values(new_values_capacity), false));
- }
- values_capacity_ = new_values_capacity;
- }
- if (leaf_info_.HasNullableValues()) {
- int64_t valid_bytes_new = BitUtil::BytesForBits(values_capacity_);
- if (valid_bits_->size() < valid_bytes_new) {
- int64_t valid_bytes_old = BitUtil::BytesForBits(values_written_);
- PARQUET_THROW_NOT_OK(valid_bits_->Resize(valid_bytes_new, false));
-
- // Avoid valgrind warnings
- memset(valid_bits_->mutable_data() + valid_bytes_old, 0,
- valid_bytes_new - valid_bytes_old);
- }
- }
- }
-
- void Reset() override {
- ResetValues();
-
- if (levels_written_ > 0) {
- const int64_t levels_remaining = levels_written_ - levels_position_;
- // Shift remaining levels to beginning of buffer and trim to only the number
- // of decoded levels remaining
- int16_t* def_data = def_levels();
- int16_t* rep_data = rep_levels();
-
- std::copy(def_data + levels_position_, def_data + levels_written_, def_data);
- PARQUET_THROW_NOT_OK(
- def_levels_->Resize(levels_remaining * sizeof(int16_t), false));
-
- if (this->max_rep_level_ > 0) {
- std::copy(rep_data + levels_position_, rep_data + levels_written_, rep_data);
- PARQUET_THROW_NOT_OK(
- rep_levels_->Resize(levels_remaining * sizeof(int16_t), false));
- }
-
- levels_written_ -= levels_position_;
- levels_position_ = 0;
- levels_capacity_ = levels_remaining;
- }
-
- records_read_ = 0;
-
- // Call Finish on the binary builders to reset them
- }
-
- void SetPageReader(std::unique_ptr<PageReader> reader) override {
- at_record_start_ = true;
- this->pager_ = std::move(reader);
- ResetDecoders();
- }
-
- bool HasMoreData() const override { return this->pager_ != nullptr; }
-
- // Dictionary decoders must be reset when advancing row groups
- void ResetDecoders() { this->decoders_.clear(); }
-
- virtual void ReadValuesSpaced(int64_t values_with_nulls, int64_t null_count) {
- uint8_t* valid_bits = valid_bits_->mutable_data();
- const int64_t valid_bits_offset = values_written_;
-
- int64_t num_decoded = this->current_decoder_->DecodeSpaced(
- ValuesHead<T>(), static_cast<int>(values_with_nulls),
- static_cast<int>(null_count), valid_bits, valid_bits_offset);
- DCHECK_EQ(num_decoded, values_with_nulls);
- }
-
- virtual void ReadValuesDense(int64_t values_to_read) {
- int64_t num_decoded =
- this->current_decoder_->Decode(ValuesHead<T>(), static_cast<int>(values_to_read));
- DCHECK_EQ(num_decoded, values_to_read);
- }
-
- // Return number of logical records read
- int64_t ReadRecordData(int64_t num_records) {
- // Conservative upper bound
- const int64_t possible_num_values =
- std::max(num_records, levels_written_ - levels_position_);
- ReserveValues(possible_num_values);
-
- const int64_t start_levels_position = levels_position_;
-
- int64_t values_to_read = 0;
- int64_t records_read = 0;
- if (this->max_rep_level_ > 0) {
- records_read = DelimitRecords(num_records, &values_to_read);
- } else if (this->max_def_level_ > 0) {
-      // No repetition levels, skip delimiting logic. Each level represents a
-      // null or non-null entry
- records_read = std::min(levels_written_ - levels_position_, num_records);
-
- // This is advanced by DelimitRecords, which we skipped
- levels_position_ += records_read;
- } else {
- records_read = values_to_read = num_records;
- }
-
- int64_t null_count = 0;
- if (leaf_info_.HasNullableValues()) {
- ValidityBitmapInputOutput validity_io;
- validity_io.values_read_upper_bound = levels_position_ - start_levels_position;
- validity_io.valid_bits = valid_bits_->mutable_data();
- validity_io.valid_bits_offset = values_written_;
-
- DefLevelsToBitmap(def_levels() + start_levels_position,
- levels_position_ - start_levels_position, leaf_info_,
- &validity_io);
- values_to_read = validity_io.values_read - validity_io.null_count;
- null_count = validity_io.null_count;
- DCHECK_GE(values_to_read, 0);
- ReadValuesSpaced(validity_io.values_read, null_count);
- } else {
- DCHECK_GE(values_to_read, 0);
- ReadValuesDense(values_to_read);
- }
- if (this->leaf_info_.def_level > 0) {
- // Optional, repeated, or some mix thereof
- this->ConsumeBufferedValues(levels_position_ - start_levels_position);
- } else {
- // Flat, non-repeated
- this->ConsumeBufferedValues(values_to_read);
- }
- // Total values, including null spaces, if any
- values_written_ += values_to_read + null_count;
- null_count_ += null_count;
-
- return records_read;
- }
-
- void DebugPrintState() override {
- const int16_t* def_levels = this->def_levels();
- const int16_t* rep_levels = this->rep_levels();
- const int64_t total_levels_read = levels_position_;
-
- const T* vals = reinterpret_cast<const T*>(this->values());
-
- std::cout << "def levels: ";
- for (int64_t i = 0; i < total_levels_read; ++i) {
- std::cout << def_levels[i] << " ";
- }
- std::cout << std::endl;
-
- std::cout << "rep levels: ";
- for (int64_t i = 0; i < total_levels_read; ++i) {
- std::cout << rep_levels[i] << " ";
- }
- std::cout << std::endl;
-
- std::cout << "values: ";
- for (int64_t i = 0; i < this->values_written(); ++i) {
- std::cout << vals[i] << " ";
- }
- std::cout << std::endl;
- }
-
- void ResetValues() {
- if (values_written_ > 0) {
- // Resize to 0, but do not shrink to fit
- if (uses_values_) {
- PARQUET_THROW_NOT_OK(values_->Resize(0, false));
- }
- PARQUET_THROW_NOT_OK(valid_bits_->Resize(0, false));
- values_written_ = 0;
- values_capacity_ = 0;
- null_count_ = 0;
- }
- }
-
- protected:
- template <typename T>
- T* ValuesHead() {
- return reinterpret_cast<T*>(values_->mutable_data()) + values_written_;
- }
- LevelInfo leaf_info_;
-};
-
-class FLBARecordReader : public TypedRecordReader<FLBAType>,
- virtual public BinaryRecordReader {
- public:
- FLBARecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
- ::arrow::MemoryPool* pool)
- : TypedRecordReader<FLBAType>(descr, leaf_info, pool), builder_(nullptr) {
- DCHECK_EQ(descr_->physical_type(), Type::FIXED_LEN_BYTE_ARRAY);
- int byte_width = descr_->type_length();
- std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width);
- builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, this->pool_));
- }
-
- ::arrow::ArrayVector GetBuilderChunks() override {
- std::shared_ptr<::arrow::Array> chunk;
- PARQUET_THROW_NOT_OK(builder_->Finish(&chunk));
- return ::arrow::ArrayVector({chunk});
- }
-
- void ReadValuesDense(int64_t values_to_read) override {
- auto values = ValuesHead<FLBA>();
- int64_t num_decoded =
- this->current_decoder_->Decode(values, static_cast<int>(values_to_read));
- DCHECK_EQ(num_decoded, values_to_read);
-
- for (int64_t i = 0; i < num_decoded; i++) {
- PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr));
- }
- ResetValues();
- }
-
- void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
- uint8_t* valid_bits = valid_bits_->mutable_data();
- const int64_t valid_bits_offset = values_written_;
- auto values = ValuesHead<FLBA>();
-
- int64_t num_decoded = this->current_decoder_->DecodeSpaced(
- values, static_cast<int>(values_to_read), static_cast<int>(null_count),
- valid_bits, valid_bits_offset);
- DCHECK_EQ(num_decoded, values_to_read);
-
- for (int64_t i = 0; i < num_decoded; i++) {
- if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) {
- PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr));
- } else {
- PARQUET_THROW_NOT_OK(builder_->AppendNull());
- }
- }
- ResetValues();
- }
-
- private:
- std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_;
-};
-
-class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
- virtual public BinaryRecordReader {
- public:
- ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
- ::arrow::MemoryPool* pool)
- : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool) {
- DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
- accumulator_.builder.reset(new ::arrow::BinaryBuilder(pool));
- }
-
- ::arrow::ArrayVector GetBuilderChunks() override {
- ::arrow::ArrayVector result = accumulator_.chunks;
- if (result.size() == 0 || accumulator_.builder->length() > 0) {
- std::shared_ptr<::arrow::Array> last_chunk;
- PARQUET_THROW_NOT_OK(accumulator_.builder->Finish(&last_chunk));
- result.push_back(std::move(last_chunk));
- }
- accumulator_.chunks = {};
- return result;
- }
-
- void ReadValuesDense(int64_t values_to_read) override {
- int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull(
- static_cast<int>(values_to_read), &accumulator_);
- DCHECK_EQ(num_decoded, values_to_read);
- ResetValues();
- }
-
- void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
- int64_t num_decoded = this->current_decoder_->DecodeArrow(
- static_cast<int>(values_to_read), static_cast<int>(null_count),
- valid_bits_->mutable_data(), values_written_, &accumulator_);
- DCHECK_EQ(num_decoded, values_to_read - null_count);
- ResetValues();
- }
-
- private:
- // Helper data structure for accumulating builder chunks
- typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
-};
-
-class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
- virtual public DictionaryRecordReader {
- public:
- ByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
- ::arrow::MemoryPool* pool)
- : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool), builder_(pool) {
- this->read_dictionary_ = true;
- }
-
- std::shared_ptr<::arrow::ChunkedArray> GetResult() override {
- FlushBuilder();
- std::vector<std::shared_ptr<::arrow::Array>> result;
- std::swap(result, result_chunks_);
- return std::make_shared<::arrow::ChunkedArray>(std::move(result), builder_.type());
- }
-
- void FlushBuilder() {
- if (builder_.length() > 0) {
- std::shared_ptr<::arrow::Array> chunk;
- PARQUET_THROW_NOT_OK(builder_.Finish(&chunk));
- result_chunks_.emplace_back(std::move(chunk));
-
- // Also clears the dictionary memo table
- builder_.Reset();
- }
- }
-
- void MaybeWriteNewDictionary() {
- if (this->new_dictionary_) {
- /// If there is a new dictionary, we may need to flush the builder, then
- /// insert the new dictionary values
- FlushBuilder();
- builder_.ResetFull();
- auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
- decoder->InsertDictionary(&builder_);
- this->new_dictionary_ = false;
- }
- }
-
- void ReadValuesDense(int64_t values_to_read) override {
- int64_t num_decoded = 0;
- if (current_encoding_ == Encoding::RLE_DICTIONARY) {
- MaybeWriteNewDictionary();
- auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
- num_decoded = decoder->DecodeIndices(static_cast<int>(values_to_read), &builder_);
- } else {
- num_decoded = this->current_decoder_->DecodeArrowNonNull(
- static_cast<int>(values_to_read), &builder_);
-
- /// Flush values since they have been copied into the builder
- ResetValues();
- }
- DCHECK_EQ(num_decoded, values_to_read);
- }
-
- void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
- int64_t num_decoded = 0;
- if (current_encoding_ == Encoding::RLE_DICTIONARY) {
- MaybeWriteNewDictionary();
- auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
- num_decoded = decoder->DecodeIndicesSpaced(
- static_cast<int>(values_to_read), static_cast<int>(null_count),
- valid_bits_->mutable_data(), values_written_, &builder_);
- } else {
- num_decoded = this->current_decoder_->DecodeArrow(
- static_cast<int>(values_to_read), static_cast<int>(null_count),
- valid_bits_->mutable_data(), values_written_, &builder_);
-
- /// Flush values since they have been copied into the builder
- ResetValues();
- }
- DCHECK_EQ(num_decoded, values_to_read - null_count);
- }
-
- private:
- using BinaryDictDecoder = DictDecoder<ByteArrayType>;
-
- ::arrow::BinaryDictionary32Builder builder_;
- std::vector<std::shared_ptr<::arrow::Array>> result_chunks_;
-};
-
-// TODO(wesm): Implement these to some satisfaction
-template <>
-void TypedRecordReader<Int96Type>::DebugPrintState() {}
-
-template <>
-void TypedRecordReader<ByteArrayType>::DebugPrintState() {}
-
-template <>
-void TypedRecordReader<FLBAType>::DebugPrintState() {}
-
-std::shared_ptr<RecordReader> MakeByteArrayRecordReader(const ColumnDescriptor* descr,
- LevelInfo leaf_info,
- ::arrow::MemoryPool* pool,
- bool read_dictionary) {
- if (read_dictionary) {
- return std::make_shared<ByteArrayDictionaryRecordReader>(descr, leaf_info, pool);
- } else {
- return std::make_shared<ByteArrayChunkedRecordReader>(descr, leaf_info, pool);
- }
-}
-
-} // namespace
-
-std::shared_ptr<RecordReader> RecordReader::Make(const ColumnDescriptor* descr,
- LevelInfo leaf_info, MemoryPool* pool,
- const bool read_dictionary) {
- switch (descr->physical_type()) {
- case Type::BOOLEAN:
- return std::make_shared<TypedRecordReader<BooleanType>>(descr, leaf_info, pool);
- case Type::INT32:
- return std::make_shared<TypedRecordReader<Int32Type>>(descr, leaf_info, pool);
- case Type::INT64:
- return std::make_shared<TypedRecordReader<Int64Type>>(descr, leaf_info, pool);
- case Type::INT96:
- return std::make_shared<TypedRecordReader<Int96Type>>(descr, leaf_info, pool);
- case Type::FLOAT:
- return std::make_shared<TypedRecordReader<FloatType>>(descr, leaf_info, pool);
- case Type::DOUBLE:
- return std::make_shared<TypedRecordReader<DoubleType>>(descr, leaf_info, pool);
- case Type::BYTE_ARRAY:
- return MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary);
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<FLBARecordReader>(descr, leaf_info, pool);
- default: {
- // PARQUET-1481: This can occur if the file is corrupt
- std::stringstream ss;
- ss << "Invalid physical column type: " << static_cast<int>(descr->physical_type());
- throw ParquetException(ss.str());
- }
- }
- // Unreachable code, but suppress compiler warning
- return nullptr;
-}
-
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/column_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <exception>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_dict.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/chunked_array.h"
+#include "arrow/type.h"
+#include "arrow/util/bit_stream_utils.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/rle_encoding.h"
+#include "parquet/column_page.h"
+#include "parquet/encoding.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/level_comparison.h"
+#include "parquet/level_conversion.h"
+#include "parquet/properties.h"
+#include "parquet/statistics.h"
+#include "parquet/thrift_internal.h" // IWYU pragma: keep
+// Required after "arrow/util/int_util_internal.h" (for OPTIONAL)
+#include "parquet/windows_compatibility.h"
+
+using arrow::MemoryPool;
+using arrow::internal::AddWithOverflow;
+using arrow::internal::checked_cast;
+using arrow::internal::MultiplyWithOverflow;
+
+namespace BitUtil = arrow::BitUtil;
+
+namespace parquet {
+namespace {
+inline bool HasSpacedValues(const ColumnDescriptor* descr) {
+ if (descr->max_repetition_level() > 0) {
+ // repeated+flat case
+ return !descr->schema_node()->is_required();
+ } else {
+ // non-repeated+nested case
+ // Find if a node forces nulls in the lowest level along the hierarchy
+ const schema::Node* node = descr->schema_node().get();
+ while (node) {
+ if (node->is_optional()) {
+ return true;
+ }
+ node = node->parent();
+ }
+ return false;
+ }
+}
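+
+// Illustrative sketch (not compiled): how HasSpacedValues behaves for two
+// simple schemas. The node and descriptor values below are hypothetical.
+#if 0
+// Flat required column: no nulls are possible, so reads are dense.
+auto required = schema::PrimitiveNode::Make("a", Repetition::REQUIRED, Type::INT32);
+ColumnDescriptor dense(required, /*max_definition_level=*/0,
+                       /*max_repetition_level=*/0);
+// HasSpacedValues(&dense) -> false
+
+// Flat optional column: definition levels may leave null slots in the output.
+auto optional = schema::PrimitiveNode::Make("b", Repetition::OPTIONAL, Type::INT32);
+ColumnDescriptor spaced(optional, /*max_definition_level=*/1,
+                        /*max_repetition_level=*/0);
+// HasSpacedValues(&spaced) -> true
+#endif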
+} // namespace
+
+LevelDecoder::LevelDecoder() : num_values_remaining_(0) {}
+
+LevelDecoder::~LevelDecoder() {}
+
+int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
+ int num_buffered_values, const uint8_t* data,
+ int32_t data_size) {
+ max_level_ = max_level;
+ int32_t num_bytes = 0;
+ encoding_ = encoding;
+ num_values_remaining_ = num_buffered_values;
+ bit_width_ = BitUtil::Log2(max_level + 1);
+ switch (encoding) {
+ case Encoding::RLE: {
+ if (data_size < 4) {
+ throw ParquetException("Received invalid levels (corrupt data page?)");
+ }
+ num_bytes = ::arrow::util::SafeLoadAs<int32_t>(data);
+ if (num_bytes < 0 || num_bytes > data_size - 4) {
+ throw ParquetException("Received invalid number of bytes (corrupt data page?)");
+ }
+ const uint8_t* decoder_data = data + 4;
+ if (!rle_decoder_) {
+ rle_decoder_.reset(
+ new ::arrow::util::RleDecoder(decoder_data, num_bytes, bit_width_));
+ } else {
+ rle_decoder_->Reset(decoder_data, num_bytes, bit_width_);
+ }
+ return 4 + num_bytes;
+ }
+ case Encoding::BIT_PACKED: {
+ int num_bits = 0;
+ if (MultiplyWithOverflow(num_buffered_values, bit_width_, &num_bits)) {
+ throw ParquetException(
+ "Number of buffered values too large (corrupt data page?)");
+ }
+ num_bytes = static_cast<int32_t>(BitUtil::BytesForBits(num_bits));
+ if (num_bytes < 0 || num_bytes > data_size - 4) {
+ throw ParquetException("Received invalid number of bytes (corrupt data page?)");
+ }
+ if (!bit_packed_decoder_) {
+ bit_packed_decoder_.reset(new ::arrow::BitUtil::BitReader(data, num_bytes));
+ } else {
+ bit_packed_decoder_->Reset(data, num_bytes);
+ }
+ return num_bytes;
+ }
+ default:
+ throw ParquetException("Unknown encoding type for levels.");
+ }
+ return -1;
+}
+
+void LevelDecoder::SetDataV2(int32_t num_bytes, int16_t max_level,
+ int num_buffered_values, const uint8_t* data) {
+ max_level_ = max_level;
+  // Repetition and definition levels always use RLE encoding
+  // in the DataPageV2 format.
+ if (num_bytes < 0) {
+ throw ParquetException("Invalid page header (corrupt data page?)");
+ }
+ encoding_ = Encoding::RLE;
+ num_values_remaining_ = num_buffered_values;
+ bit_width_ = BitUtil::Log2(max_level + 1);
+
+ if (!rle_decoder_) {
+ rle_decoder_.reset(new ::arrow::util::RleDecoder(data, num_bytes, bit_width_));
+ } else {
+ rle_decoder_->Reset(data, num_bytes, bit_width_);
+ }
+}
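+
+// Note (illustrative): in contrast with SetData above, DataPageV1 prefixes the
+// RLE-encoded level run with a 4-byte little-endian byte count, while
+// DataPageV2 records the levels' byte length in the page header, so SetDataV2
+// receives num_bytes directly and reads no prefix.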
+
+int LevelDecoder::Decode(int batch_size, int16_t* levels) {
+ int num_decoded = 0;
+
+ int num_values = std::min(num_values_remaining_, batch_size);
+ if (encoding_ == Encoding::RLE) {
+ num_decoded = rle_decoder_->GetBatch(levels, num_values);
+ } else {
+ num_decoded = bit_packed_decoder_->GetBatch(bit_width_, levels, num_values);
+ }
+ if (num_decoded > 0) {
+ internal::MinMax min_max = internal::FindMinMax(levels, num_decoded);
+ if (ARROW_PREDICT_FALSE(min_max.min < 0 || min_max.max > max_level_)) {
+ std::stringstream ss;
+ ss << "Malformed levels. min: " << min_max.min << " max: " << min_max.max
+ << " out of range. Max Level: " << max_level_;
+ throw ParquetException(ss.str());
+ }
+ }
+ num_values_remaining_ -= num_decoded;
+ return num_decoded;
+}
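+
+// Worked example (illustrative): for max_level = 3, bit_width_ =
+// BitUtil::Log2(3 + 1) = 2, so each level occupies two bits in BIT_PACKED
+// data; any decoded level outside [0, 3] triggers the corrupt-page error
+// above.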
+
+ReaderProperties default_reader_properties() {
+ static ReaderProperties default_reader_properties;
+ return default_reader_properties;
+}
+
+namespace {
+
+// Extracts encoded statistics from V1 and V2 data page headers
+template <typename H>
+EncodedStatistics ExtractStatsFromHeader(const H& header) {
+ EncodedStatistics page_statistics;
+ if (!header.__isset.statistics) {
+ return page_statistics;
+ }
+ const format::Statistics& stats = header.statistics;
+ if (stats.__isset.max) {
+ page_statistics.set_max(stats.max);
+ }
+ if (stats.__isset.min) {
+ page_statistics.set_min(stats.min);
+ }
+ if (stats.__isset.null_count) {
+ page_statistics.set_null_count(stats.null_count);
+ }
+ if (stats.__isset.distinct_count) {
+ page_statistics.set_distinct_count(stats.distinct_count);
+ }
+ return page_statistics;
+}
+
+// ----------------------------------------------------------------------
+// SerializedPageReader deserializes Thrift metadata and pages that have been
+// assembled in a serialized stream for storage in a Parquet file
+
+// This subclass delimits pages appearing in a serialized stream, each preceded
+// by a serialized Thrift format::PageHeader indicating the type of each page
+// and the page metadata.
+class SerializedPageReader : public PageReader {
+ public:
+ SerializedPageReader(std::shared_ptr<ArrowInputStream> stream, int64_t total_num_rows,
+ Compression::type codec, ::arrow::MemoryPool* pool,
+ const CryptoContext* crypto_ctx)
+ : stream_(std::move(stream)),
+ decompression_buffer_(AllocateBuffer(pool, 0)),
+ page_ordinal_(0),
+ seen_num_rows_(0),
+ total_num_rows_(total_num_rows),
+ decryption_buffer_(AllocateBuffer(pool, 0)) {
+ if (crypto_ctx != nullptr) {
+ crypto_ctx_ = *crypto_ctx;
+ InitDecryption();
+ }
+ max_page_header_size_ = kDefaultMaxPageHeaderSize;
+ decompressor_ = GetCodec(codec);
+ }
+
+ // Implement the PageReader interface
+ std::shared_ptr<Page> NextPage() override;
+
+ void set_max_page_header_size(uint32_t size) override { max_page_header_size_ = size; }
+
+ private:
+ void UpdateDecryption(const std::shared_ptr<Decryptor>& decryptor, int8_t module_type,
+ const std::string& page_aad);
+
+ void InitDecryption();
+
+ std::shared_ptr<Buffer> DecompressIfNeeded(std::shared_ptr<Buffer> page_buffer,
+ int compressed_len, int uncompressed_len,
+ int levels_byte_len = 0);
+
+ std::shared_ptr<ArrowInputStream> stream_;
+
+ format::PageHeader current_page_header_;
+ std::shared_ptr<Page> current_page_;
+
+ // Compression codec to use.
+ std::unique_ptr<::arrow::util::Codec> decompressor_;
+ std::shared_ptr<ResizableBuffer> decompression_buffer_;
+
+  // The fields below are used to calculate the AAD (additional authenticated data)
+  // suffix, which is part of Parquet Modular Encryption.
+  // The AAD suffix for a Parquet module is built internally by
+  // concatenating different parts, including the row group ordinal,
+  // column ordinal and page ordinal.
+  // Please refer to the encryption specification for more details:
+  // https://github.com/apache/parquet-format/blob/encryption/Encryption.md#44-additional-authenticated-data
+
+ // The ordinal fields in the context below are used for AAD suffix calculation.
+ CryptoContext crypto_ctx_;
+ int16_t page_ordinal_; // page ordinal does not count the dictionary page
+
+  // Maximum allowed page header size
+ uint32_t max_page_header_size_;
+
+ // Number of rows read in data pages so far
+ int64_t seen_num_rows_;
+
+ // Number of rows in all the data pages
+ int64_t total_num_rows_;
+
+  // data_page_aad_ and data_page_header_aad_ contain the AAD for the data pages
+  // and data page headers of a single column, respectively.
+  // When calculating the AAD for different pages within a single column, only
+  // the page ordinal part of the AAD is updated.
+ std::string data_page_aad_;
+ std::string data_page_header_aad_;
+ // Encryption
+ std::shared_ptr<ResizableBuffer> decryption_buffer_;
+};
+
+void SerializedPageReader::InitDecryption() {
+ // Prepare the AAD for quick update later.
+ if (crypto_ctx_.data_decryptor != nullptr) {
+ DCHECK(!crypto_ctx_.data_decryptor->file_aad().empty());
+ data_page_aad_ = encryption::CreateModuleAad(
+ crypto_ctx_.data_decryptor->file_aad(), encryption::kDataPage,
+ crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, kNonPageOrdinal);
+ }
+ if (crypto_ctx_.meta_decryptor != nullptr) {
+ DCHECK(!crypto_ctx_.meta_decryptor->file_aad().empty());
+ data_page_header_aad_ = encryption::CreateModuleAad(
+ crypto_ctx_.meta_decryptor->file_aad(), encryption::kDataPageHeader,
+ crypto_ctx_.row_group_ordinal, crypto_ctx_.column_ordinal, kNonPageOrdinal);
+ }
+}
+
+void SerializedPageReader::UpdateDecryption(const std::shared_ptr<Decryptor>& decryptor,
+ int8_t module_type,
+ const std::string& page_aad) {
+ DCHECK(decryptor != nullptr);
+ if (crypto_ctx_.start_decrypt_with_dictionary_page) {
+ std::string aad = encryption::CreateModuleAad(
+ decryptor->file_aad(), module_type, crypto_ctx_.row_group_ordinal,
+ crypto_ctx_.column_ordinal, kNonPageOrdinal);
+ decryptor->UpdateAad(aad);
+ } else {
+ encryption::QuickUpdatePageAad(page_aad, page_ordinal_);
+ decryptor->UpdateAad(page_aad);
+ }
+}
+
+std::shared_ptr<Page> SerializedPageReader::NextPage() {
+  // Loop here because there may be unhandled page types that we skip until
+  // finding a page that we know how to handle
+
+ while (seen_num_rows_ < total_num_rows_) {
+ uint32_t header_size = 0;
+ uint32_t allowed_page_size = kDefaultPageHeaderSize;
+
+    // Page headers can be very large because of page statistics.
+    // We try to deserialize progressively larger buffers until we reach
+    // the maximum allowed header size.
+ while (true) {
+ PARQUET_ASSIGN_OR_THROW(auto view, stream_->Peek(allowed_page_size));
+ if (view.size() == 0) {
+ return std::shared_ptr<Page>(nullptr);
+ }
+
+      // This is used as an input by DeserializeThriftMsg, which then sets it
+      // to the actual header size
+ header_size = static_cast<uint32_t>(view.size());
+ try {
+ if (crypto_ctx_.meta_decryptor != nullptr) {
+ UpdateDecryption(crypto_ctx_.meta_decryptor, encryption::kDictionaryPageHeader,
+ data_page_header_aad_);
+ }
+ DeserializeThriftMsg(reinterpret_cast<const uint8_t*>(view.data()), &header_size,
+ &current_page_header_, crypto_ctx_.meta_decryptor);
+ break;
+ } catch (std::exception& e) {
+ // Failed to deserialize. Double the allowed page header size and try again
+ std::stringstream ss;
+ ss << e.what();
+ allowed_page_size *= 2;
+ if (allowed_page_size > max_page_header_size_) {
+ ss << "Deserializing page header failed.\n";
+ throw ParquetException(ss.str());
+ }
+ }
+ }
+ // Advance the stream offset
+ PARQUET_THROW_NOT_OK(stream_->Advance(header_size));
+
+ int compressed_len = current_page_header_.compressed_page_size;
+ int uncompressed_len = current_page_header_.uncompressed_page_size;
+ if (compressed_len < 0 || uncompressed_len < 0) {
+ throw ParquetException("Invalid page header");
+ }
+
+ if (crypto_ctx_.data_decryptor != nullptr) {
+ UpdateDecryption(crypto_ctx_.data_decryptor, encryption::kDictionaryPage,
+ data_page_aad_);
+ }
+
+ // Read the compressed data page.
+ PARQUET_ASSIGN_OR_THROW(auto page_buffer, stream_->Read(compressed_len));
+ if (page_buffer->size() != compressed_len) {
+ std::stringstream ss;
+ ss << "Page was smaller (" << page_buffer->size() << ") than expected ("
+ << compressed_len << ")";
+ ParquetException::EofException(ss.str());
+ }
+
+ // Decrypt it if we need to
+ if (crypto_ctx_.data_decryptor != nullptr) {
+ PARQUET_THROW_NOT_OK(decryption_buffer_->Resize(
+ compressed_len - crypto_ctx_.data_decryptor->CiphertextSizeDelta(), false));
+ compressed_len = crypto_ctx_.data_decryptor->Decrypt(
+ page_buffer->data(), compressed_len, decryption_buffer_->mutable_data());
+
+ page_buffer = decryption_buffer_;
+ }
+
+ const PageType::type page_type = LoadEnumSafe(&current_page_header_.type);
+
+ if (page_type == PageType::DICTIONARY_PAGE) {
+ crypto_ctx_.start_decrypt_with_dictionary_page = false;
+ const format::DictionaryPageHeader& dict_header =
+ current_page_header_.dictionary_page_header;
+
+ bool is_sorted = dict_header.__isset.is_sorted ? dict_header.is_sorted : false;
+ if (dict_header.num_values < 0) {
+ throw ParquetException("Invalid page header (negative number of values)");
+ }
+
+ // Uncompress if needed
+ page_buffer =
+ DecompressIfNeeded(std::move(page_buffer), compressed_len, uncompressed_len);
+
+ return std::make_shared<DictionaryPage>(page_buffer, dict_header.num_values,
+ LoadEnumSafe(&dict_header.encoding),
+ is_sorted);
+ } else if (page_type == PageType::DATA_PAGE) {
+ ++page_ordinal_;
+ const format::DataPageHeader& header = current_page_header_.data_page_header;
+
+ if (header.num_values < 0) {
+ throw ParquetException("Invalid page header (negative number of values)");
+ }
+ EncodedStatistics page_statistics = ExtractStatsFromHeader(header);
+ seen_num_rows_ += header.num_values;
+
+ // Uncompress if needed
+ page_buffer =
+ DecompressIfNeeded(std::move(page_buffer), compressed_len, uncompressed_len);
+
+ return std::make_shared<DataPageV1>(page_buffer, header.num_values,
+ LoadEnumSafe(&header.encoding),
+ LoadEnumSafe(&header.definition_level_encoding),
+ LoadEnumSafe(&header.repetition_level_encoding),
+ uncompressed_len, page_statistics);
+ } else if (page_type == PageType::DATA_PAGE_V2) {
+ ++page_ordinal_;
+ const format::DataPageHeaderV2& header = current_page_header_.data_page_header_v2;
+
+ if (header.num_values < 0) {
+ throw ParquetException("Invalid page header (negative number of values)");
+ }
+ if (header.definition_levels_byte_length < 0 ||
+ header.repetition_levels_byte_length < 0) {
+ throw ParquetException("Invalid page header (negative levels byte length)");
+ }
+ bool is_compressed = header.__isset.is_compressed ? header.is_compressed : false;
+ EncodedStatistics page_statistics = ExtractStatsFromHeader(header);
+ seen_num_rows_ += header.num_values;
+
+ // Uncompress if needed
+ int levels_byte_len;
+ if (AddWithOverflow(header.definition_levels_byte_length,
+ header.repetition_levels_byte_length, &levels_byte_len)) {
+ throw ParquetException("Levels size too large (corrupt file?)");
+ }
+ // DecompressIfNeeded doesn't take `is_compressed` into account as
+ // it's page type-agnostic.
+ if (is_compressed) {
+ page_buffer = DecompressIfNeeded(std::move(page_buffer), compressed_len,
+ uncompressed_len, levels_byte_len);
+ }
+
+ return std::make_shared<DataPageV2>(
+ page_buffer, header.num_values, header.num_nulls, header.num_rows,
+ LoadEnumSafe(&header.encoding), header.definition_levels_byte_length,
+ header.repetition_levels_byte_length, uncompressed_len, is_compressed,
+ page_statistics);
+ } else {
+ // We don't know what this page type is. We're allowed to skip non-data
+ // pages.
+ continue;
+ }
+ }
+ return std::shared_ptr<Page>(nullptr);
+}
+
+std::shared_ptr<Buffer> SerializedPageReader::DecompressIfNeeded(
+ std::shared_ptr<Buffer> page_buffer, int compressed_len, int uncompressed_len,
+ int levels_byte_len) {
+ if (decompressor_ == nullptr) {
+ return page_buffer;
+ }
+ if (compressed_len < levels_byte_len || uncompressed_len < levels_byte_len) {
+ throw ParquetException("Invalid page header");
+ }
+
+ // Grow the uncompressed buffer if we need to.
+ if (uncompressed_len > static_cast<int>(decompression_buffer_->size())) {
+ PARQUET_THROW_NOT_OK(decompression_buffer_->Resize(uncompressed_len, false));
+ }
+
+ if (levels_byte_len > 0) {
+ // First copy the levels as-is
+ uint8_t* decompressed = decompression_buffer_->mutable_data();
+ memcpy(decompressed, page_buffer->data(), levels_byte_len);
+ }
+
+ // Decompress the values
+ PARQUET_THROW_NOT_OK(decompressor_->Decompress(
+ compressed_len - levels_byte_len, page_buffer->data() + levels_byte_len,
+ uncompressed_len - levels_byte_len,
+ decompression_buffer_->mutable_data() + levels_byte_len));
+
+ return decompression_buffer_;
+}
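+
+// Worked example (illustrative): for a DataPageV2 with uncompressed_len = 1000
+// and levels_byte_len = 100, the first 100 bytes (repetition + definition
+// levels) are copied verbatim since V2 stores levels uncompressed, and only
+// the remaining compressed bytes are decompressed into the last 900 bytes of
+// the output buffer.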
+
+} // namespace
+
+std::unique_ptr<PageReader> PageReader::Open(std::shared_ptr<ArrowInputStream> stream,
+ int64_t total_num_rows,
+ Compression::type codec,
+ ::arrow::MemoryPool* pool,
+ const CryptoContext* ctx) {
+ return std::unique_ptr<PageReader>(
+ new SerializedPageReader(std::move(stream), total_num_rows, codec, pool, ctx));
+}
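+
+// Minimal usage sketch (illustrative, not compiled; `stream` and `num_rows`
+// are hypothetical and would come from a column chunk's byte range and
+// metadata):
+#if 0
+std::unique_ptr<PageReader> pager = PageReader::Open(
+    stream, num_rows, Compression::SNAPPY, ::arrow::default_memory_pool());
+while (std::shared_ptr<Page> page = pager->NextPage()) {
+  // A dictionary page, if any, precedes the data pages of the column chunk.
+}
+#endif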
+
+namespace {
+
+// ----------------------------------------------------------------------
+// Impl base class for TypedColumnReader and RecordReader
+
+// PLAIN_DICTIONARY is deprecated, but was formerly used as a dictionary
+// index encoding.
+static bool IsDictionaryIndexEncoding(const Encoding::type& e) {
+ return e == Encoding::RLE_DICTIONARY || e == Encoding::PLAIN_DICTIONARY;
+}
+
+template <typename DType>
+class ColumnReaderImplBase {
+ public:
+ using T = typename DType::c_type;
+
+ ColumnReaderImplBase(const ColumnDescriptor* descr, ::arrow::MemoryPool* pool)
+ : descr_(descr),
+ max_def_level_(descr->max_definition_level()),
+ max_rep_level_(descr->max_repetition_level()),
+ num_buffered_values_(0),
+ num_decoded_values_(0),
+ pool_(pool),
+ current_decoder_(nullptr),
+ current_encoding_(Encoding::UNKNOWN) {}
+
+ virtual ~ColumnReaderImplBase() = default;
+
+ protected:
+ // Read up to batch_size values from the current data page into the
+ // pre-allocated memory T*
+ //
+ // @returns: the number of values read into the out buffer
+ int64_t ReadValues(int64_t batch_size, T* out) {
+ int64_t num_decoded = current_decoder_->Decode(out, static_cast<int>(batch_size));
+ return num_decoded;
+ }
+
+ // Read up to batch_size values from the current data page into the
+ // pre-allocated memory T*, leaving spaces for null entries according
+ // to the def_levels.
+ //
+ // @returns: the number of values read into the out buffer
+ int64_t ReadValuesSpaced(int64_t batch_size, T* out, int64_t null_count,
+ uint8_t* valid_bits, int64_t valid_bits_offset) {
+ return current_decoder_->DecodeSpaced(out, static_cast<int>(batch_size),
+ static_cast<int>(null_count), valid_bits,
+ valid_bits_offset);
+ }
+
+ // Read multiple definition levels into preallocated memory
+ //
+ // Returns the number of decoded definition levels
+ int64_t ReadDefinitionLevels(int64_t batch_size, int16_t* levels) {
+ if (max_def_level_ == 0) {
+ return 0;
+ }
+ return definition_level_decoder_.Decode(static_cast<int>(batch_size), levels);
+ }
+
+ bool HasNextInternal() {
+ // Either there is no data page available yet, or the data page has been
+ // exhausted
+ if (num_buffered_values_ == 0 || num_decoded_values_ == num_buffered_values_) {
+ if (!ReadNewPage() || num_buffered_values_ == 0) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Read multiple repetition levels into preallocated memory
+ // Returns the number of decoded repetition levels
+ int64_t ReadRepetitionLevels(int64_t batch_size, int16_t* levels) {
+ if (max_rep_level_ == 0) {
+ return 0;
+ }
+ return repetition_level_decoder_.Decode(static_cast<int>(batch_size), levels);
+ }
+
+ // Advance to the next data page
+ bool ReadNewPage() {
+ // Loop until we find the next data page.
+ while (true) {
+ current_page_ = pager_->NextPage();
+ if (!current_page_) {
+ // EOS
+ return false;
+ }
+
+ if (current_page_->type() == PageType::DICTIONARY_PAGE) {
+ ConfigureDictionary(static_cast<const DictionaryPage*>(current_page_.get()));
+ continue;
+ } else if (current_page_->type() == PageType::DATA_PAGE) {
+ const auto page = std::static_pointer_cast<DataPageV1>(current_page_);
+ const int64_t levels_byte_size = InitializeLevelDecoders(
+ *page, page->repetition_level_encoding(), page->definition_level_encoding());
+ InitializeDataDecoder(*page, levels_byte_size);
+ return true;
+ } else if (current_page_->type() == PageType::DATA_PAGE_V2) {
+ const auto page = std::static_pointer_cast<DataPageV2>(current_page_);
+ int64_t levels_byte_size = InitializeLevelDecodersV2(*page);
+ InitializeDataDecoder(*page, levels_byte_size);
+ return true;
+ } else {
+ // We don't know what this page type is. We're allowed to skip non-data
+ // pages.
+ continue;
+ }
+ }
+ return true;
+ }
+
+ void ConfigureDictionary(const DictionaryPage* page) {
+ int encoding = static_cast<int>(page->encoding());
+ if (page->encoding() == Encoding::PLAIN_DICTIONARY ||
+ page->encoding() == Encoding::PLAIN) {
+ encoding = static_cast<int>(Encoding::RLE_DICTIONARY);
+ }
+
+ auto it = decoders_.find(encoding);
+ if (it != decoders_.end()) {
+ throw ParquetException("Column cannot have more than one dictionary.");
+ }
+
+ if (page->encoding() == Encoding::PLAIN_DICTIONARY ||
+ page->encoding() == Encoding::PLAIN) {
+ auto dictionary = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
+ dictionary->SetData(page->num_values(), page->data(), page->size());
+
+ // The dictionary is fully decoded during DictionaryDecoder::Init, so the
+ // DictionaryPage buffer is no longer required after this step
+ //
+ // TODO(wesm): investigate whether this all-or-nothing decoding of the
+ // dictionary makes sense and whether performance can be improved
+
+ std::unique_ptr<DictDecoder<DType>> decoder = MakeDictDecoder<DType>(descr_, pool_);
+ decoder->SetDict(dictionary.get());
+ decoders_[encoding] =
+ std::unique_ptr<DecoderType>(dynamic_cast<DecoderType*>(decoder.release()));
+ } else {
+ ParquetException::NYI("only plain dictionary encoding has been implemented");
+ }
+
+ new_dictionary_ = true;
+ current_decoder_ = decoders_[encoding].get();
+ DCHECK(current_decoder_);
+ }
+
+ // Initialize repetition and definition level decoders on the next data page.
+
+ // If the data page includes repetition and definition levels, we
+ // initialize the level decoders and return the number of encoded level bytes.
+ // The return value helps determine the number of bytes in the encoded data.
+ int64_t InitializeLevelDecoders(const DataPage& page,
+ Encoding::type repetition_level_encoding,
+ Encoding::type definition_level_encoding) {
+ // Read a data page.
+ num_buffered_values_ = page.num_values();
+
+ // Have not decoded any values from the data page yet
+ num_decoded_values_ = 0;
+
+ const uint8_t* buffer = page.data();
+ int32_t levels_byte_size = 0;
+ int32_t max_size = page.size();
+
+    // Data page layout: repetition levels - definition levels - encoded values.
+    // Levels are encoded as RLE or bit-packed.
+ // Init repetition levels
+ if (max_rep_level_ > 0) {
+ int32_t rep_levels_bytes = repetition_level_decoder_.SetData(
+ repetition_level_encoding, max_rep_level_,
+ static_cast<int>(num_buffered_values_), buffer, max_size);
+ buffer += rep_levels_bytes;
+ levels_byte_size += rep_levels_bytes;
+ max_size -= rep_levels_bytes;
+ }
+    // TODO: figure out a way to set max_def_level_ to 0
+    // if the initial value is invalid
+
+ // Init definition levels
+ if (max_def_level_ > 0) {
+ int32_t def_levels_bytes = definition_level_decoder_.SetData(
+ definition_level_encoding, max_def_level_,
+ static_cast<int>(num_buffered_values_), buffer, max_size);
+ levels_byte_size += def_levels_bytes;
+ max_size -= def_levels_bytes;
+ }
+
+ return levels_byte_size;
+ }
+
+ int64_t InitializeLevelDecodersV2(const DataPageV2& page) {
+ // Read a data page.
+ num_buffered_values_ = page.num_values();
+
+ // Have not decoded any values from the data page yet
+ num_decoded_values_ = 0;
+ const uint8_t* buffer = page.data();
+
+ const int64_t total_levels_length =
+ static_cast<int64_t>(page.repetition_levels_byte_length()) +
+ page.definition_levels_byte_length();
+
+ if (total_levels_length > page.size()) {
+ throw ParquetException("Data page too small for levels (corrupt header?)");
+ }
+
+ if (max_rep_level_ > 0) {
+ repetition_level_decoder_.SetDataV2(page.repetition_levels_byte_length(),
+ max_rep_level_,
+ static_cast<int>(num_buffered_values_), buffer);
+ buffer += page.repetition_levels_byte_length();
+ }
+
+ if (max_def_level_ > 0) {
+ definition_level_decoder_.SetDataV2(page.definition_levels_byte_length(),
+ max_def_level_,
+ static_cast<int>(num_buffered_values_), buffer);
+ }
+
+ return total_levels_length;
+ }
+
+ // Get a decoder object for this page or create a new decoder if this is the
+ // first page with this encoding.
+ void InitializeDataDecoder(const DataPage& page, int64_t levels_byte_size) {
+ const uint8_t* buffer = page.data() + levels_byte_size;
+ const int64_t data_size = page.size() - levels_byte_size;
+
+ if (data_size < 0) {
+ throw ParquetException("Page smaller than size of encoded levels");
+ }
+
+ Encoding::type encoding = page.encoding();
+
+ if (IsDictionaryIndexEncoding(encoding)) {
+ encoding = Encoding::RLE_DICTIONARY;
+ }
+
+ auto it = decoders_.find(static_cast<int>(encoding));
+ if (it != decoders_.end()) {
+ DCHECK(it->second.get() != nullptr);
+ if (encoding == Encoding::RLE_DICTIONARY) {
+ DCHECK(current_decoder_->encoding() == Encoding::RLE_DICTIONARY);
+ }
+ current_decoder_ = it->second.get();
+ } else {
+ switch (encoding) {
+ case Encoding::PLAIN: {
+ auto decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
+ current_decoder_ = decoder.get();
+ decoders_[static_cast<int>(encoding)] = std::move(decoder);
+ break;
+ }
+ case Encoding::BYTE_STREAM_SPLIT: {
+ auto decoder = MakeTypedDecoder<DType>(Encoding::BYTE_STREAM_SPLIT, descr_);
+ current_decoder_ = decoder.get();
+ decoders_[static_cast<int>(encoding)] = std::move(decoder);
+ break;
+ }
+ case Encoding::RLE_DICTIONARY:
+ throw ParquetException("Dictionary page must be before data page.");
+
+ case Encoding::DELTA_BINARY_PACKED:
+ case Encoding::DELTA_LENGTH_BYTE_ARRAY:
+ case Encoding::DELTA_BYTE_ARRAY:
+ ParquetException::NYI("Unsupported encoding");
+
+ default:
+ throw ParquetException("Unknown encoding type.");
+ }
+ }
+ current_encoding_ = encoding;
+ current_decoder_->SetData(static_cast<int>(num_buffered_values_), buffer,
+ static_cast<int>(data_size));
+ }
+
+ const ColumnDescriptor* descr_;
+ const int16_t max_def_level_;
+ const int16_t max_rep_level_;
+
+ std::unique_ptr<PageReader> pager_;
+ std::shared_ptr<Page> current_page_;
+
+  // Not set if the full schema for this field has no optional or repeated elements
+ LevelDecoder definition_level_decoder_;
+
+ // Not set for flat schemas.
+ LevelDecoder repetition_level_decoder_;
+
+ // The total number of values stored in the data page. This is the maximum of
+ // the number of encoded definition levels or encoded values. For
+ // non-repeated, required columns, this is equal to the number of encoded
+ // values. For repeated or optional values, there may be fewer data values
+ // than levels, and this tells you how many encoded levels there are in that
+ // case.
+ int64_t num_buffered_values_;
+
+ // The number of values from the current data page that have been decoded
+ // into memory
+ int64_t num_decoded_values_;
+
+ ::arrow::MemoryPool* pool_;
+
+ using DecoderType = TypedDecoder<DType>;
+ DecoderType* current_decoder_;
+ Encoding::type current_encoding_;
+
+ /// Flag to signal when a new dictionary has been set, for the benefit of
+ /// DictionaryRecordReader
+ bool new_dictionary_;
+
+ // The exposed encoding
+ ExposedEncoding exposed_encoding_ = ExposedEncoding::NO_ENCODING;
+
+ // Map of encoding type to the respective decoder object. For example, a
+ // column chunk's data pages may include both dictionary-encoded and
+ // plain-encoded data.
+ std::unordered_map<int, std::unique_ptr<DecoderType>> decoders_;
+
+ void ConsumeBufferedValues(int64_t num_values) { num_decoded_values_ += num_values; }
+};
+
+// ----------------------------------------------------------------------
+// TypedColumnReader implementations
+
+template <typename DType>
+class TypedColumnReaderImpl : public TypedColumnReader<DType>,
+ public ColumnReaderImplBase<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ TypedColumnReaderImpl(const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
+ ::arrow::MemoryPool* pool)
+ : ColumnReaderImplBase<DType>(descr, pool) {
+ this->pager_ = std::move(pager);
+ }
+
+ bool HasNext() override { return this->HasNextInternal(); }
+
+ int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ T* values, int64_t* values_read) override;
+
+ int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ T* values, uint8_t* valid_bits, int64_t valid_bits_offset,
+ int64_t* levels_read, int64_t* values_read,
+ int64_t* null_count) override;
+
+ int64_t Skip(int64_t num_rows_to_skip) override;
+
+ Type::type type() const override { return this->descr_->physical_type(); }
+
+ const ColumnDescriptor* descr() const override { return this->descr_; }
+
+  ExposedEncoding GetExposedEncoding() override { return this->exposed_encoding_; }
+
+ int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, int32_t* indices,
+ int64_t* indices_read, const T** dict,
+ int32_t* dict_len) override;
+
+ protected:
+ void SetExposedEncoding(ExposedEncoding encoding) override {
+ this->exposed_encoding_ = encoding;
+ }
+
+ private:
+  // Read dictionary indices. Similar to ReadValues, but decodes data to
+  // dictionary indices instead of values.
+  // This function is called only by ReadBatchWithDictionary().
+ int64_t ReadDictionaryIndices(int64_t indices_to_read, int32_t* indices) {
+ auto decoder = dynamic_cast<DictDecoder<DType>*>(this->current_decoder_);
+ return decoder->DecodeIndices(static_cast<int>(indices_to_read), indices);
+ }
+
+  // Get the dictionary. The dictionary should have been set by SetDict(). It is
+  // owned by the internal decoder and is destroyed when the reader is destroyed.
+  // This function is called only by ReadBatchWithDictionary() after the
+  // dictionary has been configured.
+ void GetDictionary(const T** dictionary, int32_t* dictionary_length) {
+ auto decoder = dynamic_cast<DictDecoder<DType>*>(this->current_decoder_);
+ decoder->GetDictionary(dictionary, dictionary_length);
+ }
+
+ // Read definition and repetition levels. Also return the number of definition levels
+ // and number of values to read. This function is called before reading values.
+ void ReadLevels(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ int64_t* num_def_levels, int64_t* values_to_read) {
+ batch_size =
+ std::min(batch_size, this->num_buffered_values_ - this->num_decoded_values_);
+
+ // If the field is required and non-repeated, there are no definition levels
+ if (this->max_def_level_ > 0 && def_levels != nullptr) {
+ *num_def_levels = this->ReadDefinitionLevels(batch_size, def_levels);
+ // TODO(wesm): this tallying of values-to-decode can be performed with better
+ // cache-efficiency if fused with the level decoding.
+ for (int64_t i = 0; i < *num_def_levels; ++i) {
+ if (def_levels[i] == this->max_def_level_) {
+ ++(*values_to_read);
+ }
+ }
+ } else {
+ // Required field, read all values
+ *values_to_read = batch_size;
+ }
+
+ // Not present for non-repeated fields
+ if (this->max_rep_level_ > 0 && rep_levels != nullptr) {
+ int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels);
+ if (def_levels != nullptr && *num_def_levels != num_rep_levels) {
+ throw ParquetException("Number of decoded rep / def levels did not match");
+ }
+ }
+ }
+};
+
+template <typename DType>
+int64_t TypedColumnReaderImpl<DType>::ReadBatchWithDictionary(
+ int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, int32_t* indices,
+ int64_t* indices_read, const T** dict, int32_t* dict_len) {
+ bool has_dict_output = dict != nullptr && dict_len != nullptr;
+  // Use similar logic to ReadValues to get pages.
+ if (!HasNext()) {
+ *indices_read = 0;
+ if (has_dict_output) {
+ *dict = nullptr;
+ *dict_len = 0;
+ }
+ return 0;
+ }
+
+ // Verify the current data page is dictionary encoded.
+ if (this->current_encoding_ != Encoding::RLE_DICTIONARY) {
+ std::stringstream ss;
+ ss << "Data page is not dictionary encoded. Encoding: "
+ << EncodingToString(this->current_encoding_);
+ throw ParquetException(ss.str());
+ }
+
+ // Get dictionary pointer and length.
+ if (has_dict_output) {
+ GetDictionary(dict, dict_len);
+ }
+
+  // Use similar logic to ReadValues to get def levels and rep levels.
+ int64_t num_def_levels = 0;
+ int64_t indices_to_read = 0;
+ ReadLevels(batch_size, def_levels, rep_levels, &num_def_levels, &indices_to_read);
+
+ // Read dictionary indices.
+ *indices_read = ReadDictionaryIndices(indices_to_read, indices);
+ int64_t total_indices = std::max(num_def_levels, *indices_read);
+ this->ConsumeBufferedValues(total_indices);
+
+ return total_indices;
+}
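+
+// Usage sketch (illustrative, not compiled): reading dictionary indices
+// instead of decoded values for an INT32 column. All buffer names are
+// hypothetical, and the current page must be dictionary-encoded or the call
+// throws.
+#if 0
+std::vector<int32_t> indices(batch_size);
+std::vector<int16_t> def_levels(batch_size), rep_levels(batch_size);
+int64_t indices_read = 0;
+const int32_t* dict = nullptr;
+int32_t dict_len = 0;
+reader->ReadBatchWithDictionary(batch_size, def_levels.data(), rep_levels.data(),
+                                indices.data(), &indices_read, &dict, &dict_len);
+// Each index satisfies 0 <= indices[i] < dict_len; dict[indices[i]] is the value.
+#endif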
+
+template <typename DType>
+int64_t TypedColumnReaderImpl<DType>::ReadBatch(int64_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, T* values,
+ int64_t* values_read) {
+ // HasNext invokes ReadNewPage
+ if (!HasNext()) {
+ *values_read = 0;
+ return 0;
+ }
+
+ // TODO(wesm): keep reading data pages until batch_size is reached, or the
+ // row group is finished
+ int64_t num_def_levels = 0;
+ int64_t values_to_read = 0;
+ ReadLevels(batch_size, def_levels, rep_levels, &num_def_levels, &values_to_read);
+
+ *values_read = this->ReadValues(values_to_read, values);
+ int64_t total_values = std::max(num_def_levels, *values_read);
+ this->ConsumeBufferedValues(total_values);
+
+ return total_values;
+}
+
+template <typename DType>
+int64_t TypedColumnReaderImpl<DType>::ReadBatchSpaced(
+ int64_t batch_size, int16_t* def_levels, int16_t* rep_levels, T* values,
+ uint8_t* valid_bits, int64_t valid_bits_offset, int64_t* levels_read,
+ int64_t* values_read, int64_t* null_count_out) {
+ // HasNext invokes ReadNewPage
+ if (!HasNext()) {
+ *levels_read = 0;
+ *values_read = 0;
+ *null_count_out = 0;
+ return 0;
+ }
+
+ int64_t total_values;
+ // TODO(wesm): keep reading data pages until batch_size is reached, or the
+ // row group is finished
+ batch_size =
+ std::min(batch_size, this->num_buffered_values_ - this->num_decoded_values_);
+
+ // If the field is required and non-repeated, there are no definition levels
+ if (this->max_def_level_ > 0) {
+ int64_t num_def_levels = this->ReadDefinitionLevels(batch_size, def_levels);
+
+ // Not present for non-repeated fields
+ if (this->max_rep_level_ > 0) {
+ int64_t num_rep_levels = this->ReadRepetitionLevels(batch_size, rep_levels);
+ if (num_def_levels != num_rep_levels) {
+ throw ParquetException("Number of decoded rep / def levels did not match");
+ }
+ }
+
+ const bool has_spaced_values = HasSpacedValues(this->descr_);
+ int64_t null_count = 0;
+ if (!has_spaced_values) {
+ int values_to_read = 0;
+ for (int64_t i = 0; i < num_def_levels; ++i) {
+ if (def_levels[i] == this->max_def_level_) {
+ ++values_to_read;
+ }
+ }
+ total_values = this->ReadValues(values_to_read, values);
+ ::arrow::BitUtil::SetBitsTo(valid_bits, valid_bits_offset,
+ /*length=*/total_values,
+ /*bits_are_set=*/true);
+ *values_read = total_values;
+ } else {
+ internal::LevelInfo info;
+ info.repeated_ancestor_def_level = this->max_def_level_ - 1;
+ info.def_level = this->max_def_level_;
+ info.rep_level = this->max_rep_level_;
+ internal::ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = num_def_levels;
+ validity_io.valid_bits = valid_bits;
+ validity_io.valid_bits_offset = valid_bits_offset;
+ validity_io.null_count = null_count;
+ validity_io.values_read = *values_read;
+
+ internal::DefLevelsToBitmap(def_levels, num_def_levels, info, &validity_io);
+ null_count = validity_io.null_count;
+ *values_read = validity_io.values_read;
+
+ total_values =
+ this->ReadValuesSpaced(*values_read, values, static_cast<int>(null_count),
+ valid_bits, valid_bits_offset);
+ }
+ *levels_read = num_def_levels;
+ *null_count_out = null_count;
+
+ } else {
+ // Required field, read all values
+ total_values = this->ReadValues(batch_size, values);
+ ::arrow::BitUtil::SetBitsTo(valid_bits, valid_bits_offset,
+ /*length=*/total_values,
+ /*bits_are_set=*/true);
+ *null_count_out = 0;
+ *values_read = total_values;
+ *levels_read = total_values;
+ }
+
+ this->ConsumeBufferedValues(*levels_read);
+ return total_values;
+}
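+
+// Worked example (illustrative): for a flat optional column with
+// max_def_level_ = 1 and def_levels = {1, 0, 1}, ReadBatchSpaced writes the
+// two present values into slots 0 and 2, leaves slot 1 undefined, sets
+// valid_bits to 0b101 and *null_count_out to 1, and returns 3 (the number of
+// value slots including nulls), whereas ReadBatch would pack the two values
+// densely.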
+
+template <typename DType>
+int64_t TypedColumnReaderImpl<DType>::Skip(int64_t num_rows_to_skip) {
+ int64_t rows_to_skip = num_rows_to_skip;
+ while (HasNext() && rows_to_skip > 0) {
+ // If the number of rows to skip is more than the number of undecoded values, skip the
+ // Page.
+ if (rows_to_skip > (this->num_buffered_values_ - this->num_decoded_values_)) {
+ rows_to_skip -= this->num_buffered_values_ - this->num_decoded_values_;
+ this->num_decoded_values_ = this->num_buffered_values_;
+ } else {
+ // We need to read this Page
+ // Jump to the right offset in the Page
+ int64_t batch_size = 1024; // ReadBatch with a smaller memory footprint
+ int64_t values_read = 0;
+
+ // This will be enough scratch space to accommodate 16-bit levels or any
+ // value type
+ std::shared_ptr<ResizableBuffer> scratch = AllocateBuffer(
+ this->pool_, batch_size * type_traits<DType::type_num>::value_byte_size);
+
+ do {
+ batch_size = std::min(batch_size, rows_to_skip);
+ values_read =
+ ReadBatch(static_cast<int>(batch_size),
+ reinterpret_cast<int16_t*>(scratch->mutable_data()),
+ reinterpret_cast<int16_t*>(scratch->mutable_data()),
+ reinterpret_cast<T*>(scratch->mutable_data()), &values_read);
+ rows_to_skip -= values_read;
+ } while (values_read > 0 && rows_to_skip > 0);
+ }
+ }
+ return num_rows_to_skip - rows_to_skip;
+}
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// Dynamic column reader constructor
+
+std::shared_ptr<ColumnReader> ColumnReader::Make(const ColumnDescriptor* descr,
+ std::unique_ptr<PageReader> pager,
+ MemoryPool* pool) {
+ switch (descr->physical_type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedColumnReaderImpl<BooleanType>>(descr, std::move(pager),
+ pool);
+ case Type::INT32:
+ return std::make_shared<TypedColumnReaderImpl<Int32Type>>(descr, std::move(pager),
+ pool);
+ case Type::INT64:
+ return std::make_shared<TypedColumnReaderImpl<Int64Type>>(descr, std::move(pager),
+ pool);
+ case Type::INT96:
+ return std::make_shared<TypedColumnReaderImpl<Int96Type>>(descr, std::move(pager),
+ pool);
+ case Type::FLOAT:
+ return std::make_shared<TypedColumnReaderImpl<FloatType>>(descr, std::move(pager),
+ pool);
+ case Type::DOUBLE:
+ return std::make_shared<TypedColumnReaderImpl<DoubleType>>(descr, std::move(pager),
+ pool);
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedColumnReaderImpl<ByteArrayType>>(
+ descr, std::move(pager), pool);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedColumnReaderImpl<FLBAType>>(descr, std::move(pager),
+ pool);
+ default:
+ ParquetException::NYI("type reader not implemented");
+ }
+ // Unreachable code, but suppress compiler warning
+ return std::shared_ptr<ColumnReader>(nullptr);
+}
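+
+// End-to-end sketch (illustrative, not compiled; `descr` and `pager` are
+// hypothetical and would come from the file's SchemaDescriptor and
+// PageReader::Open):
+#if 0
+std::shared_ptr<ColumnReader> col =
+    ColumnReader::Make(descr, std::move(pager), ::arrow::default_memory_pool());
+auto* reader = static_cast<Int64Reader*>(col.get());
+std::vector<int64_t> values(1024);
+std::vector<int16_t> defs(1024), reps(1024);
+int64_t values_read = 0;
+while (reader->HasNext()) {
+  reader->ReadBatch(values.size(), defs.data(), reps.data(), values.data(),
+                    &values_read);
+}
+#endif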
+
+// ----------------------------------------------------------------------
+// RecordReader
+
+namespace internal {
+namespace {
+
+// The minimum number of repetition/definition levels to decode at a time, for
+// better vectorized performance when doing many smaller record reads
+constexpr int64_t kMinLevelBatchSize = 1024;
+
+template <typename DType>
+class TypedRecordReader : public ColumnReaderImplBase<DType>,
+ virtual public RecordReader {
+ public:
+ using T = typename DType::c_type;
+ using BASE = ColumnReaderImplBase<DType>;
+ TypedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, MemoryPool* pool)
+ : BASE(descr, pool) {
+ leaf_info_ = leaf_info;
+ nullable_values_ = leaf_info.HasNullableValues();
+ at_record_start_ = true;
+ records_read_ = 0;
+ values_written_ = 0;
+ values_capacity_ = 0;
+ null_count_ = 0;
+ levels_written_ = 0;
+ levels_position_ = 0;
+ levels_capacity_ = 0;
+ uses_values_ = !(descr->physical_type() == Type::BYTE_ARRAY);
+
+ if (uses_values_) {
+ values_ = AllocateBuffer(pool);
+ }
+ valid_bits_ = AllocateBuffer(pool);
+ def_levels_ = AllocateBuffer(pool);
+ rep_levels_ = AllocateBuffer(pool);
+ Reset();
+ }
+
+ int64_t available_values_current_page() const {
+ return this->num_buffered_values_ - this->num_decoded_values_;
+ }
+
+ // Compute the values capacity in bytes for the given number of elements
+ int64_t bytes_for_values(int64_t nitems) const {
+ int64_t type_size = GetTypeByteSize(this->descr_->physical_type());
+ int64_t bytes_for_values = -1;
+ if (MultiplyWithOverflow(nitems, type_size, &bytes_for_values)) {
+ throw ParquetException("Total size of items too large");
+ }
+ return bytes_for_values;
+ }
+
+ int64_t ReadRecords(int64_t num_records) override {
+ // Delimit records, then read values at the end
+ int64_t records_read = 0;
+
+ if (levels_position_ < levels_written_) {
+ records_read += ReadRecordData(num_records);
+ }
+
+ int64_t level_batch_size = std::max(kMinLevelBatchSize, num_records);
+
+    // Keep reading until we reach the desired number of records; if we are in
+    // the middle of a record when that happens, continue until the end of the
+    // current record
+ while (!at_record_start_ || records_read < num_records) {
+ // Is there more data to read in this row group?
+ if (!this->HasNextInternal()) {
+ if (!at_record_start_) {
+ // We ended the row group while inside a record that we haven't seen
+ // the end of yet. So increment the record count for the last record in
+ // the row group
+ ++records_read;
+ at_record_start_ = true;
+ }
+ break;
+ }
+
+ /// We perform multiple batch reads until we either exhaust the row group
+ /// or observe the desired number of records
+ int64_t batch_size = std::min(level_batch_size, available_values_current_page());
+
+ // No more data in column
+ if (batch_size == 0) {
+ break;
+ }
+
+ if (this->max_def_level_ > 0) {
+ ReserveLevels(batch_size);
+
+ int16_t* def_levels = this->def_levels() + levels_written_;
+ int16_t* rep_levels = this->rep_levels() + levels_written_;
+
+ // Not present for non-repeated fields
+ int64_t levels_read = 0;
+ if (this->max_rep_level_ > 0) {
+ levels_read = this->ReadDefinitionLevels(batch_size, def_levels);
+ if (this->ReadRepetitionLevels(batch_size, rep_levels) != levels_read) {
+ throw ParquetException("Number of decoded rep / def levels did not match");
+ }
+ } else if (this->max_def_level_ > 0) {
+ levels_read = this->ReadDefinitionLevels(batch_size, def_levels);
+ }
+
+ // Exhausted column chunk
+ if (levels_read == 0) {
+ break;
+ }
+
+ levels_written_ += levels_read;
+ records_read += ReadRecordData(num_records - records_read);
+ } else {
+ // No repetition or definition levels
+ batch_size = std::min(num_records - records_read, batch_size);
+ records_read += ReadRecordData(batch_size);
+ }
+ }
+
+ return records_read;
+ }
+
+  // We may appear to have exhausted a column chunk when in fact we are still
+  // in the middle of processing the last batch
+ bool has_values_to_process() const { return levels_position_ < levels_written_; }
+
+ std::shared_ptr<ResizableBuffer> ReleaseValues() override {
+ if (uses_values_) {
+ auto result = values_;
+ PARQUET_THROW_NOT_OK(result->Resize(bytes_for_values(values_written_), true));
+ values_ = AllocateBuffer(this->pool_);
+ values_capacity_ = 0;
+ return result;
+ } else {
+ return nullptr;
+ }
+ }
+
+ std::shared_ptr<ResizableBuffer> ReleaseIsValid() override {
+ if (leaf_info_.HasNullableValues()) {
+ auto result = valid_bits_;
+ PARQUET_THROW_NOT_OK(result->Resize(BitUtil::BytesForBits(values_written_), true));
+ valid_bits_ = AllocateBuffer(this->pool_);
+ return result;
+ } else {
+ return nullptr;
+ }
+ }
+
+ // Process written repetition/definition levels to reach the end of
+ // records. Process no more levels than necessary to delimit the indicated
+ // number of logical records. Updates internal state of RecordReader
+ //
+ // \return Number of records delimited
+ int64_t DelimitRecords(int64_t num_records, int64_t* values_seen) {
+ int64_t values_to_read = 0;
+ int64_t records_read = 0;
+
+ const int16_t* def_levels = this->def_levels() + levels_position_;
+ const int16_t* rep_levels = this->rep_levels() + levels_position_;
+
+ DCHECK_GT(this->max_rep_level_, 0);
+
+ // Count logical records and number of values to read
+ while (levels_position_ < levels_written_) {
+ const int16_t rep_level = *rep_levels++;
+ if (rep_level == 0) {
+ // If at_record_start_ is true, we are seeing the start of a record
+ // for the second time, such as after repeated calls to
+ // DelimitRecords. In this case we must continue until we find
+ // another record start or exhaust the ColumnChunk
+ if (!at_record_start_) {
+ // We've reached the end of a record; increment the record count.
+ ++records_read;
+ if (records_read == num_records) {
+ // We've found the number of records we were looking for. Set
+ // at_record_start_ to true and break
+ at_record_start_ = true;
+ break;
+ }
+ }
+ }
+ // We have decided to consume the level at this position; therefore we
+ // must advance until we find another record boundary
+ at_record_start_ = false;
+
+ const int16_t def_level = *def_levels++;
+ if (def_level == this->max_def_level_) {
+ ++values_to_read;
+ }
+ ++levels_position_;
+ }
+ *values_seen = values_to_read;
+ return records_read;
+ }
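+
+ // Worked example (an illustrative sketch, not from the upstream source):
+ // given buffered rep_levels {0, 1, 1, 0, 1} where every def_level equals
+ // max_def_level_, the two zeros open records. DelimitRecords(2, &seen)
+ // consumes all five levels but returns 1: the first record is counted when
+ // the second zero is observed, while the still-open second record is only
+ // counted later (by ReadRecords) once the row group ends; *seen is set to 5.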
+
+ void Reserve(int64_t capacity) override {
+ ReserveLevels(capacity);
+ ReserveValues(capacity);
+ }
+
+ int64_t UpdateCapacity(int64_t capacity, int64_t size, int64_t extra_size) {
+ if (extra_size < 0) {
+ throw ParquetException("Negative size (corrupt file?)");
+ }
+ int64_t target_size = -1;
+ if (AddWithOverflow(size, extra_size, &target_size)) {
+ throw ParquetException("Allocation size too large (corrupt file?)");
+ }
+ if (target_size >= (1LL << 62)) {
+ throw ParquetException("Allocation size too large (corrupt file?)");
+ }
+ if (capacity >= target_size) {
+ return capacity;
+ }
+ return BitUtil::NextPower2(target_size);
+ }
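+
+ // Illustrative arithmetic (not from the upstream source): with
+ // capacity=1024, size=1000 and extra_size=100, target_size is 1100, which
+ // exceeds the current capacity, so BitUtil::NextPower2(1100) = 2048 is
+ // returned. Growing to the next power of two keeps repeated reserves
+ // amortized O(1).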
+
+ void ReserveLevels(int64_t extra_levels) {
+ if (this->max_def_level_ > 0) {
+ const int64_t new_levels_capacity =
+ UpdateCapacity(levels_capacity_, levels_written_, extra_levels);
+ if (new_levels_capacity > levels_capacity_) {
+ constexpr auto kItemSize = static_cast<int64_t>(sizeof(int16_t));
+ int64_t capacity_in_bytes = -1;
+ if (MultiplyWithOverflow(new_levels_capacity, kItemSize, &capacity_in_bytes)) {
+ throw ParquetException("Allocation size too large (corrupt file?)");
+ }
+ PARQUET_THROW_NOT_OK(def_levels_->Resize(capacity_in_bytes, false));
+ if (this->max_rep_level_ > 0) {
+ PARQUET_THROW_NOT_OK(rep_levels_->Resize(capacity_in_bytes, false));
+ }
+ levels_capacity_ = new_levels_capacity;
+ }
+ }
+ }
+
+ void ReserveValues(int64_t extra_values) {
+ const int64_t new_values_capacity =
+ UpdateCapacity(values_capacity_, values_written_, extra_values);
+ if (new_values_capacity > values_capacity_) {
+ // XXX(wesm): A hack to avoid memory allocation when reading directly
+ // into builder classes
+ if (uses_values_) {
+ PARQUET_THROW_NOT_OK(
+ values_->Resize(bytes_for_values(new_values_capacity), false));
+ }
+ values_capacity_ = new_values_capacity;
+ }
+ if (leaf_info_.HasNullableValues()) {
+ int64_t valid_bytes_new = BitUtil::BytesForBits(values_capacity_);
+ if (valid_bits_->size() < valid_bytes_new) {
+ int64_t valid_bytes_old = BitUtil::BytesForBits(values_written_);
+ PARQUET_THROW_NOT_OK(valid_bits_->Resize(valid_bytes_new, false));
+
+ // Avoid valgrind warnings
+ memset(valid_bits_->mutable_data() + valid_bytes_old, 0,
+ valid_bytes_new - valid_bytes_old);
+ }
+ }
+ }
+
+ void Reset() override {
+ ResetValues();
+
+ if (levels_written_ > 0) {
+ const int64_t levels_remaining = levels_written_ - levels_position_;
+ // Shift remaining levels to beginning of buffer and trim to only the number
+ // of decoded levels remaining
+ int16_t* def_data = def_levels();
+ int16_t* rep_data = rep_levels();
+
+ std::copy(def_data + levels_position_, def_data + levels_written_, def_data);
+ PARQUET_THROW_NOT_OK(
+ def_levels_->Resize(levels_remaining * sizeof(int16_t), false));
+
+ if (this->max_rep_level_ > 0) {
+ std::copy(rep_data + levels_position_, rep_data + levels_written_, rep_data);
+ PARQUET_THROW_NOT_OK(
+ rep_levels_->Resize(levels_remaining * sizeof(int16_t), false));
+ }
+
+ levels_written_ -= levels_position_;
+ levels_position_ = 0;
+ levels_capacity_ = levels_remaining;
+ }
+
+ records_read_ = 0;
+
+ // Binary builders in subclasses are reset via Finish() when their chunks are retrieved
+ }
+
+ void SetPageReader(std::unique_ptr<PageReader> reader) override {
+ at_record_start_ = true;
+ this->pager_ = std::move(reader);
+ ResetDecoders();
+ }
+
+ bool HasMoreData() const override { return this->pager_ != nullptr; }
+
+ // Dictionary decoders must be reset when advancing row groups
+ void ResetDecoders() { this->decoders_.clear(); }
+
+ virtual void ReadValuesSpaced(int64_t values_with_nulls, int64_t null_count) {
+ uint8_t* valid_bits = valid_bits_->mutable_data();
+ const int64_t valid_bits_offset = values_written_;
+
+ int64_t num_decoded = this->current_decoder_->DecodeSpaced(
+ ValuesHead<T>(), static_cast<int>(values_with_nulls),
+ static_cast<int>(null_count), valid_bits, valid_bits_offset);
+ DCHECK_EQ(num_decoded, values_with_nulls);
+ }
+
+ virtual void ReadValuesDense(int64_t values_to_read) {
+ int64_t num_decoded =
+ this->current_decoder_->Decode(ValuesHead<T>(), static_cast<int>(values_to_read));
+ DCHECK_EQ(num_decoded, values_to_read);
+ }
+
+ // Return number of logical records read
+ int64_t ReadRecordData(int64_t num_records) {
+ // Conservative upper bound
+ const int64_t possible_num_values =
+ std::max(num_records, levels_written_ - levels_position_);
+ ReserveValues(possible_num_values);
+
+ const int64_t start_levels_position = levels_position_;
+
+ int64_t values_to_read = 0;
+ int64_t records_read = 0;
+ if (this->max_rep_level_ > 0) {
+ records_read = DelimitRecords(num_records, &values_to_read);
+ } else if (this->max_def_level_ > 0) {
+ // No repetition levels, skip delimiting logic. Each level represents a
+ // null or not null entry
+ records_read = std::min(levels_written_ - levels_position_, num_records);
+
+ // This is advanced by DelimitRecords, which we skipped
+ levels_position_ += records_read;
+ } else {
+ records_read = values_to_read = num_records;
+ }
+
+ int64_t null_count = 0;
+ if (leaf_info_.HasNullableValues()) {
+ ValidityBitmapInputOutput validity_io;
+ validity_io.values_read_upper_bound = levels_position_ - start_levels_position;
+ validity_io.valid_bits = valid_bits_->mutable_data();
+ validity_io.valid_bits_offset = values_written_;
+
+ DefLevelsToBitmap(def_levels() + start_levels_position,
+ levels_position_ - start_levels_position, leaf_info_,
+ &validity_io);
+ values_to_read = validity_io.values_read - validity_io.null_count;
+ null_count = validity_io.null_count;
+ DCHECK_GE(values_to_read, 0);
+ ReadValuesSpaced(validity_io.values_read, null_count);
+ } else {
+ DCHECK_GE(values_to_read, 0);
+ ReadValuesDense(values_to_read);
+ }
+ if (this->leaf_info_.def_level > 0) {
+ // Optional, repeated, or some mix thereof
+ this->ConsumeBufferedValues(levels_position_ - start_levels_position);
+ } else {
+ // Flat, non-repeated
+ this->ConsumeBufferedValues(values_to_read);
+ }
+ // Total values, including null spaces, if any
+ values_written_ += values_to_read + null_count;
+ null_count_ += null_count;
+
+ return records_read;
+ }
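+
+ // Worked example of the nullable branch above (illustrative only): for a
+ // flat optional leaf with max_def_level_ == 1, buffered def_levels
+ // {1, 0, 1} yield validity_io.values_read = 3 and null_count = 1, so
+ // values_to_read becomes 2 and ReadValuesSpaced(3, 1) decodes two non-null
+ // values, leaving one null slot in between.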
+
+ void DebugPrintState() override {
+ const int16_t* def_levels = this->def_levels();
+ const int16_t* rep_levels = this->rep_levels();
+ const int64_t total_levels_read = levels_position_;
+
+ const T* vals = reinterpret_cast<const T*>(this->values());
+
+ std::cout << "def levels: ";
+ for (int64_t i = 0; i < total_levels_read; ++i) {
+ std::cout << def_levels[i] << " ";
+ }
+ std::cout << std::endl;
+
+ std::cout << "rep levels: ";
+ for (int64_t i = 0; i < total_levels_read; ++i) {
+ std::cout << rep_levels[i] << " ";
+ }
+ std::cout << std::endl;
+
+ std::cout << "values: ";
+ for (int64_t i = 0; i < this->values_written(); ++i) {
+ std::cout << vals[i] << " ";
+ }
+ std::cout << std::endl;
+ }
+
+ void ResetValues() {
+ if (values_written_ > 0) {
+ // Resize to 0, but do not shrink to fit
+ if (uses_values_) {
+ PARQUET_THROW_NOT_OK(values_->Resize(0, false));
+ }
+ PARQUET_THROW_NOT_OK(valid_bits_->Resize(0, false));
+ values_written_ = 0;
+ values_capacity_ = 0;
+ null_count_ = 0;
+ }
+ }
+
+ protected:
+ template <typename T>
+ T* ValuesHead() {
+ return reinterpret_cast<T*>(values_->mutable_data()) + values_written_;
+ }
+ LevelInfo leaf_info_;
+};
+
+class FLBARecordReader : public TypedRecordReader<FLBAType>,
+ virtual public BinaryRecordReader {
+ public:
+ FLBARecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool)
+ : TypedRecordReader<FLBAType>(descr, leaf_info, pool), builder_(nullptr) {
+ DCHECK_EQ(descr_->physical_type(), Type::FIXED_LEN_BYTE_ARRAY);
+ int byte_width = descr_->type_length();
+ std::shared_ptr<::arrow::DataType> type = ::arrow::fixed_size_binary(byte_width);
+ builder_.reset(new ::arrow::FixedSizeBinaryBuilder(type, this->pool_));
+ }
+
+ ::arrow::ArrayVector GetBuilderChunks() override {
+ std::shared_ptr<::arrow::Array> chunk;
+ PARQUET_THROW_NOT_OK(builder_->Finish(&chunk));
+ return ::arrow::ArrayVector({chunk});
+ }
+
+ void ReadValuesDense(int64_t values_to_read) override {
+ auto values = ValuesHead<FLBA>();
+ int64_t num_decoded =
+ this->current_decoder_->Decode(values, static_cast<int>(values_to_read));
+ DCHECK_EQ(num_decoded, values_to_read);
+
+ for (int64_t i = 0; i < num_decoded; i++) {
+ PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr));
+ }
+ ResetValues();
+ }
+
+ void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
+ uint8_t* valid_bits = valid_bits_->mutable_data();
+ const int64_t valid_bits_offset = values_written_;
+ auto values = ValuesHead<FLBA>();
+
+ int64_t num_decoded = this->current_decoder_->DecodeSpaced(
+ values, static_cast<int>(values_to_read), static_cast<int>(null_count),
+ valid_bits, valid_bits_offset);
+ DCHECK_EQ(num_decoded, values_to_read);
+
+ for (int64_t i = 0; i < num_decoded; i++) {
+ if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) {
+ PARQUET_THROW_NOT_OK(builder_->Append(values[i].ptr));
+ } else {
+ PARQUET_THROW_NOT_OK(builder_->AppendNull());
+ }
+ }
+ ResetValues();
+ }
+
+ private:
+ std::unique_ptr<::arrow::FixedSizeBinaryBuilder> builder_;
+};
+
+class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
+ virtual public BinaryRecordReader {
+ public:
+ ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool)
+ : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool) {
+ DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
+ accumulator_.builder.reset(new ::arrow::BinaryBuilder(pool));
+ }
+
+ ::arrow::ArrayVector GetBuilderChunks() override {
+ ::arrow::ArrayVector result = accumulator_.chunks;
+ if (result.size() == 0 || accumulator_.builder->length() > 0) {
+ std::shared_ptr<::arrow::Array> last_chunk;
+ PARQUET_THROW_NOT_OK(accumulator_.builder->Finish(&last_chunk));
+ result.push_back(std::move(last_chunk));
+ }
+ accumulator_.chunks = {};
+ return result;
+ }
+
+ void ReadValuesDense(int64_t values_to_read) override {
+ int64_t num_decoded = this->current_decoder_->DecodeArrowNonNull(
+ static_cast<int>(values_to_read), &accumulator_);
+ DCHECK_EQ(num_decoded, values_to_read);
+ ResetValues();
+ }
+
+ void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
+ int64_t num_decoded = this->current_decoder_->DecodeArrow(
+ static_cast<int>(values_to_read), static_cast<int>(null_count),
+ valid_bits_->mutable_data(), values_written_, &accumulator_);
+ DCHECK_EQ(num_decoded, values_to_read - null_count);
+ ResetValues();
+ }
+
+ private:
+ // Helper data structure for accumulating builder chunks
+ typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
+};
+
+class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
+ virtual public DictionaryRecordReader {
+ public:
+ ByteArrayDictionaryRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool)
+ : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool), builder_(pool) {
+ this->read_dictionary_ = true;
+ }
+
+ std::shared_ptr<::arrow::ChunkedArray> GetResult() override {
+ FlushBuilder();
+ std::vector<std::shared_ptr<::arrow::Array>> result;
+ std::swap(result, result_chunks_);
+ return std::make_shared<::arrow::ChunkedArray>(std::move(result), builder_.type());
+ }
+
+ void FlushBuilder() {
+ if (builder_.length() > 0) {
+ std::shared_ptr<::arrow::Array> chunk;
+ PARQUET_THROW_NOT_OK(builder_.Finish(&chunk));
+ result_chunks_.emplace_back(std::move(chunk));
+
+ // Also clears the dictionary memo table
+ builder_.Reset();
+ }
+ }
+
+ void MaybeWriteNewDictionary() {
+ if (this->new_dictionary_) {
+ /// If there is a new dictionary, we may need to flush the builder, then
+ /// insert the new dictionary values
+ FlushBuilder();
+ builder_.ResetFull();
+ auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
+ decoder->InsertDictionary(&builder_);
+ this->new_dictionary_ = false;
+ }
+ }
+
+ void ReadValuesDense(int64_t values_to_read) override {
+ int64_t num_decoded = 0;
+ if (current_encoding_ == Encoding::RLE_DICTIONARY) {
+ MaybeWriteNewDictionary();
+ auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
+ num_decoded = decoder->DecodeIndices(static_cast<int>(values_to_read), &builder_);
+ } else {
+ num_decoded = this->current_decoder_->DecodeArrowNonNull(
+ static_cast<int>(values_to_read), &builder_);
+
+ /// Flush values since they have been copied into the builder
+ ResetValues();
+ }
+ DCHECK_EQ(num_decoded, values_to_read);
+ }
+
+ void ReadValuesSpaced(int64_t values_to_read, int64_t null_count) override {
+ int64_t num_decoded = 0;
+ if (current_encoding_ == Encoding::RLE_DICTIONARY) {
+ MaybeWriteNewDictionary();
+ auto decoder = dynamic_cast<BinaryDictDecoder*>(this->current_decoder_);
+ num_decoded = decoder->DecodeIndicesSpaced(
+ static_cast<int>(values_to_read), static_cast<int>(null_count),
+ valid_bits_->mutable_data(), values_written_, &builder_);
+ } else {
+ num_decoded = this->current_decoder_->DecodeArrow(
+ static_cast<int>(values_to_read), static_cast<int>(null_count),
+ valid_bits_->mutable_data(), values_written_, &builder_);
+
+ /// Flush values since they have been copied into the builder
+ ResetValues();
+ }
+ DCHECK_EQ(num_decoded, values_to_read - null_count);
+ }
+
+ private:
+ using BinaryDictDecoder = DictDecoder<ByteArrayType>;
+
+ ::arrow::BinaryDictionary32Builder builder_;
+ std::vector<std::shared_ptr<::arrow::Array>> result_chunks_;
+};
+
+// TODO(wesm): Implement these to some satisfaction
+template <>
+void TypedRecordReader<Int96Type>::DebugPrintState() {}
+
+template <>
+void TypedRecordReader<ByteArrayType>::DebugPrintState() {}
+
+template <>
+void TypedRecordReader<FLBAType>::DebugPrintState() {}
+
+std::shared_ptr<RecordReader> MakeByteArrayRecordReader(const ColumnDescriptor* descr,
+ LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool,
+ bool read_dictionary) {
+ if (read_dictionary) {
+ return std::make_shared<ByteArrayDictionaryRecordReader>(descr, leaf_info, pool);
+ } else {
+ return std::make_shared<ByteArrayChunkedRecordReader>(descr, leaf_info, pool);
+ }
+}
+
+} // namespace
+
+std::shared_ptr<RecordReader> RecordReader::Make(const ColumnDescriptor* descr,
+ LevelInfo leaf_info, MemoryPool* pool,
+ const bool read_dictionary) {
+ switch (descr->physical_type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedRecordReader<BooleanType>>(descr, leaf_info, pool);
+ case Type::INT32:
+ return std::make_shared<TypedRecordReader<Int32Type>>(descr, leaf_info, pool);
+ case Type::INT64:
+ return std::make_shared<TypedRecordReader<Int64Type>>(descr, leaf_info, pool);
+ case Type::INT96:
+ return std::make_shared<TypedRecordReader<Int96Type>>(descr, leaf_info, pool);
+ case Type::FLOAT:
+ return std::make_shared<TypedRecordReader<FloatType>>(descr, leaf_info, pool);
+ case Type::DOUBLE:
+ return std::make_shared<TypedRecordReader<DoubleType>>(descr, leaf_info, pool);
+ case Type::BYTE_ARRAY:
+ return MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<FLBARecordReader>(descr, leaf_info, pool);
+ default: {
+ // PARQUET-1481: This can occur if the file is corrupt
+ std::stringstream ss;
+ ss << "Invalid physical column type: " << static_cast<int>(descr->physical_type());
+ throw ParquetException(ss.str());
+ }
+ }
+ // Unreachable code, but suppress compiler warning
+ return nullptr;
+}
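+
+// A hedged caller-side sketch (page_reader is assumed to come from
+// RowGroupReader::GetColumnPageReader; none of these names are defined here):
+//
+//   auto reader = RecordReader::Make(descr, leaf_info, pool);
+//   reader->SetPageReader(std::move(page_reader));
+//   while (reader->HasMoreData()) {
+//     reader->ReadRecords(/*num_records=*/1024);
+//   }
+//   std::shared_ptr<ResizableBuffer> values = reader->ReleaseValues();
+//   std::shared_ptr<ResizableBuffer> valid_bits = reader->ReleaseIsValid();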
+
+} // namespace internal
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h b/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h
index 7f51cff2e97..8c48e4d7843 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_reader.h
@@ -1,376 +1,376 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <utility>
-#include <vector>
-
-#include "parquet/exception.h"
-#include "parquet/level_conversion.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-namespace arrow {
-
-class Array;
-class ChunkedArray;
-
-namespace BitUtil {
-class BitReader;
-} // namespace BitUtil
-
-namespace util {
-class RleDecoder;
-} // namespace util
-
-} // namespace arrow
-
-namespace parquet {
-
-class Decryptor;
-class Page;
-
-// 16 MB is the default maximum page header size
-static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024;
-
-// 16 KB is the default expected page header size
-static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024;
-
-class PARQUET_EXPORT LevelDecoder {
- public:
- LevelDecoder();
- ~LevelDecoder();
-
- // Initialize the LevelDecoder state with new data
- // and return the number of bytes consumed
- int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values,
- const uint8_t* data, int32_t data_size);
-
- void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values,
- const uint8_t* data);
-
- // Decodes a batch of levels into an array and returns the number of levels decoded
- int Decode(int batch_size, int16_t* levels);
-
- private:
- int bit_width_;
- int num_values_remaining_;
- Encoding::type encoding_;
- std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_;
- std::unique_ptr<::arrow::BitUtil::BitReader> bit_packed_decoder_;
- int16_t max_level_;
-};
-
-struct CryptoContext {
- CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal,
- std::shared_ptr<Decryptor> meta, std::shared_ptr<Decryptor> data)
- : start_decrypt_with_dictionary_page(start_with_dictionary_page),
- row_group_ordinal(rg_ordinal),
- column_ordinal(col_ordinal),
- meta_decryptor(std::move(meta)),
- data_decryptor(std::move(data)) {}
- CryptoContext() {}
-
- bool start_decrypt_with_dictionary_page = false;
- int16_t row_group_ordinal = -1;
- int16_t column_ordinal = -1;
- std::shared_ptr<Decryptor> meta_decryptor;
- std::shared_ptr<Decryptor> data_decryptor;
-};
-
-// Abstract page iterator interface. This way, we can feed column pages to the
-// ColumnReader through whatever mechanism we choose
-class PARQUET_EXPORT PageReader {
- public:
- virtual ~PageReader() = default;
-
- static std::unique_ptr<PageReader> Open(
- std::shared_ptr<ArrowInputStream> stream, int64_t total_num_rows,
- Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
- const CryptoContext* ctx = NULLPTR);
-
- // @returns: shared_ptr<Page>(nullptr) on EOS, std::shared_ptr<Page>
- // containing new Page otherwise
- virtual std::shared_ptr<Page> NextPage() = 0;
-
- virtual void set_max_page_header_size(uint32_t size) = 0;
-};
-
-class PARQUET_EXPORT ColumnReader {
- public:
- virtual ~ColumnReader() = default;
-
- static std::shared_ptr<ColumnReader> Make(
- const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
- // Returns true if there are still values in this column.
- virtual bool HasNext() = 0;
-
- virtual Type::type type() const = 0;
-
- virtual const ColumnDescriptor* descr() const = 0;
-
- // Get the encoding that can be exposed by this reader. If it returns
- // dictionary encoding, then ReadBatchWithDictionary can be used to read data.
- //
- // \note API EXPERIMENTAL
- virtual ExposedEncoding GetExposedEncoding() = 0;
-
- protected:
- friend class RowGroupReader;
- // Set the encoding that can be exposed by this reader.
- //
- // \note API EXPERIMENTAL
- virtual void SetExposedEncoding(ExposedEncoding encoding) = 0;
-};
-
-// API to read values from a single column. This is the main client-facing API.
-template <typename DType>
-class TypedColumnReader : public ColumnReader {
- public:
- typedef typename DType::c_type T;
-
- // Read a batch of repetition levels, definition levels, and values from the
- // column.
- //
- // Since null values are not stored in the values, the number of values read
- // may be less than the number of repetition and definition levels. With
- // nested data this is almost certainly true.
- //
- // Set def_levels or rep_levels to nullptr if you want to skip reading them.
- // This is only safe if you know through some other source that there are no
- // undefined values.
- //
- // To fully exhaust a row group, you must read batches until the number of
- // values read reaches the number of stored values according to the metadata.
- //
- // This API is the same for both V1 and V2 of the DataPage
- //
- // @returns: actual number of levels read (see values_read for number of values read)
- virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
- T* values, int64_t* values_read) = 0;
-
- /// Read a batch of repetition levels, definition levels, and values from the
- /// column and leave spaces for null entries on the lowest level in the values
- /// buffer.
- ///
- /// In comparison to ReadBatch, the length of the repetition and definition
- /// levels is the same as the number of values read when
- /// max_definition_level == 1.
- /// In the case of max_definition_level > 1, the repetition and definition
- /// levels are larger than the values but the values include the null entries
- /// with definition_level == (max_definition_level - 1).
- ///
- /// To fully exhaust a row group, you must read batches until the number of
- /// values read reaches the number of stored values according to the metadata.
- ///
- /// @param batch_size the number of levels to read
- /// @param[out] def_levels The Parquet definition levels, output has
- /// the length levels_read.
- /// @param[out] rep_levels The Parquet repetition levels, output has
- /// the length levels_read.
- /// @param[out] values The values in the lowest nested level including
- /// spacing for nulls on the lowest levels; output has the length
- /// values_read.
- /// @param[out] valid_bits Memory allocated for a bitmap that indicates if
- /// the row is null or on the maximum definition level. For performance
- /// reasons the underlying buffer should be able to store 1 bit more than
- /// required. If this requires an additional byte, this byte is only read
- /// but never written to.
- /// @param valid_bits_offset The offset in bits of the valid_bits where the
- /// first relevant bit resides.
- /// @param[out] levels_read The number of repetition/definition levels that were read.
- /// @param[out] values_read The number of values read, this includes all
- /// non-null entries as well as all null-entries on the lowest level
- /// (i.e. definition_level == max_definition_level - 1)
- /// @param[out] null_count The number of nulls on the lowest levels.
- /// (i.e. (values_read - null_count) is total number of non-null entries)
- ///
- /// \deprecated Since 4.0.0
- ARROW_DEPRECATED("Doesn't handle nesting correctly and is unused outside of unit tests.")
- virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels,
- int16_t* rep_levels, T* values, uint8_t* valid_bits,
- int64_t valid_bits_offset, int64_t* levels_read,
- int64_t* values_read, int64_t* null_count) = 0;
-
- // Skip reading levels
- // Returns the number of levels skipped
- virtual int64_t Skip(int64_t num_rows_to_skip) = 0;
-
- // Read a batch of repetition levels, definition levels, and indices from the
- // column, and read the dictionary if a dictionary page is encountered while
- // reading pages. This API is similar to ReadBatch(), with the added ability
- // to read the dictionary and indices. It is only valid to call this method
- // when the reader can expose dictionary encoding (i.e., the reader's
- // GetExposedEncoding() returns DICTIONARY).
- //
- // The dictionary is read along with the data page. When there's no data page,
- // the dictionary won't be returned.
- //
- // @param batch_size The batch size to read
- // @param[out] def_levels The Parquet definition levels.
- // @param[out] rep_levels The Parquet repetition levels.
- // @param[out] indices The dictionary indices.
- // @param[out] indices_read The number of indices read.
- // @param[out] dict The pointer to dictionary values. It will return nullptr if
- // there's no data page. Each column chunk only has one dictionary page. The dictionary
- // is owned by the reader, so the caller is responsible for copying the dictionary
- // values before the reader gets destroyed.
- // @param[out] dict_len The dictionary length. It will return 0 if there's no data
- // page.
- // @returns: actual number of levels read (see indices_read for number of
- // indices read)
- //
- // \note API EXPERIMENTAL
- virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
- int16_t* rep_levels, int32_t* indices,
- int64_t* indices_read, const T** dict,
- int32_t* dict_len) = 0;
-};
-
-namespace internal {
-
-/// \brief Stateful column reader that delimits semantic records for both flat
-/// and nested columns
-///
-/// \note API EXPERIMENTAL
-/// \since 1.3.0
-class RecordReader {
- public:
- static std::shared_ptr<RecordReader> Make(
- const ColumnDescriptor* descr, LevelInfo leaf_info,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
- const bool read_dictionary = false);
-
- virtual ~RecordReader() = default;
-
- /// \brief Attempt to read indicated number of records from column chunk
- /// \return number of records read
- virtual int64_t ReadRecords(int64_t num_records) = 0;
-
- /// \brief Pre-allocate space for data. Results in better flat read performance
- virtual void Reserve(int64_t num_values) = 0;
-
- /// \brief Clear consumed values and repetition/definition levels as the
- /// result of calling ReadRecords
- virtual void Reset() = 0;
-
- /// \brief Transfer filled values buffer to caller. A new one will be
- /// allocated in subsequent ReadRecords calls
- virtual std::shared_ptr<ResizableBuffer> ReleaseValues() = 0;
-
- /// \brief Transfer filled validity bitmap buffer to caller. A new one will
- /// be allocated in subsequent ReadRecords calls
- virtual std::shared_ptr<ResizableBuffer> ReleaseIsValid() = 0;
-
- /// \brief Return true if the record reader has more internal data yet to
- /// process
- virtual bool HasMoreData() const = 0;
-
- /// \brief Advance record reader to the next row group
- /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader
- virtual void SetPageReader(std::unique_ptr<PageReader> reader) = 0;
-
- virtual void DebugPrintState() = 0;
-
- /// \brief Decoded definition levels
- int16_t* def_levels() const {
- return reinterpret_cast<int16_t*>(def_levels_->mutable_data());
- }
-
- /// \brief Decoded repetition levels
- int16_t* rep_levels() const {
- return reinterpret_cast<int16_t*>(rep_levels_->mutable_data());
- }
-
- /// \brief Decoded values, including nulls, if any
- uint8_t* values() const { return values_->mutable_data(); }
-
- /// \brief Number of values written including nulls (if any)
- int64_t values_written() const { return values_written_; }
-
- /// \brief Number of definition / repetition levels (from those that have
- /// been decoded) that have been consumed inside the reader.
- int64_t levels_position() const { return levels_position_; }
-
- /// \brief Number of definition / repetition levels that have been written
- /// internally in the reader
- int64_t levels_written() const { return levels_written_; }
-
- /// \brief Number of nulls in the leaf
- int64_t null_count() const { return null_count_; }
-
- /// \brief True if the leaf values are nullable
- bool nullable_values() const { return nullable_values_; }
-
- /// \brief True if reading directly as Arrow dictionary-encoded
- bool read_dictionary() const { return read_dictionary_; }
-
- protected:
- bool nullable_values_;
-
- bool at_record_start_;
- int64_t records_read_;
-
- int64_t values_written_;
- int64_t values_capacity_;
- int64_t null_count_;
-
- int64_t levels_written_;
- int64_t levels_position_;
- int64_t levels_capacity_;
-
- std::shared_ptr<::arrow::ResizableBuffer> values_;
- // In the case of false, don't allocate the values buffer (when we directly read into
- // builder classes).
- bool uses_values_;
-
- std::shared_ptr<::arrow::ResizableBuffer> valid_bits_;
- std::shared_ptr<::arrow::ResizableBuffer> def_levels_;
- std::shared_ptr<::arrow::ResizableBuffer> rep_levels_;
-
- bool read_dictionary_ = false;
-};
-
-class BinaryRecordReader : virtual public RecordReader {
- public:
- virtual std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks() = 0;
-};
-
-/// \brief Read records directly to dictionary-encoded Arrow form (int32
-/// indices). Only valid for BYTE_ARRAY columns
-class DictionaryRecordReader : virtual public RecordReader {
- public:
- virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0;
-};
-
-} // namespace internal
-
-using BoolReader = TypedColumnReader<BooleanType>;
-using Int32Reader = TypedColumnReader<Int32Type>;
-using Int64Reader = TypedColumnReader<Int64Type>;
-using Int96Reader = TypedColumnReader<Int96Type>;
-using FloatReader = TypedColumnReader<FloatType>;
-using DoubleReader = TypedColumnReader<DoubleType>;
-using ByteArrayReader = TypedColumnReader<ByteArrayType>;
-using FixedLenByteArrayReader = TypedColumnReader<FLBAType>;
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "parquet/exception.h"
+#include "parquet/level_conversion.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+
+namespace BitUtil {
+class BitReader;
+} // namespace BitUtil
+
+namespace util {
+class RleDecoder;
+} // namespace util
+
+} // namespace arrow
+
+namespace parquet {
+
+class Decryptor;
+class Page;
+
+// 16 MB is the default maximum page header size
+static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024;
+
+// 16 KB is the default expected page header size
+static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024;
+
+class PARQUET_EXPORT LevelDecoder {
+ public:
+ LevelDecoder();
+ ~LevelDecoder();
+
+ // Initialize the LevelDecoder state with new data
+ // and return the number of bytes consumed
+ int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values,
+ const uint8_t* data, int32_t data_size);
+
+ void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values,
+ const uint8_t* data);
+
+ // Decodes a batch of levels into an array and returns the number of levels decoded
+ int Decode(int batch_size, int16_t* levels);
+
+ private:
+ int bit_width_;
+ int num_values_remaining_;
+ Encoding::type encoding_;
+ std::unique_ptr<::arrow::util::RleDecoder> rle_decoder_;
+ std::unique_ptr<::arrow::BitUtil::BitReader> bit_packed_decoder_;
+ int16_t max_level_;
+};
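+
+// Illustrative decode loop (a sketch; page_data and the level counts are
+// assumed to come from a data page header):
+//
+//   LevelDecoder decoder;
+//   decoder.SetData(Encoding::RLE, max_def_level, num_buffered_values,
+//                   page_data, page_data_size);
+//   std::vector<int16_t> levels(num_buffered_values);
+//   int decoded = decoder.Decode(static_cast<int>(levels.size()), levels.data());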
+
+struct CryptoContext {
+ CryptoContext(bool start_with_dictionary_page, int16_t rg_ordinal, int16_t col_ordinal,
+ std::shared_ptr<Decryptor> meta, std::shared_ptr<Decryptor> data)
+ : start_decrypt_with_dictionary_page(start_with_dictionary_page),
+ row_group_ordinal(rg_ordinal),
+ column_ordinal(col_ordinal),
+ meta_decryptor(std::move(meta)),
+ data_decryptor(std::move(data)) {}
+ CryptoContext() {}
+
+ bool start_decrypt_with_dictionary_page = false;
+ int16_t row_group_ordinal = -1;
+ int16_t column_ordinal = -1;
+ std::shared_ptr<Decryptor> meta_decryptor;
+ std::shared_ptr<Decryptor> data_decryptor;
+};
+
+// Abstract page iterator interface. This way, we can feed column pages to the
+// ColumnReader through whatever mechanism we choose
+class PARQUET_EXPORT PageReader {
+ public:
+ virtual ~PageReader() = default;
+
+ static std::unique_ptr<PageReader> Open(
+ std::shared_ptr<ArrowInputStream> stream, int64_t total_num_rows,
+ Compression::type codec, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
+ const CryptoContext* ctx = NULLPTR);
+
+ // @returns: shared_ptr<Page>(nullptr) on EOS, std::shared_ptr<Page>
+ // containing new Page otherwise
+ virtual std::shared_ptr<Page> NextPage() = 0;
+
+ virtual void set_max_page_header_size(uint32_t size) = 0;
+};
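+
+// Typical iteration pattern (illustrative only):
+//
+//   while (std::shared_ptr<Page> page = pager->NextPage()) {
+//     // ... process the page; NextPage() returns nullptr at end of stream
+//   }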
+
+class PARQUET_EXPORT ColumnReader {
+ public:
+ virtual ~ColumnReader() = default;
+
+ static std::shared_ptr<ColumnReader> Make(
+ const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ // Returns true if there are still values in this column.
+ virtual bool HasNext() = 0;
+
+ virtual Type::type type() const = 0;
+
+ virtual const ColumnDescriptor* descr() const = 0;
+
+ // Get the encoding that can be exposed by this reader. If it returns
+ // dictionary encoding, then ReadBatchWithDictionary can be used to read data.
+ //
+ // \note API EXPERIMENTAL
+ virtual ExposedEncoding GetExposedEncoding() = 0;
+
+ protected:
+ friend class RowGroupReader;
+ // Set the encoding that can be exposed by this reader.
+ //
+ // \note API EXPERIMENTAL
+ virtual void SetExposedEncoding(ExposedEncoding encoding) = 0;
+};
+
+// API to read values from a single column. This is the main client-facing API.
+template <typename DType>
+class TypedColumnReader : public ColumnReader {
+ public:
+ typedef typename DType::c_type T;
+
+ // Read a batch of repetition levels, definition levels, and values from the
+ // column.
+ //
+ // Since null values are not stored in the values, the number of values read
+ // may be less than the number of repetition and definition levels. With
+ // nested data this is almost certainly true.
+ //
+ // Set def_levels or rep_levels to nullptr if you want to skip reading them.
+ // This is only safe if you know through some other source that there are no
+ // undefined values.
+ //
+ // To fully exhaust a row group, you must read batches until the number of
+ // values read reaches the number of stored values according to the metadata.
+ //
+ // This API is the same for both V1 and V2 of the DataPage
+ //
+ // @returns: actual number of levels read (see values_read for number of values read)
+ virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ T* values, int64_t* values_read) = 0;
+
+ /// Read a batch of repetition levels, definition levels, and values from the
+ /// column and leave spaces for null entries on the lowest level in the values
+ /// buffer.
+ ///
+ /// In comparison to ReadBatch, the length of the repetition and definition
+ /// levels is the same as the number of values read when
+ /// max_definition_level == 1.
+ /// In the case of max_definition_level > 1, the repetition and definition
+ /// levels are larger than the values but the values include the null entries
+ /// with definition_level == (max_definition_level - 1).
+ ///
+ /// To fully exhaust a row group, you must read batches until the number of
+ /// values read reaches the number of stored values according to the metadata.
+ ///
+ /// @param batch_size the number of levels to read
+ /// @param[out] def_levels The Parquet definition levels, output has
+ /// the length levels_read.
+ /// @param[out] rep_levels The Parquet repetition levels, output has
+ /// the length levels_read.
+ /// @param[out] values The values in the lowest nested level including
+ /// spacing for nulls on the lowest levels; output has the length
+ /// values_read.
+ /// @param[out] valid_bits Memory allocated for a bitmap that indicates if
+ /// the row is null or on the maximum definition level. For performance
+ /// reasons the underlying buffer should be able to store 1 bit more than
+ /// required. If this requires an additional byte, this byte is only read
+ /// but never written to.
+ /// @param valid_bits_offset The offset in bits of the valid_bits where the
+ /// first relevant bit resides.
+ /// @param[out] levels_read The number of repetition/definition levels that were read.
+ /// @param[out] values_read The number of values read, this includes all
+ /// non-null entries as well as all null-entries on the lowest level
+ /// (i.e. definition_level == max_definition_level - 1)
+ /// @param[out] null_count The number of nulls on the lowest levels.
+ /// (i.e. (values_read - null_count) is total number of non-null entries)
+ ///
+ /// \deprecated Since 4.0.0
+ ARROW_DEPRECATED("Doesn't handle nesting correctly and is unused outside of unit tests.")
+ virtual int64_t ReadBatchSpaced(int64_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, T* values, uint8_t* valid_bits,
+ int64_t valid_bits_offset, int64_t* levels_read,
+ int64_t* values_read, int64_t* null_count) = 0;
+
+ // Skip reading levels
+ // Returns the number of levels skipped
+ virtual int64_t Skip(int64_t num_rows_to_skip) = 0;
+
+ // Read a batch of repetition levels, definition levels, and indices from the
+ // column, and read the dictionary if a dictionary page is encountered while
+ // reading pages. This API is similar to ReadBatch(), with the added ability
+ // to read the dictionary and indices. It is only valid to call this method
+ // when the reader can expose dictionary encoding (i.e., the reader's
+ // GetExposedEncoding() returns DICTIONARY).
+ //
+ // The dictionary is read along with the data page. When there's no data page,
+ // the dictionary won't be returned.
+ //
+ // @param batch_size The batch size to read
+ // @param[out] def_levels The Parquet definition levels.
+ // @param[out] rep_levels The Parquet repetition levels.
+ // @param[out] indices The dictionary indices.
+ // @param[out] indices_read The number of indices read.
+ // @param[out] dict The pointer to dictionary values. It will return nullptr if
+ // there's no data page. Each column chunk only has one dictionary page. The dictionary
+ // is owned by the reader, so the caller is responsible for copying the dictionary
+ // values before the reader gets destroyed.
+ // @param[out] dict_len The dictionary length. It will return 0 if there's no data
+ // page.
+ // @returns: actual number of levels read (see indices_read for number of
+ // indices read)
+ //
+ // \note API EXPERIMENTAL
+ virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, int32_t* indices,
+ int64_t* indices_read, const T** dict,
+ int32_t* dict_len) = 0;
+};
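+
+// A hedged example of the ReadBatch loop described above (creation of
+// column_reader and row-group wiring are assumed to happen elsewhere):
+//
+//   auto typed = std::static_pointer_cast<Int64Reader>(column_reader);
+//   std::vector<int16_t> def_levels(1024), rep_levels(1024);
+//   std::vector<int64_t> values(1024);
+//   while (typed->HasNext()) {
+//     int64_t values_read = 0;
+//     typed->ReadBatch(1024, def_levels.data(), rep_levels.data(),
+//                      values.data(), &values_read);
+//     // values[0, values_read) now holds the non-null values
+//   }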
+
+namespace internal {
+
+/// \brief Stateful column reader that delimits semantic records for both flat
+/// and nested columns
+///
+/// \note API EXPERIMENTAL
+/// \since 1.3.0
+class RecordReader {
+ public:
+ static std::shared_ptr<RecordReader> Make(
+ const ColumnDescriptor* descr, LevelInfo leaf_info,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
+ const bool read_dictionary = false);
+
+ virtual ~RecordReader() = default;
+
+ /// \brief Attempt to read indicated number of records from column chunk
+ /// \return number of records read
+ virtual int64_t ReadRecords(int64_t num_records) = 0;
+
+ /// \brief Pre-allocate space for data. Results in better flat read performance
+ virtual void Reserve(int64_t num_values) = 0;
+
+ /// \brief Clear consumed values and repetition/definition levels as the
+ /// result of calling ReadRecords
+ virtual void Reset() = 0;
+
+ /// \brief Transfer filled values buffer to caller. A new one will be
+ /// allocated in subsequent ReadRecords calls
+ virtual std::shared_ptr<ResizableBuffer> ReleaseValues() = 0;
+
+ /// \brief Transfer filled validity bitmap buffer to caller. A new one will
+ /// be allocated in subsequent ReadRecords calls
+ virtual std::shared_ptr<ResizableBuffer> ReleaseIsValid() = 0;
+
+ /// \brief Return true if the record reader has more internal data yet to
+ /// process
+ virtual bool HasMoreData() const = 0;
+
+ /// \brief Advance record reader to the next row group
+ /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader
+ virtual void SetPageReader(std::unique_ptr<PageReader> reader) = 0;
+
+ virtual void DebugPrintState() = 0;
+
+ /// \brief Decoded definition levels
+ int16_t* def_levels() const {
+ return reinterpret_cast<int16_t*>(def_levels_->mutable_data());
+ }
+
+ /// \brief Decoded repetition levels
+ int16_t* rep_levels() const {
+ return reinterpret_cast<int16_t*>(rep_levels_->mutable_data());
+ }
+
+ /// \brief Decoded values, including nulls, if any
+ uint8_t* values() const { return values_->mutable_data(); }
+
+ /// \brief Number of values written including nulls (if any)
+ int64_t values_written() const { return values_written_; }
+
+ /// \brief Number of definition / repetition levels (from those that have
+ /// been decoded) that have been consumed inside the reader.
+ int64_t levels_position() const { return levels_position_; }
+
+ /// \brief Number of definition / repetition levels that have been written
+ /// internally in the reader
+ int64_t levels_written() const { return levels_written_; }
+
+ /// \brief Number of nulls in the leaf
+ int64_t null_count() const { return null_count_; }
+
+ /// \brief True if the leaf values are nullable
+ bool nullable_values() const { return nullable_values_; }
+
+ /// \brief True if reading directly as Arrow dictionary-encoded
+ bool read_dictionary() const { return read_dictionary_; }
+
+ protected:
+ bool nullable_values_;
+
+ bool at_record_start_;
+ int64_t records_read_;
+
+ int64_t values_written_;
+ int64_t values_capacity_;
+ int64_t null_count_;
+
+ int64_t levels_written_;
+ int64_t levels_position_;
+ int64_t levels_capacity_;
+
+ std::shared_ptr<::arrow::ResizableBuffer> values_;
+ // In the case of false, don't allocate the values buffer (when we directly read into
+ // builder classes).
+ bool uses_values_;
+
+ std::shared_ptr<::arrow::ResizableBuffer> valid_bits_;
+ std::shared_ptr<::arrow::ResizableBuffer> def_levels_;
+ std::shared_ptr<::arrow::ResizableBuffer> rep_levels_;
+
+ bool read_dictionary_ = false;
+};
+
+class BinaryRecordReader : virtual public RecordReader {
+ public:
+ virtual std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks() = 0;
+};
+
+/// \brief Read records directly to dictionary-encoded Arrow form (int32
+/// indices). Only valid for BYTE_ARRAY columns
+class DictionaryRecordReader : virtual public RecordReader {
+ public:
+ virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0;
+};
+
+} // namespace internal
+
+using BoolReader = TypedColumnReader<BooleanType>;
+using Int32Reader = TypedColumnReader<Int32Type>;
+using Int64Reader = TypedColumnReader<Int64Type>;
+using Int96Reader = TypedColumnReader<Int96Type>;
+using FloatReader = TypedColumnReader<FloatType>;
+using DoubleReader = TypedColumnReader<DoubleType>;
+using ByteArrayReader = TypedColumnReader<ByteArrayType>;
+using FixedLenByteArrayReader = TypedColumnReader<FLBAType>;
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc b/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc
index 0ef83568e3e..9ab1663ccd7 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.cc
@@ -1,91 +1,91 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/column_scanner.h"
-
-#include <cstdint>
-#include <memory>
-
-#include "parquet/column_reader.h"
-
-using arrow::MemoryPool;
-
-namespace parquet {
-
-std::shared_ptr<Scanner> Scanner::Make(std::shared_ptr<ColumnReader> col_reader,
- int64_t batch_size, MemoryPool* pool) {
- switch (col_reader->type()) {
- case Type::BOOLEAN:
- return std::make_shared<BoolScanner>(std::move(col_reader), batch_size, pool);
- case Type::INT32:
- return std::make_shared<Int32Scanner>(std::move(col_reader), batch_size, pool);
- case Type::INT64:
- return std::make_shared<Int64Scanner>(std::move(col_reader), batch_size, pool);
- case Type::INT96:
- return std::make_shared<Int96Scanner>(std::move(col_reader), batch_size, pool);
- case Type::FLOAT:
- return std::make_shared<FloatScanner>(std::move(col_reader), batch_size, pool);
- case Type::DOUBLE:
- return std::make_shared<DoubleScanner>(std::move(col_reader), batch_size, pool);
- case Type::BYTE_ARRAY:
- return std::make_shared<ByteArrayScanner>(std::move(col_reader), batch_size, pool);
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<FixedLenByteArrayScanner>(std::move(col_reader), batch_size,
- pool);
- default:
- ParquetException::NYI("type reader not implemented");
- }
- // Unreachable code, but suppress compiler warning
- return std::shared_ptr<Scanner>(nullptr);
-}
-
-int64_t ScanAllValues(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
- uint8_t* values, int64_t* values_buffered,
- parquet::ColumnReader* reader) {
- switch (reader->type()) {
- case parquet::Type::BOOLEAN:
- return ScanAll<parquet::BoolReader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::INT32:
- return ScanAll<parquet::Int32Reader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::INT64:
- return ScanAll<parquet::Int64Reader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::INT96:
- return ScanAll<parquet::Int96Reader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::FLOAT:
- return ScanAll<parquet::FloatReader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::DOUBLE:
- return ScanAll<parquet::DoubleReader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::BYTE_ARRAY:
- return ScanAll<parquet::ByteArrayReader>(batch_size, def_levels, rep_levels, values,
- values_buffered, reader);
- case parquet::Type::FIXED_LEN_BYTE_ARRAY:
- return ScanAll<parquet::FixedLenByteArrayReader>(batch_size, def_levels, rep_levels,
- values, values_buffered, reader);
- default:
- parquet::ParquetException::NYI("type reader not implemented");
- }
- // Unreachable code, but suppress compiler warning
- return 0;
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/column_scanner.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "parquet/column_reader.h"
+
+using arrow::MemoryPool;
+
+namespace parquet {
+
+std::shared_ptr<Scanner> Scanner::Make(std::shared_ptr<ColumnReader> col_reader,
+ int64_t batch_size, MemoryPool* pool) {
+ switch (col_reader->type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<BoolScanner>(std::move(col_reader), batch_size, pool);
+ case Type::INT32:
+ return std::make_shared<Int32Scanner>(std::move(col_reader), batch_size, pool);
+ case Type::INT64:
+ return std::make_shared<Int64Scanner>(std::move(col_reader), batch_size, pool);
+ case Type::INT96:
+ return std::make_shared<Int96Scanner>(std::move(col_reader), batch_size, pool);
+ case Type::FLOAT:
+ return std::make_shared<FloatScanner>(std::move(col_reader), batch_size, pool);
+ case Type::DOUBLE:
+ return std::make_shared<DoubleScanner>(std::move(col_reader), batch_size, pool);
+ case Type::BYTE_ARRAY:
+ return std::make_shared<ByteArrayScanner>(std::move(col_reader), batch_size, pool);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<FixedLenByteArrayScanner>(std::move(col_reader), batch_size,
+ pool);
+ default:
+ ParquetException::NYI("type reader not implemented");
+ }
+ // Unreachable code, but suppress compiler warning
+ return std::shared_ptr<Scanner>(nullptr);
+}
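+
+// Caller-side sketch (illustrative; row_group_reader is assumed to come from
+// a ParquetFileReader):
+//
+//   std::shared_ptr<Scanner> scanner =
+//       Scanner::Make(row_group_reader->Column(0), /*batch_size=*/128);
+//   while (scanner->HasNext()) {
+//     scanner->PrintNext(std::cout, /*width=*/17);
+//   }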
+
+int64_t ScanAllValues(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ uint8_t* values, int64_t* values_buffered,
+ parquet::ColumnReader* reader) {
+ switch (reader->type()) {
+ case parquet::Type::BOOLEAN:
+ return ScanAll<parquet::BoolReader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::INT32:
+ return ScanAll<parquet::Int32Reader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::INT64:
+ return ScanAll<parquet::Int64Reader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::INT96:
+ return ScanAll<parquet::Int96Reader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::FLOAT:
+ return ScanAll<parquet::FloatReader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::DOUBLE:
+ return ScanAll<parquet::DoubleReader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::BYTE_ARRAY:
+ return ScanAll<parquet::ByteArrayReader>(batch_size, def_levels, rep_levels, values,
+ values_buffered, reader);
+ case parquet::Type::FIXED_LEN_BYTE_ARRAY:
+ return ScanAll<parquet::FixedLenByteArrayReader>(batch_size, def_levels, rep_levels,
+ values, values_buffered, reader);
+ default:
+ parquet::ParquetException::NYI("type reader not implemented");
+ }
+ // Unreachable code, but suppress compiler warning
+ return 0;
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h b/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h
index 61d08841409..d53435f03cd 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_scanner.h
@@ -1,262 +1,262 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <stdio.h>
-
-#include <cstdint>
-#include <memory>
-#include <ostream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "parquet/column_reader.h"
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-static constexpr int64_t DEFAULT_SCANNER_BATCH_SIZE = 128;
-
-class PARQUET_EXPORT Scanner {
- public:
- explicit Scanner(std::shared_ptr<ColumnReader> reader,
- int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
- : batch_size_(batch_size),
- level_offset_(0),
- levels_buffered_(0),
- value_buffer_(AllocateBuffer(pool)),
- value_offset_(0),
- values_buffered_(0),
- reader_(std::move(reader)) {
- def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0);
- rep_levels_.resize(descr()->max_repetition_level() > 0 ? batch_size_ : 0);
- }
-
- virtual ~Scanner() {}
-
- static std::shared_ptr<Scanner> Make(
- std::shared_ptr<ColumnReader> col_reader,
- int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
- virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) = 0;
-
- bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); }
-
- const ColumnDescriptor* descr() const { return reader_->descr(); }
-
- int64_t batch_size() const { return batch_size_; }
-
- void SetBatchSize(int64_t batch_size) { batch_size_ = batch_size; }
-
- protected:
- int64_t batch_size_;
-
- std::vector<int16_t> def_levels_;
- std::vector<int16_t> rep_levels_;
- int level_offset_;
- int levels_buffered_;
-
- std::shared_ptr<ResizableBuffer> value_buffer_;
- int value_offset_;
- int64_t values_buffered_;
- std::shared_ptr<ColumnReader> reader_;
-};
-
-template <typename DType>
-class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner {
- public:
- typedef typename DType::c_type T;
-
- explicit TypedScanner(std::shared_ptr<ColumnReader> reader,
- int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
- : Scanner(std::move(reader), batch_size, pool) {
- typed_reader_ = static_cast<TypedColumnReader<DType>*>(reader_.get());
- int value_byte_size = type_traits<DType::type_num>::value_byte_size;
- PARQUET_THROW_NOT_OK(value_buffer_->Resize(batch_size_ * value_byte_size));
- values_ = reinterpret_cast<T*>(value_buffer_->mutable_data());
- }
-
- virtual ~TypedScanner() {}
-
- bool NextLevels(int16_t* def_level, int16_t* rep_level) {
- if (level_offset_ == levels_buffered_) {
- levels_buffered_ = static_cast<int>(
- typed_reader_->ReadBatch(static_cast<int>(batch_size_), def_levels_.data(),
- rep_levels_.data(), values_, &values_buffered_));
-
- value_offset_ = 0;
- level_offset_ = 0;
- if (!levels_buffered_) {
- return false;
- }
- }
- *def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0;
- *rep_level = descr()->max_repetition_level() > 0 ? rep_levels_[level_offset_] : 0;
- level_offset_++;
- return true;
- }
-
- bool Next(T* val, int16_t* def_level, int16_t* rep_level, bool* is_null) {
- if (level_offset_ == levels_buffered_) {
- if (!HasNext()) {
- // Out of data pages
- return false;
- }
- }
-
- NextLevels(def_level, rep_level);
- *is_null = *def_level < descr()->max_definition_level();
-
- if (*is_null) {
- return true;
- }
-
- if (value_offset_ == values_buffered_) {
- throw ParquetException("Value was non-null, but has not been buffered");
- }
- *val = values_[value_offset_++];
- return true;
- }
-
- // Returns true if there is a next value
- bool NextValue(T* val, bool* is_null) {
- if (level_offset_ == levels_buffered_) {
- if (!HasNext()) {
- // Out of data pages
- return false;
- }
- }
-
- // Out of values
- int16_t def_level = -1;
- int16_t rep_level = -1;
- NextLevels(&def_level, &rep_level);
- *is_null = def_level < descr()->max_definition_level();
-
- if (*is_null) {
- return true;
- }
-
- if (value_offset_ == values_buffered_) {
- throw ParquetException("Value was non-null, but has not been buffered");
- }
- *val = values_[value_offset_++];
- return true;
- }
-
- virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) {
- T val{};
- int16_t def_level = -1;
- int16_t rep_level = -1;
- bool is_null = false;
- char buffer[80];
-
- if (!Next(&val, &def_level, &rep_level, &is_null)) {
- throw ParquetException("No more values buffered");
- }
-
- if (with_levels) {
- out << " D:" << def_level << " R:" << rep_level << " ";
- if (!is_null) {
- out << "V:";
- }
- }
-
- if (is_null) {
- std::string null_fmt = format_fwf<ByteArrayType>(width);
- snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL");
- } else {
- FormatValue(&val, buffer, sizeof(buffer), width);
- }
- out << buffer;
- }
-
- private:
- // The ownership of this object is expressed through the reader_ variable in the base class.
- TypedColumnReader<DType>* typed_reader_;
-
- inline void FormatValue(void* val, char* buffer, int bufsize, int width);
-
- T* values_;
-};
-
-template <typename DType>
-inline void TypedScanner<DType>::FormatValue(void* val, char* buffer, int bufsize,
- int width) {
- std::string fmt = format_fwf<DType>(width);
- snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast<T*>(val));
-}
-
-template <>
-inline void TypedScanner<Int96Type>::FormatValue(void* val, char* buffer, int bufsize,
- int width) {
- std::string fmt = format_fwf<Int96Type>(width);
- std::string result = Int96ToString(*reinterpret_cast<Int96*>(val));
- snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
-}
-
-template <>
-inline void TypedScanner<ByteArrayType>::FormatValue(void* val, char* buffer, int bufsize,
- int width) {
- std::string fmt = format_fwf<ByteArrayType>(width);
- std::string result = ByteArrayToString(*reinterpret_cast<ByteArray*>(val));
- snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
-}
-
-template <>
-inline void TypedScanner<FLBAType>::FormatValue(void* val, char* buffer, int bufsize,
- int width) {
- std::string fmt = format_fwf<FLBAType>(width);
- std::string result = FixedLenByteArrayToString(
- *reinterpret_cast<FixedLenByteArray*>(val), descr()->type_length());
- snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
-}
-
-typedef TypedScanner<BooleanType> BoolScanner;
-typedef TypedScanner<Int32Type> Int32Scanner;
-typedef TypedScanner<Int64Type> Int64Scanner;
-typedef TypedScanner<Int96Type> Int96Scanner;
-typedef TypedScanner<FloatType> FloatScanner;
-typedef TypedScanner<DoubleType> DoubleScanner;
-typedef TypedScanner<ByteArrayType> ByteArrayScanner;
-typedef TypedScanner<FLBAType> FixedLenByteArrayScanner;
-
-template <typename RType>
-int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
- uint8_t* values, int64_t* values_buffered,
- parquet::ColumnReader* reader) {
- typedef typename RType::T Type;
- auto typed_reader = static_cast<RType*>(reader);
- auto vals = reinterpret_cast<Type*>(&values[0]);
- return typed_reader->ReadBatch(batch_size, def_levels, rep_levels, vals,
- values_buffered);
-}
-
-int64_t PARQUET_EXPORT ScanAllValues(int32_t batch_size, int16_t* def_levels,
- int16_t* rep_levels, uint8_t* values,
- int64_t* values_buffered,
- parquet::ColumnReader* reader);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdio.h>
+
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "parquet/column_reader.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+static constexpr int64_t DEFAULT_SCANNER_BATCH_SIZE = 128;
+
+class PARQUET_EXPORT Scanner {
+ public:
+ explicit Scanner(std::shared_ptr<ColumnReader> reader,
+ int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
+ : batch_size_(batch_size),
+ level_offset_(0),
+ levels_buffered_(0),
+ value_buffer_(AllocateBuffer(pool)),
+ value_offset_(0),
+ values_buffered_(0),
+ reader_(std::move(reader)) {
+ def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0);
+ rep_levels_.resize(descr()->max_repetition_level() > 0 ? batch_size_ : 0);
+ }
+
+ virtual ~Scanner() {}
+
+ static std::shared_ptr<Scanner> Make(
+ std::shared_ptr<ColumnReader> col_reader,
+ int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) = 0;
+
+ bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); }
+
+ const ColumnDescriptor* descr() const { return reader_->descr(); }
+
+ int64_t batch_size() const { return batch_size_; }
+
+ void SetBatchSize(int64_t batch_size) { batch_size_ = batch_size; }
+
+ protected:
+ int64_t batch_size_;
+
+ std::vector<int16_t> def_levels_;
+ std::vector<int16_t> rep_levels_;
+ int level_offset_;
+ int levels_buffered_;
+
+ std::shared_ptr<ResizableBuffer> value_buffer_;
+ int value_offset_;
+ int64_t values_buffered_;
+ std::shared_ptr<ColumnReader> reader_;
+};
+
+template <typename DType>
+class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner {
+ public:
+ typedef typename DType::c_type T;
+
+ explicit TypedScanner(std::shared_ptr<ColumnReader> reader,
+ int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
+ : Scanner(std::move(reader), batch_size, pool) {
+ typed_reader_ = static_cast<TypedColumnReader<DType>*>(reader_.get());
+ int value_byte_size = type_traits<DType::type_num>::value_byte_size;
+ PARQUET_THROW_NOT_OK(value_buffer_->Resize(batch_size_ * value_byte_size));
+ values_ = reinterpret_cast<T*>(value_buffer_->mutable_data());
+ }
+
+ virtual ~TypedScanner() {}
+
+ bool NextLevels(int16_t* def_level, int16_t* rep_level) {
+ if (level_offset_ == levels_buffered_) {
+ levels_buffered_ = static_cast<int>(
+ typed_reader_->ReadBatch(static_cast<int>(batch_size_), def_levels_.data(),
+ rep_levels_.data(), values_, &values_buffered_));
+
+ value_offset_ = 0;
+ level_offset_ = 0;
+ if (!levels_buffered_) {
+ return false;
+ }
+ }
+ *def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0;
+ *rep_level = descr()->max_repetition_level() > 0 ? rep_levels_[level_offset_] : 0;
+ level_offset_++;
+ return true;
+ }
+
+ bool Next(T* val, int16_t* def_level, int16_t* rep_level, bool* is_null) {
+ if (level_offset_ == levels_buffered_) {
+ if (!HasNext()) {
+ // Out of data pages
+ return false;
+ }
+ }
+
+ NextLevels(def_level, rep_level);
+ *is_null = *def_level < descr()->max_definition_level();
+
+ if (*is_null) {
+ return true;
+ }
+
+ if (value_offset_ == values_buffered_) {
+ throw ParquetException("Value was non-null, but has not been buffered");
+ }
+ *val = values_[value_offset_++];
+ return true;
+ }
+
+ // Returns true if there is a next value
+ bool NextValue(T* val, bool* is_null) {
+ if (level_offset_ == levels_buffered_) {
+ if (!HasNext()) {
+ // Out of data pages
+ return false;
+ }
+ }
+
+ // Out of values
+ int16_t def_level = -1;
+ int16_t rep_level = -1;
+ NextLevels(&def_level, &rep_level);
+ *is_null = def_level < descr()->max_definition_level();
+
+ if (*is_null) {
+ return true;
+ }
+
+ if (value_offset_ == values_buffered_) {
+ throw ParquetException("Value was non-null, but has not been buffered");
+ }
+ *val = values_[value_offset_++];
+ return true;
+ }
+
+ virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) {
+ T val{};
+ int16_t def_level = -1;
+ int16_t rep_level = -1;
+ bool is_null = false;
+ char buffer[80];
+
+ if (!Next(&val, &def_level, &rep_level, &is_null)) {
+ throw ParquetException("No more values buffered");
+ }
+
+ if (with_levels) {
+ out << " D:" << def_level << " R:" << rep_level << " ";
+ if (!is_null) {
+ out << "V:";
+ }
+ }
+
+ if (is_null) {
+ std::string null_fmt = format_fwf<ByteArrayType>(width);
+ snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL");
+ } else {
+ FormatValue(&val, buffer, sizeof(buffer), width);
+ }
+ out << buffer;
+ }
+
+ private:
+ // The ownership of this object is expressed through the reader_ variable in the base class.
+ TypedColumnReader<DType>* typed_reader_;
+
+ inline void FormatValue(void* val, char* buffer, int bufsize, int width);
+
+ T* values_;
+};
+
+template <typename DType>
+inline void TypedScanner<DType>::FormatValue(void* val, char* buffer, int bufsize,
+ int width) {
+ std::string fmt = format_fwf<DType>(width);
+ snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast<T*>(val));
+}
+
+template <>
+inline void TypedScanner<Int96Type>::FormatValue(void* val, char* buffer, int bufsize,
+ int width) {
+ std::string fmt = format_fwf<Int96Type>(width);
+ std::string result = Int96ToString(*reinterpret_cast<Int96*>(val));
+ snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
+}
+
+template <>
+inline void TypedScanner<ByteArrayType>::FormatValue(void* val, char* buffer, int bufsize,
+ int width) {
+ std::string fmt = format_fwf<ByteArrayType>(width);
+ std::string result = ByteArrayToString(*reinterpret_cast<ByteArray*>(val));
+ snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
+}
+
+template <>
+inline void TypedScanner<FLBAType>::FormatValue(void* val, char* buffer, int bufsize,
+ int width) {
+ std::string fmt = format_fwf<FLBAType>(width);
+ std::string result = FixedLenByteArrayToString(
+ *reinterpret_cast<FixedLenByteArray*>(val), descr()->type_length());
+ snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
+}
+
+typedef TypedScanner<BooleanType> BoolScanner;
+typedef TypedScanner<Int32Type> Int32Scanner;
+typedef TypedScanner<Int64Type> Int64Scanner;
+typedef TypedScanner<Int96Type> Int96Scanner;
+typedef TypedScanner<FloatType> FloatScanner;
+typedef TypedScanner<DoubleType> DoubleScanner;
+typedef TypedScanner<ByteArrayType> ByteArrayScanner;
+typedef TypedScanner<FLBAType> FixedLenByteArrayScanner;
+
+template <typename RType>
+int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+ uint8_t* values, int64_t* values_buffered,
+ parquet::ColumnReader* reader) {
+ typedef typename RType::T Type;
+ auto typed_reader = static_cast<RType*>(reader);
+ auto vals = reinterpret_cast<Type*>(&values[0]);
+ return typed_reader->ReadBatch(batch_size, def_levels, rep_levels, vals,
+ values_buffered);
+}
+
+int64_t PARQUET_EXPORT ScanAllValues(int32_t batch_size, int16_t* def_levels,
+ int16_t* rep_levels, uint8_t* values,
+ int64_t* values_buffered,
+ parquet::ColumnReader* reader);
+
+} // namespace parquet
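
For orientation, a minimal caller-side sketch of the Scanner API declared above (an annotation, not part of this diff): it assumes a readable file "example.parquet" with at least one row group and one column, and relies only on the public parquet-cpp entry points (ParquetFileReader::OpenFile, RowGroupReader::Column) plus Scanner::Make/HasNext/PrintNext from this header.

#include <iostream>
#include <memory>

#include "parquet/column_scanner.h"
#include "parquet/file_reader.h"

int main() {
  // Open the file and grab the first column of the first row group.
  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::OpenFile("example.parquet");
  std::shared_ptr<parquet::ColumnReader> column = reader->RowGroup(0)->Column(0);

  // Scanner::Make instantiates the TypedScanner matching the column's
  // physical type (Int32Scanner, ByteArrayScanner, ...).
  std::shared_ptr<parquet::Scanner> scanner = parquet::Scanner::Make(column);

  // Print every value, together with its definition/repetition levels.
  while (scanner->HasNext()) {
    scanner->PrintNext(std::cout, /*width=*/16, /*with_levels=*/true);
    std::cout << '\n';
  }
  return 0;
}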
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc
index 75df6f0c683..446fe25e644 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.cc
@@ -1,2067 +1,2067 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/column_writer.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <cstring>
-#include <map>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/buffer_builder.h"
-#include "arrow/compute/api.h"
-#include "arrow/io/memory.h"
-#include "arrow/status.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit_stream_utils.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/compression.h"
-#include "arrow/util/endian.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/rle_encoding.h"
-#include "arrow/visitor_inline.h"
-#include "parquet/column_page.h"
-#include "parquet/encoding.h"
-#include "parquet/encryption/encryption_internal.h"
-#include "parquet/encryption/internal_file_encryptor.h"
-#include "parquet/level_conversion.h"
-#include "parquet/metadata.h"
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-#include "parquet/schema.h"
-#include "parquet/statistics.h"
-#include "parquet/thrift_internal.h"
-#include "parquet/types.h"
-
-using arrow::Array;
-using arrow::ArrayData;
-using arrow::Datum;
-using arrow::Result;
-using arrow::Status;
-using arrow::BitUtil::BitWriter;
-using arrow::internal::checked_cast;
-using arrow::internal::checked_pointer_cast;
-using arrow::util::RleEncoder;
-
-namespace BitUtil = arrow::BitUtil;
-
-namespace parquet {
-
-namespace {
-
-// Visitor that extracts the value buffer from a FlatArray at a given offset.
-struct ValueBufferSlicer {
- template <typename T>
- ::arrow::enable_if_base_binary<typename T::TypeClass, Status> Visit(const T& array) {
- auto data = array.data();
- buffer_ =
- SliceBuffer(data->buffers[1], data->offset * sizeof(typename T::offset_type),
- data->length * sizeof(typename T::offset_type));
- return Status::OK();
- }
-
- template <typename T>
- ::arrow::enable_if_fixed_size_binary<typename T::TypeClass, Status> Visit(
- const T& array) {
- auto data = array.data();
- buffer_ = SliceBuffer(data->buffers[1], data->offset * array.byte_width(),
- data->length * array.byte_width());
- return Status::OK();
- }
-
- template <typename T>
- ::arrow::enable_if_t<::arrow::has_c_type<typename T::TypeClass>::value &&
- !std::is_same<BooleanType, typename T::TypeClass>::value,
- Status>
- Visit(const T& array) {
- auto data = array.data();
- buffer_ = SliceBuffer(
- data->buffers[1],
- ::arrow::TypeTraits<typename T::TypeClass>::bytes_required(data->offset),
- ::arrow::TypeTraits<typename T::TypeClass>::bytes_required(data->length));
- return Status::OK();
- }
-
- Status Visit(const ::arrow::BooleanArray& array) {
- auto data = array.data();
- if (BitUtil::IsMultipleOf8(data->offset)) {
- buffer_ = SliceBuffer(data->buffers[1], BitUtil::BytesForBits(data->offset),
- BitUtil::BytesForBits(data->length));
- return Status::OK();
- }
- PARQUET_ASSIGN_OR_THROW(buffer_,
- ::arrow::internal::CopyBitmap(pool_, data->buffers[1]->data(),
- data->offset, data->length));
- return Status::OK();
- }
-#define NOT_IMPLEMENTED_VISIT(ArrowTypePrefix) \
- Status Visit(const ::arrow::ArrowTypePrefix##Array& array) { \
- return Status::NotImplemented("Slicing not implemented for " #ArrowTypePrefix); \
- }
-
- NOT_IMPLEMENTED_VISIT(Null);
- NOT_IMPLEMENTED_VISIT(Union);
- NOT_IMPLEMENTED_VISIT(List);
- NOT_IMPLEMENTED_VISIT(LargeList);
- NOT_IMPLEMENTED_VISIT(Struct);
- NOT_IMPLEMENTED_VISIT(FixedSizeList);
- NOT_IMPLEMENTED_VISIT(Dictionary);
- NOT_IMPLEMENTED_VISIT(Extension);
-
-#undef NOT_IMPLEMENTED_VISIT
-
- MemoryPool* pool_;
- std::shared_ptr<Buffer> buffer_;
-};
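// Illustrative arithmetic (annotation, not upstream code): for an
// ::arrow::Int32Array with offset 3 and length 5, the c-type visitor above
// computes bytes_required(3) = 12 and bytes_required(5) = 20, i.e. it slices
// 20 bytes starting at byte 12 of the values buffer, avoiding any copy.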
-
-internal::LevelInfo ComputeLevelInfo(const ColumnDescriptor* descr) {
- internal::LevelInfo level_info;
- level_info.def_level = descr->max_definition_level();
- level_info.rep_level = descr->max_repetition_level();
-
- int16_t min_spaced_def_level = descr->max_definition_level();
- const ::parquet::schema::Node* node = descr->schema_node().get();
- while (node != nullptr && !node->is_repeated()) {
- if (node->is_optional()) {
- min_spaced_def_level--;
- }
- node = node->parent();
- }
- level_info.repeated_ancestor_def_level = min_spaced_def_level;
- return level_info;
-}
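// Worked example (annotation, not upstream code): for the schema
//   optional group a { repeated group b { optional int32 c } }
// column c has max_definition_level = 3. The loop above walks upward from c,
// decrementing once for the optional leaf and stopping at the repeated node b,
// so repeated_ancestor_def_level = 2: any def_level >= 2 means a (possibly
// null) slot exists for the leaf value.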
-
-template <class T>
-inline const T* AddIfNotNull(const T* base, int64_t offset) {
- if (base != nullptr) {
- return base + offset;
- }
- return nullptr;
-}
-
-} // namespace
-
-LevelEncoder::LevelEncoder() {}
-LevelEncoder::~LevelEncoder() {}
-
-void LevelEncoder::Init(Encoding::type encoding, int16_t max_level,
- int num_buffered_values, uint8_t* data, int data_size) {
- bit_width_ = BitUtil::Log2(max_level + 1);
- encoding_ = encoding;
- switch (encoding) {
- case Encoding::RLE: {
- rle_encoder_.reset(new RleEncoder(data, data_size, bit_width_));
- break;
- }
- case Encoding::BIT_PACKED: {
- int num_bytes =
- static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width_));
- bit_packed_encoder_.reset(new BitWriter(data, num_bytes));
- break;
- }
- default:
- throw ParquetException("Unknown encoding type for levels.");
- }
-}
-
-int LevelEncoder::MaxBufferSize(Encoding::type encoding, int16_t max_level,
- int num_buffered_values) {
- int bit_width = BitUtil::Log2(max_level + 1);
- int num_bytes = 0;
- switch (encoding) {
- case Encoding::RLE: {
- // TODO: Due to the way we currently check whether the buffer is full enough,
- // we need to keep MinBufferSize as headroom.
- num_bytes = RleEncoder::MaxBufferSize(bit_width, num_buffered_values) +
- RleEncoder::MinBufferSize(bit_width);
- break;
- }
- case Encoding::BIT_PACKED: {
- num_bytes =
- static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width));
- break;
- }
- default:
- throw ParquetException("Unknown encoding type for levels.");
- }
- return num_bytes;
-}
-
-int LevelEncoder::Encode(int batch_size, const int16_t* levels) {
- int num_encoded = 0;
- if (!rle_encoder_ && !bit_packed_encoder_) {
- throw ParquetException("Level encoders are not initialized.");
- }
-
- if (encoding_ == Encoding::RLE) {
- for (int i = 0; i < batch_size; ++i) {
- if (!rle_encoder_->Put(*(levels + i))) {
- break;
- }
- ++num_encoded;
- }
- rle_encoder_->Flush();
- rle_length_ = rle_encoder_->len();
- } else {
- for (int i = 0; i < batch_size; ++i) {
- if (!bit_packed_encoder_->PutValue(*(levels + i), bit_width_)) {
- break;
- }
- ++num_encoded;
- }
- bit_packed_encoder_->Flush();
- }
- return num_encoded;
-}
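// Worked example (annotation, not upstream code): with max_level = 1 the bit
// width is Log2(2) = 1, so a batch of 1000 definition levels that are all 1
// collapses to a single RLE run: one varint run header plus one payload byte,
// far below the MaxBufferSize() bound computed above.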
-
-// ----------------------------------------------------------------------
-// PageWriter implementation
-
-// This subclass delimits pages appearing in a serialized stream, each preceded
-// by a serialized Thrift format::PageHeader indicating the type of each page
-// and the page metadata.
-class SerializedPageWriter : public PageWriter {
- public:
- SerializedPageWriter(std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
- int compression_level, ColumnChunkMetaDataBuilder* metadata,
- int16_t row_group_ordinal, int16_t column_chunk_ordinal,
- MemoryPool* pool = ::arrow::default_memory_pool(),
- std::shared_ptr<Encryptor> meta_encryptor = nullptr,
- std::shared_ptr<Encryptor> data_encryptor = nullptr)
- : sink_(std::move(sink)),
- metadata_(metadata),
- pool_(pool),
- num_values_(0),
- dictionary_page_offset_(0),
- data_page_offset_(0),
- total_uncompressed_size_(0),
- total_compressed_size_(0),
- page_ordinal_(0),
- row_group_ordinal_(row_group_ordinal),
- column_ordinal_(column_chunk_ordinal),
- meta_encryptor_(std::move(meta_encryptor)),
- data_encryptor_(std::move(data_encryptor)),
- encryption_buffer_(AllocateBuffer(pool, 0)) {
- if (data_encryptor_ != nullptr || meta_encryptor_ != nullptr) {
- InitEncryption();
- }
- compressor_ = GetCodec(codec, compression_level);
- thrift_serializer_.reset(new ThriftSerializer);
- }
-
- int64_t WriteDictionaryPage(const DictionaryPage& page) override {
- int64_t uncompressed_size = page.size();
- std::shared_ptr<Buffer> compressed_data;
- if (has_compressor()) {
- auto buffer = std::static_pointer_cast<ResizableBuffer>(
- AllocateBuffer(pool_, uncompressed_size));
- Compress(*(page.buffer().get()), buffer.get());
- compressed_data = std::static_pointer_cast<Buffer>(buffer);
- } else {
- compressed_data = page.buffer();
- }
-
- format::DictionaryPageHeader dict_page_header;
- dict_page_header.__set_num_values(page.num_values());
- dict_page_header.__set_encoding(ToThrift(page.encoding()));
- dict_page_header.__set_is_sorted(page.is_sorted());
-
- const uint8_t* output_data_buffer = compressed_data->data();
- int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
-
- if (data_encryptor_.get()) {
- UpdateEncryption(encryption::kDictionaryPage);
- PARQUET_THROW_NOT_OK(encryption_buffer_->Resize(
- data_encryptor_->CiphertextSizeDelta() + output_data_len, false));
- output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len,
- encryption_buffer_->mutable_data());
- output_data_buffer = encryption_buffer_->data();
- }
-
- format::PageHeader page_header;
- page_header.__set_type(format::PageType::DICTIONARY_PAGE);
- page_header.__set_uncompressed_page_size(static_cast<int32_t>(uncompressed_size));
- page_header.__set_compressed_page_size(static_cast<int32_t>(output_data_len));
- page_header.__set_dictionary_page_header(dict_page_header);
- // TODO(PARQUET-594) crc checksum
-
- PARQUET_ASSIGN_OR_THROW(int64_t start_pos, sink_->Tell());
- if (dictionary_page_offset_ == 0) {
- dictionary_page_offset_ = start_pos;
- }
-
- if (meta_encryptor_) {
- UpdateEncryption(encryption::kDictionaryPageHeader);
- }
- const int64_t header_size =
- thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_);
-
- PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len));
-
- total_uncompressed_size_ += uncompressed_size + header_size;
- total_compressed_size_ += output_data_len + header_size;
- ++dict_encoding_stats_[page.encoding()];
- return uncompressed_size + header_size;
- }
-
- void Close(bool has_dictionary, bool fallback) override {
- if (meta_encryptor_ != nullptr) {
- UpdateEncryption(encryption::kColumnMetaData);
- }
- // index_page_offset = -1 since index pages are not supported
- metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_,
- total_compressed_size_, total_uncompressed_size_, has_dictionary,
- fallback, dict_encoding_stats_, data_encoding_stats_,
- meta_encryptor_);
- // Write metadata at end of column chunk
- metadata_->WriteTo(sink_.get());
- }
-
- /**
- * Compress a buffer.
- */
- void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) override {
- DCHECK(compressor_ != nullptr);
-
- // Compress the data
- int64_t max_compressed_size =
- compressor_->MaxCompressedLen(src_buffer.size(), src_buffer.data());
-
- // Pass shrink_to_fit = false to Resize: the underlying buffer only keeps
- // growing, and resizing to a smaller size does not reallocate.
- PARQUET_THROW_NOT_OK(dest_buffer->Resize(max_compressed_size, false));
-
- PARQUET_ASSIGN_OR_THROW(
- int64_t compressed_size,
- compressor_->Compress(src_buffer.size(), src_buffer.data(), max_compressed_size,
- dest_buffer->mutable_data()));
- PARQUET_THROW_NOT_OK(dest_buffer->Resize(compressed_size, false));
- }
-
- int64_t WriteDataPage(const DataPage& page) override {
- const int64_t uncompressed_size = page.uncompressed_size();
- std::shared_ptr<Buffer> compressed_data = page.buffer();
- const uint8_t* output_data_buffer = compressed_data->data();
- int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
-
- if (data_encryptor_.get()) {
- PARQUET_THROW_NOT_OK(encryption_buffer_->Resize(
- data_encryptor_->CiphertextSizeDelta() + output_data_len, false));
- UpdateEncryption(encryption::kDataPage);
- output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len,
- encryption_buffer_->mutable_data());
- output_data_buffer = encryption_buffer_->data();
- }
-
- format::PageHeader page_header;
- page_header.__set_uncompressed_page_size(static_cast<int32_t>(uncompressed_size));
- page_header.__set_compressed_page_size(static_cast<int32_t>(output_data_len));
- // TODO(PARQUET-594) crc checksum
-
- if (page.type() == PageType::DATA_PAGE) {
- const DataPageV1& v1_page = checked_cast<const DataPageV1&>(page);
- SetDataPageHeader(page_header, v1_page);
- } else if (page.type() == PageType::DATA_PAGE_V2) {
- const DataPageV2& v2_page = checked_cast<const DataPageV2&>(page);
- SetDataPageV2Header(page_header, v2_page);
- } else {
- throw ParquetException("Unexpected page type");
- }
-
- PARQUET_ASSIGN_OR_THROW(int64_t start_pos, sink_->Tell());
- if (page_ordinal_ == 0) {
- data_page_offset_ = start_pos;
- }
-
- if (meta_encryptor_) {
- UpdateEncryption(encryption::kDataPageHeader);
- }
- const int64_t header_size =
- thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_);
- PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len));
-
- total_uncompressed_size_ += uncompressed_size + header_size;
- total_compressed_size_ += output_data_len + header_size;
- num_values_ += page.num_values();
- ++data_encoding_stats_[page.encoding()];
- ++page_ordinal_;
- return uncompressed_size + header_size;
- }
-
- void SetDataPageHeader(format::PageHeader& page_header, const DataPageV1& page) {
- format::DataPageHeader data_page_header;
- data_page_header.__set_num_values(page.num_values());
- data_page_header.__set_encoding(ToThrift(page.encoding()));
- data_page_header.__set_definition_level_encoding(
- ToThrift(page.definition_level_encoding()));
- data_page_header.__set_repetition_level_encoding(
- ToThrift(page.repetition_level_encoding()));
- data_page_header.__set_statistics(ToThrift(page.statistics()));
-
- page_header.__set_type(format::PageType::DATA_PAGE);
- page_header.__set_data_page_header(data_page_header);
- }
-
- void SetDataPageV2Header(format::PageHeader& page_header, const DataPageV2& page) {
- format::DataPageHeaderV2 data_page_header;
- data_page_header.__set_num_values(page.num_values());
- data_page_header.__set_num_nulls(page.num_nulls());
- data_page_header.__set_num_rows(page.num_rows());
- data_page_header.__set_encoding(ToThrift(page.encoding()));
-
- data_page_header.__set_definition_levels_byte_length(
- page.definition_levels_byte_length());
- data_page_header.__set_repetition_levels_byte_length(
- page.repetition_levels_byte_length());
-
- data_page_header.__set_is_compressed(page.is_compressed());
- data_page_header.__set_statistics(ToThrift(page.statistics()));
-
- page_header.__set_type(format::PageType::DATA_PAGE_V2);
- page_header.__set_data_page_header_v2(data_page_header);
- }
-
- bool has_compressor() override { return (compressor_ != nullptr); }
-
- int64_t num_values() { return num_values_; }
-
- int64_t dictionary_page_offset() { return dictionary_page_offset_; }
-
- int64_t data_page_offset() { return data_page_offset_; }
-
- int64_t total_compressed_size() { return total_compressed_size_; }
-
- int64_t total_uncompressed_size() { return total_uncompressed_size_; }
-
- private:
- // To allow UpdateEncryption on Close
- friend class BufferedPageWriter;
-
- void InitEncryption() {
- // Prepare the AAD for quick update later.
- if (data_encryptor_ != nullptr) {
- data_page_aad_ = encryption::CreateModuleAad(
- data_encryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_,
- column_ordinal_, kNonPageOrdinal);
- }
- if (meta_encryptor_ != nullptr) {
- data_page_header_aad_ = encryption::CreateModuleAad(
- meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_,
- column_ordinal_, kNonPageOrdinal);
- }
- }
-
- void UpdateEncryption(int8_t module_type) {
- switch (module_type) {
- case encryption::kColumnMetaData: {
- meta_encryptor_->UpdateAad(encryption::CreateModuleAad(
- meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
- kNonPageOrdinal));
- break;
- }
- case encryption::kDataPage: {
- encryption::QuickUpdatePageAad(data_page_aad_, page_ordinal_);
- data_encryptor_->UpdateAad(data_page_aad_);
- break;
- }
- case encryption::kDataPageHeader: {
- encryption::QuickUpdatePageAad(data_page_header_aad_, page_ordinal_);
- meta_encryptor_->UpdateAad(data_page_header_aad_);
- break;
- }
- case encryption::kDictionaryPageHeader: {
- meta_encryptor_->UpdateAad(encryption::CreateModuleAad(
- meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
- kNonPageOrdinal));
- break;
- }
- case encryption::kDictionaryPage: {
- data_encryptor_->UpdateAad(encryption::CreateModuleAad(
- data_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
- kNonPageOrdinal));
- break;
- }
- default:
- throw ParquetException("Unknown module type in UpdateEncryption");
- }
- }
-
- std::shared_ptr<ArrowOutputStream> sink_;
- ColumnChunkMetaDataBuilder* metadata_;
- MemoryPool* pool_;
- int64_t num_values_;
- int64_t dictionary_page_offset_;
- int64_t data_page_offset_;
- int64_t total_uncompressed_size_;
- int64_t total_compressed_size_;
- int16_t page_ordinal_;
- int16_t row_group_ordinal_;
- int16_t column_ordinal_;
-
- std::unique_ptr<ThriftSerializer> thrift_serializer_;
-
- // Compression codec to use.
- std::unique_ptr<::arrow::util::Codec> compressor_;
-
- std::string data_page_aad_;
- std::string data_page_header_aad_;
-
- std::shared_ptr<Encryptor> meta_encryptor_;
- std::shared_ptr<Encryptor> data_encryptor_;
-
- std::shared_ptr<ResizableBuffer> encryption_buffer_;
-
- std::map<Encoding::type, int32_t> dict_encoding_stats_;
- std::map<Encoding::type, int32_t> data_encoding_stats_;
-};
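// Resulting on-disk layout (annotation, not upstream code): each Write*Page
// call above appends one header/body pair to the sink, so a column chunk is
// serialized as
//
//   [Thrift PageHeader][dictionary page]   (at most one, first if present)
//   [Thrift PageHeader][data page 0]
//   [Thrift PageHeader][data page 1] ...
//   [Thrift ColumnMetaData]                (appended by Close())
//
// with dictionary_page_offset_ / data_page_offset_ recording the absolute
// sink positions of the first header of each kind.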
-
-// This implementation of the PageWriter writes to the final sink on Close().
-class BufferedPageWriter : public PageWriter {
- public:
- BufferedPageWriter(std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
- int compression_level, ColumnChunkMetaDataBuilder* metadata,
- int16_t row_group_ordinal, int16_t current_column_ordinal,
- MemoryPool* pool = ::arrow::default_memory_pool(),
- std::shared_ptr<Encryptor> meta_encryptor = nullptr,
- std::shared_ptr<Encryptor> data_encryptor = nullptr)
- : final_sink_(std::move(sink)), metadata_(metadata), has_dictionary_pages_(false) {
- in_memory_sink_ = CreateOutputStream(pool);
- pager_ = std::unique_ptr<SerializedPageWriter>(
- new SerializedPageWriter(in_memory_sink_, codec, compression_level, metadata,
- row_group_ordinal, current_column_ordinal, pool,
- std::move(meta_encryptor), std::move(data_encryptor)));
- }
-
- int64_t WriteDictionaryPage(const DictionaryPage& page) override {
- has_dictionary_pages_ = true;
- return pager_->WriteDictionaryPage(page);
- }
-
- void Close(bool has_dictionary, bool fallback) override {
- if (pager_->meta_encryptor_ != nullptr) {
- pager_->UpdateEncryption(encryption::kColumnMetaData);
- }
- // index_page_offset = -1 since index pages are not supported
- PARQUET_ASSIGN_OR_THROW(int64_t final_position, final_sink_->Tell());
- // dictionary page offset should be 0 iff there are no dictionary pages
- auto dictionary_page_offset =
- has_dictionary_pages_ ? pager_->dictionary_page_offset() + final_position : 0;
- metadata_->Finish(pager_->num_values(), dictionary_page_offset, -1,
- pager_->data_page_offset() + final_position,
- pager_->total_compressed_size(), pager_->total_uncompressed_size(),
- has_dictionary, fallback, pager_->dict_encoding_stats_,
- pager_->data_encoding_stats_, pager_->meta_encryptor_);
-
- // Write metadata at end of column chunk
- metadata_->WriteTo(in_memory_sink_.get());
-
- // flush everything to the serialized sink
- PARQUET_ASSIGN_OR_THROW(auto buffer, in_memory_sink_->Finish());
- PARQUET_THROW_NOT_OK(final_sink_->Write(buffer));
- }
-
- int64_t WriteDataPage(const DataPage& page) override {
- return pager_->WriteDataPage(page);
- }
-
- void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) override {
- pager_->Compress(src_buffer, dest_buffer);
- }
-
- bool has_compressor() override { return pager_->has_compressor(); }
-
- private:
- std::shared_ptr<ArrowOutputStream> final_sink_;
- ColumnChunkMetaDataBuilder* metadata_;
- std::shared_ptr<::arrow::io::BufferOutputStream> in_memory_sink_;
- std::unique_ptr<SerializedPageWriter> pager_;
- bool has_dictionary_pages_;
-};
-
-std::unique_ptr<PageWriter> PageWriter::Open(
- std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
- int compression_level, ColumnChunkMetaDataBuilder* metadata,
- int16_t row_group_ordinal, int16_t column_chunk_ordinal, MemoryPool* pool,
- bool buffered_row_group, std::shared_ptr<Encryptor> meta_encryptor,
- std::shared_ptr<Encryptor> data_encryptor) {
- if (buffered_row_group) {
- return std::unique_ptr<PageWriter>(
- new BufferedPageWriter(std::move(sink), codec, compression_level, metadata,
- row_group_ordinal, column_chunk_ordinal, pool,
- std::move(meta_encryptor), std::move(data_encryptor)));
- } else {
- return std::unique_ptr<PageWriter>(
- new SerializedPageWriter(std::move(sink), codec, compression_level, metadata,
- row_group_ordinal, column_chunk_ordinal, pool,
- std::move(meta_encryptor), std::move(data_encryptor)));
- }
-}
-
-// ----------------------------------------------------------------------
-// ColumnWriter
-
-const std::shared_ptr<WriterProperties>& default_writer_properties() {
- static std::shared_ptr<WriterProperties> default_writer_properties =
- WriterProperties::Builder().build();
- return default_writer_properties;
-}
-
-class ColumnWriterImpl {
- public:
- ColumnWriterImpl(ColumnChunkMetaDataBuilder* metadata,
- std::unique_ptr<PageWriter> pager, const bool use_dictionary,
- Encoding::type encoding, const WriterProperties* properties)
- : metadata_(metadata),
- descr_(metadata->descr()),
- level_info_(ComputeLevelInfo(metadata->descr())),
- pager_(std::move(pager)),
- has_dictionary_(use_dictionary),
- encoding_(encoding),
- properties_(properties),
- allocator_(properties->memory_pool()),
- num_buffered_values_(0),
- num_buffered_encoded_values_(0),
- rows_written_(0),
- total_bytes_written_(0),
- total_compressed_bytes_(0),
- closed_(false),
- fallback_(false),
- definition_levels_sink_(allocator_),
- repetition_levels_sink_(allocator_) {
- definition_levels_rle_ =
- std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
- repetition_levels_rle_ =
- std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
- uncompressed_data_ =
- std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
-
- if (pager_->has_compressor()) {
- compressor_temp_buffer_ =
- std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
- }
- }
-
- virtual ~ColumnWriterImpl() = default;
-
- int64_t Close();
-
- protected:
- virtual std::shared_ptr<Buffer> GetValuesBuffer() = 0;
-
- // Serializes Dictionary Page if enabled
- virtual void WriteDictionaryPage() = 0;
-
- // Plain-encoded statistics of the current page
- virtual EncodedStatistics GetPageStatistics() = 0;
-
- // Plain-encoded statistics of the whole chunk
- virtual EncodedStatistics GetChunkStatistics() = 0;
-
- // Merges page statistics into chunk statistics, then resets the values
- virtual void ResetPageStatistics() = 0;
-
- // Adds Data Pages to an in-memory buffer in dictionary encoding mode
- // Serializes the Data Pages in other encoding modes
- void AddDataPage();
-
- void BuildDataPageV1(int64_t definition_levels_rle_size,
- int64_t repetition_levels_rle_size, int64_t uncompressed_size,
- const std::shared_ptr<Buffer>& values);
- void BuildDataPageV2(int64_t definition_levels_rle_size,
- int64_t repetition_levels_rle_size, int64_t uncompressed_size,
- const std::shared_ptr<Buffer>& values);
-
- // Serializes Data Pages
- void WriteDataPage(const DataPage& page) {
- total_bytes_written_ += pager_->WriteDataPage(page);
- }
-
- // Write multiple definition levels
- void WriteDefinitionLevels(int64_t num_levels, const int16_t* levels) {
- DCHECK(!closed_);
- PARQUET_THROW_NOT_OK(
- definition_levels_sink_.Append(levels, sizeof(int16_t) * num_levels));
- }
-
- // Write multiple repetition levels
- void WriteRepetitionLevels(int64_t num_levels, const int16_t* levels) {
- DCHECK(!closed_);
- PARQUET_THROW_NOT_OK(
- repetition_levels_sink_.Append(levels, sizeof(int16_t) * num_levels));
- }
-
- // RLE encode the src_buffer into dest_buffer and return the encoded size
- int64_t RleEncodeLevels(const void* src_buffer, ResizableBuffer* dest_buffer,
- int16_t max_level, bool include_length_prefix = true);
-
- // Serialize the buffered Data Pages
- void FlushBufferedDataPages();
-
- ColumnChunkMetaDataBuilder* metadata_;
- const ColumnDescriptor* descr_;
- // scratch buffer if validity bits need to be recalculated.
- std::shared_ptr<ResizableBuffer> bits_buffer_;
- const internal::LevelInfo level_info_;
-
- std::unique_ptr<PageWriter> pager_;
-
- bool has_dictionary_;
- Encoding::type encoding_;
- const WriterProperties* properties_;
-
- LevelEncoder level_encoder_;
-
- MemoryPool* allocator_;
-
- // The total number of values stored in the data page. This is the maximum of
- // the number of encoded definition levels or encoded values. For
- // non-repeated, required columns, this is equal to the number of encoded
- // values. For repeated or optional values, there may be fewer data values
- // than levels, and this tells you how many encoded levels there are in that
- // case.
- int64_t num_buffered_values_;
-
- // The total number of stored values. For repeated or optional values, this
- // number may be lower than num_buffered_values_.
- int64_t num_buffered_encoded_values_;
-
- // Total number of rows written with this ColumnWriter
- int rows_written_;
-
- // Records the total number of uncompressed bytes written by the serializer
- int64_t total_bytes_written_;
-
- // Records the current number of compressed bytes in a column
- int64_t total_compressed_bytes_;
-
- // Flag to check if the Writer has been closed
- bool closed_;
-
- // Flag to infer if dictionary encoding has fallen back to PLAIN
- bool fallback_;
-
- ::arrow::BufferBuilder definition_levels_sink_;
- ::arrow::BufferBuilder repetition_levels_sink_;
-
- std::shared_ptr<ResizableBuffer> definition_levels_rle_;
- std::shared_ptr<ResizableBuffer> repetition_levels_rle_;
-
- std::shared_ptr<ResizableBuffer> uncompressed_data_;
- std::shared_ptr<ResizableBuffer> compressor_temp_buffer_;
-
- std::vector<std::unique_ptr<DataPage>> data_pages_;
-
- private:
- void InitSinks() {
- definition_levels_sink_.Rewind(0);
- repetition_levels_sink_.Rewind(0);
- }
-
- // Concatenate the encoded levels and values into one buffer
- void ConcatenateBuffers(int64_t definition_levels_rle_size,
- int64_t repetition_levels_rle_size,
- const std::shared_ptr<Buffer>& values, uint8_t* combined) {
- memcpy(combined, repetition_levels_rle_->data(), repetition_levels_rle_size);
- combined += repetition_levels_rle_size;
- memcpy(combined, definition_levels_rle_->data(), definition_levels_rle_size);
- combined += definition_levels_rle_size;
- memcpy(combined, values->data(), values->size());
- }
-};
-
-// return the size of the encoded buffer
-int64_t ColumnWriterImpl::RleEncodeLevels(const void* src_buffer,
- ResizableBuffer* dest_buffer, int16_t max_level,
- bool include_length_prefix) {
- // V1 DataPage includes the length of the RLE level as a prefix.
- int32_t prefix_size = include_length_prefix ? sizeof(int32_t) : 0;
-
- // TODO: This only works due to some RLE specifics
- int64_t rle_size = LevelEncoder::MaxBufferSize(Encoding::RLE, max_level,
- static_cast<int>(num_buffered_values_)) +
- prefix_size;
-
- // Pass shrink_to_fit = false to Resize: the underlying buffer only keeps
- // growing, and resizing to a smaller size does not reallocate.
- PARQUET_THROW_NOT_OK(dest_buffer->Resize(rle_size, false));
-
- level_encoder_.Init(Encoding::RLE, max_level, static_cast<int>(num_buffered_values_),
- dest_buffer->mutable_data() + prefix_size,
- static_cast<int>(dest_buffer->size() - prefix_size));
- int encoded = level_encoder_.Encode(static_cast<int>(num_buffered_values_),
- reinterpret_cast<const int16_t*>(src_buffer));
- DCHECK_EQ(encoded, num_buffered_values_);
-
- if (include_length_prefix) {
- reinterpret_cast<int32_t*>(dest_buffer->mutable_data())[0] = level_encoder_.len();
- }
-
- return level_encoder_.len() + prefix_size;
-}
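// Buffer layout produced above (annotation, not upstream code):
//
//   V1 data page levels: [int32 byte length][RLE/bit-packed hybrid runs]
//   V2 data page levels: [RLE/bit-packed hybrid runs]
//
// V2 can drop the length prefix because the page header itself carries
// definition_levels_byte_length / repetition_levels_byte_length (see
// SetDataPageV2Header above).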
-
-void ColumnWriterImpl::AddDataPage() {
- int64_t definition_levels_rle_size = 0;
- int64_t repetition_levels_rle_size = 0;
-
- std::shared_ptr<Buffer> values = GetValuesBuffer();
- bool is_v1_data_page = properties_->data_page_version() == ParquetDataPageVersion::V1;
-
- if (descr_->max_definition_level() > 0) {
- definition_levels_rle_size = RleEncodeLevels(
- definition_levels_sink_.data(), definition_levels_rle_.get(),
- descr_->max_definition_level(), /*include_length_prefix=*/is_v1_data_page);
- }
-
- if (descr_->max_repetition_level() > 0) {
- repetition_levels_rle_size = RleEncodeLevels(
- repetition_levels_sink_.data(), repetition_levels_rle_.get(),
- descr_->max_repetition_level(), /*include_length_prefix=*/is_v1_data_page);
- }
-
- int64_t uncompressed_size =
- definition_levels_rle_size + repetition_levels_rle_size + values->size();
-
- if (is_v1_data_page) {
- BuildDataPageV1(definition_levels_rle_size, repetition_levels_rle_size,
- uncompressed_size, values);
- } else {
- BuildDataPageV2(definition_levels_rle_size, repetition_levels_rle_size,
- uncompressed_size, values);
- }
-
- // Re-initialize the sinks for next Page.
- InitSinks();
- num_buffered_values_ = 0;
- num_buffered_encoded_values_ = 0;
-}
-
-void ColumnWriterImpl::BuildDataPageV1(int64_t definition_levels_rle_size,
- int64_t repetition_levels_rle_size,
- int64_t uncompressed_size,
- const std::shared_ptr<Buffer>& values) {
- // Pass shrink_to_fit = false to Resize: the underlying buffer only keeps
- // growing, and resizing to a smaller size does not reallocate.
- PARQUET_THROW_NOT_OK(uncompressed_data_->Resize(uncompressed_size, false));
- ConcatenateBuffers(definition_levels_rle_size, repetition_levels_rle_size, values,
- uncompressed_data_->mutable_data());
-
- EncodedStatistics page_stats = GetPageStatistics();
- page_stats.ApplyStatSizeLimits(properties_->max_statistics_size(descr_->path()));
- page_stats.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
- ResetPageStatistics();
-
- std::shared_ptr<Buffer> compressed_data;
- if (pager_->has_compressor()) {
- pager_->Compress(*(uncompressed_data_.get()), compressor_temp_buffer_.get());
- compressed_data = compressor_temp_buffer_;
- } else {
- compressed_data = uncompressed_data_;
- }
-
- // Write the page to OutputStream eagerly if there is no dictionary or
- // if dictionary encoding has fallen back to PLAIN
- if (has_dictionary_ && !fallback_) { // Save pages until end of dictionary encoding
- PARQUET_ASSIGN_OR_THROW(
- auto compressed_data_copy,
- compressed_data->CopySlice(0, compressed_data->size(), allocator_));
- std::unique_ptr<DataPage> page_ptr(new DataPageV1(
- compressed_data_copy, static_cast<int32_t>(num_buffered_values_), encoding_,
- Encoding::RLE, Encoding::RLE, uncompressed_size, page_stats));
- total_compressed_bytes_ += page_ptr->size() + sizeof(format::PageHeader);
-
- data_pages_.push_back(std::move(page_ptr));
- } else { // Eagerly write pages
- DataPageV1 page(compressed_data, static_cast<int32_t>(num_buffered_values_),
- encoding_, Encoding::RLE, Encoding::RLE, uncompressed_size,
- page_stats);
- WriteDataPage(page);
- }
-}
-
-void ColumnWriterImpl::BuildDataPageV2(int64_t definition_levels_rle_size,
- int64_t repetition_levels_rle_size,
- int64_t uncompressed_size,
- const std::shared_ptr<Buffer>& values) {
- // Compress the values if needed. Repetition and definition levels are uncompressed in
- // V2.
- std::shared_ptr<Buffer> compressed_values;
- if (pager_->has_compressor()) {
- pager_->Compress(*values, compressor_temp_buffer_.get());
- compressed_values = compressor_temp_buffer_;
- } else {
- compressed_values = values;
- }
-
- // Concatenate uncompressed levels and the possibly compressed values
- int64_t combined_size =
- definition_levels_rle_size + repetition_levels_rle_size + compressed_values->size();
- std::shared_ptr<ResizableBuffer> combined = AllocateBuffer(allocator_, combined_size);
-
- ConcatenateBuffers(definition_levels_rle_size, repetition_levels_rle_size,
- compressed_values, combined->mutable_data());
-
- EncodedStatistics page_stats = GetPageStatistics();
- page_stats.ApplyStatSizeLimits(properties_->max_statistics_size(descr_->path()));
- page_stats.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
- ResetPageStatistics();
-
- int32_t num_values = static_cast<int32_t>(num_buffered_values_);
- int32_t null_count = static_cast<int32_t>(page_stats.null_count);
- int32_t def_levels_byte_length = static_cast<int32_t>(definition_levels_rle_size);
- int32_t rep_levels_byte_length = static_cast<int32_t>(repetition_levels_rle_size);
-
- // Write the page to OutputStream eagerly if there is no dictionary or
- // if dictionary encoding has fallen back to PLAIN
- if (has_dictionary_ && !fallback_) { // Save pages until end of dictionary encoding
- PARQUET_ASSIGN_OR_THROW(auto data_copy,
- combined->CopySlice(0, combined->size(), allocator_));
- std::unique_ptr<DataPage> page_ptr(new DataPageV2(
- combined, num_values, null_count, num_values, encoding_, def_levels_byte_length,
- rep_levels_byte_length, uncompressed_size, pager_->has_compressor()));
- total_compressed_bytes_ += page_ptr->size() + sizeof(format::PageHeader);
- data_pages_.push_back(std::move(page_ptr));
- } else {
- DataPageV2 page(combined, num_values, null_count, num_values, encoding_,
- def_levels_byte_length, rep_levels_byte_length, uncompressed_size,
- pager_->has_compressor());
- WriteDataPage(page);
- }
-}
-
-int64_t ColumnWriterImpl::Close() {
- if (!closed_) {
- closed_ = true;
- if (has_dictionary_ && !fallback_) {
- WriteDictionaryPage();
- }
-
- FlushBufferedDataPages();
-
- EncodedStatistics chunk_statistics = GetChunkStatistics();
- chunk_statistics.ApplyStatSizeLimits(
- properties_->max_statistics_size(descr_->path()));
- chunk_statistics.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
-
- // Write stats only if the column has at least one row written
- if (rows_written_ > 0 && chunk_statistics.is_set()) {
- metadata_->SetStatistics(chunk_statistics);
- }
- pager_->Close(has_dictionary_, fallback_);
- }
-
- return total_bytes_written_;
-}
-
-void ColumnWriterImpl::FlushBufferedDataPages() {
- // Write all outstanding data to a new page
- if (num_buffered_values_ > 0) {
- AddDataPage();
- }
- for (const auto& page_ptr : data_pages_) {
- WriteDataPage(*page_ptr);
- }
- data_pages_.clear();
- total_compressed_bytes_ = 0;
-}
-
-// ----------------------------------------------------------------------
-// TypedColumnWriter
-
-template <typename Action>
-inline void DoInBatches(int64_t total, int64_t batch_size, Action&& action) {
- int64_t num_batches = static_cast<int>(total / batch_size);
- for (int round = 0; round < num_batches; round++) {
- action(round * batch_size, batch_size);
- }
- // Write the remaining values
- if (total % batch_size > 0) {
- action(num_batches * batch_size, total % batch_size);
- }
-}
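// Worked example (annotation, not upstream code): DoInBatches(1000, 300, fn)
// calls fn(0, 300), fn(300, 300) and fn(600, 300) for the three full batches,
// then fn(900, 100) for the remainder, visiting all 1000 values exactly once.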
-
-bool DictionaryDirectWriteSupported(const ::arrow::Array& array) {
- DCHECK_EQ(array.type_id(), ::arrow::Type::DICTIONARY);
- const ::arrow::DictionaryType& dict_type =
- static_cast<const ::arrow::DictionaryType&>(*array.type());
- return ::arrow::is_base_binary_like(dict_type.value_type()->id());
-}
-
-Status ConvertDictionaryToDense(const ::arrow::Array& array, MemoryPool* pool,
- std::shared_ptr<::arrow::Array>* out) {
- const ::arrow::DictionaryType& dict_type =
- static_cast<const ::arrow::DictionaryType&>(*array.type());
-
- ::arrow::compute::ExecContext ctx(pool);
- ARROW_ASSIGN_OR_RAISE(Datum cast_output,
- ::arrow::compute::Cast(array.data(), dict_type.value_type(),
- ::arrow::compute::CastOptions(), &ctx));
- *out = cast_output.make_array();
- return Status::OK();
-}
-
-static inline bool IsDictionaryEncoding(Encoding::type encoding) {
- return encoding == Encoding::PLAIN_DICTIONARY;
-}
-
-template <typename DType>
-class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<DType> {
- public:
- using T = typename DType::c_type;
-
- TypedColumnWriterImpl(ColumnChunkMetaDataBuilder* metadata,
- std::unique_ptr<PageWriter> pager, const bool use_dictionary,
- Encoding::type encoding, const WriterProperties* properties)
- : ColumnWriterImpl(metadata, std::move(pager), use_dictionary, encoding,
- properties) {
- current_encoder_ = MakeEncoder(DType::type_num, encoding, use_dictionary, descr_,
- properties->memory_pool());
-
- if (properties->statistics_enabled(descr_->path()) &&
- (SortOrder::UNKNOWN != descr_->sort_order())) {
- page_statistics_ = MakeStatistics<DType>(descr_, allocator_);
- chunk_statistics_ = MakeStatistics<DType>(descr_, allocator_);
- }
- }
-
- int64_t Close() override { return ColumnWriterImpl::Close(); }
-
- int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
- const int16_t* rep_levels, const T* values) override {
- // We check for DataPage limits only after the values have been inserted, so a
- // single large write could push the DataPage size well past the limit. The
- // purpose of this chunking is to bound that overshoot: even for very large
- // writes, it ensures that AddDataPage() is called at a reasonable page size
- // limit.
- int64_t value_offset = 0;
-
- auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
- int64_t values_to_write = WriteLevels(batch_size, AddIfNotNull(def_levels, offset),
- AddIfNotNull(rep_levels, offset));
-
- // PARQUET-780
- if (values_to_write > 0) {
- DCHECK_NE(nullptr, values);
- }
- WriteValues(AddIfNotNull(values, value_offset), values_to_write,
- batch_size - values_to_write);
- CommitWriteAndCheckPageLimit(batch_size, values_to_write);
- value_offset += values_to_write;
-
- // Dictionary size checked separately from data page size since we
- // circumvent this check when writing ::arrow::DictionaryArray directly
- CheckDictionarySizeLimit();
- };
- DoInBatches(num_values, properties_->write_batch_size(), WriteChunk);
- return value_offset;
- }
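// Hypothetical caller-side sketch (annotation, not upstream code): writing an
// optional INT32 column through the public TypedColumnWriter API. `rg_writer`
// is an assumed parquet::RowGroupWriter*; def_level 1 marks a present value,
// 0 marks a null, and `values` holds only the two non-null entries.
void ExampleWriteOptionalInt32(parquet::RowGroupWriter* rg_writer) {
  std::vector<int16_t> def_levels = {1, 0, 1};
  std::vector<int32_t> values = {7, 9};
  auto* writer = static_cast<parquet::Int32Writer*>(rg_writer->NextColumn());
  writer->WriteBatch(static_cast<int64_t>(def_levels.size()), def_levels.data(),
                     /*rep_levels=*/nullptr, values.data());
}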
-
- void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
- const int16_t* rep_levels, const uint8_t* valid_bits,
- int64_t valid_bits_offset, const T* values) override {
- // Like WriteBatch, but for spaced values
- int64_t value_offset = 0;
- auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
- int64_t batch_num_values = 0;
- int64_t batch_num_spaced_values = 0;
- int64_t null_count;
- MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
- &batch_num_values, &batch_num_spaced_values,
- &null_count);
-
- WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
- AddIfNotNull(rep_levels, offset));
- if (bits_buffer_ != nullptr) {
- WriteValuesSpaced(AddIfNotNull(values, value_offset), batch_num_values,
- batch_num_spaced_values, bits_buffer_->data(), /*offset=*/0);
- } else {
- WriteValuesSpaced(AddIfNotNull(values, value_offset), batch_num_values,
- batch_num_spaced_values, valid_bits,
- valid_bits_offset + value_offset);
- }
- CommitWriteAndCheckPageLimit(batch_size, batch_num_spaced_values);
- value_offset += batch_num_spaced_values;
-
- // Dictionary size checked separately from data page size since we
- // circumvent this check when writing ::arrow::DictionaryArray directly
- CheckDictionarySizeLimit();
- };
- DoInBatches(num_values, properties_->write_batch_size(), WriteChunk);
- }
-
- Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_levels, const ::arrow::Array& leaf_array,
- ArrowWriteContext* ctx, bool leaf_field_nullable) override {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- // Leaf nulls are canonical when there is only a single null element after a list
- // and it is at the leaf.
- bool single_nullable_element =
- (level_info_.def_level == level_info_.repeated_ancestor_def_level + 1) &&
- leaf_field_nullable;
- bool maybe_parent_nulls = level_info_.HasNullableValues() && !single_nullable_element;
- if (maybe_parent_nulls) {
- ARROW_ASSIGN_OR_RAISE(
- bits_buffer_,
- ::arrow::AllocateResizableBuffer(
- BitUtil::BytesForBits(properties_->write_batch_size()), ctx->memory_pool));
- bits_buffer_->ZeroPadding();
- }
-
- if (leaf_array.type()->id() == ::arrow::Type::DICTIONARY) {
- return WriteArrowDictionary(def_levels, rep_levels, num_levels, leaf_array, ctx,
- maybe_parent_nulls);
- } else {
- return WriteArrowDense(def_levels, rep_levels, num_levels, leaf_array, ctx,
- maybe_parent_nulls);
- }
- END_PARQUET_CATCH_EXCEPTIONS
- }
-
- int64_t EstimatedBufferedValueBytes() const override {
- return current_encoder_->EstimatedDataEncodedSize();
- }
-
- protected:
- std::shared_ptr<Buffer> GetValuesBuffer() override {
- return current_encoder_->FlushValues();
- }
-
- // Internal function to handle direct writing of ::arrow::DictionaryArray,
- // since the standard logic concerning dictionary size limits and fallback to
- // plain encoding is circumvented
- Status WriteArrowDictionary(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_levels, const ::arrow::Array& array,
- ArrowWriteContext* context, bool maybe_parent_nulls);
-
- Status WriteArrowDense(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_levels, const ::arrow::Array& array,
- ArrowWriteContext* context, bool maybe_parent_nulls);
-
- void WriteDictionaryPage() override {
- // We have to dynamic cast here because of TypedEncoder<Type> as
- // some compilers don't want to cast through virtual inheritance
- auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
- DCHECK(dict_encoder);
- std::shared_ptr<ResizableBuffer> buffer =
- AllocateBuffer(properties_->memory_pool(), dict_encoder->dict_encoded_size());
- dict_encoder->WriteDict(buffer->mutable_data());
-
- DictionaryPage page(buffer, dict_encoder->num_entries(),
- properties_->dictionary_page_encoding());
- total_bytes_written_ += pager_->WriteDictionaryPage(page);
- }
-
- EncodedStatistics GetPageStatistics() override {
- EncodedStatistics result;
- if (page_statistics_) result = page_statistics_->Encode();
- return result;
- }
-
- EncodedStatistics GetChunkStatistics() override {
- EncodedStatistics result;
- if (chunk_statistics_) result = chunk_statistics_->Encode();
- return result;
- }
-
- void ResetPageStatistics() override {
- if (chunk_statistics_ != nullptr) {
- chunk_statistics_->Merge(*page_statistics_);
- page_statistics_->Reset();
- }
- }
-
- Type::type type() const override { return descr_->physical_type(); }
-
- const ColumnDescriptor* descr() const override { return descr_; }
-
- int64_t rows_written() const override { return rows_written_; }
-
- int64_t total_compressed_bytes() const override { return total_compressed_bytes_; }
-
- int64_t total_bytes_written() const override { return total_bytes_written_; }
-
- const WriterProperties* properties() override { return properties_; }
-
- private:
- using ValueEncoderType = typename EncodingTraits<DType>::Encoder;
- using TypedStats = TypedStatistics<DType>;
- std::unique_ptr<Encoder> current_encoder_;
- std::shared_ptr<TypedStats> page_statistics_;
- std::shared_ptr<TypedStats> chunk_statistics_;
-
-  // If writing a sequence of ::arrow::DictionaryArray to the writer, we keep the
-  // dictionary passed to DictEncoder<T>::PutDictionary so we can check
-  // subsequent array chunks to see whether materialization is required (in
-  // which case we fall back to the dense write path)
- std::shared_ptr<::arrow::Array> preserved_dictionary_;
-
- int64_t WriteLevels(int64_t num_values, const int16_t* def_levels,
- const int16_t* rep_levels) {
- int64_t values_to_write = 0;
- // If the field is required and non-repeated, there are no definition levels
- if (descr_->max_definition_level() > 0) {
- for (int64_t i = 0; i < num_values; ++i) {
- if (def_levels[i] == descr_->max_definition_level()) {
- ++values_to_write;
- }
- }
-
- WriteDefinitionLevels(num_values, def_levels);
- } else {
- // Required field, write all values
- values_to_write = num_values;
- }
-
- // Not present for non-repeated fields
- if (descr_->max_repetition_level() > 0) {
- // A row could include more than one value
- // Count the occasions where we start a new row
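-      // (e.g. rep_levels [0, 1, 1, 0, 1] contain two zeros and hence span two rows)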
- for (int64_t i = 0; i < num_values; ++i) {
- if (rep_levels[i] == 0) {
- rows_written_++;
- }
- }
-
- WriteRepetitionLevels(num_values, rep_levels);
- } else {
- // Each value is exactly one row
- rows_written_ += static_cast<int>(num_values);
- }
- return values_to_write;
- }
-
-  // This method will always update the three output parameters,
-  // out_values_to_write, out_spaced_values_to_write and null_count. Additionally
-  // it will update the validity bitmap if required (i.e. if at least one level
-  // of nullable structs directly precedes the leaf node).
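-  //
-  // Example (an optional leaf under a list, i.e. def_level == 3 and
-  // repeated_ancestor_def_level == 2): def levels [3, 2, 3] yield
-  // out_values_to_write == 2, out_spaced_values_to_write == 3 and
-  // null_count == 1; levels below 2 (empty or null lists) count for neither.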
- void MaybeCalculateValidityBits(const int16_t* def_levels, int64_t batch_size,
- int64_t* out_values_to_write,
- int64_t* out_spaced_values_to_write,
- int64_t* null_count) {
- if (bits_buffer_ == nullptr) {
- if (level_info_.def_level == 0) {
- // In this case def levels should be null and we only
- // need to output counts which will always be equal to
- // the batch size passed in (max def_level == 0 indicates
- // there cannot be repeated or null fields).
- DCHECK_EQ(def_levels, nullptr);
- *out_values_to_write = batch_size;
- *out_spaced_values_to_write = batch_size;
- *null_count = 0;
- } else {
- for (int x = 0; x < batch_size; x++) {
- *out_values_to_write += def_levels[x] == level_info_.def_level ? 1 : 0;
- *out_spaced_values_to_write +=
- def_levels[x] >= level_info_.repeated_ancestor_def_level ? 1 : 0;
- }
-      *null_count = *out_spaced_values_to_write - *out_values_to_write;
- }
- return;
- }
-    // Shrink-to-fit possibly causes another allocation, and would only be
-    // necessary on the last batch.
- int64_t new_bitmap_size = BitUtil::BytesForBits(batch_size);
- if (new_bitmap_size != bits_buffer_->size()) {
- PARQUET_THROW_NOT_OK(
- bits_buffer_->Resize(new_bitmap_size, /*shrink_to_fit=*/false));
- bits_buffer_->ZeroPadding();
- }
- internal::ValidityBitmapInputOutput io;
- io.valid_bits = bits_buffer_->mutable_data();
- io.values_read_upper_bound = batch_size;
- internal::DefLevelsToBitmap(def_levels, batch_size, level_info_, &io);
- *out_values_to_write = io.values_read - io.null_count;
- *out_spaced_values_to_write = io.values_read;
- *null_count = io.null_count;
- }
-
- Result<std::shared_ptr<Array>> MaybeReplaceValidity(std::shared_ptr<Array> array,
- int64_t new_null_count,
- ::arrow::MemoryPool* memory_pool) {
- if (bits_buffer_ == nullptr) {
- return array;
- }
- std::vector<std::shared_ptr<Buffer>> buffers = array->data()->buffers;
- if (buffers.empty()) {
- return array;
- }
- buffers[0] = bits_buffer_;
- // Should be a leaf array.
- DCHECK_GT(buffers.size(), 1);
- ValueBufferSlicer slicer{memory_pool, /*buffer=*/nullptr};
- if (array->data()->offset > 0) {
- RETURN_NOT_OK(::arrow::VisitArrayInline(*array, &slicer));
- buffers[1] = slicer.buffer_;
- }
- return ::arrow::MakeArray(std::make_shared<ArrayData>(
- array->type(), array->length(), std::move(buffers), new_null_count));
- }
-
- void WriteLevelsSpaced(int64_t num_levels, const int16_t* def_levels,
- const int16_t* rep_levels) {
- // If the field is required and non-repeated, there are no definition levels
- if (descr_->max_definition_level() > 0) {
- WriteDefinitionLevels(num_levels, def_levels);
- }
- // Not present for non-repeated fields
- if (descr_->max_repetition_level() > 0) {
- // A row could include more than one value
- // Count the occasions where we start a new row
- for (int64_t i = 0; i < num_levels; ++i) {
- if (rep_levels[i] == 0) {
- rows_written_++;
- }
- }
- WriteRepetitionLevels(num_levels, rep_levels);
- } else {
- // Each value is exactly one row
- rows_written_ += static_cast<int>(num_levels);
- }
- }
-
- void CommitWriteAndCheckPageLimit(int64_t num_levels, int64_t num_values) {
- num_buffered_values_ += num_levels;
- num_buffered_encoded_values_ += num_values;
-
- if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) {
- AddDataPage();
- }
- }
-
- void FallbackToPlainEncoding() {
- if (IsDictionaryEncoding(current_encoder_->encoding())) {
- WriteDictionaryPage();
- // Serialize the buffered Dictionary Indices
- FlushBufferedDataPages();
- fallback_ = true;
- // Only PLAIN encoding is supported for fallback in V1
- current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_,
- properties_->memory_pool());
- encoding_ = Encoding::PLAIN;
- }
- }
-
-  // Checks if the dictionary page size limit is reached.
-  // If it is, the dictionary page and the buffered data pages are serialized
-  // and the encoding switches to PLAIN.
-  //
-  // Only one dictionary page is ever written, so once we fall back to PLAIN
-  // there is no returning to dictionary encoding for this column chunk.
- void CheckDictionarySizeLimit() {
- if (!has_dictionary_ || fallback_) {
- // Either not using dictionary encoding, or we have already fallen back
- // to PLAIN encoding because the size threshold was reached
- return;
- }
-
-    // We have to dynamic_cast here because some compilers don't want to cast
-    // TypedEncoder<Type> through virtual inheritance
- auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
- if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) {
- FallbackToPlainEncoding();
- }
- }
-
- void WriteValues(const T* values, int64_t num_values, int64_t num_nulls) {
- dynamic_cast<ValueEncoderType*>(current_encoder_.get())
- ->Put(values, static_cast<int>(num_values));
- if (page_statistics_ != nullptr) {
- page_statistics_->Update(values, num_values, num_nulls);
- }
- }
-
- void WriteValuesSpaced(const T* values, int64_t num_values, int64_t num_spaced_values,
- const uint8_t* valid_bits, int64_t valid_bits_offset) {
- if (num_values != num_spaced_values) {
- dynamic_cast<ValueEncoderType*>(current_encoder_.get())
- ->PutSpaced(values, static_cast<int>(num_spaced_values), valid_bits,
- valid_bits_offset);
- } else {
- dynamic_cast<ValueEncoderType*>(current_encoder_.get())
- ->Put(values, static_cast<int>(num_values));
- }
- if (page_statistics_ != nullptr) {
- const int64_t num_nulls = num_spaced_values - num_values;
- page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, num_values,
- num_nulls);
- }
- }
-};
-
-template <typename DType>
-Status TypedColumnWriterImpl<DType>::WriteArrowDictionary(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- // If this is the first time writing a DictionaryArray, then there's
- // a few possible paths to take:
- //
- // - If dictionary encoding is not enabled, convert to densely
- // encoded and call WriteArrow
- // - Dictionary encoding enabled
- // - If this is the first time this is called, then we call
- // PutDictionary into the encoder and then PutIndices on each
- // chunk. We store the dictionary that was written in
- // preserved_dictionary_ so that subsequent calls to this method
- // can make sure the dictionary has not changed
- // - On subsequent calls, we have to check whether the dictionary
- // has changed. If it has, then we trigger the varying
- // dictionary path and materialize each chunk and then call
- // WriteArrow with that
- auto WriteDense = [&] {
- std::shared_ptr<::arrow::Array> dense_array;
- RETURN_NOT_OK(
- ConvertDictionaryToDense(array, properties_->memory_pool(), &dense_array));
- return WriteArrowDense(def_levels, rep_levels, num_levels, *dense_array, ctx,
- maybe_parent_nulls);
- };
-
- if (!IsDictionaryEncoding(current_encoder_->encoding()) ||
- !DictionaryDirectWriteSupported(array)) {
-    // No longer dictionary-encoding for whatever reason, maybe we never were
-    // or we decided to stop. Note that WriteArrow can be invoked multiple
-    // times with both dense and dictionary-encoded versions of the same data
-    // without a problem. Any dense data will be hashed to indices until the
-    // dictionary page limit is reached, at which point everything (dictionary
-    // and dense) will fall back to plain encoding
- return WriteDense();
- }
-
- auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
- const auto& data = checked_cast<const ::arrow::DictionaryArray&>(array);
- std::shared_ptr<::arrow::Array> dictionary = data.dictionary();
- std::shared_ptr<::arrow::Array> indices = data.indices();
-
- int64_t value_offset = 0;
- auto WriteIndicesChunk = [&](int64_t offset, int64_t batch_size) {
- int64_t batch_num_values = 0;
- int64_t batch_num_spaced_values = 0;
- int64_t null_count = ::arrow::kUnknownNullCount;
-    // The validity bitmap is non-null for nullable values. At this point we
-    // can't determine whether the leaf array has the same nulls as any parents
-    // it might have had, so we recompute it from the def levels.
- MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
- &batch_num_values, &batch_num_spaced_values, &null_count);
- WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
- AddIfNotNull(rep_levels, offset));
- std::shared_ptr<Array> writeable_indices =
- indices->Slice(value_offset, batch_num_spaced_values);
- PARQUET_ASSIGN_OR_THROW(
- writeable_indices,
- MaybeReplaceValidity(writeable_indices, null_count, ctx->memory_pool));
- dict_encoder->PutIndices(*writeable_indices);
- CommitWriteAndCheckPageLimit(batch_size, batch_num_values);
- value_offset += batch_num_spaced_values;
- };
-
- // Handle seeing dictionary for the first time
- if (!preserved_dictionary_) {
- // It's a new dictionary. Call PutDictionary and keep track of it
- PARQUET_CATCH_NOT_OK(dict_encoder->PutDictionary(*dictionary));
-
-    // If there were duplicate values in the dictionary, the encoder's memo table
-    // will be out of sync with the indices in the Arrow array.
-    // The easiest solution for this uncommon case is to fall back to plain encoding.
- if (dict_encoder->num_entries() != dictionary->length()) {
- PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding());
- return WriteDense();
- }
-
- // TODO(wesm): If some dictionary values are unobserved, then the
- // statistics will be inaccurate. Do we care enough to fix it?
- if (page_statistics_ != nullptr) {
- PARQUET_CATCH_NOT_OK(page_statistics_->Update(*dictionary));
- }
- preserved_dictionary_ = dictionary;
- } else if (!dictionary->Equals(*preserved_dictionary_)) {
- // Dictionary has changed
- PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding());
- return WriteDense();
- }
-
- PARQUET_CATCH_NOT_OK(
- DoInBatches(num_levels, properties_->write_batch_size(), WriteIndicesChunk));
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// Direct Arrow write path
-
-template <typename ParquetType, typename ArrowType, typename Enable = void>
-struct SerializeFunctor {
- using ArrowCType = typename ArrowType::c_type;
- using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
- using ParquetCType = typename ParquetType::c_type;
- Status Serialize(const ArrayType& array, ArrowWriteContext*, ParquetCType* out) {
- const ArrowCType* input = array.raw_values();
- if (array.null_count() > 0) {
- for (int i = 0; i < array.length(); i++) {
- out[i] = static_cast<ParquetCType>(input[i]);
- }
- } else {
- std::copy(input, input + array.length(), out);
- }
- return Status::OK();
- }
-};
-
-template <typename ParquetType, typename ArrowType>
-Status WriteArrowSerialize(const ::arrow::Array& array, int64_t num_levels,
- const int16_t* def_levels, const int16_t* rep_levels,
- ArrowWriteContext* ctx, TypedColumnWriter<ParquetType>* writer,
- bool maybe_parent_nulls) {
- using ParquetCType = typename ParquetType::c_type;
- using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
-
- ParquetCType* buffer = nullptr;
- PARQUET_THROW_NOT_OK(ctx->GetScratchData<ParquetCType>(array.length(), &buffer));
-
- SerializeFunctor<ParquetType, ArrowType> functor;
- RETURN_NOT_OK(functor.Serialize(checked_cast<const ArrayType&>(array), ctx, buffer));
- bool no_nulls =
- writer->descr()->schema_node()->is_required() || (array.null_count() == 0);
- if (!maybe_parent_nulls && no_nulls) {
- PARQUET_CATCH_NOT_OK(writer->WriteBatch(num_levels, def_levels, rep_levels, buffer));
- } else {
- PARQUET_CATCH_NOT_OK(writer->WriteBatchSpaced(num_levels, def_levels, rep_levels,
- array.null_bitmap_data(),
- array.offset(), buffer));
- }
- return Status::OK();
-}
-
-template <typename ParquetType>
-Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels,
- const int16_t* def_levels, const int16_t* rep_levels,
- ArrowWriteContext* ctx, TypedColumnWriter<ParquetType>* writer,
- bool maybe_parent_nulls) {
- using T = typename ParquetType::c_type;
- const auto& data = static_cast<const ::arrow::PrimitiveArray&>(array);
- const T* values = nullptr;
- // The values buffer may be null if the array is empty (ARROW-2744)
- if (data.values() != nullptr) {
- values = reinterpret_cast<const T*>(data.values()->data()) + data.offset();
- } else {
- DCHECK_EQ(data.length(), 0);
- }
- bool no_nulls =
- writer->descr()->schema_node()->is_required() || (array.null_count() == 0);
-
- if (!maybe_parent_nulls && no_nulls) {
- PARQUET_CATCH_NOT_OK(writer->WriteBatch(num_levels, def_levels, rep_levels, values));
- } else {
- PARQUET_CATCH_NOT_OK(writer->WriteBatchSpaced(num_levels, def_levels, rep_levels,
- data.null_bitmap_data(), data.offset(),
- values));
- }
- return Status::OK();
-}
-
-#define WRITE_SERIALIZE_CASE(ArrowEnum, ArrowType, ParquetType) \
- case ::arrow::Type::ArrowEnum: \
- return WriteArrowSerialize<ParquetType, ::arrow::ArrowType>( \
- array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
-
-#define WRITE_ZERO_COPY_CASE(ArrowEnum, ArrowType, ParquetType) \
- case ::arrow::Type::ArrowEnum: \
- return WriteArrowZeroCopy<ParquetType>(array, num_levels, def_levels, rep_levels, \
- ctx, this, maybe_parent_nulls);
-
-#define ARROW_UNSUPPORTED() \
- std::stringstream ss; \
- ss << "Arrow type " << array.type()->ToString() \
- << " cannot be written to Parquet type " << descr_->ToString(); \
- return Status::Invalid(ss.str());
-
-// ----------------------------------------------------------------------
-// Write Arrow to BooleanType
-
-template <>
-struct SerializeFunctor<BooleanType, ::arrow::BooleanType> {
- Status Serialize(const ::arrow::BooleanArray& data, ArrowWriteContext*, bool* out) {
- for (int i = 0; i < data.length(); i++) {
- *out++ = data.Value(i);
- }
- return Status::OK();
- }
-};
-
-template <>
-Status TypedColumnWriterImpl<BooleanType>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- if (array.type_id() != ::arrow::Type::BOOL) {
- ARROW_UNSUPPORTED();
- }
- return WriteArrowSerialize<BooleanType, ::arrow::BooleanType>(
- array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
-}
-
-// ----------------------------------------------------------------------
-// Write Arrow types to INT32
-
-template <>
-struct SerializeFunctor<Int32Type, ::arrow::Date64Type> {
- Status Serialize(const ::arrow::Date64Array& array, ArrowWriteContext*, int32_t* out) {
- const int64_t* input = array.raw_values();
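-    // Date64 holds milliseconds since the UNIX epoch (86400000 ms per day),
-    // e.g. 1609459200000 ms (2021-01-01T00:00:00Z) / 86400000 == 18628 days.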
- for (int i = 0; i < array.length(); i++) {
- *out++ = static_cast<int32_t>(*input++ / 86400000);
- }
- return Status::OK();
- }
-};
-
-template <>
-struct SerializeFunctor<Int32Type, ::arrow::Time32Type> {
- Status Serialize(const ::arrow::Time32Array& array, ArrowWriteContext*, int32_t* out) {
- const int32_t* input = array.raw_values();
- const auto& type = static_cast<const ::arrow::Time32Type&>(*array.type());
- if (type.unit() == ::arrow::TimeUnit::SECOND) {
- for (int i = 0; i < array.length(); i++) {
- out[i] = input[i] * 1000;
- }
- } else {
- std::copy(input, input + array.length(), out);
- }
- return Status::OK();
- }
-};
-
-template <>
-Status TypedColumnWriterImpl<Int32Type>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- switch (array.type()->id()) {
- case ::arrow::Type::NA: {
- PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, nullptr));
- } break;
- WRITE_SERIALIZE_CASE(INT8, Int8Type, Int32Type)
- WRITE_SERIALIZE_CASE(UINT8, UInt8Type, Int32Type)
- WRITE_SERIALIZE_CASE(INT16, Int16Type, Int32Type)
- WRITE_SERIALIZE_CASE(UINT16, UInt16Type, Int32Type)
- WRITE_SERIALIZE_CASE(UINT32, UInt32Type, Int32Type)
- WRITE_ZERO_COPY_CASE(INT32, Int32Type, Int32Type)
- WRITE_ZERO_COPY_CASE(DATE32, Date32Type, Int32Type)
- WRITE_SERIALIZE_CASE(DATE64, Date64Type, Int32Type)
- WRITE_SERIALIZE_CASE(TIME32, Time32Type, Int32Type)
- default:
- ARROW_UNSUPPORTED()
- }
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// Write Arrow to Int64 and Int96
-
-#define INT96_CONVERT_LOOP(ConversionFunction) \
- for (int64_t i = 0; i < array.length(); i++) ConversionFunction(input[i], &out[i]);
-
-template <>
-struct SerializeFunctor<Int96Type, ::arrow::TimestampType> {
- Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext*, Int96* out) {
- const int64_t* input = array.raw_values();
- const auto& type = static_cast<const ::arrow::TimestampType&>(*array.type());
- switch (type.unit()) {
- case ::arrow::TimeUnit::NANO:
- INT96_CONVERT_LOOP(internal::NanosecondsToImpalaTimestamp);
- break;
- case ::arrow::TimeUnit::MICRO:
- INT96_CONVERT_LOOP(internal::MicrosecondsToImpalaTimestamp);
- break;
- case ::arrow::TimeUnit::MILLI:
- INT96_CONVERT_LOOP(internal::MillisecondsToImpalaTimestamp);
- break;
- case ::arrow::TimeUnit::SECOND:
- INT96_CONVERT_LOOP(internal::SecondsToImpalaTimestamp);
- break;
- }
- return Status::OK();
- }
-};
-
-#define COERCE_DIVIDE -1
-#define COERCE_INVALID 0
-#define COERCE_MULTIPLY +1
-
-static std::pair<int, int64_t> kTimestampCoercionFactors[4][4] = {
- // from seconds ...
- {{COERCE_INVALID, 0}, // ... to seconds
- {COERCE_MULTIPLY, 1000}, // ... to millis
- {COERCE_MULTIPLY, 1000000}, // ... to micros
- {COERCE_MULTIPLY, INT64_C(1000000000)}}, // ... to nanos
- // from millis ...
- {{COERCE_INVALID, 0},
- {COERCE_MULTIPLY, 1},
- {COERCE_MULTIPLY, 1000},
- {COERCE_MULTIPLY, 1000000}},
- // from micros ...
- {{COERCE_INVALID, 0},
- {COERCE_DIVIDE, 1000},
- {COERCE_MULTIPLY, 1},
- {COERCE_MULTIPLY, 1000}},
- // from nanos ...
- {{COERCE_INVALID, 0},
- {COERCE_DIVIDE, 1000000},
- {COERCE_DIVIDE, 1000},
- {COERCE_MULTIPLY, 1}}};
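-// Example lookup: coercing micros to millis uses kTimestampCoercionFactors[2][1]
-// == {COERCE_DIVIDE, 1000}; a value such as 1234567 us does not divide evenly and
-// is rejected unless truncated timestamps are allowed (see DivideBy below).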
-
-template <>
-struct SerializeFunctor<Int64Type, ::arrow::TimestampType> {
- Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext* ctx,
- int64_t* out) {
- const auto& source_type = static_cast<const ::arrow::TimestampType&>(*array.type());
- auto source_unit = source_type.unit();
- const int64_t* values = array.raw_values();
-
- ::arrow::TimeUnit::type target_unit = ctx->properties->coerce_timestamps_unit();
- auto target_type = ::arrow::timestamp(target_unit);
- bool truncation_allowed = ctx->properties->truncated_timestamps_allowed();
-
- auto DivideBy = [&](const int64_t factor) {
- for (int64_t i = 0; i < array.length(); i++) {
- if (!truncation_allowed && array.IsValid(i) && (values[i] % factor != 0)) {
- return Status::Invalid("Casting from ", source_type.ToString(), " to ",
- target_type->ToString(),
- " would lose data: ", values[i]);
- }
- out[i] = values[i] / factor;
- }
- return Status::OK();
- };
-
- auto MultiplyBy = [&](const int64_t factor) {
- for (int64_t i = 0; i < array.length(); i++) {
- out[i] = values[i] * factor;
- }
- return Status::OK();
- };
-
- const auto& coercion = kTimestampCoercionFactors[static_cast<int>(source_unit)]
- [static_cast<int>(target_unit)];
-
- // .first -> coercion operation; .second -> scale factor
- DCHECK_NE(coercion.first, COERCE_INVALID);
- return coercion.first == COERCE_DIVIDE ? DivideBy(coercion.second)
- : MultiplyBy(coercion.second);
- }
-};
-
-#undef COERCE_DIVIDE
-#undef COERCE_INVALID
-#undef COERCE_MULTIPLY
-
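-// Coercion policy, in order of precedence: (1) honor an explicit
-// coerce_timestamps(unit) request; (2) Parquet 1.0 cannot represent NANO, so
-// nanos are coerced to micros with truncation disallowed; (3) Parquet has no
-// SECOND unit, so seconds are always promoted to millis; (4) otherwise the
-// values are written zero-copy.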
-Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels,
- const int16_t* def_levels, const int16_t* rep_levels,
- ArrowWriteContext* ctx, TypedColumnWriter<Int64Type>* writer,
- bool maybe_parent_nulls) {
- const auto& source_type = static_cast<const ::arrow::TimestampType&>(*values.type());
-
- auto WriteCoerce = [&](const ArrowWriterProperties* properties) {
- ArrowWriteContext temp_ctx = *ctx;
- temp_ctx.properties = properties;
- return WriteArrowSerialize<Int64Type, ::arrow::TimestampType>(
- values, num_levels, def_levels, rep_levels, &temp_ctx, writer,
- maybe_parent_nulls);
- };
-
- if (ctx->properties->coerce_timestamps_enabled()) {
- // User explicitly requested coercion to specific unit
- if (source_type.unit() == ctx->properties->coerce_timestamps_unit()) {
- // No data conversion necessary
- return WriteArrowZeroCopy<Int64Type>(values, num_levels, def_levels, rep_levels,
- ctx, writer, maybe_parent_nulls);
- } else {
- return WriteCoerce(ctx->properties);
- }
- } else if (writer->properties()->version() == ParquetVersion::PARQUET_1_0 &&
- source_type.unit() == ::arrow::TimeUnit::NANO) {
- // Absent superseding user instructions, when writing Parquet version 1.0 files,
- // timestamps in nanoseconds are coerced to microseconds
- std::shared_ptr<ArrowWriterProperties> properties =
- (ArrowWriterProperties::Builder())
- .coerce_timestamps(::arrow::TimeUnit::MICRO)
- ->disallow_truncated_timestamps()
- ->build();
- return WriteCoerce(properties.get());
- } else if (source_type.unit() == ::arrow::TimeUnit::SECOND) {
- // Absent superseding user instructions, timestamps in seconds are coerced to
- // milliseconds
- std::shared_ptr<ArrowWriterProperties> properties =
- (ArrowWriterProperties::Builder())
- .coerce_timestamps(::arrow::TimeUnit::MILLI)
- ->build();
- return WriteCoerce(properties.get());
- } else {
- // No data conversion necessary
- return WriteArrowZeroCopy<Int64Type>(values, num_levels, def_levels, rep_levels, ctx,
- writer, maybe_parent_nulls);
- }
-}
-
-template <>
-Status TypedColumnWriterImpl<Int64Type>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- switch (array.type()->id()) {
- case ::arrow::Type::TIMESTAMP:
- return WriteTimestamps(array, num_levels, def_levels, rep_levels, ctx, this,
- maybe_parent_nulls);
- WRITE_ZERO_COPY_CASE(INT64, Int64Type, Int64Type)
- WRITE_SERIALIZE_CASE(UINT32, UInt32Type, Int64Type)
- WRITE_SERIALIZE_CASE(UINT64, UInt64Type, Int64Type)
- WRITE_ZERO_COPY_CASE(TIME64, Time64Type, Int64Type)
- default:
- ARROW_UNSUPPORTED();
- }
-}
-
-template <>
-Status TypedColumnWriterImpl<Int96Type>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- if (array.type_id() != ::arrow::Type::TIMESTAMP) {
- ARROW_UNSUPPORTED();
- }
- return WriteArrowSerialize<Int96Type, ::arrow::TimestampType>(
- array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
-}
-
-// ----------------------------------------------------------------------
-// Floating point types
-
-template <>
-Status TypedColumnWriterImpl<FloatType>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- if (array.type_id() != ::arrow::Type::FLOAT) {
- ARROW_UNSUPPORTED();
- }
- return WriteArrowZeroCopy<FloatType>(array, num_levels, def_levels, rep_levels, ctx,
- this, maybe_parent_nulls);
-}
-
-template <>
-Status TypedColumnWriterImpl<DoubleType>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- if (array.type_id() != ::arrow::Type::DOUBLE) {
- ARROW_UNSUPPORTED();
- }
- return WriteArrowZeroCopy<DoubleType>(array, num_levels, def_levels, rep_levels, ctx,
- this, maybe_parent_nulls);
-}
-
-// ----------------------------------------------------------------------
-// Write Arrow to BYTE_ARRAY
-
-template <>
-Status TypedColumnWriterImpl<ByteArrayType>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- if (!::arrow::is_base_binary_like(array.type()->id())) {
- ARROW_UNSUPPORTED();
- }
-
- int64_t value_offset = 0;
- auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
- int64_t batch_num_values = 0;
- int64_t batch_num_spaced_values = 0;
- int64_t null_count = 0;
-
- MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
- &batch_num_values, &batch_num_spaced_values, &null_count);
- WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
- AddIfNotNull(rep_levels, offset));
- std::shared_ptr<Array> data_slice =
- array.Slice(value_offset, batch_num_spaced_values);
- PARQUET_ASSIGN_OR_THROW(
- data_slice, MaybeReplaceValidity(data_slice, null_count, ctx->memory_pool));
-
- current_encoder_->Put(*data_slice);
- if (page_statistics_ != nullptr) {
- page_statistics_->Update(*data_slice);
- }
- CommitWriteAndCheckPageLimit(batch_size, batch_num_values);
- CheckDictionarySizeLimit();
- value_offset += batch_num_spaced_values;
- };
-
- PARQUET_CATCH_NOT_OK(
- DoInBatches(num_levels, properties_->write_batch_size(), WriteChunk));
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// Write Arrow to FIXED_LEN_BYTE_ARRAY
-
-template <typename ParquetType, typename ArrowType>
-struct SerializeFunctor<
- ParquetType, ArrowType,
- ::arrow::enable_if_t<::arrow::is_fixed_size_binary_type<ArrowType>::value &&
- !::arrow::is_decimal_type<ArrowType>::value>> {
- Status Serialize(const ::arrow::FixedSizeBinaryArray& array, ArrowWriteContext*,
- FLBA* out) {
- if (array.null_count() == 0) {
- // no nulls, just dump the data
- // todo(advancedxy): use a writeBatch to avoid this step
- for (int64_t i = 0; i < array.length(); i++) {
- out[i] = FixedLenByteArray(array.GetValue(i));
- }
- } else {
- for (int64_t i = 0; i < array.length(); i++) {
- if (array.IsValid(i)) {
- out[i] = FixedLenByteArray(array.GetValue(i));
- }
- }
- }
- return Status::OK();
- }
-};
-
-// ----------------------------------------------------------------------
-// Write Arrow to Decimal128
-
-// Requires a custom serializer because decimals in Parquet are stored in
-// big-endian format. Thus, a temporary local buffer is required.
-template <typename ParquetType, typename ArrowType>
-struct SerializeFunctor<ParquetType, ArrowType, ::arrow::enable_if_decimal<ArrowType>> {
- Status Serialize(const typename ::arrow::TypeTraits<ArrowType>::ArrayType& array,
- ArrowWriteContext* ctx, FLBA* out) {
- AllocateScratch(array, ctx);
- auto offset = Offset(array);
-
- if (array.null_count() == 0) {
- for (int64_t i = 0; i < array.length(); i++) {
- out[i] = FixDecimalEndianess<ArrowType::kByteWidth>(array.GetValue(i), offset);
- }
- } else {
- for (int64_t i = 0; i < array.length(); i++) {
- out[i] = array.IsValid(i) ? FixDecimalEndianess<ArrowType::kByteWidth>(
- array.GetValue(i), offset)
- : FixedLenByteArray();
- }
- }
-
- return Status::OK();
- }
-
-  // Parquet's Decimals are stored as fixed-length values whose length is
-  // proportional to the precision. Arrow's Decimals are always stored in 16/32
-  // bytes. Thus the internal FLBA pointer must be adjusted by the offset
-  // calculated here.
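-  // E.g. for a Decimal128 column with precision 10, Arrow stores 16 bytes but
-  // Parquet needs only DecimalSize(10) == 5, so the offset skips the 11
-  // most-significant (big-endian) padding bytes.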
- int32_t Offset(const Array& array) {
- auto decimal_type = checked_pointer_cast<::arrow::DecimalType>(array.type());
- return decimal_type->byte_width() -
- ::arrow::DecimalType::DecimalSize(decimal_type->precision());
- }
-
- void AllocateScratch(const typename ::arrow::TypeTraits<ArrowType>::ArrayType& array,
- ArrowWriteContext* ctx) {
- int64_t non_null_count = array.length() - array.null_count();
- int64_t size = non_null_count * ArrowType::kByteWidth;
- scratch_buffer = AllocateBuffer(ctx->memory_pool, size);
- scratch = reinterpret_cast<int64_t*>(scratch_buffer->mutable_data());
- }
-
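-  // Byte-swaps one value into the scratch buffer and returns an FLBA pointing
-  // at it. E.g. for byte_width == 16 on a little-endian host, the two 64-bit
-  // words {lo, hi} are emitted as {hi, lo}, i.e. in big-endian byte order.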
- template <int byte_width>
- FixedLenByteArray FixDecimalEndianess(const uint8_t* in, int64_t offset) {
- const auto* u64_in = reinterpret_cast<const int64_t*>(in);
- auto out = reinterpret_cast<const uint8_t*>(scratch) + offset;
- static_assert(byte_width == 16 || byte_width == 32,
- "only 16 and 32 byte Decimals supported");
- if (byte_width == 32) {
- *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[3]);
- *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[2]);
- *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[1]);
- *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[0]);
- } else {
- *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[1]);
- *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[0]);
- }
- return FixedLenByteArray(out);
- }
-
- std::shared_ptr<ResizableBuffer> scratch_buffer;
- int64_t* scratch;
-};
-
-template <>
-Status TypedColumnWriterImpl<FLBAType>::WriteArrowDense(
- const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
- const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
- switch (array.type()->id()) {
- WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType)
- WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, FLBAType)
- WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, FLBAType)
- default:
- break;
- }
- return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// Dynamic column writer constructor
-
-std::shared_ptr<ColumnWriter> ColumnWriter::Make(ColumnChunkMetaDataBuilder* metadata,
- std::unique_ptr<PageWriter> pager,
- const WriterProperties* properties) {
- const ColumnDescriptor* descr = metadata->descr();
- const bool use_dictionary = properties->dictionary_enabled(descr->path()) &&
- descr->physical_type() != Type::BOOLEAN;
- Encoding::type encoding = properties->encoding(descr->path());
- if (use_dictionary) {
- encoding = properties->dictionary_index_encoding();
- }
- switch (descr->physical_type()) {
- case Type::BOOLEAN:
- return std::make_shared<TypedColumnWriterImpl<BooleanType>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::INT32:
- return std::make_shared<TypedColumnWriterImpl<Int32Type>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::INT64:
- return std::make_shared<TypedColumnWriterImpl<Int64Type>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::INT96:
- return std::make_shared<TypedColumnWriterImpl<Int96Type>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::FLOAT:
- return std::make_shared<TypedColumnWriterImpl<FloatType>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::DOUBLE:
- return std::make_shared<TypedColumnWriterImpl<DoubleType>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::BYTE_ARRAY:
- return std::make_shared<TypedColumnWriterImpl<ByteArrayType>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<TypedColumnWriterImpl<FLBAType>>(
- metadata, std::move(pager), use_dictionary, encoding, properties);
- default:
-      ParquetException::NYI("type writer not implemented");
- }
- // Unreachable code, but suppress compiler warning
- return std::shared_ptr<ColumnWriter>(nullptr);
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/column_writer.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/compute/api.h"
+#include "arrow/io/memory.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_stream_utils.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/rle_encoding.h"
+#include "arrow/visitor_inline.h"
+#include "parquet/column_page.h"
+#include "parquet/encoding.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_encryptor.h"
+#include "parquet/level_conversion.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/statistics.h"
+#include "parquet/thrift_internal.h"
+#include "parquet/types.h"
+
+using arrow::Array;
+using arrow::ArrayData;
+using arrow::Datum;
+using arrow::Result;
+using arrow::Status;
+using arrow::BitUtil::BitWriter;
+using arrow::internal::checked_cast;
+using arrow::internal::checked_pointer_cast;
+using arrow::util::RleEncoder;
+
+namespace BitUtil = arrow::BitUtil;
+
+namespace parquet {
+
+namespace {
+
+// Visitor that extracts the value buffer from a FlatArray at a given offset.
+struct ValueBufferSlicer {
+ template <typename T>
+ ::arrow::enable_if_base_binary<typename T::TypeClass, Status> Visit(const T& array) {
+ auto data = array.data();
+ buffer_ =
+ SliceBuffer(data->buffers[1], data->offset * sizeof(typename T::offset_type),
+ data->length * sizeof(typename T::offset_type));
+ return Status::OK();
+ }
+
+ template <typename T>
+ ::arrow::enable_if_fixed_size_binary<typename T::TypeClass, Status> Visit(
+ const T& array) {
+ auto data = array.data();
+ buffer_ = SliceBuffer(data->buffers[1], data->offset * array.byte_width(),
+ data->length * array.byte_width());
+ return Status::OK();
+ }
+
+ template <typename T>
+ ::arrow::enable_if_t<::arrow::has_c_type<typename T::TypeClass>::value &&
+ !std::is_same<BooleanType, typename T::TypeClass>::value,
+ Status>
+ Visit(const T& array) {
+ auto data = array.data();
+ buffer_ = SliceBuffer(
+ data->buffers[1],
+ ::arrow::TypeTraits<typename T::TypeClass>::bytes_required(data->offset),
+ ::arrow::TypeTraits<typename T::TypeClass>::bytes_required(data->length));
+ return Status::OK();
+ }
+
+ Status Visit(const ::arrow::BooleanArray& array) {
+ auto data = array.data();
+ if (BitUtil::IsMultipleOf8(data->offset)) {
+ buffer_ = SliceBuffer(data->buffers[1], BitUtil::BytesForBits(data->offset),
+ BitUtil::BytesForBits(data->length));
+ return Status::OK();
+ }
+ PARQUET_ASSIGN_OR_THROW(buffer_,
+ ::arrow::internal::CopyBitmap(pool_, data->buffers[1]->data(),
+ data->offset, data->length));
+ return Status::OK();
+ }
+#define NOT_IMPLEMENTED_VISIT(ArrowTypePrefix) \
+ Status Visit(const ::arrow::ArrowTypePrefix##Array& array) { \
+ return Status::NotImplemented("Slicing not implemented for " #ArrowTypePrefix); \
+ }
+
+ NOT_IMPLEMENTED_VISIT(Null);
+ NOT_IMPLEMENTED_VISIT(Union);
+ NOT_IMPLEMENTED_VISIT(List);
+ NOT_IMPLEMENTED_VISIT(LargeList);
+ NOT_IMPLEMENTED_VISIT(Struct);
+ NOT_IMPLEMENTED_VISIT(FixedSizeList);
+ NOT_IMPLEMENTED_VISIT(Dictionary);
+ NOT_IMPLEMENTED_VISIT(Extension);
+
+#undef NOT_IMPLEMENTED_VISIT
+
+ MemoryPool* pool_;
+ std::shared_ptr<Buffer> buffer_;
+};
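+// Usage sketch: ::arrow::VisitArrayInline(array, &slicer) dispatches to the
+// matching Visit overload above and leaves the zero-copy slice (or, for
+// non-byte-aligned booleans, a copied bitmap) in slicer.buffer_.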
+
+internal::LevelInfo ComputeLevelInfo(const ColumnDescriptor* descr) {
+ internal::LevelInfo level_info;
+ level_info.def_level = descr->max_definition_level();
+ level_info.rep_level = descr->max_repetition_level();
+
+ int16_t min_spaced_def_level = descr->max_definition_level();
+ const ::parquet::schema::Node* node = descr->schema_node().get();
+ while (node != nullptr && !node->is_repeated()) {
+ if (node->is_optional()) {
+ min_spaced_def_level--;
+ }
+ node = node->parent();
+ }
+ level_info.repeated_ancestor_def_level = min_spaced_def_level;
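+  // E.g. for a top-level optional leaf: def_level == 1, rep_level == 0, and the
+  // walk above decrements once for the optional node, so
+  // repeated_ancestor_def_level ends up 0 (every def level marks a spaced slot).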
+ return level_info;
+}
+
+template <class T>
+inline const T* AddIfNotNull(const T* base, int64_t offset) {
+ if (base != nullptr) {
+ return base + offset;
+ }
+ return nullptr;
+}
+
+} // namespace
+
+LevelEncoder::LevelEncoder() {}
+LevelEncoder::~LevelEncoder() {}
+
+void LevelEncoder::Init(Encoding::type encoding, int16_t max_level,
+ int num_buffered_values, uint8_t* data, int data_size) {
+ bit_width_ = BitUtil::Log2(max_level + 1);
+ encoding_ = encoding;
+ switch (encoding) {
+ case Encoding::RLE: {
+ rle_encoder_.reset(new RleEncoder(data, data_size, bit_width_));
+ break;
+ }
+ case Encoding::BIT_PACKED: {
+ int num_bytes =
+ static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width_));
+ bit_packed_encoder_.reset(new BitWriter(data, num_bytes));
+ break;
+ }
+ default:
+ throw ParquetException("Unknown encoding type for levels.");
+ }
+}
+
+int LevelEncoder::MaxBufferSize(Encoding::type encoding, int16_t max_level,
+ int num_buffered_values) {
+ int bit_width = BitUtil::Log2(max_level + 1);
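+  // e.g. max_level == 3 needs bit_width == 2 to encode levels 0..3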
+ int num_bytes = 0;
+ switch (encoding) {
+ case Encoding::RLE: {
+      // TODO: Due to the way we currently check if the buffer is full enough,
+      // we need to have MinBufferSize as headroom.
+ num_bytes = RleEncoder::MaxBufferSize(bit_width, num_buffered_values) +
+ RleEncoder::MinBufferSize(bit_width);
+ break;
+ }
+ case Encoding::BIT_PACKED: {
+ num_bytes =
+ static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width));
+ break;
+ }
+ default:
+ throw ParquetException("Unknown encoding type for levels.");
+ }
+ return num_bytes;
+}
+
+int LevelEncoder::Encode(int batch_size, const int16_t* levels) {
+ int num_encoded = 0;
+ if (!rle_encoder_ && !bit_packed_encoder_) {
+ throw ParquetException("Level encoders are not initialized.");
+ }
+
+ if (encoding_ == Encoding::RLE) {
+ for (int i = 0; i < batch_size; ++i) {
+ if (!rle_encoder_->Put(*(levels + i))) {
+ break;
+ }
+ ++num_encoded;
+ }
+ rle_encoder_->Flush();
+ rle_length_ = rle_encoder_->len();
+ } else {
+ for (int i = 0; i < batch_size; ++i) {
+ if (!bit_packed_encoder_->PutValue(*(levels + i), bit_width_)) {
+ break;
+ }
+ ++num_encoded;
+ }
+ bit_packed_encoder_->Flush();
+ }
+ return num_encoded;
+}
+
+// ----------------------------------------------------------------------
+// PageWriter implementation
+
+// This subclass delimits pages appearing in a serialized stream, each preceded
+// by a serialized Thrift format::PageHeader indicating the type of each page
+// and the page metadata.
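+// The resulting stream is laid out as [PageHeader][page bytes][PageHeader]
+// [page bytes]..., with the dictionary page (if any) written first.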
+class SerializedPageWriter : public PageWriter {
+ public:
+ SerializedPageWriter(std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
+ int compression_level, ColumnChunkMetaDataBuilder* metadata,
+ int16_t row_group_ordinal, int16_t column_chunk_ordinal,
+ MemoryPool* pool = ::arrow::default_memory_pool(),
+ std::shared_ptr<Encryptor> meta_encryptor = nullptr,
+ std::shared_ptr<Encryptor> data_encryptor = nullptr)
+ : sink_(std::move(sink)),
+ metadata_(metadata),
+ pool_(pool),
+ num_values_(0),
+ dictionary_page_offset_(0),
+ data_page_offset_(0),
+ total_uncompressed_size_(0),
+ total_compressed_size_(0),
+ page_ordinal_(0),
+ row_group_ordinal_(row_group_ordinal),
+ column_ordinal_(column_chunk_ordinal),
+ meta_encryptor_(std::move(meta_encryptor)),
+ data_encryptor_(std::move(data_encryptor)),
+ encryption_buffer_(AllocateBuffer(pool, 0)) {
+ if (data_encryptor_ != nullptr || meta_encryptor_ != nullptr) {
+ InitEncryption();
+ }
+ compressor_ = GetCodec(codec, compression_level);
+ thrift_serializer_.reset(new ThriftSerializer);
+ }
+
+ int64_t WriteDictionaryPage(const DictionaryPage& page) override {
+ int64_t uncompressed_size = page.size();
+ std::shared_ptr<Buffer> compressed_data;
+ if (has_compressor()) {
+ auto buffer = std::static_pointer_cast<ResizableBuffer>(
+ AllocateBuffer(pool_, uncompressed_size));
+ Compress(*(page.buffer().get()), buffer.get());
+ compressed_data = std::static_pointer_cast<Buffer>(buffer);
+ } else {
+ compressed_data = page.buffer();
+ }
+
+ format::DictionaryPageHeader dict_page_header;
+ dict_page_header.__set_num_values(page.num_values());
+ dict_page_header.__set_encoding(ToThrift(page.encoding()));
+ dict_page_header.__set_is_sorted(page.is_sorted());
+
+ const uint8_t* output_data_buffer = compressed_data->data();
+ int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
+
+ if (data_encryptor_.get()) {
+ UpdateEncryption(encryption::kDictionaryPage);
+ PARQUET_THROW_NOT_OK(encryption_buffer_->Resize(
+ data_encryptor_->CiphertextSizeDelta() + output_data_len, false));
+ output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len,
+ encryption_buffer_->mutable_data());
+ output_data_buffer = encryption_buffer_->data();
+ }
+
+ format::PageHeader page_header;
+ page_header.__set_type(format::PageType::DICTIONARY_PAGE);
+ page_header.__set_uncompressed_page_size(static_cast<int32_t>(uncompressed_size));
+ page_header.__set_compressed_page_size(static_cast<int32_t>(output_data_len));
+ page_header.__set_dictionary_page_header(dict_page_header);
+ // TODO(PARQUET-594) crc checksum
+
+ PARQUET_ASSIGN_OR_THROW(int64_t start_pos, sink_->Tell());
+ if (dictionary_page_offset_ == 0) {
+ dictionary_page_offset_ = start_pos;
+ }
+
+ if (meta_encryptor_) {
+ UpdateEncryption(encryption::kDictionaryPageHeader);
+ }
+ const int64_t header_size =
+ thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_);
+
+ PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len));
+
+ total_uncompressed_size_ += uncompressed_size + header_size;
+ total_compressed_size_ += output_data_len + header_size;
+ ++dict_encoding_stats_[page.encoding()];
+ return uncompressed_size + header_size;
+ }
+
+ void Close(bool has_dictionary, bool fallback) override {
+ if (meta_encryptor_ != nullptr) {
+ UpdateEncryption(encryption::kColumnMetaData);
+ }
+    // index_page_offset = -1 since index pages are not supported
+ metadata_->Finish(num_values_, dictionary_page_offset_, -1, data_page_offset_,
+ total_compressed_size_, total_uncompressed_size_, has_dictionary,
+ fallback, dict_encoding_stats_, data_encoding_stats_,
+ meta_encryptor_);
+ // Write metadata at end of column chunk
+ metadata_->WriteTo(sink_.get());
+ }
+
+  // Compress a buffer.
+ void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) override {
+ DCHECK(compressor_ != nullptr);
+
+ // Compress the data
+ int64_t max_compressed_size =
+ compressor_->MaxCompressedLen(src_buffer.size(), src_buffer.data());
+
+    // Pass shrink_to_fit = false: the underlying buffer only keeps growing,
+    // and resizing to a smaller size does not reallocate.
+ PARQUET_THROW_NOT_OK(dest_buffer->Resize(max_compressed_size, false));
+
+ PARQUET_ASSIGN_OR_THROW(
+ int64_t compressed_size,
+ compressor_->Compress(src_buffer.size(), src_buffer.data(), max_compressed_size,
+ dest_buffer->mutable_data()));
+ PARQUET_THROW_NOT_OK(dest_buffer->Resize(compressed_size, false));
+ }
+
+ int64_t WriteDataPage(const DataPage& page) override {
+ const int64_t uncompressed_size = page.uncompressed_size();
+ std::shared_ptr<Buffer> compressed_data = page.buffer();
+ const uint8_t* output_data_buffer = compressed_data->data();
+ int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
+
+ if (data_encryptor_.get()) {
+ PARQUET_THROW_NOT_OK(encryption_buffer_->Resize(
+ data_encryptor_->CiphertextSizeDelta() + output_data_len, false));
+ UpdateEncryption(encryption::kDataPage);
+ output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len,
+ encryption_buffer_->mutable_data());
+ output_data_buffer = encryption_buffer_->data();
+ }
+
+ format::PageHeader page_header;
+ page_header.__set_uncompressed_page_size(static_cast<int32_t>(uncompressed_size));
+ page_header.__set_compressed_page_size(static_cast<int32_t>(output_data_len));
+ // TODO(PARQUET-594) crc checksum
+
+ if (page.type() == PageType::DATA_PAGE) {
+ const DataPageV1& v1_page = checked_cast<const DataPageV1&>(page);
+ SetDataPageHeader(page_header, v1_page);
+ } else if (page.type() == PageType::DATA_PAGE_V2) {
+ const DataPageV2& v2_page = checked_cast<const DataPageV2&>(page);
+ SetDataPageV2Header(page_header, v2_page);
+ } else {
+ throw ParquetException("Unexpected page type");
+ }
+
+ PARQUET_ASSIGN_OR_THROW(int64_t start_pos, sink_->Tell());
+ if (page_ordinal_ == 0) {
+ data_page_offset_ = start_pos;
+ }
+
+ if (meta_encryptor_) {
+ UpdateEncryption(encryption::kDataPageHeader);
+ }
+ const int64_t header_size =
+ thrift_serializer_->Serialize(&page_header, sink_.get(), meta_encryptor_);
+ PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len));
+
+ total_uncompressed_size_ += uncompressed_size + header_size;
+ total_compressed_size_ += output_data_len + header_size;
+ num_values_ += page.num_values();
+ ++data_encoding_stats_[page.encoding()];
+ ++page_ordinal_;
+ return uncompressed_size + header_size;
+ }
+
+ void SetDataPageHeader(format::PageHeader& page_header, const DataPageV1& page) {
+ format::DataPageHeader data_page_header;
+ data_page_header.__set_num_values(page.num_values());
+ data_page_header.__set_encoding(ToThrift(page.encoding()));
+ data_page_header.__set_definition_level_encoding(
+ ToThrift(page.definition_level_encoding()));
+ data_page_header.__set_repetition_level_encoding(
+ ToThrift(page.repetition_level_encoding()));
+ data_page_header.__set_statistics(ToThrift(page.statistics()));
+
+ page_header.__set_type(format::PageType::DATA_PAGE);
+ page_header.__set_data_page_header(data_page_header);
+ }
+
+  void SetDataPageV2Header(format::PageHeader& page_header, const DataPageV2& page) {
+ format::DataPageHeaderV2 data_page_header;
+ data_page_header.__set_num_values(page.num_values());
+ data_page_header.__set_num_nulls(page.num_nulls());
+ data_page_header.__set_num_rows(page.num_rows());
+ data_page_header.__set_encoding(ToThrift(page.encoding()));
+
+ data_page_header.__set_definition_levels_byte_length(
+ page.definition_levels_byte_length());
+ data_page_header.__set_repetition_levels_byte_length(
+ page.repetition_levels_byte_length());
+
+ data_page_header.__set_is_compressed(page.is_compressed());
+ data_page_header.__set_statistics(ToThrift(page.statistics()));
+
+ page_header.__set_type(format::PageType::DATA_PAGE_V2);
+ page_header.__set_data_page_header_v2(data_page_header);
+ }
+
+ bool has_compressor() override { return (compressor_ != nullptr); }
+
+ int64_t num_values() { return num_values_; }
+
+ int64_t dictionary_page_offset() { return dictionary_page_offset_; }
+
+ int64_t data_page_offset() { return data_page_offset_; }
+
+ int64_t total_compressed_size() { return total_compressed_size_; }
+
+ int64_t total_uncompressed_size() { return total_uncompressed_size_; }
+
+ private:
+ // To allow UpdateEncryption on Close
+ friend class BufferedPageWriter;
+
+ void InitEncryption() {
+ // Prepare the AAD for quick update later.
+ if (data_encryptor_ != nullptr) {
+ data_page_aad_ = encryption::CreateModuleAad(
+ data_encryptor_->file_aad(), encryption::kDataPage, row_group_ordinal_,
+ column_ordinal_, kNonPageOrdinal);
+ }
+ if (meta_encryptor_ != nullptr) {
+ data_page_header_aad_ = encryption::CreateModuleAad(
+ meta_encryptor_->file_aad(), encryption::kDataPageHeader, row_group_ordinal_,
+ column_ordinal_, kNonPageOrdinal);
+ }
+ }
+
+ void UpdateEncryption(int8_t module_type) {
+ switch (module_type) {
+ case encryption::kColumnMetaData: {
+ meta_encryptor_->UpdateAad(encryption::CreateModuleAad(
+ meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
+ kNonPageOrdinal));
+ break;
+ }
+ case encryption::kDataPage: {
+ encryption::QuickUpdatePageAad(data_page_aad_, page_ordinal_);
+ data_encryptor_->UpdateAad(data_page_aad_);
+ break;
+ }
+ case encryption::kDataPageHeader: {
+ encryption::QuickUpdatePageAad(data_page_header_aad_, page_ordinal_);
+ meta_encryptor_->UpdateAad(data_page_header_aad_);
+ break;
+ }
+ case encryption::kDictionaryPageHeader: {
+ meta_encryptor_->UpdateAad(encryption::CreateModuleAad(
+ meta_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
+ kNonPageOrdinal));
+ break;
+ }
+ case encryption::kDictionaryPage: {
+ data_encryptor_->UpdateAad(encryption::CreateModuleAad(
+ data_encryptor_->file_aad(), module_type, row_group_ordinal_, column_ordinal_,
+ kNonPageOrdinal));
+ break;
+ }
+ default:
+ throw ParquetException("Unknown module type in UpdateEncryption");
+ }
+ }
+
+ std::shared_ptr<ArrowOutputStream> sink_;
+ ColumnChunkMetaDataBuilder* metadata_;
+ MemoryPool* pool_;
+ int64_t num_values_;
+ int64_t dictionary_page_offset_;
+ int64_t data_page_offset_;
+ int64_t total_uncompressed_size_;
+ int64_t total_compressed_size_;
+ int16_t page_ordinal_;
+ int16_t row_group_ordinal_;
+ int16_t column_ordinal_;
+
+ std::unique_ptr<ThriftSerializer> thrift_serializer_;
+
+ // Compression codec to use.
+ std::unique_ptr<::arrow::util::Codec> compressor_;
+
+ std::string data_page_aad_;
+ std::string data_page_header_aad_;
+
+ std::shared_ptr<Encryptor> meta_encryptor_;
+ std::shared_ptr<Encryptor> data_encryptor_;
+
+ std::shared_ptr<ResizableBuffer> encryption_buffer_;
+
+ std::map<Encoding::type, int32_t> dict_encoding_stats_;
+ std::map<Encoding::type, int32_t> data_encoding_stats_;
+};
+
+// This implementation of the PageWriter buffers pages in memory and writes
+// them to the final sink on Close().
+class BufferedPageWriter : public PageWriter {
+ public:
+ BufferedPageWriter(std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
+ int compression_level, ColumnChunkMetaDataBuilder* metadata,
+ int16_t row_group_ordinal, int16_t current_column_ordinal,
+ MemoryPool* pool = ::arrow::default_memory_pool(),
+ std::shared_ptr<Encryptor> meta_encryptor = nullptr,
+ std::shared_ptr<Encryptor> data_encryptor = nullptr)
+ : final_sink_(std::move(sink)), metadata_(metadata), has_dictionary_pages_(false) {
+ in_memory_sink_ = CreateOutputStream(pool);
+ pager_ = std::unique_ptr<SerializedPageWriter>(
+ new SerializedPageWriter(in_memory_sink_, codec, compression_level, metadata,
+ row_group_ordinal, current_column_ordinal, pool,
+ std::move(meta_encryptor), std::move(data_encryptor)));
+ }
+
+ int64_t WriteDictionaryPage(const DictionaryPage& page) override {
+ has_dictionary_pages_ = true;
+ return pager_->WriteDictionaryPage(page);
+ }
+
+ void Close(bool has_dictionary, bool fallback) override {
+ if (pager_->meta_encryptor_ != nullptr) {
+ pager_->UpdateEncryption(encryption::kColumnMetaData);
+ }
+    // index_page_offset = -1 since index pages are not supported
+ PARQUET_ASSIGN_OR_THROW(int64_t final_position, final_sink_->Tell());
+ // dictionary page offset should be 0 iff there are no dictionary pages
+ auto dictionary_page_offset =
+ has_dictionary_pages_ ? pager_->dictionary_page_offset() + final_position : 0;
+ metadata_->Finish(pager_->num_values(), dictionary_page_offset, -1,
+ pager_->data_page_offset() + final_position,
+ pager_->total_compressed_size(), pager_->total_uncompressed_size(),
+ has_dictionary, fallback, pager_->dict_encoding_stats_,
+ pager_->data_encoding_stats_, pager_->meta_encryptor_);
+
+ // Write metadata at end of column chunk
+ metadata_->WriteTo(in_memory_sink_.get());
+
+ // flush everything to the serialized sink
+ PARQUET_ASSIGN_OR_THROW(auto buffer, in_memory_sink_->Finish());
+ PARQUET_THROW_NOT_OK(final_sink_->Write(buffer));
+ }
+
+ int64_t WriteDataPage(const DataPage& page) override {
+ return pager_->WriteDataPage(page);
+ }
+
+ void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) override {
+ pager_->Compress(src_buffer, dest_buffer);
+ }
+
+ bool has_compressor() override { return pager_->has_compressor(); }
+
+ private:
+ std::shared_ptr<ArrowOutputStream> final_sink_;
+ ColumnChunkMetaDataBuilder* metadata_;
+ std::shared_ptr<::arrow::io::BufferOutputStream> in_memory_sink_;
+ std::unique_ptr<SerializedPageWriter> pager_;
+ bool has_dictionary_pages_;
+};
+
+std::unique_ptr<PageWriter> PageWriter::Open(
+ std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
+ int compression_level, ColumnChunkMetaDataBuilder* metadata,
+ int16_t row_group_ordinal, int16_t column_chunk_ordinal, MemoryPool* pool,
+ bool buffered_row_group, std::shared_ptr<Encryptor> meta_encryptor,
+ std::shared_ptr<Encryptor> data_encryptor) {
+ if (buffered_row_group) {
+ return std::unique_ptr<PageWriter>(
+ new BufferedPageWriter(std::move(sink), codec, compression_level, metadata,
+ row_group_ordinal, column_chunk_ordinal, pool,
+ std::move(meta_encryptor), std::move(data_encryptor)));
+ } else {
+ return std::unique_ptr<PageWriter>(
+ new SerializedPageWriter(std::move(sink), codec, compression_level, metadata,
+ row_group_ordinal, column_chunk_ordinal, pool,
+ std::move(meta_encryptor), std::move(data_encryptor)));
+ }
+}
+
+// ----------------------------------------------------------------------
+// ColumnWriter
+
+const std::shared_ptr<WriterProperties>& default_writer_properties() {
+ static std::shared_ptr<WriterProperties> default_writer_properties =
+ WriterProperties::Builder().build();
+ return default_writer_properties;
+}
+
+class ColumnWriterImpl {
+ public:
+ ColumnWriterImpl(ColumnChunkMetaDataBuilder* metadata,
+ std::unique_ptr<PageWriter> pager, const bool use_dictionary,
+ Encoding::type encoding, const WriterProperties* properties)
+ : metadata_(metadata),
+ descr_(metadata->descr()),
+ level_info_(ComputeLevelInfo(metadata->descr())),
+ pager_(std::move(pager)),
+ has_dictionary_(use_dictionary),
+ encoding_(encoding),
+ properties_(properties),
+ allocator_(properties->memory_pool()),
+ num_buffered_values_(0),
+ num_buffered_encoded_values_(0),
+ rows_written_(0),
+ total_bytes_written_(0),
+ total_compressed_bytes_(0),
+ closed_(false),
+ fallback_(false),
+ definition_levels_sink_(allocator_),
+ repetition_levels_sink_(allocator_) {
+ definition_levels_rle_ =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
+ repetition_levels_rle_ =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
+ uncompressed_data_ =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
+
+ if (pager_->has_compressor()) {
+ compressor_temp_buffer_ =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0));
+ }
+ }
+
+ virtual ~ColumnWriterImpl() = default;
+
+ int64_t Close();
+
+ protected:
+ virtual std::shared_ptr<Buffer> GetValuesBuffer() = 0;
+
+ // Serializes Dictionary Page if enabled
+ virtual void WriteDictionaryPage() = 0;
+
+ // Plain-encoded statistics of the current page
+ virtual EncodedStatistics GetPageStatistics() = 0;
+
+ // Plain-encoded statistics of the whole chunk
+ virtual EncodedStatistics GetChunkStatistics() = 0;
+
+ // Merges page statistics into chunk statistics, then resets the values
+ virtual void ResetPageStatistics() = 0;
+
+ // Adds Data Pages to an in-memory buffer in dictionary encoding mode;
+ // serializes the Data Pages immediately in other encoding modes
+ void AddDataPage();
+
+ void BuildDataPageV1(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size, int64_t uncompressed_size,
+ const std::shared_ptr<Buffer>& values);
+ void BuildDataPageV2(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size, int64_t uncompressed_size,
+ const std::shared_ptr<Buffer>& values);
+
+ // Serializes Data Pages
+ void WriteDataPage(const DataPage& page) {
+ total_bytes_written_ += pager_->WriteDataPage(page);
+ }
+
+ // Write multiple definition levels
+ void WriteDefinitionLevels(int64_t num_levels, const int16_t* levels) {
+ DCHECK(!closed_);
+ PARQUET_THROW_NOT_OK(
+ definition_levels_sink_.Append(levels, sizeof(int16_t) * num_levels));
+ }
+
+ // Write multiple repetition levels
+ void WriteRepetitionLevels(int64_t num_levels, const int16_t* levels) {
+ DCHECK(!closed_);
+ PARQUET_THROW_NOT_OK(
+ repetition_levels_sink_.Append(levels, sizeof(int16_t) * num_levels));
+ }
+
+ // RLE encode the src_buffer into dest_buffer and return the encoded size
+ int64_t RleEncodeLevels(const void* src_buffer, ResizableBuffer* dest_buffer,
+ int16_t max_level, bool include_length_prefix = true);
+
+ // Serialize the buffered Data Pages
+ void FlushBufferedDataPages();
+
+ ColumnChunkMetaDataBuilder* metadata_;
+ const ColumnDescriptor* descr_;
+ // Scratch buffer used if validity bits need to be recalculated.
+ std::shared_ptr<ResizableBuffer> bits_buffer_;
+ const internal::LevelInfo level_info_;
+
+ std::unique_ptr<PageWriter> pager_;
+
+ bool has_dictionary_;
+ Encoding::type encoding_;
+ const WriterProperties* properties_;
+
+ LevelEncoder level_encoder_;
+
+ MemoryPool* allocator_;
+
+ // The total number of values stored in the data page. This is the maximum of
+ // the number of encoded definition levels or encoded values. For
+ // non-repeated, required columns, this is equal to the number of encoded
+ // values. For repeated or optional values, there may be fewer data values
+ // than levels, and this tells you how many encoded levels there are in that
+ // case.
+ int64_t num_buffered_values_;
+
+ // The total number of stored values. For repeated or optional values, this
+ // number may be lower than num_buffered_values_.
+ int64_t num_buffered_encoded_values_;
+
+ // Total number of rows written with this ColumnWriter
+ int rows_written_;
+
+ // Records the total number of uncompressed bytes written by the serializer
+ int64_t total_bytes_written_;
+
+ // Records the current number of compressed bytes in a column
+ int64_t total_compressed_bytes_;
+
+ // Flag to check if the Writer has been closed
+ bool closed_;
+
+ // Flag to infer if dictionary encoding has fallen back to PLAIN
+ bool fallback_;
+
+ ::arrow::BufferBuilder definition_levels_sink_;
+ ::arrow::BufferBuilder repetition_levels_sink_;
+
+ std::shared_ptr<ResizableBuffer> definition_levels_rle_;
+ std::shared_ptr<ResizableBuffer> repetition_levels_rle_;
+
+ std::shared_ptr<ResizableBuffer> uncompressed_data_;
+ std::shared_ptr<ResizableBuffer> compressor_temp_buffer_;
+
+ std::vector<std::unique_ptr<DataPage>> data_pages_;
+
+ private:
+ void InitSinks() {
+ definition_levels_sink_.Rewind(0);
+ repetition_levels_sink_.Rewind(0);
+ }
+
+ // Concatenate the encoded levels and values into one buffer
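+ // The resulting page body layout is
+ // [repetition levels][definition levels][values].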
+ void ConcatenateBuffers(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size,
+ const std::shared_ptr<Buffer>& values, uint8_t* combined) {
+ memcpy(combined, repetition_levels_rle_->data(), repetition_levels_rle_size);
+ combined += repetition_levels_rle_size;
+ memcpy(combined, definition_levels_rle_->data(), definition_levels_rle_size);
+ combined += definition_levels_rle_size;
+ memcpy(combined, values->data(), values->size());
+ }
+};
+
+// return the size of the encoded buffer
+int64_t ColumnWriterImpl::RleEncodeLevels(const void* src_buffer,
+ ResizableBuffer* dest_buffer, int16_t max_level,
+ bool include_length_prefix) {
+ // V1 DataPage includes the length of the RLE level as a prefix.
+ int32_t prefix_size = include_length_prefix ? sizeof(int32_t) : 0;
+
+ // TODO: This only works due to some RLE specifics
+ int64_t rle_size = LevelEncoder::MaxBufferSize(Encoding::RLE, max_level,
+ static_cast<int>(num_buffered_values_)) +
+ prefix_size;
+
+ // Use ::arrow::Buffer shrink_to_fit = false: the underlying buffer only keeps
+ // growing, and resizing to a smaller size does not reallocate.
+ PARQUET_THROW_NOT_OK(dest_buffer->Resize(rle_size, false));
+
+ level_encoder_.Init(Encoding::RLE, max_level, static_cast<int>(num_buffered_values_),
+ dest_buffer->mutable_data() + prefix_size,
+ static_cast<int>(dest_buffer->size() - prefix_size));
+ int encoded = level_encoder_.Encode(static_cast<int>(num_buffered_values_),
+ reinterpret_cast<const int16_t*>(src_buffer));
+ DCHECK_EQ(encoded, num_buffered_values_);
+
+ if (include_length_prefix) {
+ reinterpret_cast<int32_t*>(dest_buffer->mutable_data())[0] = level_encoder_.len();
+ }
+
+ return level_encoder_.len() + prefix_size;
+}
+
+void ColumnWriterImpl::AddDataPage() {
+ int64_t definition_levels_rle_size = 0;
+ int64_t repetition_levels_rle_size = 0;
+
+ std::shared_ptr<Buffer> values = GetValuesBuffer();
+ bool is_v1_data_page = properties_->data_page_version() == ParquetDataPageVersion::V1;
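+ // V1 pages prefix the RLE-encoded levels with their length and compress
+ // levels and values together; V2 pages omit the length prefix and leave the
+ // levels uncompressed (see BuildDataPageV2 below).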
+
+ if (descr_->max_definition_level() > 0) {
+ definition_levels_rle_size = RleEncodeLevels(
+ definition_levels_sink_.data(), definition_levels_rle_.get(),
+ descr_->max_definition_level(), /*include_length_prefix=*/is_v1_data_page);
+ }
+
+ if (descr_->max_repetition_level() > 0) {
+ repetition_levels_rle_size = RleEncodeLevels(
+ repetition_levels_sink_.data(), repetition_levels_rle_.get(),
+ descr_->max_repetition_level(), /*include_length_prefix=*/is_v1_data_page);
+ }
+
+ int64_t uncompressed_size =
+ definition_levels_rle_size + repetition_levels_rle_size + values->size();
+
+ if (is_v1_data_page) {
+ BuildDataPageV1(definition_levels_rle_size, repetition_levels_rle_size,
+ uncompressed_size, values);
+ } else {
+ BuildDataPageV2(definition_levels_rle_size, repetition_levels_rle_size,
+ uncompressed_size, values);
+ }
+
+ // Re-initialize the sinks for next Page.
+ InitSinks();
+ num_buffered_values_ = 0;
+ num_buffered_encoded_values_ = 0;
+}
+
+void ColumnWriterImpl::BuildDataPageV1(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size,
+ int64_t uncompressed_size,
+ const std::shared_ptr<Buffer>& values) {
+ // Use ::arrow::Buffer shrink_to_fit = false: the underlying buffer only keeps
+ // growing, and resizing to a smaller size does not reallocate.
+ PARQUET_THROW_NOT_OK(uncompressed_data_->Resize(uncompressed_size, false));
+ ConcatenateBuffers(definition_levels_rle_size, repetition_levels_rle_size, values,
+ uncompressed_data_->mutable_data());
+
+ EncodedStatistics page_stats = GetPageStatistics();
+ page_stats.ApplyStatSizeLimits(properties_->max_statistics_size(descr_->path()));
+ page_stats.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
+ ResetPageStatistics();
+
+ std::shared_ptr<Buffer> compressed_data;
+ if (pager_->has_compressor()) {
+ pager_->Compress(*(uncompressed_data_.get()), compressor_temp_buffer_.get());
+ compressed_data = compressor_temp_buffer_;
+ } else {
+ compressed_data = uncompressed_data_;
+ }
+
+ // Write the page to OutputStream eagerly if there is no dictionary or
+ // if dictionary encoding has fallen back to PLAIN
+ if (has_dictionary_ && !fallback_) { // Save pages until end of dictionary encoding
+ PARQUET_ASSIGN_OR_THROW(
+ auto compressed_data_copy,
+ compressed_data->CopySlice(0, compressed_data->size(), allocator_));
+ std::unique_ptr<DataPage> page_ptr(new DataPageV1(
+ compressed_data_copy, static_cast<int32_t>(num_buffered_values_), encoding_,
+ Encoding::RLE, Encoding::RLE, uncompressed_size, page_stats));
+ total_compressed_bytes_ += page_ptr->size() + sizeof(format::PageHeader);
+
+ data_pages_.push_back(std::move(page_ptr));
+ } else { // Eagerly write pages
+ DataPageV1 page(compressed_data, static_cast<int32_t>(num_buffered_values_),
+ encoding_, Encoding::RLE, Encoding::RLE, uncompressed_size,
+ page_stats);
+ WriteDataPage(page);
+ }
+}
+
+void ColumnWriterImpl::BuildDataPageV2(int64_t definition_levels_rle_size,
+ int64_t repetition_levels_rle_size,
+ int64_t uncompressed_size,
+ const std::shared_ptr<Buffer>& values) {
+ // Compress the values if needed. Repetition and definition levels are uncompressed in
+ // V2.
+ std::shared_ptr<Buffer> compressed_values;
+ if (pager_->has_compressor()) {
+ pager_->Compress(*values, compressor_temp_buffer_.get());
+ compressed_values = compressor_temp_buffer_;
+ } else {
+ compressed_values = values;
+ }
+
+ // Concatenate uncompressed levels and the possibly compressed values
+ int64_t combined_size =
+ definition_levels_rle_size + repetition_levels_rle_size + compressed_values->size();
+ std::shared_ptr<ResizableBuffer> combined = AllocateBuffer(allocator_, combined_size);
+
+ ConcatenateBuffers(definition_levels_rle_size, repetition_levels_rle_size,
+ compressed_values, combined->mutable_data());
+
+ EncodedStatistics page_stats = GetPageStatistics();
+ page_stats.ApplyStatSizeLimits(properties_->max_statistics_size(descr_->path()));
+ page_stats.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
+ ResetPageStatistics();
+
+ int32_t num_values = static_cast<int32_t>(num_buffered_values_);
+ int32_t null_count = static_cast<int32_t>(page_stats.null_count);
+ int32_t def_levels_byte_length = static_cast<int32_t>(definition_levels_rle_size);
+ int32_t rep_levels_byte_length = static_cast<int32_t>(repetition_levels_rle_size);
+
+ // Write the page to OutputStream eagerly if there is no dictionary or
+ // if dictionary encoding has fallen back to PLAIN
+ if (has_dictionary_ && !fallback_) { // Save pages until end of dictionary encoding
+ PARQUET_ASSIGN_OR_THROW(auto data_copy,
+ combined->CopySlice(0, combined->size(), allocator_));
+ std::unique_ptr<DataPage> page_ptr(new DataPageV2(
+ data_copy, num_values, null_count, num_values, encoding_, def_levels_byte_length,
+ rep_levels_byte_length, uncompressed_size, pager_->has_compressor()));
+ total_compressed_bytes_ += page_ptr->size() + sizeof(format::PageHeader);
+ data_pages_.push_back(std::move(page_ptr));
+ } else {
+ DataPageV2 page(combined, num_values, null_count, num_values, encoding_,
+ def_levels_byte_length, rep_levels_byte_length, uncompressed_size,
+ pager_->has_compressor());
+ WriteDataPage(page);
+ }
+}
+
+int64_t ColumnWriterImpl::Close() {
+ if (!closed_) {
+ closed_ = true;
+ if (has_dictionary_ && !fallback_) {
+ WriteDictionaryPage();
+ }
+
+ FlushBufferedDataPages();
+
+ EncodedStatistics chunk_statistics = GetChunkStatistics();
+ chunk_statistics.ApplyStatSizeLimits(
+ properties_->max_statistics_size(descr_->path()));
+ chunk_statistics.set_is_signed(SortOrder::SIGNED == descr_->sort_order());
+
+ // Write stats only if the column has at least one row written
+ if (rows_written_ > 0 && chunk_statistics.is_set()) {
+ metadata_->SetStatistics(chunk_statistics);
+ }
+ pager_->Close(has_dictionary_, fallback_);
+ }
+
+ return total_bytes_written_;
+}
+
+void ColumnWriterImpl::FlushBufferedDataPages() {
+ // Write all outstanding data to a new page
+ if (num_buffered_values_ > 0) {
+ AddDataPage();
+ }
+ for (const auto& page_ptr : data_pages_) {
+ WriteDataPage(*page_ptr);
+ }
+ data_pages_.clear();
+ total_compressed_bytes_ = 0;
+}
+
+// ----------------------------------------------------------------------
+// TypedColumnWriter
+
+template <typename Action>
+inline void DoInBatches(int64_t total, int64_t batch_size, Action&& action) {
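+ // For example, total = 2500 with batch_size = 1000 invokes action(0, 1000),
+ // action(1000, 1000), then action(2000, 500) for the remainder.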
+ int64_t num_batches = total / batch_size;
+ for (int64_t round = 0; round < num_batches; round++) {
+ action(round * batch_size, batch_size);
+ }
+ // Write the remaining values
+ if (total % batch_size > 0) {
+ action(num_batches * batch_size, total % batch_size);
+ }
+}
+
+bool DictionaryDirectWriteSupported(const ::arrow::Array& array) {
+ DCHECK_EQ(array.type_id(), ::arrow::Type::DICTIONARY);
+ const ::arrow::DictionaryType& dict_type =
+ static_cast<const ::arrow::DictionaryType&>(*array.type());
+ return ::arrow::is_base_binary_like(dict_type.value_type()->id());
+}
+
+Status ConvertDictionaryToDense(const ::arrow::Array& array, MemoryPool* pool,
+ std::shared_ptr<::arrow::Array>* out) {
+ const ::arrow::DictionaryType& dict_type =
+ static_cast<const ::arrow::DictionaryType&>(*array.type());
+
+ ::arrow::compute::ExecContext ctx(pool);
+ ARROW_ASSIGN_OR_RAISE(Datum cast_output,
+ ::arrow::compute::Cast(array.data(), dict_type.value_type(),
+ ::arrow::compute::CastOptions(), &ctx));
+ *out = cast_output.make_array();
+ return Status::OK();
+}
+
+static inline bool IsDictionaryEncoding(Encoding::type encoding) {
+ return encoding == Encoding::PLAIN_DICTIONARY;
+}
+
+template <typename DType>
+class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ TypedColumnWriterImpl(ColumnChunkMetaDataBuilder* metadata,
+ std::unique_ptr<PageWriter> pager, const bool use_dictionary,
+ Encoding::type encoding, const WriterProperties* properties)
+ : ColumnWriterImpl(metadata, std::move(pager), use_dictionary, encoding,
+ properties) {
+ current_encoder_ = MakeEncoder(DType::type_num, encoding, use_dictionary, descr_,
+ properties->memory_pool());
+
+ if (properties->statistics_enabled(descr_->path()) &&
+ (SortOrder::UNKNOWN != descr_->sort_order())) {
+ page_statistics_ = MakeStatistics<DType>(descr_, allocator_);
+ chunk_statistics_ = MakeStatistics<DType>(descr_, allocator_);
+ }
+ }
+
+ int64_t Close() override { return ColumnWriterImpl::Close(); }
+
+ int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels, const T* values) override {
+ // We check for DataPage limits only after we have inserted the values. If a user
+ // writes a large number of values, the DataPage size can be much above the limit.
+ // The purpose of this chunking is to bound that overshoot. Even if a user writes
+ // a large number of values, the chunking ensures that AddDataPage() is called at
+ // a reasonable page size limit
+ int64_t value_offset = 0;
+
+ auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
+ int64_t values_to_write = WriteLevels(batch_size, AddIfNotNull(def_levels, offset),
+ AddIfNotNull(rep_levels, offset));
+
+ // PARQUET-780
+ if (values_to_write > 0) {
+ DCHECK_NE(nullptr, values);
+ }
+ WriteValues(AddIfNotNull(values, value_offset), values_to_write,
+ batch_size - values_to_write);
+ CommitWriteAndCheckPageLimit(batch_size, values_to_write);
+ value_offset += values_to_write;
+
+ // Dictionary size checked separately from data page size since we
+ // circumvent this check when writing ::arrow::DictionaryArray directly
+ CheckDictionarySizeLimit();
+ };
+ DoInBatches(num_values, properties_->write_batch_size(), WriteChunk);
+ return value_offset;
+ }
+
+ void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, const T* values) override {
+ // Like WriteBatch, but for spaced values
+ int64_t value_offset = 0;
+ auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
+ int64_t batch_num_values = 0;
+ int64_t batch_num_spaced_values = 0;
+ int64_t null_count;
+ MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
+ &batch_num_values, &batch_num_spaced_values,
+ &null_count);
+
+ WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
+ AddIfNotNull(rep_levels, offset));
+ if (bits_buffer_ != nullptr) {
+ WriteValuesSpaced(AddIfNotNull(values, value_offset), batch_num_values,
+ batch_num_spaced_values, bits_buffer_->data(), /*offset=*/0);
+ } else {
+ WriteValuesSpaced(AddIfNotNull(values, value_offset), batch_num_values,
+ batch_num_spaced_values, valid_bits,
+ valid_bits_offset + value_offset);
+ }
+ CommitWriteAndCheckPageLimit(batch_size, batch_num_spaced_values);
+ value_offset += batch_num_spaced_values;
+
+ // Dictionary size checked separately from data page size since we
+ // circumvent this check when writing ::arrow::DictionaryArray directly
+ CheckDictionarySizeLimit();
+ };
+ DoInBatches(num_values, properties_->write_batch_size(), WriteChunk);
+ }
+
+ Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_levels, const ::arrow::Array& leaf_array,
+ ArrowWriteContext* ctx, bool leaf_field_nullable) override {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ // Leaf nulls are canonical when there is only a single null element after a list
+ // and it is at the leaf.
+ bool single_nullable_element =
+ (level_info_.def_level == level_info_.repeated_ancestor_def_level + 1) &&
+ leaf_field_nullable;
+ bool maybe_parent_nulls = level_info_.HasNullableValues() && !single_nullable_element;
+ if (maybe_parent_nulls) {
+ ARROW_ASSIGN_OR_RAISE(
+ bits_buffer_,
+ ::arrow::AllocateResizableBuffer(
+ BitUtil::BytesForBits(properties_->write_batch_size()), ctx->memory_pool));
+ bits_buffer_->ZeroPadding();
+ }
+
+ if (leaf_array.type()->id() == ::arrow::Type::DICTIONARY) {
+ return WriteArrowDictionary(def_levels, rep_levels, num_levels, leaf_array, ctx,
+ maybe_parent_nulls);
+ } else {
+ return WriteArrowDense(def_levels, rep_levels, num_levels, leaf_array, ctx,
+ maybe_parent_nulls);
+ }
+ END_PARQUET_CATCH_EXCEPTIONS
+ }
+
+ int64_t EstimatedBufferedValueBytes() const override {
+ return current_encoder_->EstimatedDataEncodedSize();
+ }
+
+ protected:
+ std::shared_ptr<Buffer> GetValuesBuffer() override {
+ return current_encoder_->FlushValues();
+ }
+
+ // Internal function to handle direct writing of ::arrow::DictionaryArray,
+ // since the standard logic concerning dictionary size limits and fallback to
+ // plain encoding is circumvented
+ Status WriteArrowDictionary(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_levels, const ::arrow::Array& array,
+ ArrowWriteContext* context, bool maybe_parent_nulls);
+
+ Status WriteArrowDense(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_levels, const ::arrow::Array& array,
+ ArrowWriteContext* context, bool maybe_parent_nulls);
+
+ void WriteDictionaryPage() override {
+ // We have to dynamic_cast here because, with TypedEncoder<Type>, some
+ // compilers don't want to cast through virtual inheritance
+ auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
+ DCHECK(dict_encoder);
+ std::shared_ptr<ResizableBuffer> buffer =
+ AllocateBuffer(properties_->memory_pool(), dict_encoder->dict_encoded_size());
+ dict_encoder->WriteDict(buffer->mutable_data());
+
+ DictionaryPage page(buffer, dict_encoder->num_entries(),
+ properties_->dictionary_page_encoding());
+ total_bytes_written_ += pager_->WriteDictionaryPage(page);
+ }
+
+ EncodedStatistics GetPageStatistics() override {
+ EncodedStatistics result;
+ if (page_statistics_) result = page_statistics_->Encode();
+ return result;
+ }
+
+ EncodedStatistics GetChunkStatistics() override {
+ EncodedStatistics result;
+ if (chunk_statistics_) result = chunk_statistics_->Encode();
+ return result;
+ }
+
+ void ResetPageStatistics() override {
+ if (chunk_statistics_ != nullptr) {
+ chunk_statistics_->Merge(*page_statistics_);
+ page_statistics_->Reset();
+ }
+ }
+
+ Type::type type() const override { return descr_->physical_type(); }
+
+ const ColumnDescriptor* descr() const override { return descr_; }
+
+ int64_t rows_written() const override { return rows_written_; }
+
+ int64_t total_compressed_bytes() const override { return total_compressed_bytes_; }
+
+ int64_t total_bytes_written() const override { return total_bytes_written_; }
+
+ const WriterProperties* properties() override { return properties_; }
+
+ private:
+ using ValueEncoderType = typename EncodingTraits<DType>::Encoder;
+ using TypedStats = TypedStatistics<DType>;
+ std::unique_ptr<Encoder> current_encoder_;
+ std::shared_ptr<TypedStats> page_statistics_;
+ std::shared_ptr<TypedStats> chunk_statistics_;
+
+ // If writing a sequence of ::arrow::DictionaryArray to the writer, we keep the
+ // dictionary passed to DictEncoder<T>::PutDictionary so we can check
+ // subsequent array chunks to see whether materialization is required (in
+ // which case we fall back to the dense write path)
+ std::shared_ptr<::arrow::Array> preserved_dictionary_;
+
+ int64_t WriteLevels(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels) {
+ int64_t values_to_write = 0;
+ // If the field is required and non-repeated, there are no definition levels
+ if (descr_->max_definition_level() > 0) {
+ for (int64_t i = 0; i < num_values; ++i) {
+ if (def_levels[i] == descr_->max_definition_level()) {
+ ++values_to_write;
+ }
+ }
+
+ WriteDefinitionLevels(num_values, def_levels);
+ } else {
+ // Required field, write all values
+ values_to_write = num_values;
+ }
+
+ // Not present for non-repeated fields
+ if (descr_->max_repetition_level() > 0) {
+ // A row could include more than one value
+ // Count the occasions where we start a new row
+ for (int64_t i = 0; i < num_values; ++i) {
+ if (rep_levels[i] == 0) {
+ rows_written_++;
+ }
+ }
+
+ WriteRepetitionLevels(num_values, rep_levels);
+ } else {
+ // Each value is exactly one row
+ rows_written_ += static_cast<int>(num_values);
+ }
+ return values_to_write;
+ }
+
+ // This method will always update the three output parameters,
+ // out_values_to_write, out_spaced_values_to_write and null_count. Additionally
+ // it will update the validity bitmap if required (i.e. if at least one level
+ // of nullable structs directly precedes the leaf node).
+ void MaybeCalculateValidityBits(const int16_t* def_levels, int64_t batch_size,
+ int64_t* out_values_to_write,
+ int64_t* out_spaced_values_to_write,
+ int64_t* null_count) {
+ if (bits_buffer_ == nullptr) {
+ if (level_info_.def_level == 0) {
+ // In this case def levels should be null and we only
+ // need to output counts which will always be equal to
+ // the batch size passed in (max def_level == 0 indicates
+ // there cannot be repeated or null fields).
+ DCHECK_EQ(def_levels, nullptr);
+ *out_values_to_write = batch_size;
+ *out_spaced_values_to_write = batch_size;
+ *null_count = 0;
+ } else {
+ for (int x = 0; x < batch_size; x++) {
+ *out_values_to_write += def_levels[x] == level_info_.def_level ? 1 : 0;
+ *out_spaced_values_to_write +=
+ def_levels[x] >= level_info_.repeated_ancestor_def_level ? 1 : 0;
+ }
+ *null_count = *out_spaced_values_to_write - *out_values_to_write;
+ }
+ return;
+ }
+ // Shrink to fit possible causes another allocation, and would only be necessary
+ // on the last batch.
+ int64_t new_bitmap_size = BitUtil::BytesForBits(batch_size);
+ if (new_bitmap_size != bits_buffer_->size()) {
+ PARQUET_THROW_NOT_OK(
+ bits_buffer_->Resize(new_bitmap_size, /*shrink_to_fit=*/false));
+ bits_buffer_->ZeroPadding();
+ }
+ internal::ValidityBitmapInputOutput io;
+ io.valid_bits = bits_buffer_->mutable_data();
+ io.values_read_upper_bound = batch_size;
+ internal::DefLevelsToBitmap(def_levels, batch_size, level_info_, &io);
+ *out_values_to_write = io.values_read - io.null_count;
+ *out_spaced_values_to_write = io.values_read;
+ *null_count = io.null_count;
+ }
+
+ Result<std::shared_ptr<Array>> MaybeReplaceValidity(std::shared_ptr<Array> array,
+ int64_t new_null_count,
+ ::arrow::MemoryPool* memory_pool) {
+ if (bits_buffer_ == nullptr) {
+ return array;
+ }
+ std::vector<std::shared_ptr<Buffer>> buffers = array->data()->buffers;
+ if (buffers.empty()) {
+ return array;
+ }
+ buffers[0] = bits_buffer_;
+ // Should be a leaf array.
+ DCHECK_GT(buffers.size(), 1);
+ ValueBufferSlicer slicer{memory_pool, /*buffer=*/nullptr};
+ if (array->data()->offset > 0) {
+ RETURN_NOT_OK(::arrow::VisitArrayInline(*array, &slicer));
+ buffers[1] = slicer.buffer_;
+ }
+ return ::arrow::MakeArray(std::make_shared<ArrayData>(
+ array->type(), array->length(), std::move(buffers), new_null_count));
+ }
+
+ void WriteLevelsSpaced(int64_t num_levels, const int16_t* def_levels,
+ const int16_t* rep_levels) {
+ // If the field is required and non-repeated, there are no definition levels
+ if (descr_->max_definition_level() > 0) {
+ WriteDefinitionLevels(num_levels, def_levels);
+ }
+ // Not present for non-repeated fields
+ if (descr_->max_repetition_level() > 0) {
+ // A row could include more than one value
+ // Count the occasions where we start a new row
+ for (int64_t i = 0; i < num_levels; ++i) {
+ if (rep_levels[i] == 0) {
+ rows_written_++;
+ }
+ }
+ WriteRepetitionLevels(num_levels, rep_levels);
+ } else {
+ // Each value is exactly one row
+ rows_written_ += static_cast<int>(num_levels);
+ }
+ }
+
+ void CommitWriteAndCheckPageLimit(int64_t num_levels, int64_t num_values) {
+ num_buffered_values_ += num_levels;
+ num_buffered_encoded_values_ += num_values;
+
+ if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) {
+ AddDataPage();
+ }
+ }
+
+ void FallbackToPlainEncoding() {
+ if (IsDictionaryEncoding(current_encoder_->encoding())) {
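+ // The dictionary page must precede the buffered data pages in the chunk,
+ // since those pages hold dictionary indices.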
+ WriteDictionaryPage();
+ // Serialize the buffered Dictionary Indices
+ FlushBufferedDataPages();
+ fallback_ = true;
+ // Only PLAIN encoding is supported for fallback in V1
+ current_encoder_ = MakeEncoder(DType::type_num, Encoding::PLAIN, false, descr_,
+ properties_->memory_pool());
+ encoding_ = Encoding::PLAIN;
+ }
+ }
+
+ // Checks if the Dictionary Page size limit is reached.
+ // If the limit is reached, the Dictionary Page and the buffered Data Pages
+ // are serialized and the encoding switches to PLAIN.
+ //
+ // Only one Dictionary Page is written per column chunk.
+ void CheckDictionarySizeLimit() {
+ if (!has_dictionary_ || fallback_) {
+ // Either not using dictionary encoding, or we have already fallen back
+ // to PLAIN encoding because the size threshold was reached
+ return;
+ }
+
+ // We have to dynamic_cast here because, with TypedEncoder<Type>, some compilers
+ // don't want to cast through virtual inheritance
+ auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
+ if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) {
+ FallbackToPlainEncoding();
+ }
+ }
+
+ void WriteValues(const T* values, int64_t num_values, int64_t num_nulls) {
+ dynamic_cast<ValueEncoderType*>(current_encoder_.get())
+ ->Put(values, static_cast<int>(num_values));
+ if (page_statistics_ != nullptr) {
+ page_statistics_->Update(values, num_values, num_nulls);
+ }
+ }
+
+ void WriteValuesSpaced(const T* values, int64_t num_values, int64_t num_spaced_values,
+ const uint8_t* valid_bits, int64_t valid_bits_offset) {
+ if (num_values != num_spaced_values) {
+ dynamic_cast<ValueEncoderType*>(current_encoder_.get())
+ ->PutSpaced(values, static_cast<int>(num_spaced_values), valid_bits,
+ valid_bits_offset);
+ } else {
+ dynamic_cast<ValueEncoderType*>(current_encoder_.get())
+ ->Put(values, static_cast<int>(num_values));
+ }
+ if (page_statistics_ != nullptr) {
+ const int64_t num_nulls = num_spaced_values - num_values;
+ page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, num_values,
+ num_nulls);
+ }
+ }
+};
+
+template <typename DType>
+Status TypedColumnWriterImpl<DType>::WriteArrowDictionary(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ // If this is the first time writing a DictionaryArray, then there are
+ // a few possible paths to take:
+ //
+ // - If dictionary encoding is not enabled, convert to densely
+ // encoded and call WriteArrow
+ // - Dictionary encoding enabled
+ // - If this is the first time this is called, then we call
+ // PutDictionary into the encoder and then PutIndices on each
+ // chunk. We store the dictionary that was written in
+ // preserved_dictionary_ so that subsequent calls to this method
+ // can make sure the dictionary has not changed
+ // - On subsequent calls, we have to check whether the dictionary
+ // has changed. If it has, then we trigger the varying
+ // dictionary path and materialize each chunk and then call
+ // WriteArrow with that
+ auto WriteDense = [&] {
+ std::shared_ptr<::arrow::Array> dense_array;
+ RETURN_NOT_OK(
+ ConvertDictionaryToDense(array, properties_->memory_pool(), &dense_array));
+ return WriteArrowDense(def_levels, rep_levels, num_levels, *dense_array, ctx,
+ maybe_parent_nulls);
+ };
+
+ if (!IsDictionaryEncoding(current_encoder_->encoding()) ||
+ !DictionaryDirectWriteSupported(array)) {
+ // No longer dictionary-encoding for whatever reason, maybe we never were
+ // or we decided to stop. Note that WriteArrow can be invoked multiple
+ // times with both dense and dictionary-encoded versions of the same data
+ // without a problem. Any dense data will be hashed to indices until the
+ // dictionary page limit is reached, at which point everything (dictionary and
+ // dense) will fall back to plain encoding
+ return WriteDense();
+ }
+
+ auto dict_encoder = dynamic_cast<DictEncoder<DType>*>(current_encoder_.get());
+ const auto& data = checked_cast<const ::arrow::DictionaryArray&>(array);
+ std::shared_ptr<::arrow::Array> dictionary = data.dictionary();
+ std::shared_ptr<::arrow::Array> indices = data.indices();
+
+ int64_t value_offset = 0;
+ auto WriteIndicesChunk = [&](int64_t offset, int64_t batch_size) {
+ int64_t batch_num_values = 0;
+ int64_t batch_num_spaced_values = 0;
+ int64_t null_count = ::arrow::kUnknownNullCount;
+ // bits_buffer_ is non-null for nullable values. At this point in the code we can't
+ // determine whether the leaf array has the same null values as any parents it might
+ // have had, so we need to recompute the validity bitmap from the def levels.
+ MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
+ &batch_num_values, &batch_num_spaced_values, &null_count);
+ WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
+ AddIfNotNull(rep_levels, offset));
+ std::shared_ptr<Array> writeable_indices =
+ indices->Slice(value_offset, batch_num_spaced_values);
+ PARQUET_ASSIGN_OR_THROW(
+ writeable_indices,
+ MaybeReplaceValidity(writeable_indices, null_count, ctx->memory_pool));
+ dict_encoder->PutIndices(*writeable_indices);
+ CommitWriteAndCheckPageLimit(batch_size, batch_num_values);
+ value_offset += batch_num_spaced_values;
+ };
+
+ // Handle seeing dictionary for the first time
+ if (!preserved_dictionary_) {
+ // It's a new dictionary. Call PutDictionary and keep track of it
+ PARQUET_CATCH_NOT_OK(dict_encoder->PutDictionary(*dictionary));
+
+ // If there were duplicate values in the dictionary, the encoder's memo table
+ // will be out of sync with the indices in the Arrow array.
+ // The easiest solution for this uncommon case is to fall back to plain encoding.
+ if (dict_encoder->num_entries() != dictionary->length()) {
+ PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding());
+ return WriteDense();
+ }
+
+ // TODO(wesm): If some dictionary values are unobserved, then the
+ // statistics will be inaccurate. Do we care enough to fix it?
+ if (page_statistics_ != nullptr) {
+ PARQUET_CATCH_NOT_OK(page_statistics_->Update(*dictionary));
+ }
+ preserved_dictionary_ = dictionary;
+ } else if (!dictionary->Equals(*preserved_dictionary_)) {
+ // Dictionary has changed
+ PARQUET_CATCH_NOT_OK(FallbackToPlainEncoding());
+ return WriteDense();
+ }
+
+ PARQUET_CATCH_NOT_OK(
+ DoInBatches(num_levels, properties_->write_batch_size(), WriteIndicesChunk));
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Direct Arrow write path
+
+template <typename ParquetType, typename ArrowType, typename Enable = void>
+struct SerializeFunctor {
+ using ArrowCType = typename ArrowType::c_type;
+ using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
+ using ParquetCType = typename ParquetType::c_type;
+ Status Serialize(const ArrayType& array, ArrowWriteContext*, ParquetCType* out) {
+ const ArrowCType* input = array.raw_values();
+ if (array.null_count() > 0) {
+ for (int i = 0; i < array.length(); i++) {
+ out[i] = static_cast<ParquetCType>(input[i]);
+ }
+ } else {
+ std::copy(input, input + array.length(), out);
+ }
+ return Status::OK();
+ }
+};
+
+template <typename ParquetType, typename ArrowType>
+Status WriteArrowSerialize(const ::arrow::Array& array, int64_t num_levels,
+ const int16_t* def_levels, const int16_t* rep_levels,
+ ArrowWriteContext* ctx, TypedColumnWriter<ParquetType>* writer,
+ bool maybe_parent_nulls) {
+ using ParquetCType = typename ParquetType::c_type;
+ using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
+
+ ParquetCType* buffer = nullptr;
+ PARQUET_THROW_NOT_OK(ctx->GetScratchData<ParquetCType>(array.length(), &buffer));
+
+ SerializeFunctor<ParquetType, ArrowType> functor;
+ RETURN_NOT_OK(functor.Serialize(checked_cast<const ArrayType&>(array), ctx, buffer));
+ bool no_nulls =
+ writer->descr()->schema_node()->is_required() || (array.null_count() == 0);
+ if (!maybe_parent_nulls && no_nulls) {
+ PARQUET_CATCH_NOT_OK(writer->WriteBatch(num_levels, def_levels, rep_levels, buffer));
+ } else {
+ PARQUET_CATCH_NOT_OK(writer->WriteBatchSpaced(num_levels, def_levels, rep_levels,
+ array.null_bitmap_data(),
+ array.offset(), buffer));
+ }
+ return Status::OK();
+}
+
+template <typename ParquetType>
+Status WriteArrowZeroCopy(const ::arrow::Array& array, int64_t num_levels,
+ const int16_t* def_levels, const int16_t* rep_levels,
+ ArrowWriteContext* ctx, TypedColumnWriter<ParquetType>* writer,
+ bool maybe_parent_nulls) {
+ using T = typename ParquetType::c_type;
+ const auto& data = static_cast<const ::arrow::PrimitiveArray&>(array);
+ const T* values = nullptr;
+ // The values buffer may be null if the array is empty (ARROW-2744)
+ if (data.values() != nullptr) {
+ values = reinterpret_cast<const T*>(data.values()->data()) + data.offset();
+ } else {
+ DCHECK_EQ(data.length(), 0);
+ }
+ bool no_nulls =
+ writer->descr()->schema_node()->is_required() || (array.null_count() == 0);
+
+ if (!maybe_parent_nulls && no_nulls) {
+ PARQUET_CATCH_NOT_OK(writer->WriteBatch(num_levels, def_levels, rep_levels, values));
+ } else {
+ PARQUET_CATCH_NOT_OK(writer->WriteBatchSpaced(num_levels, def_levels, rep_levels,
+ data.null_bitmap_data(), data.offset(),
+ values));
+ }
+ return Status::OK();
+}
+
+#define WRITE_SERIALIZE_CASE(ArrowEnum, ArrowType, ParquetType) \
+ case ::arrow::Type::ArrowEnum: \
+ return WriteArrowSerialize<ParquetType, ::arrow::ArrowType>( \
+ array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
+
+#define WRITE_ZERO_COPY_CASE(ArrowEnum, ArrowType, ParquetType) \
+ case ::arrow::Type::ArrowEnum: \
+ return WriteArrowZeroCopy<ParquetType>(array, num_levels, def_levels, rep_levels, \
+ ctx, this, maybe_parent_nulls);
+
+#define ARROW_UNSUPPORTED() \
+ std::stringstream ss; \
+ ss << "Arrow type " << array.type()->ToString() \
+ << " cannot be written to Parquet type " << descr_->ToString(); \
+ return Status::Invalid(ss.str());
+
+// ----------------------------------------------------------------------
+// Write Arrow to BooleanType
+
+template <>
+struct SerializeFunctor<BooleanType, ::arrow::BooleanType> {
+ Status Serialize(const ::arrow::BooleanArray& data, ArrowWriteContext*, bool* out) {
+ for (int i = 0; i < data.length(); i++) {
+ *out++ = data.Value(i);
+ }
+ return Status::OK();
+ }
+};
+
+template <>
+Status TypedColumnWriterImpl<BooleanType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (array.type_id() != ::arrow::Type::BOOL) {
+ ARROW_UNSUPPORTED();
+ }
+ return WriteArrowSerialize<BooleanType, ::arrow::BooleanType>(
+ array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
+}
+
+// ----------------------------------------------------------------------
+// Write Arrow types to INT32
+
+template <>
+struct SerializeFunctor<Int32Type, ::arrow::Date64Type> {
+ Status Serialize(const ::arrow::Date64Array& array, ArrowWriteContext*, int32_t* out) {
+ const int64_t* input = array.raw_values();
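+ // Convert milliseconds since the epoch to days (86400000 ms per day),
+ // matching the Parquet DATE logical type stored as INT32.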
+ for (int i = 0; i < array.length(); i++) {
+ *out++ = static_cast<int32_t>(*input++ / 86400000);
+ }
+ return Status::OK();
+ }
+};
+
+template <>
+struct SerializeFunctor<Int32Type, ::arrow::Time32Type> {
+ Status Serialize(const ::arrow::Time32Array& array, ArrowWriteContext*, int32_t* out) {
+ const int32_t* input = array.raw_values();
+ const auto& type = static_cast<const ::arrow::Time32Type&>(*array.type());
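+ // Parquet's INT32 time type stores milliseconds, so second-resolution
+ // values are scaled up by 1000; millisecond values are copied as-is.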
+ if (type.unit() == ::arrow::TimeUnit::SECOND) {
+ for (int i = 0; i < array.length(); i++) {
+ out[i] = input[i] * 1000;
+ }
+ } else {
+ std::copy(input, input + array.length(), out);
+ }
+ return Status::OK();
+ }
+};
+
+template <>
+Status TypedColumnWriterImpl<Int32Type>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ switch (array.type()->id()) {
+ case ::arrow::Type::NA: {
+ PARQUET_CATCH_NOT_OK(WriteBatch(num_levels, def_levels, rep_levels, nullptr));
+ } break;
+ WRITE_SERIALIZE_CASE(INT8, Int8Type, Int32Type)
+ WRITE_SERIALIZE_CASE(UINT8, UInt8Type, Int32Type)
+ WRITE_SERIALIZE_CASE(INT16, Int16Type, Int32Type)
+ WRITE_SERIALIZE_CASE(UINT16, UInt16Type, Int32Type)
+ WRITE_SERIALIZE_CASE(UINT32, UInt32Type, Int32Type)
+ WRITE_ZERO_COPY_CASE(INT32, Int32Type, Int32Type)
+ WRITE_ZERO_COPY_CASE(DATE32, Date32Type, Int32Type)
+ WRITE_SERIALIZE_CASE(DATE64, Date64Type, Int32Type)
+ WRITE_SERIALIZE_CASE(TIME32, Time32Type, Int32Type)
+ default:
+ ARROW_UNSUPPORTED()
+ }
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Write Arrow to Int64 and Int96
+
+#define INT96_CONVERT_LOOP(ConversionFunction) \
+ for (int64_t i = 0; i < array.length(); i++) ConversionFunction(input[i], &out[i]);
+
+template <>
+struct SerializeFunctor<Int96Type, ::arrow::TimestampType> {
+ Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext*, Int96* out) {
+ const int64_t* input = array.raw_values();
+ const auto& type = static_cast<const ::arrow::TimestampType&>(*array.type());
+ switch (type.unit()) {
+ case ::arrow::TimeUnit::NANO:
+ INT96_CONVERT_LOOP(internal::NanosecondsToImpalaTimestamp);
+ break;
+ case ::arrow::TimeUnit::MICRO:
+ INT96_CONVERT_LOOP(internal::MicrosecondsToImpalaTimestamp);
+ break;
+ case ::arrow::TimeUnit::MILLI:
+ INT96_CONVERT_LOOP(internal::MillisecondsToImpalaTimestamp);
+ break;
+ case ::arrow::TimeUnit::SECOND:
+ INT96_CONVERT_LOOP(internal::SecondsToImpalaTimestamp);
+ break;
+ }
+ return Status::OK();
+ }
+};
+
+#define COERCE_DIVIDE -1
+#define COERCE_INVALID 0
+#define COERCE_MULTIPLY +1
+
+static std::pair<int, int64_t> kTimestampCoercionFactors[4][4] = {
+ // from seconds ...
+ {{COERCE_INVALID, 0}, // ... to seconds
+ {COERCE_MULTIPLY, 1000}, // ... to millis
+ {COERCE_MULTIPLY, 1000000}, // ... to micros
+ {COERCE_MULTIPLY, INT64_C(1000000000)}}, // ... to nanos
+ // from millis ...
+ {{COERCE_INVALID, 0},
+ {COERCE_MULTIPLY, 1},
+ {COERCE_MULTIPLY, 1000},
+ {COERCE_MULTIPLY, 1000000}},
+ // from micros ...
+ {{COERCE_INVALID, 0},
+ {COERCE_DIVIDE, 1000},
+ {COERCE_MULTIPLY, 1},
+ {COERCE_MULTIPLY, 1000}},
+ // from nanos ...
+ {{COERCE_INVALID, 0},
+ {COERCE_DIVIDE, 1000000},
+ {COERCE_DIVIDE, 1000},
+ {COERCE_MULTIPLY, 1}}};
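+// Reading the table: e.g. coercing micros to millis uses {COERCE_DIVIDE, 1000},
+// so each value is divided by 1000 and, unless truncated timestamps are allowed,
+// a value not divisible by 1000 raises Status::Invalid (see DivideBy below).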
+
+template <>
+struct SerializeFunctor<Int64Type, ::arrow::TimestampType> {
+ Status Serialize(const ::arrow::TimestampArray& array, ArrowWriteContext* ctx,
+ int64_t* out) {
+ const auto& source_type = static_cast<const ::arrow::TimestampType&>(*array.type());
+ auto source_unit = source_type.unit();
+ const int64_t* values = array.raw_values();
+
+ ::arrow::TimeUnit::type target_unit = ctx->properties->coerce_timestamps_unit();
+ auto target_type = ::arrow::timestamp(target_unit);
+ bool truncation_allowed = ctx->properties->truncated_timestamps_allowed();
+
+ auto DivideBy = [&](const int64_t factor) {
+ for (int64_t i = 0; i < array.length(); i++) {
+ if (!truncation_allowed && array.IsValid(i) && (values[i] % factor != 0)) {
+ return Status::Invalid("Casting from ", source_type.ToString(), " to ",
+ target_type->ToString(),
+ " would lose data: ", values[i]);
+ }
+ out[i] = values[i] / factor;
+ }
+ return Status::OK();
+ };
+
+ auto MultiplyBy = [&](const int64_t factor) {
+ for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = values[i] * factor;
+ }
+ return Status::OK();
+ };
+
+ const auto& coercion = kTimestampCoercionFactors[static_cast<int>(source_unit)]
+ [static_cast<int>(target_unit)];
+
+ // .first -> coercion operation; .second -> scale factor
+ DCHECK_NE(coercion.first, COERCE_INVALID);
+ return coercion.first == COERCE_DIVIDE ? DivideBy(coercion.second)
+ : MultiplyBy(coercion.second);
+ }
+};
+
+#undef COERCE_DIVIDE
+#undef COERCE_INVALID
+#undef COERCE_MULTIPLY
+
+Status WriteTimestamps(const ::arrow::Array& values, int64_t num_levels,
+ const int16_t* def_levels, const int16_t* rep_levels,
+ ArrowWriteContext* ctx, TypedColumnWriter<Int64Type>* writer,
+ bool maybe_parent_nulls) {
+ const auto& source_type = static_cast<const ::arrow::TimestampType&>(*values.type());
+
+ auto WriteCoerce = [&](const ArrowWriterProperties* properties) {
+ ArrowWriteContext temp_ctx = *ctx;
+ temp_ctx.properties = properties;
+ return WriteArrowSerialize<Int64Type, ::arrow::TimestampType>(
+ values, num_levels, def_levels, rep_levels, &temp_ctx, writer,
+ maybe_parent_nulls);
+ };
+
+ if (ctx->properties->coerce_timestamps_enabled()) {
+ // User explicitly requested coercion to specific unit
+ if (source_type.unit() == ctx->properties->coerce_timestamps_unit()) {
+ // No data conversion necessary
+ return WriteArrowZeroCopy<Int64Type>(values, num_levels, def_levels, rep_levels,
+ ctx, writer, maybe_parent_nulls);
+ } else {
+ return WriteCoerce(ctx->properties);
+ }
+ } else if (writer->properties()->version() == ParquetVersion::PARQUET_1_0 &&
+ source_type.unit() == ::arrow::TimeUnit::NANO) {
+ // Absent superseding user instructions, when writing Parquet version 1.0 files,
+ // timestamps in nanoseconds are coerced to microseconds
+ std::shared_ptr<ArrowWriterProperties> properties =
+ (ArrowWriterProperties::Builder())
+ .coerce_timestamps(::arrow::TimeUnit::MICRO)
+ ->disallow_truncated_timestamps()
+ ->build();
+ return WriteCoerce(properties.get());
+ } else if (source_type.unit() == ::arrow::TimeUnit::SECOND) {
+ // Absent superseding user instructions, timestamps in seconds are coerced to
+ // milliseconds
+ std::shared_ptr<ArrowWriterProperties> properties =
+ (ArrowWriterProperties::Builder())
+ .coerce_timestamps(::arrow::TimeUnit::MILLI)
+ ->build();
+ return WriteCoerce(properties.get());
+ } else {
+ // No data conversion necessary
+ return WriteArrowZeroCopy<Int64Type>(values, num_levels, def_levels, rep_levels, ctx,
+ writer, maybe_parent_nulls);
+ }
+}
+
+template <>
+Status TypedColumnWriterImpl<Int64Type>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ switch (array.type()->id()) {
+ case ::arrow::Type::TIMESTAMP:
+ return WriteTimestamps(array, num_levels, def_levels, rep_levels, ctx, this,
+ maybe_parent_nulls);
+ WRITE_ZERO_COPY_CASE(INT64, Int64Type, Int64Type)
+ WRITE_SERIALIZE_CASE(UINT32, UInt32Type, Int64Type)
+ WRITE_SERIALIZE_CASE(UINT64, UInt64Type, Int64Type)
+ WRITE_ZERO_COPY_CASE(TIME64, Time64Type, Int64Type)
+ default:
+ ARROW_UNSUPPORTED();
+ }
+}
+
+template <>
+Status TypedColumnWriterImpl<Int96Type>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (array.type_id() != ::arrow::Type::TIMESTAMP) {
+ ARROW_UNSUPPORTED();
+ }
+ return WriteArrowSerialize<Int96Type, ::arrow::TimestampType>(
+ array, num_levels, def_levels, rep_levels, ctx, this, maybe_parent_nulls);
+}
+
+// ----------------------------------------------------------------------
+// Floating point types
+
+template <>
+Status TypedColumnWriterImpl<FloatType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (array.type_id() != ::arrow::Type::FLOAT) {
+ ARROW_UNSUPPORTED();
+ }
+ return WriteArrowZeroCopy<FloatType>(array, num_levels, def_levels, rep_levels, ctx,
+ this, maybe_parent_nulls);
+}
+
+template <>
+Status TypedColumnWriterImpl<DoubleType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (array.type_id() != ::arrow::Type::DOUBLE) {
+ ARROW_UNSUPPORTED();
+ }
+ return WriteArrowZeroCopy<DoubleType>(array, num_levels, def_levels, rep_levels, ctx,
+ this, maybe_parent_nulls);
+}
+
+// ----------------------------------------------------------------------
+// Write Arrow to BYTE_ARRAY
+
+template <>
+Status TypedColumnWriterImpl<ByteArrayType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ if (!::arrow::is_base_binary_like(array.type()->id())) {
+ ARROW_UNSUPPORTED();
+ }
+
+ int64_t value_offset = 0;
+ auto WriteChunk = [&](int64_t offset, int64_t batch_size) {
+ int64_t batch_num_values = 0;
+ int64_t batch_num_spaced_values = 0;
+ int64_t null_count = 0;
+
+ MaybeCalculateValidityBits(AddIfNotNull(def_levels, offset), batch_size,
+ &batch_num_values, &batch_num_spaced_values, &null_count);
+ WriteLevelsSpaced(batch_size, AddIfNotNull(def_levels, offset),
+ AddIfNotNull(rep_levels, offset));
+ std::shared_ptr<Array> data_slice =
+ array.Slice(value_offset, batch_num_spaced_values);
+ PARQUET_ASSIGN_OR_THROW(
+ data_slice, MaybeReplaceValidity(data_slice, null_count, ctx->memory_pool));
+
+ current_encoder_->Put(*data_slice);
+ if (page_statistics_ != nullptr) {
+ page_statistics_->Update(*data_slice);
+ }
+ CommitWriteAndCheckPageLimit(batch_size, batch_num_values);
+ CheckDictionarySizeLimit();
+ value_offset += batch_num_spaced_values;
+ };
+
+ PARQUET_CATCH_NOT_OK(
+ DoInBatches(num_levels, properties_->write_batch_size(), WriteChunk));
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Write Arrow to FIXED_LEN_BYTE_ARRAY
+
+template <typename ParquetType, typename ArrowType>
+struct SerializeFunctor<
+ ParquetType, ArrowType,
+ ::arrow::enable_if_t<::arrow::is_fixed_size_binary_type<ArrowType>::value &&
+ !::arrow::is_decimal_type<ArrowType>::value>> {
+ Status Serialize(const ::arrow::FixedSizeBinaryArray& array, ArrowWriteContext*,
+ FLBA* out) {
+ if (array.null_count() == 0) {
+ // No nulls, just dump the data.
+ // TODO(advancedxy): use a writeBatch to avoid this step
+ for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = FixedLenByteArray(array.GetValue(i));
+ }
+ } else {
+ for (int64_t i = 0; i < array.length(); i++) {
+ if (array.IsValid(i)) {
+ out[i] = FixedLenByteArray(array.GetValue(i));
+ }
+ }
+ }
+ return Status::OK();
+ }
+};
+
+// ----------------------------------------------------------------------
+// Write Arrow to Decimal128
+
+// Requires a custom serializer because decimals in Parquet are stored in
+// big-endian format. Thus, a temporary local buffer is required.
+template <typename ParquetType, typename ArrowType>
+struct SerializeFunctor<ParquetType, ArrowType, ::arrow::enable_if_decimal<ArrowType>> {
+ Status Serialize(const typename ::arrow::TypeTraits<ArrowType>::ArrayType& array,
+ ArrowWriteContext* ctx, FLBA* out) {
+ AllocateScratch(array, ctx);
+ auto offset = Offset(array);
+
+ if (array.null_count() == 0) {
+ for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = FixDecimalEndianess<ArrowType::kByteWidth>(array.GetValue(i), offset);
+ }
+ } else {
+ for (int64_t i = 0; i < array.length(); i++) {
+ out[i] = array.IsValid(i) ? FixDecimalEndianess<ArrowType::kByteWidth>(
+ array.GetValue(i), offset)
+ : FixedLenByteArray();
+ }
+ }
+
+ return Status::OK();
+ }
+
+ // Parquet's Decimals are stored as fixed-length values whose length is
+ // proportional to the precision. Arrow's Decimals are always stored in 16/32
+ // bytes. Thus the internal FLBA pointer must be adjusted by the offset
+ // calculated here.
+ int32_t Offset(const Array& array) {
+ auto decimal_type = checked_pointer_cast<::arrow::DecimalType>(array.type());
+ return decimal_type->byte_width() -
+ ::arrow::DecimalType::DecimalSize(decimal_type->precision());
+ }
+
+ void AllocateScratch(const typename ::arrow::TypeTraits<ArrowType>::ArrayType& array,
+ ArrowWriteContext* ctx) {
+ int64_t non_null_count = array.length() - array.null_count();
+ int64_t size = non_null_count * ArrowType::kByteWidth;
+ scratch_buffer = AllocateBuffer(ctx->memory_pool, size);
+ scratch = reinterpret_cast<int64_t*>(scratch_buffer->mutable_data());
+ }
+
+ template <int byte_width>
+ FixedLenByteArray FixDecimalEndianess(const uint8_t* in, int64_t offset) {
+ const auto* u64_in = reinterpret_cast<const int64_t*>(in);
+ auto out = reinterpret_cast<const uint8_t*>(scratch) + offset;
+ static_assert(byte_width == 16 || byte_width == 32,
+ "only 16 and 32 byte Decimals supported");
+ if (byte_width == 32) {
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[3]);
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[2]);
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[1]);
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[0]);
+ } else {
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[1]);
+ *scratch++ = ::arrow::BitUtil::ToBigEndian(u64_in[0]);
+ }
+ return FixedLenByteArray(out);
+ }
+
+ std::shared_ptr<ResizableBuffer> scratch_buffer;
+ int64_t* scratch;
+};
+
+template <>
+Status TypedColumnWriterImpl<FLBAType>::WriteArrowDense(
+ const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels,
+ const ::arrow::Array& array, ArrowWriteContext* ctx, bool maybe_parent_nulls) {
+ switch (array.type()->id()) {
+ WRITE_SERIALIZE_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryType, FLBAType)
+ WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, FLBAType)
+ WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, FLBAType)
+ default:
+ ARROW_UNSUPPORTED();
+ }
+}
+
+// ----------------------------------------------------------------------
+// Dynamic column writer constructor
+
+std::shared_ptr<ColumnWriter> ColumnWriter::Make(ColumnChunkMetaDataBuilder* metadata,
+ std::unique_ptr<PageWriter> pager,
+ const WriterProperties* properties) {
+ const ColumnDescriptor* descr = metadata->descr();
+ const bool use_dictionary = properties->dictionary_enabled(descr->path()) &&
+ descr->physical_type() != Type::BOOLEAN;
+ Encoding::type encoding = properties->encoding(descr->path());
+ if (use_dictionary) {
+ encoding = properties->dictionary_index_encoding();
+ }
+ switch (descr->physical_type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedColumnWriterImpl<BooleanType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::INT32:
+ return std::make_shared<TypedColumnWriterImpl<Int32Type>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::INT64:
+ return std::make_shared<TypedColumnWriterImpl<Int64Type>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::INT96:
+ return std::make_shared<TypedColumnWriterImpl<Int96Type>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::FLOAT:
+ return std::make_shared<TypedColumnWriterImpl<FloatType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::DOUBLE:
+ return std::make_shared<TypedColumnWriterImpl<DoubleType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedColumnWriterImpl<ByteArrayType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedColumnWriterImpl<FLBAType>>(
+ metadata, std::move(pager), use_dictionary, encoding, properties);
+ default:
+ ParquetException::NYI("type reader not implemented");
+ }
+ // Unreachable code, but suppress compiler warning
+ return std::shared_ptr<ColumnWriter>(nullptr);
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h b/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h
index 6661385abdb..0a609021739 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/column_writer.h
@@ -1,270 +1,270 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <cstring>
-#include <memory>
-
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/types.h"
-
-namespace arrow {
-
-class Array;
-
-namespace BitUtil {
-class BitWriter;
-} // namespace BitUtil
-
-namespace util {
-class RleEncoder;
-} // namespace util
-
-} // namespace arrow
-
-namespace parquet {
-
-struct ArrowWriteContext;
-class ColumnDescriptor;
-class DataPage;
-class DictionaryPage;
-class ColumnChunkMetaDataBuilder;
-class Encryptor;
-class WriterProperties;
-
-class PARQUET_EXPORT LevelEncoder {
- public:
- LevelEncoder();
- ~LevelEncoder();
-
- static int MaxBufferSize(Encoding::type encoding, int16_t max_level,
- int num_buffered_values);
-
- // Initialize the LevelEncoder.
- void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values,
- uint8_t* data, int data_size);
-
- // Encodes a batch of levels from an array and returns the number of levels encoded
- int Encode(int batch_size, const int16_t* levels);
-
- int32_t len() {
- if (encoding_ != Encoding::RLE) {
- throw ParquetException("Only implemented for RLE encoding");
- }
- return rle_length_;
- }
-
- private:
- int bit_width_;
- int rle_length_;
- Encoding::type encoding_;
- std::unique_ptr<::arrow::util::RleEncoder> rle_encoder_;
- std::unique_ptr<::arrow::BitUtil::BitWriter> bit_packed_encoder_;
-};
-
-class PARQUET_EXPORT PageWriter {
- public:
- virtual ~PageWriter() {}
-
- static std::unique_ptr<PageWriter> Open(
- std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
- int compression_level, ColumnChunkMetaDataBuilder* metadata,
- int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
- bool buffered_row_group = false,
- std::shared_ptr<Encryptor> header_encryptor = NULLPTR,
- std::shared_ptr<Encryptor> data_encryptor = NULLPTR);
-
- // The Column Writer tells the page writer whether dictionary encoding was
- // used and whether dictionary encoding has fallen back to the default
- // encoding on reaching the dictionary page limit
- virtual void Close(bool has_dictionary, bool fallback) = 0;
-
- // Return the number of uncompressed bytes written (including header size)
- virtual int64_t WriteDataPage(const DataPage& page) = 0;
-
- // Return the number of uncompressed bytes written (including header size)
- virtual int64_t WriteDictionaryPage(const DictionaryPage& page) = 0;
-
- virtual bool has_compressor() = 0;
-
- virtual void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) = 0;
-};
-
-static constexpr int WRITE_BATCH_SIZE = 1000;
-class PARQUET_EXPORT ColumnWriter {
- public:
- virtual ~ColumnWriter() = default;
-
- static std::shared_ptr<ColumnWriter> Make(ColumnChunkMetaDataBuilder*,
- std::unique_ptr<PageWriter>,
- const WriterProperties* properties);
-
- /// \brief Closes the ColumnWriter, commits any buffered values to pages.
- /// \return Total size of the column in bytes
- virtual int64_t Close() = 0;
-
- /// \brief The physical Parquet type of the column
- virtual Type::type type() const = 0;
-
- /// \brief The schema for the column
- virtual const ColumnDescriptor* descr() const = 0;
-
- /// \brief The number of rows written so far
- virtual int64_t rows_written() const = 0;
-
- /// \brief The total size of the compressed pages + page headers. Some values
- /// might still be buffered and not written to a page yet
- virtual int64_t total_compressed_bytes() const = 0;
-
- /// \brief The total number of bytes written as serialized data and
- /// dictionary pages to the ColumnChunk so far
- virtual int64_t total_bytes_written() const = 0;
-
- /// \brief The file-level writer properties
- virtual const WriterProperties* properties() = 0;
-
- /// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns
- /// error status if the array data type is not compatible with the concrete
- /// writer type.
- ///
- /// leaf_array is always a primitive (possibly dictionary encoded) type.
- /// leaf_field_nullable indicates whether the leaf array is considered nullable
- /// according to its schema in a Table or its parent array.
- virtual ::arrow::Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_levels, const ::arrow::Array& leaf_array,
- ArrowWriteContext* ctx,
- bool leaf_field_nullable) = 0;
-};
-
-// API to write values to a single column. This is the main client facing API.
-template <typename DType>
-class TypedColumnWriter : public ColumnWriter {
- public:
- using T = typename DType::c_type;
-
- // Write a batch of repetition levels, definition levels, and values to the
- // column.
- // `num_values` is the number of logical leaf values.
- // `def_levels` (resp. `rep_levels`) can be null if the column's max definition level
- // (resp. max repetition level) is 0.
- // If not null, each of `def_levels` and `rep_levels` must have at least
- // `num_values`.
- //
- // The number of physical values written (taken from `values`) is returned.
- // It can be smaller than `num_values` if there are some undefined values.
- virtual int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
- const int16_t* rep_levels, const T* values) = 0;
-
- /// Write a batch of repetition levels, definition levels, and values to the
- /// column.
- ///
- /// In comparison to WriteBatch, the length of the repetition and definition
- /// levels is the same as the number of values when max_definition_level == 1.
- /// In the case of max_definition_level > 1, the repetition and definition
- /// levels are longer than the values, but the values include the null entries
- /// with definition_level == (max_definition_level - 1). Thus we have to
- /// differentiate in the parameters of this function whether the input has the
- /// length of num_values or the _number of rows in the lowest nesting level_.
- ///
- /// If the innermost node of the Parquet schema is required, the _number of
- /// rows in the lowest nesting level_ equals the number of non-null values. If
- /// the innermost schema node is optional, the _number of rows in the lowest
- /// nesting level_ also includes all values with
- /// definition_level == (max_definition_level - 1).
- ///
- /// @param num_values number of levels to write.
- /// @param def_levels The Parquet definition levels, length is num_values
- /// @param rep_levels The Parquet repetition levels, length is num_values
- /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting
- /// level. The length is number of rows in the lowest nesting level.
- /// @param valid_bits_offset The offset in bits of the valid_bits where the
- /// first relevant bit resides.
- /// @param values The values in the lowest nested level including
- /// spacing for nulls on the lowest levels; input has the length
- /// of the number of rows on the lowest nesting level.
- virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
- const int16_t* rep_levels, const uint8_t* valid_bits,
- int64_t valid_bits_offset, const T* values) = 0;
-
- // Estimated size of the values that are not written to a page yet
- virtual int64_t EstimatedBufferedValueBytes() const = 0;
-};
-
-using BoolWriter = TypedColumnWriter<BooleanType>;
-using Int32Writer = TypedColumnWriter<Int32Type>;
-using Int64Writer = TypedColumnWriter<Int64Type>;
-using Int96Writer = TypedColumnWriter<Int96Type>;
-using FloatWriter = TypedColumnWriter<FloatType>;
-using DoubleWriter = TypedColumnWriter<DoubleType>;
-using ByteArrayWriter = TypedColumnWriter<ByteArrayType>;
-using FixedLenByteArrayWriter = TypedColumnWriter<FLBAType>;
-
-namespace internal {
-
-/**
- * Timestamp conversion constants
- */
-constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588);
-
-template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
-inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) {
- int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays;
- (*impala_timestamp).value[2] = (uint32_t)julian_days;
-
- int64_t last_day_units = time % UnitPerDay;
- auto last_day_nanos = last_day_units * NanosecondsPerUnit;
- // impala_timestamp will be unaligned every other entry so do memcpy instead
- // of assign and reinterpret cast to avoid undefined behavior.
- std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
-}
-
-constexpr int64_t kSecondsInNanos = INT64_C(1000000000);
-
-inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) {
- ArrowTimestampToImpalaTimestamp<kSecondsPerDay, kSecondsInNanos>(seconds,
- impala_timestamp);
-}
-
-constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000);
-
-inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds,
- Int96* impala_timestamp) {
- ArrowTimestampToImpalaTimestamp<kMillisecondsPerDay, kMillisecondsInNanos>(
- milliseconds, impala_timestamp);
-}
-
-constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000);
-
-inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds,
- Int96* impala_timestamp) {
- ArrowTimestampToImpalaTimestamp<kMicrosecondsPerDay, kMicrosecondsInNanos>(
- microseconds, impala_timestamp);
-}
-
-constexpr int64_t kNanosecondsInNanos = INT64_C(1);
-
-inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds,
- Int96* impala_timestamp) {
- ArrowTimestampToImpalaTimestamp<kNanosecondsPerDay, kNanosecondsInNanos>(
- nanoseconds, impala_timestamp);
-}
-
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+
+namespace BitUtil {
+class BitWriter;
+} // namespace BitUtil
+
+namespace util {
+class RleEncoder;
+} // namespace util
+
+} // namespace arrow
+
+namespace parquet {
+
+struct ArrowWriteContext;
+class ColumnDescriptor;
+class DataPage;
+class DictionaryPage;
+class ColumnChunkMetaDataBuilder;
+class Encryptor;
+class WriterProperties;
+
+class PARQUET_EXPORT LevelEncoder {
+ public:
+ LevelEncoder();
+ ~LevelEncoder();
+
+ static int MaxBufferSize(Encoding::type encoding, int16_t max_level,
+ int num_buffered_values);
+
+ // Initialize the LevelEncoder.
+ void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values,
+ uint8_t* data, int data_size);
+
+ // Encodes a batch of levels from an array and returns the number of levels encoded
+ int Encode(int batch_size, const int16_t* levels);
+
+ int32_t len() {
+ if (encoding_ != Encoding::RLE) {
+ throw ParquetException("Only implemented for RLE encoding");
+ }
+ return rle_length_;
+ }
+
+ private:
+ int bit_width_;
+ int rle_length_;
+ Encoding::type encoding_;
+ std::unique_ptr<::arrow::util::RleEncoder> rle_encoder_;
+ std::unique_ptr<::arrow::BitUtil::BitWriter> bit_packed_encoder_;
+};
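As a rough usage sketch for the class above (not part of this header; the level data, count, and max level are invented for illustration), the encoder is sized with MaxBufferSize(), bound to a caller-owned buffer with Init(), and then fed batches with Encode():

    #include <cstdint>
    #include <vector>
    #include "parquet/column_writer.h"

    // Hypothetical driver: RLE-encode 100 definition levels with max level 1.
    int EncodeDefLevels(const int16_t* def_levels) {
      parquet::LevelEncoder encoder;
      int capacity = parquet::LevelEncoder::MaxBufferSize(
          parquet::Encoding::RLE, /*max_level=*/1, /*num_buffered_values=*/100);
      std::vector<uint8_t> buffer(capacity);
      encoder.Init(parquet::Encoding::RLE, /*max_level=*/1,
                   /*num_buffered_values=*/100, buffer.data(), capacity);
      int encoded = encoder.Encode(/*batch_size=*/100, def_levels);
      // encoder.len() gives the RLE byte length once encoding succeeded.
      return encoded;
    }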
+
+class PARQUET_EXPORT PageWriter {
+ public:
+ virtual ~PageWriter() {}
+
+ static std::unique_ptr<PageWriter> Open(
+ std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
+ int compression_level, ColumnChunkMetaDataBuilder* metadata,
+ int16_t row_group_ordinal = -1, int16_t column_chunk_ordinal = -1,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
+ bool buffered_row_group = false,
+ std::shared_ptr<Encryptor> header_encryptor = NULLPTR,
+ std::shared_ptr<Encryptor> data_encryptor = NULLPTR);
+
+ // The Column Writer decides whether dictionary encoding is used, and whether
+ // the dictionary encoding has fallen back to the default encoding on reaching
+ // the dictionary page limit
+ virtual void Close(bool has_dictionary, bool fallback) = 0;
+
+ // Return the number of uncompressed bytes written (including header size)
+ virtual int64_t WriteDataPage(const DataPage& page) = 0;
+
+ // Return the number of uncompressed bytes written (including header size)
+ virtual int64_t WriteDictionaryPage(const DictionaryPage& page) = 0;
+
+ virtual bool has_compressor() = 0;
+
+ virtual void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) = 0;
+};
+
+static constexpr int WRITE_BATCH_SIZE = 1000;
+class PARQUET_EXPORT ColumnWriter {
+ public:
+ virtual ~ColumnWriter() = default;
+
+ static std::shared_ptr<ColumnWriter> Make(ColumnChunkMetaDataBuilder*,
+ std::unique_ptr<PageWriter>,
+ const WriterProperties* properties);
+
+ /// \brief Closes the ColumnWriter, commits any buffered values to pages.
+ /// \return Total size of the column in bytes
+ virtual int64_t Close() = 0;
+
+ /// \brief The physical Parquet type of the column
+ virtual Type::type type() const = 0;
+
+ /// \brief The schema for the column
+ virtual const ColumnDescriptor* descr() const = 0;
+
+ /// \brief The number of rows written so far
+ virtual int64_t rows_written() const = 0;
+
+ /// \brief The total size of the compressed pages + page headers. Some values
+ /// might still be buffered and not written to a page yet
+ virtual int64_t total_compressed_bytes() const = 0;
+
+ /// \brief The total number of bytes written as serialized data and
+ /// dictionary pages to the ColumnChunk so far
+ virtual int64_t total_bytes_written() const = 0;
+
+ /// \brief The file-level writer properties
+ virtual const WriterProperties* properties() = 0;
+
+ /// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns
+ /// error status if the array data type is not compatible with the concrete
+ /// writer type.
+ ///
+ /// leaf_array is always a primitive (possibly dictionary encoded) type.
+ /// leaf_field_nullable indicates whether the leaf array is considered nullable
+ /// according to its schema in a Table or its parent array.
+ virtual ::arrow::Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_levels, const ::arrow::Array& leaf_array,
+ ArrowWriteContext* ctx,
+ bool leaf_field_nullable) = 0;
+};
+
+// API to write values to a single column. This is the main client facing API.
+template <typename DType>
+class TypedColumnWriter : public ColumnWriter {
+ public:
+ using T = typename DType::c_type;
+
+ // Write a batch of repetition levels, definition levels, and values to the
+ // column.
+ // `num_values` is the number of logical leaf values.
+ // `def_levels` (resp. `rep_levels`) can be null if the column's max definition level
+ // (resp. max repetition level) is 0.
+ // If not null, each of `def_levels` and `rep_levels` must have at least
+ // `num_values`.
+ //
+ // The number of physical values written (taken from `values`) is returned.
+ // It can be smaller than `num_values` if there are some undefined values.
+ virtual int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels, const T* values) = 0;
+
+ /// Write a batch of repetition levels, definition levels, and values to the
+ /// column.
+ ///
+ /// In comparison to WriteBatch, the length of the repetition and definition
+ /// levels is the same as the number of values when max_definition_level == 1.
+ /// In the case of max_definition_level > 1, the repetition and definition
+ /// levels are longer than the values, but the values include the null entries
+ /// with definition_level == (max_definition_level - 1). Thus we have to
+ /// differentiate in the parameters of this function whether the input has the
+ /// length of num_values or the _number of rows in the lowest nesting level_.
+ ///
+ /// If the innermost node of the Parquet schema is required, the _number of
+ /// rows in the lowest nesting level_ equals the number of non-null values. If
+ /// the innermost schema node is optional, the _number of rows in the lowest
+ /// nesting level_ also includes all values with
+ /// definition_level == (max_definition_level - 1).
+ ///
+ /// @param num_values number of levels to write.
+ /// @param def_levels The Parquet definition levels, length is num_values
+ /// @param rep_levels The Parquet repetition levels, length is num_values
+ /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting
+ /// level. The length is number of rows in the lowest nesting level.
+ /// @param valid_bits_offset The offset in bits of the valid_bits where the
+ /// first relevant bit resides.
+ /// @param values The values in the lowest nested level including
+ /// spacing for nulls on the lowest levels; input has the length
+ /// of the number of rows on the lowest nesting level.
+ virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
+ const int16_t* rep_levels, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, const T* values) = 0;
+
+ // Estimated size of the values that are not written to a page yet
+ virtual int64_t EstimatedBufferedValueBytes() const = 0;
+};
+
+using BoolWriter = TypedColumnWriter<BooleanType>;
+using Int32Writer = TypedColumnWriter<Int32Type>;
+using Int64Writer = TypedColumnWriter<Int64Type>;
+using Int96Writer = TypedColumnWriter<Int96Type>;
+using FloatWriter = TypedColumnWriter<FloatType>;
+using DoubleWriter = TypedColumnWriter<DoubleType>;
+using ByteArrayWriter = TypedColumnWriter<ByteArrayType>;
+using FixedLenByteArrayWriter = TypedColumnWriter<FLBAType>;
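To make the WriteBatch contract concrete, here is a minimal sketch for an optional INT64 column with max_definition_level == 1 and no repetition; `rg_writer` is a hypothetical row group writer obtained elsewhere, not something defined in this header:

    // Sketch: write three logical values, the middle one null.
    auto* writer = static_cast<parquet::Int64Writer*>(rg_writer->NextColumn());

    int16_t def_levels[] = {1, 0, 1};     // 0 marks a null at this level
    int64_t values[] = {10, 20};          // only non-null values are supplied
    int64_t written = writer->WriteBatch(/*num_values=*/3, def_levels,
                                         /*rep_levels=*/nullptr, values);
    // written == 2: the number of physical values consumed from `values`.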
+
+namespace internal {
+
+/**
+ * Timestamp conversion constants
+ */
+constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588);
+
+template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
+inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) {
+ int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays;
+ (*impala_timestamp).value[2] = (uint32_t)julian_days;
+
+ int64_t last_day_units = time % UnitPerDay;
+ auto last_day_nanos = last_day_units * NanosecondsPerUnit;
+ // impala_timestamp will be unaligned every other entry so do memcpy instead
+ // of assign and reinterpret cast to avoid undefined behavior.
+ std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
+}
+
+constexpr int64_t kSecondsInNanos = INT64_C(1000000000);
+
+inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) {
+ ArrowTimestampToImpalaTimestamp<kSecondsPerDay, kSecondsInNanos>(seconds,
+ impala_timestamp);
+}
+
+constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000);
+
+inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds,
+ Int96* impala_timestamp) {
+ ArrowTimestampToImpalaTimestamp<kMillisecondsPerDay, kMillisecondsInNanos>(
+ milliseconds, impala_timestamp);
+}
+
+constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000);
+
+inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds,
+ Int96* impala_timestamp) {
+ ArrowTimestampToImpalaTimestamp<kMicrosecondsPerDay, kMicrosecondsInNanos>(
+ microseconds, impala_timestamp);
+}
+
+constexpr int64_t kNanosecondsInNanos = INT64_C(1);
+
+inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds,
+ Int96* impala_timestamp) {
+ ArrowTimestampToImpalaTimestamp<kNanosecondsPerDay, kNanosecondsInNanos>(
+ nanoseconds, impala_timestamp);
+}
+
+} // namespace internal
+} // namespace parquet
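A quick worked example of the Int96 conversion above (a sketch, not part of the change): 90061 seconds past the Unix epoch is one day plus 1h01m01s, and the helpers split it accordingly.

    #include "parquet/column_writer.h"

    parquet::Int96 ts;
    // 90061 s = 1 * 86400 s + 3661 s. With kJulianEpochOffsetDays == 2440588
    // (the Julian day of 1970-01-01), value[2] becomes 2440589 and the low
    // 8 bytes hold 3661 * 1000000000 nanoseconds within that day.
    parquet::internal::SecondsToImpalaTimestamp(90061, &ts);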
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc
index 3b615af706d..6e8f7ee5491 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encoding.cc
@@ -1,2547 +1,2547 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/encoding.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <cstdlib>
-#include <limits>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "arrow/array.h"
-#include "arrow/array/builder_dict.h"
-#include "arrow/stl_allocator.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/bit_stream_utils.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_ops.h"
-#include "arrow/util/bitmap_writer.h"
-#include "arrow/util/byte_stream_split.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/hashing.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/rle_encoding.h"
-#include "arrow/util/ubsan.h"
-#include "arrow/visitor_inline.h"
-
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-namespace BitUtil = arrow::BitUtil;
-
-using arrow::Status;
-using arrow::VisitNullBitmapInline;
-using arrow::internal::checked_cast;
-
-template <typename T>
-using ArrowPoolVector = std::vector<T, ::arrow::stl::allocator<T>>;
-
-namespace parquet {
-namespace {
-
-constexpr int64_t kInMemoryDefaultCapacity = 1024;
-// The Parquet spec isn't very clear whether ByteArray lengths are signed or
-// unsigned, but the Java implementation uses signed ints.
-constexpr size_t kMaxByteArraySize = std::numeric_limits<int32_t>::max();
-
-class EncoderImpl : virtual public Encoder {
- public:
- EncoderImpl(const ColumnDescriptor* descr, Encoding::type encoding, MemoryPool* pool)
- : descr_(descr),
- encoding_(encoding),
- pool_(pool),
- type_length_(descr ? descr->type_length() : -1) {}
-
- Encoding::type encoding() const override { return encoding_; }
-
- MemoryPool* memory_pool() const override { return pool_; }
-
- protected:
- // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
- const ColumnDescriptor* descr_;
- const Encoding::type encoding_;
- MemoryPool* pool_;
-
- /// Type length from descr
- int type_length_;
-};
-
-// ----------------------------------------------------------------------
-// Plain encoder implementation
-
-template <typename DType>
-class PlainEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
- public:
- using T = typename DType::c_type;
-
- explicit PlainEncoder(const ColumnDescriptor* descr, MemoryPool* pool)
- : EncoderImpl(descr, Encoding::PLAIN, pool), sink_(pool) {}
-
- int64_t EstimatedDataEncodedSize() override { return sink_.length(); }
-
- std::shared_ptr<Buffer> FlushValues() override {
- std::shared_ptr<Buffer> buffer;
- PARQUET_THROW_NOT_OK(sink_.Finish(&buffer));
- return buffer;
- }
-
- using TypedEncoder<DType>::Put;
-
- void Put(const T* buffer, int num_values) override;
-
- void Put(const ::arrow::Array& values) override;
-
- void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
- int64_t valid_bits_offset) override {
- if (valid_bits != NULLPTR) {
- PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
- this->memory_pool()));
- T* data = reinterpret_cast<T*>(buffer->mutable_data());
- int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
- src, num_values, valid_bits, valid_bits_offset, data);
- Put(data, num_valid_values);
- } else {
- Put(src, num_values);
- }
- }
-
- void UnsafePutByteArray(const void* data, uint32_t length) {
- DCHECK(length == 0 || data != nullptr) << "Value ptr cannot be NULL";
- sink_.UnsafeAppend(&length, sizeof(uint32_t));
- sink_.UnsafeAppend(data, static_cast<int64_t>(length));
- }
-
- void Put(const ByteArray& val) {
- // Write the result to the output stream
- const int64_t increment = static_cast<int64_t>(val.len + sizeof(uint32_t));
- if (ARROW_PREDICT_FALSE(sink_.length() + increment > sink_.capacity())) {
- PARQUET_THROW_NOT_OK(sink_.Reserve(increment));
- }
- UnsafePutByteArray(val.ptr, val.len);
- }
-
- protected:
- template <typename ArrayType>
- void PutBinaryArray(const ArrayType& array) {
- const int64_t total_bytes =
- array.value_offset(array.length()) - array.value_offset(0);
- PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes + array.length() * sizeof(uint32_t)));
-
- PARQUET_THROW_NOT_OK(::arrow::VisitArrayDataInline<typename ArrayType::TypeClass>(
- *array.data(),
- [&](::arrow::util::string_view view) {
- if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) {
- return Status::Invalid("Parquet cannot store strings with size 2GB or more");
- }
- UnsafePutByteArray(view.data(), static_cast<uint32_t>(view.size()));
- return Status::OK();
- },
- []() { return Status::OK(); }));
- }
-
- ::arrow::BufferBuilder sink_;
-};
-
-template <typename DType>
-void PlainEncoder<DType>::Put(const T* buffer, int num_values) {
- if (num_values > 0) {
- PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
- }
-}
-
-template <>
-inline void PlainEncoder<ByteArrayType>::Put(const ByteArray* src, int num_values) {
- for (int i = 0; i < num_values; ++i) {
- Put(src[i]);
- }
-}
-
-template <typename ArrayType>
-void DirectPutImpl(const ::arrow::Array& values, ::arrow::BufferBuilder* sink) {
- if (values.type_id() != ArrayType::TypeClass::type_id) {
- std::string type_name = ArrayType::TypeClass::type_name();
- throw ParquetException("direct put to " + type_name + " from " +
- values.type()->ToString() + " not supported");
- }
-
- using value_type = typename ArrayType::value_type;
- constexpr auto value_size = sizeof(value_type);
- auto raw_values = checked_cast<const ArrayType&>(values).raw_values();
-
- if (values.null_count() == 0) {
- // no nulls, just dump the data
- PARQUET_THROW_NOT_OK(sink->Append(raw_values, values.length() * value_size));
- } else {
- PARQUET_THROW_NOT_OK(
- sink->Reserve((values.length() - values.null_count()) * value_size));
-
- for (int64_t i = 0; i < values.length(); i++) {
- if (values.IsValid(i)) {
- sink->UnsafeAppend(&raw_values[i], value_size);
- }
- }
- }
-}
-
-template <>
-void PlainEncoder<Int32Type>::Put(const ::arrow::Array& values) {
- DirectPutImpl<::arrow::Int32Array>(values, &sink_);
-}
-
-template <>
-void PlainEncoder<Int64Type>::Put(const ::arrow::Array& values) {
- DirectPutImpl<::arrow::Int64Array>(values, &sink_);
-}
-
-template <>
-void PlainEncoder<Int96Type>::Put(const ::arrow::Array& values) {
- ParquetException::NYI("direct put to Int96");
-}
-
-template <>
-void PlainEncoder<FloatType>::Put(const ::arrow::Array& values) {
- DirectPutImpl<::arrow::FloatArray>(values, &sink_);
-}
-
-template <>
-void PlainEncoder<DoubleType>::Put(const ::arrow::Array& values) {
- DirectPutImpl<::arrow::DoubleArray>(values, &sink_);
-}
-
-template <typename DType>
-void PlainEncoder<DType>::Put(const ::arrow::Array& values) {
- ParquetException::NYI("direct put of " + values.type()->ToString());
-}
-
-void AssertBaseBinary(const ::arrow::Array& values) {
- if (!::arrow::is_base_binary_like(values.type_id())) {
- throw ParquetException("Only BaseBinaryArray and subclasses supported");
- }
-}
-
-template <>
-inline void PlainEncoder<ByteArrayType>::Put(const ::arrow::Array& values) {
- AssertBaseBinary(values);
-
- if (::arrow::is_binary_like(values.type_id())) {
- PutBinaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
- } else {
- DCHECK(::arrow::is_large_binary_like(values.type_id()));
- PutBinaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
- }
-}
-
-void AssertFixedSizeBinary(const ::arrow::Array& values, int type_length) {
- if (values.type_id() != ::arrow::Type::FIXED_SIZE_BINARY &&
- values.type_id() != ::arrow::Type::DECIMAL) {
- throw ParquetException("Only FixedSizeBinaryArray and subclasses supported");
- }
- if (checked_cast<const ::arrow::FixedSizeBinaryType&>(*values.type()).byte_width() !=
- type_length) {
- throw ParquetException("Size mismatch: " + values.type()->ToString() +
- " should have been " + std::to_string(type_length) + " wide");
- }
-}
-
-template <>
-inline void PlainEncoder<FLBAType>::Put(const ::arrow::Array& values) {
- AssertFixedSizeBinary(values, descr_->type_length());
- const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
-
- if (data.null_count() == 0) {
- // no nulls, just dump the data
- PARQUET_THROW_NOT_OK(
- sink_.Append(data.raw_values(), data.length() * data.byte_width()));
- } else {
- const int64_t total_bytes =
- data.length() * data.byte_width() - data.null_count() * data.byte_width();
- PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes));
- for (int64_t i = 0; i < data.length(); i++) {
- if (data.IsValid(i)) {
- sink_.UnsafeAppend(data.Value(i), data.byte_width());
- }
- }
- }
-}
-
-template <>
-inline void PlainEncoder<FLBAType>::Put(const FixedLenByteArray* src, int num_values) {
- if (descr_->type_length() == 0) {
- return;
- }
- for (int i = 0; i < num_values; ++i) {
- // Write the result to the output stream
- DCHECK(src[i].ptr != nullptr) << "Value ptr cannot be NULL";
- PARQUET_THROW_NOT_OK(sink_.Append(src[i].ptr, descr_->type_length()));
- }
-}
-
-template <>
-class PlainEncoder<BooleanType> : public EncoderImpl, virtual public BooleanEncoder {
- public:
- explicit PlainEncoder(const ColumnDescriptor* descr, MemoryPool* pool)
- : EncoderImpl(descr, Encoding::PLAIN, pool),
- bits_available_(kInMemoryDefaultCapacity * 8),
- bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)),
- sink_(pool),
- bit_writer_(bits_buffer_->mutable_data(),
- static_cast<int>(bits_buffer_->size())) {}
-
- int64_t EstimatedDataEncodedSize() override;
- std::shared_ptr<Buffer> FlushValues() override;
-
- void Put(const bool* src, int num_values) override;
-
- void Put(const std::vector<bool>& src, int num_values) override;
-
- void PutSpaced(const bool* src, int num_values, const uint8_t* valid_bits,
- int64_t valid_bits_offset) override {
- if (valid_bits != NULLPTR) {
- PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
- this->memory_pool()));
- T* data = reinterpret_cast<T*>(buffer->mutable_data());
- int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
- src, num_values, valid_bits, valid_bits_offset, data);
- Put(data, num_valid_values);
- } else {
- Put(src, num_values);
- }
- }
-
- void Put(const ::arrow::Array& values) override {
- if (values.type_id() != ::arrow::Type::BOOL) {
- throw ParquetException("direct put to boolean from " + values.type()->ToString() +
- " not supported");
- }
-
- const auto& data = checked_cast<const ::arrow::BooleanArray&>(values);
- if (data.null_count() == 0) {
- PARQUET_THROW_NOT_OK(sink_.Reserve(BitUtil::BytesForBits(data.length())));
- // no nulls, just dump the data
- ::arrow::internal::CopyBitmap(data.data()->GetValues<uint8_t>(1), data.offset(),
- data.length(), sink_.mutable_data(), sink_.length());
- } else {
- auto n_valid = BitUtil::BytesForBits(data.length() - data.null_count());
- PARQUET_THROW_NOT_OK(sink_.Reserve(n_valid));
- ::arrow::internal::FirstTimeBitmapWriter writer(sink_.mutable_data(),
- sink_.length(), n_valid);
-
- for (int64_t i = 0; i < data.length(); i++) {
- if (data.IsValid(i)) {
- if (data.Value(i)) {
- writer.Set();
- } else {
- writer.Clear();
- }
- writer.Next();
- }
- }
- writer.Finish();
- }
- sink_.UnsafeAdvance(data.length());
- }
-
- private:
- int bits_available_;
- std::shared_ptr<ResizableBuffer> bits_buffer_;
- ::arrow::BufferBuilder sink_;
- ::arrow::BitUtil::BitWriter bit_writer_;
-
- template <typename SequenceType>
- void PutImpl(const SequenceType& src, int num_values);
-};
-
-template <typename SequenceType>
-void PlainEncoder<BooleanType>::PutImpl(const SequenceType& src, int num_values) {
- int bit_offset = 0;
- if (bits_available_ > 0) {
- int bits_to_write = std::min(bits_available_, num_values);
- for (int i = 0; i < bits_to_write; i++) {
- bit_writer_.PutValue(src[i], 1);
- }
- bits_available_ -= bits_to_write;
- bit_offset = bits_to_write;
-
- if (bits_available_ == 0) {
- bit_writer_.Flush();
- PARQUET_THROW_NOT_OK(
- sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
- bit_writer_.Clear();
- }
- }
-
- int bits_remaining = num_values - bit_offset;
- while (bit_offset < num_values) {
- bits_available_ = static_cast<int>(bits_buffer_->size()) * 8;
-
- int bits_to_write = std::min(bits_available_, bits_remaining);
- for (int i = bit_offset; i < bit_offset + bits_to_write; i++) {
- bit_writer_.PutValue(src[i], 1);
- }
- bit_offset += bits_to_write;
- bits_available_ -= bits_to_write;
- bits_remaining -= bits_to_write;
-
- if (bits_available_ == 0) {
- bit_writer_.Flush();
- PARQUET_THROW_NOT_OK(
- sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
- bit_writer_.Clear();
- }
- }
-}
-
-int64_t PlainEncoder<BooleanType>::EstimatedDataEncodedSize() {
- int64_t position = sink_.length();
- return position + bit_writer_.bytes_written();
-}
-
-std::shared_ptr<Buffer> PlainEncoder<BooleanType>::FlushValues() {
- if (bits_available_ > 0) {
- bit_writer_.Flush();
- PARQUET_THROW_NOT_OK(sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
- bit_writer_.Clear();
- bits_available_ = static_cast<int>(bits_buffer_->size()) * 8;
- }
-
- std::shared_ptr<Buffer> buffer;
- PARQUET_THROW_NOT_OK(sink_.Finish(&buffer));
- return buffer;
-}
-
-void PlainEncoder<BooleanType>::Put(const bool* src, int num_values) {
- PutImpl(src, num_values);
-}
-
-void PlainEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
- PutImpl(src, num_values);
-}
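For orientation (a sketch under the assumption that `encoder` is a PlainEncoder<BooleanType> built elsewhere), PLAIN booleans are bit-packed one bit per value, least significant bit first:

    bool values[] = {true, false, true, true};
    encoder->Put(values, 4);
    std::shared_ptr<arrow::Buffer> buf = encoder->FlushValues();
    // Bits fill the byte LSB-first: 1,0,1,1 -> 0b00001101, so
    // buf->size() == 1 and buf->data()[0] == 0x0D.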
-
-// ----------------------------------------------------------------------
-// DictEncoder<T> implementations
-
-template <typename DType>
-struct DictEncoderTraits {
- using c_type = typename DType::c_type;
- using MemoTableType = ::arrow::internal::ScalarMemoTable<c_type>;
-};
-
-template <>
-struct DictEncoderTraits<ByteArrayType> {
- using MemoTableType = ::arrow::internal::BinaryMemoTable<::arrow::BinaryBuilder>;
-};
-
-template <>
-struct DictEncoderTraits<FLBAType> {
- using MemoTableType = ::arrow::internal::BinaryMemoTable<::arrow::BinaryBuilder>;
-};
-
-// Initially 1024 elements
-static constexpr int32_t kInitialHashTableSize = 1 << 10;
-
-/// See the dictionary encoding section of
-/// https://github.com/Parquet/parquet-format. The encoding supports
-/// streaming encoding. Values are encoded as they are added while the
-/// dictionary is being constructed. At any time, the buffered values
-/// can be written out with the current dictionary size. More values
-/// can then be added to the encoder, including new dictionary
-/// entries.
-template <typename DType>
-class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> {
- using MemoTableType = typename DictEncoderTraits<DType>::MemoTableType;
-
- public:
- typedef typename DType::c_type T;
-
- explicit DictEncoderImpl(const ColumnDescriptor* desc, MemoryPool* pool)
- : EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool),
- buffered_indices_(::arrow::stl::allocator<int32_t>(pool)),
- dict_encoded_size_(0),
- memo_table_(pool, kInitialHashTableSize) {}
-
- ~DictEncoderImpl() override { DCHECK(buffered_indices_.empty()); }
-
- int dict_encoded_size() override { return dict_encoded_size_; }
-
- int WriteIndices(uint8_t* buffer, int buffer_len) override {
- // Write bit width in first byte
- *buffer = static_cast<uint8_t>(bit_width());
- ++buffer;
- --buffer_len;
-
- ::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width());
-
- for (int32_t index : buffered_indices_) {
- if (!encoder.Put(index)) return -1;
- }
- encoder.Flush();
-
- ClearIndices();
- return 1 + encoder.len();
- }
-
- void set_type_length(int type_length) { this->type_length_ = type_length; }
-
- /// Returns a conservative estimate of the number of bytes needed to encode the buffered
- /// indices. Used to size the buffer passed to WriteIndices().
- int64_t EstimatedDataEncodedSize() override {
- // Note: because of the way RleEncoder::CheckBufferFull() is called, we have
- // to reserve an extra "RleEncoder::MinBufferSize" bytes. These extra bytes
- // won't be used but not reserving them would cause the encoder to fail.
- return 1 +
- ::arrow::util::RleEncoder::MaxBufferSize(
- bit_width(), static_cast<int>(buffered_indices_.size())) +
- ::arrow::util::RleEncoder::MinBufferSize(bit_width());
- }
-
- /// The minimum bit width required to encode the currently buffered indices.
- int bit_width() const override {
- if (ARROW_PREDICT_FALSE(num_entries() == 0)) return 0;
- if (ARROW_PREDICT_FALSE(num_entries() == 1)) return 1;
- return BitUtil::Log2(num_entries());
- }
-
- /// Encode value. Note that this does not actually write any data, just
- /// buffers the value's index to be written later.
- inline void Put(const T& value);
-
- // Not implemented for other data types
- inline void PutByteArray(const void* ptr, int32_t length);
-
- void Put(const T* src, int num_values) override {
- for (int32_t i = 0; i < num_values; i++) {
- Put(src[i]);
- }
- }
-
- void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
- int64_t valid_bits_offset) override {
- ::arrow::internal::VisitSetBitRunsVoid(valid_bits, valid_bits_offset, num_values,
- [&](int64_t position, int64_t length) {
- for (int64_t i = 0; i < length; i++) {
- Put(src[i + position]);
- }
- });
- }
-
- using TypedEncoder<DType>::Put;
-
- void Put(const ::arrow::Array& values) override;
- void PutDictionary(const ::arrow::Array& values) override;
-
- template <typename ArrowType, typename T = typename ArrowType::c_type>
- void PutIndicesTyped(const ::arrow::Array& data) {
- auto values = data.data()->GetValues<T>(1);
- size_t buffer_position = buffered_indices_.size();
- buffered_indices_.resize(buffer_position +
- static_cast<size_t>(data.length() - data.null_count()));
- ::arrow::internal::VisitSetBitRunsVoid(
- data.null_bitmap_data(), data.offset(), data.length(),
- [&](int64_t position, int64_t length) {
- for (int64_t i = 0; i < length; ++i) {
- buffered_indices_[buffer_position++] =
- static_cast<int32_t>(values[i + position]);
- }
- });
- }
-
- void PutIndices(const ::arrow::Array& data) override {
- switch (data.type()->id()) {
- case ::arrow::Type::UINT8:
- case ::arrow::Type::INT8:
- return PutIndicesTyped<::arrow::UInt8Type>(data);
- case ::arrow::Type::UINT16:
- case ::arrow::Type::INT16:
- return PutIndicesTyped<::arrow::UInt16Type>(data);
- case ::arrow::Type::UINT32:
- case ::arrow::Type::INT32:
- return PutIndicesTyped<::arrow::UInt32Type>(data);
- case ::arrow::Type::UINT64:
- case ::arrow::Type::INT64:
- return PutIndicesTyped<::arrow::UInt64Type>(data);
- default:
- throw ParquetException("Passed non-integer array to PutIndices");
- }
- }
-
- std::shared_ptr<Buffer> FlushValues() override {
- std::shared_ptr<ResizableBuffer> buffer =
- AllocateBuffer(this->pool_, EstimatedDataEncodedSize());
- int result_size = WriteIndices(buffer->mutable_data(),
- static_cast<int>(EstimatedDataEncodedSize()));
- PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false));
- return std::move(buffer);
- }
-
- /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
- /// dict_encoded_size() bytes.
- void WriteDict(uint8_t* buffer) override;
-
- /// The number of entries in the dictionary.
- int num_entries() const override { return memo_table_.size(); }
-
- private:
- /// Clears all the indices (but leaves the dictionary).
- void ClearIndices() { buffered_indices_.clear(); }
-
- /// Indices that have not yet be written out by WriteIndices().
- ArrowPoolVector<int32_t> buffered_indices_;
-
- template <typename ArrayType>
- void PutBinaryArray(const ArrayType& array) {
- PARQUET_THROW_NOT_OK(::arrow::VisitArrayDataInline<typename ArrayType::TypeClass>(
- *array.data(),
- [&](::arrow::util::string_view view) {
- if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) {
- return Status::Invalid("Parquet cannot store strings with size 2GB or more");
- }
- PutByteArray(view.data(), static_cast<uint32_t>(view.size()));
- return Status::OK();
- },
- []() { return Status::OK(); }));
- }
-
- template <typename ArrayType>
- void PutBinaryDictionaryArray(const ArrayType& array) {
- DCHECK_EQ(array.null_count(), 0);
- for (int64_t i = 0; i < array.length(); i++) {
- auto v = array.GetView(i);
- if (ARROW_PREDICT_FALSE(v.size() > kMaxByteArraySize)) {
- throw ParquetException("Parquet cannot store strings with size 2GB or more");
- }
- dict_encoded_size_ += static_cast<int>(v.size() + sizeof(uint32_t));
- int32_t unused_memo_index;
- PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(
- v.data(), static_cast<int32_t>(v.size()), &unused_memo_index));
- }
- }
-
- /// The number of bytes needed to encode the dictionary.
- int dict_encoded_size_;
-
- MemoTableType memo_table_;
-};
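A hedged sketch of the streaming flow the class comment describes; MakeTypedEncoder comes from parquet/encoding.h, and `descr` stands in for a real ColumnDescriptor:

    // Sketch: dictionary-encode three INT32 values, one of them repeated.
    auto base = parquet::MakeTypedEncoder<parquet::Int32Type>(
        parquet::Encoding::PLAIN, /*use_dictionary=*/true, descr);
    auto* encoder =
        dynamic_cast<parquet::DictEncoder<parquet::Int32Type>*>(base.get());

    int32_t values[] = {7, 8, 7};
    encoder->Put(values, 3);          // dictionary {7, 8}; indices 0, 1, 0

    std::vector<uint8_t> dict(encoder->dict_encoded_size());
    encoder->WriteDict(dict.data());  // 8 bytes: the two distinct int32 values

    std::shared_ptr<arrow::Buffer> idx = encoder->FlushValues();
    // idx = one bit-width byte followed by the RLE-encoded indices.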
-
-template <typename DType>
-void DictEncoderImpl<DType>::WriteDict(uint8_t* buffer) {
- // For primitive types, only a memcpy
- DCHECK_EQ(static_cast<size_t>(dict_encoded_size_), sizeof(T) * memo_table_.size());
- memo_table_.CopyValues(0 /* start_pos */, reinterpret_cast<T*>(buffer));
-}
-
-// ByteArray and FLBA already have the dictionary encoded in their data heaps
-template <>
-void DictEncoderImpl<ByteArrayType>::WriteDict(uint8_t* buffer) {
- memo_table_.VisitValues(0, [&buffer](const ::arrow::util::string_view& v) {
- uint32_t len = static_cast<uint32_t>(v.length());
- memcpy(buffer, &len, sizeof(len));
- buffer += sizeof(len);
- memcpy(buffer, v.data(), len);
- buffer += len;
- });
-}
-
-template <>
-void DictEncoderImpl<FLBAType>::WriteDict(uint8_t* buffer) {
- memo_table_.VisitValues(0, [&](const ::arrow::util::string_view& v) {
- DCHECK_EQ(v.length(), static_cast<size_t>(type_length_));
- memcpy(buffer, v.data(), type_length_);
- buffer += type_length_;
- });
-}
-
-template <typename DType>
-inline void DictEncoderImpl<DType>::Put(const T& v) {
- // Put() implementation for primitive types
- auto on_found = [](int32_t memo_index) {};
- auto on_not_found = [this](int32_t memo_index) {
- dict_encoded_size_ += static_cast<int>(sizeof(T));
- };
-
- int32_t memo_index;
- PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(v, on_found, on_not_found, &memo_index));
- buffered_indices_.push_back(memo_index);
-}
-
-template <typename DType>
-inline void DictEncoderImpl<DType>::PutByteArray(const void* ptr, int32_t length) {
- DCHECK(false);
-}
-
-template <>
-inline void DictEncoderImpl<ByteArrayType>::PutByteArray(const void* ptr,
- int32_t length) {
- static const uint8_t empty[] = {0};
-
- auto on_found = [](int32_t memo_index) {};
- auto on_not_found = [&](int32_t memo_index) {
- dict_encoded_size_ += static_cast<int>(length + sizeof(uint32_t));
- };
-
- DCHECK(ptr != nullptr || length == 0);
- ptr = (ptr != nullptr) ? ptr : empty;
- int32_t memo_index;
- PARQUET_THROW_NOT_OK(
- memo_table_.GetOrInsert(ptr, length, on_found, on_not_found, &memo_index));
- buffered_indices_.push_back(memo_index);
-}
-
-template <>
-inline void DictEncoderImpl<ByteArrayType>::Put(const ByteArray& val) {
- return PutByteArray(val.ptr, static_cast<int32_t>(val.len));
-}
-
-template <>
-inline void DictEncoderImpl<FLBAType>::Put(const FixedLenByteArray& v) {
- static const uint8_t empty[] = {0};
-
- auto on_found = [](int32_t memo_index) {};
- auto on_not_found = [this](int32_t memo_index) { dict_encoded_size_ += type_length_; };
-
- DCHECK(v.ptr != nullptr || type_length_ == 0);
- const void* ptr = (v.ptr != nullptr) ? v.ptr : empty;
- int32_t memo_index;
- PARQUET_THROW_NOT_OK(
- memo_table_.GetOrInsert(ptr, type_length_, on_found, on_not_found, &memo_index));
- buffered_indices_.push_back(memo_index);
-}
-
-template <>
-void DictEncoderImpl<Int96Type>::Put(const ::arrow::Array& values) {
- ParquetException::NYI("Direct put to Int96");
-}
-
-template <>
-void DictEncoderImpl<Int96Type>::PutDictionary(const ::arrow::Array& values) {
- ParquetException::NYI("Direct put to Int96");
-}
-
-template <typename DType>
-void DictEncoderImpl<DType>::Put(const ::arrow::Array& values) {
- using ArrayType = typename ::arrow::CTypeTraits<typename DType::c_type>::ArrayType;
- const auto& data = checked_cast<const ArrayType&>(values);
- if (data.null_count() == 0) {
- // no nulls, just dump the data
- for (int64_t i = 0; i < data.length(); i++) {
- Put(data.Value(i));
- }
- } else {
- for (int64_t i = 0; i < data.length(); i++) {
- if (data.IsValid(i)) {
- Put(data.Value(i));
- }
- }
- }
-}
-
-template <>
-void DictEncoderImpl<FLBAType>::Put(const ::arrow::Array& values) {
- AssertFixedSizeBinary(values, type_length_);
- const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
- if (data.null_count() == 0) {
- // no nulls, just dump the data
- for (int64_t i = 0; i < data.length(); i++) {
- Put(FixedLenByteArray(data.Value(i)));
- }
- } else {
- std::vector<uint8_t> empty(type_length_, 0);
- for (int64_t i = 0; i < data.length(); i++) {
- if (data.IsValid(i)) {
- Put(FixedLenByteArray(data.Value(i)));
- }
- }
- }
-}
-
-template <>
-void DictEncoderImpl<ByteArrayType>::Put(const ::arrow::Array& values) {
- AssertBaseBinary(values);
- if (::arrow::is_binary_like(values.type_id())) {
- PutBinaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
- } else {
- DCHECK(::arrow::is_large_binary_like(values.type_id()));
- PutBinaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
- }
-}
-
-template <typename DType>
-void AssertCanPutDictionary(DictEncoderImpl<DType>* encoder, const ::arrow::Array& dict) {
- if (dict.null_count() > 0) {
- throw ParquetException("Inserted dictionary cannot cannot contain nulls");
- }
-
- if (encoder->num_entries() > 0) {
- throw ParquetException("Can only call PutDictionary on an empty DictEncoder");
- }
-}
-
-template <typename DType>
-void DictEncoderImpl<DType>::PutDictionary(const ::arrow::Array& values) {
- AssertCanPutDictionary(this, values);
-
- using ArrayType = typename ::arrow::CTypeTraits<typename DType::c_type>::ArrayType;
- const auto& data = checked_cast<const ArrayType&>(values);
-
- dict_encoded_size_ += static_cast<int>(sizeof(typename DType::c_type) * data.length());
- for (int64_t i = 0; i < data.length(); i++) {
- int32_t unused_memo_index;
- PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(data.Value(i), &unused_memo_index));
- }
-}
-
-template <>
-void DictEncoderImpl<FLBAType>::PutDictionary(const ::arrow::Array& values) {
- AssertFixedSizeBinary(values, type_length_);
- AssertCanPutDictionary(this, values);
-
- const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
-
- dict_encoded_size_ += static_cast<int>(type_length_ * data.length());
- for (int64_t i = 0; i < data.length(); i++) {
- int32_t unused_memo_index;
- PARQUET_THROW_NOT_OK(
- memo_table_.GetOrInsert(data.Value(i), type_length_, &unused_memo_index));
- }
-}
-
-template <>
-void DictEncoderImpl<ByteArrayType>::PutDictionary(const ::arrow::Array& values) {
- AssertBaseBinary(values);
- AssertCanPutDictionary(this, values);
-
- if (::arrow::is_binary_like(values.type_id())) {
- PutBinaryDictionaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
- } else {
- DCHECK(::arrow::is_large_binary_like(values.type_id()));
- PutBinaryDictionaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
- }
-}
-
-// ----------------------------------------------------------------------
-// ByteStreamSplitEncoder<T> implementations
-
-template <typename DType>
-class ByteStreamSplitEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
- public:
- using T = typename DType::c_type;
- using TypedEncoder<DType>::Put;
-
- explicit ByteStreamSplitEncoder(
- const ColumnDescriptor* descr,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
- int64_t EstimatedDataEncodedSize() override;
- std::shared_ptr<Buffer> FlushValues() override;
-
- void Put(const T* buffer, int num_values) override;
- void Put(const ::arrow::Array& values) override;
- void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
- int64_t valid_bits_offset) override;
-
- protected:
- template <typename ArrowType>
- void PutImpl(const ::arrow::Array& values) {
- if (values.type_id() != ArrowType::type_id) {
- throw ParquetException(std::string() + "direct put to " + ArrowType::type_name() +
- " from " + values.type()->ToString() + " not supported");
- }
- const auto& data = *values.data();
- PutSpaced(data.GetValues<typename ArrowType::c_type>(1),
- static_cast<int>(data.length), data.GetValues<uint8_t>(0, 0), data.offset);
- }
-
- ::arrow::BufferBuilder sink_;
- int64_t num_values_in_buffer_;
-};
-
-template <typename DType>
-ByteStreamSplitEncoder<DType>::ByteStreamSplitEncoder(const ColumnDescriptor* descr,
- ::arrow::MemoryPool* pool)
- : EncoderImpl(descr, Encoding::BYTE_STREAM_SPLIT, pool),
- sink_{pool},
- num_values_in_buffer_{0} {}
-
-template <typename DType>
-int64_t ByteStreamSplitEncoder<DType>::EstimatedDataEncodedSize() {
- return sink_.length();
-}
-
-template <typename DType>
-std::shared_ptr<Buffer> ByteStreamSplitEncoder<DType>::FlushValues() {
- std::shared_ptr<ResizableBuffer> output_buffer =
- AllocateBuffer(this->memory_pool(), EstimatedDataEncodedSize());
- uint8_t* output_buffer_raw = output_buffer->mutable_data();
- const uint8_t* raw_values = sink_.data();
- ::arrow::util::internal::ByteStreamSplitEncode<T>(raw_values, num_values_in_buffer_,
- output_buffer_raw);
- sink_.Reset();
- num_values_in_buffer_ = 0;
- return std::move(output_buffer);
-}
-
-template <typename DType>
-void ByteStreamSplitEncoder<DType>::Put(const T* buffer, int num_values) {
- if (num_values > 0) {
- PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
- num_values_in_buffer_ += num_values;
- }
-}
-
-template <>
-void ByteStreamSplitEncoder<FloatType>::Put(const ::arrow::Array& values) {
- PutImpl<::arrow::FloatType>(values);
-}
-
-template <>
-void ByteStreamSplitEncoder<DoubleType>::Put(const ::arrow::Array& values) {
- PutImpl<::arrow::DoubleType>(values);
-}
-
-template <typename DType>
-void ByteStreamSplitEncoder<DType>::PutSpaced(const T* src, int num_values,
- const uint8_t* valid_bits,
- int64_t valid_bits_offset) {
- if (valid_bits != NULLPTR) {
- PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
- this->memory_pool()));
- T* data = reinterpret_cast<T*>(buffer->mutable_data());
- int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
- src, num_values, valid_bits, valid_bits_offset, data);
- Put(data, num_valid_values);
- } else {
- Put(src, num_values);
- }
-}
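The byte transposition FlushValues() performs above is easiest to see on two floats (a sketch; `encoder` is assumed to be a ByteStreamSplitEncoder<FloatType>):

    float values[] = {1.0f, 2.0f};   // LE bytes: 00 00 80 3F and 00 00 00 40
    encoder->Put(values, 2);
    std::shared_ptr<arrow::Buffer> buf = encoder->FlushValues();
    // All first bytes are emitted, then all second bytes, and so on:
    //   buf = 00 00 | 00 00 | 80 00 | 3F 40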
-
-class DecoderImpl : virtual public Decoder {
- public:
- void SetData(int num_values, const uint8_t* data, int len) override {
- num_values_ = num_values;
- data_ = data;
- len_ = len;
- }
-
- int values_left() const override { return num_values_; }
- Encoding::type encoding() const override { return encoding_; }
-
- protected:
- explicit DecoderImpl(const ColumnDescriptor* descr, Encoding::type encoding)
- : descr_(descr), encoding_(encoding), num_values_(0), data_(NULLPTR), len_(0) {}
-
- // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
- const ColumnDescriptor* descr_;
-
- const Encoding::type encoding_;
- int num_values_;
- const uint8_t* data_;
- int len_;
- int type_length_;
-};
-
-template <typename DType>
-class PlainDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
- public:
- using T = typename DType::c_type;
- explicit PlainDecoder(const ColumnDescriptor* descr);
-
- int Decode(T* buffer, int max_values) override;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::Accumulator* builder) override;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* builder) override;
-};
-
-template <>
-inline int PlainDecoder<Int96Type>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<Int96Type>::Accumulator* builder) {
- ParquetException::NYI("DecodeArrow not supported for Int96");
-}
-
-template <>
-inline int PlainDecoder<Int96Type>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<Int96Type>::DictAccumulator* builder) {
- ParquetException::NYI("DecodeArrow not supported for Int96");
-}
-
-template <>
-inline int PlainDecoder<BooleanType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
- ParquetException::NYI("dictionaries of BooleanType");
-}
-
-template <typename DType>
-int PlainDecoder<DType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<DType>::Accumulator* builder) {
- using value_type = typename DType::c_type;
-
- constexpr int value_size = static_cast<int>(sizeof(value_type));
- int values_decoded = num_values - null_count;
- if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
- ParquetException::EofException();
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- builder->UnsafeAppend(::arrow::util::SafeLoadAs<value_type>(data_));
- data_ += sizeof(value_type);
- },
- [&]() { builder->UnsafeAppendNull(); });
-
- num_values_ -= values_decoded;
- len_ -= sizeof(value_type) * values_decoded;
- return values_decoded;
-}
-
-template <typename DType>
-int PlainDecoder<DType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* builder) {
- using value_type = typename DType::c_type;
-
- constexpr int value_size = static_cast<int>(sizeof(value_type));
- int values_decoded = num_values - null_count;
- if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
- ParquetException::EofException();
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- PARQUET_THROW_NOT_OK(
- builder->Append(::arrow::util::SafeLoadAs<value_type>(data_)));
- data_ += sizeof(value_type);
- },
- [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
-
- num_values_ -= values_decoded;
- len_ -= sizeof(value_type) * values_decoded;
- return values_decoded;
-}
-
-// Decode routine templated on C++ type rather than type enum
-template <typename T>
-inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values,
- int type_length, T* out) {
- int64_t bytes_to_decode = num_values * static_cast<int64_t>(sizeof(T));
- if (bytes_to_decode > data_size || bytes_to_decode > INT_MAX) {
- ParquetException::EofException();
- }
- // If bytes_to_decode == 0, data could be null
- if (bytes_to_decode > 0) {
- memcpy(out, data, bytes_to_decode);
- }
- return static_cast<int>(bytes_to_decode);
-}
-
-template <typename DType>
-PlainDecoder<DType>::PlainDecoder(const ColumnDescriptor* descr)
- : DecoderImpl(descr, Encoding::PLAIN) {
- if (descr_ && descr_->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) {
- type_length_ = descr_->type_length();
- } else {
- type_length_ = -1;
- }
-}
-
-// Template specialization for BYTE_ARRAY. The written values do not own their
-// own data.
-
-static inline int64_t ReadByteArray(const uint8_t* data, int64_t data_size,
- ByteArray* out) {
- if (ARROW_PREDICT_FALSE(data_size < 4)) {
- ParquetException::EofException();
- }
- const int32_t len = ::arrow::util::SafeLoadAs<int32_t>(data);
- if (len < 0) {
- throw ParquetException("Invalid BYTE_ARRAY value");
- }
- const int64_t consumed_length = static_cast<int64_t>(len) + 4;
- if (ARROW_PREDICT_FALSE(data_size < consumed_length)) {
- ParquetException::EofException();
- }
- *out = ByteArray{static_cast<uint32_t>(len), data + 4};
- return consumed_length;
-}
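For reference, the PLAIN BYTE_ARRAY layout parsed above is a 4-byte little-endian length prefix followed by the raw bytes; a sketch (calling the file-local helper as if it were visible):

    const uint8_t data[] = {0x02, 0x00, 0x00, 0x00, 'h', 'i'};  // "hi"
    parquet::ByteArray out;
    int64_t consumed = ReadByteArray(data, sizeof(data), &out);
    // consumed == 6; out.len == 2; out.ptr aliases the payload, no copy.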
-
-template <>
-inline int DecodePlain<ByteArray>(const uint8_t* data, int64_t data_size, int num_values,
- int type_length, ByteArray* out) {
- int bytes_decoded = 0;
- for (int i = 0; i < num_values; ++i) {
- const auto increment = ReadByteArray(data, data_size, out + i);
- if (ARROW_PREDICT_FALSE(increment > INT_MAX - bytes_decoded)) {
- throw ParquetException("BYTE_ARRAY chunk too large");
- }
- data += increment;
- data_size -= increment;
- bytes_decoded += static_cast<int>(increment);
- }
- return bytes_decoded;
-}
-
-// Template specialization for FIXED_LEN_BYTE_ARRAY. The written values do not
-// own their own data.
-template <>
-inline int DecodePlain<FixedLenByteArray>(const uint8_t* data, int64_t data_size,
- int num_values, int type_length,
- FixedLenByteArray* out) {
- int64_t bytes_to_decode = static_cast<int64_t>(type_length) * num_values;
- if (bytes_to_decode > data_size || bytes_to_decode > INT_MAX) {
- ParquetException::EofException();
- }
- for (int i = 0; i < num_values; ++i) {
- out[i].ptr = data;
- data += type_length;
- data_size -= type_length;
- }
- return static_cast<int>(bytes_to_decode);
-}
-
-template <typename DType>
-int PlainDecoder<DType>::Decode(T* buffer, int max_values) {
- max_values = std::min(max_values, num_values_);
- int bytes_consumed = DecodePlain<T>(data_, len_, max_values, type_length_, buffer);
- data_ += bytes_consumed;
- len_ -= bytes_consumed;
- num_values_ -= max_values;
- return max_values;
-}
-
-class PlainBooleanDecoder : public DecoderImpl,
- virtual public TypedDecoder<BooleanType>,
- virtual public BooleanDecoder {
- public:
- explicit PlainBooleanDecoder(const ColumnDescriptor* descr);
- void SetData(int num_values, const uint8_t* data, int len) override;
-
- // Two flavors of bool decoding
- int Decode(uint8_t* buffer, int max_values) override;
- int Decode(bool* buffer, int max_values) override;
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<BooleanType>::Accumulator* out) override;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<BooleanType>::DictAccumulator* out) override;
-
- private:
- std::unique_ptr<::arrow::BitUtil::BitReader> bit_reader_;
-};
-
-PlainBooleanDecoder::PlainBooleanDecoder(const ColumnDescriptor* descr)
- : DecoderImpl(descr, Encoding::PLAIN) {}
-
-void PlainBooleanDecoder::SetData(int num_values, const uint8_t* data, int len) {
- num_values_ = num_values;
- bit_reader_.reset(new BitUtil::BitReader(data, len));
-}
-
-int PlainBooleanDecoder::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<BooleanType>::Accumulator* builder) {
- int values_decoded = num_values - null_count;
- if (ARROW_PREDICT_FALSE(num_values_ < values_decoded)) {
- ParquetException::EofException();
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- bool value;
- ARROW_IGNORE_EXPR(bit_reader_->GetValue(1, &value));
- builder->UnsafeAppend(value);
- },
- [&]() { builder->UnsafeAppendNull(); });
-
- num_values_ -= values_decoded;
- return values_decoded;
-}
-
-inline int PlainBooleanDecoder::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
- ParquetException::NYI("dictionaries of BooleanType");
-}
-
-int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) {
- max_values = std::min(max_values, num_values_);
- bool val;
- ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values);
- for (int i = 0; i < max_values; ++i) {
- if (!bit_reader_->GetValue(1, &val)) {
- ParquetException::EofException();
- }
- if (val) {
- bit_writer.Set();
- }
- bit_writer.Next();
- }
- bit_writer.Finish();
- num_values_ -= max_values;
- return max_values;
-}
-
-int PlainBooleanDecoder::Decode(bool* buffer, int max_values) {
- max_values = std::min(max_values, num_values_);
- if (bit_reader_->GetBatch(1, buffer, max_values) != max_values) {
- ParquetException::EofException();
- }
- num_values_ -= max_values;
- return max_values;
-}
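-
-// Illustration (not part of the decoder): PLAIN booleans are bit-packed
-// LSB-first, one bit per value, so the eight values T,F,T,T,F,F,F,T occupy
-// the single byte 0b10001101 = 0x8D. Decode(bool*) above just reads them
-// back one bit at a time through BitReader::GetBatch.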
-
-struct ArrowBinaryHelper {
- explicit ArrowBinaryHelper(typename EncodingTraits<ByteArrayType>::Accumulator* out) {
- this->out = out;
- this->builder = out->builder.get();
- this->chunk_space_remaining =
- ::arrow::kBinaryMemoryLimit - this->builder->value_data_length();
- }
-
- Status PushChunk() {
- std::shared_ptr<::arrow::Array> result;
- RETURN_NOT_OK(builder->Finish(&result));
- out->chunks.push_back(result);
- chunk_space_remaining = ::arrow::kBinaryMemoryLimit;
- return Status::OK();
- }
-
- bool CanFit(int64_t length) const { return length <= chunk_space_remaining; }
-
- void UnsafeAppend(const uint8_t* data, int32_t length) {
- chunk_space_remaining -= length;
- builder->UnsafeAppend(data, length);
- }
-
- void UnsafeAppendNull() { builder->UnsafeAppendNull(); }
-
- Status Append(const uint8_t* data, int32_t length) {
- chunk_space_remaining -= length;
- return builder->Append(data, length);
- }
-
- Status AppendNull() { return builder->AppendNull(); }
-
- typename EncodingTraits<ByteArrayType>::Accumulator* out;
- ::arrow::BinaryBuilder* builder;
- int64_t chunk_space_remaining;
-};
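-
-// Sketch of why ArrowBinaryHelper chunks its output (assuming the ~2 GiB cap
-// on an Arrow BinaryArray's value buffer that ::arrow::kBinaryMemoryLimit
-// holds): once appending a value would push value_data_length() past the
-// limit, the current builder is finished into out->chunks and a fresh one is
-// started, e.g.
-//
-//   if (!helper.CanFit(val.len)) {
-//     RETURN_NOT_OK(helper.PushChunk());  // seal the chunk, reset space
-//   }
-//   RETURN_NOT_OK(helper.Append(val.ptr, val.len));
-//
-// so a column with more than 2 GiB of string data decodes into several chunks.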
-
-template <>
-inline int PlainDecoder<ByteArrayType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* builder) {
- ParquetException::NYI();
-}
-
-template <>
-inline int PlainDecoder<ByteArrayType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::DictAccumulator* builder) {
- ParquetException::NYI();
-}
-
-template <>
-inline int PlainDecoder<FLBAType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<FLBAType>::Accumulator* builder) {
- int values_decoded = num_values - null_count;
- if (ARROW_PREDICT_FALSE(len_ < descr_->type_length() * values_decoded)) {
- ParquetException::EofException();
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- builder->UnsafeAppend(data_);
- data_ += descr_->type_length();
- },
- [&]() { builder->UnsafeAppendNull(); });
-
- num_values_ -= values_decoded;
- len_ -= descr_->type_length() * values_decoded;
- return values_decoded;
-}
-
-template <>
-inline int PlainDecoder<FLBAType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<FLBAType>::DictAccumulator* builder) {
- int values_decoded = num_values - null_count;
- if (ARROW_PREDICT_FALSE(len_ < descr_->type_length() * values_decoded)) {
- ParquetException::EofException();
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- PARQUET_THROW_NOT_OK(builder->Append(data_));
- data_ += descr_->type_length();
- },
- [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
-
- num_values_ -= values_decoded;
- len_ -= descr_->type_length() * values_decoded;
- return values_decoded;
-}
-
-class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
- virtual public ByteArrayDecoder {
- public:
- using Base = PlainDecoder<ByteArrayType>;
- using Base::DecodeSpaced;
- using Base::PlainDecoder;
-
- // ----------------------------------------------------------------------
- // Dictionary read paths
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- ::arrow::BinaryDictionary32Builder* builder) override {
- int result = 0;
- PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
- valid_bits_offset, builder, &result));
- return result;
- }
-
- // ----------------------------------------------------------------------
- // Optimized dense binary read paths
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
- int result = 0;
- PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
- valid_bits_offset, out, &result));
- return result;
- }
-
- private:
- Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* out,
- int* out_values_decoded) {
- ArrowBinaryHelper helper(out);
- int values_decoded = 0;
-
- RETURN_NOT_OK(helper.builder->Reserve(num_values));
- RETURN_NOT_OK(helper.builder->ReserveData(
- std::min<int64_t>(len_, helper.chunk_space_remaining)));
-
- int i = 0;
- RETURN_NOT_OK(VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- if (ARROW_PREDICT_FALSE(len_ < 4)) {
- ParquetException::EofException();
- }
- auto value_len = ::arrow::util::SafeLoadAs<int32_t>(data_);
- if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) {
- return Status::Invalid("Invalid or corrupted value_len '", value_len, "'");
- }
- auto increment = value_len + 4;
- if (ARROW_PREDICT_FALSE(len_ < increment)) {
- ParquetException::EofException();
- }
- if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) {
- // This element would exceed the capacity of a chunk
- RETURN_NOT_OK(helper.PushChunk());
- RETURN_NOT_OK(helper.builder->Reserve(num_values - i));
- RETURN_NOT_OK(helper.builder->ReserveData(
- std::min<int64_t>(len_, helper.chunk_space_remaining)));
- }
- helper.UnsafeAppend(data_ + 4, value_len);
- data_ += increment;
- len_ -= increment;
- ++values_decoded;
- ++i;
- return Status::OK();
- },
- [&]() {
- helper.UnsafeAppendNull();
- ++i;
- return Status::OK();
- }));
-
- num_values_ -= values_decoded;
- *out_values_decoded = values_decoded;
- return Status::OK();
- }
-
- template <typename BuilderType>
- Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset, BuilderType* builder,
- int* out_values_decoded) {
- RETURN_NOT_OK(builder->Reserve(num_values));
- int values_decoded = 0;
-
- RETURN_NOT_OK(VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- if (ARROW_PREDICT_FALSE(len_ < 4)) {
- ParquetException::EofException();
- }
- auto value_len = ::arrow::util::SafeLoadAs<int32_t>(data_);
- if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) {
- return Status::Invalid("Invalid or corrupted value_len '", value_len, "'");
- }
- auto increment = value_len + 4;
- if (ARROW_PREDICT_FALSE(len_ < increment)) {
- ParquetException::EofException();
- }
- RETURN_NOT_OK(builder->Append(data_ + 4, value_len));
- data_ += increment;
- len_ -= increment;
- ++values_decoded;
- return Status::OK();
- },
- [&]() { return builder->AppendNull(); }));
-
- num_values_ -= values_decoded;
- *out_values_decoded = values_decoded;
- return Status::OK();
- }
-};
-
-class PlainFLBADecoder : public PlainDecoder<FLBAType>, virtual public FLBADecoder {
- public:
- using Base = PlainDecoder<FLBAType>;
- using Base::PlainDecoder;
-};
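-
-// For illustration: PLAIN FIXED_LEN_BYTE_ARRAY carries no per-value length
-// prefix; values are concatenated at the width given by the column
-// descriptor. With type_length = 3, the values "abc" and "xyz" encode as the
-// six bytes 'a' 'b' 'c' 'x' 'y' 'z', and DecodePlain<FixedLenByteArray>
-// above just plants pointers into that buffer at 3-byte strides.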
-
-// ----------------------------------------------------------------------
-// Dictionary encoding and decoding
-
-template <typename Type>
-class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
- public:
- typedef typename Type::c_type T;
-
- // Initializes the dictionary with values from 'dictionary'. The data in
- // dictionary is not guaranteed to persist in memory after this call so the
- // dictionary decoder needs to copy the data out if necessary.
- explicit DictDecoderImpl(const ColumnDescriptor* descr,
- MemoryPool* pool = ::arrow::default_memory_pool())
- : DecoderImpl(descr, Encoding::RLE_DICTIONARY),
- dictionary_(AllocateBuffer(pool, 0)),
- dictionary_length_(0),
- byte_array_data_(AllocateBuffer(pool, 0)),
- byte_array_offsets_(AllocateBuffer(pool, 0)),
- indices_scratch_space_(AllocateBuffer(pool, 0)) {}
-
- // Perform type-specific initialization
- void SetDict(TypedDecoder<Type>* dictionary) override;
-
- void SetData(int num_values, const uint8_t* data, int len) override {
- num_values_ = num_values;
- if (len == 0) {
- // Initialize dummy decoder to avoid crashes later on
- idx_decoder_ = ::arrow::util::RleDecoder(data, len, /*bit_width=*/1);
- return;
- }
- uint8_t bit_width = *data;
- if (ARROW_PREDICT_FALSE(bit_width >= 64)) {
- throw ParquetException("Invalid or corrupted bit_width");
- }
- idx_decoder_ = ::arrow::util::RleDecoder(++data, --len, bit_width);
- }
-
- int Decode(T* buffer, int num_values) override {
- num_values = std::min(num_values, num_values_);
- int decoded_values =
- idx_decoder_.GetBatchWithDict(reinterpret_cast<const T*>(dictionary_->data()),
- dictionary_length_, buffer, num_values);
- if (decoded_values != num_values) {
- ParquetException::EofException();
- }
- num_values_ -= num_values;
- return num_values;
- }
-
- int DecodeSpaced(T* buffer, int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset) override {
- num_values = std::min(num_values, num_values_);
- if (num_values != idx_decoder_.GetBatchWithDictSpaced(
- reinterpret_cast<const T*>(dictionary_->data()),
- dictionary_length_, buffer, num_values, null_count, valid_bits,
- valid_bits_offset)) {
- ParquetException::EofException();
- }
- num_values_ -= num_values;
- return num_values;
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<Type>::Accumulator* out) override;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<Type>::DictAccumulator* out) override;
-
- void InsertDictionary(::arrow::ArrayBuilder* builder) override;
-
- int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- ::arrow::ArrayBuilder* builder) override {
- if (num_values > 0) {
- // TODO(wesm): Refactor to batch reads for improved memory use. It is not
- // trivial because the null_count is relative to the entire bitmap
- PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
- num_values, /*shrink_to_fit=*/false));
- }
-
- auto indices_buffer =
- reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
-
- if (num_values != idx_decoder_.GetBatchSpaced(num_values, null_count, valid_bits,
- valid_bits_offset, indices_buffer)) {
- ParquetException::EofException();
- }
-
- /// XXX(wesm): Cannot append "valid bits" directly to the builder
- std::vector<uint8_t> valid_bytes(num_values);
- ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
- for (int64_t i = 0; i < num_values; ++i) {
- valid_bytes[i] = static_cast<uint8_t>(bit_reader.IsSet());
- bit_reader.Next();
- }
-
- auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
- PARQUET_THROW_NOT_OK(
- binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data()));
- num_values_ -= num_values - null_count;
- return num_values - null_count;
- }
-
- int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) override {
- num_values = std::min(num_values, num_values_);
- if (num_values > 0) {
- // TODO(wesm): Refactor to batch reads for improved memory use. This is
- // relatively simple here because we don't have to do any bookkeeping of
- // nulls
- PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
- num_values, /*shrink_to_fit=*/false));
- }
- auto indices_buffer =
- reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
- if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) {
- ParquetException::EofException();
- }
- auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
- PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values));
- num_values_ -= num_values;
- return num_values;
- }
-
- int DecodeIndices(int num_values, int32_t* indices) override {
- if (num_values != idx_decoder_.GetBatch(indices, num_values)) {
- ParquetException::EofException();
- }
- num_values_ -= num_values;
- return num_values;
- }
-
- void GetDictionary(const T** dictionary, int32_t* dictionary_length) override {
- *dictionary_length = dictionary_length_;
- *dictionary = reinterpret_cast<T*>(dictionary_->mutable_data());
- }
-
- protected:
- Status IndexInBounds(int32_t index) {
- if (ARROW_PREDICT_TRUE(0 <= index && index < dictionary_length_)) {
- return Status::OK();
- }
- return Status::Invalid("Index not in dictionary bounds");
- }
-
- inline void DecodeDict(TypedDecoder<Type>* dictionary) {
- dictionary_length_ = static_cast<int32_t>(dictionary->values_left());
- PARQUET_THROW_NOT_OK(dictionary_->Resize(dictionary_length_ * sizeof(T),
- /*shrink_to_fit=*/false));
- dictionary->Decode(reinterpret_cast<T*>(dictionary_->mutable_data()),
- dictionary_length_);
- }
-
- // The decoded dictionary values. For BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY
- // columns the ByteArray/FLBA structs stored here point into
- // byte_array_data_ below; for other types the values are stored inline.
- std::shared_ptr<ResizableBuffer> dictionary_;
-
- int32_t dictionary_length_;
-
- // Buffer that owns the actual byte array data (the ByteArray structs in
- // dictionary_ above just hold pointers into it).
- std::shared_ptr<ResizableBuffer> byte_array_data_;
-
- // Arrow-style byte offsets for each dictionary value. We maintain two
- // representations of the dictionary, one as ByteArray* for non-Arrow
- // consumers and this one for Arrow consumers. Since dictionaries are
- // generally pretty small to begin with this doesn't mean too much extra
- // memory use in most cases
- std::shared_ptr<ResizableBuffer> byte_array_offsets_;
-
- // Reusable buffer for decoding dictionary indices to be appended to a
- // BinaryDictionary32Builder
- std::shared_ptr<ResizableBuffer> indices_scratch_space_;
-
- ::arrow::util::RleDecoder idx_decoder_;
-};
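-
-// Sketch of the RLE_DICTIONARY data page that SetData above consumes (per
-// the Parquet spec, shown only to make the code concrete): the first byte is
-// the index bit width, and the rest is an RLE/bit-packed hybrid stream of
-// dictionary indices. With a 3-value dictionary (bit width 2), one hundred
-// repetitions of index 1 encode roughly as
-//
-//   02        <- bit width
-//   C8 01     <- RLE run header: varint(100 << 1)
-//   01        <- the repeated 2-bit index, padded to a byte
-//
-// which GetBatchWithDict then expands through the dictionary in one pass.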
-
-template <typename Type>
-void DictDecoderImpl<Type>::SetDict(TypedDecoder<Type>* dictionary) {
- DecodeDict(dictionary);
-}
-
-template <>
-void DictDecoderImpl<BooleanType>::SetDict(TypedDecoder<BooleanType>* dictionary) {
- ParquetException::NYI("Dictionary encoding is not implemented for boolean values");
-}
-
-template <>
-void DictDecoderImpl<ByteArrayType>::SetDict(TypedDecoder<ByteArrayType>* dictionary) {
- DecodeDict(dictionary);
-
- auto dict_values = reinterpret_cast<ByteArray*>(dictionary_->mutable_data());
-
- int total_size = 0;
- for (int i = 0; i < dictionary_length_; ++i) {
- total_size += dict_values[i].len;
- }
- PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
- /*shrink_to_fit=*/false));
- PARQUET_THROW_NOT_OK(
- byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t),
- /*shrink_to_fit=*/false));
-
- int32_t offset = 0;
- uint8_t* bytes_data = byte_array_data_->mutable_data();
- int32_t* bytes_offsets =
- reinterpret_cast<int32_t*>(byte_array_offsets_->mutable_data());
- for (int i = 0; i < dictionary_length_; ++i) {
- memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len);
- bytes_offsets[i] = offset;
- dict_values[i].ptr = bytes_data + offset;
- offset += dict_values[i].len;
- }
- bytes_offsets[dictionary_length_] = offset;
-}
-
-template <>
-inline void DictDecoderImpl<FLBAType>::SetDict(TypedDecoder<FLBAType>* dictionary) {
- DecodeDict(dictionary);
-
- auto dict_values = reinterpret_cast<FLBA*>(dictionary_->mutable_data());
-
- int fixed_len = descr_->type_length();
- int total_size = dictionary_length_ * fixed_len;
-
- PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
- /*shrink_to_fit=*/false));
- uint8_t* bytes_data = byte_array_data_->mutable_data();
- for (int32_t i = 0, offset = 0; i < dictionary_length_; ++i, offset += fixed_len) {
- memcpy(bytes_data + offset, dict_values[i].ptr, fixed_len);
- dict_values[i].ptr = bytes_data + offset;
- }
-}
-
-template <>
-inline int DictDecoderImpl<Int96Type>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<Int96Type>::Accumulator* builder) {
- ParquetException::NYI("DecodeArrow to Int96Type");
-}
-
-template <>
-inline int DictDecoderImpl<Int96Type>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<Int96Type>::DictAccumulator* builder) {
- ParquetException::NYI("DecodeArrow to Int96Type");
-}
-
-template <>
-inline int DictDecoderImpl<ByteArrayType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* builder) {
- ParquetException::NYI("DecodeArrow implemented elsewhere");
-}
-
-template <>
-inline int DictDecoderImpl<ByteArrayType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::DictAccumulator* builder) {
- ParquetException::NYI("DecodeArrow implemented elsewhere");
-}
-
-template <typename DType>
-int DictDecoderImpl<DType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* builder) {
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- auto dict_values = reinterpret_cast<const typename DType::c_type*>(dictionary_->data());
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- int32_t index;
- if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
- throw ParquetException("");
- }
- PARQUET_THROW_NOT_OK(IndexInBounds(index));
- PARQUET_THROW_NOT_OK(builder->Append(dict_values[index]));
- },
- [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
-
- return num_values - null_count;
-}
-
-template <>
-int DictDecoderImpl<BooleanType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
- ParquetException::NYI("No dictionary encoding for BooleanType");
-}
-
-template <>
-inline int DictDecoderImpl<FLBAType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<FLBAType>::Accumulator* builder) {
- if (builder->byte_width() != descr_->type_length()) {
- throw ParquetException("Byte width mismatch: builder was " +
- std::to_string(builder->byte_width()) + " but decoder was " +
- std::to_string(descr_->type_length()));
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- auto dict_values = reinterpret_cast<const FLBA*>(dictionary_->data());
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- int32_t index;
- if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
- throw ParquetException("");
- }
- PARQUET_THROW_NOT_OK(IndexInBounds(index));
- builder->UnsafeAppend(dict_values[index].ptr);
- },
- [&]() { builder->UnsafeAppendNull(); });
-
- return num_values - null_count;
-}
-
-template <>
-int DictDecoderImpl<FLBAType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<FLBAType>::DictAccumulator* builder) {
- auto value_type =
- checked_cast<const ::arrow::DictionaryType&>(*builder->type()).value_type();
- auto byte_width =
- checked_cast<const ::arrow::FixedSizeBinaryType&>(*value_type).byte_width();
- if (byte_width != descr_->type_length()) {
- throw ParquetException("Byte width mismatch: builder was " +
- std::to_string(byte_width) + " but decoder was " +
- std::to_string(descr_->type_length()));
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- auto dict_values = reinterpret_cast<const FLBA*>(dictionary_->data());
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- int32_t index;
- if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
- throw ParquetException("");
- }
- PARQUET_THROW_NOT_OK(IndexInBounds(index));
- PARQUET_THROW_NOT_OK(builder->Append(dict_values[index].ptr));
- },
- [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
-
- return num_values - null_count;
-}
-
-template <typename Type>
-int DictDecoderImpl<Type>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<Type>::Accumulator* builder) {
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- using value_type = typename Type::c_type;
- auto dict_values = reinterpret_cast<const value_type*>(dictionary_->data());
-
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- int32_t index;
- if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
- throw ParquetException("");
- }
- PARQUET_THROW_NOT_OK(IndexInBounds(index));
- builder->UnsafeAppend(dict_values[index]);
- },
- [&]() { builder->UnsafeAppendNull(); });
-
- return num_values - null_count;
-}
-
-template <typename Type>
-void DictDecoderImpl<Type>::InsertDictionary(::arrow::ArrayBuilder* builder) {
- ParquetException::NYI("InsertDictionary only implemented for BYTE_ARRAY types");
-}
-
-template <>
-void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* builder) {
- auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
-
- // Make a BinaryArray referencing the internal dictionary data
- auto arr = std::make_shared<::arrow::BinaryArray>(
- dictionary_length_, byte_array_offsets_, byte_array_data_);
- PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr));
-}
-
-class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
- virtual public ByteArrayDecoder {
- public:
- using BASE = DictDecoderImpl<ByteArrayType>;
- using BASE::DictDecoderImpl;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- ::arrow::BinaryDictionary32Builder* builder) override {
- int result = 0;
- if (null_count == 0) {
- PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result));
- } else {
- PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
- valid_bits_offset, builder, &result));
- }
- return result;
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
- int result = 0;
- if (null_count == 0) {
- PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result));
- } else {
- PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
- valid_bits_offset, out, &result));
- }
- return result;
- }
-
- private:
- Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* out,
- int* out_num_values) {
- constexpr int32_t kBufferSize = 1024;
- int32_t indices[kBufferSize];
-
- ArrowBinaryHelper helper(out);
-
- ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
-
- auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
- int values_decoded = 0;
- int num_appended = 0;
- while (num_appended < num_values) {
- bool is_valid = bit_reader.IsSet();
- bit_reader.Next();
-
- if (is_valid) {
- int32_t batch_size =
- std::min<int32_t>(kBufferSize, num_values - num_appended - null_count);
- int num_indices = idx_decoder_.GetBatch(indices, batch_size);
-
- if (ARROW_PREDICT_FALSE(num_indices < 1)) {
- return Status::Invalid("Invalid number of indices '", num_indices, "'");
- }
-
- int i = 0;
- while (true) {
- // Consume all indices
- if (is_valid) {
- auto idx = indices[i];
- RETURN_NOT_OK(IndexInBounds(idx));
- const auto& val = dict_values[idx];
- if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) {
- RETURN_NOT_OK(helper.PushChunk());
- }
- RETURN_NOT_OK(helper.Append(val.ptr, static_cast<int32_t>(val.len)));
- ++i;
- ++values_decoded;
- } else {
- RETURN_NOT_OK(helper.AppendNull());
- --null_count;
- }
- ++num_appended;
- if (i == num_indices) {
- // Do not advance the bit_reader if we have fulfilled the decode
- // request
- break;
- }
- is_valid = bit_reader.IsSet();
- bit_reader.Next();
- }
- } else {
- RETURN_NOT_OK(helper.AppendNull());
- --null_count;
- ++num_appended;
- }
- }
- *out_num_values = values_decoded;
- return Status::OK();
- }
-
- Status DecodeArrowDenseNonNull(int num_values,
- typename EncodingTraits<ByteArrayType>::Accumulator* out,
- int* out_num_values) {
- constexpr int32_t kBufferSize = 2048;
- int32_t indices[kBufferSize];
- int values_decoded = 0;
-
- ArrowBinaryHelper helper(out);
- auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
-
- while (values_decoded < num_values) {
- int32_t batch_size = std::min<int32_t>(kBufferSize, num_values - values_decoded);
- int num_indices = idx_decoder_.GetBatch(indices, batch_size);
- if (num_indices == 0) ParquetException::EofException();
- for (int i = 0; i < num_indices; ++i) {
- auto idx = indices[i];
- RETURN_NOT_OK(IndexInBounds(idx));
- const auto& val = dict_values[idx];
- if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) {
- RETURN_NOT_OK(helper.PushChunk());
- }
- RETURN_NOT_OK(helper.Append(val.ptr, static_cast<int32_t>(val.len)));
- }
- values_decoded += num_indices;
- }
- *out_num_values = values_decoded;
- return Status::OK();
- }
-
- template <typename BuilderType>
- Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset, BuilderType* builder,
- int* out_num_values) {
- constexpr int32_t kBufferSize = 1024;
- int32_t indices[kBufferSize];
-
- RETURN_NOT_OK(builder->Reserve(num_values));
- ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
-
- auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
-
- int values_decoded = 0;
- int num_appended = 0;
- while (num_appended < num_values) {
- bool is_valid = bit_reader.IsSet();
- bit_reader.Next();
-
- if (is_valid) {
- int32_t batch_size =
- std::min<int32_t>(kBufferSize, num_values - num_appended - null_count);
- int num_indices = idx_decoder_.GetBatch(indices, batch_size);
-
- int i = 0;
- while (true) {
- // Consume all indices
- if (is_valid) {
- auto idx = indices[i];
- RETURN_NOT_OK(IndexInBounds(idx));
- const auto& val = dict_values[idx];
- RETURN_NOT_OK(builder->Append(val.ptr, val.len));
- ++i;
- ++values_decoded;
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- --null_count;
- }
- ++num_appended;
- if (i == num_indices) {
- // Do not advance the bit_reader if we have fulfilled the decode
- // request
- break;
- }
- is_valid = bit_reader.IsSet();
- bit_reader.Next();
- }
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- --null_count;
- ++num_appended;
- }
- }
- *out_num_values = values_decoded;
- return Status::OK();
- }
-
- template <typename BuilderType>
- Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) {
- constexpr int32_t kBufferSize = 2048;
- int32_t indices[kBufferSize];
-
- RETURN_NOT_OK(builder->Reserve(num_values));
-
- auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
-
- int values_decoded = 0;
- while (values_decoded < num_values) {
- int32_t batch_size = std::min<int32_t>(kBufferSize, num_values - values_decoded);
- int num_indices = idx_decoder_.GetBatch(indices, batch_size);
- if (num_indices == 0) ParquetException::EofException();
- for (int i = 0; i < num_indices; ++i) {
- auto idx = indices[i];
- RETURN_NOT_OK(IndexInBounds(idx));
- const auto& val = dict_values[idx];
- RETURN_NOT_OK(builder->Append(val.ptr, val.len));
- }
- values_decoded += num_indices;
- }
- *out_num_values = values_decoded;
- return Status::OK();
- }
-};
-
-// ----------------------------------------------------------------------
-// DeltaBitPackDecoder
-
-template <typename DType>
-class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
- public:
- typedef typename DType::c_type T;
-
- explicit DeltaBitPackDecoder(const ColumnDescriptor* descr,
- MemoryPool* pool = ::arrow::default_memory_pool())
- : DecoderImpl(descr, Encoding::DELTA_BINARY_PACKED), pool_(pool) {
- if (DType::type_num != Type::INT32 && DType::type_num != Type::INT64) {
- throw ParquetException("Delta bit pack encoding should only be for integer data.");
- }
- }
-
- void SetData(int num_values, const uint8_t* data, int len) override {
- this->num_values_ = num_values;
- decoder_ = ::arrow::BitUtil::BitReader(data, len);
- values_current_block_ = 0;
- values_current_mini_block_ = 0;
- }
-
- int Decode(T* buffer, int max_values) override {
- return GetInternal(buffer, max_values);
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::Accumulator* out) override {
- if (null_count != 0) {
- ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
- }
- std::vector<T> values(num_values);
- GetInternal(values.data(), num_values);
- PARQUET_THROW_NOT_OK(out->AppendValues(values));
- return num_values;
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* out) override {
- if (null_count != 0) {
- ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
- }
- std::vector<T> values(num_values);
- GetInternal(values.data(), num_values);
- PARQUET_THROW_NOT_OK(out->Reserve(num_values));
- for (T value : values) {
- PARQUET_THROW_NOT_OK(out->Append(value));
- }
- return num_values;
- }
-
- private:
- void InitBlock() {
- // The number of values per block.
- uint32_t block_size;
- if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
- if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException();
- if (!decoder_.GetVlqInt(&values_current_block_)) {
- ParquetException::EofException();
- }
- if (!decoder_.GetZigZagVlqInt(&last_value_)) ParquetException::EofException();
-
- delta_bit_widths_ = AllocateBuffer(pool_, num_mini_blocks_);
- uint8_t* bit_width_data = delta_bit_widths_->mutable_data();
-
- if (!decoder_.GetZigZagVlqInt(&min_delta_)) ParquetException::EofException();
- for (uint32_t i = 0; i < num_mini_blocks_; ++i) {
- if (!decoder_.GetAligned<uint8_t>(1, bit_width_data + i)) {
- ParquetException::EofException();
- }
- }
- values_per_mini_block_ = block_size / num_mini_blocks_;
- mini_block_idx_ = 0;
- delta_bit_width_ = bit_width_data[0];
- values_current_mini_block_ = values_per_mini_block_;
- }
-
- template <typename T>
- int GetInternal(T* buffer, int max_values) {
- max_values = std::min(max_values, this->num_values_);
- const uint8_t* bit_width_data = delta_bit_widths_->data();
- for (int i = 0; i < max_values; ++i) {
- if (ARROW_PREDICT_FALSE(values_current_mini_block_ == 0)) {
- ++mini_block_idx_;
- if (mini_block_idx_ < static_cast<size_t>(delta_bit_widths_->size())) {
- delta_bit_width_ = bit_width_data[mini_block_idx_];
- values_current_mini_block_ = values_per_mini_block_;
- } else {
- InitBlock();
- buffer[i] = last_value_;
- continue;
- }
- }
-
- // TODO: for speed, decode an entire mini block at once here instead of
- // one value per GetValue() call.
- int64_t delta;
- if (!decoder_.GetValue(delta_bit_width_, &delta)) ParquetException::EofException();
- delta += min_delta_;
- last_value_ += static_cast<int32_t>(delta);
- buffer[i] = last_value_;
- --values_current_mini_block_;
- }
- this->num_values_ -= max_values;
- return max_values;
- }
-
- MemoryPool* pool_;
- ::arrow::BitUtil::BitReader decoder_;
- uint32_t values_current_block_;
- uint32_t num_mini_blocks_;
- uint64_t values_per_mini_block_;
- uint64_t values_current_mini_block_;
-
- int32_t min_delta_;
- size_t mini_block_idx_;
- std::shared_ptr<ResizableBuffer> delta_bit_widths_;
- int delta_bit_width_;
-
- int32_t last_value_;
-};
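-
-// Worked example of the DELTA_BINARY_PACKED stream that InitBlock and
-// GetInternal walk above (field names per the Parquet spec; illustration
-// only). The header is four varints:
-//
-//   block_size  num_mini_blocks  total_value_count  zigzag(first_value)
-//
-// and each block stores zigzag(min_delta), one bit width byte per mini
-// block, then the bit-packed deltas. For the values 7, 5, 3, 1:
-//
-//   deltas    = -2, -2, -2
-//   min_delta = -2, so the stored deltas are all 0 and the bit width is 0
-//
-// which is why GetInternal reconstructs each value as
-// last_value_ += min_delta_ + packed_delta.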
-
-// ----------------------------------------------------------------------
-// DELTA_LENGTH_BYTE_ARRAY
-
-class DeltaLengthByteArrayDecoder : public DecoderImpl,
- virtual public TypedDecoder<ByteArrayType> {
- public:
- explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr,
- MemoryPool* pool = ::arrow::default_memory_pool())
- : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY),
- len_decoder_(nullptr, pool),
- pool_(pool) {}
-
- void SetData(int num_values, const uint8_t* data, int len) override {
- num_values_ = num_values;
- if (len == 0) return;
- int total_lengths_len = ::arrow::util::SafeLoadAs<int32_t>(data);
- data += 4;
- this->len_decoder_.SetData(num_values, data, total_lengths_len);
- data_ = data + total_lengths_len;
- this->len_ = len - 4 - total_lengths_len;
- }
-
- int Decode(ByteArray* buffer, int max_values) override {
- using VectorT = ArrowPoolVector<int>;
- max_values = std::min(max_values, num_values_);
- VectorT lengths(max_values, 0, ::arrow::stl::allocator<int>(pool_));
- len_decoder_.Decode(lengths.data(), max_values);
- for (int i = 0; i < max_values; ++i) {
- buffer[i].len = lengths[i];
- buffer[i].ptr = data_;
- this->data_ += lengths[i];
- this->len_ -= lengths[i];
- }
- this->num_values_ -= max_values;
- return max_values;
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
- ParquetException::NYI("DecodeArrow for DeltaLengthByteArrayDecoder");
- }
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<ByteArrayType>::DictAccumulator* out) override {
- ParquetException::NYI("DecodeArrow for DeltaLengthByteArrayDecoder");
- }
-
- private:
- DeltaBitPackDecoder<Int32Type> len_decoder_;
- ::arrow::MemoryPool* pool_;
-};
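-
-// Illustration of the layout SetData above expects: a 4-byte prefix giving
-// the byte size of the length section, then the DELTA_BINARY_PACKED lengths,
-// then the concatenated value bytes:
-//
-//   [int32 total_lengths_len][delta-packed lengths ...][value bytes ...]
-//
-// Decode first pulls max_values lengths through len_decoder_, then slices
-// data_ into ByteArrays at those lengths without copying.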
-
-// ----------------------------------------------------------------------
-// DELTA_BYTE_ARRAY
-
-class DeltaByteArrayDecoder : public DecoderImpl,
- virtual public TypedDecoder<ByteArrayType> {
- public:
- explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr,
- MemoryPool* pool = ::arrow::default_memory_pool())
- : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY),
- prefix_len_decoder_(nullptr, pool),
- suffix_decoder_(nullptr, pool),
- last_value_(0, nullptr) {}
-
- virtual void SetData(int num_values, const uint8_t* data, int len) {
- num_values_ = num_values;
- if (len == 0) return;
- int prefix_len_length = ::arrow::util::SafeLoadAs<int32_t>(data);
- data += 4;
- len -= 4;
- prefix_len_decoder_.SetData(num_values, data, prefix_len_length);
- data += prefix_len_length;
- len -= prefix_len_length;
- suffix_decoder_.SetData(num_values, data, len);
- }
-
- // TODO: this is broken as-is and requires memory management: the buffers
- // malloc'ed below for the reconstructed values are never freed. We need to
- // allocate new strings to store the results.
- virtual int Decode(ByteArray* buffer, int max_values) {
- max_values = std::min(max_values, this->num_values_);
- for (int i = 0; i < max_values; ++i) {
- int prefix_len = 0;
- prefix_len_decoder_.Decode(&prefix_len, 1);
- ByteArray suffix = {0, nullptr};
- suffix_decoder_.Decode(&suffix, 1);
- buffer[i].len = prefix_len + suffix.len;
-
- uint8_t* result = reinterpret_cast<uint8_t*>(malloc(buffer[i].len));
- memcpy(result, last_value_.ptr, prefix_len);
- memcpy(result + prefix_len, suffix.ptr, suffix.len);
-
- buffer[i].ptr = result;
- last_value_ = buffer[i];
- }
- this->num_values_ -= max_values;
- return max_values;
- }
-
- private:
- DeltaBitPackDecoder<Int32Type> prefix_len_decoder_;
- DeltaLengthByteArrayDecoder suffix_decoder_;
- ByteArray last_value_;
-};
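-
-// Worked example of DELTA_BYTE_ARRAY (incremental encoding): each value is a
-// shared-prefix length plus a suffix relative to the previous value. For the
-// sorted strings "apple", "applied", "apply":
-//
-//   prefix_len  suffix
-//   0           "apple"
-//   4           "ied"
-//   4           "y"
-//
-// Decode above rebuilds each value from prefix_len bytes of last_value_
-// followed by the decoded suffix.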
-
-// ----------------------------------------------------------------------
-// BYTE_STREAM_SPLIT
-
-template <typename DType>
-class ByteStreamSplitDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
- public:
- using T = typename DType::c_type;
- explicit ByteStreamSplitDecoder(const ColumnDescriptor* descr);
-
- int Decode(T* buffer, int max_values) override;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::Accumulator* builder) override;
-
- int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* builder) override;
-
- void SetData(int num_values, const uint8_t* data, int len) override;
-
- T* EnsureDecodeBuffer(int64_t min_values) {
- const int64_t size = sizeof(T) * min_values;
- if (!decode_buffer_ || decode_buffer_->size() < size) {
- PARQUET_ASSIGN_OR_THROW(decode_buffer_, ::arrow::AllocateBuffer(size));
- }
- return reinterpret_cast<T*>(decode_buffer_->mutable_data());
- }
-
- private:
- int num_values_in_buffer_{0};
- std::shared_ptr<Buffer> decode_buffer_;
-
- static constexpr size_t kNumStreams = sizeof(T);
-};
-
-template <typename DType>
-ByteStreamSplitDecoder<DType>::ByteStreamSplitDecoder(const ColumnDescriptor* descr)
- : DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT) {}
-
-template <typename DType>
-void ByteStreamSplitDecoder<DType>::SetData(int num_values, const uint8_t* data,
- int len) {
- DecoderImpl::SetData(num_values, data, len);
- if (num_values * static_cast<int64_t>(sizeof(T)) > len) {
- throw ParquetException("Data size too small for number of values (corrupted file?)");
- }
- num_values_in_buffer_ = num_values;
-}
-
-template <typename DType>
-int ByteStreamSplitDecoder<DType>::Decode(T* buffer, int max_values) {
- const int values_to_decode = std::min(num_values_, max_values);
- const int num_decoded_previously = num_values_in_buffer_ - num_values_;
- const uint8_t* data = data_ + num_decoded_previously;
-
- ::arrow::util::internal::ByteStreamSplitDecode<T>(data, values_to_decode,
- num_values_in_buffer_, buffer);
- num_values_ -= values_to_decode;
- len_ -= sizeof(T) * values_to_decode;
- return values_to_decode;
-}
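-
-// Illustration of BYTE_STREAM_SPLIT: the encoder scatters the k-th byte of
-// every value into the k-th of sizeof(T) contiguous streams, which tends to
-// compress better for floating point data. Two floats with bytes
-// [A0 A1 A2 A3] and [B0 B1 B2 B3] are stored as
-//
-//   A0 B0 | A1 B1 | A2 B2 | A3 B3
-//
-// Decode above gathers byte k of value i from stream k at offset i, which is
-// also why SetData checks len against num_values * sizeof(T).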
-
-template <typename DType>
-int ByteStreamSplitDecoder<DType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<DType>::Accumulator* builder) {
- constexpr int value_size = static_cast<int>(kNumStreams);
- int values_decoded = num_values - null_count;
- if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
- ParquetException::EofException();
- }
-
- PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
-
- const int num_decoded_previously = num_values_in_buffer_ - num_values_;
- const uint8_t* data = data_ + num_decoded_previously;
- int offset = 0;
-
-#if defined(ARROW_HAVE_SIMD_SPLIT)
- // Use fast decoding into intermediate buffer. This will also decode
- // some null values, but it's fast enough that we don't care.
- T* decode_out = EnsureDecodeBuffer(values_decoded);
- ::arrow::util::internal::ByteStreamSplitDecode<T>(data, values_decoded,
- num_values_in_buffer_, decode_out);
-
- // XXX If null_count is 0, we could even append in bulk or decode directly into
- // builder
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- builder->UnsafeAppend(decode_out[offset]);
- ++offset;
- },
- [&]() { builder->UnsafeAppendNull(); });
-
-#else
- VisitNullBitmapInline(
- valid_bits, valid_bits_offset, num_values, null_count,
- [&]() {
- uint8_t gathered_byte_data[kNumStreams];
- for (size_t b = 0; b < kNumStreams; ++b) {
- const size_t byte_index = b * num_values_in_buffer_ + offset;
- gathered_byte_data[b] = data[byte_index];
- }
- builder->UnsafeAppend(::arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]));
- ++offset;
- },
- [&]() { builder->UnsafeAppendNull(); });
-#endif
-
- num_values_ -= values_decoded;
- len_ -= sizeof(T) * values_decoded;
- return values_decoded;
-}
-
-template <typename DType>
-int ByteStreamSplitDecoder<DType>::DecodeArrow(
- int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* builder) {
- ParquetException::NYI("DecodeArrow for ByteStreamSplitDecoder");
-}
-
-} // namespace
-
-// ----------------------------------------------------------------------
-// Encoder and decoder factory functions
-
-std::unique_ptr<Encoder> MakeEncoder(Type::type type_num, Encoding::type encoding,
- bool use_dictionary, const ColumnDescriptor* descr,
- MemoryPool* pool) {
- if (use_dictionary) {
- switch (type_num) {
- case Type::INT32:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<Int32Type>(descr, pool));
- case Type::INT64:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<Int64Type>(descr, pool));
- case Type::INT96:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<Int96Type>(descr, pool));
- case Type::FLOAT:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<FloatType>(descr, pool));
- case Type::DOUBLE:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<DoubleType>(descr, pool));
- case Type::BYTE_ARRAY:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<ByteArrayType>(descr, pool));
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::unique_ptr<Encoder>(new DictEncoderImpl<FLBAType>(descr, pool));
- default:
- DCHECK(false) << "Encoder not implemented";
- break;
- }
- } else if (encoding == Encoding::PLAIN) {
- switch (type_num) {
- case Type::BOOLEAN:
- return std::unique_ptr<Encoder>(new PlainEncoder<BooleanType>(descr, pool));
- case Type::INT32:
- return std::unique_ptr<Encoder>(new PlainEncoder<Int32Type>(descr, pool));
- case Type::INT64:
- return std::unique_ptr<Encoder>(new PlainEncoder<Int64Type>(descr, pool));
- case Type::INT96:
- return std::unique_ptr<Encoder>(new PlainEncoder<Int96Type>(descr, pool));
- case Type::FLOAT:
- return std::unique_ptr<Encoder>(new PlainEncoder<FloatType>(descr, pool));
- case Type::DOUBLE:
- return std::unique_ptr<Encoder>(new PlainEncoder<DoubleType>(descr, pool));
- case Type::BYTE_ARRAY:
- return std::unique_ptr<Encoder>(new PlainEncoder<ByteArrayType>(descr, pool));
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::unique_ptr<Encoder>(new PlainEncoder<FLBAType>(descr, pool));
- default:
- DCHECK(false) << "Encoder not implemented";
- break;
- }
- } else if (encoding == Encoding::BYTE_STREAM_SPLIT) {
- switch (type_num) {
- case Type::FLOAT:
- return std::unique_ptr<Encoder>(
- new ByteStreamSplitEncoder<FloatType>(descr, pool));
- case Type::DOUBLE:
- return std::unique_ptr<Encoder>(
- new ByteStreamSplitEncoder<DoubleType>(descr, pool));
- default:
- throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE");
- break;
- }
- } else {
- ParquetException::NYI("Selected encoding is not supported");
- }
- DCHECK(false) << "Should not be able to reach this code";
- return nullptr;
-}
-
-std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
- const ColumnDescriptor* descr) {
- if (encoding == Encoding::PLAIN) {
- switch (type_num) {
- case Type::BOOLEAN:
- return std::unique_ptr<Decoder>(new PlainBooleanDecoder(descr));
- case Type::INT32:
- return std::unique_ptr<Decoder>(new PlainDecoder<Int32Type>(descr));
- case Type::INT64:
- return std::unique_ptr<Decoder>(new PlainDecoder<Int64Type>(descr));
- case Type::INT96:
- return std::unique_ptr<Decoder>(new PlainDecoder<Int96Type>(descr));
- case Type::FLOAT:
- return std::unique_ptr<Decoder>(new PlainDecoder<FloatType>(descr));
- case Type::DOUBLE:
- return std::unique_ptr<Decoder>(new PlainDecoder<DoubleType>(descr));
- case Type::BYTE_ARRAY:
- return std::unique_ptr<Decoder>(new PlainByteArrayDecoder(descr));
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::unique_ptr<Decoder>(new PlainFLBADecoder(descr));
- default:
- break;
- }
- } else if (encoding == Encoding::BYTE_STREAM_SPLIT) {
- switch (type_num) {
- case Type::FLOAT:
- return std::unique_ptr<Decoder>(new ByteStreamSplitDecoder<FloatType>(descr));
- case Type::DOUBLE:
- return std::unique_ptr<Decoder>(new ByteStreamSplitDecoder<DoubleType>(descr));
- default:
- throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE");
- break;
- }
- } else {
- ParquetException::NYI("Selected encoding is not supported");
- }
- DCHECK(false) << "Should not be able to reach this code";
- return nullptr;
-}
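-
-// Minimal usage sketch for the factory above (illustrative; assumes a
-// ColumnDescriptor* descr for an INT32 column and a raw PLAIN data page):
-//
-//   auto decoder = MakeDecoder(Type::INT32, Encoding::PLAIN, descr);
-//   auto typed = dynamic_cast<TypedDecoder<Int32Type>*>(decoder.get());
-//   typed->SetData(num_values, page_data, page_size);
-//   std::vector<int32_t> out(num_values);
-//   int decoded = typed->Decode(out.data(), num_values);
-//
-// Column readers drive decoders through this same SetData/Decode cycle, one
-// data page at a time.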
-
-namespace detail {
-std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
- const ColumnDescriptor* descr,
- MemoryPool* pool) {
- switch (type_num) {
- case Type::BOOLEAN:
- ParquetException::NYI("Dictionary encoding not implemented for boolean type");
- case Type::INT32:
- return std::unique_ptr<Decoder>(new DictDecoderImpl<Int32Type>(descr, pool));
- case Type::INT64:
- return std::unique_ptr<Decoder>(new DictDecoderImpl<Int64Type>(descr, pool));
- case Type::INT96:
- return std::unique_ptr<Decoder>(new DictDecoderImpl<Int96Type>(descr, pool));
- case Type::FLOAT:
- return std::unique_ptr<Decoder>(new DictDecoderImpl<FloatType>(descr, pool));
- case Type::DOUBLE:
- return std::unique_ptr<Decoder>(new DictDecoderImpl<DoubleType>(descr, pool));
- case Type::BYTE_ARRAY:
- return std::unique_ptr<Decoder>(new DictByteArrayDecoderImpl(descr, pool));
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::unique_ptr<Decoder>(new DictDecoderImpl<FLBAType>(descr, pool));
- default:
- break;
- }
- DCHECK(false) << "Should not be able to reach this code";
- return nullptr;
-}
-
-} // namespace detail
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encoding.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_dict.h"
+#include "arrow/stl_allocator.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_stream_utils.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/byte_stream_split.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/hashing.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/rle_encoding.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/visitor_inline.h"
+
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace BitUtil = arrow::BitUtil;
+
+using arrow::Status;
+using arrow::VisitNullBitmapInline;
+using arrow::internal::checked_cast;
+
+template <typename T>
+using ArrowPoolVector = std::vector<T, ::arrow::stl::allocator<T>>;
+
+namespace parquet {
+namespace {
+
+constexpr int64_t kInMemoryDefaultCapacity = 1024;
+// The Parquet spec isn't very clear whether ByteArray lengths are signed or
+// unsigned, but the Java implementation uses signed ints.
+constexpr size_t kMaxByteArraySize = std::numeric_limits<int32_t>::max();
+
+class EncoderImpl : virtual public Encoder {
+ public:
+ EncoderImpl(const ColumnDescriptor* descr, Encoding::type encoding, MemoryPool* pool)
+ : descr_(descr),
+ encoding_(encoding),
+ pool_(pool),
+ type_length_(descr ? descr->type_length() : -1) {}
+
+ Encoding::type encoding() const override { return encoding_; }
+
+ MemoryPool* memory_pool() const override { return pool_; }
+
+ protected:
+ // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
+ const ColumnDescriptor* descr_;
+ const Encoding::type encoding_;
+ MemoryPool* pool_;
+
+ /// Type length from descr
+ int type_length_;
+};
+
+// ----------------------------------------------------------------------
+// Plain encoder implementation
+
+template <typename DType>
+class PlainEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ explicit PlainEncoder(const ColumnDescriptor* descr, MemoryPool* pool)
+ : EncoderImpl(descr, Encoding::PLAIN, pool), sink_(pool) {}
+
+ int64_t EstimatedDataEncodedSize() override { return sink_.length(); }
+
+ std::shared_ptr<Buffer> FlushValues() override {
+ std::shared_ptr<Buffer> buffer;
+ PARQUET_THROW_NOT_OK(sink_.Finish(&buffer));
+ return buffer;
+ }
+
+ using TypedEncoder<DType>::Put;
+
+ void Put(const T* buffer, int num_values) override;
+
+ void Put(const ::arrow::Array& values) override;
+
+ void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ if (valid_bits != NULLPTR) {
+ PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
+ this->memory_pool()));
+ T* data = reinterpret_cast<T*>(buffer->mutable_data());
+ int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
+ src, num_values, valid_bits, valid_bits_offset, data);
+ Put(data, num_valid_values);
+ } else {
+ Put(src, num_values);
+ }
+ }
+
+ void UnsafePutByteArray(const void* data, uint32_t length) {
+ DCHECK(length == 0 || data != nullptr) << "Value ptr cannot be NULL";
+ sink_.UnsafeAppend(&length, sizeof(uint32_t));
+ sink_.UnsafeAppend(data, static_cast<int64_t>(length));
+ }
+
+ void Put(const ByteArray& val) {
+ // Write the result to the output stream
+ const int64_t increment = static_cast<int64_t>(val.len + sizeof(uint32_t));
+ if (ARROW_PREDICT_FALSE(sink_.length() + increment > sink_.capacity())) {
+ PARQUET_THROW_NOT_OK(sink_.Reserve(increment));
+ }
+ UnsafePutByteArray(val.ptr, val.len);
+ }
+
+ protected:
+ template <typename ArrayType>
+ void PutBinaryArray(const ArrayType& array) {
+ const int64_t total_bytes =
+ array.value_offset(array.length()) - array.value_offset(0);
+ PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes + array.length() * sizeof(uint32_t)));
+
+ PARQUET_THROW_NOT_OK(::arrow::VisitArrayDataInline<typename ArrayType::TypeClass>(
+ *array.data(),
+ [&](::arrow::util::string_view view) {
+ if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) {
+ return Status::Invalid("Parquet cannot store strings with size 2GB or more");
+ }
+ UnsafePutByteArray(view.data(), static_cast<uint32_t>(view.size()));
+ return Status::OK();
+ },
+ []() { return Status::OK(); }));
+ }
+
+ ::arrow::BufferBuilder sink_;
+};
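+
+// Minimal usage sketch for PlainEncoder (illustrative only; column writers
+// drive encoders this way):
+//
+//   PlainEncoder<Int32Type> encoder(descr, ::arrow::default_memory_pool());
+//   int32_t values[] = {1, 2, 3};
+//   encoder.Put(values, 3);
+//   std::shared_ptr<Buffer> page = encoder.FlushValues();  // 12-byte buffer
+//
+// PLAIN for fixed-width types is just the little-endian memory image of the
+// values, which is why Put below can append the buffer in a single memcpy.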
+
+template <typename DType>
+void PlainEncoder<DType>::Put(const T* buffer, int num_values) {
+ if (num_values > 0) {
+ PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
+ }
+}
+
+template <>
+inline void PlainEncoder<ByteArrayType>::Put(const ByteArray* src, int num_values) {
+ for (int i = 0; i < num_values; ++i) {
+ Put(src[i]);
+ }
+}
+
+template <typename ArrayType>
+void DirectPutImpl(const ::arrow::Array& values, ::arrow::BufferBuilder* sink) {
+ if (values.type_id() != ArrayType::TypeClass::type_id) {
+ std::string type_name = ArrayType::TypeClass::type_name();
+ throw ParquetException("direct put to " + type_name + " from " +
+ values.type()->ToString() + " not supported");
+ }
+
+ using value_type = typename ArrayType::value_type;
+ constexpr auto value_size = sizeof(value_type);
+ auto raw_values = checked_cast<const ArrayType&>(values).raw_values();
+
+ if (values.null_count() == 0) {
+ // no nulls, just dump the data
+ PARQUET_THROW_NOT_OK(sink->Append(raw_values, values.length() * value_size));
+ } else {
+ PARQUET_THROW_NOT_OK(
+ sink->Reserve((values.length() - values.null_count()) * value_size));
+
+ for (int64_t i = 0; i < values.length(); i++) {
+ if (values.IsValid(i)) {
+ sink->UnsafeAppend(&raw_values[i], value_size);
+ }
+ }
+ }
+}
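+
+// Example of the dense layout produced above (illustrative): for an
+// Int32Array [1, null, 3], DirectPutImpl appends only the 8 bytes of {1, 3}.
+// PLAIN data pages carry no placeholder bytes for nulls; which slots were
+// null is recorded separately in the definition levels.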
+
+template <>
+void PlainEncoder<Int32Type>::Put(const ::arrow::Array& values) {
+ DirectPutImpl<::arrow::Int32Array>(values, &sink_);
+}
+
+template <>
+void PlainEncoder<Int64Type>::Put(const ::arrow::Array& values) {
+ DirectPutImpl<::arrow::Int64Array>(values, &sink_);
+}
+
+template <>
+void PlainEncoder<Int96Type>::Put(const ::arrow::Array& values) {
+ ParquetException::NYI("direct put to Int96");
+}
+
+template <>
+void PlainEncoder<FloatType>::Put(const ::arrow::Array& values) {
+ DirectPutImpl<::arrow::FloatArray>(values, &sink_);
+}
+
+template <>
+void PlainEncoder<DoubleType>::Put(const ::arrow::Array& values) {
+ DirectPutImpl<::arrow::DoubleArray>(values, &sink_);
+}
+
+template <typename DType>
+void PlainEncoder<DType>::Put(const ::arrow::Array& values) {
+ ParquetException::NYI("direct put of " + values.type()->ToString());
+}
+
+void AssertBaseBinary(const ::arrow::Array& values) {
+ if (!::arrow::is_base_binary_like(values.type_id())) {
+ throw ParquetException("Only BaseBinaryArray and subclasses supported");
+ }
+}
+
+template <>
+inline void PlainEncoder<ByteArrayType>::Put(const ::arrow::Array& values) {
+ AssertBaseBinary(values);
+
+ if (::arrow::is_binary_like(values.type_id())) {
+ PutBinaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
+ } else {
+ DCHECK(::arrow::is_large_binary_like(values.type_id()));
+ PutBinaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
+ }
+}
+
+void AssertFixedSizeBinary(const ::arrow::Array& values, int type_length) {
+ if (values.type_id() != ::arrow::Type::FIXED_SIZE_BINARY &&
+ values.type_id() != ::arrow::Type::DECIMAL) {
+ throw ParquetException("Only FixedSizeBinaryArray and subclasses supported");
+ }
+ if (checked_cast<const ::arrow::FixedSizeBinaryType&>(*values.type()).byte_width() !=
+ type_length) {
+ throw ParquetException("Size mismatch: " + values.type()->ToString() +
+ " should have been " + std::to_string(type_length) + " wide");
+ }
+}
+
+template <>
+inline void PlainEncoder<FLBAType>::Put(const ::arrow::Array& values) {
+ AssertFixedSizeBinary(values, descr_->type_length());
+ const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
+
+ if (data.null_count() == 0) {
+ // no nulls, just dump the data
+ PARQUET_THROW_NOT_OK(
+ sink_.Append(data.raw_values(), data.length() * data.byte_width()));
+ } else {
+ const int64_t total_bytes =
+ data.length() * data.byte_width() - data.null_count() * data.byte_width();
+ PARQUET_THROW_NOT_OK(sink_.Reserve(total_bytes));
+ for (int64_t i = 0; i < data.length(); i++) {
+ if (data.IsValid(i)) {
+ sink_.UnsafeAppend(data.Value(i), data.byte_width());
+ }
+ }
+ }
+}
+
+template <>
+inline void PlainEncoder<FLBAType>::Put(const FixedLenByteArray* src, int num_values) {
+ if (descr_->type_length() == 0) {
+ return;
+ }
+ for (int i = 0; i < num_values; ++i) {
+ // Write the result to the output stream
+ DCHECK(src[i].ptr != nullptr) << "Value ptr cannot be NULL";
+ PARQUET_THROW_NOT_OK(sink_.Append(src[i].ptr, descr_->type_length()));
+ }
+}
+
+template <>
+class PlainEncoder<BooleanType> : public EncoderImpl, virtual public BooleanEncoder {
+ public:
+ explicit PlainEncoder(const ColumnDescriptor* descr, MemoryPool* pool)
+ : EncoderImpl(descr, Encoding::PLAIN, pool),
+ bits_available_(kInMemoryDefaultCapacity * 8),
+ bits_buffer_(AllocateBuffer(pool, kInMemoryDefaultCapacity)),
+ sink_(pool),
+ bit_writer_(bits_buffer_->mutable_data(),
+ static_cast<int>(bits_buffer_->size())) {}
+
+ int64_t EstimatedDataEncodedSize() override;
+ std::shared_ptr<Buffer> FlushValues() override;
+
+ void Put(const bool* src, int num_values) override;
+
+ void Put(const std::vector<bool>& src, int num_values) override;
+
+ void PutSpaced(const bool* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ if (valid_bits != NULLPTR) {
+ PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
+ this->memory_pool()));
+ T* data = reinterpret_cast<T*>(buffer->mutable_data());
+ int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
+ src, num_values, valid_bits, valid_bits_offset, data);
+ Put(data, num_valid_values);
+ } else {
+ Put(src, num_values);
+ }
+ }
+
+ void Put(const ::arrow::Array& values) override {
+ if (values.type_id() != ::arrow::Type::BOOL) {
+ throw ParquetException("direct put to boolean from " + values.type()->ToString() +
+ " not supported");
+ }
+
+ const auto& data = checked_cast<const ::arrow::BooleanArray&>(values);
+ if (data.null_count() == 0) {
+ PARQUET_THROW_NOT_OK(sink_.Reserve(BitUtil::BytesForBits(data.length())));
+ // no nulls, just dump the data
+ ::arrow::internal::CopyBitmap(data.data()->GetValues<uint8_t>(1), data.offset(),
+ data.length(), sink_.mutable_data(), sink_.length());
+ } else {
+ auto n_valid = BitUtil::BytesForBits(data.length() - data.null_count());
+ PARQUET_THROW_NOT_OK(sink_.Reserve(n_valid));
+ ::arrow::internal::FirstTimeBitmapWriter writer(sink_.mutable_data(),
+ sink_.length(), n_valid);
+
+ for (int64_t i = 0; i < data.length(); i++) {
+ if (data.IsValid(i)) {
+ if (data.Value(i)) {
+ writer.Set();
+ } else {
+ writer.Clear();
+ }
+ writer.Next();
+ }
+ }
+ writer.Finish();
+ }
+ sink_.UnsafeAdvance(data.length());
+ }
+
+ private:
+ int bits_available_;
+ std::shared_ptr<ResizableBuffer> bits_buffer_;
+ ::arrow::BufferBuilder sink_;
+ ::arrow::BitUtil::BitWriter bit_writer_;
+
+ template <typename SequenceType>
+ void PutImpl(const SequenceType& src, int num_values);
+};
+
+template <typename SequenceType>
+void PlainEncoder<BooleanType>::PutImpl(const SequenceType& src, int num_values) {
+ int bit_offset = 0;
+ if (bits_available_ > 0) {
+ int bits_to_write = std::min(bits_available_, num_values);
+ for (int i = 0; i < bits_to_write; i++) {
+ bit_writer_.PutValue(src[i], 1);
+ }
+ bits_available_ -= bits_to_write;
+ bit_offset = bits_to_write;
+
+ if (bits_available_ == 0) {
+ bit_writer_.Flush();
+ PARQUET_THROW_NOT_OK(
+ sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
+ bit_writer_.Clear();
+ }
+ }
+
+ int bits_remaining = num_values - bit_offset;
+ while (bit_offset < num_values) {
+ bits_available_ = static_cast<int>(bits_buffer_->size()) * 8;
+
+ int bits_to_write = std::min(bits_available_, bits_remaining);
+ for (int i = bit_offset; i < bit_offset + bits_to_write; i++) {
+ bit_writer_.PutValue(src[i], 1);
+ }
+ bit_offset += bits_to_write;
+ bits_available_ -= bits_to_write;
+ bits_remaining -= bits_to_write;
+
+ if (bits_available_ == 0) {
+ bit_writer_.Flush();
+ PARQUET_THROW_NOT_OK(
+ sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
+ bit_writer_.Clear();
+ }
+ }
+}
+
+int64_t PlainEncoder<BooleanType>::EstimatedDataEncodedSize() {
+ int64_t position = sink_.length();
+ return position + bit_writer_.bytes_written();
+}
+
+std::shared_ptr<Buffer> PlainEncoder<BooleanType>::FlushValues() {
+ if (bits_available_ > 0) {
+ bit_writer_.Flush();
+ PARQUET_THROW_NOT_OK(sink_.Append(bit_writer_.buffer(), bit_writer_.bytes_written()));
+ bit_writer_.Clear();
+ bits_available_ = static_cast<int>(bits_buffer_->size()) * 8;
+ }
+
+ std::shared_ptr<Buffer> buffer;
+ PARQUET_THROW_NOT_OK(sink_.Finish(&buffer));
+ return buffer;
+}
+
+void PlainEncoder<BooleanType>::Put(const bool* src, int num_values) {
+ PutImpl(src, num_values);
+}
+
+void PlainEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
+ PutImpl(src, num_values);
+}
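+
+// For reference, PLAIN booleans are bit-packed LSB-first (per the Parquet
+// spec); a sketch of the layout produced by the PutImpl() loop above:
+//
+//   values = {true, false, true, true}
+//   output = 0b00001101   // bit i of byte i/8 holds value i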
+
+// ----------------------------------------------------------------------
+// DictEncoder<T> implementations
+
+template <typename DType>
+struct DictEncoderTraits {
+ using c_type = typename DType::c_type;
+ using MemoTableType = ::arrow::internal::ScalarMemoTable<c_type>;
+};
+
+template <>
+struct DictEncoderTraits<ByteArrayType> {
+ using MemoTableType = ::arrow::internal::BinaryMemoTable<::arrow::BinaryBuilder>;
+};
+
+template <>
+struct DictEncoderTraits<FLBAType> {
+ using MemoTableType = ::arrow::internal::BinaryMemoTable<::arrow::BinaryBuilder>;
+};
+
+// Initially 1024 elements
+static constexpr int32_t kInitialHashTableSize = 1 << 10;
+
+/// See the dictionary encoding section of
+/// https://github.com/Parquet/parquet-format. The encoding supports
+/// streaming encoding. Values are encoded as they are added while the
+/// dictionary is being constructed. At any time, the buffered values
+/// can be written out with the current dictionary size. More values
+/// can then be added to the encoder, including new dictionary
+/// entries.
+template <typename DType>
+class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> {
+ using MemoTableType = typename DictEncoderTraits<DType>::MemoTableType;
+
+ public:
+ typedef typename DType::c_type T;
+
+ explicit DictEncoderImpl(const ColumnDescriptor* desc, MemoryPool* pool)
+ : EncoderImpl(desc, Encoding::PLAIN_DICTIONARY, pool),
+ buffered_indices_(::arrow::stl::allocator<int32_t>(pool)),
+ dict_encoded_size_(0),
+ memo_table_(pool, kInitialHashTableSize) {}
+
+ ~DictEncoderImpl() override { DCHECK(buffered_indices_.empty()); }
+
+ int dict_encoded_size() override { return dict_encoded_size_; }
+
+ int WriteIndices(uint8_t* buffer, int buffer_len) override {
+ // Write bit width in first byte
+ *buffer = static_cast<uint8_t>(bit_width());
+ ++buffer;
+ --buffer_len;
+
+ ::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width());
+
+ for (int32_t index : buffered_indices_) {
+ if (!encoder.Put(index)) return -1;
+ }
+ encoder.Flush();
+
+ ClearIndices();
+ return 1 + encoder.len();
+ }
+
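+  // A sketch of the buffer produced by WriteIndices(), following the Parquet
+  // RLE_DICTIONARY layout (values here are illustrative):
+  //
+  //   indices     = {0, 0, 0, 1}, bit_width() == 1
+  //   buffer[0]   = 0x01                        // the bit width
+  //   buffer[1..] = RLE/bit-packed runs: 3 x 0, then 1 x 1
+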
+ void set_type_length(int type_length) { this->type_length_ = type_length; }
+
+ /// Returns a conservative estimate of the number of bytes needed to encode the buffered
+ /// indices. Used to size the buffer passed to WriteIndices().
+ int64_t EstimatedDataEncodedSize() override {
+    // Note: because of the way RleEncoder::CheckBufferFull() is called, we have to
+    // reserve an extra "RleEncoder::MinBufferSize" bytes. These extra bytes won't
+    // be used, but not reserving them would cause the encoder to fail.
+ return 1 +
+ ::arrow::util::RleEncoder::MaxBufferSize(
+ bit_width(), static_cast<int>(buffered_indices_.size())) +
+ ::arrow::util::RleEncoder::MinBufferSize(bit_width());
+ }
+
+ /// The minimum bit width required to encode the currently buffered indices.
+ int bit_width() const override {
+ if (ARROW_PREDICT_FALSE(num_entries() == 0)) return 0;
+ if (ARROW_PREDICT_FALSE(num_entries() == 1)) return 1;
+ return BitUtil::Log2(num_entries());
+ }
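+
+  // Note: BitUtil::Log2 here rounds up, and Log2(1) == 0, which is why a
+  // single-entry dictionary is special-cased to one bit above. Illustrative
+  // values, assuming that ceiling behavior:
+  //
+  //   num_entries() == 1000  ->  bit_width() == 10
+  //   num_entries() == 1024  ->  bit_width() == 10
+  //   num_entries() == 1025  ->  bit_width() == 11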
+
+ /// Encode value. Note that this does not actually write any data, just
+ /// buffers the value's index to be written later.
+ inline void Put(const T& value);
+
+  // Only meaningful for BYTE_ARRAY; the generic version below DCHECKs
+ inline void PutByteArray(const void* ptr, int32_t length);
+
+ void Put(const T* src, int num_values) override {
+ for (int32_t i = 0; i < num_values; i++) {
+ Put(src[i]);
+ }
+ }
+
+ void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ ::arrow::internal::VisitSetBitRunsVoid(valid_bits, valid_bits_offset, num_values,
+ [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; i++) {
+ Put(src[i + position]);
+ }
+ });
+ }
+
+ using TypedEncoder<DType>::Put;
+
+ void Put(const ::arrow::Array& values) override;
+ void PutDictionary(const ::arrow::Array& values) override;
+
+ template <typename ArrowType, typename T = typename ArrowType::c_type>
+ void PutIndicesTyped(const ::arrow::Array& data) {
+ auto values = data.data()->GetValues<T>(1);
+ size_t buffer_position = buffered_indices_.size();
+ buffered_indices_.resize(buffer_position +
+ static_cast<size_t>(data.length() - data.null_count()));
+ ::arrow::internal::VisitSetBitRunsVoid(
+ data.null_bitmap_data(), data.offset(), data.length(),
+ [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; ++i) {
+ buffered_indices_[buffer_position++] =
+ static_cast<int32_t>(values[i + position]);
+ }
+ });
+ }
+
+ void PutIndices(const ::arrow::Array& data) override {
+ switch (data.type()->id()) {
+ case ::arrow::Type::UINT8:
+ case ::arrow::Type::INT8:
+ return PutIndicesTyped<::arrow::UInt8Type>(data);
+ case ::arrow::Type::UINT16:
+ case ::arrow::Type::INT16:
+ return PutIndicesTyped<::arrow::UInt16Type>(data);
+ case ::arrow::Type::UINT32:
+ case ::arrow::Type::INT32:
+ return PutIndicesTyped<::arrow::UInt32Type>(data);
+ case ::arrow::Type::UINT64:
+ case ::arrow::Type::INT64:
+ return PutIndicesTyped<::arrow::UInt64Type>(data);
+ default:
+ throw ParquetException("Passed non-integer array to PutIndices");
+ }
+ }
+
+ std::shared_ptr<Buffer> FlushValues() override {
+ std::shared_ptr<ResizableBuffer> buffer =
+ AllocateBuffer(this->pool_, EstimatedDataEncodedSize());
+ int result_size = WriteIndices(buffer->mutable_data(),
+ static_cast<int>(EstimatedDataEncodedSize()));
+ PARQUET_THROW_NOT_OK(buffer->Resize(result_size, false));
+ return std::move(buffer);
+ }
+
+ /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
+ /// dict_encoded_size() bytes.
+ void WriteDict(uint8_t* buffer) override;
+
+ /// The number of entries in the dictionary.
+ int num_entries() const override { return memo_table_.size(); }
+
+ private:
+ /// Clears all the indices (but leaves the dictionary).
+ void ClearIndices() { buffered_indices_.clear(); }
+
+  /// Indices that have not yet been written out by WriteIndices().
+ ArrowPoolVector<int32_t> buffered_indices_;
+
+ template <typename ArrayType>
+ void PutBinaryArray(const ArrayType& array) {
+ PARQUET_THROW_NOT_OK(::arrow::VisitArrayDataInline<typename ArrayType::TypeClass>(
+ *array.data(),
+ [&](::arrow::util::string_view view) {
+ if (ARROW_PREDICT_FALSE(view.size() > kMaxByteArraySize)) {
+ return Status::Invalid("Parquet cannot store strings with size 2GB or more");
+ }
+ PutByteArray(view.data(), static_cast<uint32_t>(view.size()));
+ return Status::OK();
+ },
+ []() { return Status::OK(); }));
+ }
+
+ template <typename ArrayType>
+ void PutBinaryDictionaryArray(const ArrayType& array) {
+ DCHECK_EQ(array.null_count(), 0);
+ for (int64_t i = 0; i < array.length(); i++) {
+ auto v = array.GetView(i);
+ if (ARROW_PREDICT_FALSE(v.size() > kMaxByteArraySize)) {
+ throw ParquetException("Parquet cannot store strings with size 2GB or more");
+ }
+ dict_encoded_size_ += static_cast<int>(v.size() + sizeof(uint32_t));
+ int32_t unused_memo_index;
+ PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(
+ v.data(), static_cast<int32_t>(v.size()), &unused_memo_index));
+ }
+ }
+
+ /// The number of bytes needed to encode the dictionary.
+ int dict_encoded_size_;
+
+ MemoTableType memo_table_;
+};
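+
+// A minimal usage sketch of the encoder above (identifiers other than the
+// class members are hypothetical):
+//
+//   DictEncoderImpl<Int32Type> encoder(descr, pool);
+//   encoder.Put(values, num_values);              // grows dict, buffers indices
+//   auto dict_buf = AllocateBuffer(pool, encoder.dict_encoded_size());
+//   encoder.WriteDict(dict_buf->mutable_data());  // dictionary page payload
+//   auto data_page = encoder.FlushValues();       // bit width byte + RLE runs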
+
+template <typename DType>
+void DictEncoderImpl<DType>::WriteDict(uint8_t* buffer) {
+  // For primitive types, this is just a memcpy of the memo table's values
+ DCHECK_EQ(static_cast<size_t>(dict_encoded_size_), sizeof(T) * memo_table_.size());
+ memo_table_.CopyValues(0 /* start_pos */, reinterpret_cast<T*>(buffer));
+}
+
+// ByteArray and FLBA already have the dictionary encoded in their data heaps
+template <>
+void DictEncoderImpl<ByteArrayType>::WriteDict(uint8_t* buffer) {
+ memo_table_.VisitValues(0, [&buffer](const ::arrow::util::string_view& v) {
+ uint32_t len = static_cast<uint32_t>(v.length());
+ memcpy(buffer, &len, sizeof(len));
+ buffer += sizeof(len);
+ memcpy(buffer, v.data(), len);
+ buffer += len;
+ });
+}
+
+template <>
+void DictEncoderImpl<FLBAType>::WriteDict(uint8_t* buffer) {
+ memo_table_.VisitValues(0, [&](const ::arrow::util::string_view& v) {
+ DCHECK_EQ(v.length(), static_cast<size_t>(type_length_));
+ memcpy(buffer, v.data(), type_length_);
+ buffer += type_length_;
+ });
+}
+
+template <typename DType>
+inline void DictEncoderImpl<DType>::Put(const T& v) {
+ // Put() implementation for primitive types
+ auto on_found = [](int32_t memo_index) {};
+ auto on_not_found = [this](int32_t memo_index) {
+ dict_encoded_size_ += static_cast<int>(sizeof(T));
+ };
+
+ int32_t memo_index;
+ PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(v, on_found, on_not_found, &memo_index));
+ buffered_indices_.push_back(memo_index);
+}
+
+template <typename DType>
+inline void DictEncoderImpl<DType>::PutByteArray(const void* ptr, int32_t length) {
+ DCHECK(false);
+}
+
+template <>
+inline void DictEncoderImpl<ByteArrayType>::PutByteArray(const void* ptr,
+ int32_t length) {
+ static const uint8_t empty[] = {0};
+
+ auto on_found = [](int32_t memo_index) {};
+ auto on_not_found = [&](int32_t memo_index) {
+ dict_encoded_size_ += static_cast<int>(length + sizeof(uint32_t));
+ };
+
+ DCHECK(ptr != nullptr || length == 0);
+ ptr = (ptr != nullptr) ? ptr : empty;
+ int32_t memo_index;
+ PARQUET_THROW_NOT_OK(
+ memo_table_.GetOrInsert(ptr, length, on_found, on_not_found, &memo_index));
+ buffered_indices_.push_back(memo_index);
+}
+
+template <>
+inline void DictEncoderImpl<ByteArrayType>::Put(const ByteArray& val) {
+ return PutByteArray(val.ptr, static_cast<int32_t>(val.len));
+}
+
+template <>
+inline void DictEncoderImpl<FLBAType>::Put(const FixedLenByteArray& v) {
+ static const uint8_t empty[] = {0};
+
+ auto on_found = [](int32_t memo_index) {};
+ auto on_not_found = [this](int32_t memo_index) { dict_encoded_size_ += type_length_; };
+
+ DCHECK(v.ptr != nullptr || type_length_ == 0);
+ const void* ptr = (v.ptr != nullptr) ? v.ptr : empty;
+ int32_t memo_index;
+ PARQUET_THROW_NOT_OK(
+ memo_table_.GetOrInsert(ptr, type_length_, on_found, on_not_found, &memo_index));
+ buffered_indices_.push_back(memo_index);
+}
+
+template <>
+void DictEncoderImpl<Int96Type>::Put(const ::arrow::Array& values) {
+ ParquetException::NYI("Direct put to Int96");
+}
+
+template <>
+void DictEncoderImpl<Int96Type>::PutDictionary(const ::arrow::Array& values) {
+ ParquetException::NYI("Direct put to Int96");
+}
+
+template <typename DType>
+void DictEncoderImpl<DType>::Put(const ::arrow::Array& values) {
+ using ArrayType = typename ::arrow::CTypeTraits<typename DType::c_type>::ArrayType;
+ const auto& data = checked_cast<const ArrayType&>(values);
+ if (data.null_count() == 0) {
+ // no nulls, just dump the data
+ for (int64_t i = 0; i < data.length(); i++) {
+ Put(data.Value(i));
+ }
+ } else {
+ for (int64_t i = 0; i < data.length(); i++) {
+ if (data.IsValid(i)) {
+ Put(data.Value(i));
+ }
+ }
+ }
+}
+
+template <>
+void DictEncoderImpl<FLBAType>::Put(const ::arrow::Array& values) {
+ AssertFixedSizeBinary(values, type_length_);
+ const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
+ if (data.null_count() == 0) {
+ // no nulls, just dump the data
+ for (int64_t i = 0; i < data.length(); i++) {
+ Put(FixedLenByteArray(data.Value(i)));
+ }
+ } else {
+ std::vector<uint8_t> empty(type_length_, 0);
+ for (int64_t i = 0; i < data.length(); i++) {
+ if (data.IsValid(i)) {
+ Put(FixedLenByteArray(data.Value(i)));
+ }
+ }
+ }
+}
+
+template <>
+void DictEncoderImpl<ByteArrayType>::Put(const ::arrow::Array& values) {
+ AssertBaseBinary(values);
+ if (::arrow::is_binary_like(values.type_id())) {
+ PutBinaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
+ } else {
+ DCHECK(::arrow::is_large_binary_like(values.type_id()));
+ PutBinaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
+ }
+}
+
+template <typename DType>
+void AssertCanPutDictionary(DictEncoderImpl<DType>* encoder, const ::arrow::Array& dict) {
+ if (dict.null_count() > 0) {
+ throw ParquetException("Inserted dictionary cannot cannot contain nulls");
+ }
+
+ if (encoder->num_entries() > 0) {
+ throw ParquetException("Can only call PutDictionary on an empty DictEncoder");
+ }
+}
+
+template <typename DType>
+void DictEncoderImpl<DType>::PutDictionary(const ::arrow::Array& values) {
+ AssertCanPutDictionary(this, values);
+
+ using ArrayType = typename ::arrow::CTypeTraits<typename DType::c_type>::ArrayType;
+ const auto& data = checked_cast<const ArrayType&>(values);
+
+ dict_encoded_size_ += static_cast<int>(sizeof(typename DType::c_type) * data.length());
+ for (int64_t i = 0; i < data.length(); i++) {
+ int32_t unused_memo_index;
+ PARQUET_THROW_NOT_OK(memo_table_.GetOrInsert(data.Value(i), &unused_memo_index));
+ }
+}
+
+template <>
+void DictEncoderImpl<FLBAType>::PutDictionary(const ::arrow::Array& values) {
+ AssertFixedSizeBinary(values, type_length_);
+ AssertCanPutDictionary(this, values);
+
+ const auto& data = checked_cast<const ::arrow::FixedSizeBinaryArray&>(values);
+
+ dict_encoded_size_ += static_cast<int>(type_length_ * data.length());
+ for (int64_t i = 0; i < data.length(); i++) {
+ int32_t unused_memo_index;
+ PARQUET_THROW_NOT_OK(
+ memo_table_.GetOrInsert(data.Value(i), type_length_, &unused_memo_index));
+ }
+}
+
+template <>
+void DictEncoderImpl<ByteArrayType>::PutDictionary(const ::arrow::Array& values) {
+ AssertBaseBinary(values);
+ AssertCanPutDictionary(this, values);
+
+ if (::arrow::is_binary_like(values.type_id())) {
+ PutBinaryDictionaryArray(checked_cast<const ::arrow::BinaryArray&>(values));
+ } else {
+ DCHECK(::arrow::is_large_binary_like(values.type_id()));
+ PutBinaryDictionaryArray(checked_cast<const ::arrow::LargeBinaryArray&>(values));
+ }
+}
+
+// ----------------------------------------------------------------------
+// ByteStreamSplitEncoder<T> implementations
+
+template <typename DType>
+class ByteStreamSplitEncoder : public EncoderImpl, virtual public TypedEncoder<DType> {
+ public:
+ using T = typename DType::c_type;
+ using TypedEncoder<DType>::Put;
+
+ explicit ByteStreamSplitEncoder(
+ const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ int64_t EstimatedDataEncodedSize() override;
+ std::shared_ptr<Buffer> FlushValues() override;
+
+ void Put(const T* buffer, int num_values) override;
+ void Put(const ::arrow::Array& values) override;
+ void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override;
+
+ protected:
+ template <typename ArrowType>
+ void PutImpl(const ::arrow::Array& values) {
+ if (values.type_id() != ArrowType::type_id) {
+ throw ParquetException(std::string() + "direct put to " + ArrowType::type_name() +
+ " from " + values.type()->ToString() + " not supported");
+ }
+ const auto& data = *values.data();
+ PutSpaced(data.GetValues<typename ArrowType::c_type>(1),
+ static_cast<int>(data.length), data.GetValues<uint8_t>(0, 0), data.offset);
+ }
+
+ ::arrow::BufferBuilder sink_;
+ int64_t num_values_in_buffer_;
+};
+
+template <typename DType>
+ByteStreamSplitEncoder<DType>::ByteStreamSplitEncoder(const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool)
+ : EncoderImpl(descr, Encoding::BYTE_STREAM_SPLIT, pool),
+ sink_{pool},
+ num_values_in_buffer_{0} {}
+
+template <typename DType>
+int64_t ByteStreamSplitEncoder<DType>::EstimatedDataEncodedSize() {
+ return sink_.length();
+}
+
+template <typename DType>
+std::shared_ptr<Buffer> ByteStreamSplitEncoder<DType>::FlushValues() {
+ std::shared_ptr<ResizableBuffer> output_buffer =
+ AllocateBuffer(this->memory_pool(), EstimatedDataEncodedSize());
+ uint8_t* output_buffer_raw = output_buffer->mutable_data();
+ const uint8_t* raw_values = sink_.data();
+ ::arrow::util::internal::ByteStreamSplitEncode<T>(raw_values, num_values_in_buffer_,
+ output_buffer_raw);
+ sink_.Reset();
+ num_values_in_buffer_ = 0;
+ return std::move(output_buffer);
+}
+
+template <typename DType>
+void ByteStreamSplitEncoder<DType>::Put(const T* buffer, int num_values) {
+ if (num_values > 0) {
+ PARQUET_THROW_NOT_OK(sink_.Append(buffer, num_values * sizeof(T)));
+ num_values_in_buffer_ += num_values;
+ }
+}
+
+template <>
+void ByteStreamSplitEncoder<FloatType>::Put(const ::arrow::Array& values) {
+ PutImpl<::arrow::FloatType>(values);
+}
+
+template <>
+void ByteStreamSplitEncoder<DoubleType>::Put(const ::arrow::Array& values) {
+ PutImpl<::arrow::DoubleType>(values);
+}
+
+template <typename DType>
+void ByteStreamSplitEncoder<DType>::PutSpaced(const T* src, int num_values,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset) {
+ if (valid_bits != NULLPTR) {
+ PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T),
+ this->memory_pool()));
+ T* data = reinterpret_cast<T*>(buffer->mutable_data());
+ int num_valid_values = ::arrow::util::internal::SpacedCompress<T>(
+ src, num_values, valid_bits, valid_bits_offset, data);
+ Put(data, num_valid_values);
+ } else {
+ Put(src, num_values);
+ }
+}
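+
+// For reference, BYTE_STREAM_SPLIT (applied in FlushValues() above) scatters
+// the k-th byte of every value into the k-th output stream; for two
+// little-endian floats a and b:
+//
+//   input  = a0 a1 a2 a3 b0 b1 b2 b3
+//   output = a0 b0 | a1 b1 | a2 b2 | a3 b3
+//
+// which tends to compress better under general-purpose codecs.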
+
+class DecoderImpl : virtual public Decoder {
+ public:
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ data_ = data;
+ len_ = len;
+ }
+
+ int values_left() const override { return num_values_; }
+ Encoding::type encoding() const override { return encoding_; }
+
+ protected:
+ explicit DecoderImpl(const ColumnDescriptor* descr, Encoding::type encoding)
+ : descr_(descr), encoding_(encoding), num_values_(0), data_(NULLPTR), len_(0) {}
+
+ // For accessing type-specific metadata, like FIXED_LEN_BYTE_ARRAY
+ const ColumnDescriptor* descr_;
+
+ const Encoding::type encoding_;
+ int num_values_;
+ const uint8_t* data_;
+ int len_;
+ int type_length_;
+};
+
+template <typename DType>
+class PlainDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
+ public:
+ using T = typename DType::c_type;
+ explicit PlainDecoder(const ColumnDescriptor* descr);
+
+ int Decode(T* buffer, int max_values) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* builder) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) override;
+};
+
+template <>
+inline int PlainDecoder<Int96Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Int96Type>::Accumulator* builder) {
+ ParquetException::NYI("DecodeArrow not supported for Int96");
+}
+
+template <>
+inline int PlainDecoder<Int96Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Int96Type>::DictAccumulator* builder) {
+ ParquetException::NYI("DecodeArrow not supported for Int96");
+}
+
+template <>
+inline int PlainDecoder<BooleanType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
+ ParquetException::NYI("dictionaries of BooleanType");
+}
+
+template <typename DType>
+int PlainDecoder<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* builder) {
+ using value_type = typename DType::c_type;
+
+ constexpr int value_size = static_cast<int>(sizeof(value_type));
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ builder->UnsafeAppend(::arrow::util::SafeLoadAs<value_type>(data_));
+ data_ += sizeof(value_type);
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ num_values_ -= values_decoded;
+ len_ -= sizeof(value_type) * values_decoded;
+ return values_decoded;
+}
+
+template <typename DType>
+int PlainDecoder<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) {
+ using value_type = typename DType::c_type;
+
+ constexpr int value_size = static_cast<int>(sizeof(value_type));
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ PARQUET_THROW_NOT_OK(
+ builder->Append(::arrow::util::SafeLoadAs<value_type>(data_)));
+ data_ += sizeof(value_type);
+ },
+ [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
+
+ num_values_ -= values_decoded;
+ len_ -= sizeof(value_type) * values_decoded;
+ return values_decoded;
+}
+
+// Decode routine templated on C++ type rather than type enum
+template <typename T>
+inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values,
+ int type_length, T* out) {
+ int64_t bytes_to_decode = num_values * static_cast<int64_t>(sizeof(T));
+ if (bytes_to_decode > data_size || bytes_to_decode > INT_MAX) {
+ ParquetException::EofException();
+ }
+ // If bytes_to_decode == 0, data could be null
+ if (bytes_to_decode > 0) {
+ memcpy(out, data, bytes_to_decode);
+ }
+ return static_cast<int>(bytes_to_decode);
+}
+
+template <typename DType>
+PlainDecoder<DType>::PlainDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::PLAIN) {
+ if (descr_ && descr_->physical_type() == Type::FIXED_LEN_BYTE_ARRAY) {
+ type_length_ = descr_->type_length();
+ } else {
+ type_length_ = -1;
+ }
+}
+
+// Template specialization for BYTE_ARRAY. The written values do not own their
+// own data.
+
+static inline int64_t ReadByteArray(const uint8_t* data, int64_t data_size,
+ ByteArray* out) {
+ if (ARROW_PREDICT_FALSE(data_size < 4)) {
+ ParquetException::EofException();
+ }
+ const int32_t len = ::arrow::util::SafeLoadAs<int32_t>(data);
+ if (len < 0) {
+ throw ParquetException("Invalid BYTE_ARRAY value");
+ }
+ const int64_t consumed_length = static_cast<int64_t>(len) + 4;
+ if (ARROW_PREDICT_FALSE(data_size < consumed_length)) {
+ ParquetException::EofException();
+ }
+ *out = ByteArray{static_cast<uint32_t>(len), data + 4};
+ return consumed_length;
+}
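+
+// Sketch of the PLAIN BYTE_ARRAY layout consumed above: a 4-byte
+// little-endian length prefix followed by that many bytes, e.g.
+//
+//   "foo"  ->  03 00 00 00 66 6f 6f   // consumed_length == 7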
+
+template <>
+inline int DecodePlain<ByteArray>(const uint8_t* data, int64_t data_size, int num_values,
+ int type_length, ByteArray* out) {
+ int bytes_decoded = 0;
+ for (int i = 0; i < num_values; ++i) {
+ const auto increment = ReadByteArray(data, data_size, out + i);
+ if (ARROW_PREDICT_FALSE(increment > INT_MAX - bytes_decoded)) {
+ throw ParquetException("BYTE_ARRAY chunk too large");
+ }
+ data += increment;
+ data_size -= increment;
+ bytes_decoded += static_cast<int>(increment);
+ }
+ return bytes_decoded;
+}
+
+// Template specialization for FIXED_LEN_BYTE_ARRAY. The written values do not
+// own their own data.
+template <>
+inline int DecodePlain<FixedLenByteArray>(const uint8_t* data, int64_t data_size,
+ int num_values, int type_length,
+ FixedLenByteArray* out) {
+ int64_t bytes_to_decode = static_cast<int64_t>(type_length) * num_values;
+ if (bytes_to_decode > data_size || bytes_to_decode > INT_MAX) {
+ ParquetException::EofException();
+ }
+ for (int i = 0; i < num_values; ++i) {
+ out[i].ptr = data;
+ data += type_length;
+ data_size -= type_length;
+ }
+ return static_cast<int>(bytes_to_decode);
+}
+
+template <typename DType>
+int PlainDecoder<DType>::Decode(T* buffer, int max_values) {
+ max_values = std::min(max_values, num_values_);
+ int bytes_consumed = DecodePlain<T>(data_, len_, max_values, type_length_, buffer);
+ data_ += bytes_consumed;
+ len_ -= bytes_consumed;
+ num_values_ -= max_values;
+ return max_values;
+}
+
+class PlainBooleanDecoder : public DecoderImpl,
+ virtual public TypedDecoder<BooleanType>,
+ virtual public BooleanDecoder {
+ public:
+ explicit PlainBooleanDecoder(const ColumnDescriptor* descr);
+ void SetData(int num_values, const uint8_t* data, int len) override;
+
+ // Two flavors of bool decoding
+ int Decode(uint8_t* buffer, int max_values) override;
+ int Decode(bool* buffer, int max_values) override;
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::Accumulator* out) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::DictAccumulator* out) override;
+
+ private:
+ std::unique_ptr<::arrow::BitUtil::BitReader> bit_reader_;
+};
+
+PlainBooleanDecoder::PlainBooleanDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::PLAIN) {}
+
+void PlainBooleanDecoder::SetData(int num_values, const uint8_t* data, int len) {
+ num_values_ = num_values;
+ bit_reader_.reset(new BitUtil::BitReader(data, len));
+}
+
+int PlainBooleanDecoder::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::Accumulator* builder) {
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(num_values_ < values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ bool value;
+ ARROW_IGNORE_EXPR(bit_reader_->GetValue(1, &value));
+ builder->UnsafeAppend(value);
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ num_values_ -= values_decoded;
+ return values_decoded;
+}
+
+inline int PlainBooleanDecoder::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
+ ParquetException::NYI("dictionaries of BooleanType");
+}
+
+int PlainBooleanDecoder::Decode(uint8_t* buffer, int max_values) {
+ max_values = std::min(max_values, num_values_);
+ bool val;
+ ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values);
+ for (int i = 0; i < max_values; ++i) {
+ if (!bit_reader_->GetValue(1, &val)) {
+ ParquetException::EofException();
+ }
+ if (val) {
+ bit_writer.Set();
+ }
+ bit_writer.Next();
+ }
+ bit_writer.Finish();
+ num_values_ -= max_values;
+ return max_values;
+}
+
+int PlainBooleanDecoder::Decode(bool* buffer, int max_values) {
+ max_values = std::min(max_values, num_values_);
+ if (bit_reader_->GetBatch(1, buffer, max_values) != max_values) {
+ ParquetException::EofException();
+ }
+ num_values_ -= max_values;
+ return max_values;
+}
+
+struct ArrowBinaryHelper {
+ explicit ArrowBinaryHelper(typename EncodingTraits<ByteArrayType>::Accumulator* out) {
+ this->out = out;
+ this->builder = out->builder.get();
+ this->chunk_space_remaining =
+ ::arrow::kBinaryMemoryLimit - this->builder->value_data_length();
+ }
+
+ Status PushChunk() {
+ std::shared_ptr<::arrow::Array> result;
+ RETURN_NOT_OK(builder->Finish(&result));
+ out->chunks.push_back(result);
+ chunk_space_remaining = ::arrow::kBinaryMemoryLimit;
+ return Status::OK();
+ }
+
+ bool CanFit(int64_t length) const { return length <= chunk_space_remaining; }
+
+ void UnsafeAppend(const uint8_t* data, int32_t length) {
+ chunk_space_remaining -= length;
+ builder->UnsafeAppend(data, length);
+ }
+
+ void UnsafeAppendNull() { builder->UnsafeAppendNull(); }
+
+ Status Append(const uint8_t* data, int32_t length) {
+ chunk_space_remaining -= length;
+ return builder->Append(data, length);
+ }
+
+ Status AppendNull() { return builder->AppendNull(); }
+
+ typename EncodingTraits<ByteArrayType>::Accumulator* out;
+ ::arrow::BinaryBuilder* builder;
+ int64_t chunk_space_remaining;
+};
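+
+// ArrowBinaryHelper exists because a single ::arrow::BinaryArray can hold at
+// most ::arrow::kBinaryMemoryLimit (close to 2 GB) bytes of value data; the
+// decoders below roll over to a fresh chunk when the next value would not
+// fit, following the pattern:
+//
+//   ArrowBinaryHelper helper(out);
+//   if (!helper.CanFit(value_len)) {
+//     PARQUET_THROW_NOT_OK(helper.PushChunk());   // finish the current chunk
+//   }
+//   PARQUET_THROW_NOT_OK(helper.Append(data, value_len));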
+
+template <>
+inline int PlainDecoder<ByteArrayType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* builder) {
+ ParquetException::NYI();
+}
+
+template <>
+inline int PlainDecoder<ByteArrayType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::DictAccumulator* builder) {
+ ParquetException::NYI();
+}
+
+template <>
+inline int PlainDecoder<FLBAType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<FLBAType>::Accumulator* builder) {
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < descr_->type_length() * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ builder->UnsafeAppend(data_);
+ data_ += descr_->type_length();
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ num_values_ -= values_decoded;
+ len_ -= descr_->type_length() * values_decoded;
+ return values_decoded;
+}
+
+template <>
+inline int PlainDecoder<FLBAType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<FLBAType>::DictAccumulator* builder) {
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < descr_->type_length() * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ PARQUET_THROW_NOT_OK(builder->Append(data_));
+ data_ += descr_->type_length();
+ },
+ [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
+
+ num_values_ -= values_decoded;
+ len_ -= descr_->type_length() * values_decoded;
+ return values_decoded;
+}
+
+class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
+ virtual public ByteArrayDecoder {
+ public:
+ using Base = PlainDecoder<ByteArrayType>;
+ using Base::DecodeSpaced;
+ using Base::PlainDecoder;
+
+ // ----------------------------------------------------------------------
+ // Dictionary read paths
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ ::arrow::BinaryDictionary32Builder* builder) override {
+ int result = 0;
+ PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
+ valid_bits_offset, builder, &result));
+ return result;
+ }
+
+ // ----------------------------------------------------------------------
+ // Optimized dense binary read paths
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
+ int result = 0;
+ PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
+ valid_bits_offset, out, &result));
+ return result;
+ }
+
+ private:
+ Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out,
+ int* out_values_decoded) {
+ ArrowBinaryHelper helper(out);
+ int values_decoded = 0;
+
+ RETURN_NOT_OK(helper.builder->Reserve(num_values));
+ RETURN_NOT_OK(helper.builder->ReserveData(
+ std::min<int64_t>(len_, helper.chunk_space_remaining)));
+
+ int i = 0;
+ RETURN_NOT_OK(VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ if (ARROW_PREDICT_FALSE(len_ < 4)) {
+ ParquetException::EofException();
+ }
+ auto value_len = ::arrow::util::SafeLoadAs<int32_t>(data_);
+ if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) {
+ return Status::Invalid("Invalid or corrupted value_len '", value_len, "'");
+ }
+ auto increment = value_len + 4;
+ if (ARROW_PREDICT_FALSE(len_ < increment)) {
+ ParquetException::EofException();
+ }
+ if (ARROW_PREDICT_FALSE(!helper.CanFit(value_len))) {
+ // This element would exceed the capacity of a chunk
+ RETURN_NOT_OK(helper.PushChunk());
+ RETURN_NOT_OK(helper.builder->Reserve(num_values - i));
+ RETURN_NOT_OK(helper.builder->ReserveData(
+ std::min<int64_t>(len_, helper.chunk_space_remaining)));
+ }
+ helper.UnsafeAppend(data_ + 4, value_len);
+ data_ += increment;
+ len_ -= increment;
+ ++values_decoded;
+ ++i;
+ return Status::OK();
+ },
+ [&]() {
+ helper.UnsafeAppendNull();
+ ++i;
+ return Status::OK();
+ }));
+
+ num_values_ -= values_decoded;
+ *out_values_decoded = values_decoded;
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, BuilderType* builder,
+ int* out_values_decoded) {
+ RETURN_NOT_OK(builder->Reserve(num_values));
+ int values_decoded = 0;
+
+ RETURN_NOT_OK(VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ if (ARROW_PREDICT_FALSE(len_ < 4)) {
+ ParquetException::EofException();
+ }
+ auto value_len = ::arrow::util::SafeLoadAs<int32_t>(data_);
+ if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > INT32_MAX - 4)) {
+ return Status::Invalid("Invalid or corrupted value_len '", value_len, "'");
+ }
+ auto increment = value_len + 4;
+ if (ARROW_PREDICT_FALSE(len_ < increment)) {
+ ParquetException::EofException();
+ }
+ RETURN_NOT_OK(builder->Append(data_ + 4, value_len));
+ data_ += increment;
+ len_ -= increment;
+ ++values_decoded;
+ return Status::OK();
+ },
+ [&]() { return builder->AppendNull(); }));
+
+ num_values_ -= values_decoded;
+ *out_values_decoded = values_decoded;
+ return Status::OK();
+ }
+};
+
+class PlainFLBADecoder : public PlainDecoder<FLBAType>, virtual public FLBADecoder {
+ public:
+ using Base = PlainDecoder<FLBAType>;
+ using Base::PlainDecoder;
+};
+
+// ----------------------------------------------------------------------
+// Dictionary encoding and decoding
+
+template <typename Type>
+class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
+ public:
+ typedef typename Type::c_type T;
+
+ // Initializes the dictionary with values from 'dictionary'. The data in
+ // dictionary is not guaranteed to persist in memory after this call so the
+ // dictionary decoder needs to copy the data out if necessary.
+ explicit DictDecoderImpl(const ColumnDescriptor* descr,
+ MemoryPool* pool = ::arrow::default_memory_pool())
+ : DecoderImpl(descr, Encoding::RLE_DICTIONARY),
+ dictionary_(AllocateBuffer(pool, 0)),
+ dictionary_length_(0),
+ byte_array_data_(AllocateBuffer(pool, 0)),
+ byte_array_offsets_(AllocateBuffer(pool, 0)),
+ indices_scratch_space_(AllocateBuffer(pool, 0)) {}
+
+  // Perform type-specific initialization
+ void SetDict(TypedDecoder<Type>* dictionary) override;
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ if (len == 0) {
+ // Initialize dummy decoder to avoid crashes later on
+ idx_decoder_ = ::arrow::util::RleDecoder(data, len, /*bit_width=*/1);
+ return;
+ }
+ uint8_t bit_width = *data;
+ if (ARROW_PREDICT_FALSE(bit_width >= 64)) {
+ throw ParquetException("Invalid or corrupted bit_width");
+ }
+ idx_decoder_ = ::arrow::util::RleDecoder(++data, --len, bit_width);
+ }
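+
+  // The data handed to SetData() for an RLE_DICTIONARY-encoded data page is,
+  // per the Parquet spec:
+  //
+  //   data[0]   -- bit width of the indices (rejected above if >= 64)
+  //   data[1..] -- RLE/bit-packed index runs, consumed by idx_decoder_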
+
+ int Decode(T* buffer, int num_values) override {
+ num_values = std::min(num_values, num_values_);
+ int decoded_values =
+ idx_decoder_.GetBatchWithDict(reinterpret_cast<const T*>(dictionary_->data()),
+ dictionary_length_, buffer, num_values);
+ if (decoded_values != num_values) {
+ ParquetException::EofException();
+ }
+ num_values_ -= num_values;
+ return num_values;
+ }
+
+ int DecodeSpaced(T* buffer, int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ num_values = std::min(num_values, num_values_);
+ if (num_values != idx_decoder_.GetBatchWithDictSpaced(
+ reinterpret_cast<const T*>(dictionary_->data()),
+ dictionary_length_, buffer, num_values, null_count, valid_bits,
+ valid_bits_offset)) {
+ ParquetException::EofException();
+ }
+ num_values_ -= num_values;
+ return num_values;
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<Type>::Accumulator* out) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<Type>::DictAccumulator* out) override;
+
+ void InsertDictionary(::arrow::ArrayBuilder* builder) override;
+
+ int DecodeIndicesSpaced(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ ::arrow::ArrayBuilder* builder) override {
+ if (num_values > 0) {
+ // TODO(wesm): Refactor to batch reads for improved memory use. It is not
+ // trivial because the null_count is relative to the entire bitmap
+ PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
+ num_values, /*shrink_to_fit=*/false));
+ }
+
+ auto indices_buffer =
+ reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
+
+ if (num_values != idx_decoder_.GetBatchSpaced(num_values, null_count, valid_bits,
+ valid_bits_offset, indices_buffer)) {
+ ParquetException::EofException();
+ }
+
+ /// XXX(wesm): Cannot append "valid bits" directly to the builder
+ std::vector<uint8_t> valid_bytes(num_values);
+ ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
+ for (int64_t i = 0; i < num_values; ++i) {
+ valid_bytes[i] = static_cast<uint8_t>(bit_reader.IsSet());
+ bit_reader.Next();
+ }
+
+ auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
+ PARQUET_THROW_NOT_OK(
+ binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data()));
+ num_values_ -= num_values - null_count;
+ return num_values - null_count;
+ }
+
+ int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) override {
+ num_values = std::min(num_values, num_values_);
+ if (num_values > 0) {
+ // TODO(wesm): Refactor to batch reads for improved memory use. This is
+ // relatively simple here because we don't have to do any bookkeeping of
+ // nulls
+ PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
+ num_values, /*shrink_to_fit=*/false));
+ }
+ auto indices_buffer =
+ reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
+ if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) {
+ ParquetException::EofException();
+ }
+ auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
+ PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values));
+ num_values_ -= num_values;
+ return num_values;
+ }
+
+ int DecodeIndices(int num_values, int32_t* indices) override {
+ if (num_values != idx_decoder_.GetBatch(indices, num_values)) {
+ ParquetException::EofException();
+ }
+ num_values_ -= num_values;
+ return num_values;
+ }
+
+ void GetDictionary(const T** dictionary, int32_t* dictionary_length) override {
+ *dictionary_length = dictionary_length_;
+ *dictionary = reinterpret_cast<T*>(dictionary_->mutable_data());
+ }
+
+ protected:
+ Status IndexInBounds(int32_t index) {
+ if (ARROW_PREDICT_TRUE(0 <= index && index < dictionary_length_)) {
+ return Status::OK();
+ }
+ return Status::Invalid("Index not in dictionary bounds");
+ }
+
+ inline void DecodeDict(TypedDecoder<Type>* dictionary) {
+ dictionary_length_ = static_cast<int32_t>(dictionary->values_left());
+ PARQUET_THROW_NOT_OK(dictionary_->Resize(dictionary_length_ * sizeof(T),
+ /*shrink_to_fit=*/false));
+ dictionary->Decode(reinterpret_cast<T*>(dictionary_->mutable_data()),
+ dictionary_length_);
+ }
+
+  // The decoded dictionary values. For primitive types this is an array of T;
+  // for byte arrays it holds ByteArray entries whose pointers are rewritten in
+  // SetDict() to point into byte_array_data_.
+ std::shared_ptr<ResizableBuffer> dictionary_;
+
+ int32_t dictionary_length_;
+
+ // Data that contains the byte array data (byte_array_dictionary_ just has the
+ // pointers).
+ std::shared_ptr<ResizableBuffer> byte_array_data_;
+
+ // Arrow-style byte offsets for each dictionary value. We maintain two
+ // representations of the dictionary, one as ByteArray* for non-Arrow
+ // consumers and this one for Arrow consumers. Since dictionaries are
+ // generally pretty small to begin with this doesn't mean too much extra
+ // memory use in most cases
+ std::shared_ptr<ResizableBuffer> byte_array_offsets_;
+
+ // Reusable buffer for decoding dictionary indices to be appended to a
+ // BinaryDictionary32Builder
+ std::shared_ptr<ResizableBuffer> indices_scratch_space_;
+
+ ::arrow::util::RleDecoder idx_decoder_;
+};
+
+template <typename Type>
+void DictDecoderImpl<Type>::SetDict(TypedDecoder<Type>* dictionary) {
+ DecodeDict(dictionary);
+}
+
+template <>
+void DictDecoderImpl<BooleanType>::SetDict(TypedDecoder<BooleanType>* dictionary) {
+ ParquetException::NYI("Dictionary encoding is not implemented for boolean values");
+}
+
+template <>
+void DictDecoderImpl<ByteArrayType>::SetDict(TypedDecoder<ByteArrayType>* dictionary) {
+ DecodeDict(dictionary);
+
+ auto dict_values = reinterpret_cast<ByteArray*>(dictionary_->mutable_data());
+
+ int total_size = 0;
+ for (int i = 0; i < dictionary_length_; ++i) {
+ total_size += dict_values[i].len;
+ }
+ PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
+ /*shrink_to_fit=*/false));
+ PARQUET_THROW_NOT_OK(
+ byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t),
+ /*shrink_to_fit=*/false));
+
+ int32_t offset = 0;
+ uint8_t* bytes_data = byte_array_data_->mutable_data();
+ int32_t* bytes_offsets =
+ reinterpret_cast<int32_t*>(byte_array_offsets_->mutable_data());
+ for (int i = 0; i < dictionary_length_; ++i) {
+ memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len);
+ bytes_offsets[i] = offset;
+ dict_values[i].ptr = bytes_data + offset;
+ offset += dict_values[i].len;
+ }
+ bytes_offsets[dictionary_length_] = offset;
+}
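+
+// A sketch of the two dictionary representations built above, for the values
+// {"a", "bc"}:
+//
+//   byte_array_data_    = 'a' 'b' 'c'
+//   byte_array_offsets_ = {0, 1, 3}     // dictionary_length_ + 1 entries
+//   dict_values[i]      = {len, ptr}    // ptr into byte_array_data_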
+
+template <>
+inline void DictDecoderImpl<FLBAType>::SetDict(TypedDecoder<FLBAType>* dictionary) {
+ DecodeDict(dictionary);
+
+ auto dict_values = reinterpret_cast<FLBA*>(dictionary_->mutable_data());
+
+ int fixed_len = descr_->type_length();
+ int total_size = dictionary_length_ * fixed_len;
+
+ PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
+ /*shrink_to_fit=*/false));
+ uint8_t* bytes_data = byte_array_data_->mutable_data();
+ for (int32_t i = 0, offset = 0; i < dictionary_length_; ++i, offset += fixed_len) {
+ memcpy(bytes_data + offset, dict_values[i].ptr, fixed_len);
+ dict_values[i].ptr = bytes_data + offset;
+ }
+}
+
+template <>
+inline int DictDecoderImpl<Int96Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Int96Type>::Accumulator* builder) {
+ ParquetException::NYI("DecodeArrow to Int96Type");
+}
+
+template <>
+inline int DictDecoderImpl<Int96Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Int96Type>::DictAccumulator* builder) {
+ ParquetException::NYI("DecodeArrow to Int96Type");
+}
+
+template <>
+inline int DictDecoderImpl<ByteArrayType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* builder) {
+ ParquetException::NYI("DecodeArrow implemented elsewhere");
+}
+
+template <>
+inline int DictDecoderImpl<ByteArrayType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::DictAccumulator* builder) {
+ ParquetException::NYI("DecodeArrow implemented elsewhere");
+}
+
+template <typename DType>
+int DictDecoderImpl<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) {
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ auto dict_values = reinterpret_cast<const typename DType::c_type*>(dictionary_->data());
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ int32_t index;
+ if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
+ throw ParquetException("");
+ }
+ PARQUET_THROW_NOT_OK(IndexInBounds(index));
+ PARQUET_THROW_NOT_OK(builder->Append(dict_values[index]));
+ },
+ [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
+
+ return num_values - null_count;
+}
+
+template <>
+int DictDecoderImpl<BooleanType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<BooleanType>::DictAccumulator* builder) {
+ ParquetException::NYI("No dictionary encoding for BooleanType");
+}
+
+template <>
+inline int DictDecoderImpl<FLBAType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<FLBAType>::Accumulator* builder) {
+ if (builder->byte_width() != descr_->type_length()) {
+ throw ParquetException("Byte width mismatch: builder was " +
+ std::to_string(builder->byte_width()) + " but decoder was " +
+ std::to_string(descr_->type_length()));
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ auto dict_values = reinterpret_cast<const FLBA*>(dictionary_->data());
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ int32_t index;
+ if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
+ throw ParquetException("");
+ }
+ PARQUET_THROW_NOT_OK(IndexInBounds(index));
+ builder->UnsafeAppend(dict_values[index].ptr);
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ return num_values - null_count;
+}
+
+template <>
+int DictDecoderImpl<FLBAType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<FLBAType>::DictAccumulator* builder) {
+ auto value_type =
+ checked_cast<const ::arrow::DictionaryType&>(*builder->type()).value_type();
+ auto byte_width =
+ checked_cast<const ::arrow::FixedSizeBinaryType&>(*value_type).byte_width();
+ if (byte_width != descr_->type_length()) {
+ throw ParquetException("Byte width mismatch: builder was " +
+ std::to_string(byte_width) + " but decoder was " +
+ std::to_string(descr_->type_length()));
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ auto dict_values = reinterpret_cast<const FLBA*>(dictionary_->data());
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ int32_t index;
+ if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
+ throw ParquetException("");
+ }
+ PARQUET_THROW_NOT_OK(IndexInBounds(index));
+ PARQUET_THROW_NOT_OK(builder->Append(dict_values[index].ptr));
+ },
+ [&]() { PARQUET_THROW_NOT_OK(builder->AppendNull()); });
+
+ return num_values - null_count;
+}
+
+template <typename Type>
+int DictDecoderImpl<Type>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<Type>::Accumulator* builder) {
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ using value_type = typename Type::c_type;
+ auto dict_values = reinterpret_cast<const value_type*>(dictionary_->data());
+
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ int32_t index;
+ if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) {
+ throw ParquetException("");
+ }
+ PARQUET_THROW_NOT_OK(IndexInBounds(index));
+ builder->UnsafeAppend(dict_values[index]);
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+ return num_values - null_count;
+}
+
+template <typename Type>
+void DictDecoderImpl<Type>::InsertDictionary(::arrow::ArrayBuilder* builder) {
+ ParquetException::NYI("InsertDictionary only implemented for BYTE_ARRAY types");
+}
+
+template <>
+void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* builder) {
+ auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
+
+ // Make a BinaryArray referencing the internal dictionary data
+ auto arr = std::make_shared<::arrow::BinaryArray>(
+ dictionary_length_, byte_array_offsets_, byte_array_data_);
+ PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr));
+}
+
+class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
+ virtual public ByteArrayDecoder {
+ public:
+ using BASE = DictDecoderImpl<ByteArrayType>;
+ using BASE::DictDecoderImpl;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ ::arrow::BinaryDictionary32Builder* builder) override {
+ int result = 0;
+ if (null_count == 0) {
+ PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result));
+ } else {
+ PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
+ valid_bits_offset, builder, &result));
+ }
+ return result;
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
+ int result = 0;
+ if (null_count == 0) {
+ PARQUET_THROW_NOT_OK(DecodeArrowDenseNonNull(num_values, out, &result));
+ } else {
+ PARQUET_THROW_NOT_OK(DecodeArrowDense(num_values, null_count, valid_bits,
+ valid_bits_offset, out, &result));
+ }
+ return result;
+ }
+
+ private:
+ Status DecodeArrowDense(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out,
+ int* out_num_values) {
+ constexpr int32_t kBufferSize = 1024;
+ int32_t indices[kBufferSize];
+
+ ArrowBinaryHelper helper(out);
+
+ ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
+
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+ int values_decoded = 0;
+ int num_appended = 0;
+ while (num_appended < num_values) {
+ bool is_valid = bit_reader.IsSet();
+ bit_reader.Next();
+
+ if (is_valid) {
+ int32_t batch_size =
+ std::min<int32_t>(kBufferSize, num_values - num_appended - null_count);
+ int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+
+ if (ARROW_PREDICT_FALSE(num_indices < 1)) {
+ return Status::Invalid("Invalid number of indices '", num_indices, "'");
+ }
+
+ int i = 0;
+ while (true) {
+ // Consume all indices
+ if (is_valid) {
+ auto idx = indices[i];
+ RETURN_NOT_OK(IndexInBounds(idx));
+ const auto& val = dict_values[idx];
+ if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) {
+ RETURN_NOT_OK(helper.PushChunk());
+ }
+ RETURN_NOT_OK(helper.Append(val.ptr, static_cast<int32_t>(val.len)));
+ ++i;
+ ++values_decoded;
+ } else {
+ RETURN_NOT_OK(helper.AppendNull());
+ --null_count;
+ }
+ ++num_appended;
+ if (i == num_indices) {
+ // Do not advance the bit_reader if we have fulfilled the decode
+ // request
+ break;
+ }
+ is_valid = bit_reader.IsSet();
+ bit_reader.Next();
+ }
+ } else {
+ RETURN_NOT_OK(helper.AppendNull());
+ --null_count;
+ ++num_appended;
+ }
+ }
+ *out_num_values = values_decoded;
+ return Status::OK();
+ }
+
+ Status DecodeArrowDenseNonNull(int num_values,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out,
+ int* out_num_values) {
+ constexpr int32_t kBufferSize = 2048;
+ int32_t indices[kBufferSize];
+ int values_decoded = 0;
+
+ ArrowBinaryHelper helper(out);
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+
+ while (values_decoded < num_values) {
+ int32_t batch_size = std::min<int32_t>(kBufferSize, num_values - values_decoded);
+ int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+ if (num_indices == 0) ParquetException::EofException();
+ for (int i = 0; i < num_indices; ++i) {
+ auto idx = indices[i];
+ RETURN_NOT_OK(IndexInBounds(idx));
+ const auto& val = dict_values[idx];
+ if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) {
+ RETURN_NOT_OK(helper.PushChunk());
+ }
+ RETURN_NOT_OK(helper.Append(val.ptr, static_cast<int32_t>(val.len)));
+ }
+ values_decoded += num_indices;
+ }
+ *out_num_values = values_decoded;
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ Status DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, BuilderType* builder,
+ int* out_num_values) {
+ constexpr int32_t kBufferSize = 1024;
+ int32_t indices[kBufferSize];
+
+ RETURN_NOT_OK(builder->Reserve(num_values));
+ ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, num_values);
+
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+
+ int values_decoded = 0;
+ int num_appended = 0;
+ while (num_appended < num_values) {
+ bool is_valid = bit_reader.IsSet();
+ bit_reader.Next();
+
+ if (is_valid) {
+ int32_t batch_size =
+ std::min<int32_t>(kBufferSize, num_values - num_appended - null_count);
+ int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+
+ int i = 0;
+ while (true) {
+ // Consume all indices
+ if (is_valid) {
+ auto idx = indices[i];
+ RETURN_NOT_OK(IndexInBounds(idx));
+ const auto& val = dict_values[idx];
+ RETURN_NOT_OK(builder->Append(val.ptr, val.len));
+ ++i;
+ ++values_decoded;
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ --null_count;
+ }
+ ++num_appended;
+ if (i == num_indices) {
+ // Do not advance the bit_reader if we have fulfilled the decode
+ // request
+ break;
+ }
+ is_valid = bit_reader.IsSet();
+ bit_reader.Next();
+ }
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ --null_count;
+ ++num_appended;
+ }
+ }
+ *out_num_values = values_decoded;
+ return Status::OK();
+ }
+
+ template <typename BuilderType>
+ Status DecodeArrowNonNull(int num_values, BuilderType* builder, int* out_num_values) {
+ constexpr int32_t kBufferSize = 2048;
+ int32_t indices[kBufferSize];
+
+ RETURN_NOT_OK(builder->Reserve(num_values));
+
+ auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+
+ int values_decoded = 0;
+ while (values_decoded < num_values) {
+ int32_t batch_size = std::min<int32_t>(kBufferSize, num_values - values_decoded);
+ int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+ if (num_indices == 0) ParquetException::EofException();
+ for (int i = 0; i < num_indices; ++i) {
+ auto idx = indices[i];
+ RETURN_NOT_OK(IndexInBounds(idx));
+ const auto& val = dict_values[idx];
+ RETURN_NOT_OK(builder->Append(val.ptr, val.len));
+ }
+ values_decoded += num_indices;
+ }
+ *out_num_values = values_decoded;
+ return Status::OK();
+ }
+};
+
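The dense decode loop above interleaves two streams: a validity bit is consumed for every output slot, while the RLE-packed dictionary indices are fetched in batches and consumed only on valid slots. A minimal standalone model of that interleaving, with std types standing in for the Arrow bitmap reader, RLE decoder, and builder, might look like this:

#include <cstdint>
#include <optional>
#include <string>
#include <vector>

// Sketch: 'valid' plays the role of the decoded validity bitmap and
// 'indices' the role of the already-decoded index stream. One validity
// bit is consumed per slot; one index is consumed per *valid* slot.
std::vector<std::optional<std::string>> DecodeDictSpaced(
    const std::vector<std::string>& dict, const std::vector<int32_t>& indices,
    const std::vector<bool>& valid) {
  std::vector<std::optional<std::string>> out;
  size_t next = 0;
  for (bool is_valid : valid) {
    if (is_valid) {
      out.emplace_back(dict.at(indices.at(next++)));  // at() mirrors IndexInBounds
    } else {
      out.emplace_back(std::nullopt);  // AppendNull in the real decoder
    }
  }
  return out;
}
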
+// ----------------------------------------------------------------------
+// DeltaBitPackDecoder
+
+template <typename DType>
+class DeltaBitPackDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
+ public:
+ typedef typename DType::c_type T;
+
+ explicit DeltaBitPackDecoder(const ColumnDescriptor* descr,
+ MemoryPool* pool = ::arrow::default_memory_pool())
+ : DecoderImpl(descr, Encoding::DELTA_BINARY_PACKED), pool_(pool) {
+ if (DType::type_num != Type::INT32 && DType::type_num != Type::INT64) {
+ throw ParquetException("Delta bit pack encoding should only be for integer data.");
+ }
+ }
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ this->num_values_ = num_values;
+ decoder_ = ::arrow::BitUtil::BitReader(data, len);
+ values_current_block_ = 0;
+ values_current_mini_block_ = 0;
+ }
+
+ int Decode(T* buffer, int max_values) override {
+ return GetInternal(buffer, max_values);
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* out) override {
+ if (null_count != 0) {
+ ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
+ }
+ std::vector<T> values(num_values);
+ GetInternal(values.data(), num_values);
+ PARQUET_THROW_NOT_OK(out->AppendValues(values));
+ return num_values;
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* out) override {
+ if (null_count != 0) {
+ ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
+ }
+ std::vector<T> values(num_values);
+ GetInternal(values.data(), num_values);
+ PARQUET_THROW_NOT_OK(out->Reserve(num_values));
+ for (T value : values) {
+ PARQUET_THROW_NOT_OK(out->Append(value));
+ }
+ return num_values;
+ }
+
+ private:
+ void InitBlock() {
+ // The number of values per block.
+ uint32_t block_size;
+ if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
+ if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException();
+ if (!decoder_.GetVlqInt(&values_current_block_)) {
+ ParquetException::EofException();
+ }
+ if (!decoder_.GetZigZagVlqInt(&last_value_)) ParquetException::EofException();
+
+ delta_bit_widths_ = AllocateBuffer(pool_, num_mini_blocks_);
+ uint8_t* bit_width_data = delta_bit_widths_->mutable_data();
+
+ if (!decoder_.GetZigZagVlqInt(&min_delta_)) ParquetException::EofException();
+ for (uint32_t i = 0; i < num_mini_blocks_; ++i) {
+ if (!decoder_.GetAligned<uint8_t>(1, bit_width_data + i)) {
+ ParquetException::EofException();
+ }
+ }
+ values_per_mini_block_ = block_size / num_mini_blocks_;
+ mini_block_idx_ = 0;
+ delta_bit_width_ = bit_width_data[0];
+ values_current_mini_block_ = values_per_mini_block_;
+ }
+
+ template <typename T>
+ int GetInternal(T* buffer, int max_values) {
+ max_values = std::min(max_values, this->num_values_);
+ const uint8_t* bit_width_data = delta_bit_widths_->data();
+ for (int i = 0; i < max_values; ++i) {
+ if (ARROW_PREDICT_FALSE(values_current_mini_block_ == 0)) {
+ ++mini_block_idx_;
+ if (mini_block_idx_ < static_cast<size_t>(delta_bit_widths_->size())) {
+ delta_bit_width_ = bit_width_data[mini_block_idx_];
+ values_current_mini_block_ = values_per_mini_block_;
+ } else {
+ InitBlock();
+ buffer[i] = last_value_;
+ continue;
+ }
+ }
+
+ // TODO: the key to this algorithm is to decode the entire miniblock at once.
+ int64_t delta;
+ if (!decoder_.GetValue(delta_bit_width_, &delta)) ParquetException::EofException();
+ delta += min_delta_;
+ last_value_ += static_cast<int32_t>(delta);
+ buffer[i] = last_value_;
+ --values_current_mini_block_;
+ }
+ this->num_values_ -= max_values;
+ return max_values;
+ }
+
+ MemoryPool* pool_;
+ ::arrow::BitUtil::BitReader decoder_;
+ uint32_t values_current_block_;
+ uint32_t num_mini_blocks_;
+ uint64_t values_per_mini_block_;
+ uint64_t values_current_mini_block_;
+
+ int32_t min_delta_;
+ size_t mini_block_idx_;
+ std::shared_ptr<ResizableBuffer> delta_bit_widths_;
+ int delta_bit_width_;
+
+ int32_t last_value_;
+};
+
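InitBlock above reads the block header (block size, mini-block count, remaining value count, and a zigzag-coded seed value), then min_delta and one bit width per mini block; GetInternal then reconstructs each value as last_value += min_delta + packed_delta. A self-contained sketch of that recurrence, with the bit-unpacking replaced by a vector of already-unpacked deltas:

#include <cstdint>
#include <vector>

// Sketch of the DELTA_BINARY_PACKED recurrence. 'packed_deltas' stands in
// for the values read by decoder_.GetValue(delta_bit_width_, ...); the
// stored deltas are non-negative because min_delta was subtracted out at
// encoding time.
std::vector<int32_t> ReconstructBlock(int32_t first_value, int32_t min_delta,
                                      const std::vector<uint32_t>& packed_deltas) {
  std::vector<int32_t> values = {first_value};
  int32_t last_value = first_value;
  for (uint32_t packed : packed_deltas) {
    last_value += min_delta + static_cast<int32_t>(packed);
    values.push_back(last_value);
  }
  return values;
}
// e.g. first_value=7, min_delta=-2, packed_deltas={0,3,0} yields {7, 5, 6, 4}.
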
+// ----------------------------------------------------------------------
+// DELTA_LENGTH_BYTE_ARRAY
+
+class DeltaLengthByteArrayDecoder : public DecoderImpl,
+ virtual public TypedDecoder<ByteArrayType> {
+ public:
+ explicit DeltaLengthByteArrayDecoder(const ColumnDescriptor* descr,
+ MemoryPool* pool = ::arrow::default_memory_pool())
+ : DecoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY),
+ len_decoder_(nullptr, pool),
+ pool_(pool) {}
+
+ void SetData(int num_values, const uint8_t* data, int len) override {
+ num_values_ = num_values;
+ if (len == 0) return;
+ int total_lengths_len = ::arrow::util::SafeLoadAs<int32_t>(data);
+ data += 4;
+ this->len_decoder_.SetData(num_values, data, total_lengths_len);
+ data_ = data + total_lengths_len;
+ this->len_ = len - 4 - total_lengths_len;
+ }
+
+ int Decode(ByteArray* buffer, int max_values) override {
+ using VectorT = ArrowPoolVector<int>;
+ max_values = std::min(max_values, num_values_);
+ VectorT lengths(max_values, 0, ::arrow::stl::allocator<int>(pool_));
+ len_decoder_.Decode(lengths.data(), max_values);
+ for (int i = 0; i < max_values; ++i) {
+ buffer[i].len = lengths[i];
+ buffer[i].ptr = data_;
+ this->data_ += lengths[i];
+ this->len_ -= lengths[i];
+ }
+ this->num_values_ -= max_values;
+ return max_values;
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::Accumulator* out) override {
+ ParquetException::NYI("DecodeArrow for DeltaLengthByteArrayDecoder");
+ }
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<ByteArrayType>::DictAccumulator* out) override {
+ ParquetException::NYI("DecodeArrow for DeltaLengthByteArrayDecoder");
+ }
+
+ private:
+ DeltaBitPackDecoder<Int32Type> len_decoder_;
+ ::arrow::MemoryPool* pool_;
+};
+
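SetData above carves the page into a 4-byte length prefix, a DELTA_BINARY_PACKED block holding the value lengths, and the concatenated value bytes; Decode then slices the byte region by the decoded lengths. A standalone sketch of that final slicing step, assuming the lengths have already been decoded:

#include <cstdint>
#include <string>
#include <vector>

// Sketch: slice a concatenated byte region into strings using
// already-decoded lengths (the job of len_decoder_ above). 'data' must
// hold at least the sum of 'lengths' bytes.
std::vector<std::string> SliceByLengths(const uint8_t* data,
                                        const std::vector<int>& lengths) {
  std::vector<std::string> out;
  for (int len : lengths) {
    out.emplace_back(reinterpret_cast<const char*>(data), len);
    data += len;  // advance past this value, like data_ += lengths[i]
  }
  return out;
}
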
+// ----------------------------------------------------------------------
+// DELTA_BYTE_ARRAY
+
+class DeltaByteArrayDecoder : public DecoderImpl,
+ virtual public TypedDecoder<ByteArrayType> {
+ public:
+ explicit DeltaByteArrayDecoder(const ColumnDescriptor* descr,
+ MemoryPool* pool = ::arrow::default_memory_pool())
+ : DecoderImpl(descr, Encoding::DELTA_BYTE_ARRAY),
+ prefix_len_decoder_(nullptr, pool),
+ suffix_decoder_(nullptr, pool),
+ last_value_(0, nullptr) {}
+
+ virtual void SetData(int num_values, const uint8_t* data, int len) {
+ num_values_ = num_values;
+ if (len == 0) return;
+ int prefix_len_length = ::arrow::util::SafeLoadAs<int32_t>(data);
+ data += 4;
+ len -= 4;
+ prefix_len_decoder_.SetData(num_values, data, prefix_len_length);
+ data += prefix_len_length;
+ len -= prefix_len_length;
+ suffix_decoder_.SetData(num_values, data, len);
+ }
+
+ // TODO: this doesn't work and requires memory management. We need to allocate
+ // new strings to store the results.
+ virtual int Decode(ByteArray* buffer, int max_values) {
+ max_values = std::min(max_values, this->num_values_);
+ for (int i = 0; i < max_values; ++i) {
+ int prefix_len = 0;
+ prefix_len_decoder_.Decode(&prefix_len, 1);
+ ByteArray suffix = {0, nullptr};
+ suffix_decoder_.Decode(&suffix, 1);
+ buffer[i].len = prefix_len + suffix.len;
+
+ uint8_t* result = reinterpret_cast<uint8_t*>(malloc(buffer[i].len));
+ memcpy(result, last_value_.ptr, prefix_len);
+ memcpy(result + prefix_len, suffix.ptr, suffix.len);
+
+ buffer[i].ptr = result;
+ last_value_ = buffer[i];
+ }
+ this->num_values_ -= max_values;
+ return max_values;
+ }
+
+ private:
+ DeltaBitPackDecoder<Int32Type> prefix_len_decoder_;
+ DeltaLengthByteArrayDecoder suffix_decoder_;
+ ByteArray last_value_;
+};
+
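The TODO above is accurate: each decoded value is malloc'ed and never freed, and ByteArray carries only a borrowed pointer. A standalone sketch of the same prefix+suffix reconstruction with owned std::string storage shows the intended algorithm without the leak:

#include <string>
#include <vector>

// DELTA_BYTE_ARRAY sketch: each value shares 'prefix_len' leading bytes
// with the previous value and appends a fresh suffix.
std::vector<std::string> ReconstructDeltaByteArray(
    const std::vector<int>& prefix_lens, const std::vector<std::string>& suffixes) {
  std::vector<std::string> out;
  std::string last;
  for (size_t i = 0; i < prefix_lens.size(); ++i) {
    std::string value = last.substr(0, prefix_lens[i]) + suffixes[i];
    out.push_back(value);
    last = std::move(value);
  }
  return out;
}
// e.g. prefix_lens={0,3,4}, suffixes={"parquet","uet2",""} yields
// {"parquet", "paruet2", "paru"}.
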
+// ----------------------------------------------------------------------
+// BYTE_STREAM_SPLIT
+
+template <typename DType>
+class ByteStreamSplitDecoder : public DecoderImpl, virtual public TypedDecoder<DType> {
+ public:
+ using T = typename DType::c_type;
+ explicit ByteStreamSplitDecoder(const ColumnDescriptor* descr);
+
+ int Decode(T* buffer, int max_values) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* builder) override;
+
+ int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) override;
+
+ void SetData(int num_values, const uint8_t* data, int len) override;
+
+ T* EnsureDecodeBuffer(int64_t min_values) {
+ const int64_t size = sizeof(T) * min_values;
+ if (!decode_buffer_ || decode_buffer_->size() < size) {
+ PARQUET_ASSIGN_OR_THROW(decode_buffer_, ::arrow::AllocateBuffer(size));
+ }
+ return reinterpret_cast<T*>(decode_buffer_->mutable_data());
+ }
+
+ private:
+ int num_values_in_buffer_{0};
+ std::shared_ptr<Buffer> decode_buffer_;
+
+ static constexpr size_t kNumStreams = sizeof(T);
+};
+
+template <typename DType>
+ByteStreamSplitDecoder<DType>::ByteStreamSplitDecoder(const ColumnDescriptor* descr)
+ : DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT) {}
+
+template <typename DType>
+void ByteStreamSplitDecoder<DType>::SetData(int num_values, const uint8_t* data,
+ int len) {
+ DecoderImpl::SetData(num_values, data, len);
+ if (num_values * static_cast<int64_t>(sizeof(T)) > len) {
+ throw ParquetException("Data size too small for number of values (corrupted file?)");
+ }
+ num_values_in_buffer_ = num_values;
+}
+
+template <typename DType>
+int ByteStreamSplitDecoder<DType>::Decode(T* buffer, int max_values) {
+ const int values_to_decode = std::min(num_values_, max_values);
+ const int num_decoded_previously = num_values_in_buffer_ - num_values_;
+ const uint8_t* data = data_ + num_decoded_previously;
+
+ ::arrow::util::internal::ByteStreamSplitDecode<T>(data, values_to_decode,
+ num_values_in_buffer_, buffer);
+ num_values_ -= values_to_decode;
+ len_ -= sizeof(T) * values_to_decode;
+ return values_to_decode;
+}
+
+template <typename DType>
+int ByteStreamSplitDecoder<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* builder) {
+ constexpr int value_size = static_cast<int>(kNumStreams);
+ int values_decoded = num_values - null_count;
+ if (ARROW_PREDICT_FALSE(len_ < value_size * values_decoded)) {
+ ParquetException::EofException();
+ }
+
+ PARQUET_THROW_NOT_OK(builder->Reserve(num_values));
+
+ const int num_decoded_previously = num_values_in_buffer_ - num_values_;
+ const uint8_t* data = data_ + num_decoded_previously;
+ int offset = 0;
+
+#if defined(ARROW_HAVE_SIMD_SPLIT)
+ // Use fast decoding into intermediate buffer. This will also decode
+ // some null values, but it's fast enough that we don't care.
+ T* decode_out = EnsureDecodeBuffer(values_decoded);
+ ::arrow::util::internal::ByteStreamSplitDecode<T>(data, values_decoded,
+ num_values_in_buffer_, decode_out);
+
+ // XXX If null_count is 0, we could even append in bulk or decode directly into
+ // builder
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ builder->UnsafeAppend(decode_out[offset]);
+ ++offset;
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+
+#else
+ VisitNullBitmapInline(
+ valid_bits, valid_bits_offset, num_values, null_count,
+ [&]() {
+ uint8_t gathered_byte_data[kNumStreams];
+ for (size_t b = 0; b < kNumStreams; ++b) {
+ const size_t byte_index = b * num_values_in_buffer_ + offset;
+ gathered_byte_data[b] = data[byte_index];
+ }
+ builder->UnsafeAppend(::arrow::util::SafeLoadAs<T>(&gathered_byte_data[0]));
+ ++offset;
+ },
+ [&]() { builder->UnsafeAppendNull(); });
+#endif
+
+ num_values_ -= values_decoded;
+ len_ -= sizeof(T) * values_decoded;
+ return values_decoded;
+}
+
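In the scalar (#else) path above, byte b of value i lives at offset b * num_values_in_buffer_ + i; that is, the page stores sizeof(T) contiguous byte streams of num_values bytes each. A standalone gather sketch for float (kNumStreams == 4):

#include <cstdint>
#include <cstring>
#include <vector>

// BYTE_STREAM_SPLIT sketch: reassemble floats whose bytes were scattered
// into sizeof(float) streams of 'num_values' bytes each.
std::vector<float> GatherByteStreamSplit(const uint8_t* data, int num_values) {
  std::vector<float> out(num_values);
  for (int i = 0; i < num_values; ++i) {
    uint8_t gathered[sizeof(float)];
    for (size_t b = 0; b < sizeof(float); ++b) {
      gathered[b] = data[b * num_values + i];  // byte b of value i
    }
    std::memcpy(&out[i], gathered, sizeof(float));  // like SafeLoadAs<T>
  }
  return out;
}
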
+template <typename DType>
+int ByteStreamSplitDecoder<DType>::DecodeArrow(
+ int num_values, int null_count, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) {
+ ParquetException::NYI("DecodeArrow for ByteStreamSplitDecoder");
+}
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// Encoder and decoder factory functions
+
+std::unique_ptr<Encoder> MakeEncoder(Type::type type_num, Encoding::type encoding,
+ bool use_dictionary, const ColumnDescriptor* descr,
+ MemoryPool* pool) {
+ if (use_dictionary) {
+ switch (type_num) {
+ case Type::INT32:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<Int32Type>(descr, pool));
+ case Type::INT64:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<Int64Type>(descr, pool));
+ case Type::INT96:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<Int96Type>(descr, pool));
+ case Type::FLOAT:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<FloatType>(descr, pool));
+ case Type::DOUBLE:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<DoubleType>(descr, pool));
+ case Type::BYTE_ARRAY:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<ByteArrayType>(descr, pool));
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::unique_ptr<Encoder>(new DictEncoderImpl<FLBAType>(descr, pool));
+ default:
+ DCHECK(false) << "Encoder not implemented";
+ break;
+ }
+ } else if (encoding == Encoding::PLAIN) {
+ switch (type_num) {
+ case Type::BOOLEAN:
+ return std::unique_ptr<Encoder>(new PlainEncoder<BooleanType>(descr, pool));
+ case Type::INT32:
+ return std::unique_ptr<Encoder>(new PlainEncoder<Int32Type>(descr, pool));
+ case Type::INT64:
+ return std::unique_ptr<Encoder>(new PlainEncoder<Int64Type>(descr, pool));
+ case Type::INT96:
+ return std::unique_ptr<Encoder>(new PlainEncoder<Int96Type>(descr, pool));
+ case Type::FLOAT:
+ return std::unique_ptr<Encoder>(new PlainEncoder<FloatType>(descr, pool));
+ case Type::DOUBLE:
+ return std::unique_ptr<Encoder>(new PlainEncoder<DoubleType>(descr, pool));
+ case Type::BYTE_ARRAY:
+ return std::unique_ptr<Encoder>(new PlainEncoder<ByteArrayType>(descr, pool));
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::unique_ptr<Encoder>(new PlainEncoder<FLBAType>(descr, pool));
+ default:
+ DCHECK(false) << "Encoder not implemented";
+ break;
+ }
+ } else if (encoding == Encoding::BYTE_STREAM_SPLIT) {
+ switch (type_num) {
+ case Type::FLOAT:
+ return std::unique_ptr<Encoder>(
+ new ByteStreamSplitEncoder<FloatType>(descr, pool));
+ case Type::DOUBLE:
+ return std::unique_ptr<Encoder>(
+ new ByteStreamSplitEncoder<DoubleType>(descr, pool));
+ default:
+ throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE");
+ break;
+ }
+ } else {
+ ParquetException::NYI("Selected encoding is not supported");
+ }
+ DCHECK(false) << "Should not be able to reach this code";
+ return nullptr;
+}
+
+std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
+ const ColumnDescriptor* descr) {
+ if (encoding == Encoding::PLAIN) {
+ switch (type_num) {
+ case Type::BOOLEAN:
+ return std::unique_ptr<Decoder>(new PlainBooleanDecoder(descr));
+ case Type::INT32:
+ return std::unique_ptr<Decoder>(new PlainDecoder<Int32Type>(descr));
+ case Type::INT64:
+ return std::unique_ptr<Decoder>(new PlainDecoder<Int64Type>(descr));
+ case Type::INT96:
+ return std::unique_ptr<Decoder>(new PlainDecoder<Int96Type>(descr));
+ case Type::FLOAT:
+ return std::unique_ptr<Decoder>(new PlainDecoder<FloatType>(descr));
+ case Type::DOUBLE:
+ return std::unique_ptr<Decoder>(new PlainDecoder<DoubleType>(descr));
+ case Type::BYTE_ARRAY:
+ return std::unique_ptr<Decoder>(new PlainByteArrayDecoder(descr));
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::unique_ptr<Decoder>(new PlainFLBADecoder(descr));
+ default:
+ break;
+ }
+ } else if (encoding == Encoding::BYTE_STREAM_SPLIT) {
+ switch (type_num) {
+ case Type::FLOAT:
+ return std::unique_ptr<Decoder>(new ByteStreamSplitDecoder<FloatType>(descr));
+ case Type::DOUBLE:
+ return std::unique_ptr<Decoder>(new ByteStreamSplitDecoder<DoubleType>(descr));
+ default:
+ throw ParquetException("BYTE_STREAM_SPLIT only supports FLOAT and DOUBLE");
+ break;
+ }
+ } else {
+ ParquetException::NYI("Selected encoding is not supported");
+ }
+ DCHECK(false) << "Should not be able to reach this code";
+ return nullptr;
+}
+
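A hedged round-trip sketch using the factories above together with the typed wrappers declared in encoding.h; PLAIN INT32 is assumed not to need a ColumnDescriptor, so the defaulted NULLPTR descr is used:

#include <cstdint>
#include "parquet/encoding.h"

void PlainInt32RoundTrip() {
  auto encoder =
      parquet::MakeTypedEncoder<parquet::Int32Type>(parquet::Encoding::PLAIN);
  const int32_t values[] = {1, 2, 3, 4};
  encoder->Put(values, 4);
  auto buffer = encoder->FlushValues();  // std::shared_ptr<Buffer>

  auto decoder =
      parquet::MakeTypedDecoder<parquet::Int32Type>(parquet::Encoding::PLAIN);
  decoder->SetData(4, buffer->data(), static_cast<int>(buffer->size()));
  int32_t out[4];
  int decoded = decoder->Decode(out, 4);  // decoded == 4; out == {1, 2, 3, 4}
  (void)decoded;
}
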
+namespace detail {
+std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
+ const ColumnDescriptor* descr,
+ MemoryPool* pool) {
+ switch (type_num) {
+ case Type::BOOLEAN:
+ ParquetException::NYI("Dictionary encoding not implemented for boolean type");
+ case Type::INT32:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<Int32Type>(descr, pool));
+ case Type::INT64:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<Int64Type>(descr, pool));
+ case Type::INT96:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<Int96Type>(descr, pool));
+ case Type::FLOAT:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<FloatType>(descr, pool));
+ case Type::DOUBLE:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<DoubleType>(descr, pool));
+ case Type::BYTE_ARRAY:
+ return std::unique_ptr<Decoder>(new DictByteArrayDecoderImpl(descr, pool));
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::unique_ptr<Decoder>(new DictDecoderImpl<FLBAType>(descr, pool));
+ default:
+ break;
+ }
+ DCHECK(false) << "Should not be able to reach this code";
+ return nullptr;
+}
+
+} // namespace detail
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encoding.h b/contrib/libs/apache/arrow/cpp/src/parquet/encoding.h
index bf5446e0174..b9ca7a7ee68 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encoding.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encoding.h
@@ -1,460 +1,460 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <vector>
-
-#include "arrow/util/spaced.h"
-
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/types.h"
-
-namespace arrow {
-
-class Array;
-class ArrayBuilder;
-class BinaryArray;
-class BinaryBuilder;
-class BooleanBuilder;
-class Int32Type;
-class Int64Type;
-class FloatType;
-class DoubleType;
-class FixedSizeBinaryType;
-template <typename T>
-class NumericBuilder;
-class FixedSizeBinaryBuilder;
-template <typename T>
-class Dictionary32Builder;
-
-} // namespace arrow
-
-namespace parquet {
-
-template <typename DType>
-class TypedEncoder;
-
-using BooleanEncoder = TypedEncoder<BooleanType>;
-using Int32Encoder = TypedEncoder<Int32Type>;
-using Int64Encoder = TypedEncoder<Int64Type>;
-using Int96Encoder = TypedEncoder<Int96Type>;
-using FloatEncoder = TypedEncoder<FloatType>;
-using DoubleEncoder = TypedEncoder<DoubleType>;
-using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
-using FLBAEncoder = TypedEncoder<FLBAType>;
-
-template <typename DType>
-class TypedDecoder;
-
-class BooleanDecoder;
-using Int32Decoder = TypedDecoder<Int32Type>;
-using Int64Decoder = TypedDecoder<Int64Type>;
-using Int96Decoder = TypedDecoder<Int96Type>;
-using FloatDecoder = TypedDecoder<FloatType>;
-using DoubleDecoder = TypedDecoder<DoubleType>;
-using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
-class FLBADecoder;
-
-template <typename T>
-struct EncodingTraits;
-
-template <>
-struct EncodingTraits<BooleanType> {
- using Encoder = BooleanEncoder;
- using Decoder = BooleanDecoder;
-
- using ArrowType = ::arrow::BooleanType;
- using Accumulator = ::arrow::BooleanBuilder;
- struct DictAccumulator {};
-};
-
-template <>
-struct EncodingTraits<Int32Type> {
- using Encoder = Int32Encoder;
- using Decoder = Int32Decoder;
-
- using ArrowType = ::arrow::Int32Type;
- using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
-};
-
-template <>
-struct EncodingTraits<Int64Type> {
- using Encoder = Int64Encoder;
- using Decoder = Int64Decoder;
-
- using ArrowType = ::arrow::Int64Type;
- using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
-};
-
-template <>
-struct EncodingTraits<Int96Type> {
- using Encoder = Int96Encoder;
- using Decoder = Int96Decoder;
-
- struct Accumulator {};
- struct DictAccumulator {};
-};
-
-template <>
-struct EncodingTraits<FloatType> {
- using Encoder = FloatEncoder;
- using Decoder = FloatDecoder;
-
- using ArrowType = ::arrow::FloatType;
- using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
-};
-
-template <>
-struct EncodingTraits<DoubleType> {
- using Encoder = DoubleEncoder;
- using Decoder = DoubleDecoder;
-
- using ArrowType = ::arrow::DoubleType;
- using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
-};
-
-template <>
-struct EncodingTraits<ByteArrayType> {
- using Encoder = ByteArrayEncoder;
- using Decoder = ByteArrayDecoder;
-
- /// \brief Internal helper class for decoding BYTE_ARRAY data where we can
- /// overflow the capacity of a single arrow::BinaryArray
- struct Accumulator {
- std::unique_ptr<::arrow::BinaryBuilder> builder;
- std::vector<std::shared_ptr<::arrow::Array>> chunks;
- };
- using ArrowType = ::arrow::BinaryType;
- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
-};
-
-template <>
-struct EncodingTraits<FLBAType> {
- using Encoder = FLBAEncoder;
- using Decoder = FLBADecoder;
-
- using ArrowType = ::arrow::FixedSizeBinaryType;
- using Accumulator = ::arrow::FixedSizeBinaryBuilder;
- using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
-};
-
-class ColumnDescriptor;
-
-// Untyped base for all encoders
-class Encoder {
- public:
- virtual ~Encoder() = default;
-
- virtual int64_t EstimatedDataEncodedSize() = 0;
- virtual std::shared_ptr<Buffer> FlushValues() = 0;
- virtual Encoding::type encoding() const = 0;
-
- virtual void Put(const ::arrow::Array& values) = 0;
-
- virtual MemoryPool* memory_pool() const = 0;
-};
-
-// Base class for value encoders. Since encoders may or may not have state (e.g.,
-// dictionary encoding) we use a class instance to maintain any state.
-//
-// Encode interfaces are internal, subject to change without deprecation.
-template <typename DType>
-class TypedEncoder : virtual public Encoder {
- public:
- typedef typename DType::c_type T;
-
- using Encoder::Put;
-
- virtual void Put(const T* src, int num_values) = 0;
-
- virtual void Put(const std::vector<T>& src, int num_values = -1);
-
- virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
- int64_t valid_bits_offset) = 0;
-};
-
-template <typename DType>
-void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
- if (num_values == -1) {
- num_values = static_cast<int>(src.size());
- }
- Put(src.data(), num_values);
-}
-
-template <>
-inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
- // NOTE(wesm): This stub is here only to satisfy the compiler; it is
- // overridden later with the actual implementation
-}
-
-// Base class for dictionary encoders
-template <typename DType>
-class DictEncoder : virtual public TypedEncoder<DType> {
- public:
- /// Writes out any buffered indices to buffer preceded by the bit width of this data.
- /// Returns the number of bytes written.
- /// If the supplied buffer is not big enough, returns -1.
- /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
- /// to size buffer.
- virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;
-
- virtual int dict_encoded_size() = 0;
- // virtual int dict_encoded_size() { return dict_encoded_size_; }
-
- virtual int bit_width() const = 0;
-
- /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
- /// dict_encoded_size() bytes.
- virtual void WriteDict(uint8_t* buffer) = 0;
-
- virtual int num_entries() const = 0;
-
- /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
- /// assumed (without any bounds-checking) that the indices reference
- /// pre-existing dictionary values
- /// \param[in] indices the dictionary index values. Only Int32Array currently
- /// supported
- virtual void PutIndices(const ::arrow::Array& indices) = 0;
-
- /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
- /// separately. Currently throws an exception if the current dictionary memo is
- /// non-empty
- /// \param[in] values the dictionary values. Only valid for certain
- /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
- virtual void PutDictionary(const ::arrow::Array& values) = 0;
-};
-
-// ----------------------------------------------------------------------
-// Value decoding
-
-class Decoder {
- public:
- virtual ~Decoder() = default;
-
- // Sets the data for a new page. This will be called multiple times on the same
- // decoder and should reset all internal state.
- virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
-
- // Returns the number of values left (for the last call to SetData()). This is
- // the number of values left in this page.
- virtual int values_left() const = 0;
- virtual Encoding::type encoding() const = 0;
-};
-
-template <typename DType>
-class TypedDecoder : virtual public Decoder {
- public:
- using T = typename DType::c_type;
-
- /// \brief Decode values into a buffer
- ///
- /// Subclasses may override the more specialized Decode methods below.
- ///
- /// \param[in] buffer destination for decoded values
- /// \param[in] max_values maximum number of values to decode
- /// \return The number of values decoded. Should be identical to max_values except
- /// at the end of the current data page.
- virtual int Decode(T* buffer, int max_values) = 0;
-
- /// \brief Decode the values in this data page but leave spaces for null entries.
- ///
- /// \param[in] buffer destination for decoded values
- /// \param[in] num_values size of the def_levels and buffer arrays including the number
- /// of null slots
- /// \param[in] null_count number of null slots
- /// \param[in] valid_bits bitmap data indicating position of valid slots
- /// \param[in] valid_bits_offset offset into valid_bits
- /// \return The number of values decoded, including nulls.
- virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
- const uint8_t* valid_bits, int64_t valid_bits_offset) {
- if (null_count > 0) {
- int values_to_read = num_values - null_count;
- int values_read = Decode(buffer, values_to_read);
- if (values_read != values_to_read) {
- throw ParquetException("Number of values / definition_levels read did not match");
- }
-
- return ::arrow::util::internal::SpacedExpand<T>(buffer, num_values, null_count,
- valid_bits, valid_bits_offset);
- } else {
- return Decode(buffer, num_values);
- }
- }
-
- /// \brief Decode into an ArrayBuilder or other accumulator
- ///
- /// This function assumes the definition levels were already decoded
- /// as a validity bitmap in the given `valid_bits`. `null_count`
- /// is the number of 0s in `valid_bits`.
- /// As a space optimization, it is allowed for `valid_bits` to be null
- /// if `null_count` is zero.
- ///
- /// \return number of values decoded
- virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::Accumulator* out) = 0;
-
- /// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
- ///
- /// \return number of values decoded
- int DecodeArrowNonNull(int num_values,
- typename EncodingTraits<DType>::Accumulator* out) {
- return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
- }
-
- /// \brief Decode into a DictionaryBuilder
- ///
- /// This function assumes the definition levels were already decoded
- /// as a validity bitmap in the given `valid_bits`. `null_count`
- /// is the number of 0s in `valid_bits`.
- /// As a space optimization, it is allowed for `valid_bits` to be null
- /// if `null_count` is zero.
- ///
- /// \return number of values decoded
- virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- typename EncodingTraits<DType>::DictAccumulator* builder) = 0;
-
- /// \brief Decode into a DictionaryBuilder ignoring nulls
- ///
- /// \return number of values decoded
- int DecodeArrowNonNull(int num_values,
- typename EncodingTraits<DType>::DictAccumulator* builder) {
- return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder);
- }
-};
-
-template <typename DType>
-class DictDecoder : virtual public TypedDecoder<DType> {
- public:
- using T = typename DType::c_type;
-
- virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;
-
- /// \brief Insert dictionary values into the Arrow dictionary builder's memo,
- /// but do not append any indices
- virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;
-
- /// \brief Decode only dictionary indices and append to dictionary
- /// builder. The builder must have had the dictionary from this decoder
- /// inserted already.
- ///
- /// \warning Remember to reset the builder each time the dict decoder is initialized
- /// with a new dictionary page
- virtual int DecodeIndicesSpaced(int num_values, int null_count,
- const uint8_t* valid_bits, int64_t valid_bits_offset,
- ::arrow::ArrayBuilder* builder) = 0;
-
- /// \brief Decode only dictionary indices (no nulls)
- ///
- /// \warning Remember to reset the builder each time the dict decoder is initialized
- /// with a new dictionary page
- virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;
-
- /// \brief Decode only dictionary indices (no nulls). Same as above
- /// DecodeIndices but target is an array instead of a builder.
- ///
- /// \note API EXPERIMENTAL
- virtual int DecodeIndices(int num_values, int32_t* indices) = 0;
-
- /// \brief Get dictionary. The reader will call this API when it encounters a
- /// new dictionary.
- ///
- /// @param[out] dictionary The pointer to dictionary values. Dictionary is owned by
- /// the decoder and is destroyed when the decoder is destroyed.
- /// @param[out] dictionary_length The dictionary length.
- ///
- /// \note API EXPERIMENTAL
- virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
-};
-
-// ----------------------------------------------------------------------
-// TypedEncoder specializations, traits, and factory functions
-
-class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
- public:
- using TypedDecoder<BooleanType>::Decode;
- virtual int Decode(uint8_t* buffer, int max_values) = 0;
-};
-
-class FLBADecoder : virtual public TypedDecoder<FLBAType> {
- public:
- using TypedDecoder<FLBAType>::DecodeSpaced;
-
- // TODO(wesm): As a possible follow-up to PARQUET-1508, we should examine if
- // there is value in adding specialized read methods for
- // FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
- // then perhaps not
-};
-
-PARQUET_EXPORT
-std::unique_ptr<Encoder> MakeEncoder(
- Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
- const ColumnDescriptor* descr = NULLPTR,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
-template <typename DType>
-std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
- Encoding::type encoding, bool use_dictionary = false,
- const ColumnDescriptor* descr = NULLPTR,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
- using OutType = typename EncodingTraits<DType>::Encoder;
- std::unique_ptr<Encoder> base =
- MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
- return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
-}
-
-PARQUET_EXPORT
-std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
- const ColumnDescriptor* descr = NULLPTR);
-
-namespace detail {
-
-PARQUET_EXPORT
-std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
- const ColumnDescriptor* descr,
- ::arrow::MemoryPool* pool);
-
-} // namespace detail
-
-template <typename DType>
-std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
- const ColumnDescriptor* descr = NULLPTR,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
- using OutType = DictDecoder<DType>;
- auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
- return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
-}
-
-template <typename DType>
-std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
- Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) {
- using OutType = typename EncodingTraits<DType>::Decoder;
- std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr);
- return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "arrow/util/spaced.h"
+
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+class ArrayBuilder;
+class BinaryArray;
+class BinaryBuilder;
+class BooleanBuilder;
+class Int32Type;
+class Int64Type;
+class FloatType;
+class DoubleType;
+class FixedSizeBinaryType;
+template <typename T>
+class NumericBuilder;
+class FixedSizeBinaryBuilder;
+template <typename T>
+class Dictionary32Builder;
+
+} // namespace arrow
+
+namespace parquet {
+
+template <typename DType>
+class TypedEncoder;
+
+using BooleanEncoder = TypedEncoder<BooleanType>;
+using Int32Encoder = TypedEncoder<Int32Type>;
+using Int64Encoder = TypedEncoder<Int64Type>;
+using Int96Encoder = TypedEncoder<Int96Type>;
+using FloatEncoder = TypedEncoder<FloatType>;
+using DoubleEncoder = TypedEncoder<DoubleType>;
+using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
+using FLBAEncoder = TypedEncoder<FLBAType>;
+
+template <typename DType>
+class TypedDecoder;
+
+class BooleanDecoder;
+using Int32Decoder = TypedDecoder<Int32Type>;
+using Int64Decoder = TypedDecoder<Int64Type>;
+using Int96Decoder = TypedDecoder<Int96Type>;
+using FloatDecoder = TypedDecoder<FloatType>;
+using DoubleDecoder = TypedDecoder<DoubleType>;
+using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
+class FLBADecoder;
+
+template <typename T>
+struct EncodingTraits;
+
+template <>
+struct EncodingTraits<BooleanType> {
+ using Encoder = BooleanEncoder;
+ using Decoder = BooleanDecoder;
+
+ using ArrowType = ::arrow::BooleanType;
+ using Accumulator = ::arrow::BooleanBuilder;
+ struct DictAccumulator {};
+};
+
+template <>
+struct EncodingTraits<Int32Type> {
+ using Encoder = Int32Encoder;
+ using Decoder = Int32Decoder;
+
+ using ArrowType = ::arrow::Int32Type;
+ using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
+};
+
+template <>
+struct EncodingTraits<Int64Type> {
+ using Encoder = Int64Encoder;
+ using Decoder = Int64Decoder;
+
+ using ArrowType = ::arrow::Int64Type;
+ using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
+};
+
+template <>
+struct EncodingTraits<Int96Type> {
+ using Encoder = Int96Encoder;
+ using Decoder = Int96Decoder;
+
+ struct Accumulator {};
+ struct DictAccumulator {};
+};
+
+template <>
+struct EncodingTraits<FloatType> {
+ using Encoder = FloatEncoder;
+ using Decoder = FloatDecoder;
+
+ using ArrowType = ::arrow::FloatType;
+ using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
+};
+
+template <>
+struct EncodingTraits<DoubleType> {
+ using Encoder = DoubleEncoder;
+ using Decoder = DoubleDecoder;
+
+ using ArrowType = ::arrow::DoubleType;
+ using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
+};
+
+template <>
+struct EncodingTraits<ByteArrayType> {
+ using Encoder = ByteArrayEncoder;
+ using Decoder = ByteArrayDecoder;
+
+ /// \brief Internal helper class for decoding BYTE_ARRAY data where we can
+ /// overflow the capacity of a single arrow::BinaryArray
+ struct Accumulator {
+ std::unique_ptr<::arrow::BinaryBuilder> builder;
+ std::vector<std::shared_ptr<::arrow::Array>> chunks;
+ };
+ using ArrowType = ::arrow::BinaryType;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
+};
+
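The Accumulator above exists because int32 offsets limit a single arrow::BinaryArray to roughly 2 GiB of data: when a value no longer fits, the live builder is finished into `chunks` and restarted (the PushChunk pattern in encoding.cc). A hedged sketch of draining it once decoding ends:

#include <memory>
#include <utility>
#include <vector>
#include "arrow/array/builder_binary.h"
#include "parquet/encoding.h"
#include "parquet/exception.h"

// Sketch: flush the live builder and return all finished chunks.
std::vector<std::shared_ptr<::arrow::Array>> FinishAccumulator(
    parquet::EncodingTraits<parquet::ByteArrayType>::Accumulator* acc) {
  std::shared_ptr<::arrow::Array> last;
  PARQUET_THROW_NOT_OK(acc->builder->Finish(&last));
  acc->chunks.push_back(std::move(last));
  return std::move(acc->chunks);
}
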
+template <>
+struct EncodingTraits<FLBAType> {
+ using Encoder = FLBAEncoder;
+ using Decoder = FLBADecoder;
+
+ using ArrowType = ::arrow::FixedSizeBinaryType;
+ using Accumulator = ::arrow::FixedSizeBinaryBuilder;
+ using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
+};
+
+class ColumnDescriptor;
+
+// Untyped base for all encoders
+class Encoder {
+ public:
+ virtual ~Encoder() = default;
+
+ virtual int64_t EstimatedDataEncodedSize() = 0;
+ virtual std::shared_ptr<Buffer> FlushValues() = 0;
+ virtual Encoding::type encoding() const = 0;
+
+ virtual void Put(const ::arrow::Array& values) = 0;
+
+ virtual MemoryPool* memory_pool() const = 0;
+};
+
+// Base class for value encoders. Since encoders may or may not have state (e.g.,
+// dictionary encoding) we use a class instance to maintain any state.
+//
+// Encode interfaces are internal, subject to change without deprecation.
+template <typename DType>
+class TypedEncoder : virtual public Encoder {
+ public:
+ typedef typename DType::c_type T;
+
+ using Encoder::Put;
+
+ virtual void Put(const T* src, int num_values) = 0;
+
+ virtual void Put(const std::vector<T>& src, int num_values = -1);
+
+ virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset) = 0;
+};
+
+template <typename DType>
+void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
+ if (num_values == -1) {
+ num_values = static_cast<int>(src.size());
+ }
+ Put(src.data(), num_values);
+}
+
+template <>
+inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
+ // NOTE(wesm): This stub is here only to satisfy the compiler; it is
+ // overridden later with the actual implementation
+}
+
+// Base class for dictionary encoders
+template <typename DType>
+class DictEncoder : virtual public TypedEncoder<DType> {
+ public:
+ /// Writes out any buffered indices to buffer preceded by the bit width of this data.
+ /// Returns the number of bytes written.
+ /// If the supplied buffer is not big enough, returns -1.
+ /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
+ /// to size buffer.
+ virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;
+
+ virtual int dict_encoded_size() = 0;
+ // virtual int dict_encoded_size() { return dict_encoded_size_; }
+
+ virtual int bit_width() const = 0;
+
+ /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
+ /// dict_encoded_size() bytes.
+ virtual void WriteDict(uint8_t* buffer) = 0;
+
+ virtual int num_entries() const = 0;
+
+ /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
+ /// assumed (without any bounds-checking) that the indices reference
+ /// pre-existing dictionary values
+ /// \param[in] indices the dictionary index values. Only Int32Array currently
+ /// supported
+ virtual void PutIndices(const ::arrow::Array& indices) = 0;
+
+ /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
+ /// separately. Currently throws an exception if the current dictionary memo is
+ /// non-empty
+ /// \param[in] values the dictionary values. Only valid for certain
+ /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
+ virtual void PutDictionary(const ::arrow::Array& values) = 0;
+};
+
+// ----------------------------------------------------------------------
+// Value decoding
+
+class Decoder {
+ public:
+ virtual ~Decoder() = default;
+
+ // Sets the data for a new page. This will be called multiple times on the same
+ // decoder and should reset all internal state.
+ virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
+
+ // Returns the number of values left (for the last call to SetData()). This is
+ // the number of values left in this page.
+ virtual int values_left() const = 0;
+ virtual Encoding::type encoding() const = 0;
+};
+
+template <typename DType>
+class TypedDecoder : virtual public Decoder {
+ public:
+ using T = typename DType::c_type;
+
+ /// \brief Decode values into a buffer
+ ///
+ /// Subclasses may override the more specialized Decode methods below.
+ ///
+ /// \param[in] buffer destination for decoded values
+ /// \param[in] max_values maximum number of values to decode
+ /// \return The number of values decoded. Should be identical to max_values except
+ /// at the end of the current data page.
+ virtual int Decode(T* buffer, int max_values) = 0;
+
+ /// \brief Decode the values in this data page but leave spaces for null entries.
+ ///
+ /// \param[in] buffer destination for decoded values
+ /// \param[in] num_values size of the def_levels and buffer arrays including the number
+ /// of null slots
+ /// \param[in] null_count number of null slots
+ /// \param[in] valid_bits bitmap data indicating position of valid slots
+ /// \param[in] valid_bits_offset offset into valid_bits
+ /// \return The number of values decoded, including nulls.
+ virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset) {
+ if (null_count > 0) {
+ int values_to_read = num_values - null_count;
+ int values_read = Decode(buffer, values_to_read);
+ if (values_read != values_to_read) {
+ throw ParquetException("Number of values / definition_levels read did not match");
+ }
+
+ return ::arrow::util::internal::SpacedExpand<T>(buffer, num_values, null_count,
+ valid_bits, valid_bits_offset);
+ } else {
+ return Decode(buffer, num_values);
+ }
+ }
+
+ /// \brief Decode into an ArrayBuilder or other accumulator
+ ///
+ /// This function assumes the definition levels were already decoded
+ /// as a validity bitmap in the given `valid_bits`. `null_count`
+ /// is the number of 0s in `valid_bits`.
+ /// As a space optimization, it is allowed for `valid_bits` to be null
+ /// if `null_count` is zero.
+ ///
+ /// \return number of values decoded
+ virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::Accumulator* out) = 0;
+
+ /// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
+ ///
+ /// \return number of values decoded
+ int DecodeArrowNonNull(int num_values,
+ typename EncodingTraits<DType>::Accumulator* out) {
+ return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
+ }
+
+ /// \brief Decode into a DictionaryBuilder
+ ///
+ /// This function assumes the definition levels were already decoded
+ /// as a validity bitmap in the given `valid_bits`. `null_count`
+ /// is the number of 0s in `valid_bits`.
+ /// As a space optimization, it is allowed for `valid_bits` to be null
+ /// if `null_count` is zero.
+ ///
+ /// \return number of values decoded
+ virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ typename EncodingTraits<DType>::DictAccumulator* builder) = 0;
+
+ /// \brief Decode into a DictionaryBuilder ignoring nulls
+ ///
+ /// \return number of values decoded
+ int DecodeArrowNonNull(int num_values,
+ typename EncodingTraits<DType>::DictAccumulator* builder) {
+ return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder);
+ }
+};
+
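DecodeSpaced above decodes the non-null values densely at the front of the buffer, then SpacedExpand spreads them to their final slots. Expanding in place must run back-to-front so a dense value is never overwritten before it is read; a standalone sketch of that step:

#include <vector>

// Sketch of the SpacedExpand step: 'buffer' holds the dense (non-null)
// values at the front; spread them to the slots whose validity flag is set.
template <typename T>
void SpacedExpandSketch(T* buffer, int num_values, const std::vector<bool>& valid) {
  int src = -1;
  for (bool v : valid) src += v;  // index of the last dense value
  for (int dst = num_values - 1; dst >= 0; --dst) {
    if (valid[dst]) {
      buffer[dst] = buffer[src--];  // move a dense value into its spaced slot
    } else {
      buffer[dst] = T{};  // null slot; the real code leaves it unspecified
    }
  }
}
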
+template <typename DType>
+class DictDecoder : virtual public TypedDecoder<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;
+
+ /// \brief Insert dictionary values into the Arrow dictionary builder's memo,
+ /// but do not append any indices
+ virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;
+
+ /// \brief Decode only dictionary indices and append to dictionary
+ /// builder. The builder must have had the dictionary from this decoder
+ /// inserted already.
+ ///
+ /// \warning Remember to reset the builder each time the dict decoder is initialized
+ /// with a new dictionary page
+ virtual int DecodeIndicesSpaced(int num_values, int null_count,
+ const uint8_t* valid_bits, int64_t valid_bits_offset,
+ ::arrow::ArrayBuilder* builder) = 0;
+
+ /// \brief Decode only dictionary indices (no nulls)
+ ///
+ /// \warning Remember to reset the builder each time the dict decoder is initialized
+ /// with a new dictionary page
+ virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;
+
+ /// \brief Decode only dictionary indices (no nulls). Same as above
+ /// DecodeIndices but target is an array instead of a builder.
+ ///
+ /// \note API EXPERIMENTAL
+ virtual int DecodeIndices(int num_values, int32_t* indices) = 0;
+
+ /// \brief Get dictionary. The reader will call this API when it encounters a
+ /// new dictionary.
+ ///
+ /// @param[out] dictionary The pointer to dictionary values. Dictionary is owned by
+ /// the decoder and is destroyed when the decoder is destroyed.
+ /// @param[out] dictionary_length The dictionary length.
+ ///
+ /// \note API EXPERIMENTAL
+ virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
+};
+
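A typical flow for the DictDecoder interface above: load the dictionary values, set the data page of packed indices, memoize the dictionary into the builder once, then append only indices. A hedged sketch, where the PLAIN decoder holding the dictionary page is assumed to be prepared elsewhere:

#include <cstdint>
#include "arrow/array/builder_dict.h"
#include "parquet/encoding.h"

// Sketch: 'dict_page_decoder' already holds the dictionary page's values.
void DecodeDictColumn(parquet::TypedDecoder<parquet::ByteArrayType>* dict_page_decoder,
                      const uint8_t* data_page, int data_len, int num_values,
                      ::arrow::BinaryDictionary32Builder* builder) {
  auto decoder = parquet::MakeDictDecoder<parquet::ByteArrayType>();
  decoder->SetDict(dict_page_decoder);                // decode dictionary values
  decoder->SetData(num_values, data_page, data_len);  // RLE-packed indices
  decoder->InsertDictionary(builder);                 // once per dictionary page
  decoder->DecodeIndices(num_values, builder);        // then indices only
}
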
+// ----------------------------------------------------------------------
+// TypedEncoder specializations, traits, and factory functions
+
+class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
+ public:
+ using TypedDecoder<BooleanType>::Decode;
+ virtual int Decode(uint8_t* buffer, int max_values) = 0;
+};
+
+class FLBADecoder : virtual public TypedDecoder<FLBAType> {
+ public:
+ using TypedDecoder<FLBAType>::DecodeSpaced;
+
+ // TODO(wesm): As a possible follow-up to PARQUET-1508, we should examine if
+ // there is value in adding specialized read methods for
+ // FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
+ // then perhaps not
+};
+
+PARQUET_EXPORT
+std::unique_ptr<Encoder> MakeEncoder(
+ Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
+ const ColumnDescriptor* descr = NULLPTR,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+template <typename DType>
+std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
+ Encoding::type encoding, bool use_dictionary = false,
+ const ColumnDescriptor* descr = NULLPTR,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+ using OutType = typename EncodingTraits<DType>::Encoder;
+ std::unique_ptr<Encoder> base =
+ MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
+ return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
+}
+
+PARQUET_EXPORT
+std::unique_ptr<Decoder> MakeDecoder(Type::type type_num, Encoding::type encoding,
+ const ColumnDescriptor* descr = NULLPTR);
+
+namespace detail {
+
+PARQUET_EXPORT
+std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
+ const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool);
+
+} // namespace detail
+
+template <typename DType>
+std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
+ const ColumnDescriptor* descr = NULLPTR,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+ using OutType = DictDecoder<DType>;
+ auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
+ return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
+}
+
+template <typename DType>
+std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
+ Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR) {
+ using OutType = typename EncodingTraits<DType>::Decoder;
+ std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr);
+ return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
+}
+
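Note that MakeTypedDecoder and MakeDictDecoder above downcast with dynamic_cast on the released base pointer: if the DType/encoding pairing does not match what the untyped factory built, the result is null and the base object leaks, so callers are expected to pass consistent arguments. A minimal call-site sketch:

#include "parquet/encoding.h"

// Sketch: BYTE_STREAM_SPLIT is only built for FLOAT and DOUBLE, so the
// template argument must be FloatType or DoubleType here.
void MakeFloatSplitDecoder() {
  auto decoder = parquet::MakeTypedDecoder<parquet::FloatType>(
      parquet::Encoding::BYTE_STREAM_SPLIT);
  // decoder->SetData(...) / decoder->DecodeArrow(...) as with any TypedDecoder.
}
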
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc
index 829b0e778f1..5927503aba3 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.cc
@@ -1,412 +1,412 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/encryption/encryption.h"
-
-#include <string.h>
-
-#include <map>
-#include <utility>
-
-#include "arrow/util/logging.h"
-#include "arrow/util/utf8.h"
-#include "parquet/encryption/encryption_internal.h"
-
-namespace parquet {
-
-// integer key retriever
-void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) {
- key_map_.insert({key_id, key});
-}
-
-std::string IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) {
- uint32_t key_id;
- memcpy(reinterpret_cast<uint8_t*>(&key_id), key_metadata.c_str(), 4);
-
- return key_map_.at(key_id);
-}
-
-// string key retriever
-void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) {
- key_map_.insert({key_id, key});
-}
-
-std::string StringKeyIdRetriever::GetKey(const std::string& key_id) {
- return key_map_.at(key_id);
-}
-
-ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key(
- std::string column_key) {
- if (column_key.empty()) return this;
-
- DCHECK(key_.empty());
- key_ = column_key;
- return this;
-}
-
-ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_metadata(
- const std::string& key_metadata) {
- DCHECK(!key_metadata.empty());
- DCHECK(key_metadata_.empty());
- key_metadata_ = key_metadata;
- return this;
-}
-
-ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id(
- const std::string& key_id) {
- // key_id is expected to be in UTF8 encoding
- ::arrow::util::InitializeUTF8();
- const uint8_t* data = reinterpret_cast<const uint8_t*>(key_id.c_str());
- if (!::arrow::util::ValidateUTF8(data, key_id.size())) {
- throw ParquetException("key id should be in UTF8 encoding");
- }
-
- DCHECK(!key_id.empty());
- this->key_metadata(key_id);
- return this;
-}
-
-FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_keys(
- const ColumnPathToDecryptionPropertiesMap& column_decryption_properties) {
- if (column_decryption_properties.size() == 0) return this;
-
- if (column_decryption_properties_.size() != 0)
- throw ParquetException("Column properties already set");
-
- for (const auto& element : column_decryption_properties) {
- if (element.second->is_utilized()) {
- throw ParquetException("Column properties utilized in another file");
- }
- element.second->set_utilized();
- }
-
- column_decryption_properties_ = column_decryption_properties;
- return this;
-}
-
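The column_keys builder above expects one ColumnDecryptionProperties per encrypted column path. A hedged usage sketch, assuming the Builder constructors and build() methods declared in encryption.h (not shown in this diff) and using placeholder key strings:

#include <memory>
#include <string>
#include "parquet/encryption/encryption.h"

// Sketch only: the key literals are placeholders; real AES keys must be
// 16, 24, or 32 bytes.
std::shared_ptr<parquet::FileDecryptionProperties> MakeDecryptionProps() {
  auto col_props = parquet::ColumnDecryptionProperties::Builder("secret_col")
                       .key("0123456789012345")
                       ->build();
  parquet::ColumnPathToDecryptionPropertiesMap column_keys;
  column_keys["secret_col"] = col_props;
  return parquet::FileDecryptionProperties::Builder()
      .footer_key("0123456789012345")
      ->column_keys(column_keys)
      ->build();
}
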
-void FileDecryptionProperties::WipeOutDecryptionKeys() {
- footer_key_.clear();
-
- for (const auto& element : column_decryption_properties_) {
- element.second->WipeOutDecryptionKey();
- }
-}
-
-bool FileDecryptionProperties::is_utilized() {
- if (footer_key_.empty() && column_decryption_properties_.size() == 0 &&
- aad_prefix_.empty())
- return false;
-
- return utilized_;
-}
-
-std::shared_ptr<FileDecryptionProperties> FileDecryptionProperties::DeepClone(
- std::string new_aad_prefix) {
- std::string footer_key_copy = footer_key_;
- ColumnPathToDecryptionPropertiesMap column_decryption_properties_map_copy;
-
- for (const auto& element : column_decryption_properties_) {
- column_decryption_properties_map_copy.insert(
- {element.second->column_path(), element.second->DeepClone()});
- }
-
- if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_;
- return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
- footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, new_aad_prefix,
- aad_prefix_verifier_, column_decryption_properties_map_copy,
- plaintext_files_allowed_));
-}
-
-FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::footer_key(
- const std::string footer_key) {
- if (footer_key.empty()) {
- return this;
- }
- DCHECK(footer_key_.empty());
- footer_key_ = footer_key;
- return this;
-}
-
-FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::key_retriever(
- const std::shared_ptr<DecryptionKeyRetriever>& key_retriever) {
- if (key_retriever == nullptr) return this;
-
- DCHECK(key_retriever_ == nullptr);
- key_retriever_ = key_retriever;
- return this;
-}
-
-FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix(
- const std::string& aad_prefix) {
- if (aad_prefix.empty()) {
- return this;
- }
- DCHECK(aad_prefix_.empty());
- aad_prefix_ = aad_prefix;
- return this;
-}
-
-FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix_verifier(
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier) {
- if (aad_prefix_verifier == nullptr) return this;
-
- DCHECK(aad_prefix_verifier_ == nullptr);
- aad_prefix_verifier_ = std::move(aad_prefix_verifier);
- return this;
-}
-
-ColumnDecryptionProperties::Builder* ColumnDecryptionProperties::Builder::key(
- const std::string& key) {
- if (key.empty()) return this;
-
- DCHECK(key_.empty());
- key_ = key;
- return this;
-}
-
-std::shared_ptr<ColumnDecryptionProperties> ColumnDecryptionProperties::Builder::build() {
- return std::shared_ptr<ColumnDecryptionProperties>(
- new ColumnDecryptionProperties(column_path_, key_));
-}
-
-void ColumnDecryptionProperties::WipeOutDecryptionKey() { key_.clear(); }
-
-std::shared_ptr<ColumnDecryptionProperties> ColumnDecryptionProperties::DeepClone() {
- std::string key_copy = key_;
- return std::shared_ptr<ColumnDecryptionProperties>(
- new ColumnDecryptionProperties(column_path_, key_copy));
-}
-
-FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_metadata(
- const std::string& footer_key_metadata) {
- if (footer_key_metadata.empty()) return this;
-
- DCHECK(footer_key_metadata_.empty());
- footer_key_metadata_ = footer_key_metadata;
- return this;
-}
-
-FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::encrypted_columns(
- const ColumnPathToEncryptionPropertiesMap& encrypted_columns) {
- if (encrypted_columns.size() == 0) return this;
-
- if (encrypted_columns_.size() != 0)
- throw ParquetException("Column properties already set");
-
- for (const auto& element : encrypted_columns) {
- if (element.second->is_utilized()) {
- throw ParquetException("Column properties utilized in another file");
- }
- element.second->set_utilized();
- }
- encrypted_columns_ = encrypted_columns;
- return this;
-}
-
-void FileEncryptionProperties::WipeOutEncryptionKeys() {
- footer_key_.clear();
- for (const auto& element : encrypted_columns_) {
- element.second->WipeOutEncryptionKey();
- }
-}
-
-std::shared_ptr<FileEncryptionProperties> FileEncryptionProperties::DeepClone(
- std::string new_aad_prefix) {
- std::string footer_key_copy = footer_key_;
- ColumnPathToEncryptionPropertiesMap encrypted_columns_map_copy;
-
- for (const auto& element : encrypted_columns_) {
- encrypted_columns_map_copy.insert(
- {element.second->column_path(), element.second->DeepClone()});
- }
-
- if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_;
- return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
- algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_,
- new_aad_prefix, store_aad_prefix_in_file_, encrypted_columns_map_copy));
-}
-
-FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::aad_prefix(
- const std::string& aad_prefix) {
- if (aad_prefix.empty()) return this;
-
- DCHECK(aad_prefix_.empty());
- aad_prefix_ = aad_prefix;
- store_aad_prefix_in_file_ = true;
- return this;
-}
-
-FileEncryptionProperties::Builder*
-FileEncryptionProperties::Builder::disable_aad_prefix_storage() {
- DCHECK(!aad_prefix_.empty());
-
- store_aad_prefix_in_file_ = false;
- return this;
-}
-
-ColumnEncryptionProperties::ColumnEncryptionProperties(bool encrypted,
- const std::string& column_path,
- const std::string& key,
- const std::string& key_metadata)
- : column_path_(column_path) {
- // column encryption properties object (with a column key) can be used for writing only
- // one file.
- // Upon completion of file writing, the encryption keys in the properties will be wiped
- // out (set to 0 in memory).
- utilized_ = false;
-
- DCHECK(!column_path.empty());
- if (!encrypted) {
- DCHECK(key.empty() && key_metadata.empty());
- }
-
- if (!key.empty()) {
- DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
- }
-
- encrypted_with_footer_key_ = (encrypted && key.empty());
- if (encrypted_with_footer_key_) {
- DCHECK(key_metadata.empty());
- }
-
- encrypted_ = encrypted;
- key_metadata_ = key_metadata;
- key_ = key;
-}
-
-ColumnDecryptionProperties::ColumnDecryptionProperties(const std::string& column_path,
- const std::string& key)
- : column_path_(column_path) {
- utilized_ = false;
- DCHECK(!column_path.empty());
-
- if (!key.empty()) {
- DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
- }
-
- key_ = key;
-}
-
-std::string FileDecryptionProperties::column_key(const std::string& column_path) const {
- if (column_decryption_properties_.find(column_path) !=
- column_decryption_properties_.end()) {
- auto column_prop = column_decryption_properties_.at(column_path);
- if (column_prop != nullptr) {
- return column_prop->key();
- }
- }
- return empty_string_;
-}
-
-FileDecryptionProperties::FileDecryptionProperties(
- const std::string& footer_key, std::shared_ptr<DecryptionKeyRetriever> key_retriever,
- bool check_plaintext_footer_integrity, const std::string& aad_prefix,
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
- const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
- bool plaintext_files_allowed) {
- DCHECK(!footer_key.empty() || nullptr != key_retriever ||
- 0 != column_decryption_properties.size());
-
- if (!footer_key.empty()) {
- DCHECK(footer_key.length() == 16 || footer_key.length() == 24 ||
- footer_key.length() == 32);
- }
- if (footer_key.empty() && check_plaintext_footer_integrity) {
- DCHECK(nullptr != key_retriever);
- }
- aad_prefix_verifier_ = std::move(aad_prefix_verifier);
- footer_key_ = footer_key;
- check_plaintext_footer_integrity_ = check_plaintext_footer_integrity;
- key_retriever_ = std::move(key_retriever);
- aad_prefix_ = aad_prefix;
- column_decryption_properties_ = column_decryption_properties;
- plaintext_files_allowed_ = plaintext_files_allowed;
- utilized_ = false;
-}
-
-FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id(
- const std::string& key_id) {
- // key_id is expected to be in UTF8 encoding
- ::arrow::util::InitializeUTF8();
- const uint8_t* data = reinterpret_cast<const uint8_t*>(key_id.c_str());
- if (!::arrow::util::ValidateUTF8(data, key_id.size())) {
- throw ParquetException("footer key id should be in UTF8 encoding");
- }
-
- if (key_id.empty()) {
- return this;
- }
-
- return footer_key_metadata(key_id);
-}
-
-std::shared_ptr<ColumnEncryptionProperties>
-FileEncryptionProperties::column_encryption_properties(const std::string& column_path) {
- if (encrypted_columns_.size() == 0) {
- auto builder = std::make_shared<ColumnEncryptionProperties::Builder>(column_path);
- return builder->build();
- }
- if (encrypted_columns_.find(column_path) != encrypted_columns_.end()) {
- return encrypted_columns_[column_path];
- }
-
- return nullptr;
-}
-
-FileEncryptionProperties::FileEncryptionProperties(
- ParquetCipher::type cipher, const std::string& footer_key,
- const std::string& footer_key_metadata, bool encrypted_footer,
- const std::string& aad_prefix, bool store_aad_prefix_in_file,
- const ColumnPathToEncryptionPropertiesMap& encrypted_columns)
- : footer_key_(footer_key),
- footer_key_metadata_(footer_key_metadata),
- encrypted_footer_(encrypted_footer),
- aad_prefix_(aad_prefix),
- store_aad_prefix_in_file_(store_aad_prefix_in_file),
- encrypted_columns_(encrypted_columns) {
- // file encryption properties object can be used for writing only one file.
- // Upon completion of file writing, the encryption keys in the properties will be wiped
- // out (set to 0 in memory).
- utilized_ = false;
-
- DCHECK(!footer_key.empty());
- // footer_key must be either 16, 24 or 32 bytes.
- DCHECK(footer_key.length() == 16 || footer_key.length() == 24 ||
- footer_key.length() == 32);
-
- uint8_t aad_file_unique[kAadFileUniqueLength];
- memset(aad_file_unique, 0, kAadFileUniqueLength);
- encryption::RandBytes(aad_file_unique, kAadFileUniqueLength);
- std::string aad_file_unique_str(reinterpret_cast<char const*>(aad_file_unique),
- kAadFileUniqueLength);
-
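- // If an AAD prefix is used but not stored in the file, readers must supply it explicitly.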
- bool supply_aad_prefix = false;
- if (aad_prefix.empty()) {
- file_aad_ = aad_file_unique_str;
- } else {
- file_aad_ = aad_prefix + aad_file_unique_str;
- if (!store_aad_prefix_in_file) supply_aad_prefix = true;
- }
- algorithm_.algorithm = cipher;
- algorithm_.aad.aad_file_unique = aad_file_unique_str;
- algorithm_.aad.supply_aad_prefix = supply_aad_prefix;
- if (!aad_prefix.empty() && store_aad_prefix_in_file) {
- algorithm_.aad.aad_prefix = aad_prefix;
- }
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption/encryption.h"
+
+#include <string.h>
+
+#include <map>
+#include <utility>
+
+#include "arrow/util/logging.h"
+#include "arrow/util/utf8.h"
+#include "parquet/encryption/encryption_internal.h"
+
+namespace parquet {
+
+// integer key retriever
+void IntegerKeyIdRetriever::PutKey(uint32_t key_id, const std::string& key) {
+ key_map_.insert({key_id, key});
+}
+
+std::string IntegerKeyIdRetriever::GetKey(const std::string& key_metadata) {
+ uint32_t key_id;
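+ // Reads the first four bytes of key_metadata as a native-endian uint32 key id.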
+ memcpy(reinterpret_cast<uint8_t*>(&key_id), key_metadata.c_str(), 4);
+
+ return key_map_.at(key_id);
+}
+
+// string key retriever
+void StringKeyIdRetriever::PutKey(const std::string& key_id, const std::string& key) {
+ key_map_.insert({key_id, key});
+}
+
+std::string StringKeyIdRetriever::GetKey(const std::string& key_id) {
+ return key_map_.at(key_id);
+}
+
+ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key(
+ std::string column_key) {
+ if (column_key.empty()) return this;
+
+ DCHECK(key_.empty());
+ key_ = column_key;
+ return this;
+}
+
+ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_metadata(
+ const std::string& key_metadata) {
+ DCHECK(!key_metadata.empty());
+ DCHECK(key_metadata_.empty());
+ key_metadata_ = key_metadata;
+ return this;
+}
+
+ColumnEncryptionProperties::Builder* ColumnEncryptionProperties::Builder::key_id(
+ const std::string& key_id) {
+ // key_id is expected to be in UTF8 encoding
+ ::arrow::util::InitializeUTF8();
+ const uint8_t* data = reinterpret_cast<const uint8_t*>(key_id.c_str());
+ if (!::arrow::util::ValidateUTF8(data, key_id.size())) {
+ throw ParquetException("key id should be in UTF8 encoding");
+ }
+
+ DCHECK(!key_id.empty());
+ this->key_metadata(key_id);
+ return this;
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::column_keys(
+ const ColumnPathToDecryptionPropertiesMap& column_decryption_properties) {
+ if (column_decryption_properties.size() == 0) return this;
+
+ if (column_decryption_properties_.size() != 0)
+ throw ParquetException("Column properties already set");
+
+ for (const auto& element : column_decryption_properties) {
+ if (element.second->is_utilized()) {
+ throw ParquetException("Column properties utilized in another file");
+ }
+ element.second->set_utilized();
+ }
+
+ column_decryption_properties_ = column_decryption_properties;
+ return this;
+}
+
+void FileDecryptionProperties::WipeOutDecryptionKeys() {
+ footer_key_.clear();
+
+ for (const auto& element : column_decryption_properties_) {
+ element.second->WipeOutDecryptionKey();
+ }
+}
+
+bool FileDecryptionProperties::is_utilized() {
+ if (footer_key_.empty() && column_decryption_properties_.size() == 0 &&
+ aad_prefix_.empty())
+ return false;
+
+ return utilized_;
+}
+
+std::shared_ptr<FileDecryptionProperties> FileDecryptionProperties::DeepClone(
+ std::string new_aad_prefix) {
+ std::string footer_key_copy = footer_key_;
+ ColumnPathToDecryptionPropertiesMap column_decryption_properties_map_copy;
+
+ for (const auto& element : column_decryption_properties_) {
+ column_decryption_properties_map_copy.insert(
+ {element.second->column_path(), element.second->DeepClone()});
+ }
+
+ if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_;
+ return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
+ footer_key_copy, key_retriever_, check_plaintext_footer_integrity_, new_aad_prefix,
+ aad_prefix_verifier_, column_decryption_properties_map_copy,
+ plaintext_files_allowed_));
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::footer_key(
+ const std::string footer_key) {
+ if (footer_key.empty()) {
+ return this;
+ }
+ DCHECK(footer_key_.empty());
+ footer_key_ = footer_key;
+ return this;
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::key_retriever(
+ const std::shared_ptr<DecryptionKeyRetriever>& key_retriever) {
+ if (key_retriever == nullptr) return this;
+
+ DCHECK(key_retriever_ == nullptr);
+ key_retriever_ = key_retriever;
+ return this;
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix(
+ const std::string& aad_prefix) {
+ if (aad_prefix.empty()) {
+ return this;
+ }
+ DCHECK(aad_prefix_.empty());
+ aad_prefix_ = aad_prefix;
+ return this;
+}
+
+FileDecryptionProperties::Builder* FileDecryptionProperties::Builder::aad_prefix_verifier(
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier) {
+ if (aad_prefix_verifier == nullptr) return this;
+
+ DCHECK(aad_prefix_verifier_ == nullptr);
+ aad_prefix_verifier_ = std::move(aad_prefix_verifier);
+ return this;
+}
+
+ColumnDecryptionProperties::Builder* ColumnDecryptionProperties::Builder::key(
+ const std::string& key) {
+ if (key.empty()) return this;
+
+ DCHECK(key_.empty());
+ key_ = key;
+ return this;
+}
+
+std::shared_ptr<ColumnDecryptionProperties> ColumnDecryptionProperties::Builder::build() {
+ return std::shared_ptr<ColumnDecryptionProperties>(
+ new ColumnDecryptionProperties(column_path_, key_));
+}
+
+void ColumnDecryptionProperties::WipeOutDecryptionKey() { key_.clear(); }
+
+std::shared_ptr<ColumnDecryptionProperties> ColumnDecryptionProperties::DeepClone() {
+ std::string key_copy = key_;
+ return std::shared_ptr<ColumnDecryptionProperties>(
+ new ColumnDecryptionProperties(column_path_, key_copy));
+}
+
+FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_metadata(
+ const std::string& footer_key_metadata) {
+ if (footer_key_metadata.empty()) return this;
+
+ DCHECK(footer_key_metadata_.empty());
+ footer_key_metadata_ = footer_key_metadata;
+ return this;
+}
+
+FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::encrypted_columns(
+ const ColumnPathToEncryptionPropertiesMap& encrypted_columns) {
+ if (encrypted_columns.size() == 0) return this;
+
+ if (encrypted_columns_.size() != 0)
+ throw ParquetException("Column properties already set");
+
+ for (const auto& element : encrypted_columns) {
+ if (element.second->is_utilized()) {
+ throw ParquetException("Column properties utilized in another file");
+ }
+ element.second->set_utilized();
+ }
+ encrypted_columns_ = encrypted_columns;
+ return this;
+}
+
+void FileEncryptionProperties::WipeOutEncryptionKeys() {
+ footer_key_.clear();
+ for (const auto& element : encrypted_columns_) {
+ element.second->WipeOutEncryptionKey();
+ }
+}
+
+std::shared_ptr<FileEncryptionProperties> FileEncryptionProperties::DeepClone(
+ std::string new_aad_prefix) {
+ std::string footer_key_copy = footer_key_;
+ ColumnPathToEncryptionPropertiesMap encrypted_columns_map_copy;
+
+ for (const auto& element : encrypted_columns_) {
+ encrypted_columns_map_copy.insert(
+ {element.second->column_path(), element.second->DeepClone()});
+ }
+
+ if (new_aad_prefix.empty()) new_aad_prefix = aad_prefix_;
+ return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
+ algorithm_.algorithm, footer_key_copy, footer_key_metadata_, encrypted_footer_,
+ new_aad_prefix, store_aad_prefix_in_file_, encrypted_columns_map_copy));
+}
+
+FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::aad_prefix(
+ const std::string& aad_prefix) {
+ if (aad_prefix.empty()) return this;
+
+ DCHECK(aad_prefix_.empty());
+ aad_prefix_ = aad_prefix;
+ store_aad_prefix_in_file_ = true;
+ return this;
+}
+
+FileEncryptionProperties::Builder*
+FileEncryptionProperties::Builder::disable_aad_prefix_storage() {
+ DCHECK(!aad_prefix_.empty());
+
+ store_aad_prefix_in_file_ = false;
+ return this;
+}
+
+ColumnEncryptionProperties::ColumnEncryptionProperties(bool encrypted,
+ const std::string& column_path,
+ const std::string& key,
+ const std::string& key_metadata)
+ : column_path_(column_path) {
+ // column encryption properties object (with a column key) can be used for writing only
+ // one file.
+ // Upon completion of file writing, the encryption keys in the properties will be wiped
+ // out (set to 0 in memory).
+ utilized_ = false;
+
+ DCHECK(!column_path.empty());
+ if (!encrypted) {
+ DCHECK(key.empty() && key_metadata.empty());
+ }
+
+ if (!key.empty()) {
+ DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
+ }
+
+ encrypted_with_footer_key_ = (encrypted && key.empty());
+ if (encrypted_with_footer_key_) {
+ DCHECK(key_metadata.empty());
+ }
+
+ encrypted_ = encrypted;
+ key_metadata_ = key_metadata;
+ key_ = key;
+}
+
+ColumnDecryptionProperties::ColumnDecryptionProperties(const std::string& column_path,
+ const std::string& key)
+ : column_path_(column_path) {
+ utilized_ = false;
+ DCHECK(!column_path.empty());
+
+ if (!key.empty()) {
+ DCHECK(key.length() == 16 || key.length() == 24 || key.length() == 32);
+ }
+
+ key_ = key;
+}
+
+std::string FileDecryptionProperties::column_key(const std::string& column_path) const {
+ if (column_decryption_properties_.find(column_path) !=
+ column_decryption_properties_.end()) {
+ auto column_prop = column_decryption_properties_.at(column_path);
+ if (column_prop != nullptr) {
+ return column_prop->key();
+ }
+ }
+ return empty_string_;
+}
+
+FileDecryptionProperties::FileDecryptionProperties(
+ const std::string& footer_key, std::shared_ptr<DecryptionKeyRetriever> key_retriever,
+ bool check_plaintext_footer_integrity, const std::string& aad_prefix,
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
+ const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
+ bool plaintext_files_allowed) {
+ DCHECK(!footer_key.empty() || nullptr != key_retriever ||
+ 0 != column_decryption_properties.size());
+
+ if (!footer_key.empty()) {
+ DCHECK(footer_key.length() == 16 || footer_key.length() == 24 ||
+ footer_key.length() == 32);
+ }
+ if (footer_key.empty() && check_plaintext_footer_integrity) {
+ DCHECK(nullptr != key_retriever);
+ }
+ aad_prefix_verifier_ = std::move(aad_prefix_verifier);
+ footer_key_ = footer_key;
+ check_plaintext_footer_integrity_ = check_plaintext_footer_integrity;
+ key_retriever_ = std::move(key_retriever);
+ aad_prefix_ = aad_prefix;
+ column_decryption_properties_ = column_decryption_properties;
+ plaintext_files_allowed_ = plaintext_files_allowed;
+ utilized_ = false;
+}
+
+FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::footer_key_id(
+ const std::string& key_id) {
+ // key_id is expected to be in UTF8 encoding
+ ::arrow::util::InitializeUTF8();
+ const uint8_t* data = reinterpret_cast<const uint8_t*>(key_id.c_str());
+ if (!::arrow::util::ValidateUTF8(data, key_id.size())) {
+ throw ParquetException("footer key id should be in UTF8 encoding");
+ }
+
+ if (key_id.empty()) {
+ return this;
+ }
+
+ return footer_key_metadata(key_id);
+}
+
+std::shared_ptr<ColumnEncryptionProperties>
+FileEncryptionProperties::column_encryption_properties(const std::string& column_path) {
+ if (encrypted_columns_.size() == 0) {
+ auto builder = std::make_shared<ColumnEncryptionProperties::Builder>(column_path);
+ return builder->build();
+ }
+ if (encrypted_columns_.find(column_path) != encrypted_columns_.end()) {
+ return encrypted_columns_[column_path];
+ }
+
+ return nullptr;
+}
+
+FileEncryptionProperties::FileEncryptionProperties(
+ ParquetCipher::type cipher, const std::string& footer_key,
+ const std::string& footer_key_metadata, bool encrypted_footer,
+ const std::string& aad_prefix, bool store_aad_prefix_in_file,
+ const ColumnPathToEncryptionPropertiesMap& encrypted_columns)
+ : footer_key_(footer_key),
+ footer_key_metadata_(footer_key_metadata),
+ encrypted_footer_(encrypted_footer),
+ aad_prefix_(aad_prefix),
+ store_aad_prefix_in_file_(store_aad_prefix_in_file),
+ encrypted_columns_(encrypted_columns) {
+ // file encryption properties object can be used for writing only one file.
+ // Upon completion of file writing, the encryption keys in the properties will be wiped
+ // out (set to 0 in memory).
+ utilized_ = false;
+
+ DCHECK(!footer_key.empty());
+ // footer_key must be either 16, 24 or 32 bytes.
+ DCHECK(footer_key.length() == 16 || footer_key.length() == 24 ||
+ footer_key.length() == 32);
+
+ uint8_t aad_file_unique[kAadFileUniqueLength];
+ memset(aad_file_unique, 0, kAadFileUniqueLength);
+ encryption::RandBytes(aad_file_unique, kAadFileUniqueLength);
+ std::string aad_file_unique_str(reinterpret_cast<char const*>(aad_file_unique),
+ kAadFileUniqueLength);
+
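+ // If an AAD prefix is used but not stored in the file, readers must supply it explicitly.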
+ bool supply_aad_prefix = false;
+ if (aad_prefix.empty()) {
+ file_aad_ = aad_file_unique_str;
+ } else {
+ file_aad_ = aad_prefix + aad_file_unique_str;
+ if (!store_aad_prefix_in_file) supply_aad_prefix = true;
+ }
+ algorithm_.algorithm = cipher;
+ algorithm_.aad.aad_file_unique = aad_file_unique_str;
+ algorithm_.aad.supply_aad_prefix = supply_aad_prefix;
+ if (!aad_prefix.empty() && store_aad_prefix_in_file) {
+ algorithm_.aad.aad_prefix = aad_prefix;
+ }
+}
+
+} // namespace parquet
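
For context, a minimal usage sketch of how the builders above are typically combined (not part of this diff; the key bytes, the key ids "kf"/"kc1", the column name "ss.links" and the helper function names are illustrative placeholders only):

    #include <memory>
    #include <string>

    #include "parquet/encryption/encryption.h"

    std::shared_ptr<parquet::FileEncryptionProperties> SampleEncryptionProperties() {
      const std::string footer_key = "0123456789012345";  // 16-byte AES key
      const std::string column_key = "1234567890123450";  // 16-byte AES key

      // Once an explicit column list is set, columns not in the map stay unencrypted.
      parquet::ColumnPathToEncryptionPropertiesMap encrypted_columns;
      auto column_props = parquet::ColumnEncryptionProperties::Builder("ss.links")
                              .key(column_key)
                              ->key_id("kc1")
                              ->build();
      encrypted_columns[column_props->column_path()] = column_props;

      return parquet::FileEncryptionProperties::Builder(footer_key)
          .footer_key_id("kf")
          ->encrypted_columns(encrypted_columns)
          ->build();
    }

    std::shared_ptr<parquet::FileDecryptionProperties> SampleDecryptionProperties() {
      // The retriever maps key ids stored in the file metadata back to key bytes.
      auto key_retriever = std::make_shared<parquet::StringKeyIdRetriever>();
      key_retriever->PutKey("kf", "0123456789012345");
      key_retriever->PutKey("kc1", "1234567890123450");

      return parquet::FileDecryptionProperties::Builder()
          .key_retriever(key_retriever)
          ->build();
    }
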
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h
index 840c669e6bd..8fd7ec8d3d0 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption.h
@@ -1,510 +1,510 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <utility>
-
-#include "parquet/exception.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
- ParquetCipher::AES_GCM_V1;
-static constexpr int32_t kMaximalAadMetadataLength = 256;
-static constexpr bool kDefaultEncryptedFooter = true;
-static constexpr bool kDefaultCheckSignature = true;
-static constexpr bool kDefaultAllowPlaintextFiles = false;
-static constexpr int32_t kAadFileUniqueLength = 8;
-
-class ColumnDecryptionProperties;
-using ColumnPathToDecryptionPropertiesMap =
- std::map<std::string, std::shared_ptr<ColumnDecryptionProperties>>;
-
-class ColumnEncryptionProperties;
-using ColumnPathToEncryptionPropertiesMap =
- std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>>;
-
-class PARQUET_EXPORT DecryptionKeyRetriever {
- public:
- virtual std::string GetKey(const std::string& key_metadata) = 0;
- virtual ~DecryptionKeyRetriever() {}
-};
-
-/// Simple integer key retriever
-class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever {
- public:
- void PutKey(uint32_t key_id, const std::string& key);
- std::string GetKey(const std::string& key_metadata) override;
-
- private:
- std::map<uint32_t, std::string> key_map_;
-};
-
-// Simple string key retriever
-class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever {
- public:
- void PutKey(const std::string& key_id, const std::string& key);
- std::string GetKey(const std::string& key_metadata) override;
-
- private:
- std::map<std::string, std::string> key_map_;
-};
-
-class PARQUET_EXPORT HiddenColumnException : public ParquetException {
- public:
- explicit HiddenColumnException(const std::string& columnPath)
- : ParquetException(columnPath.c_str()) {}
-};
-
-class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException {
- public:
- explicit KeyAccessDeniedException(const std::string& columnPath)
- : ParquetException(columnPath.c_str()) {}
-};
-
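-// Returns a pointer into str's internal buffer; valid only while str is alive.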
-inline const uint8_t* str2bytes(const std::string& str) {
- if (str.empty()) return NULLPTR;
-
- char* cbytes = const_cast<char*>(str.c_str());
- return reinterpret_cast<const uint8_t*>(cbytes);
-}
-
-class PARQUET_EXPORT ColumnEncryptionProperties {
- public:
- class PARQUET_EXPORT Builder {
- public:
- /// Convenience builder for encrypted columns.
- explicit Builder(const std::string& name) : Builder(name, true) {}
-
- /// Convenience builder for encrypted columns.
- explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
- : Builder(path->ToDotString(), true) {}
-
- /// Set a column-specific key.
- /// If key is not set on an encrypted column, the column will
- /// be encrypted with the footer key.
- /// The key length must be either 16, 24 or 32 bytes.
- /// The key is cloned, and will be wiped out (array values set to 0) upon completion
- /// of file writing.
- /// Caller is responsible for wiping out the input key array.
- Builder* key(std::string column_key);
-
- /// Set key retrieval metadata.
- /// Use either key_metadata() or key_id(), not both.
- Builder* key_metadata(const std::string& key_metadata);
-
- /// A convenience function to set key metadata using a string id.
- /// The key_id will be converted to key retrieval metadata (UTF-8 array).
- /// Use either key_metadata() or key_id(), not both.
- Builder* key_id(const std::string& key_id);
-
- std::shared_ptr<ColumnEncryptionProperties> build() {
- return std::shared_ptr<ColumnEncryptionProperties>(
- new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_));
- }
-
- private:
- const std::string column_path_;
- bool encrypted_;
- std::string key_;
- std::string key_metadata_;
-
- Builder(const std::string path, bool encrypted)
- : column_path_(path), encrypted_(encrypted) {}
- };
-
- std::string column_path() const { return column_path_; }
- bool is_encrypted() const { return encrypted_; }
- bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; }
- std::string key() const { return key_; }
- std::string key_metadata() const { return key_metadata_; }
-
- /// Upon completion of file writing, the encryption key
- /// will be wiped out.
- void WipeOutEncryptionKey() { key_.clear(); }
-
- bool is_utilized() {
- if (key_.empty())
- return false; // can re-use column properties without encryption keys
- return utilized_;
- }
-
- /// ColumnEncryptionProperties object can be used for writing one file only.
- /// Mark ColumnEncryptionProperties as utilized once it is used in
- /// FileEncryptionProperties as the encryption key will be wiped out upon
- /// completion of file writing.
- void set_utilized() { utilized_ = true; }
-
- std::shared_ptr<ColumnEncryptionProperties> DeepClone() {
- std::string key_copy = key_;
- return std::shared_ptr<ColumnEncryptionProperties>(new ColumnEncryptionProperties(
- encrypted_, column_path_, key_copy, key_metadata_));
- }
-
- ColumnEncryptionProperties() = default;
- ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default;
- ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default;
-
- private:
- const std::string column_path_;
- bool encrypted_;
- bool encrypted_with_footer_key_;
- std::string key_;
- std::string key_metadata_;
- bool utilized_;
- explicit ColumnEncryptionProperties(bool encrypted, const std::string& column_path,
- const std::string& key,
- const std::string& key_metadata);
-};
-
-class PARQUET_EXPORT ColumnDecryptionProperties {
- public:
- class PARQUET_EXPORT Builder {
- public:
- explicit Builder(const std::string& name) : column_path_(name) {}
-
- explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
- : Builder(path->ToDotString()) {}
-
- /// Set an explicit column key. If applied to a file that contains
- /// key metadata for this column, the metadata will be ignored and
- /// the column will be decrypted with this key.
- /// The key length must be either 16, 24 or 32 bytes.
- Builder* key(const std::string& key);
-
- std::shared_ptr<ColumnDecryptionProperties> build();
-
- private:
- const std::string column_path_;
- std::string key_;
- };
-
- ColumnDecryptionProperties() = default;
- ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default;
- ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default;
-
- std::string column_path() const { return column_path_; }
- std::string key() const { return key_; }
- bool is_utilized() { return utilized_; }
-
- /// ColumnDecryptionProperties object can be used for reading one file only.
- /// Mark ColumnDecryptionProperties as utilized once it is used in
- /// FileDecryptionProperties as the encryption key will be wiped out upon
- /// completion of file reading.
- void set_utilized() { utilized_ = true; }
-
- /// Upon completion of file reading, the encryption key
- /// will be wiped out.
- void WipeOutDecryptionKey();
-
- std::shared_ptr<ColumnDecryptionProperties> DeepClone();
-
- private:
- const std::string column_path_;
- std::string key_;
- bool utilized_;
-
- /// This class is only required for setting explicit column decryption keys -
- /// to override key retriever (or to provide keys when key metadata and/or
- /// key retriever are not available)
- explicit ColumnDecryptionProperties(const std::string& column_path,
- const std::string& key);
-};
-
-class PARQUET_EXPORT AADPrefixVerifier {
- public:
- /// Verifies the identity (AAD Prefix) of an individual file,
- /// or of a file collection in a data set.
- /// Throws an exception if an AAD prefix is wrong.
- /// In a data set, AAD Prefixes should be collected,
- /// and then checked for missing files.
- virtual void Verify(const std::string& aad_prefix) = 0;
- virtual ~AADPrefixVerifier() {}
-};
-
-class PARQUET_EXPORT FileDecryptionProperties {
- public:
- class PARQUET_EXPORT Builder {
- public:
- Builder() {
- check_plaintext_footer_integrity_ = kDefaultCheckSignature;
- plaintext_files_allowed_ = kDefaultAllowPlaintextFiles;
- }
-
- /// Set an explicit footer key. If applied to a file that contains
- /// footer key metadata, the metadata will be ignored and the footer
- /// will be decrypted/verified with this key.
- /// If explicit key is not set, footer key will be fetched from
- /// key retriever.
- /// With explicit keys or AAD prefix, new encryption properties object must be
- /// created for each encrypted file.
- /// Explicit encryption keys (footer and column) are cloned.
- /// Upon completion of file reading, the cloned encryption keys in the properties
- /// will be wiped out (array values set to 0).
- /// Caller is responsible for wiping out the input key array.
- /// The footer key length must be either 16, 24 or 32 bytes.
- Builder* footer_key(const std::string footer_key);
-
- /// Set explicit column keys (decryption properties).
- /// It's also possible to set a key retriever on this property object.
- /// Upon file decryption, availability of explicit keys is checked before
- /// invocation of the retriever callback.
- /// If an explicit key is available for a footer or a column,
- /// its key metadata will be ignored.
- Builder* column_keys(
- const ColumnPathToDecryptionPropertiesMap& column_decryption_properties);
-
- /// Set a key retriever callback. It's also possible to
- /// set explicit footer or column keys on this file property object.
- /// Upon file decryption, availability of explicit keys is checked before
- /// invocation of the retriever callback.
- /// If an explicit key is available for a footer or a column,
- /// its key metadata will be ignored.
- Builder* key_retriever(const std::shared_ptr<DecryptionKeyRetriever>& key_retriever);
-
- /// Skip integrity verification of plaintext footers.
- /// If not called, integrity of plaintext footers will be checked at runtime,
- /// and an exception will be thrown in the following situations:
- /// - footer signing key is not available
- /// (not passed, or not found by key retriever)
- /// - footer content and signature don't match
- Builder* disable_footer_signature_verification() {
- check_plaintext_footer_integrity_ = false;
- return this;
- }
-
- /// Explicitly supply the file AAD prefix.
- /// Required when a prefix is used for file encryption but not stored in the file.
- /// If AAD prefix is stored in file, it will be compared to the explicitly
- /// supplied value and an exception will be thrown if they differ.
- Builder* aad_prefix(const std::string& aad_prefix);
-
- /// Set callback for verification of AAD Prefixes stored in file.
- Builder* aad_prefix_verifier(std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier);
-
- /// By default, reading plaintext (unencrypted) files is not
- /// allowed when using a decryptor,
- /// in order to detect files that were not encrypted by mistake.
- /// However, the default behavior can be overridden by calling this method.
- /// The caller should then use a different method to ensure encryption
- /// of files with sensitive data.
- Builder* plaintext_files_allowed() {
- plaintext_files_allowed_ = true;
- return this;
- }
-
- std::shared_ptr<FileDecryptionProperties> build() {
- return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
- footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_,
- aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_));
- }
-
- private:
- std::string footer_key_;
- std::string aad_prefix_;
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
- ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
-
- std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
- bool check_plaintext_footer_integrity_;
- bool plaintext_files_allowed_;
- };
-
- std::string column_key(const std::string& column_path) const;
-
- std::string footer_key() const { return footer_key_; }
-
- std::string aad_prefix() const { return aad_prefix_; }
-
- const std::shared_ptr<DecryptionKeyRetriever>& key_retriever() const {
- return key_retriever_;
- }
-
- bool check_plaintext_footer_integrity() const {
- return check_plaintext_footer_integrity_;
- }
-
- bool plaintext_files_allowed() const { return plaintext_files_allowed_; }
-
- const std::shared_ptr<AADPrefixVerifier>& aad_prefix_verifier() const {
- return aad_prefix_verifier_;
- }
-
- /// Upon completion of file reading, the encryption keys in the properties
- /// will be wiped out (array values set to 0).
- void WipeOutDecryptionKeys();
-
- bool is_utilized();
-
- /// FileDecryptionProperties object can be used for reading one file only.
- /// Mark FileDecryptionProperties as utilized once it is used to read a file as the
- /// encryption keys will be wiped out upon completion of file reading.
- void set_utilized() { utilized_ = true; }
-
- /// FileDecryptionProperties object can be used for reading one file only.
- /// (unless this object keeps the keyRetrieval callback only, and no explicit
- /// keys or aadPrefix).
- /// At the end, keys are wiped out in the memory.
- /// This method allows cloning identical properties for another file,
- /// with an option to update the aadPrefix (if newAadPrefix is empty,
- /// aadPrefix will be cloned too).
- std::shared_ptr<FileDecryptionProperties> DeepClone(std::string new_aad_prefix = "");
-
- private:
- std::string footer_key_;
- std::string aad_prefix_;
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
-
- const std::string empty_string_ = "";
- ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
-
- std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
- bool check_plaintext_footer_integrity_;
- bool plaintext_files_allowed_;
- bool utilized_;
-
- FileDecryptionProperties(
- const std::string& footer_key,
- std::shared_ptr<DecryptionKeyRetriever> key_retriever,
- bool check_plaintext_footer_integrity, const std::string& aad_prefix,
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
- const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
- bool plaintext_files_allowed);
-};
-
-class PARQUET_EXPORT FileEncryptionProperties {
- public:
- class PARQUET_EXPORT Builder {
- public:
- explicit Builder(const std::string& footer_key)
- : parquet_cipher_(kDefaultEncryptionAlgorithm),
- encrypted_footer_(kDefaultEncryptedFooter) {
- footer_key_ = footer_key;
- store_aad_prefix_in_file_ = false;
- }
-
- /// Create files with plaintext footer.
- /// If not called, the files will be created with encrypted footer (default).
- Builder* set_plaintext_footer() {
- encrypted_footer_ = false;
- return this;
- }
-
- /// Set encryption algorithm.
- /// If not called, files will be encrypted with AES_GCM_V1 (default).
- Builder* algorithm(ParquetCipher::type parquet_cipher) {
- parquet_cipher_ = parquet_cipher;
- return this;
- }
-
- /// Set key retrieval metadata (converted from a string id).
- /// Use either footer_key_metadata or footer_key_id, not both.
- Builder* footer_key_id(const std::string& key_id);
-
- /// Set key retrieval metadata.
- /// Use either footer_key_metadata or footer_key_id, not both.
- Builder* footer_key_metadata(const std::string& footer_key_metadata);
-
- /// Set the file AAD Prefix.
- Builder* aad_prefix(const std::string& aad_prefix);
-
- /// Skip storing AAD Prefix in file.
- /// If not called, and if AAD Prefix is set, it will be stored.
- Builder* disable_aad_prefix_storage();
-
- /// Set the list of encrypted columns and their properties (keys etc).
- /// If not called, all columns will be encrypted with the footer key.
- /// If called, the file columns not in the list will be left unencrypted.
- Builder* encrypted_columns(
- const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
-
- std::shared_ptr<FileEncryptionProperties> build() {
- return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
- parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_,
- aad_prefix_, store_aad_prefix_in_file_, encrypted_columns_));
- }
-
- private:
- ParquetCipher::type parquet_cipher_;
- bool encrypted_footer_;
- std::string footer_key_;
- std::string footer_key_metadata_;
-
- std::string aad_prefix_;
- bool store_aad_prefix_in_file_;
- ColumnPathToEncryptionPropertiesMap encrypted_columns_;
- };
- bool encrypted_footer() const { return encrypted_footer_; }
-
- EncryptionAlgorithm algorithm() const { return algorithm_; }
-
- std::string footer_key() const { return footer_key_; }
-
- std::string footer_key_metadata() const { return footer_key_metadata_; }
-
- std::string file_aad() const { return file_aad_; }
-
- std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
- const std::string& column_path);
-
- bool is_utilized() const { return utilized_; }
-
- /// FileEncryptionProperties object can be used for writing one file only.
- /// Mark FileEncryptionProperties as utilized once it is used to write a file as the
- /// encryption keys will be wiped out upon completion of file writing.
- void set_utilized() { utilized_ = true; }
-
- /// Upon completion of file writing, the encryption keys
- /// will be wiped out (array values set to 0).
- void WipeOutEncryptionKeys();
-
- /// FileEncryptionProperties object can be used for writing one file only.
- /// (at the end, keys are wiped out in the memory).
- /// This method allows cloning identical properties for another file,
- /// with an option to update the aadPrefix (if newAadPrefix is empty,
- /// aadPrefix will be cloned too).
- std::shared_ptr<FileEncryptionProperties> DeepClone(std::string new_aad_prefix = "");
-
- ColumnPathToEncryptionPropertiesMap encrypted_columns() const {
- return encrypted_columns_;
- }
-
- private:
- EncryptionAlgorithm algorithm_;
- std::string footer_key_;
- std::string footer_key_metadata_;
- bool encrypted_footer_;
- std::string file_aad_;
- std::string aad_prefix_;
- bool utilized_;
- bool store_aad_prefix_in_file_;
- ColumnPathToEncryptionPropertiesMap encrypted_columns_;
-
- FileEncryptionProperties(ParquetCipher::type cipher, const std::string& footer_key,
- const std::string& footer_key_metadata, bool encrypted_footer,
- const std::string& aad_prefix, bool store_aad_prefix_in_file,
- const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "parquet/exception.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
+ ParquetCipher::AES_GCM_V1;
+static constexpr int32_t kMaximalAadMetadataLength = 256;
+static constexpr bool kDefaultEncryptedFooter = true;
+static constexpr bool kDefaultCheckSignature = true;
+static constexpr bool kDefaultAllowPlaintextFiles = false;
+static constexpr int32_t kAadFileUniqueLength = 8;
+
+class ColumnDecryptionProperties;
+using ColumnPathToDecryptionPropertiesMap =
+ std::map<std::string, std::shared_ptr<ColumnDecryptionProperties>>;
+
+class ColumnEncryptionProperties;
+using ColumnPathToEncryptionPropertiesMap =
+ std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>>;
+
+class PARQUET_EXPORT DecryptionKeyRetriever {
+ public:
+ virtual std::string GetKey(const std::string& key_metadata) = 0;
+ virtual ~DecryptionKeyRetriever() {}
+};
+
+/// Simple integer key retriever
+class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever {
+ public:
+ void PutKey(uint32_t key_id, const std::string& key);
+ std::string GetKey(const std::string& key_metadata) override;
+
+ private:
+ std::map<uint32_t, std::string> key_map_;
+};
+
+// Simple string key retriever
+class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever {
+ public:
+ void PutKey(const std::string& key_id, const std::string& key);
+ std::string GetKey(const std::string& key_metadata) override;
+
+ private:
+ std::map<std::string, std::string> key_map_;
+};
+
+class PARQUET_EXPORT HiddenColumnException : public ParquetException {
+ public:
+ explicit HiddenColumnException(const std::string& columnPath)
+ : ParquetException(columnPath.c_str()) {}
+};
+
+class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException {
+ public:
+ explicit KeyAccessDeniedException(const std::string& columnPath)
+ : ParquetException(columnPath.c_str()) {}
+};
+
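+// Returns a pointer into str's internal buffer; valid only while str is alive.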
+inline const uint8_t* str2bytes(const std::string& str) {
+ if (str.empty()) return NULLPTR;
+
+ char* cbytes = const_cast<char*>(str.c_str());
+ return reinterpret_cast<const uint8_t*>(cbytes);
+}
+
+class PARQUET_EXPORT ColumnEncryptionProperties {
+ public:
+ class PARQUET_EXPORT Builder {
+ public:
+ /// Convenience builder for encrypted columns.
+ explicit Builder(const std::string& name) : Builder(name, true) {}
+
+ /// Convenience builder for encrypted columns.
+ explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
+ : Builder(path->ToDotString(), true) {}
+
+ /// Set a column-specific key.
+ /// If key is not set on an encrypted column, the column will
+ /// be encrypted with the footer key.
+ /// The key length must be either 16, 24 or 32 bytes.
+ /// The key is cloned, and will be wiped out (array values set to 0) upon completion
+ /// of file writing.
+ /// Caller is responsible for wiping out the input key array.
+ Builder* key(std::string column_key);
+
+ /// Set key retrieval metadata.
+ /// Use either key_metadata() or key_id(), not both.
+ Builder* key_metadata(const std::string& key_metadata);
+
+ /// A convenience function to set key metadata using a string id.
+ /// The key_id will be converted to key retrieval metadata (UTF-8 array).
+ /// Use either key_metadata() or key_id(), not both.
+ Builder* key_id(const std::string& key_id);
+
+ std::shared_ptr<ColumnEncryptionProperties> build() {
+ return std::shared_ptr<ColumnEncryptionProperties>(
+ new ColumnEncryptionProperties(encrypted_, column_path_, key_, key_metadata_));
+ }
+
+ private:
+ const std::string column_path_;
+ bool encrypted_;
+ std::string key_;
+ std::string key_metadata_;
+
+ Builder(const std::string path, bool encrypted)
+ : column_path_(path), encrypted_(encrypted) {}
+ };
+
+ std::string column_path() const { return column_path_; }
+ bool is_encrypted() const { return encrypted_; }
+ bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; }
+ std::string key() const { return key_; }
+ std::string key_metadata() const { return key_metadata_; }
+
+ /// Upon completion of file writing, the encryption key
+ /// will be wiped out.
+ void WipeOutEncryptionKey() { key_.clear(); }
+
+ bool is_utilized() {
+ if (key_.empty())
+ return false; // can re-use column properties without encryption keys
+ return utilized_;
+ }
+
+ /// ColumnEncryptionProperties object can be used for writing one file only.
+ /// Mark ColumnEncryptionProperties as utilized once it is used in
+ /// FileEncryptionProperties as the encryption key will be wiped out upon
+ /// completion of file writing.
+ void set_utilized() { utilized_ = true; }
+
+ std::shared_ptr<ColumnEncryptionProperties> DeepClone() {
+ std::string key_copy = key_;
+ return std::shared_ptr<ColumnEncryptionProperties>(new ColumnEncryptionProperties(
+ encrypted_, column_path_, key_copy, key_metadata_));
+ }
+
+ ColumnEncryptionProperties() = default;
+ ColumnEncryptionProperties(const ColumnEncryptionProperties& other) = default;
+ ColumnEncryptionProperties(ColumnEncryptionProperties&& other) = default;
+
+ private:
+ const std::string column_path_;
+ bool encrypted_;
+ bool encrypted_with_footer_key_;
+ std::string key_;
+ std::string key_metadata_;
+ bool utilized_;
+ explicit ColumnEncryptionProperties(bool encrypted, const std::string& column_path,
+ const std::string& key,
+ const std::string& key_metadata);
+};
+
+class PARQUET_EXPORT ColumnDecryptionProperties {
+ public:
+ class PARQUET_EXPORT Builder {
+ public:
+ explicit Builder(const std::string& name) : column_path_(name) {}
+
+ explicit Builder(const std::shared_ptr<schema::ColumnPath>& path)
+ : Builder(path->ToDotString()) {}
+
+ /// Set an explicit column key. If applied to a file that contains
+ /// key metadata for this column, the metadata will be ignored and
+ /// the column will be decrypted with this key.
+ /// The key length must be either 16, 24 or 32 bytes.
+ Builder* key(const std::string& key);
+
+ std::shared_ptr<ColumnDecryptionProperties> build();
+
+ private:
+ const std::string column_path_;
+ std::string key_;
+ };
+
+ ColumnDecryptionProperties() = default;
+ ColumnDecryptionProperties(const ColumnDecryptionProperties& other) = default;
+ ColumnDecryptionProperties(ColumnDecryptionProperties&& other) = default;
+
+ std::string column_path() const { return column_path_; }
+ std::string key() const { return key_; }
+ bool is_utilized() { return utilized_; }
+
+ /// ColumnDecryptionProperties object can be used for reading one file only.
+ /// Mark ColumnDecryptionProperties as utilized once it is used in
+ /// FileDecryptionProperties as the encryption key will be wiped out upon
+ /// completion of file reading.
+ void set_utilized() { utilized_ = true; }
+
+ /// Upon completion of file reading, the encryption key
+ /// will be wiped out.
+ void WipeOutDecryptionKey();
+
+ std::shared_ptr<ColumnDecryptionProperties> DeepClone();
+
+ private:
+ const std::string column_path_;
+ std::string key_;
+ bool utilized_;
+
+ /// This class is only required for setting explicit column decryption keys -
+ /// to override key retriever (or to provide keys when key metadata and/or
+ /// key retriever are not available)
+ explicit ColumnDecryptionProperties(const std::string& column_path,
+ const std::string& key);
+};
+
+class PARQUET_EXPORT AADPrefixVerifier {
+ public:
+ /// Verifies the identity (AAD Prefix) of an individual file,
+ /// or of a file collection in a data set.
+ /// Throws an exception if an AAD prefix is wrong.
+ /// In a data set, AAD Prefixes should be collected,
+ /// and then checked for missing files.
+ virtual void Verify(const std::string& aad_prefix) = 0;
+ virtual ~AADPrefixVerifier() {}
+};
+
+class PARQUET_EXPORT FileDecryptionProperties {
+ public:
+ class PARQUET_EXPORT Builder {
+ public:
+ Builder() {
+ check_plaintext_footer_integrity_ = kDefaultCheckSignature;
+ plaintext_files_allowed_ = kDefaultAllowPlaintextFiles;
+ }
+
+ /// Set an explicit footer key. If applied to a file that contains
+ /// footer key metadata, the metadata will be ignored and the footer
+ /// will be decrypted/verified with this key.
+ /// If explicit key is not set, footer key will be fetched from
+ /// key retriever.
+ /// With explicit keys or AAD prefix, new encryption properties object must be
+ /// created for each encrypted file.
+ /// Explicit encryption keys (footer and column) are cloned.
+ /// Upon completion of file reading, the cloned encryption keys in the properties
+ /// will be wiped out (array values set to 0).
+ /// Caller is responsible for wiping out the input key array.
+ /// The footer key length must be either 16, 24 or 32 bytes.
+ Builder* footer_key(const std::string footer_key);
+
+ /// Set explicit column keys (decryption properties).
+ /// It's also possible to set a key retriever on this property object.
+ /// Upon file decryption, availability of explicit keys is checked before
+ /// invocation of the retriever callback.
+ /// If an explicit key is available for a footer or a column,
+ /// its key metadata will be ignored.
+ Builder* column_keys(
+ const ColumnPathToDecryptionPropertiesMap& column_decryption_properties);
+
+ /// Set a key retriever callback. It's also possible to
+ /// set explicit footer or column keys on this file property object.
+ /// Upon file decryption, availability of explicit keys is checked before
+ /// invocation of the retriever callback.
+ /// If an explicit key is available for a footer or a column,
+ /// its key metadata will be ignored.
+ Builder* key_retriever(const std::shared_ptr<DecryptionKeyRetriever>& key_retriever);
+
+ /// Skip integrity verification of plaintext footers.
+ /// If not called, integrity of plaintext footers will be checked at runtime,
+ /// and an exception will be thrown in the following situations:
+ /// - footer signing key is not available
+ /// (not passed, or not found by key retriever)
+ /// - footer content and signature don't match
+ Builder* disable_footer_signature_verification() {
+ check_plaintext_footer_integrity_ = false;
+ return this;
+ }
+
+ /// Explicitly supply the file AAD prefix.
+ /// Required when a prefix is used for file encryption but not stored in the file.
+ /// If AAD prefix is stored in file, it will be compared to the explicitly
+ /// supplied value and an exception will be thrown if they differ.
+ Builder* aad_prefix(const std::string& aad_prefix);
+
+ /// Set callback for verification of AAD Prefixes stored in file.
+ Builder* aad_prefix_verifier(std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier);
+
+ /// By default, reading plaintext (unencrypted) files is not
+ /// allowed when using a decryptor,
+ /// in order to detect files that were not encrypted by mistake.
+ /// However, the default behavior can be overridden by calling this method.
+ /// The caller should then use a different method to ensure encryption
+ /// of files with sensitive data.
+ Builder* plaintext_files_allowed() {
+ plaintext_files_allowed_ = true;
+ return this;
+ }
+
+ std::shared_ptr<FileDecryptionProperties> build() {
+ return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
+ footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_,
+ aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_));
+ }
+
+ private:
+ std::string footer_key_;
+ std::string aad_prefix_;
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
+ ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
+
+ std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
+ bool check_plaintext_footer_integrity_;
+ bool plaintext_files_allowed_;
+ };
+
+ std::string column_key(const std::string& column_path) const;
+
+ std::string footer_key() const { return footer_key_; }
+
+ std::string aad_prefix() const { return aad_prefix_; }
+
+ const std::shared_ptr<DecryptionKeyRetriever>& key_retriever() const {
+ return key_retriever_;
+ }
+
+ bool check_plaintext_footer_integrity() const {
+ return check_plaintext_footer_integrity_;
+ }
+
+ bool plaintext_files_allowed() const { return plaintext_files_allowed_; }
+
+ const std::shared_ptr<AADPrefixVerifier>& aad_prefix_verifier() const {
+ return aad_prefix_verifier_;
+ }
+
+ /// Upon completion of file reading, the encryption keys in the properties
+ /// will be wiped out (array values set to 0).
+ void WipeOutDecryptionKeys();
+
+ bool is_utilized();
+
+ /// FileDecryptionProperties object can be used for reading one file only.
+ /// Mark FileDecryptionProperties as utilized once it is used to read a file as the
+ /// encryption keys will be wiped out upon completion of file reading.
+ void set_utilized() { utilized_ = true; }
+
+ /// FileDecryptionProperties object can be used for reading one file only.
+ /// (unless this object keeps the keyRetrieval callback only, and no explicit
+ /// keys or aadPrefix).
+ /// At the end, keys are wiped out in the memory.
+ /// This method allows cloning identical properties for another file,
+ /// with an option to update the aadPrefix (if newAadPrefix is empty,
+ /// aadPrefix will be cloned too).
+ std::shared_ptr<FileDecryptionProperties> DeepClone(std::string new_aad_prefix = "");
+
+ private:
+ std::string footer_key_;
+ std::string aad_prefix_;
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
+
+ const std::string empty_string_ = "";
+ ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
+
+ std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
+ bool check_plaintext_footer_integrity_;
+ bool plaintext_files_allowed_;
+ bool utilized_;
+
+ FileDecryptionProperties(
+ const std::string& footer_key,
+ std::shared_ptr<DecryptionKeyRetriever> key_retriever,
+ bool check_plaintext_footer_integrity, const std::string& aad_prefix,
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
+ const ColumnPathToDecryptionPropertiesMap& column_decryption_properties,
+ bool plaintext_files_allowed);
+};
+
+class PARQUET_EXPORT FileEncryptionProperties {
+ public:
+ class PARQUET_EXPORT Builder {
+ public:
+ explicit Builder(const std::string& footer_key)
+ : parquet_cipher_(kDefaultEncryptionAlgorithm),
+ encrypted_footer_(kDefaultEncryptedFooter) {
+ footer_key_ = footer_key;
+ store_aad_prefix_in_file_ = false;
+ }
+
+ /// Create files with plaintext footer.
+ /// If not called, the files will be created with encrypted footer (default).
+ Builder* set_plaintext_footer() {
+ encrypted_footer_ = false;
+ return this;
+ }
+
+ /// Set encryption algorithm.
+ /// If not called, files will be encrypted with AES_GCM_V1 (default).
+ Builder* algorithm(ParquetCipher::type parquet_cipher) {
+ parquet_cipher_ = parquet_cipher;
+ return this;
+ }
+
+ /// Set the key retrieval metadata (converted from a string key id).
+ /// Use either footer_key_metadata or footer_key_id, not both.
+ Builder* footer_key_id(const std::string& key_id);
+
+ /// Set the key retrieval metadata.
+ /// Use either footer_key_metadata or footer_key_id, not both.
+ Builder* footer_key_metadata(const std::string& footer_key_metadata);
+
+ /// Set the file AAD Prefix.
+ Builder* aad_prefix(const std::string& aad_prefix);
+
+ /// Skip storing AAD Prefix in file.
+ /// If not called, and if AAD Prefix is set, it will be stored.
+ Builder* disable_aad_prefix_storage();
+
+ /// Set the list of encrypted columns and their properties (keys etc).
+ /// If not called, all columns will be encrypted with the footer key.
+ /// If called, the file columns not in the list will be left unencrypted.
+ Builder* encrypted_columns(
+ const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
+
+ std::shared_ptr<FileEncryptionProperties> build() {
+ return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
+ parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_,
+ aad_prefix_, store_aad_prefix_in_file_, encrypted_columns_));
+ }
+
+ private:
+ ParquetCipher::type parquet_cipher_;
+ bool encrypted_footer_;
+ std::string footer_key_;
+ std::string footer_key_metadata_;
+
+ std::string aad_prefix_;
+ bool store_aad_prefix_in_file_;
+ ColumnPathToEncryptionPropertiesMap encrypted_columns_;
+ };
+ bool encrypted_footer() const { return encrypted_footer_; }
+
+ EncryptionAlgorithm algorithm() const { return algorithm_; }
+
+ std::string footer_key() const { return footer_key_; }
+
+ std::string footer_key_metadata() const { return footer_key_metadata_; }
+
+ std::string file_aad() const { return file_aad_; }
+
+ std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
+ const std::string& column_path);
+
+ bool is_utilized() const { return utilized_; }
+
+ /// A FileEncryptionProperties object can be used for writing one file only.
+ /// Mark it as utilized once it has been used to write a file; the encryption
+ /// keys will be wiped out upon completion of file writing.
+ void set_utilized() { utilized_ = true; }
+
+ /// Upon completion of file writing, the encryption keys
+ /// will be wiped out (array values set to 0).
+ void WipeOutEncryptionKeys();
+
+ /// A FileEncryptionProperties object can be used for writing one file only
+ /// (at the end, the keys are wiped from memory).
+ /// This method clones identical properties for another file, with an
+ /// option to update the AAD prefix (if new_aad_prefix is empty, the
+ /// AAD prefix is cloned too).
+ std::shared_ptr<FileEncryptionProperties> DeepClone(std::string new_aad_prefix = "");
+
+ ColumnPathToEncryptionPropertiesMap encrypted_columns() const {
+ return encrypted_columns_;
+ }
+
+ private:
+ EncryptionAlgorithm algorithm_;
+ std::string footer_key_;
+ std::string footer_key_metadata_;
+ bool encrypted_footer_;
+ std::string file_aad_;
+ std::string aad_prefix_;
+ bool utilized_;
+ bool store_aad_prefix_in_file_;
+ ColumnPathToEncryptionPropertiesMap encrypted_columns_;
+
+ FileEncryptionProperties(ParquetCipher::type cipher, const std::string& footer_key,
+ const std::string& footer_key_metadata, bool encrypted_footer,
+ const std::string& aad_prefix, bool store_aad_prefix_in_file,
+ const ColumnPathToEncryptionPropertiesMap& encrypted_columns);
+};
+
+} // namespace parquet
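Taken together, the two builders above form the writer/reader property surface of this header. The sketch below shows one plausible way to wire them up; `MakeEncryptionProps`/`MakeDecryptionProps` are illustrative helpers, the `"example.parquet"` AAD prefix is a placeholder, and the reader-side `footer_key()` setter is assumed from the part of this header that the hunk above does not show.

```cpp
#include <memory>
#include <string>

#include "parquet/encryption/encryption.h"

// Hypothetical helper: footer_key must be 16, 24 or 32 bytes (AES-128/192/256).
std::shared_ptr<parquet::FileEncryptionProperties> MakeEncryptionProps(
    const std::string& footer_key) {
  // encrypted_columns() is not called, so all columns fall back to the
  // footer key, per the Builder documentation above.
  return parquet::FileEncryptionProperties::Builder(footer_key)
      .algorithm(parquet::ParquetCipher::AES_GCM_V1)
      ->aad_prefix("example.parquet")  // placeholder prefix
      ->disable_aad_prefix_storage()   // prefix must then be supplied on read
      ->build();
}

// Hypothetical helper for the reader side. Because the AAD prefix was not
// stored in the file, it has to be supplied explicitly here.
std::shared_ptr<parquet::FileDecryptionProperties> MakeDecryptionProps(
    const std::string& footer_key) {
  return parquet::FileDecryptionProperties::Builder()
      .footer_key(footer_key)  // setter assumed from the unshown hunk
      ->aad_prefix("example.parquet")
      ->build();
}
```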
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h
index 48e64574b43..e50fb9d0b8a 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal.h
@@ -1,116 +1,116 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "parquet/properties.h"
-#include "parquet/types.h"
-
-using parquet::ParquetCipher;
-
-namespace parquet {
-namespace encryption {
-
-constexpr int kGcmTagLength = 16;
-constexpr int kNonceLength = 12;
-
-// Module types
-constexpr int8_t kFooter = 0;
-constexpr int8_t kColumnMetaData = 1;
-constexpr int8_t kDataPage = 2;
-constexpr int8_t kDictionaryPage = 3;
-constexpr int8_t kDataPageHeader = 4;
-constexpr int8_t kDictionaryPageHeader = 5;
-constexpr int8_t kColumnIndex = 6;
-constexpr int8_t kOffsetIndex = 7;
-
-/// Performs AES encryption operations with GCM or CTR ciphers.
-class AesEncryptor {
- public:
- /// Can serve one key length only. Possible values: 16, 24, 32 bytes.
- explicit AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata);
-
- static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata,
- std::vector<AesEncryptor*>* all_encryptors);
-
- ~AesEncryptor();
-
- /// Size difference between plaintext and ciphertext, for this cipher.
- int CiphertextSizeDelta();
-
- /// Encrypts plaintext with the key and aad. Key length is passed only for
- /// validation. If it differs from the value in the constructor, an exception is thrown.
- int Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key,
- int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext);
-
- /// Encrypts plaintext footer, in order to compute footer signature (tag).
- int SignedFooterEncrypt(const uint8_t* footer, int footer_len, const uint8_t* key,
- int key_len, const uint8_t* aad, int aad_len,
- const uint8_t* nonce, uint8_t* encrypted_footer);
-
- void WipeOut();
-
- private:
- // PIMPL Idiom
- class AesEncryptorImpl;
- std::unique_ptr<AesEncryptorImpl> impl_;
-};
-
-/// Performs AES decryption operations with GCM or CTR ciphers.
-class AesDecryptor {
- public:
- /// Can serve one key length only. Possible values: 16, 24, 32 bytes.
- explicit AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata);
-
- static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata,
- std::vector<AesDecryptor*>* all_decryptors);
-
- ~AesDecryptor();
- void WipeOut();
-
- /// Size difference between plaintext and ciphertext, for this cipher.
- int CiphertextSizeDelta();
-
- /// Decrypts ciphertext with the key and aad. Key length is passed only for
- /// validation. If it differs from the value in the constructor, an exception is thrown.
- int Decrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key,
- int key_len, const uint8_t* aad, int aad_len, uint8_t* plaintext);
-
- private:
- // PIMPL Idiom
- class AesDecryptorImpl;
- std::unique_ptr<AesDecryptorImpl> impl_;
-};
-
-std::string CreateModuleAad(const std::string& file_aad, int8_t module_type,
- int16_t row_group_ordinal, int16_t column_ordinal,
- int16_t page_ordinal);
-
-std::string CreateFooterAad(const std::string& aad_prefix_bytes);
-
-// Update last two bytes of page (or page header) module AAD
-void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal);
-
-// Wraps OpenSSL RAND_bytes function
-void RandBytes(unsigned char* buf, int num);
-
-} // namespace encryption
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "parquet/properties.h"
+#include "parquet/types.h"
+
+using parquet::ParquetCipher;
+
+namespace parquet {
+namespace encryption {
+
+constexpr int kGcmTagLength = 16;
+constexpr int kNonceLength = 12;
+
+// Module types
+constexpr int8_t kFooter = 0;
+constexpr int8_t kColumnMetaData = 1;
+constexpr int8_t kDataPage = 2;
+constexpr int8_t kDictionaryPage = 3;
+constexpr int8_t kDataPageHeader = 4;
+constexpr int8_t kDictionaryPageHeader = 5;
+constexpr int8_t kColumnIndex = 6;
+constexpr int8_t kOffsetIndex = 7;
+
+/// Performs AES encryption operations with GCM or CTR ciphers.
+class AesEncryptor {
+ public:
+ /// Can serve one key length only. Possible values: 16, 24, 32 bytes.
+ explicit AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata);
+
+ static AesEncryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata,
+ std::vector<AesEncryptor*>* all_encryptors);
+
+ ~AesEncryptor();
+
+ /// Size difference between plaintext and ciphertext, for this cipher.
+ int CiphertextSizeDelta();
+
+ /// Encrypts plaintext with the key and aad. Key length is passed only for
+ /// validation. If it differs from the value in the constructor, an exception is thrown.
+ int Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key,
+ int key_len, const uint8_t* aad, int aad_len, uint8_t* ciphertext);
+
+ /// Encrypts plaintext footer, in order to compute footer signature (tag).
+ int SignedFooterEncrypt(const uint8_t* footer, int footer_len, const uint8_t* key,
+ int key_len, const uint8_t* aad, int aad_len,
+ const uint8_t* nonce, uint8_t* encrypted_footer);
+
+ void WipeOut();
+
+ private:
+ // PIMPL Idiom
+ class AesEncryptorImpl;
+ std::unique_ptr<AesEncryptorImpl> impl_;
+};
+
+/// Performs AES decryption operations with GCM or CTR ciphers.
+class AesDecryptor {
+ public:
+ /// Can serve one key length only. Possible values: 16, 24, 32 bytes.
+ explicit AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata);
+
+ static AesDecryptor* Make(ParquetCipher::type alg_id, int key_len, bool metadata,
+ std::vector<AesDecryptor*>* all_decryptors);
+
+ ~AesDecryptor();
+ void WipeOut();
+
+ /// Size difference between plaintext and ciphertext, for this cipher.
+ int CiphertextSizeDelta();
+
+ /// Decrypts ciphertext with the key and aad. Key length is passed only for
+ /// validation. If it differs from the value in the constructor, an exception is thrown.
+ int Decrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key,
+ int key_len, const uint8_t* aad, int aad_len, uint8_t* plaintext);
+
+ private:
+ // PIMPL Idiom
+ class AesDecryptorImpl;
+ std::unique_ptr<AesDecryptorImpl> impl_;
+};
+
+std::string CreateModuleAad(const std::string& file_aad, int8_t module_type,
+ int16_t row_group_ordinal, int16_t column_ordinal,
+ int16_t page_ordinal);
+
+std::string CreateFooterAad(const std::string& aad_prefix_bytes);
+
+// Update last two bytes of page (or page header) module AAD
+void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal);
+
+// Wraps OpenSSL RAND_bytes function
+void RandBytes(unsigned char* buf, int num);
+
+} // namespace encryption
+} // namespace parquet
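CreateModuleAad()'s signature above fixes the inputs of the AAD construction but not its byte layout. For orientation, here is a sketch of the layout implied by the Parquet modular-encryption format (file AAD, a one-byte module type, then little-endian two-byte ordinals); the authoritative implementation lives in encryption_internal.cc, which this diff does not include, so treat this as an assumption rather than the library's code.

```cpp
#include <cstdint>
#include <string>

// Illustrative sketch only: the assumed byte layout behind CreateModuleAad().
// QuickUpdatePageAad() then rewrites the trailing two page-ordinal bytes in
// place, which is why it only needs the AAD string and the new ordinal.
std::string SketchModuleAad(const std::string& file_aad, int8_t module_type,
                            int16_t row_group_ordinal, int16_t column_ordinal,
                            int16_t page_ordinal) {
  auto append_le16 = [](std::string* out, int16_t v) {
    out->push_back(static_cast<char>(v & 0xFF));
    out->push_back(static_cast<char>((v >> 8) & 0xFF));
  };
  std::string aad = file_aad;
  aad.push_back(static_cast<char>(module_type));  // e.g. kDataPage
  append_le16(&aad, row_group_ordinal);
  append_le16(&aad, column_ordinal);
  append_le16(&aad, page_ordinal);
  return aad;
}
```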
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc
index fd3c1775d25..7f2edfa1d78 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/encryption_internal_nossl.cc
@@ -1,110 +1,110 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/encryption/encryption_internal.h"
-#include "parquet/exception.h"
-
-namespace parquet {
-namespace encryption {
-
-void ThrowOpenSSLRequiredException() {
- throw ParquetException(
- "Calling encryption method in Arrow/Parquet built without OpenSSL");
-}
-
-class AesEncryptor::AesEncryptorImpl {};
-
-AesEncryptor::~AesEncryptor() {}
-
-int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len,
- const uint8_t* key, int key_len, const uint8_t* aad,
- int aad_len, const uint8_t* nonce,
- uint8_t* encrypted_footer) {
- ThrowOpenSSLRequiredException();
- return -1;
-}
-
-void AesEncryptor::WipeOut() { ThrowOpenSSLRequiredException(); }
-
-int AesEncryptor::CiphertextSizeDelta() {
- ThrowOpenSSLRequiredException();
- return -1;
-}
-
-int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key,
- int key_len, const uint8_t* aad, int aad_len,
- uint8_t* ciphertext) {
- ThrowOpenSSLRequiredException();
- return -1;
-}
-
-AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata) {
- ThrowOpenSSLRequiredException();
-}
-
-class AesDecryptor::AesDecryptorImpl {};
-
-int AesDecryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key,
- int key_len, const uint8_t* aad, int aad_len,
- uint8_t* plaintext) {
- ThrowOpenSSLRequiredException();
- return -1;
-}
-
-void AesDecryptor::WipeOut() { ThrowOpenSSLRequiredException(); }
-
-AesDecryptor::~AesDecryptor() {}
-
-AesEncryptor* AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata,
- std::vector<AesEncryptor*>* all_encryptors) {
- return NULLPTR;
-}
-
-AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata) {
- ThrowOpenSSLRequiredException();
-}
-
-AesDecryptor* AesDecryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata,
- std::vector<AesDecryptor*>* all_decryptors) {
- return NULLPTR;
-}
-
-int AesDecryptor::CiphertextSizeDelta() {
- ThrowOpenSSLRequiredException();
- return -1;
-}
-
-std::string CreateModuleAad(const std::string& file_aad, int8_t module_type,
- int16_t row_group_ordinal, int16_t column_ordinal,
- int16_t page_ordinal) {
- ThrowOpenSSLRequiredException();
- return "";
-}
-
-std::string CreateFooterAad(const std::string& aad_prefix_bytes) {
- ThrowOpenSSLRequiredException();
- return "";
-}
-
-void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal) {
- ThrowOpenSSLRequiredException();
-}
-
-void RandBytes(unsigned char* buf, int num) { ThrowOpenSSLRequiredException(); }
-
-} // namespace encryption
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/exception.h"
+
+namespace parquet {
+namespace encryption {
+
+void ThrowOpenSSLRequiredException() {
+ throw ParquetException(
+ "Calling encryption method in Arrow/Parquet built without OpenSSL");
+}
+
+class AesEncryptor::AesEncryptorImpl {};
+
+AesEncryptor::~AesEncryptor() {}
+
+int AesEncryptor::SignedFooterEncrypt(const uint8_t* footer, int footer_len,
+ const uint8_t* key, int key_len, const uint8_t* aad,
+ int aad_len, const uint8_t* nonce,
+ uint8_t* encrypted_footer) {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+void AesEncryptor::WipeOut() { ThrowOpenSSLRequiredException(); }
+
+int AesEncryptor::CiphertextSizeDelta() {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+int AesEncryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, const uint8_t* key,
+ int key_len, const uint8_t* aad, int aad_len,
+ uint8_t* ciphertext) {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+AesEncryptor::AesEncryptor(ParquetCipher::type alg_id, int key_len, bool metadata) {
+ ThrowOpenSSLRequiredException();
+}
+
+class AesDecryptor::AesDecryptorImpl {};
+
+int AesDecryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len, const uint8_t* key,
+ int key_len, const uint8_t* aad, int aad_len,
+ uint8_t* plaintext) {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+void AesDecryptor::WipeOut() { ThrowOpenSSLRequiredException(); }
+
+AesDecryptor::~AesDecryptor() {}
+
+AesEncryptor* AesEncryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata,
+ std::vector<AesEncryptor*>* all_encryptors) {
+ return NULLPTR;
+}
+
+AesDecryptor::AesDecryptor(ParquetCipher::type alg_id, int key_len, bool metadata) {
+ ThrowOpenSSLRequiredException();
+}
+
+AesDecryptor* AesDecryptor::Make(ParquetCipher::type alg_id, int key_len, bool metadata,
+ std::vector<AesDecryptor*>* all_decryptors) {
+ return NULLPTR;
+}
+
+int AesDecryptor::CiphertextSizeDelta() {
+ ThrowOpenSSLRequiredException();
+ return -1;
+}
+
+std::string CreateModuleAad(const std::string& file_aad, int8_t module_type,
+ int16_t row_group_ordinal, int16_t column_ordinal,
+ int16_t page_ordinal) {
+ ThrowOpenSSLRequiredException();
+ return "";
+}
+
+std::string CreateFooterAad(const std::string& aad_prefix_bytes) {
+ ThrowOpenSSLRequiredException();
+ return "";
+}
+
+void QuickUpdatePageAad(const std::string& AAD, int16_t new_page_ordinal) {
+ ThrowOpenSSLRequiredException();
+}
+
+void RandBytes(unsigned char* buf, int num) { ThrowOpenSSLRequiredException(); }
+
+} // namespace encryption
+} // namespace parquet
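Every stub in this translation unit throws via ThrowOpenSSLRequiredException() (or returns a null/sentinel value), so a no-OpenSSL build fails loudly at the first encryption call instead of silently producing unprotected files. A minimal sketch of probing for that at runtime, assuming only the declarations from encryption_internal.h:

```cpp
#include <iostream>

#include "parquet/encryption/encryption_internal.h"
#include "parquet/exception.h"

int main() {
  try {
    // Any entry point works as a probe; RandBytes() has no other side effects.
    unsigned char buf[16];
    parquet::encryption::RandBytes(buf, static_cast<int>(sizeof(buf)));
    std::cout << "encryption support available\n";
  } catch (const parquet::ParquetException& e) {
    // Reached when this no-OpenSSL translation unit was linked in.
    std::cerr << "built without OpenSSL: " << e.what() << "\n";
  }
  return 0;
}
```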
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc
index 5aa7d010d3f..6381e4f37f7 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.cc
@@ -1,240 +1,240 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/encryption/internal_file_decryptor.h"
-#include "parquet/encryption/encryption.h"
-#include "parquet/encryption/encryption_internal.h"
-
-namespace parquet {
-
-// Decryptor
-Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key,
- const std::string& file_aad, const std::string& aad,
- ::arrow::MemoryPool* pool)
- : aes_decryptor_(aes_decryptor),
- key_(key),
- file_aad_(file_aad),
- aad_(aad),
- pool_(pool) {}
-
-int Decryptor::CiphertextSizeDelta() { return aes_decryptor_->CiphertextSizeDelta(); }
-
-int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len,
- uint8_t* plaintext) {
- return aes_decryptor_->Decrypt(ciphertext, ciphertext_len, str2bytes(key_),
- static_cast<int>(key_.size()), str2bytes(aad_),
- static_cast<int>(aad_.size()), plaintext);
-}
-
-// InternalFileDecryptor
-InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties,
- const std::string& file_aad,
- ParquetCipher::type algorithm,
- const std::string& footer_key_metadata,
- ::arrow::MemoryPool* pool)
- : properties_(properties),
- file_aad_(file_aad),
- algorithm_(algorithm),
- footer_key_metadata_(footer_key_metadata),
- pool_(pool) {
- if (properties_->is_utilized()) {
- throw ParquetException(
- "Re-using decryption properties with explicit keys for another file");
- }
- properties_->set_utilized();
-}
-
-void InternalFileDecryptor::WipeOutDecryptionKeys() {
- properties_->WipeOutDecryptionKeys();
- for (auto const& i : all_decryptors_) {
- i->WipeOut();
- }
-}
-
-std::string InternalFileDecryptor::GetFooterKey() {
- std::string footer_key = properties_->footer_key();
- // ignore footer key metadata if footer key is explicitly set via API
- if (footer_key.empty()) {
- if (footer_key_metadata_.empty())
- throw ParquetException("No footer key or key metadata");
- if (properties_->key_retriever() == nullptr)
- throw ParquetException("No footer key or key retriever");
- try {
- footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_);
- } catch (KeyAccessDeniedException& e) {
- std::stringstream ss;
- ss << "Footer key: access denied " << e.what() << "\n";
- throw ParquetException(ss.str());
- }
- }
- if (footer_key.empty()) {
- throw ParquetException(
- "Footer key unavailable. Could not verify "
- "plaintext footer metadata");
- }
- return footer_key;
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptor() {
- std::string aad = encryption::CreateFooterAad(file_aad_);
- return GetFooterDecryptor(aad, true);
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptorForColumnMeta(
- const std::string& aad) {
- return GetFooterDecryptor(aad, true);
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptorForColumnData(
- const std::string& aad) {
- return GetFooterDecryptor(aad, false);
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptor(
- const std::string& aad, bool metadata) {
- if (metadata) {
- if (footer_metadata_decryptor_ != nullptr) return footer_metadata_decryptor_;
- } else {
- if (footer_data_decryptor_ != nullptr) return footer_data_decryptor_;
- }
-
- std::string footer_key = properties_->footer_key();
- if (footer_key.empty()) {
- if (footer_key_metadata_.empty())
- throw ParquetException("No footer key or key metadata");
- if (properties_->key_retriever() == nullptr)
- throw ParquetException("No footer key or key retriever");
- try {
- footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_);
- } catch (KeyAccessDeniedException& e) {
- std::stringstream ss;
- ss << "Footer key: access denied " << e.what() << "\n";
- throw ParquetException(ss.str());
- }
- }
- if (footer_key.empty()) {
- throw ParquetException(
- "Invalid footer encryption key. "
- "Could not parse footer metadata");
- }
-
- // Create both data and metadata decryptors to avoid redundant retrieval of key
- // from the key_retriever.
- auto aes_metadata_decryptor = GetMetaAesDecryptor(footer_key.size());
- auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size());
-
- footer_metadata_decryptor_ = std::make_shared<Decryptor>(
- aes_metadata_decryptor, footer_key, file_aad_, aad, pool_);
- footer_data_decryptor_ =
- std::make_shared<Decryptor>(aes_data_decryptor, footer_key, file_aad_, aad, pool_);
-
- if (metadata) return footer_metadata_decryptor_;
- return footer_data_decryptor_;
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnMetaDecryptor(
- const std::string& column_path, const std::string& column_key_metadata,
- const std::string& aad) {
- return GetColumnDecryptor(column_path, column_key_metadata, aad, true);
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnDataDecryptor(
- const std::string& column_path, const std::string& column_key_metadata,
- const std::string& aad) {
- return GetColumnDecryptor(column_path, column_key_metadata, aad, false);
-}
-
-std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnDecryptor(
- const std::string& column_path, const std::string& column_key_metadata,
- const std::string& aad, bool metadata) {
- std::string column_key;
- // First check whether this decryptor was already created earlier.
- if (metadata) {
- if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) {
- auto res(column_metadata_map_.at(column_path));
- res->UpdateAad(aad);
- return res;
- }
- } else {
- if (column_data_map_.find(column_path) != column_data_map_.end()) {
- auto res(column_data_map_.at(column_path));
- res->UpdateAad(aad);
- return res;
- }
- }
-
- column_key = properties_->column_key(column_path);
- // No explicit column key given via API. Retrieve via key metadata.
- if (column_key.empty() && !column_key_metadata.empty() &&
- properties_->key_retriever() != nullptr) {
- try {
- column_key = properties_->key_retriever()->GetKey(column_key_metadata);
- } catch (KeyAccessDeniedException& e) {
- std::stringstream ss;
- ss << "HiddenColumnException, path=" + column_path + " " << e.what() << "\n";
- throw HiddenColumnException(ss.str());
- }
- }
- if (column_key.empty()) {
- throw HiddenColumnException("HiddenColumnException, path=" + column_path);
- }
-
- // Create both data and metadata decryptors to avoid redundant retrieval of key
- // using the key_retriever.
- auto aes_metadata_decryptor = GetMetaAesDecryptor(column_key.size());
- auto aes_data_decryptor = GetDataAesDecryptor(column_key.size());
-
- column_metadata_map_[column_path] = std::make_shared<Decryptor>(
- aes_metadata_decryptor, column_key, file_aad_, aad, pool_);
- column_data_map_[column_path] =
- std::make_shared<Decryptor>(aes_data_decryptor, column_key, file_aad_, aad, pool_);
-
- if (metadata) return column_metadata_map_[column_path];
- return column_data_map_[column_path];
-}
-
-int InternalFileDecryptor::MapKeyLenToDecryptorArrayIndex(int key_len) {
- if (key_len == 16)
- return 0;
- else if (key_len == 24)
- return 1;
- else if (key_len == 32)
- return 2;
- throw ParquetException("decryption key must be 16, 24 or 32 bytes in length");
-}
-
-encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor(size_t key_size) {
- int key_len = static_cast<int>(key_size);
- int index = MapKeyLenToDecryptorArrayIndex(key_len);
- if (meta_decryptor_[index] == nullptr) {
- meta_decryptor_[index].reset(
- encryption::AesDecryptor::Make(algorithm_, key_len, true, &all_decryptors_));
- }
- return meta_decryptor_[index].get();
-}
-
-encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor(size_t key_size) {
- int key_len = static_cast<int>(key_size);
- int index = MapKeyLenToDecryptorArrayIndex(key_len);
- if (data_decryptor_[index] == nullptr) {
- data_decryptor_[index].reset(
- encryption::AesDecryptor::Make(algorithm_, key_len, false, &all_decryptors_));
- }
- return data_decryptor_[index].get();
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/encryption/encryption.h"
+#include "parquet/encryption/encryption_internal.h"
+
+namespace parquet {
+
+// Decryptor
+Decryptor::Decryptor(encryption::AesDecryptor* aes_decryptor, const std::string& key,
+ const std::string& file_aad, const std::string& aad,
+ ::arrow::MemoryPool* pool)
+ : aes_decryptor_(aes_decryptor),
+ key_(key),
+ file_aad_(file_aad),
+ aad_(aad),
+ pool_(pool) {}
+
+int Decryptor::CiphertextSizeDelta() { return aes_decryptor_->CiphertextSizeDelta(); }
+
+int Decryptor::Decrypt(const uint8_t* ciphertext, int ciphertext_len,
+ uint8_t* plaintext) {
+ return aes_decryptor_->Decrypt(ciphertext, ciphertext_len, str2bytes(key_),
+ static_cast<int>(key_.size()), str2bytes(aad_),
+ static_cast<int>(aad_.size()), plaintext);
+}
+
+// InternalFileDecryptor
+InternalFileDecryptor::InternalFileDecryptor(FileDecryptionProperties* properties,
+ const std::string& file_aad,
+ ParquetCipher::type algorithm,
+ const std::string& footer_key_metadata,
+ ::arrow::MemoryPool* pool)
+ : properties_(properties),
+ file_aad_(file_aad),
+ algorithm_(algorithm),
+ footer_key_metadata_(footer_key_metadata),
+ pool_(pool) {
+ if (properties_->is_utilized()) {
+ throw ParquetException(
+ "Re-using decryption properties with explicit keys for another file");
+ }
+ properties_->set_utilized();
+}
+
+void InternalFileDecryptor::WipeOutDecryptionKeys() {
+ properties_->WipeOutDecryptionKeys();
+ for (auto const& i : all_decryptors_) {
+ i->WipeOut();
+ }
+}
+
+std::string InternalFileDecryptor::GetFooterKey() {
+ std::string footer_key = properties_->footer_key();
+ // ignore footer key metadata if footer key is explicitly set via API
+ if (footer_key.empty()) {
+ if (footer_key_metadata_.empty())
+ throw ParquetException("No footer key or key metadata");
+ if (properties_->key_retriever() == nullptr)
+ throw ParquetException("No footer key or key retriever");
+ try {
+ footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_);
+ } catch (KeyAccessDeniedException& e) {
+ std::stringstream ss;
+ ss << "Footer key: access denied " << e.what() << "\n";
+ throw ParquetException(ss.str());
+ }
+ }
+ if (footer_key.empty()) {
+ throw ParquetException(
+ "Footer key unavailable. Could not verify "
+ "plaintext footer metadata");
+ }
+ return footer_key;
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptor() {
+ std::string aad = encryption::CreateFooterAad(file_aad_);
+ return GetFooterDecryptor(aad, true);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptorForColumnMeta(
+ const std::string& aad) {
+ return GetFooterDecryptor(aad, true);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptorForColumnData(
+ const std::string& aad) {
+ return GetFooterDecryptor(aad, false);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetFooterDecryptor(
+ const std::string& aad, bool metadata) {
+ if (metadata) {
+ if (footer_metadata_decryptor_ != nullptr) return footer_metadata_decryptor_;
+ } else {
+ if (footer_data_decryptor_ != nullptr) return footer_data_decryptor_;
+ }
+
+ std::string footer_key = properties_->footer_key();
+ if (footer_key.empty()) {
+ if (footer_key_metadata_.empty())
+ throw ParquetException("No footer key or key metadata");
+ if (properties_->key_retriever() == nullptr)
+ throw ParquetException("No footer key or key retriever");
+ try {
+ footer_key = properties_->key_retriever()->GetKey(footer_key_metadata_);
+ } catch (KeyAccessDeniedException& e) {
+ std::stringstream ss;
+ ss << "Footer key: access denied " << e.what() << "\n";
+ throw ParquetException(ss.str());
+ }
+ }
+ if (footer_key.empty()) {
+ throw ParquetException(
+ "Invalid footer encryption key. "
+ "Could not parse footer metadata");
+ }
+
+ // Create both data and metadata decryptors to avoid redundant retrieval of key
+ // from the key_retriever.
+ auto aes_metadata_decryptor = GetMetaAesDecryptor(footer_key.size());
+ auto aes_data_decryptor = GetDataAesDecryptor(footer_key.size());
+
+ footer_metadata_decryptor_ = std::make_shared<Decryptor>(
+ aes_metadata_decryptor, footer_key, file_aad_, aad, pool_);
+ footer_data_decryptor_ =
+ std::make_shared<Decryptor>(aes_data_decryptor, footer_key, file_aad_, aad, pool_);
+
+ if (metadata) return footer_metadata_decryptor_;
+ return footer_data_decryptor_;
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnMetaDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad) {
+ return GetColumnDecryptor(column_path, column_key_metadata, aad, true);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnDataDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad) {
+ return GetColumnDecryptor(column_path, column_key_metadata, aad, false);
+}
+
+std::shared_ptr<Decryptor> InternalFileDecryptor::GetColumnDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad, bool metadata) {
+ std::string column_key;
+ // First check whether this decryptor was already created earlier.
+ if (metadata) {
+ if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) {
+ auto res(column_metadata_map_.at(column_path));
+ res->UpdateAad(aad);
+ return res;
+ }
+ } else {
+ if (column_data_map_.find(column_path) != column_data_map_.end()) {
+ auto res(column_data_map_.at(column_path));
+ res->UpdateAad(aad);
+ return res;
+ }
+ }
+
+ column_key = properties_->column_key(column_path);
+ // No explicit column key given via API. Retrieve via key metadata.
+ if (column_key.empty() && !column_key_metadata.empty() &&
+ properties_->key_retriever() != nullptr) {
+ try {
+ column_key = properties_->key_retriever()->GetKey(column_key_metadata);
+ } catch (KeyAccessDeniedException& e) {
+ std::stringstream ss;
+ ss << "HiddenColumnException, path=" + column_path + " " << e.what() << "\n";
+ throw HiddenColumnException(ss.str());
+ }
+ }
+ if (column_key.empty()) {
+ throw HiddenColumnException("HiddenColumnException, path=" + column_path);
+ }
+
+ // Create both data and metadata decryptors to avoid redundant retrieval of key
+ // using the key_retriever.
+ auto aes_metadata_decryptor = GetMetaAesDecryptor(column_key.size());
+ auto aes_data_decryptor = GetDataAesDecryptor(column_key.size());
+
+ column_metadata_map_[column_path] = std::make_shared<Decryptor>(
+ aes_metadata_decryptor, column_key, file_aad_, aad, pool_);
+ column_data_map_[column_path] =
+ std::make_shared<Decryptor>(aes_data_decryptor, column_key, file_aad_, aad, pool_);
+
+ if (metadata) return column_metadata_map_[column_path];
+ return column_data_map_[column_path];
+}
+
+int InternalFileDecryptor::MapKeyLenToDecryptorArrayIndex(int key_len) {
+ if (key_len == 16)
+ return 0;
+ else if (key_len == 24)
+ return 1;
+ else if (key_len == 32)
+ return 2;
+ throw ParquetException("decryption key must be 16, 24 or 32 bytes in length");
+}
+
+encryption::AesDecryptor* InternalFileDecryptor::GetMetaAesDecryptor(size_t key_size) {
+ int key_len = static_cast<int>(key_size);
+ int index = MapKeyLenToDecryptorArrayIndex(key_len);
+ if (meta_decryptor_[index] == nullptr) {
+ meta_decryptor_[index].reset(
+ encryption::AesDecryptor::Make(algorithm_, key_len, true, &all_decryptors_));
+ }
+ return meta_decryptor_[index].get();
+}
+
+encryption::AesDecryptor* InternalFileDecryptor::GetDataAesDecryptor(size_t key_size) {
+ int key_len = static_cast<int>(key_size);
+ int index = MapKeyLenToDecryptorArrayIndex(key_len);
+ if (data_decryptor_[index] == nullptr) {
+ data_decryptor_[index].reset(
+ encryption::AesDecryptor::Make(algorithm_, key_len, false, &all_decryptors_));
+ }
+ return data_decryptor_[index].get();
+}
+
+} // namespace parquet
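GetFooterKey() and GetColumnDecryptor() above fall back to properties_->key_retriever()->GetKey(key_metadata) whenever no explicit key was supplied. A minimal sketch of such a retriever follows, assuming the key metadata carries a plain key id (as with footer_key_id() in encryption.h) and that KeyAccessDeniedException is constructible from a string:

```cpp
#include <map>
#include <string>

#include "parquet/encryption/encryption.h"
#include "parquet/exception.h"

// Hypothetical retriever mapping key ids to keys. GetFooterKey() above turns
// a KeyAccessDeniedException into a "Footer key: access denied" ParquetException,
// and GetColumnDecryptor() turns it into a HiddenColumnException.
class MapKeyRetriever : public parquet::DecryptionKeyRetriever {
 public:
  void PutKey(const std::string& key_id, const std::string& key) {
    keys_[key_id] = key;
  }

  std::string GetKey(const std::string& key_metadata) override {
    auto it = keys_.find(key_metadata);
    if (it == keys_.end()) {
      throw parquet::KeyAccessDeniedException("unknown key id: " + key_metadata);
    }
    return it->second;  // must be 16, 24 or 32 bytes long
  }

 private:
  std::map<std::string, std::string> keys_;
};
```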
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h
index fc2bc433d92..011c4acbeb6 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_decryptor.h
@@ -1,121 +1,121 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "parquet/schema.h"
-
-namespace parquet {
-
-namespace encryption {
-class AesDecryptor;
-class AesEncryptor;
-} // namespace encryption
-
-class FileDecryptionProperties;
-
-class PARQUET_EXPORT Decryptor {
- public:
- Decryptor(encryption::AesDecryptor* decryptor, const std::string& key,
- const std::string& file_aad, const std::string& aad,
- ::arrow::MemoryPool* pool);
-
- const std::string& file_aad() const { return file_aad_; }
- void UpdateAad(const std::string& aad) { aad_ = aad; }
- ::arrow::MemoryPool* pool() { return pool_; }
-
- int CiphertextSizeDelta();
- int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext);
-
- private:
- encryption::AesDecryptor* aes_decryptor_;
- std::string key_;
- std::string file_aad_;
- std::string aad_;
- ::arrow::MemoryPool* pool_;
-};
-
-class InternalFileDecryptor {
- public:
- explicit InternalFileDecryptor(FileDecryptionProperties* properties,
- const std::string& file_aad,
- ParquetCipher::type algorithm,
- const std::string& footer_key_metadata,
- ::arrow::MemoryPool* pool);
-
- std::string& file_aad() { return file_aad_; }
-
- std::string GetFooterKey();
-
- ParquetCipher::type algorithm() { return algorithm_; }
-
- std::string& footer_key_metadata() { return footer_key_metadata_; }
-
- FileDecryptionProperties* properties() { return properties_; }
-
- void WipeOutDecryptionKeys();
-
- ::arrow::MemoryPool* pool() { return pool_; }
-
- std::shared_ptr<Decryptor> GetFooterDecryptor();
- std::shared_ptr<Decryptor> GetFooterDecryptorForColumnMeta(const std::string& aad = "");
- std::shared_ptr<Decryptor> GetFooterDecryptorForColumnData(const std::string& aad = "");
- std::shared_ptr<Decryptor> GetColumnMetaDecryptor(
- const std::string& column_path, const std::string& column_key_metadata,
- const std::string& aad = "");
- std::shared_ptr<Decryptor> GetColumnDataDecryptor(
- const std::string& column_path, const std::string& column_key_metadata,
- const std::string& aad = "");
-
- private:
- FileDecryptionProperties* properties_;
- // Concatenation of aad_prefix (if exists) and aad_file_unique
- std::string file_aad_;
- std::map<std::string, std::shared_ptr<Decryptor>> column_data_map_;
- std::map<std::string, std::shared_ptr<Decryptor>> column_metadata_map_;
-
- std::shared_ptr<Decryptor> footer_metadata_decryptor_;
- std::shared_ptr<Decryptor> footer_data_decryptor_;
- ParquetCipher::type algorithm_;
- std::string footer_key_metadata_;
- std::vector<encryption::AesDecryptor*> all_decryptors_;
-
- /// A key must be 16, 24 or 32 bytes in length, so there can be up to
- /// three kinds of meta_decryptors and data_decryptors.
- std::unique_ptr<encryption::AesDecryptor> meta_decryptor_[3];
- std::unique_ptr<encryption::AesDecryptor> data_decryptor_[3];
-
- ::arrow::MemoryPool* pool_;
-
- std::shared_ptr<Decryptor> GetFooterDecryptor(const std::string& aad, bool metadata);
- std::shared_ptr<Decryptor> GetColumnDecryptor(const std::string& column_path,
- const std::string& column_key_metadata,
- const std::string& aad,
- bool metadata = false);
-
- encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size);
- encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size);
-
- int MapKeyLenToDecryptorArrayIndex(int key_len);
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "parquet/schema.h"
+
+namespace parquet {
+
+namespace encryption {
+class AesDecryptor;
+class AesEncryptor;
+} // namespace encryption
+
+class FileDecryptionProperties;
+
+class PARQUET_EXPORT Decryptor {
+ public:
+ Decryptor(encryption::AesDecryptor* decryptor, const std::string& key,
+ const std::string& file_aad, const std::string& aad,
+ ::arrow::MemoryPool* pool);
+
+ const std::string& file_aad() const { return file_aad_; }
+ void UpdateAad(const std::string& aad) { aad_ = aad; }
+ ::arrow::MemoryPool* pool() { return pool_; }
+
+ int CiphertextSizeDelta();
+ int Decrypt(const uint8_t* ciphertext, int ciphertext_len, uint8_t* plaintext);
+
+ private:
+ encryption::AesDecryptor* aes_decryptor_;
+ std::string key_;
+ std::string file_aad_;
+ std::string aad_;
+ ::arrow::MemoryPool* pool_;
+};
+
+class InternalFileDecryptor {
+ public:
+ explicit InternalFileDecryptor(FileDecryptionProperties* properties,
+ const std::string& file_aad,
+ ParquetCipher::type algorithm,
+ const std::string& footer_key_metadata,
+ ::arrow::MemoryPool* pool);
+
+ std::string& file_aad() { return file_aad_; }
+
+ std::string GetFooterKey();
+
+ ParquetCipher::type algorithm() { return algorithm_; }
+
+ std::string& footer_key_metadata() { return footer_key_metadata_; }
+
+ FileDecryptionProperties* properties() { return properties_; }
+
+ void WipeOutDecryptionKeys();
+
+ ::arrow::MemoryPool* pool() { return pool_; }
+
+ std::shared_ptr<Decryptor> GetFooterDecryptor();
+ std::shared_ptr<Decryptor> GetFooterDecryptorForColumnMeta(const std::string& aad = "");
+ std::shared_ptr<Decryptor> GetFooterDecryptorForColumnData(const std::string& aad = "");
+ std::shared_ptr<Decryptor> GetColumnMetaDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad = "");
+ std::shared_ptr<Decryptor> GetColumnDataDecryptor(
+ const std::string& column_path, const std::string& column_key_metadata,
+ const std::string& aad = "");
+
+ private:
+ FileDecryptionProperties* properties_;
+ // Concatenation of aad_prefix (if exists) and aad_file_unique
+ std::string file_aad_;
+ std::map<std::string, std::shared_ptr<Decryptor>> column_data_map_;
+ std::map<std::string, std::shared_ptr<Decryptor>> column_metadata_map_;
+
+ std::shared_ptr<Decryptor> footer_metadata_decryptor_;
+ std::shared_ptr<Decryptor> footer_data_decryptor_;
+ ParquetCipher::type algorithm_;
+ std::string footer_key_metadata_;
+ std::vector<encryption::AesDecryptor*> all_decryptors_;
+
+ /// A key must be 16, 24 or 32 bytes in length, so there can be up to
+ /// three kinds of meta_decryptors and data_decryptors.
+ std::unique_ptr<encryption::AesDecryptor> meta_decryptor_[3];
+ std::unique_ptr<encryption::AesDecryptor> data_decryptor_[3];
+
+ ::arrow::MemoryPool* pool_;
+
+ std::shared_ptr<Decryptor> GetFooterDecryptor(const std::string& aad, bool metadata);
+ std::shared_ptr<Decryptor> GetColumnDecryptor(const std::string& column_path,
+ const std::string& column_key_metadata,
+ const std::string& aad,
+ bool metadata = false);
+
+ encryption::AesDecryptor* GetMetaAesDecryptor(size_t key_size);
+ encryption::AesDecryptor* GetDataAesDecryptor(size_t key_size);
+
+ int MapKeyLenToDecryptorArrayIndex(int key_len);
+};
+
+} // namespace parquet
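Decryptor's two-call protocol (CiphertextSizeDelta(), then Decrypt()) leaves buffer management to the caller. A sketch of the intended sizing, assuming a decryptor obtained from GetColumnDataDecryptor() above; how the delta splits between nonce and tag is cipher-specific:

```cpp
#include <cstdint>
#include <vector>

#include "parquet/encryption/internal_file_decryptor.h"

// Hypothetical helper: the plaintext can be at most ciphertext_len minus the
// cipher overhead reported by CiphertextSizeDelta(); Decrypt() returns the
// number of plaintext bytes actually written.
std::vector<uint8_t> DecryptModule(parquet::Decryptor* decryptor,
                                   const uint8_t* ciphertext,
                                   int ciphertext_len) {
  std::vector<uint8_t> plaintext(ciphertext_len - decryptor->CiphertextSizeDelta());
  int written = decryptor->Decrypt(ciphertext, ciphertext_len, plaintext.data());
  plaintext.resize(written);
  return plaintext;
}
```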
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc
index c9f265cf7f1..15bf52b84dd 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.cc
@@ -1,170 +1,170 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/encryption/internal_file_encryptor.h"
-#include "parquet/encryption/encryption.h"
-#include "parquet/encryption/encryption_internal.h"
-
-namespace parquet {
-
-// Encryptor
-Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key,
- const std::string& file_aad, const std::string& aad,
- ::arrow::MemoryPool* pool)
- : aes_encryptor_(aes_encryptor),
- key_(key),
- file_aad_(file_aad),
- aad_(aad),
- pool_(pool) {}
-
-int Encryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); }
-
-int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext) {
- return aes_encryptor_->Encrypt(plaintext, plaintext_len, str2bytes(key_),
- static_cast<int>(key_.size()), str2bytes(aad_),
- static_cast<int>(aad_.size()), ciphertext);
-}
-
-// InternalFileEncryptor
-InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties,
- ::arrow::MemoryPool* pool)
- : properties_(properties), pool_(pool) {
- if (properties_->is_utilized()) {
- throw ParquetException("Re-using encryption properties for another file");
- }
- properties_->set_utilized();
-}
-
-void InternalFileEncryptor::WipeOutEncryptionKeys() {
- properties_->WipeOutEncryptionKeys();
-
- for (auto const& i : all_encryptors_) {
- i->WipeOut();
- }
-}
-
-std::shared_ptr<Encryptor> InternalFileEncryptor::GetFooterEncryptor() {
- if (footer_encryptor_ != nullptr) {
- return footer_encryptor_;
- }
-
- ParquetCipher::type algorithm = properties_->algorithm().algorithm;
- std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad());
- std::string footer_key = properties_->footer_key();
- auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size());
- footer_encryptor_ = std::make_shared<Encryptor>(
- aes_encryptor, footer_key, properties_->file_aad(), footer_aad, pool_);
- return footer_encryptor_;
-}
-
-std::shared_ptr<Encryptor> InternalFileEncryptor::GetFooterSigningEncryptor() {
- if (footer_signing_encryptor_ != nullptr) {
- return footer_signing_encryptor_;
- }
-
- ParquetCipher::type algorithm = properties_->algorithm().algorithm;
- std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad());
- std::string footer_signing_key = properties_->footer_key();
- auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size());
- footer_signing_encryptor_ = std::make_shared<Encryptor>(
- aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad, pool_);
- return footer_signing_encryptor_;
-}
-
-std::shared_ptr<Encryptor> InternalFileEncryptor::GetColumnMetaEncryptor(
- const std::string& column_path) {
- return GetColumnEncryptor(column_path, true);
-}
-
-std::shared_ptr<Encryptor> InternalFileEncryptor::GetColumnDataEncryptor(
- const std::string& column_path) {
- return GetColumnEncryptor(column_path, false);
-}
-
-std::shared_ptr<Encryptor>
-InternalFileEncryptor::GetColumnEncryptor(
- const std::string& column_path, bool metadata) {
- // First check whether this encryptor was already created earlier.
- if (metadata) {
- if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) {
- return column_metadata_map_.at(column_path);
- }
- } else {
- if (column_data_map_.find(column_path) != column_data_map_.end()) {
- return column_data_map_.at(column_path);
- }
- }
- auto column_prop = properties_->column_encryption_properties(column_path);
- if (column_prop == nullptr) {
- return nullptr;
- }
-
- std::string key;
- if (column_prop->is_encrypted_with_footer_key()) {
- key = properties_->footer_key();
- } else {
- key = column_prop->key();
- }
-
- ParquetCipher::type algorithm = properties_->algorithm().algorithm;
- auto aes_encryptor = metadata ? GetMetaAesEncryptor(algorithm, key.size())
- : GetDataAesEncryptor(algorithm, key.size());
-
- std::string file_aad = properties_->file_aad();
- std::shared_ptr<Encryptor> encryptor =
- std::make_shared<Encryptor>(aes_encryptor, key, file_aad, "", pool_);
- if (metadata)
- column_metadata_map_[column_path] = encryptor;
- else
- column_data_map_[column_path] = encryptor;
-
- return encryptor;
-}
-
-int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int key_len) {
- if (key_len == 16)
- return 0;
- else if (key_len == 24)
- return 1;
- else if (key_len == 32)
- return 2;
- throw ParquetException("encryption key must be 16, 24 or 32 bytes in length");
-}
-
-encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor(
- ParquetCipher::type algorithm, size_t key_size) {
- int key_len = static_cast<int>(key_size);
- int index = MapKeyLenToEncryptorArrayIndex(key_len);
- if (meta_encryptor_[index] == nullptr) {
- meta_encryptor_[index].reset(
- encryption::AesEncryptor::Make(algorithm, key_len, true, &all_encryptors_));
- }
- return meta_encryptor_[index].get();
-}
-
-encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor(
- ParquetCipher::type algorithm, size_t key_size) {
- int key_len = static_cast<int>(key_size);
- int index = MapKeyLenToEncryptorArrayIndex(key_len);
- if (data_encryptor_[index] == nullptr) {
- data_encryptor_[index].reset(
- encryption::AesEncryptor::Make(algorithm, key_len, false, &all_encryptors_));
- }
- return data_encryptor_[index].get();
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/encryption/internal_file_encryptor.h"
+#include "parquet/encryption/encryption.h"
+#include "parquet/encryption/encryption_internal.h"
+
+namespace parquet {
+
+// Encryptor
+Encryptor::Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key,
+ const std::string& file_aad, const std::string& aad,
+ ::arrow::MemoryPool* pool)
+ : aes_encryptor_(aes_encryptor),
+ key_(key),
+ file_aad_(file_aad),
+ aad_(aad),
+ pool_(pool) {}
+
+int Encryptor::CiphertextSizeDelta() { return aes_encryptor_->CiphertextSizeDelta(); }
+
+int Encryptor::Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext) {
+ return aes_encryptor_->Encrypt(plaintext, plaintext_len, str2bytes(key_),
+ static_cast<int>(key_.size()), str2bytes(aad_),
+ static_cast<int>(aad_.size()), ciphertext);
+}
+
+// InternalFileEncryptor
+InternalFileEncryptor::InternalFileEncryptor(FileEncryptionProperties* properties,
+ ::arrow::MemoryPool* pool)
+ : properties_(properties), pool_(pool) {
+ if (properties_->is_utilized()) {
+ throw ParquetException("Re-using encryption properties for another file");
+ }
+ properties_->set_utilized();
+}
+
+void InternalFileEncryptor::WipeOutEncryptionKeys() {
+ properties_->WipeOutEncryptionKeys();
+
+ for (auto const& i : all_encryptors_) {
+ i->WipeOut();
+ }
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetFooterEncryptor() {
+ if (footer_encryptor_ != nullptr) {
+ return footer_encryptor_;
+ }
+
+ ParquetCipher::type algorithm = properties_->algorithm().algorithm;
+ std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad());
+ std::string footer_key = properties_->footer_key();
+ auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_key.size());
+ footer_encryptor_ = std::make_shared<Encryptor>(
+ aes_encryptor, footer_key, properties_->file_aad(), footer_aad, pool_);
+ return footer_encryptor_;
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetFooterSigningEncryptor() {
+ if (footer_signing_encryptor_ != nullptr) {
+ return footer_signing_encryptor_;
+ }
+
+ ParquetCipher::type algorithm = properties_->algorithm().algorithm;
+ std::string footer_aad = encryption::CreateFooterAad(properties_->file_aad());
+ std::string footer_signing_key = properties_->footer_key();
+ auto aes_encryptor = GetMetaAesEncryptor(algorithm, footer_signing_key.size());
+ footer_signing_encryptor_ = std::make_shared<Encryptor>(
+ aes_encryptor, footer_signing_key, properties_->file_aad(), footer_aad, pool_);
+ return footer_signing_encryptor_;
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetColumnMetaEncryptor(
+ const std::string& column_path) {
+ return GetColumnEncryptor(column_path, true);
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetColumnDataEncryptor(
+ const std::string& column_path) {
+ return GetColumnEncryptor(column_path, false);
+}
+
+std::shared_ptr<Encryptor> InternalFileEncryptor::GetColumnEncryptor(
+ const std::string& column_path, bool metadata) {
+  // First check whether an encryptor for this column was already created.
+ if (metadata) {
+ if (column_metadata_map_.find(column_path) != column_metadata_map_.end()) {
+ return column_metadata_map_.at(column_path);
+ }
+ } else {
+ if (column_data_map_.find(column_path) != column_data_map_.end()) {
+ return column_data_map_.at(column_path);
+ }
+ }
+ auto column_prop = properties_->column_encryption_properties(column_path);
+ if (column_prop == nullptr) {
+ return nullptr;
+ }
+
+ std::string key;
+ if (column_prop->is_encrypted_with_footer_key()) {
+ key = properties_->footer_key();
+ } else {
+ key = column_prop->key();
+ }
+
+ ParquetCipher::type algorithm = properties_->algorithm().algorithm;
+ auto aes_encryptor = metadata ? GetMetaAesEncryptor(algorithm, key.size())
+ : GetDataAesEncryptor(algorithm, key.size());
+
+ std::string file_aad = properties_->file_aad();
+ std::shared_ptr<Encryptor> encryptor =
+ std::make_shared<Encryptor>(aes_encryptor, key, file_aad, "", pool_);
+ if (metadata)
+ column_metadata_map_[column_path] = encryptor;
+ else
+ column_data_map_[column_path] = encryptor;
+
+ return encryptor;
+}
+
+int InternalFileEncryptor::MapKeyLenToEncryptorArrayIndex(int key_len) {
+ if (key_len == 16)
+ return 0;
+ else if (key_len == 24)
+ return 1;
+ else if (key_len == 32)
+ return 2;
+ throw ParquetException("encryption key must be 16, 24 or 32 bytes in length");
+}
+
+encryption::AesEncryptor* InternalFileEncryptor::GetMetaAesEncryptor(
+ ParquetCipher::type algorithm, size_t key_size) {
+ int key_len = static_cast<int>(key_size);
+ int index = MapKeyLenToEncryptorArrayIndex(key_len);
+ if (meta_encryptor_[index] == nullptr) {
+ meta_encryptor_[index].reset(
+ encryption::AesEncryptor::Make(algorithm, key_len, true, &all_encryptors_));
+ }
+ return meta_encryptor_[index].get();
+}
+
+encryption::AesEncryptor* InternalFileEncryptor::GetDataAesEncryptor(
+ ParquetCipher::type algorithm, size_t key_size) {
+ int key_len = static_cast<int>(key_size);
+ int index = MapKeyLenToEncryptorArrayIndex(key_len);
+ if (data_encryptor_[index] == nullptr) {
+ data_encryptor_[index].reset(
+ encryption::AesEncryptor::Make(algorithm, key_len, false, &all_encryptors_));
+ }
+ return data_encryptor_[index].get();
+}
+
+} // namespace parquet
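
For orientation, here is a minimal sketch of how a writer-side caller might drive the restored encryptor above; `props` and `pool` stand for a fully built FileEncryptionProperties pointer and a valid Arrow memory pool, whose construction is outside this file:

    #include <cstdint>
    #include <memory>
    #include <vector>

    #include "parquet/encryption/internal_file_encryptor.h"

    // Sketch only: encrypt one plaintext buffer with the footer encryptor.
    std::vector<uint8_t> EncryptFooterSketch(parquet::FileEncryptionProperties* props,
                                             ::arrow::MemoryPool* pool,
                                             const uint8_t* plaintext, int plaintext_len) {
      parquet::InternalFileEncryptor file_encryptor(props, pool);
      std::shared_ptr<parquet::Encryptor> footer = file_encryptor.GetFooterEncryptor();
      // Ciphertext is larger than plaintext by a fixed, mode-dependent delta.
      std::vector<uint8_t> ciphertext(plaintext_len + footer->CiphertextSizeDelta());
      int written = footer->Encrypt(plaintext, plaintext_len, ciphertext.data());
      ciphertext.resize(written);
      file_encryptor.WipeOutEncryptionKeys();  // wipe keys once the file is written
      return ciphertext;
    }
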
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h
index 7cf513ca810..3cbe53500c2 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/encryption/internal_file_encryptor.h
@@ -1,109 +1,109 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "parquet/encryption/encryption.h"
-#include "parquet/schema.h"
-
-namespace parquet {
-
-namespace encryption {
-class AesEncryptor;
-} // namespace encryption
-
-class FileEncryptionProperties;
-class ColumnEncryptionProperties;
-
-class PARQUET_EXPORT Encryptor {
- public:
- Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key,
- const std::string& file_aad, const std::string& aad,
- ::arrow::MemoryPool* pool);
- const std::string& file_aad() { return file_aad_; }
- void UpdateAad(const std::string& aad) { aad_ = aad; }
- ::arrow::MemoryPool* pool() { return pool_; }
-
- int CiphertextSizeDelta();
- int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext);
-
- bool EncryptColumnMetaData(
- bool encrypted_footer,
- const std::shared_ptr<ColumnEncryptionProperties>& column_encryption_properties) {
- // if column is not encrypted then do not encrypt the column metadata
- if (!column_encryption_properties || !column_encryption_properties->is_encrypted())
- return false;
- // if plaintext footer then encrypt the column metadata
- if (!encrypted_footer) return true;
- // if column is not encrypted with footer key then encrypt the column metadata
- return !column_encryption_properties->is_encrypted_with_footer_key();
- }
-
- private:
- encryption::AesEncryptor* aes_encryptor_;
- std::string key_;
- std::string file_aad_;
- std::string aad_;
- ::arrow::MemoryPool* pool_;
-};
-
-class InternalFileEncryptor {
- public:
- explicit InternalFileEncryptor(FileEncryptionProperties* properties,
- ::arrow::MemoryPool* pool);
-
- std::shared_ptr<Encryptor> GetFooterEncryptor();
- std::shared_ptr<Encryptor> GetFooterSigningEncryptor();
- std::shared_ptr<Encryptor> GetColumnMetaEncryptor(const std::string& column_path);
- std::shared_ptr<Encryptor> GetColumnDataEncryptor(const std::string& column_path);
- void WipeOutEncryptionKeys();
-
- private:
- FileEncryptionProperties* properties_;
-
- std::map<std::string, std::shared_ptr<Encryptor>> column_data_map_;
- std::map<std::string, std::shared_ptr<Encryptor>> column_metadata_map_;
-
- std::shared_ptr<Encryptor> footer_signing_encryptor_;
- std::shared_ptr<Encryptor> footer_encryptor_;
-
- std::vector<encryption::AesEncryptor*> all_encryptors_;
-
- // Key must be 16, 24 or 32 bytes in length. Thus there could be up to three
- // types of meta_encryptors and data_encryptors.
- std::unique_ptr<encryption::AesEncryptor> meta_encryptor_[3];
- std::unique_ptr<encryption::AesEncryptor> data_encryptor_[3];
-
- ::arrow::MemoryPool* pool_;
-
- std::shared_ptr<Encryptor> GetColumnEncryptor(const std::string& column_path,
- bool metadata);
-
- encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm,
- size_t key_len);
- encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm,
- size_t key_len);
-
- int MapKeyLenToEncryptorArrayIndex(int key_len);
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "parquet/encryption/encryption.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+
+namespace encryption {
+class AesEncryptor;
+} // namespace encryption
+
+class FileEncryptionProperties;
+class ColumnEncryptionProperties;
+
+class PARQUET_EXPORT Encryptor {
+ public:
+ Encryptor(encryption::AesEncryptor* aes_encryptor, const std::string& key,
+ const std::string& file_aad, const std::string& aad,
+ ::arrow::MemoryPool* pool);
+ const std::string& file_aad() { return file_aad_; }
+ void UpdateAad(const std::string& aad) { aad_ = aad; }
+ ::arrow::MemoryPool* pool() { return pool_; }
+
+ int CiphertextSizeDelta();
+ int Encrypt(const uint8_t* plaintext, int plaintext_len, uint8_t* ciphertext);
+
+ bool EncryptColumnMetaData(
+ bool encrypted_footer,
+ const std::shared_ptr<ColumnEncryptionProperties>& column_encryption_properties) {
+    // If the column is not encrypted, do not encrypt its metadata.
+    if (!column_encryption_properties || !column_encryption_properties->is_encrypted())
+      return false;
+    // In plaintext footer mode, the column metadata must be encrypted.
+    if (!encrypted_footer) return true;
+    // With an encrypted footer, encrypt the metadata only when the column is
+    // not encrypted with the footer key.
+    return !column_encryption_properties->is_encrypted_with_footer_key();
+ }
+
+ private:
+ encryption::AesEncryptor* aes_encryptor_;
+ std::string key_;
+ std::string file_aad_;
+ std::string aad_;
+ ::arrow::MemoryPool* pool_;
+};
+
+class InternalFileEncryptor {
+ public:
+ explicit InternalFileEncryptor(FileEncryptionProperties* properties,
+ ::arrow::MemoryPool* pool);
+
+ std::shared_ptr<Encryptor> GetFooterEncryptor();
+ std::shared_ptr<Encryptor> GetFooterSigningEncryptor();
+ std::shared_ptr<Encryptor> GetColumnMetaEncryptor(const std::string& column_path);
+ std::shared_ptr<Encryptor> GetColumnDataEncryptor(const std::string& column_path);
+ void WipeOutEncryptionKeys();
+
+ private:
+ FileEncryptionProperties* properties_;
+
+ std::map<std::string, std::shared_ptr<Encryptor>> column_data_map_;
+ std::map<std::string, std::shared_ptr<Encryptor>> column_metadata_map_;
+
+ std::shared_ptr<Encryptor> footer_signing_encryptor_;
+ std::shared_ptr<Encryptor> footer_encryptor_;
+
+ std::vector<encryption::AesEncryptor*> all_encryptors_;
+
+ // Key must be 16, 24 or 32 bytes in length. Thus there could be up to three
+ // types of meta_encryptors and data_encryptors.
+ std::unique_ptr<encryption::AesEncryptor> meta_encryptor_[3];
+ std::unique_ptr<encryption::AesEncryptor> data_encryptor_[3];
+
+ ::arrow::MemoryPool* pool_;
+
+ std::shared_ptr<Encryptor> GetColumnEncryptor(const std::string& column_path,
+ bool metadata);
+
+ encryption::AesEncryptor* GetMetaAesEncryptor(ParquetCipher::type algorithm,
+ size_t key_len);
+ encryption::AesEncryptor* GetDataAesEncryptor(ParquetCipher::type algorithm,
+ size_t key_len);
+
+ int MapKeyLenToEncryptorArrayIndex(int key_len);
+};
+
+} // namespace parquet
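
As a reading aid, the EncryptColumnMetaData predicate declared above reduces to a pure function of three flags; the restatement below is illustrative only, and its names are not part of the API:

    // Illustrative restatement of the column-metadata encryption decision.
    bool ShouldEncryptColumnMetaData(bool column_is_encrypted, bool encrypted_footer,
                                     bool encrypted_with_footer_key) {
      if (!column_is_encrypted) return false;  // plaintext column: nothing to protect
      if (!encrypted_footer) return true;      // plaintext footer: metadata needs encryption
      return !encrypted_with_footer_key;       // footer encryption covers footer-key columns
    }
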
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/exception.cc b/contrib/libs/apache/arrow/cpp/src/parquet/exception.cc
index 909a5079c76..c333957dd1d 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/exception.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/exception.cc
@@ -1,27 +1,27 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/exception.h"
-
-namespace parquet {
-
-std::ostream& operator<<(std::ostream& os, const ParquetException& exception) {
- os << exception.what();
- return os;
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/exception.h"
+
+namespace parquet {
+
+std::ostream& operator<<(std::ostream& os, const ParquetException& exception) {
+ os << exception.what();
+ return os;
+}
+
+} // namespace parquet
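
A minimal usage sketch for the printer defined above (the message text is made up):

    #include <iostream>

    #include "parquet/exception.h"

    int main() {
      try {
        throw parquet::ParquetException("example failure");
      } catch (const parquet::ParquetException& e) {
        std::cerr << e << "\n";  // resolves to the operator<< defined above
      }
      return 0;
    }
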
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/exception.h b/contrib/libs/apache/arrow/cpp/src/parquet/exception.h
index a76761c63c3..826f5bdc8bf 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/exception.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/exception.h
@@ -1,158 +1,158 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <exception>
-#include <sstream>
-#include <string>
-#include <utility>
-
-#include "arrow/type_fwd.h"
-#include "arrow/util/string_builder.h"
-#include "parquet/platform.h"
-
-// PARQUET-1085
-#if !defined(ARROW_UNUSED)
-#define ARROW_UNUSED(x) UNUSED(x)
-#endif
-
-// Parquet exception to Arrow Status
-
-#define BEGIN_PARQUET_CATCH_EXCEPTIONS try {
-#define END_PARQUET_CATCH_EXCEPTIONS \
- } \
- catch (const ::parquet::ParquetStatusException& e) { \
- return e.status(); \
- } \
- catch (const ::parquet::ParquetException& e) { \
- return ::arrow::Status::IOError(e.what()); \
- }
-
-// clang-format off
-
-#define PARQUET_CATCH_NOT_OK(s) \
- BEGIN_PARQUET_CATCH_EXCEPTIONS \
- (s); \
- END_PARQUET_CATCH_EXCEPTIONS
-
-// clang-format on
-
-#define PARQUET_CATCH_AND_RETURN(s) \
- BEGIN_PARQUET_CATCH_EXCEPTIONS \
- return (s); \
- END_PARQUET_CATCH_EXCEPTIONS
-
-// Arrow Status to Parquet exception
-
-#define PARQUET_IGNORE_NOT_OK(s) \
- do { \
- ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
- ARROW_UNUSED(_s); \
- } while (0)
-
-#define PARQUET_THROW_NOT_OK(s) \
- do { \
- ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
- if (!_s.ok()) { \
- throw ::parquet::ParquetStatusException(std::move(_s)); \
- } \
- } while (0)
-
-#define PARQUET_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
- auto status_name = (rexpr); \
- PARQUET_THROW_NOT_OK(status_name.status()); \
- lhs = std::move(status_name).ValueOrDie();
-
-#define PARQUET_ASSIGN_OR_THROW(lhs, rexpr) \
- PARQUET_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
- lhs, rexpr);
-
-namespace parquet {
-
-class ParquetException : public std::exception {
- public:
- PARQUET_NORETURN static void EofException(const std::string& msg = "") {
- static std::string prefix = "Unexpected end of stream";
- if (msg.empty()) {
- throw ParquetException(prefix);
- }
- throw ParquetException(prefix, ": ", msg);
- }
-
- PARQUET_NORETURN static void NYI(const std::string& msg = "") {
- throw ParquetException("Not yet implemented: ", msg, ".");
- }
-
- template <typename... Args>
- explicit ParquetException(Args&&... args)
- : msg_(::arrow::util::StringBuilder(std::forward<Args>(args)...)) {}
-
- explicit ParquetException(std::string msg) : msg_(std::move(msg)) {}
-
- explicit ParquetException(const char* msg, const std::exception&) : msg_(msg) {}
-
- ParquetException(const ParquetException&) = default;
- ParquetException& operator=(const ParquetException&) = default;
- ParquetException(ParquetException&&) = default;
- ParquetException& operator=(ParquetException&&) = default;
-
- const char* what() const noexcept override { return msg_.c_str(); }
-
- private:
- std::string msg_;
-};
-
-// Support printing a ParquetException.
-// This is needed for clang-on-MSVC as there operator<< is not defined for
-// std::exception.
-PARQUET_EXPORT
-std::ostream& operator<<(std::ostream& os, const ParquetException& exception);
-
-class ParquetStatusException : public ParquetException {
- public:
- explicit ParquetStatusException(::arrow::Status status)
- : ParquetException(status.ToString()), status_(std::move(status)) {}
-
- const ::arrow::Status& status() const { return status_; }
-
- private:
- ::arrow::Status status_;
-};
-
-// This class exists for the purpose of detecting an invalid or corrupted file.
-class ParquetInvalidOrCorruptedFileException : public ParquetStatusException {
- public:
- ParquetInvalidOrCorruptedFileException(const ParquetInvalidOrCorruptedFileException&) =
- default;
-
- template <typename Arg,
- typename std::enable_if<
- !std::is_base_of<ParquetInvalidOrCorruptedFileException, Arg>::value,
- int>::type = 0,
- typename... Args>
- explicit ParquetInvalidOrCorruptedFileException(Arg arg, Args&&... args)
- : ParquetStatusException(::arrow::Status::Invalid(std::forward<Arg>(arg),
- std::forward<Args>(args)...)) {}
-};
-
-template <typename StatusReturnBlock>
-void ThrowNotOk(StatusReturnBlock&& b) {
- PARQUET_THROW_NOT_OK(b());
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <exception>
+#include <sstream>
+#include <string>
+#include <utility>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/string_builder.h"
+#include "parquet/platform.h"
+
+// PARQUET-1085
+#if !defined(ARROW_UNUSED)
+#define ARROW_UNUSED(x) UNUSED(x)
+#endif
+
+// Parquet exception to Arrow Status
+
+#define BEGIN_PARQUET_CATCH_EXCEPTIONS try {
+#define END_PARQUET_CATCH_EXCEPTIONS \
+ } \
+ catch (const ::parquet::ParquetStatusException& e) { \
+ return e.status(); \
+ } \
+ catch (const ::parquet::ParquetException& e) { \
+ return ::arrow::Status::IOError(e.what()); \
+ }
+
+// clang-format off
+
+#define PARQUET_CATCH_NOT_OK(s) \
+ BEGIN_PARQUET_CATCH_EXCEPTIONS \
+ (s); \
+ END_PARQUET_CATCH_EXCEPTIONS
+
+// clang-format on
+
+#define PARQUET_CATCH_AND_RETURN(s) \
+ BEGIN_PARQUET_CATCH_EXCEPTIONS \
+ return (s); \
+ END_PARQUET_CATCH_EXCEPTIONS
+
+// Arrow Status to Parquet exception
+
+#define PARQUET_IGNORE_NOT_OK(s) \
+ do { \
+ ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
+ ARROW_UNUSED(_s); \
+ } while (0)
+
+#define PARQUET_THROW_NOT_OK(s) \
+ do { \
+ ::arrow::Status _s = ::arrow::internal::GenericToStatus(s); \
+ if (!_s.ok()) { \
+ throw ::parquet::ParquetStatusException(std::move(_s)); \
+ } \
+ } while (0)
+
+#define PARQUET_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
+ auto status_name = (rexpr); \
+ PARQUET_THROW_NOT_OK(status_name.status()); \
+ lhs = std::move(status_name).ValueOrDie();
+
+#define PARQUET_ASSIGN_OR_THROW(lhs, rexpr) \
+ PARQUET_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
+ lhs, rexpr);
+
+namespace parquet {
+
+class ParquetException : public std::exception {
+ public:
+ PARQUET_NORETURN static void EofException(const std::string& msg = "") {
+ static std::string prefix = "Unexpected end of stream";
+ if (msg.empty()) {
+ throw ParquetException(prefix);
+ }
+ throw ParquetException(prefix, ": ", msg);
+ }
+
+ PARQUET_NORETURN static void NYI(const std::string& msg = "") {
+ throw ParquetException("Not yet implemented: ", msg, ".");
+ }
+
+ template <typename... Args>
+ explicit ParquetException(Args&&... args)
+ : msg_(::arrow::util::StringBuilder(std::forward<Args>(args)...)) {}
+
+ explicit ParquetException(std::string msg) : msg_(std::move(msg)) {}
+
+ explicit ParquetException(const char* msg, const std::exception&) : msg_(msg) {}
+
+ ParquetException(const ParquetException&) = default;
+ ParquetException& operator=(const ParquetException&) = default;
+ ParquetException(ParquetException&&) = default;
+ ParquetException& operator=(ParquetException&&) = default;
+
+ const char* what() const noexcept override { return msg_.c_str(); }
+
+ private:
+ std::string msg_;
+};
+
+// Support printing a ParquetException.
+// This is needed for clang-on-MSVC, where operator<< is not defined for
+// std::exception.
+PARQUET_EXPORT
+std::ostream& operator<<(std::ostream& os, const ParquetException& exception);
+
+class ParquetStatusException : public ParquetException {
+ public:
+ explicit ParquetStatusException(::arrow::Status status)
+ : ParquetException(status.ToString()), status_(std::move(status)) {}
+
+ const ::arrow::Status& status() const { return status_; }
+
+ private:
+ ::arrow::Status status_;
+};
+
+// This class signals an invalid or corrupted file.
+class ParquetInvalidOrCorruptedFileException : public ParquetStatusException {
+ public:
+ ParquetInvalidOrCorruptedFileException(const ParquetInvalidOrCorruptedFileException&) =
+ default;
+
+ template <typename Arg,
+ typename std::enable_if<
+ !std::is_base_of<ParquetInvalidOrCorruptedFileException, Arg>::value,
+ int>::type = 0,
+ typename... Args>
+ explicit ParquetInvalidOrCorruptedFileException(Arg arg, Args&&... args)
+ : ParquetStatusException(::arrow::Status::Invalid(std::forward<Arg>(arg),
+ std::forward<Args>(args)...)) {}
+};
+
+template <typename StatusReturnBlock>
+void ThrowNotOk(StatusReturnBlock&& b) {
+ PARQUET_THROW_NOT_OK(b());
+}
+
+} // namespace parquet
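
To illustrate the two directions these macros bridge, here is a sketch; ReadSomething is a hypothetical helper standing in for any Arrow API that returns a Result:

    #include "arrow/result.h"
    #include "arrow/status.h"
    #include "parquet/exception.h"

    // Hypothetical helper; any ::arrow::Result-returning call works the same way.
    ::arrow::Result<int> ReadSomething();

    void ThrowingCaller() {
      // Result -> exception: throws ParquetStatusException on error.
      PARQUET_ASSIGN_OR_THROW(int value, ReadSomething());
      (void)value;
    }

    ::arrow::Status StatusReturningCaller() {
      // Exception -> Status: converts a ParquetException back into a Status.
      PARQUET_CATCH_NOT_OK(ThrowingCaller());
      return ::arrow::Status::OK();
    }
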
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc b/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc
index 3c3c124987e..4e38901aa0d 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.cc
@@ -1,868 +1,868 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/file_reader.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <ostream>
-#include <string>
-#include <utility>
-
-#include "arrow/io/caching.h"
-#include "arrow/io/file.h"
-#include "arrow/io/memory.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/future.h"
-#include "arrow/util/int_util_internal.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/ubsan.h"
-#include "parquet/column_reader.h"
-#include "parquet/column_scanner.h"
-#include "parquet/encryption/encryption_internal.h"
-#include "parquet/encryption/internal_file_decryptor.h"
-#include "parquet/exception.h"
-#include "parquet/file_writer.h"
-#include "parquet/metadata.h"
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-using arrow::internal::AddWithOverflow;
-
-namespace parquet {
-
-// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
-static constexpr int64_t kDefaultFooterReadSize = 64 * 1024;
-static constexpr uint32_t kFooterSize = 8;
-
-// For PARQUET-816
-static constexpr int64_t kMaxDictHeaderSize = 100;
-
-// ----------------------------------------------------------------------
-// RowGroupReader public API
-
-RowGroupReader::RowGroupReader(std::unique_ptr<Contents> contents)
- : contents_(std::move(contents)) {}
-
-std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
- if (i >= metadata()->num_columns()) {
- std::stringstream ss;
- ss << "Trying to read column index " << i << " but row group metadata has only "
- << metadata()->num_columns() << " columns";
- throw ParquetException(ss.str());
- }
- const ColumnDescriptor* descr = metadata()->schema()->Column(i);
-
- std::unique_ptr<PageReader> page_reader = contents_->GetColumnPageReader(i);
- return ColumnReader::Make(
- descr, std::move(page_reader),
- const_cast<ReaderProperties*>(contents_->properties())->memory_pool());
-}
-
-std::shared_ptr<ColumnReader> RowGroupReader::ColumnWithExposeEncoding(
- int i, ExposedEncoding encoding_to_expose) {
- std::shared_ptr<ColumnReader> reader = Column(i);
-
- if (encoding_to_expose == ExposedEncoding::DICTIONARY) {
- // Check the encoding_stats to see if all data pages are dictionary encoded.
- std::unique_ptr<ColumnChunkMetaData> col = metadata()->ColumnChunk(i);
- const std::vector<PageEncodingStats>& encoding_stats = col->encoding_stats();
- if (encoding_stats.empty()) {
-      // Some Parquet files have empty encoding_stats; in that case we cannot
-      // be sure that all data pages are dictionary encoded, so we do not
-      // expose the dictionary.
- return reader;
- }
- // The 1st page should be the dictionary page.
- if (encoding_stats[0].page_type != PageType::DICTIONARY_PAGE ||
- (encoding_stats[0].encoding != Encoding::PLAIN &&
- encoding_stats[0].encoding != Encoding::PLAIN_DICTIONARY)) {
- return reader;
- }
- // The following pages should be dictionary encoded data pages.
- for (size_t idx = 1; idx < encoding_stats.size(); ++idx) {
- if ((encoding_stats[idx].encoding != Encoding::RLE_DICTIONARY &&
- encoding_stats[idx].encoding != Encoding::PLAIN_DICTIONARY) ||
- (encoding_stats[idx].page_type != PageType::DATA_PAGE &&
- encoding_stats[idx].page_type != PageType::DATA_PAGE_V2)) {
- return reader;
- }
- }
- } else {
-    // Exposing other encodings is not supported for now.
- return reader;
- }
-
- // Set exposed encoding.
- reader->SetExposedEncoding(encoding_to_expose);
- return reader;
-}
-
-std::unique_ptr<PageReader> RowGroupReader::GetColumnPageReader(int i) {
- if (i >= metadata()->num_columns()) {
- std::stringstream ss;
- ss << "Trying to read column index " << i << " but row group metadata has only "
- << metadata()->num_columns() << " columns";
- throw ParquetException(ss.str());
- }
- return contents_->GetColumnPageReader(i);
-}
-
-// Returns the row group metadata
-const RowGroupMetaData* RowGroupReader::metadata() const { return contents_->metadata(); }
-
-/// Compute the section of the file that should be read for the given
-/// row group and column chunk.
-::arrow::io::ReadRange ComputeColumnChunkRange(FileMetaData* file_metadata,
- int64_t source_size, int row_group_index,
- int column_index) {
- auto row_group_metadata = file_metadata->RowGroup(row_group_index);
- auto column_metadata = row_group_metadata->ColumnChunk(column_index);
-
- int64_t col_start = column_metadata->data_page_offset();
- if (column_metadata->has_dictionary_page() &&
- column_metadata->dictionary_page_offset() > 0 &&
- col_start > column_metadata->dictionary_page_offset()) {
- col_start = column_metadata->dictionary_page_offset();
- }
-
- int64_t col_length = column_metadata->total_compressed_size();
- int64_t col_end;
- if (AddWithOverflow(col_start, col_length, &col_end) || col_end > source_size) {
- throw ParquetException("Invalid column metadata (corrupt file?)");
- }
-
- // PARQUET-816 workaround for old files created by older parquet-mr
- const ApplicationVersion& version = file_metadata->writer_version();
- if (version.VersionLt(ApplicationVersion::PARQUET_816_FIXED_VERSION())) {
- // The Parquet MR writer had a bug in 1.2.8 and below where it didn't include the
- // dictionary page header size in total_compressed_size and total_uncompressed_size
- // (see IMPALA-694). We add padding to compensate.
- int64_t bytes_remaining = source_size - col_end;
- int64_t padding = std::min<int64_t>(kMaxDictHeaderSize, bytes_remaining);
- col_length += padding;
- }
-
- return {col_start, col_length};
-}
-
-// RowGroupReader::Contents implementation for the Parquet file specification
-class SerializedRowGroup : public RowGroupReader::Contents {
- public:
- SerializedRowGroup(std::shared_ptr<ArrowInputFile> source,
- std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source,
- int64_t source_size, FileMetaData* file_metadata,
- int row_group_number, const ReaderProperties& props,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = nullptr)
- : source_(std::move(source)),
- cached_source_(std::move(cached_source)),
- source_size_(source_size),
- file_metadata_(file_metadata),
- properties_(props),
- row_group_ordinal_(row_group_number),
- file_decryptor_(file_decryptor) {
- row_group_metadata_ = file_metadata->RowGroup(row_group_number);
- }
-
- const RowGroupMetaData* metadata() const override { return row_group_metadata_.get(); }
-
- const ReaderProperties* properties() const override { return &properties_; }
-
- std::unique_ptr<PageReader> GetColumnPageReader(int i) override {
- // Read column chunk from the file
- auto col = row_group_metadata_->ColumnChunk(i);
-
- ::arrow::io::ReadRange col_range =
- ComputeColumnChunkRange(file_metadata_, source_size_, row_group_ordinal_, i);
- std::shared_ptr<ArrowInputStream> stream;
- if (cached_source_) {
- // PARQUET-1698: if read coalescing is enabled, read from pre-buffered
- // segments.
- PARQUET_ASSIGN_OR_THROW(auto buffer, cached_source_->Read(col_range));
- stream = std::make_shared<::arrow::io::BufferReader>(buffer);
- } else {
- stream = properties_.GetStream(source_, col_range.offset, col_range.length);
- }
-
- std::unique_ptr<ColumnCryptoMetaData> crypto_metadata = col->crypto_metadata();
-
- // Column is encrypted only if crypto_metadata exists.
- if (!crypto_metadata) {
- return PageReader::Open(stream, col->num_values(), col->compression(),
- properties_.memory_pool());
- }
-
- if (file_decryptor_ == nullptr) {
- throw ParquetException("RowGroup is noted as encrypted but no file decryptor");
- }
-
- constexpr auto kEncryptedRowGroupsLimit = 32767;
- if (i > kEncryptedRowGroupsLimit) {
- throw ParquetException("Encrypted files cannot contain more than 32767 row groups");
- }
-
- // The column is encrypted
- std::shared_ptr<Decryptor> meta_decryptor;
- std::shared_ptr<Decryptor> data_decryptor;
- // The column is encrypted with footer key
- if (crypto_metadata->encrypted_with_footer_key()) {
- meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta();
- data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData();
- CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
- static_cast<int16_t>(i), meta_decryptor, data_decryptor);
- return PageReader::Open(stream, col->num_values(), col->compression(),
- properties_.memory_pool(), &ctx);
- }
-
- // The column is encrypted with its own key
- std::string column_key_metadata = crypto_metadata->key_metadata();
- const std::string column_path = crypto_metadata->path_in_schema()->ToDotString();
-
- meta_decryptor =
- file_decryptor_->GetColumnMetaDecryptor(column_path, column_key_metadata);
- data_decryptor =
- file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata);
-
- CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
- static_cast<int16_t>(i), meta_decryptor, data_decryptor);
- return PageReader::Open(stream, col->num_values(), col->compression(),
- properties_.memory_pool(), &ctx);
- }
-
- private:
- std::shared_ptr<ArrowInputFile> source_;
- // Will be nullptr if PreBuffer() is not called.
- std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source_;
- int64_t source_size_;
- FileMetaData* file_metadata_;
- std::unique_ptr<RowGroupMetaData> row_group_metadata_;
- ReaderProperties properties_;
- int row_group_ordinal_;
- std::shared_ptr<InternalFileDecryptor> file_decryptor_;
-};
-
-// ----------------------------------------------------------------------
-// SerializedFile: An implementation of ParquetFileReader::Contents that deals
-// with the Parquet file structure, Thrift deserialization, and other internal
-// matters
-
-// This class takes ownership of the provided data source
-class SerializedFile : public ParquetFileReader::Contents {
- public:
- SerializedFile(std::shared_ptr<ArrowInputFile> source,
- const ReaderProperties& props = default_reader_properties())
- : source_(std::move(source)), properties_(props) {
- PARQUET_ASSIGN_OR_THROW(source_size_, source_->GetSize());
- }
-
- ~SerializedFile() override {
- try {
- Close();
- } catch (...) {
- }
- }
-
- void Close() override {
- if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys();
- }
-
- std::shared_ptr<RowGroupReader> GetRowGroup(int i) override {
- std::unique_ptr<SerializedRowGroup> contents(
- new SerializedRowGroup(source_, cached_source_, source_size_,
- file_metadata_.get(), i, properties_, file_decryptor_));
- return std::make_shared<RowGroupReader>(std::move(contents));
- }
-
- std::shared_ptr<FileMetaData> metadata() const override { return file_metadata_; }
-
- void set_metadata(std::shared_ptr<FileMetaData> metadata) {
- file_metadata_ = std::move(metadata);
- }
-
- void PreBuffer(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices,
- const ::arrow::io::IOContext& ctx,
- const ::arrow::io::CacheOptions& options) {
- cached_source_ =
- std::make_shared<::arrow::io::internal::ReadRangeCache>(source_, ctx, options);
- std::vector<::arrow::io::ReadRange> ranges;
- for (int row : row_groups) {
- for (int col : column_indices) {
- ranges.push_back(
- ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col));
- }
- }
- PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges));
- }
-
- ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices) const {
- if (!cached_source_) {
- return ::arrow::Status::Invalid("Must call PreBuffer before WhenBuffered");
- }
- std::vector<::arrow::io::ReadRange> ranges;
- for (int row : row_groups) {
- for (int col : column_indices) {
- ranges.push_back(
- ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col));
- }
- }
- return cached_source_->WaitFor(ranges);
- }
-
- // Metadata/footer parsing. Divided up to separate sync/async paths, and to use
- // exceptions for error handling (with the async path converting to Future/Status).
-
- void ParseMetaData() {
- int64_t footer_read_size = GetFooterReadSize();
- PARQUET_ASSIGN_OR_THROW(
- auto footer_buffer,
- source_->ReadAt(source_size_ - footer_read_size, footer_read_size));
- uint32_t metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
- int64_t metadata_start = source_size_ - kFooterSize - metadata_len;
-
- std::shared_ptr<::arrow::Buffer> metadata_buffer;
- if (footer_read_size >= (metadata_len + kFooterSize)) {
- metadata_buffer = SliceBuffer(
- footer_buffer, footer_read_size - metadata_len - kFooterSize, metadata_len);
- } else {
- PARQUET_ASSIGN_OR_THROW(metadata_buffer,
- source_->ReadAt(metadata_start, metadata_len));
- }
-
- // Parse the footer depending on encryption type
- const bool is_encrypted_footer =
- memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0;
- if (is_encrypted_footer) {
- // Encrypted file with Encrypted footer.
- const std::pair<int64_t, uint32_t> read_size =
- ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len);
- // Read the actual footer
- metadata_start = read_size.first;
- metadata_len = read_size.second;
- PARQUET_ASSIGN_OR_THROW(metadata_buffer,
- source_->ReadAt(metadata_start, metadata_len));
- // Fall through
- }
-
- const uint32_t read_metadata_len =
- ParseUnencryptedFileMetadata(metadata_buffer, metadata_len);
- auto file_decryption_properties = properties_.file_decryption_properties().get();
- if (is_encrypted_footer) {
- // Nothing else to do here.
- return;
-    } else if (!file_metadata_->is_encryption_algorithm_set()) {  // Unencrypted file.
- if (file_decryption_properties != nullptr) {
- if (!file_decryption_properties->plaintext_files_allowed()) {
- throw ParquetException("Applying decryption properties on plaintext file");
- }
- }
- } else {
- // Encrypted file with plaintext footer mode.
- ParseMetaDataOfEncryptedFileWithPlaintextFooter(
- file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
- }
- }
-
- // Validate the source size and get the initial read size.
- int64_t GetFooterReadSize() {
- if (source_size_ == 0) {
- throw ParquetInvalidOrCorruptedFileException("Parquet file size is 0 bytes");
- } else if (source_size_ < kFooterSize) {
- throw ParquetInvalidOrCorruptedFileException(
- "Parquet file size is ", source_size_,
- " bytes, smaller than the minimum file footer (", kFooterSize, " bytes)");
- }
- return std::min(source_size_, kDefaultFooterReadSize);
- }
-
- // Validate the magic bytes and get the length of the full footer.
- uint32_t ParseFooterLength(const std::shared_ptr<::arrow::Buffer>& footer_buffer,
- const int64_t footer_read_size) {
- // Check if all bytes are read. Check if last 4 bytes read have the magic bits
- if (footer_buffer->size() != footer_read_size ||
- (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 &&
- memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) {
- throw ParquetInvalidOrCorruptedFileException(
- "Parquet magic bytes not found in footer. Either the file is corrupted or this "
- "is not a parquet file.");
- }
- // Both encrypted/unencrypted footers have the same footer length check.
- uint32_t metadata_len = ::arrow::util::SafeLoadAs<uint32_t>(
- reinterpret_cast<const uint8_t*>(footer_buffer->data()) + footer_read_size -
- kFooterSize);
- if (metadata_len > source_size_ - kFooterSize) {
- throw ParquetInvalidOrCorruptedFileException(
- "Parquet file size is ", source_size_,
- " bytes, smaller than the size reported by footer's (", metadata_len, "bytes)");
- }
- return metadata_len;
- }
-
- // Does not throw.
- ::arrow::Future<> ParseMetaDataAsync() {
- int64_t footer_read_size;
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- footer_read_size = GetFooterReadSize();
- END_PARQUET_CATCH_EXCEPTIONS
- // Assumes this is kept alive externally
- return source_->ReadAsync(source_size_ - footer_read_size, footer_read_size)
- .Then([=](const std::shared_ptr<::arrow::Buffer>& footer_buffer)
- -> ::arrow::Future<> {
- uint32_t metadata_len;
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
- END_PARQUET_CATCH_EXCEPTIONS
- int64_t metadata_start = source_size_ - kFooterSize - metadata_len;
-
- std::shared_ptr<::arrow::Buffer> metadata_buffer;
- if (footer_read_size >= (metadata_len + kFooterSize)) {
- metadata_buffer =
- SliceBuffer(footer_buffer, footer_read_size - metadata_len - kFooterSize,
- metadata_len);
- return ParseMaybeEncryptedMetaDataAsync(footer_buffer,
- std::move(metadata_buffer),
- footer_read_size, metadata_len);
- }
- return source_->ReadAsync(metadata_start, metadata_len)
- .Then([=](const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
- return ParseMaybeEncryptedMetaDataAsync(footer_buffer, metadata_buffer,
- footer_read_size, metadata_len);
- });
- });
- }
-
- // Continuation
- ::arrow::Future<> ParseMaybeEncryptedMetaDataAsync(
- std::shared_ptr<::arrow::Buffer> footer_buffer,
- std::shared_ptr<::arrow::Buffer> metadata_buffer, int64_t footer_read_size,
- uint32_t metadata_len) {
- // Parse the footer depending on encryption type
- const bool is_encrypted_footer =
- memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0;
- if (is_encrypted_footer) {
- // Encrypted file with Encrypted footer.
- std::pair<int64_t, uint32_t> read_size;
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- read_size =
- ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len);
- END_PARQUET_CATCH_EXCEPTIONS
- // Read the actual footer
- int64_t metadata_start = read_size.first;
- metadata_len = read_size.second;
- return source_->ReadAsync(metadata_start, metadata_len)
- .Then([=](const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
- // Continue and read the file footer
- return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer);
- });
- }
- return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len,
- is_encrypted_footer);
- }
-
- // Continuation
- ::arrow::Status ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer,
- uint32_t metadata_len,
- const bool is_encrypted_footer) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- const uint32_t read_metadata_len =
- ParseUnencryptedFileMetadata(metadata_buffer, metadata_len);
- auto file_decryption_properties = properties_.file_decryption_properties().get();
- if (is_encrypted_footer) {
- // Nothing else to do here.
- return ::arrow::Status::OK();
-    } else if (!file_metadata_->is_encryption_algorithm_set()) {  // Unencrypted file.
- if (file_decryption_properties != nullptr) {
- if (!file_decryption_properties->plaintext_files_allowed()) {
- throw ParquetException("Applying decryption properties on plaintext file");
- }
- }
- } else {
- // Encrypted file with plaintext footer mode.
- ParseMetaDataOfEncryptedFileWithPlaintextFooter(
- file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
- }
- END_PARQUET_CATCH_EXCEPTIONS
- return ::arrow::Status::OK();
- }
-
- private:
- std::shared_ptr<ArrowInputFile> source_;
- std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source_;
- int64_t source_size_;
- std::shared_ptr<FileMetaData> file_metadata_;
- ReaderProperties properties_;
-
- std::shared_ptr<InternalFileDecryptor> file_decryptor_;
-
- // \return The true length of the metadata in bytes
- uint32_t ParseUnencryptedFileMetadata(const std::shared_ptr<Buffer>& footer_buffer,
- const uint32_t metadata_len);
-
- std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties,
- EncryptionAlgorithm& algo);
-
- void ParseMetaDataOfEncryptedFileWithPlaintextFooter(
- FileDecryptionProperties* file_decryption_properties,
- const std::shared_ptr<Buffer>& metadata_buffer, uint32_t metadata_len,
- uint32_t read_metadata_len);
-
- // \return The position and size of the actual footer
- std::pair<int64_t, uint32_t> ParseMetaDataOfEncryptedFileWithEncryptedFooter(
- const std::shared_ptr<Buffer>& crypto_metadata_buffer, uint32_t footer_len);
-};
-
-uint32_t SerializedFile::ParseUnencryptedFileMetadata(
- const std::shared_ptr<Buffer>& metadata_buffer, const uint32_t metadata_len) {
- if (metadata_buffer->size() != metadata_len) {
- throw ParquetException("Failed reading metadata buffer (requested " +
- std::to_string(metadata_len) + " bytes but got " +
- std::to_string(metadata_buffer->size()) + " bytes)");
- }
- uint32_t read_metadata_len = metadata_len;
- // The encrypted read path falls through to here, so pass in the decryptor
- file_metadata_ =
- FileMetaData::Make(metadata_buffer->data(), &read_metadata_len, file_decryptor_);
- return read_metadata_len;
-}
-
-std::pair<int64_t, uint32_t>
-SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter(
- const std::shared_ptr<::arrow::Buffer>& crypto_metadata_buffer,
- // both metadata & crypto metadata length
- const uint32_t footer_len) {
- // encryption with encrypted footer
- // Check if the footer_buffer contains the entire metadata
- if (crypto_metadata_buffer->size() != footer_len) {
- throw ParquetException("Failed reading encrypted metadata buffer (requested " +
- std::to_string(footer_len) + " bytes but got " +
- std::to_string(crypto_metadata_buffer->size()) + " bytes)");
- }
- auto file_decryption_properties = properties_.file_decryption_properties().get();
- if (file_decryption_properties == nullptr) {
- throw ParquetException(
- "Could not read encrypted metadata, no decryption found in reader's properties");
- }
- uint32_t crypto_metadata_len = footer_len;
- std::shared_ptr<FileCryptoMetaData> file_crypto_metadata =
- FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len);
- // Handle AAD prefix
- EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm();
- std::string file_aad = HandleAadPrefix(file_decryption_properties, algo);
- file_decryptor_ = std::make_shared<InternalFileDecryptor>(
- file_decryption_properties, file_aad, algo.algorithm,
- file_crypto_metadata->key_metadata(), properties_.memory_pool());
-
- int64_t metadata_offset = source_size_ - kFooterSize - footer_len + crypto_metadata_len;
- uint32_t metadata_len = footer_len - crypto_metadata_len;
- return std::make_pair(metadata_offset, metadata_len);
-}
-
-void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter(
- FileDecryptionProperties* file_decryption_properties,
- const std::shared_ptr<Buffer>& metadata_buffer, uint32_t metadata_len,
- uint32_t read_metadata_len) {
-  // Providing decryption properties in plaintext footer mode is not mandatory,
-  // for example when reading with a legacy reader.
- if (file_decryption_properties != nullptr) {
- EncryptionAlgorithm algo = file_metadata_->encryption_algorithm();
- // Handle AAD prefix
- std::string file_aad = HandleAadPrefix(file_decryption_properties, algo);
- file_decryptor_ = std::make_shared<InternalFileDecryptor>(
- file_decryption_properties, file_aad, algo.algorithm,
- file_metadata_->footer_signing_key_metadata(), properties_.memory_pool());
- // set the InternalFileDecryptor in the metadata as well, as it's used
- // for signature verification and for ColumnChunkMetaData creation.
- file_metadata_->set_file_decryptor(file_decryptor_);
-
- if (file_decryption_properties->check_plaintext_footer_integrity()) {
- if (metadata_len - read_metadata_len !=
- (parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength)) {
- throw ParquetInvalidOrCorruptedFileException(
- "Failed reading metadata for encryption signature (requested ",
- parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength,
- " bytes but have ", metadata_len - read_metadata_len, " bytes)");
- }
-
- if (!file_metadata_->VerifySignature(metadata_buffer->data() + read_metadata_len)) {
- throw ParquetInvalidOrCorruptedFileException(
- "Parquet crypto signature verification failed");
- }
- }
- }
-}
-
-std::string SerializedFile::HandleAadPrefix(
- FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo) {
- std::string aad_prefix_in_properties = file_decryption_properties->aad_prefix();
- std::string aad_prefix = aad_prefix_in_properties;
-  bool file_has_aad_prefix = !algo.aad.aad_prefix.empty();
- std::string aad_prefix_in_file = algo.aad.aad_prefix;
-
- if (algo.aad.supply_aad_prefix && aad_prefix_in_properties.empty()) {
- throw ParquetException(
- "AAD prefix used for file encryption, "
- "but not stored in file and not supplied "
- "in decryption properties");
- }
-
- if (file_has_aad_prefix) {
- if (!aad_prefix_in_properties.empty()) {
- if (aad_prefix_in_properties.compare(aad_prefix_in_file) != 0) {
- throw ParquetException(
- "AAD Prefix in file and in properties "
- "is not the same");
- }
- }
- aad_prefix = aad_prefix_in_file;
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier =
- file_decryption_properties->aad_prefix_verifier();
- if (aad_prefix_verifier != nullptr) aad_prefix_verifier->Verify(aad_prefix);
- } else {
- if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properties.empty()) {
- throw ParquetException(
- "AAD Prefix set in decryption properties, but was not used "
- "for file encryption");
- }
- std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier =
- file_decryption_properties->aad_prefix_verifier();
- if (aad_prefix_verifier != nullptr) {
- throw ParquetException(
- "AAD Prefix Verifier is set, but AAD Prefix not found in file");
- }
- }
- return aad_prefix + algo.aad.aad_file_unique;
-}
-
-// ----------------------------------------------------------------------
-// ParquetFileReader public API
-
-ParquetFileReader::ParquetFileReader() {}
-
-ParquetFileReader::~ParquetFileReader() {
- try {
- Close();
- } catch (...) {
- }
-}
-
-// Open the file. If no metadata is passed, it is parsed from the footer of
-// the file
-std::unique_ptr<ParquetFileReader::Contents> ParquetFileReader::Contents::Open(
- std::shared_ptr<ArrowInputFile> source, const ReaderProperties& props,
- std::shared_ptr<FileMetaData> metadata) {
- std::unique_ptr<ParquetFileReader::Contents> result(
- new SerializedFile(std::move(source), props));
-
-  // Access private methods that are otherwise unavailable.
- SerializedFile* file = static_cast<SerializedFile*>(result.get());
-
- if (metadata == nullptr) {
- // Validates magic bytes, parses metadata, and initializes the SchemaDescriptor
- file->ParseMetaData();
- } else {
- file->set_metadata(std::move(metadata));
- }
-
- return result;
-}
-
-::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>
-ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
- const ReaderProperties& props,
- std::shared_ptr<FileMetaData> metadata) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- std::unique_ptr<ParquetFileReader::Contents> result(
- new SerializedFile(std::move(source), props));
- SerializedFile* file = static_cast<SerializedFile*>(result.get());
- if (metadata == nullptr) {
- // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
- struct {
- ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
- return std::move(result);
- }
-
- std::unique_ptr<ParquetFileReader::Contents> result;
- } Continuation;
- Continuation.result = std::move(result);
- return file->ParseMetaDataAsync().Then(std::move(Continuation));
- } else {
- file->set_metadata(std::move(metadata));
- return ::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
- std::move(result));
- }
- END_PARQUET_CATCH_EXCEPTIONS
-}
-
-std::unique_ptr<ParquetFileReader> ParquetFileReader::Open(
- std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
- std::shared_ptr<FileMetaData> metadata) {
- auto contents = SerializedFile::Open(std::move(source), props, std::move(metadata));
- std::unique_ptr<ParquetFileReader> result(new ParquetFileReader());
- result->Open(std::move(contents));
- return result;
-}
-
-std::unique_ptr<ParquetFileReader> ParquetFileReader::OpenFile(
- const std::string& path, bool memory_map, const ReaderProperties& props,
- std::shared_ptr<FileMetaData> metadata) {
- std::shared_ptr<::arrow::io::RandomAccessFile> source;
- if (memory_map) {
- PARQUET_ASSIGN_OR_THROW(
- source, ::arrow::io::MemoryMappedFile::Open(path, ::arrow::io::FileMode::READ));
- } else {
- PARQUET_ASSIGN_OR_THROW(source,
- ::arrow::io::ReadableFile::Open(path, props.memory_pool()));
- }
-
- return Open(std::move(source), props, std::move(metadata));
-}
-
-::arrow::Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
- std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
- std::shared_ptr<FileMetaData> metadata) {
- BEGIN_PARQUET_CATCH_EXCEPTIONS
- auto fut = SerializedFile::OpenAsync(std::move(source), props, std::move(metadata));
- // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
- auto completed = ::arrow::Future<std::unique_ptr<ParquetFileReader>>::Make();
- fut.AddCallback([fut, completed](
- const ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>>&
- contents) mutable {
- if (!contents.ok()) {
- completed.MarkFinished(contents.status());
- return;
- }
- std::unique_ptr<ParquetFileReader> result(new ParquetFileReader());
- result->Open(fut.MoveResult().MoveValueUnsafe());
- completed.MarkFinished(std::move(result));
- });
- return completed;
- END_PARQUET_CATCH_EXCEPTIONS
-}
-
-void ParquetFileReader::Open(std::unique_ptr<ParquetFileReader::Contents> contents) {
- contents_ = std::move(contents);
-}
-
-void ParquetFileReader::Close() {
- if (contents_) {
- contents_->Close();
- }
-}
-
-std::shared_ptr<FileMetaData> ParquetFileReader::metadata() const {
- return contents_->metadata();
-}
-
-std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) {
- if (i >= metadata()->num_row_groups()) {
- std::stringstream ss;
- ss << "Trying to read row group " << i << " but file only has "
- << metadata()->num_row_groups() << " row groups";
- throw ParquetException(ss.str());
- }
- return contents_->GetRowGroup(i);
-}
-
-void ParquetFileReader::PreBuffer(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices,
- const ::arrow::io::IOContext& ctx,
- const ::arrow::io::CacheOptions& options) {
- // Access private methods here
- SerializedFile* file =
- ::arrow::internal::checked_cast<SerializedFile*>(contents_.get());
- file->PreBuffer(row_groups, column_indices, ctx, options);
-}
-
-::arrow::Future<> ParquetFileReader::WhenBuffered(
- const std::vector<int>& row_groups, const std::vector<int>& column_indices) const {
- // Access private methods here
- SerializedFile* file =
- ::arrow::internal::checked_cast<SerializedFile*>(contents_.get());
- return file->WhenBuffered(row_groups, column_indices);
-}
-
-// ----------------------------------------------------------------------
-// File metadata helpers
-
-std::shared_ptr<FileMetaData> ReadMetaData(
- const std::shared_ptr<::arrow::io::RandomAccessFile>& source) {
- return ParquetFileReader::Open(source)->metadata();
-}
-
-// ----------------------------------------------------------------------
-// File scanner for performance testing
-
-int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
- ParquetFileReader* reader) {
- std::vector<int16_t> rep_levels(column_batch_size);
- std::vector<int16_t> def_levels(column_batch_size);
-
- int num_columns = static_cast<int>(columns.size());
-
-  // If columns are not specified explicitly, add all columns.
-  if (columns.empty()) {
- num_columns = reader->metadata()->num_columns();
- columns.resize(num_columns);
- for (int i = 0; i < num_columns; i++) {
- columns[i] = i;
- }
- }
-
- std::vector<int64_t> total_rows(num_columns, 0);
-
- for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
- auto group_reader = reader->RowGroup(r);
- int col = 0;
- for (auto i : columns) {
- std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
- size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
- std::vector<uint8_t> values(column_batch_size * value_byte_size);
-
- int64_t values_read = 0;
- while (col_reader->HasNext()) {
- int64_t levels_read =
- ScanAllValues(column_batch_size, def_levels.data(), rep_levels.data(),
- values.data(), &values_read, col_reader.get());
- if (col_reader->descr()->max_repetition_level() > 0) {
- for (int64_t i = 0; i < levels_read; i++) {
- if (rep_levels[i] == 0) {
- total_rows[col]++;
- }
- }
- } else {
- total_rows[col] += levels_read;
- }
- }
- col++;
- }
- }
-
- for (int i = 1; i < num_columns; ++i) {
- if (total_rows[0] != total_rows[i]) {
- throw ParquetException("Parquet error: Total rows among columns do not match");
- }
- }
-
- return total_rows[0];
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/file_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <utility>
+
+#include "arrow/io/caching.h"
+#include "arrow/io/file.h"
+#include "arrow/io/memory.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/future.h"
+#include "arrow/util/int_util_internal.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ubsan.h"
+#include "parquet/column_reader.h"
+#include "parquet/column_scanner.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/exception.h"
+#include "parquet/file_writer.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+using arrow::internal::AddWithOverflow;
+
+namespace parquet {
+
+// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
+static constexpr int64_t kDefaultFooterReadSize = 64 * 1024;
+static constexpr uint32_t kFooterSize = 8;
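+// (The file tail is laid out as [FileMetaData (Thrift)] [4-byte little-endian
+// metadata length] [4-byte magic "PAR1", or "PARE" for an encrypted footer],
+// so kFooterSize covers the trailing length word plus the magic.)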
+
+// For PARQUET-816
+static constexpr int64_t kMaxDictHeaderSize = 100;
+
+// ----------------------------------------------------------------------
+// RowGroupReader public API
+
+RowGroupReader::RowGroupReader(std::unique_ptr<Contents> contents)
+ : contents_(std::move(contents)) {}
+
+std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
+ if (i >= metadata()->num_columns()) {
+ std::stringstream ss;
+ ss << "Trying to read column index " << i << " but row group metadata has only "
+ << metadata()->num_columns() << " columns";
+ throw ParquetException(ss.str());
+ }
+ const ColumnDescriptor* descr = metadata()->schema()->Column(i);
+
+ std::unique_ptr<PageReader> page_reader = contents_->GetColumnPageReader(i);
+ return ColumnReader::Make(
+ descr, std::move(page_reader),
+ const_cast<ReaderProperties*>(contents_->properties())->memory_pool());
+}
+
+std::shared_ptr<ColumnReader> RowGroupReader::ColumnWithExposeEncoding(
+ int i, ExposedEncoding encoding_to_expose) {
+ std::shared_ptr<ColumnReader> reader = Column(i);
+
+ if (encoding_to_expose == ExposedEncoding::DICTIONARY) {
+ // Check the encoding_stats to see if all data pages are dictionary encoded.
+ std::unique_ptr<ColumnChunkMetaData> col = metadata()->ColumnChunk(i);
+ const std::vector<PageEncodingStats>& encoding_stats = col->encoding_stats();
+ if (encoding_stats.empty()) {
+ // Some Parquet files have empty encoding_stats; in that case we cannot
+ // tell whether all data pages are dictionary encoded, so we do not
+ // enable exposing the dictionary.
+ return reader;
+ }
+ // The 1st page should be the dictionary page.
+ if (encoding_stats[0].page_type != PageType::DICTIONARY_PAGE ||
+ (encoding_stats[0].encoding != Encoding::PLAIN &&
+ encoding_stats[0].encoding != Encoding::PLAIN_DICTIONARY)) {
+ return reader;
+ }
+ // The following pages should be dictionary encoded data pages.
+ for (size_t idx = 1; idx < encoding_stats.size(); ++idx) {
+ if ((encoding_stats[idx].encoding != Encoding::RLE_DICTIONARY &&
+ encoding_stats[idx].encoding != Encoding::PLAIN_DICTIONARY) ||
+ (encoding_stats[idx].page_type != PageType::DATA_PAGE &&
+ encoding_stats[idx].page_type != PageType::DATA_PAGE_V2)) {
+ return reader;
+ }
+ }
+ } else {
+ // Exposing other encodings is not supported for now.
+ return reader;
+ }
+
+ // Set exposed encoding.
+ reader->SetExposedEncoding(encoding_to_expose);
+ return reader;
+}
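+
+// Example (hypothetical usage): callers should check which encoding was
+// actually exposed before deciding how to read batches, e.g.:
+//
+// auto col = row_group->ColumnWithExposeEncoding(0, ExposedEncoding::DICTIONARY);
+// if (col->GetExposedEncoding() == ExposedEncoding::DICTIONARY) {
+// // read dictionary indices and the dictionary itself
+// }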
+
+std::unique_ptr<PageReader> RowGroupReader::GetColumnPageReader(int i) {
+ if (i >= metadata()->num_columns()) {
+ std::stringstream ss;
+ ss << "Trying to read column index " << i << " but row group metadata has only "
+ << metadata()->num_columns() << " columns";
+ throw ParquetException(ss.str());
+ }
+ return contents_->GetColumnPageReader(i);
+}
+
+// Returns the rowgroup metadata
+const RowGroupMetaData* RowGroupReader::metadata() const { return contents_->metadata(); }
+
+/// Compute the section of the file that should be read for the given
+/// row group and column chunk.
+::arrow::io::ReadRange ComputeColumnChunkRange(FileMetaData* file_metadata,
+ int64_t source_size, int row_group_index,
+ int column_index) {
+ auto row_group_metadata = file_metadata->RowGroup(row_group_index);
+ auto column_metadata = row_group_metadata->ColumnChunk(column_index);
+
+ int64_t col_start = column_metadata->data_page_offset();
+ if (column_metadata->has_dictionary_page() &&
+ column_metadata->dictionary_page_offset() > 0 &&
+ col_start > column_metadata->dictionary_page_offset()) {
+ col_start = column_metadata->dictionary_page_offset();
+ }
+
+ int64_t col_length = column_metadata->total_compressed_size();
+ int64_t col_end;
+ if (AddWithOverflow(col_start, col_length, &col_end) || col_end > source_size) {
+ throw ParquetException("Invalid column metadata (corrupt file?)");
+ }
+
+ // PARQUET-816 workaround for old files created by older parquet-mr
+ const ApplicationVersion& version = file_metadata->writer_version();
+ if (version.VersionLt(ApplicationVersion::PARQUET_816_FIXED_VERSION())) {
+ // The Parquet MR writer had a bug in 1.2.8 and below where it didn't include the
+ // dictionary page header size in total_compressed_size and total_uncompressed_size
+ // (see IMPALA-694). We add padding to compensate.
+ int64_t bytes_remaining = source_size - col_end;
+ int64_t padding = std::min<int64_t>(kMaxDictHeaderSize, bytes_remaining);
+ col_length += padding;
+ }
+
+ return {col_start, col_length};
+}
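+
+// For example, a chunk with dictionary_page_offset 4, data_page_offset 100 and
+// total_compressed_size 900 yields the range {4, 900}; for files written by
+// parquet-mr 1.2.8 or older, up to kMaxDictHeaderSize extra bytes are included.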
+
+// RowGroupReader::Contents implementation for the Parquet file specification
+class SerializedRowGroup : public RowGroupReader::Contents {
+ public:
+ SerializedRowGroup(std::shared_ptr<ArrowInputFile> source,
+ std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source,
+ int64_t source_size, FileMetaData* file_metadata,
+ int row_group_number, const ReaderProperties& props,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = nullptr)
+ : source_(std::move(source)),
+ cached_source_(std::move(cached_source)),
+ source_size_(source_size),
+ file_metadata_(file_metadata),
+ properties_(props),
+ row_group_ordinal_(row_group_number),
+ file_decryptor_(file_decryptor) {
+ row_group_metadata_ = file_metadata->RowGroup(row_group_number);
+ }
+
+ const RowGroupMetaData* metadata() const override { return row_group_metadata_.get(); }
+
+ const ReaderProperties* properties() const override { return &properties_; }
+
+ std::unique_ptr<PageReader> GetColumnPageReader(int i) override {
+ // Read column chunk from the file
+ auto col = row_group_metadata_->ColumnChunk(i);
+
+ ::arrow::io::ReadRange col_range =
+ ComputeColumnChunkRange(file_metadata_, source_size_, row_group_ordinal_, i);
+ std::shared_ptr<ArrowInputStream> stream;
+ if (cached_source_) {
+ // PARQUET-1698: if read coalescing is enabled, read from pre-buffered
+ // segments.
+ PARQUET_ASSIGN_OR_THROW(auto buffer, cached_source_->Read(col_range));
+ stream = std::make_shared<::arrow::io::BufferReader>(buffer);
+ } else {
+ stream = properties_.GetStream(source_, col_range.offset, col_range.length);
+ }
+
+ std::unique_ptr<ColumnCryptoMetaData> crypto_metadata = col->crypto_metadata();
+
+ // Column is encrypted only if crypto_metadata exists.
+ if (!crypto_metadata) {
+ return PageReader::Open(stream, col->num_values(), col->compression(),
+ properties_.memory_pool());
+ }
+
+ if (file_decryptor_ == nullptr) {
+ throw ParquetException("RowGroup is noted as encrypted but no file decryptor");
+ }
+
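+ // Note: this bound also guards the static_cast<int16_t>(i) ordinal casts
+ // below; encrypted files keep ordinals within int16 range.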
+ constexpr auto kEncryptedRowGroupsLimit = 32767;
+ if (i > kEncryptedRowGroupsLimit) {
+ throw ParquetException("Encrypted files cannot contain more than 32767 row groups");
+ }
+
+ // The column is encrypted
+ std::shared_ptr<Decryptor> meta_decryptor;
+ std::shared_ptr<Decryptor> data_decryptor;
+ // The column is encrypted with footer key
+ if (crypto_metadata->encrypted_with_footer_key()) {
+ meta_decryptor = file_decryptor_->GetFooterDecryptorForColumnMeta();
+ data_decryptor = file_decryptor_->GetFooterDecryptorForColumnData();
+ CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
+ static_cast<int16_t>(i), meta_decryptor, data_decryptor);
+ return PageReader::Open(stream, col->num_values(), col->compression(),
+ properties_.memory_pool(), &ctx);
+ }
+
+ // The column is encrypted with its own key
+ std::string column_key_metadata = crypto_metadata->key_metadata();
+ const std::string column_path = crypto_metadata->path_in_schema()->ToDotString();
+
+ meta_decryptor =
+ file_decryptor_->GetColumnMetaDecryptor(column_path, column_key_metadata);
+ data_decryptor =
+ file_decryptor_->GetColumnDataDecryptor(column_path, column_key_metadata);
+
+ CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
+ static_cast<int16_t>(i), meta_decryptor, data_decryptor);
+ return PageReader::Open(stream, col->num_values(), col->compression(),
+ properties_.memory_pool(), &ctx);
+ }
+
+ private:
+ std::shared_ptr<ArrowInputFile> source_;
+ // Will be nullptr if PreBuffer() is not called.
+ std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source_;
+ int64_t source_size_;
+ FileMetaData* file_metadata_;
+ std::unique_ptr<RowGroupMetaData> row_group_metadata_;
+ ReaderProperties properties_;
+ int row_group_ordinal_;
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
+};
+
+// ----------------------------------------------------------------------
+// SerializedFile: An implementation of ParquetFileReader::Contents that deals
+// with the Parquet file structure, Thrift deserialization, and other internal
+// matters
+
+// This class takes ownership of the provided data source
+class SerializedFile : public ParquetFileReader::Contents {
+ public:
+ SerializedFile(std::shared_ptr<ArrowInputFile> source,
+ const ReaderProperties& props = default_reader_properties())
+ : source_(std::move(source)), properties_(props) {
+ PARQUET_ASSIGN_OR_THROW(source_size_, source_->GetSize());
+ }
+
+ ~SerializedFile() override {
+ try {
+ Close();
+ } catch (...) {
+ }
+ }
+
+ void Close() override {
+ if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys();
+ }
+
+ std::shared_ptr<RowGroupReader> GetRowGroup(int i) override {
+ std::unique_ptr<SerializedRowGroup> contents(
+ new SerializedRowGroup(source_, cached_source_, source_size_,
+ file_metadata_.get(), i, properties_, file_decryptor_));
+ return std::make_shared<RowGroupReader>(std::move(contents));
+ }
+
+ std::shared_ptr<FileMetaData> metadata() const override { return file_metadata_; }
+
+ void set_metadata(std::shared_ptr<FileMetaData> metadata) {
+ file_metadata_ = std::move(metadata);
+ }
+
+ void PreBuffer(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ const ::arrow::io::IOContext& ctx,
+ const ::arrow::io::CacheOptions& options) {
+ cached_source_ =
+ std::make_shared<::arrow::io::internal::ReadRangeCache>(source_, ctx, options);
+ std::vector<::arrow::io::ReadRange> ranges;
+ for (int row : row_groups) {
+ for (int col : column_indices) {
+ ranges.push_back(
+ ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col));
+ }
+ }
+ PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges));
+ }
+
+ ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices) const {
+ if (!cached_source_) {
+ return ::arrow::Status::Invalid("Must call PreBuffer before WhenBuffered");
+ }
+ std::vector<::arrow::io::ReadRange> ranges;
+ for (int row : row_groups) {
+ for (int col : column_indices) {
+ ranges.push_back(
+ ComputeColumnChunkRange(file_metadata_.get(), source_size_, row, col));
+ }
+ }
+ return cached_source_->WaitFor(ranges);
+ }
+
+ // Metadata/footer parsing. Divided up to separate sync/async paths, and to use
+ // exceptions for error handling (with the async path converting to Future/Status).
+
+ void ParseMetaData() {
+ int64_t footer_read_size = GetFooterReadSize();
+ PARQUET_ASSIGN_OR_THROW(
+ auto footer_buffer,
+ source_->ReadAt(source_size_ - footer_read_size, footer_read_size));
+ uint32_t metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
+ int64_t metadata_start = source_size_ - kFooterSize - metadata_len;
+
+ std::shared_ptr<::arrow::Buffer> metadata_buffer;
+ if (footer_read_size >= (metadata_len + kFooterSize)) {
+ metadata_buffer = SliceBuffer(
+ footer_buffer, footer_read_size - metadata_len - kFooterSize, metadata_len);
+ } else {
+ PARQUET_ASSIGN_OR_THROW(metadata_buffer,
+ source_->ReadAt(metadata_start, metadata_len));
+ }
+
+ // Parse the footer depending on encryption type
+ const bool is_encrypted_footer =
+ memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0;
+ if (is_encrypted_footer) {
+ // Encrypted file with encrypted footer.
+ const std::pair<int64_t, uint32_t> read_size =
+ ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len);
+ // Read the actual footer
+ metadata_start = read_size.first;
+ metadata_len = read_size.second;
+ PARQUET_ASSIGN_OR_THROW(metadata_buffer,
+ source_->ReadAt(metadata_start, metadata_len));
+ // Fall through
+ }
+
+ const uint32_t read_metadata_len =
+ ParseUnencryptedFileMetadata(metadata_buffer, metadata_len);
+ auto file_decryption_properties = properties_.file_decryption_properties().get();
+ if (is_encrypted_footer) {
+ // Nothing else to do here.
+ return;
+ } else if (!file_metadata_->is_encryption_algorithm_set()) { // Non-encrypted file.
+ if (file_decryption_properties != nullptr) {
+ if (!file_decryption_properties->plaintext_files_allowed()) {
+ throw ParquetException("Applying decryption properties on plaintext file");
+ }
+ }
+ } else {
+ // Encrypted file with plaintext footer mode.
+ ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+ file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
+ }
+ }
+
+ // Validate the source size and get the initial read size.
+ int64_t GetFooterReadSize() {
+ if (source_size_ == 0) {
+ throw ParquetInvalidOrCorruptedFileException("Parquet file size is 0 bytes");
+ } else if (source_size_ < kFooterSize) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Parquet file size is ", source_size_,
+ " bytes, smaller than the minimum file footer (", kFooterSize, " bytes)");
+ }
+ return std::min(source_size_, kDefaultFooterReadSize);
+ }
+
+ // Validate the magic bytes and get the length of the full footer.
+ uint32_t ParseFooterLength(const std::shared_ptr<::arrow::Buffer>& footer_buffer,
+ const int64_t footer_read_size) {
+ // Check that all requested bytes were read and that the last 4 bytes are the magic
+ if (footer_buffer->size() != footer_read_size ||
+ (memcmp(footer_buffer->data() + footer_read_size - 4, kParquetMagic, 4) != 0 &&
+ memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) != 0)) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Parquet magic bytes not found in footer. Either the file is corrupted or this "
+ "is not a parquet file.");
+ }
+ // Both encrypted/unencrypted footers have the same footer length check.
+ uint32_t metadata_len = ::arrow::util::SafeLoadAs<uint32_t>(
+ reinterpret_cast<const uint8_t*>(footer_buffer->data()) + footer_read_size -
+ kFooterSize);
+ if (metadata_len > source_size_ - kFooterSize) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Parquet file size is ", source_size_,
+ " bytes, smaller than the size reported by footer's (", metadata_len, "bytes)");
+ }
+ return metadata_len;
+ }
+
+ // Does not throw.
+ ::arrow::Future<> ParseMetaDataAsync() {
+ int64_t footer_read_size;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ footer_read_size = GetFooterReadSize();
+ END_PARQUET_CATCH_EXCEPTIONS
+ // Assumes this is kept alive externally
+ return source_->ReadAsync(source_size_ - footer_read_size, footer_read_size)
+ .Then([=](const std::shared_ptr<::arrow::Buffer>& footer_buffer)
+ -> ::arrow::Future<> {
+ uint32_t metadata_len;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
+ END_PARQUET_CATCH_EXCEPTIONS
+ int64_t metadata_start = source_size_ - kFooterSize - metadata_len;
+
+ std::shared_ptr<::arrow::Buffer> metadata_buffer;
+ if (footer_read_size >= (metadata_len + kFooterSize)) {
+ metadata_buffer =
+ SliceBuffer(footer_buffer, footer_read_size - metadata_len - kFooterSize,
+ metadata_len);
+ return ParseMaybeEncryptedMetaDataAsync(footer_buffer,
+ std::move(metadata_buffer),
+ footer_read_size, metadata_len);
+ }
+ return source_->ReadAsync(metadata_start, metadata_len)
+ .Then([=](const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
+ return ParseMaybeEncryptedMetaDataAsync(footer_buffer, metadata_buffer,
+ footer_read_size, metadata_len);
+ });
+ });
+ }
+
+ // Continuation
+ ::arrow::Future<> ParseMaybeEncryptedMetaDataAsync(
+ std::shared_ptr<::arrow::Buffer> footer_buffer,
+ std::shared_ptr<::arrow::Buffer> metadata_buffer, int64_t footer_read_size,
+ uint32_t metadata_len) {
+ // Parse the footer depending on encryption type
+ const bool is_encrypted_footer =
+ memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0;
+ if (is_encrypted_footer) {
+ // Encrypted file with encrypted footer.
+ std::pair<int64_t, uint32_t> read_size;
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ read_size =
+ ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len);
+ END_PARQUET_CATCH_EXCEPTIONS
+ // Read the actual footer
+ int64_t metadata_start = read_size.first;
+ metadata_len = read_size.second;
+ return source_->ReadAsync(metadata_start, metadata_len)
+ .Then([=](const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
+ // Continue and read the file footer
+ return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer);
+ });
+ }
+ return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len,
+ is_encrypted_footer);
+ }
+
+ // Continuation
+ ::arrow::Status ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer,
+ uint32_t metadata_len,
+ const bool is_encrypted_footer) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ const uint32_t read_metadata_len =
+ ParseUnencryptedFileMetadata(metadata_buffer, metadata_len);
+ auto file_decryption_properties = properties_.file_decryption_properties().get();
+ if (is_encrypted_footer) {
+ // Nothing else to do here.
+ return ::arrow::Status::OK();
+ } else if (!file_metadata_->is_encryption_algorithm_set()) { // Non-encrypted file.
+ if (file_decryption_properties != nullptr) {
+ if (!file_decryption_properties->plaintext_files_allowed()) {
+ throw ParquetException("Applying decryption properties on plaintext file");
+ }
+ }
+ } else {
+ // Encrypted file with plaintext footer mode.
+ ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+ file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
+ }
+ END_PARQUET_CATCH_EXCEPTIONS
+ return ::arrow::Status::OK();
+ }
+
+ private:
+ std::shared_ptr<ArrowInputFile> source_;
+ std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source_;
+ int64_t source_size_;
+ std::shared_ptr<FileMetaData> file_metadata_;
+ ReaderProperties properties_;
+
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
+
+ // \return The true length of the metadata in bytes
+ uint32_t ParseUnencryptedFileMetadata(const std::shared_ptr<Buffer>& footer_buffer,
+ const uint32_t metadata_len);
+
+ std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties,
+ EncryptionAlgorithm& algo);
+
+ void ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+ FileDecryptionProperties* file_decryption_properties,
+ const std::shared_ptr<Buffer>& metadata_buffer, uint32_t metadata_len,
+ uint32_t read_metadata_len);
+
+ // \return The position and size of the actual footer
+ std::pair<int64_t, uint32_t> ParseMetaDataOfEncryptedFileWithEncryptedFooter(
+ const std::shared_ptr<Buffer>& crypto_metadata_buffer, uint32_t footer_len);
+};
+
+uint32_t SerializedFile::ParseUnencryptedFileMetadata(
+ const std::shared_ptr<Buffer>& metadata_buffer, const uint32_t metadata_len) {
+ if (metadata_buffer->size() != metadata_len) {
+ throw ParquetException("Failed reading metadata buffer (requested " +
+ std::to_string(metadata_len) + " bytes but got " +
+ std::to_string(metadata_buffer->size()) + " bytes)");
+ }
+ uint32_t read_metadata_len = metadata_len;
+ // The encrypted read path falls through to here, so pass in the decryptor
+ file_metadata_ =
+ FileMetaData::Make(metadata_buffer->data(), &read_metadata_len, file_decryptor_);
+ return read_metadata_len;
+}
+
+std::pair<int64_t, uint32_t>
+SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter(
+ const std::shared_ptr<::arrow::Buffer>& crypto_metadata_buffer,
+ // both metadata & crypto metadata length
+ const uint32_t footer_len) {
+ // encryption with encrypted footer
+ // Check if the footer_buffer contains the entire metadata
+ if (crypto_metadata_buffer->size() != footer_len) {
+ throw ParquetException("Failed reading encrypted metadata buffer (requested " +
+ std::to_string(footer_len) + " bytes but got " +
+ std::to_string(crypto_metadata_buffer->size()) + " bytes)");
+ }
+ auto file_decryption_properties = properties_.file_decryption_properties().get();
+ if (file_decryption_properties == nullptr) {
+ throw ParquetException(
+ "Could not read encrypted metadata, no decryption found in reader's properties");
+ }
+ uint32_t crypto_metadata_len = footer_len;
+ std::shared_ptr<FileCryptoMetaData> file_crypto_metadata =
+ FileCryptoMetaData::Make(crypto_metadata_buffer->data(), &crypto_metadata_len);
+ // Handle AAD prefix
+ EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm();
+ std::string file_aad = HandleAadPrefix(file_decryption_properties, algo);
+ file_decryptor_ = std::make_shared<InternalFileDecryptor>(
+ file_decryption_properties, file_aad, algo.algorithm,
+ file_crypto_metadata->key_metadata(), properties_.memory_pool());
+
+ int64_t metadata_offset = source_size_ - kFooterSize - footer_len + crypto_metadata_len;
+ uint32_t metadata_len = footer_len - crypto_metadata_len;
+ return std::make_pair(metadata_offset, metadata_len);
+}
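+
+// With an encrypted footer, the last footer_len bytes before the 8-byte tail
+// hold the FileCryptoMetaData followed by the encrypted FileMetaData; the
+// offset arithmetic above recovers the location of the latter.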
+
+void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter(
+ FileDecryptionProperties* file_decryption_properties,
+ const std::shared_ptr<Buffer>& metadata_buffer, uint32_t metadata_len,
+ uint32_t read_metadata_len) {
+ // Providing decryption properties in plaintext footer mode is not mandatory,
+ // for example when the file is read by a legacy reader.
+ if (file_decryption_properties != nullptr) {
+ EncryptionAlgorithm algo = file_metadata_->encryption_algorithm();
+ // Handle AAD prefix
+ std::string file_aad = HandleAadPrefix(file_decryption_properties, algo);
+ file_decryptor_ = std::make_shared<InternalFileDecryptor>(
+ file_decryption_properties, file_aad, algo.algorithm,
+ file_metadata_->footer_signing_key_metadata(), properties_.memory_pool());
+ // set the InternalFileDecryptor in the metadata as well, as it's used
+ // for signature verification and for ColumnChunkMetaData creation.
+ file_metadata_->set_file_decryptor(file_decryptor_);
+
+ if (file_decryption_properties->check_plaintext_footer_integrity()) {
+ if (metadata_len - read_metadata_len !=
+ (parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength)) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Failed reading metadata for encryption signature (requested ",
+ parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength,
+ " bytes but have ", metadata_len - read_metadata_len, " bytes)");
+ }
+
+ if (!file_metadata_->VerifySignature(metadata_buffer->data() + read_metadata_len)) {
+ throw ParquetInvalidOrCorruptedFileException(
+ "Parquet crypto signature verification failed");
+ }
+ }
+ }
+}
+
+std::string SerializedFile::HandleAadPrefix(
+ FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo) {
+ std::string aad_prefix_in_properties = file_decryption_properties->aad_prefix();
+ std::string aad_prefix = aad_prefix_in_properties;
+ bool file_has_aad_prefix = !algo.aad.aad_prefix.empty();
+ std::string aad_prefix_in_file = algo.aad.aad_prefix;
+
+ if (algo.aad.supply_aad_prefix && aad_prefix_in_properties.empty()) {
+ throw ParquetException(
+ "AAD prefix used for file encryption, "
+ "but not stored in file and not supplied "
+ "in decryption properties");
+ }
+
+ if (file_has_aad_prefix) {
+ if (!aad_prefix_in_properties.empty()) {
+ if (aad_prefix_in_properties != aad_prefix_in_file) {
+ throw ParquetException(
+ "AAD Prefix in file and in properties "
+ "is not the same");
+ }
+ }
+ aad_prefix = aad_prefix_in_file;
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier =
+ file_decryption_properties->aad_prefix_verifier();
+ if (aad_prefix_verifier != nullptr) aad_prefix_verifier->Verify(aad_prefix);
+ } else {
+ if (!algo.aad.supply_aad_prefix && !aad_prefix_in_properties.empty()) {
+ throw ParquetException(
+ "AAD Prefix set in decryption properties, but was not used "
+ "for file encryption");
+ }
+ std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier =
+ file_decryption_properties->aad_prefix_verifier();
+ if (aad_prefix_verifier != nullptr) {
+ throw ParquetException(
+ "AAD Prefix Verifier is set, but AAD Prefix not found in file");
+ }
+ }
+ return aad_prefix + algo.aad.aad_file_unique;
+}
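+
+// Summary of the cases above: an AAD prefix stored in the file wins and must
+// match any prefix supplied in the properties; if the file stores none, the
+// prefix must come from the properties when supply_aad_prefix is set, while a
+// supplied-but-unused prefix or a verifier with nothing to verify is an error.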
+
+// ----------------------------------------------------------------------
+// ParquetFileReader public API
+
+ParquetFileReader::ParquetFileReader() {}
+
+ParquetFileReader::~ParquetFileReader() {
+ try {
+ Close();
+ } catch (...) {
+ }
+}
+
+// Open the file. If no metadata is passed, it is parsed from the footer of
+// the file
+std::unique_ptr<ParquetFileReader::Contents> ParquetFileReader::Contents::Open(
+ std::shared_ptr<ArrowInputFile> source, const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ std::unique_ptr<ParquetFileReader::Contents> result(
+ new SerializedFile(std::move(source), props));
+
+ // Access implementation-private methods that the Contents interface does not expose
+ SerializedFile* file = static_cast<SerializedFile*>(result.get());
+
+ if (metadata == nullptr) {
+ // Validates magic bytes, parses metadata, and initializes the SchemaDescriptor
+ file->ParseMetaData();
+ } else {
+ file->set_metadata(std::move(metadata));
+ }
+
+ return result;
+}
+
+::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>
+ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
+ const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ std::unique_ptr<ParquetFileReader::Contents> result(
+ new SerializedFile(std::move(source), props));
+ SerializedFile* file = static_cast<SerializedFile*>(result.get());
+ if (metadata == nullptr) {
+ // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
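+ // (A lambda would have to copy-capture the move-only unique_ptr, so a
+ // hand-rolled callable that owns it as a member is used instead.)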
+ struct {
+ ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
+ return std::move(result);
+ }
+
+ std::unique_ptr<ParquetFileReader::Contents> result;
+ } continuation;
+ continuation.result = std::move(result);
+ return file->ParseMetaDataAsync().Then(std::move(continuation));
+ } else {
+ file->set_metadata(std::move(metadata));
+ return ::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
+ std::move(result));
+ }
+ END_PARQUET_CATCH_EXCEPTIONS
+}
+
+std::unique_ptr<ParquetFileReader> ParquetFileReader::Open(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ auto contents = SerializedFile::Open(std::move(source), props, std::move(metadata));
+ std::unique_ptr<ParquetFileReader> result(new ParquetFileReader());
+ result->Open(std::move(contents));
+ return result;
+}
+
+std::unique_ptr<ParquetFileReader> ParquetFileReader::OpenFile(
+ const std::string& path, bool memory_map, const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ std::shared_ptr<::arrow::io::RandomAccessFile> source;
+ if (memory_map) {
+ PARQUET_ASSIGN_OR_THROW(
+ source, ::arrow::io::MemoryMappedFile::Open(path, ::arrow::io::FileMode::READ));
+ } else {
+ PARQUET_ASSIGN_OR_THROW(source,
+ ::arrow::io::ReadableFile::Open(path, props.memory_pool()));
+ }
+
+ return Open(std::move(source), props, std::move(metadata));
+}
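+
+// Example (hypothetical usage):
+//
+// auto reader = parquet::ParquetFileReader::OpenFile("/path/to/file.parquet",
+// /*memory_map=*/true);
+// std::shared_ptr<parquet::FileMetaData> md = reader->metadata();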
+
+::arrow::Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
+ std::shared_ptr<FileMetaData> metadata) {
+ BEGIN_PARQUET_CATCH_EXCEPTIONS
+ auto fut = SerializedFile::OpenAsync(std::move(source), props, std::move(metadata));
+ // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
+ auto completed = ::arrow::Future<std::unique_ptr<ParquetFileReader>>::Make();
+ fut.AddCallback([fut, completed](
+ const ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>>&
+ contents) mutable {
+ if (!contents.ok()) {
+ completed.MarkFinished(contents.status());
+ return;
+ }
+ std::unique_ptr<ParquetFileReader> result(new ParquetFileReader());
+ result->Open(fut.MoveResult().MoveValueUnsafe());
+ completed.MarkFinished(std::move(result));
+ });
+ return completed;
+ END_PARQUET_CATCH_EXCEPTIONS
+}
+
+void ParquetFileReader::Open(std::unique_ptr<ParquetFileReader::Contents> contents) {
+ contents_ = std::move(contents);
+}
+
+void ParquetFileReader::Close() {
+ if (contents_) {
+ contents_->Close();
+ }
+}
+
+std::shared_ptr<FileMetaData> ParquetFileReader::metadata() const {
+ return contents_->metadata();
+}
+
+std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) {
+ if (i >= metadata()->num_row_groups()) {
+ std::stringstream ss;
+ ss << "Trying to read row group " << i << " but file only has "
+ << metadata()->num_row_groups() << " row groups";
+ throw ParquetException(ss.str());
+ }
+ return contents_->GetRowGroup(i);
+}
+
+void ParquetFileReader::PreBuffer(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ const ::arrow::io::IOContext& ctx,
+ const ::arrow::io::CacheOptions& options) {
+ // Access private methods here
+ SerializedFile* file =
+ ::arrow::internal::checked_cast<SerializedFile*>(contents_.get());
+ file->PreBuffer(row_groups, column_indices, ctx, options);
+}
+
+::arrow::Future<> ParquetFileReader::WhenBuffered(
+ const std::vector<int>& row_groups, const std::vector<int>& column_indices) const {
+ // Access private methods here
+ SerializedFile* file =
+ ::arrow::internal::checked_cast<SerializedFile*>(contents_.get());
+ return file->WhenBuffered(row_groups, column_indices);
+}
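+
+// Example (hypothetical usage): pre-buffer row group 0, columns 0 and 1, then
+// block until the reads complete before creating column readers:
+//
+// reader->PreBuffer({0}, {0, 1}, ::arrow::io::IOContext(),
+// ::arrow::io::CacheOptions::Defaults());
+// reader->WhenBuffered({0}, {0, 1}).Wait();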
+
+// ----------------------------------------------------------------------
+// File metadata helpers
+
+std::shared_ptr<FileMetaData> ReadMetaData(
+ const std::shared_ptr<::arrow::io::RandomAccessFile>& source) {
+ return ParquetFileReader::Open(source)->metadata();
+}
+
+// ----------------------------------------------------------------------
+// File scanner for performance testing
+
+int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
+ ParquetFileReader* reader) {
+ std::vector<int16_t> rep_levels(column_batch_size);
+ std::vector<int16_t> def_levels(column_batch_size);
+
+ int num_columns = static_cast<int>(columns.size());
+
+ // If no columns were specified explicitly, scan all columns
+ if (columns.size() == 0) {
+ num_columns = reader->metadata()->num_columns();
+ columns.resize(num_columns);
+ for (int i = 0; i < num_columns; i++) {
+ columns[i] = i;
+ }
+ }
+
+ std::vector<int64_t> total_rows(num_columns, 0);
+
+ for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
+ auto group_reader = reader->RowGroup(r);
+ int col = 0;
+ for (auto i : columns) {
+ std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
+ size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
+ std::vector<uint8_t> values(column_batch_size * value_byte_size);
+
+ int64_t values_read = 0;
+ while (col_reader->HasNext()) {
+ int64_t levels_read =
+ ScanAllValues(column_batch_size, def_levels.data(), rep_levels.data(),
+ values.data(), &values_read, col_reader.get());
+ if (col_reader->descr()->max_repetition_level() > 0) {
+ for (int64_t i = 0; i < levels_read; i++) {
+ if (rep_levels[i] == 0) {
+ total_rows[col]++;
+ }
+ }
+ } else {
+ total_rows[col] += levels_read;
+ }
+ }
+ col++;
+ }
+ }
+
+ for (int i = 1; i < num_columns; ++i) {
+ if (total_rows[0] != total_rows[i]) {
+ throw ParquetException("Parquet error: Total rows among columns do not match");
+ }
+ }
+
+ return total_rows[0];
+}
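+
+// Example (hypothetical usage): scan every column in 256-value batches and
+// return the total number of rows:
+//
+// int64_t rows = parquet::ScanFileContents({}, 256, reader.get());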
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h b/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h
index a6358684250..0fc84054939 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/file_reader.h
@@ -1,188 +1,188 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/io/caching.h"
-#include "arrow/util/type_fwd.h"
-#include "parquet/metadata.h" // IWYU pragma: keep
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-
-namespace parquet {
-
-class ColumnReader;
-class FileMetaData;
-class PageReader;
-class RowGroupMetaData;
-
-class PARQUET_EXPORT RowGroupReader {
- public:
- // Forward declare a virtual class 'Contents' to aid dependency injection and more
- // easily create test fixtures
- // An implementation of the Contents class is defined in the .cc file
- struct Contents {
- virtual ~Contents() {}
- virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
- virtual const RowGroupMetaData* metadata() const = 0;
- virtual const ReaderProperties* properties() const = 0;
- };
-
- explicit RowGroupReader(std::unique_ptr<Contents> contents);
-
- // Returns the rowgroup metadata
- const RowGroupMetaData* metadata() const;
-
- // Construct a ColumnReader for the indicated row group-relative
- // column. Ownership is shared with the RowGroupReader.
- std::shared_ptr<ColumnReader> Column(int i);
-
- // Construct a ColumnReader, trying to enable exposed encoding.
- //
- // For dictionary encoding, currently we only support column chunks that are fully
- // dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded.
- // If a column chunk uses dictionary encoding but then falls back to plain encoding, the
- // encoding will not be exposed.
- //
- // The returned column reader provides an API GetExposedEncoding() for the
- // users to check the exposed encoding and determine how to read the batches.
- //
- // \note API EXPERIMENTAL
- std::shared_ptr<ColumnReader> ColumnWithExposeEncoding(
- int i, ExposedEncoding encoding_to_expose);
-
- std::unique_ptr<PageReader> GetColumnPageReader(int i);
-
- private:
- // Holds a pointer to an instance of Contents implementation
- std::unique_ptr<Contents> contents_;
-};
-
-class PARQUET_EXPORT ParquetFileReader {
- public:
- // Declare a virtual class 'Contents' to aid dependency injection and more
- // easily create test fixtures
- // An implementation of the Contents class is defined in the .cc file
- struct PARQUET_EXPORT Contents {
- static std::unique_ptr<Contents> Open(
- std::shared_ptr<::arrow::io::RandomAccessFile> source,
- const ReaderProperties& props = default_reader_properties(),
- std::shared_ptr<FileMetaData> metadata = NULLPTR);
-
- static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync(
- std::shared_ptr<::arrow::io::RandomAccessFile> source,
- const ReaderProperties& props = default_reader_properties(),
- std::shared_ptr<FileMetaData> metadata = NULLPTR);
-
- virtual ~Contents() = default;
- // Perform any cleanup associated with the file contents
- virtual void Close() = 0;
- virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
- virtual std::shared_ptr<FileMetaData> metadata() const = 0;
- };
-
- ParquetFileReader();
- ~ParquetFileReader();
-
- // Create a file reader instance from an Arrow file object. Thread-safety is
- // the responsibility of the file implementation
- static std::unique_ptr<ParquetFileReader> Open(
- std::shared_ptr<::arrow::io::RandomAccessFile> source,
- const ReaderProperties& props = default_reader_properties(),
- std::shared_ptr<FileMetaData> metadata = NULLPTR);
-
- // Convenience API to open a serialized Parquet file on disk, using Arrow IO
- // interfaces.
- static std::unique_ptr<ParquetFileReader> OpenFile(
- const std::string& path, bool memory_map = true,
- const ReaderProperties& props = default_reader_properties(),
- std::shared_ptr<FileMetaData> metadata = NULLPTR);
-
- // Asynchronously open a file reader from an Arrow file object.
- // Does not throw - all errors are reported through the Future.
- static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync(
- std::shared_ptr<::arrow::io::RandomAccessFile> source,
- const ReaderProperties& props = default_reader_properties(),
- std::shared_ptr<FileMetaData> metadata = NULLPTR);
-
- void Open(std::unique_ptr<Contents> contents);
- void Close();
-
- // The RowGroupReader is owned by the FileReader
- std::shared_ptr<RowGroupReader> RowGroup(int i);
-
- // Returns the file metadata. Only one instance is ever created
- std::shared_ptr<FileMetaData> metadata() const;
-
- /// Pre-buffer the specified column indices in all row groups.
- ///
- /// Readers can optionally call this to cache the necessary slices
- /// of the file in-memory before deserialization. Arrow readers can
- /// automatically do this via an option. This is intended to
- /// increase performance when reading from high-latency filesystems
- /// (e.g. Amazon S3).
- ///
- /// After calling this, creating readers for row groups/column
- /// indices that were not buffered may fail. Creating multiple
- /// readers for a subset of the buffered regions is
- /// acceptable. This may be called again to buffer a different set
- /// of row groups/columns.
- ///
- /// If memory usage is a concern, note that data will remain
- /// buffered in memory until either \a PreBuffer() is called again,
- /// or the reader itself is destructed. Reading - and buffering -
- /// only one row group at a time may be useful.
- ///
- /// This method may throw.
- void PreBuffer(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices,
- const ::arrow::io::IOContext& ctx,
- const ::arrow::io::CacheOptions& options);
-
- /// Wait for the specified row groups and column indices to be pre-buffered.
- ///
- /// After the returned Future completes, reading the specified row
- /// groups/columns will not block.
- ///
- /// PreBuffer must be called first. This method does not throw.
- ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
- const std::vector<int>& column_indices) const;
-
- private:
- // Holds a pointer to an instance of Contents implementation
- std::unique_ptr<Contents> contents_;
-};
-
-// Read only Parquet file metadata
-std::shared_ptr<FileMetaData> PARQUET_EXPORT
-ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);
-
-/// \brief Scan all values in file. Useful for performance testing
-/// \param[in] columns the column numbers to scan. If empty scans all
-/// \param[in] column_batch_size number of values to read at a time when scanning column
-/// \param[in] reader a ParquetFileReader instance
-/// \return number of semantic rows in file
-PARQUET_EXPORT
-int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
- ParquetFileReader* reader);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/io/caching.h"
+#include "arrow/util/type_fwd.h"
+#include "parquet/metadata.h" // IWYU pragma: keep
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+
+namespace parquet {
+
+class ColumnReader;
+class FileMetaData;
+class PageReader;
+class RowGroupMetaData;
+
+class PARQUET_EXPORT RowGroupReader {
+ public:
+ // Forward declare a virtual class 'Contents' to aid dependency injection and more
+ // easily create test fixtures
+ // An implementation of the Contents class is defined in the .cc file
+ struct Contents {
+ virtual ~Contents() {}
+ virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
+ virtual const RowGroupMetaData* metadata() const = 0;
+ virtual const ReaderProperties* properties() const = 0;
+ };
+
+ explicit RowGroupReader(std::unique_ptr<Contents> contents);
+
+ // Returns the rowgroup metadata
+ const RowGroupMetaData* metadata() const;
+
+ // Construct a ColumnReader for the indicated row group-relative
+ // column. Ownership is shared with the RowGroupReader.
+ std::shared_ptr<ColumnReader> Column(int i);
+
+ // Construct a ColumnReader, trying to enable exposed encoding.
+ //
+ // For dictionary encoding, currently we only support column chunks that are fully
+ // dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded.
+ // If a column chunk uses dictionary encoding but then falls back to plain encoding, the
+ // encoding will not be exposed.
+ //
+ // The returned column reader provides an API GetExposedEncoding() for the
+ // users to check the exposed encoding and determine how to read the batches.
+ //
+ // \note API EXPERIMENTAL
+ std::shared_ptr<ColumnReader> ColumnWithExposeEncoding(
+ int i, ExposedEncoding encoding_to_expose);
+
+ std::unique_ptr<PageReader> GetColumnPageReader(int i);
+
+ private:
+ // Holds a pointer to an instance of Contents implementation
+ std::unique_ptr<Contents> contents_;
+};
+
+class PARQUET_EXPORT ParquetFileReader {
+ public:
+ // Declare a virtual class 'Contents' to aid dependency injection and more
+ // easily create test fixtures
+ // An implementation of the Contents class is defined in the .cc file
+ struct PARQUET_EXPORT Contents {
+ static std::unique_ptr<Contents> Open(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ virtual ~Contents() = default;
+ // Perform any cleanup associated with the file contents
+ virtual void Close() = 0;
+ virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
+ virtual std::shared_ptr<FileMetaData> metadata() const = 0;
+ };
+
+ ParquetFileReader();
+ ~ParquetFileReader();
+
+ // Create a file reader instance from an Arrow file object. Thread-safety is
+ // the responsibility of the file implementation
+ static std::unique_ptr<ParquetFileReader> Open(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ // Convenience API to open a serialized Parquet file on disk, using Arrow IO
+ // interfaces.
+ static std::unique_ptr<ParquetFileReader> OpenFile(
+ const std::string& path, bool memory_map = true,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ // Asynchronously open a file reader from an Arrow file object.
+ // Does not throw - all errors are reported through the Future.
+ static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync(
+ std::shared_ptr<::arrow::io::RandomAccessFile> source,
+ const ReaderProperties& props = default_reader_properties(),
+ std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+ void Open(std::unique_ptr<Contents> contents);
+ void Close();
+
+ // The RowGroupReader is owned by the FileReader
+ std::shared_ptr<RowGroupReader> RowGroup(int i);
+
+ // Returns the file metadata. Only one instance is ever created
+ std::shared_ptr<FileMetaData> metadata() const;
+
+ /// Pre-buffer the specified column indices in all row groups.
+ ///
+ /// Readers can optionally call this to cache the necessary slices
+ /// of the file in-memory before deserialization. Arrow readers can
+ /// automatically do this via an option. This is intended to
+ /// increase performance when reading from high-latency filesystems
+ /// (e.g. Amazon S3).
+ ///
+ /// After calling this, creating readers for row groups/column
+ /// indices that were not buffered may fail. Creating multiple
+ /// readers for a subset of the buffered regions is
+ /// acceptable. This may be called again to buffer a different set
+ /// of row groups/columns.
+ ///
+ /// If memory usage is a concern, note that data will remain
+ /// buffered in memory until either \a PreBuffer() is called again,
+ /// or the reader itself is destructed. Reading - and buffering -
+ /// only one row group at a time may be useful.
+ ///
+ /// This method may throw.
+ void PreBuffer(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices,
+ const ::arrow::io::IOContext& ctx,
+ const ::arrow::io::CacheOptions& options);
+
+ /// Wait for the specified row groups and column indices to be pre-buffered.
+ ///
+ /// After the returned Future completes, reading the specified row
+ /// groups/columns will not block.
+ ///
+ /// PreBuffer must be called first. This method does not throw.
+ ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
+ const std::vector<int>& column_indices) const;
+
+ private:
+ // Holds a pointer to an instance of Contents implementation
+ std::unique_ptr<Contents> contents_;
+};
+
+// Read only Parquet file metadata
+std::shared_ptr<FileMetaData> PARQUET_EXPORT
+ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);
+
+/// \brief Scan all values in file. Useful for performance testing
+/// \param[in] columns the column numbers to scan. If empty scans all
+/// \param[in] column_batch_size number of values to read at a time when scanning column
+/// \param[in] reader a ParquetFileReader instance
+/// \return number of semantic rows in file
+PARQUET_EXPORT
+int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
+ ParquetFileReader* reader);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc
index a4c824c423b..deac9586e5a 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.cc
@@ -1,547 +1,547 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/file_writer.h"
-
-#include <cstddef>
-#include <ostream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "parquet/column_writer.h"
-#include "parquet/encryption/encryption_internal.h"
-#include "parquet/encryption/internal_file_encryptor.h"
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-using arrow::MemoryPool;
-
-using parquet::schema::GroupNode;
-
-namespace parquet {
-
-// ----------------------------------------------------------------------
-// RowGroupWriter public API
-
-RowGroupWriter::RowGroupWriter(std::unique_ptr<Contents> contents)
- : contents_(std::move(contents)) {}
-
-void RowGroupWriter::Close() {
- if (contents_) {
- contents_->Close();
- }
-}
-
-ColumnWriter* RowGroupWriter::NextColumn() { return contents_->NextColumn(); }
-
-ColumnWriter* RowGroupWriter::column(int i) { return contents_->column(i); }
-
-int64_t RowGroupWriter::total_compressed_bytes() const {
- return contents_->total_compressed_bytes();
-}
-
-int64_t RowGroupWriter::total_bytes_written() const {
- return contents_->total_bytes_written();
-}
-
-int RowGroupWriter::current_column() { return contents_->current_column(); }
-
-int RowGroupWriter::num_columns() const { return contents_->num_columns(); }
-
-int64_t RowGroupWriter::num_rows() const { return contents_->num_rows(); }
-
-inline void ThrowRowsMisMatchError(int col, int64_t prev, int64_t curr) {
- std::stringstream ss;
- ss << "Column " << col << " had " << curr << " while previous column had " << prev;
- throw ParquetException(ss.str());
-}
-
-// ----------------------------------------------------------------------
-// RowGroupSerializer
-
-// RowGroupWriter::Contents implementation for the Parquet file specification
-class RowGroupSerializer : public RowGroupWriter::Contents {
- public:
- RowGroupSerializer(std::shared_ptr<ArrowOutputStream> sink,
- RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal,
- const WriterProperties* properties, bool buffered_row_group = false,
- InternalFileEncryptor* file_encryptor = nullptr)
- : sink_(std::move(sink)),
- metadata_(metadata),
- properties_(properties),
- total_bytes_written_(0),
- closed_(false),
- row_group_ordinal_(row_group_ordinal),
- next_column_index_(0),
- num_rows_(0),
- buffered_row_group_(buffered_row_group),
- file_encryptor_(file_encryptor) {
- if (buffered_row_group) {
- InitColumns();
- } else {
- column_writers_.push_back(nullptr);
- }
- }
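-
- // In buffered mode all column writers are created up front and accessed via
- // column(i); otherwise a single writer slot is reused through NextColumn().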
-
- int num_columns() const override { return metadata_->num_columns(); }
-
- int64_t num_rows() const override {
- CheckRowsWritten();
- // CheckRowsWritten ensures num_rows_ is set correctly
- return num_rows_;
- }
-
- ColumnWriter* NextColumn() override {
- if (buffered_row_group_) {
- throw ParquetException(
- "NextColumn() is not supported when a RowGroup is written by size");
- }
-
- if (column_writers_[0]) {
- CheckRowsWritten();
- }
-
- // Throws an error if more columns are being written
- auto col_meta = metadata_->NextColumnChunk();
-
- if (column_writers_[0]) {
- total_bytes_written_ += column_writers_[0]->Close();
- }
-
- ++next_column_index_;
-
- const auto& path = col_meta->descr()->path();
- auto meta_encryptor =
- file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
- : nullptr;
- auto data_encryptor =
- file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
- : nullptr;
- std::unique_ptr<PageWriter> pager = PageWriter::Open(
- sink_, properties_->compression(path), properties_->compression_level(path),
- col_meta, row_group_ordinal_, static_cast<int16_t>(next_column_index_ - 1),
- properties_->memory_pool(), false, meta_encryptor, data_encryptor);
- column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_);
- return column_writers_[0].get();
- }
-
- ColumnWriter* column(int i) override {
- if (!buffered_row_group_) {
- throw ParquetException(
- "column() is only supported when a BufferedRowGroup is being written");
- }
-
- if (i >= 0 && i < static_cast<int>(column_writers_.size())) {
- return column_writers_[i].get();
- }
- return nullptr;
- }
-
- int current_column() const override { return metadata_->current_column(); }
-
- int64_t total_compressed_bytes() const override {
- int64_t total_compressed_bytes = 0;
- for (size_t i = 0; i < column_writers_.size(); i++) {
- if (column_writers_[i]) {
- total_compressed_bytes += column_writers_[i]->total_compressed_bytes();
- }
- }
- return total_compressed_bytes;
- }
-
- int64_t total_bytes_written() const override {
- int64_t total_bytes_written = 0;
- for (size_t i = 0; i < column_writers_.size(); i++) {
- if (column_writers_[i]) {
- total_bytes_written += column_writers_[i]->total_bytes_written();
- }
- }
- return total_bytes_written;
- }
-
- void Close() override {
- if (!closed_) {
- closed_ = true;
- CheckRowsWritten();
-
- for (size_t i = 0; i < column_writers_.size(); i++) {
- if (column_writers_[i]) {
- total_bytes_written_ += column_writers_[i]->Close();
- column_writers_[i].reset();
- }
- }
-
- column_writers_.clear();
-
- // Ensures all columns have been written
- metadata_->set_num_rows(num_rows_);
- metadata_->Finish(total_bytes_written_, row_group_ordinal_);
- }
- }
-
- private:
- std::shared_ptr<ArrowOutputStream> sink_;
- mutable RowGroupMetaDataBuilder* metadata_;
- const WriterProperties* properties_;
- int64_t total_bytes_written_;
- bool closed_;
- int16_t row_group_ordinal_;
- int next_column_index_;
- mutable int64_t num_rows_;
- bool buffered_row_group_;
- InternalFileEncryptor* file_encryptor_;
-
- void CheckRowsWritten() const {
- // verify when only one column is written at a time
- if (!buffered_row_group_ && column_writers_.size() > 0 && column_writers_[0]) {
- int64_t current_col_rows = column_writers_[0]->rows_written();
- if (num_rows_ == 0) {
- num_rows_ = current_col_rows;
- } else if (num_rows_ != current_col_rows) {
- ThrowRowsMisMatchError(next_column_index_, current_col_rows, num_rows_);
- }
- } else if (buffered_row_group_ &&
- column_writers_.size() > 0) { // when buffered_row_group = true
- int64_t current_col_rows = column_writers_[0]->rows_written();
- for (int i = 1; i < static_cast<int>(column_writers_.size()); i++) {
- int64_t current_col_rows_i = column_writers_[i]->rows_written();
- if (current_col_rows != current_col_rows_i) {
- ThrowRowsMisMatchError(i, current_col_rows_i, current_col_rows);
- }
- }
- num_rows_ = current_col_rows;
- }
- }
-
- void InitColumns() {
- for (int i = 0; i < num_columns(); i++) {
- auto col_meta = metadata_->NextColumnChunk();
- const auto& path = col_meta->descr()->path();
- auto meta_encryptor =
- file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
- : nullptr;
- auto data_encryptor =
- file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
- : nullptr;
- std::unique_ptr<PageWriter> pager = PageWriter::Open(
- sink_, properties_->compression(path), properties_->compression_level(path),
- col_meta, static_cast<int16_t>(row_group_ordinal_),
- static_cast<int16_t>(next_column_index_++), properties_->memory_pool(),
- buffered_row_group_, meta_encryptor, data_encryptor);
- column_writers_.push_back(
- ColumnWriter::Make(col_meta, std::move(pager), properties_));
- }
- }
-
- std::vector<std::shared_ptr<ColumnWriter>> column_writers_;
-};
-
-// ----------------------------------------------------------------------
-// FileSerializer
-
-// An implementation of ParquetFileWriter::Contents that deals with the Parquet
-// file structure, Thrift serialization, and other internal matters
-
-class FileSerializer : public ParquetFileWriter::Contents {
- public:
- static std::unique_ptr<ParquetFileWriter::Contents> Open(
- std::shared_ptr<ArrowOutputStream> sink, std::shared_ptr<GroupNode> schema,
- std::shared_ptr<WriterProperties> properties,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
- std::unique_ptr<ParquetFileWriter::Contents> result(
- new FileSerializer(std::move(sink), std::move(schema), std::move(properties),
- std::move(key_value_metadata)));
-
- return result;
- }
-
- void Close() override {
- if (is_open_) {
- // If any function here raises an exception, we set is_open_ to false first
- // so that Close() is not called again (which could cause a segfault)
- is_open_ = false;
- if (row_group_writer_) {
- num_rows_ += row_group_writer_->num_rows();
- row_group_writer_->Close();
- }
- row_group_writer_.reset();
-
- // Write magic bytes and metadata
- auto file_encryption_properties = properties_->file_encryption_properties();
-
- if (file_encryption_properties == nullptr) { // Non-encrypted file.
- file_metadata_ = metadata_->Finish();
- WriteFileMetaData(*file_metadata_, sink_.get());
- } else { // Encrypted file
- CloseEncryptedFile(file_encryption_properties);
- }
- }
- }
-
- int num_columns() const override { return schema_.num_columns(); }
-
- int num_row_groups() const override { return num_row_groups_; }
-
- int64_t num_rows() const override { return num_rows_; }
-
- const std::shared_ptr<WriterProperties>& properties() const override {
- return properties_;
- }
-
- RowGroupWriter* AppendRowGroup(bool buffered_row_group) {
- if (row_group_writer_) {
- row_group_writer_->Close();
- }
- num_row_groups_++;
- auto rg_metadata = metadata_->AppendRowGroup();
- std::unique_ptr<RowGroupWriter::Contents> contents(new RowGroupSerializer(
- sink_, rg_metadata, static_cast<int16_t>(num_row_groups_ - 1), properties_.get(),
- buffered_row_group, file_encryptor_.get()));
- row_group_writer_.reset(new RowGroupWriter(std::move(contents)));
- return row_group_writer_.get();
- }
-
- RowGroupWriter* AppendRowGroup() override { return AppendRowGroup(false); }
-
- RowGroupWriter* AppendBufferedRowGroup() override { return AppendRowGroup(true); }
-
- ~FileSerializer() override {
- try {
- Close();
- } catch (...) {
- }
- }
-
- private:
- FileSerializer(std::shared_ptr<ArrowOutputStream> sink,
- std::shared_ptr<GroupNode> schema,
- std::shared_ptr<WriterProperties> properties,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata)
- : ParquetFileWriter::Contents(std::move(schema), std::move(key_value_metadata)),
- sink_(std::move(sink)),
- is_open_(true),
- properties_(std::move(properties)),
- num_row_groups_(0),
- num_rows_(0),
- metadata_(FileMetaDataBuilder::Make(&schema_, properties_, key_value_metadata_)) {
- PARQUET_ASSIGN_OR_THROW(int64_t position, sink_->Tell());
- if (position == 0) {
- StartFile();
- } else {
- throw ParquetException("Appending to file not implemented.");
- }
- }
-
- void CloseEncryptedFile(FileEncryptionProperties* file_encryption_properties) {
- // Encrypted file with encrypted footer
- if (file_encryption_properties->encrypted_footer()) {
- // encrypted footer
- file_metadata_ = metadata_->Finish();
-
- PARQUET_ASSIGN_OR_THROW(int64_t position, sink_->Tell());
- uint64_t metadata_start = static_cast<uint64_t>(position);
- auto crypto_metadata = metadata_->GetCryptoMetaData();
- WriteFileCryptoMetaData(*crypto_metadata, sink_.get());
-
- auto footer_encryptor = file_encryptor_->GetFooterEncryptor();
- WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_encryptor, true);
- PARQUET_ASSIGN_OR_THROW(position, sink_->Tell());
- uint32_t footer_and_crypto_len = static_cast<uint32_t>(position - metadata_start);
- PARQUET_THROW_NOT_OK(
- sink_->Write(reinterpret_cast<uint8_t*>(&footer_and_crypto_len), 4));
- PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4));
- } else { // Encrypted file with plaintext footer
- file_metadata_ = metadata_->Finish();
- auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor();
- WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_signing_encryptor,
- false);
- }
- if (file_encryptor_) {
- file_encryptor_->WipeOutEncryptionKeys();
- }
- }
-
- std::shared_ptr<ArrowOutputStream> sink_;
- bool is_open_;
- const std::shared_ptr<WriterProperties> properties_;
- int num_row_groups_;
- int64_t num_rows_;
- std::unique_ptr<FileMetaDataBuilder> metadata_;
- // Only one of the row group writers is active at a time
- std::unique_ptr<RowGroupWriter> row_group_writer_;
-
- std::unique_ptr<InternalFileEncryptor> file_encryptor_;
-
- void StartFile() {
- auto file_encryption_properties = properties_->file_encryption_properties();
- if (file_encryption_properties == nullptr) {
- // Unencrypted parquet files always start with PAR1
- PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4));
- } else {
- // Check that all columns in columnEncryptionProperties exist in the schema.
- auto encrypted_columns = file_encryption_properties->encrypted_columns();
- // If columnEncryptionProperties is empty, every column in the file schema
- // will be encrypted with the footer key.
- if (encrypted_columns.size() != 0) {
- std::vector<std::string> column_path_vec;
- // First, save all column paths in schema.
- for (int i = 0; i < num_columns(); i++) {
- column_path_vec.push_back(schema_.Column(i)->path()->ToDotString());
- }
- // Check if column exists in schema.
- for (const auto& elem : encrypted_columns) {
- auto it = std::find(column_path_vec.begin(), column_path_vec.end(), elem.first);
- if (it == column_path_vec.end()) {
- std::stringstream ss;
- ss << "Encrypted column " + elem.first + " not in file schema";
- throw ParquetException(ss.str());
- }
- }
- }
-
- file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties,
- properties_->memory_pool()));
- if (file_encryption_properties->encrypted_footer()) {
- PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4));
- } else {
- // Encrypted file with plaintext footer mode.
- PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4));
- }
- }
- }
-};
-
-// ----------------------------------------------------------------------
-// ParquetFileWriter public API
-
-ParquetFileWriter::ParquetFileWriter() {}
-
-ParquetFileWriter::~ParquetFileWriter() {
- try {
- Close();
- } catch (...) {
- }
-}
-
-std::unique_ptr<ParquetFileWriter> ParquetFileWriter::Open(
- std::shared_ptr<::arrow::io::OutputStream> sink, std::shared_ptr<GroupNode> schema,
- std::shared_ptr<WriterProperties> properties,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
- auto contents =
- FileSerializer::Open(std::move(sink), std::move(schema), std::move(properties),
- std::move(key_value_metadata));
- std::unique_ptr<ParquetFileWriter> result(new ParquetFileWriter());
- result->Open(std::move(contents));
- return result;
-}
-
-void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink) {
- // Write MetaData
- PARQUET_ASSIGN_OR_THROW(int64_t position, sink->Tell());
- uint32_t metadata_len = static_cast<uint32_t>(position);
-
- file_metadata.WriteTo(sink);
- PARQUET_ASSIGN_OR_THROW(position, sink->Tell());
- metadata_len = static_cast<uint32_t>(position) - metadata_len;
-
- // Write Footer
- PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4));
- PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
-}
-
-void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) {
- PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
- return WriteFileMetaData(file_metadata, sink);
-}
-
-void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
- ArrowOutputStream* sink,
- const std::shared_ptr<Encryptor>& encryptor,
- bool encrypt_footer) {
- if (encrypt_footer) { // Encrypted file with encrypted footer
- // encrypt and write to sink
- file_metadata.WriteTo(sink, encryptor);
- } else { // Encrypted file with plaintext footer mode.
- PARQUET_ASSIGN_OR_THROW(int64_t position, sink->Tell());
- uint32_t metadata_len = static_cast<uint32_t>(position);
- file_metadata.WriteTo(sink, encryptor);
- PARQUET_ASSIGN_OR_THROW(position, sink->Tell());
- metadata_len = static_cast<uint32_t>(position) - metadata_len;
-
- PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4));
- PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
- }
-}
-
-void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
- ArrowOutputStream* sink) {
- crypto_metadata.WriteTo(sink);
-}
-
-const SchemaDescriptor* ParquetFileWriter::schema() const { return contents_->schema(); }
-
-const ColumnDescriptor* ParquetFileWriter::descr(int i) const {
- return contents_->schema()->Column(i);
-}
-
-int ParquetFileWriter::num_columns() const { return contents_->num_columns(); }
-
-int64_t ParquetFileWriter::num_rows() const { return contents_->num_rows(); }
-
-int ParquetFileWriter::num_row_groups() const { return contents_->num_row_groups(); }
-
-const std::shared_ptr<const KeyValueMetadata>& ParquetFileWriter::key_value_metadata()
- const {
- return contents_->key_value_metadata();
-}
-
-const std::shared_ptr<FileMetaData> ParquetFileWriter::metadata() const {
- return file_metadata_;
-}
-
-void ParquetFileWriter::Open(std::unique_ptr<ParquetFileWriter::Contents> contents) {
- contents_ = std::move(contents);
-}
-
-void ParquetFileWriter::Close() {
- if (contents_) {
- contents_->Close();
- file_metadata_ = contents_->metadata();
- contents_.reset();
- }
-}
-
-RowGroupWriter* ParquetFileWriter::AppendRowGroup() {
- return contents_->AppendRowGroup();
-}
-
-RowGroupWriter* ParquetFileWriter::AppendBufferedRowGroup() {
- return contents_->AppendBufferedRowGroup();
-}
-
-RowGroupWriter* ParquetFileWriter::AppendRowGroup(int64_t num_rows) {
- return AppendRowGroup();
-}
-
-const std::shared_ptr<WriterProperties>& ParquetFileWriter::properties() const {
- return contents_->properties();
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/file_writer.h"
+
+#include <cstddef>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "parquet/column_writer.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_encryptor.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+using arrow::MemoryPool;
+
+using parquet::schema::GroupNode;
+
+namespace parquet {
+
+// ----------------------------------------------------------------------
+// RowGroupWriter public API
+
+RowGroupWriter::RowGroupWriter(std::unique_ptr<Contents> contents)
+ : contents_(std::move(contents)) {}
+
+void RowGroupWriter::Close() {
+ if (contents_) {
+ contents_->Close();
+ }
+}
+
+ColumnWriter* RowGroupWriter::NextColumn() { return contents_->NextColumn(); }
+
+ColumnWriter* RowGroupWriter::column(int i) { return contents_->column(i); }
+
+int64_t RowGroupWriter::total_compressed_bytes() const {
+ return contents_->total_compressed_bytes();
+}
+
+int64_t RowGroupWriter::total_bytes_written() const {
+ return contents_->total_bytes_written();
+}
+
+int RowGroupWriter::current_column() { return contents_->current_column(); }
+
+int RowGroupWriter::num_columns() const { return contents_->num_columns(); }
+
+int64_t RowGroupWriter::num_rows() const { return contents_->num_rows(); }
+
+inline void ThrowRowsMisMatchError(int col, int64_t prev, int64_t curr) {
+ std::stringstream ss;
+ ss << "Column " << col << " had " << curr << " while previous column had " << prev;
+ throw ParquetException(ss.str());
+}
+
+// ----------------------------------------------------------------------
+// RowGroupSerializer
+
+// RowGroupWriter::Contents implementation for the Parquet file specification
+class RowGroupSerializer : public RowGroupWriter::Contents {
+ public:
+ RowGroupSerializer(std::shared_ptr<ArrowOutputStream> sink,
+ RowGroupMetaDataBuilder* metadata, int16_t row_group_ordinal,
+ const WriterProperties* properties, bool buffered_row_group = false,
+ InternalFileEncryptor* file_encryptor = nullptr)
+ : sink_(std::move(sink)),
+ metadata_(metadata),
+ properties_(properties),
+ total_bytes_written_(0),
+ closed_(false),
+ row_group_ordinal_(row_group_ordinal),
+ next_column_index_(0),
+ num_rows_(0),
+ buffered_row_group_(buffered_row_group),
+ file_encryptor_(file_encryptor) {
+ if (buffered_row_group) {
+ InitColumns();
+ } else {
+ column_writers_.push_back(nullptr);
+ }
+ }
+
+ int num_columns() const override { return metadata_->num_columns(); }
+
+ int64_t num_rows() const override {
+ CheckRowsWritten();
+ // CheckRowsWritten ensures num_rows_ is set correctly
+ return num_rows_;
+ }
+
+ ColumnWriter* NextColumn() override {
+ if (buffered_row_group_) {
+ throw ParquetException(
+ "NextColumn() is not supported when a RowGroup is written by size");
+ }
+
+ if (column_writers_[0]) {
+ CheckRowsWritten();
+ }
+
+ // Throws an error if more column chunks are requested than the schema contains
+ auto col_meta = metadata_->NextColumnChunk();
+
+ if (column_writers_[0]) {
+ total_bytes_written_ += column_writers_[0]->Close();
+ }
+
+ ++next_column_index_;
+
+ const auto& path = col_meta->descr()->path();
+ auto meta_encryptor =
+ file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
+ : nullptr;
+ auto data_encryptor =
+ file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
+ : nullptr;
+ std::unique_ptr<PageWriter> pager = PageWriter::Open(
+ sink_, properties_->compression(path), properties_->compression_level(path),
+ col_meta, row_group_ordinal_, static_cast<int16_t>(next_column_index_ - 1),
+ properties_->memory_pool(), false, meta_encryptor, data_encryptor);
+ column_writers_[0] = ColumnWriter::Make(col_meta, std::move(pager), properties_);
+ return column_writers_[0].get();
+ }
+
+ ColumnWriter* column(int i) override {
+ if (!buffered_row_group_) {
+ throw ParquetException(
+ "column() is only supported when a BufferedRowGroup is being written");
+ }
+
+ if (i >= 0 && i < static_cast<int>(column_writers_.size())) {
+ return column_writers_[i].get();
+ }
+ return nullptr;
+ }
+
+ int current_column() const override { return metadata_->current_column(); }
+
+ int64_t total_compressed_bytes() const override {
+ int64_t total_compressed_bytes = 0;
+ for (size_t i = 0; i < column_writers_.size(); i++) {
+ if (column_writers_[i]) {
+ total_compressed_bytes += column_writers_[i]->total_compressed_bytes();
+ }
+ }
+ return total_compressed_bytes;
+ }
+
+ int64_t total_bytes_written() const override {
+ int64_t total_bytes_written = 0;
+ for (size_t i = 0; i < column_writers_.size(); i++) {
+ if (column_writers_[i]) {
+ total_bytes_written += column_writers_[i]->total_bytes_written();
+ }
+ }
+ return total_bytes_written;
+ }
+
+ void Close() override {
+ if (!closed_) {
+ closed_ = true;
+ CheckRowsWritten();
+
+ for (size_t i = 0; i < column_writers_.size(); i++) {
+ if (column_writers_[i]) {
+ total_bytes_written_ += column_writers_[i]->Close();
+ column_writers_[i].reset();
+ }
+ }
+
+ column_writers_.clear();
+
+ // Finish() verifies that all column chunks have been written
+ metadata_->set_num_rows(num_rows_);
+ metadata_->Finish(total_bytes_written_, row_group_ordinal_);
+ }
+ }
+
+ private:
+ std::shared_ptr<ArrowOutputStream> sink_;
+ mutable RowGroupMetaDataBuilder* metadata_;
+ const WriterProperties* properties_;
+ int64_t total_bytes_written_;
+ bool closed_;
+ int16_t row_group_ordinal_;
+ int next_column_index_;
+ mutable int64_t num_rows_;
+ bool buffered_row_group_;
+ InternalFileEncryptor* file_encryptor_;
+
+ void CheckRowsWritten() const {
+ // Verify row counts in the non-buffered case (one column written at a time)
+ if (!buffered_row_group_ && column_writers_.size() > 0 && column_writers_[0]) {
+ int64_t current_col_rows = column_writers_[0]->rows_written();
+ if (num_rows_ == 0) {
+ num_rows_ = current_col_rows;
+ } else if (num_rows_ != current_col_rows) {
+ ThrowRowsMisMatchError(next_column_index_, current_col_rows, num_rows_);
+ }
+ } else if (buffered_row_group_ &&
+ column_writers_.size() > 0) { // when buffered_row_group = true
+ int64_t current_col_rows = column_writers_[0]->rows_written();
+ for (int i = 1; i < static_cast<int>(column_writers_.size()); i++) {
+ int64_t current_col_rows_i = column_writers_[i]->rows_written();
+ if (current_col_rows != current_col_rows_i) {
+ ThrowRowsMisMatchError(i, current_col_rows_i, current_col_rows);
+ }
+ }
+ num_rows_ = current_col_rows;
+ }
+ }
+
+ void InitColumns() {
+ for (int i = 0; i < num_columns(); i++) {
+ auto col_meta = metadata_->NextColumnChunk();
+ const auto& path = col_meta->descr()->path();
+ auto meta_encryptor =
+ file_encryptor_ ? file_encryptor_->GetColumnMetaEncryptor(path->ToDotString())
+ : nullptr;
+ auto data_encryptor =
+ file_encryptor_ ? file_encryptor_->GetColumnDataEncryptor(path->ToDotString())
+ : nullptr;
+ std::unique_ptr<PageWriter> pager = PageWriter::Open(
+ sink_, properties_->compression(path), properties_->compression_level(path),
+ col_meta, static_cast<int16_t>(row_group_ordinal_),
+ static_cast<int16_t>(next_column_index_++), properties_->memory_pool(),
+ buffered_row_group_, meta_encryptor, data_encryptor);
+ column_writers_.push_back(
+ ColumnWriter::Make(col_meta, std::move(pager), properties_));
+ }
+ }
+
+ std::vector<std::shared_ptr<ColumnWriter>> column_writers_;
+};
+
+// ----------------------------------------------------------------------
+// FileSerializer
+
+// An implementation of ParquetFileWriter::Contents that deals with the Parquet
+// file structure, Thrift serialization, and other internal matters
+
+class FileSerializer : public ParquetFileWriter::Contents {
+ public:
+ static std::unique_ptr<ParquetFileWriter::Contents> Open(
+ std::shared_ptr<ArrowOutputStream> sink, std::shared_ptr<GroupNode> schema,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
+ std::unique_ptr<ParquetFileWriter::Contents> result(
+ new FileSerializer(std::move(sink), std::move(schema), std::move(properties),
+ std::move(key_value_metadata)));
+
+ return result;
+ }
+
+ void Close() override {
+ if (is_open_) {
+ // If any function here raises an exception, we set is_open_ to false first
+ // so that Close() is not called again (which could cause a segfault)
+ is_open_ = false;
+ if (row_group_writer_) {
+ num_rows_ += row_group_writer_->num_rows();
+ row_group_writer_->Close();
+ }
+ row_group_writer_.reset();
+
+ // Write magic bytes and metadata
+ auto file_encryption_properties = properties_->file_encryption_properties();
+
+ if (file_encryption_properties == nullptr) { // Non-encrypted file.
+ file_metadata_ = metadata_->Finish();
+ WriteFileMetaData(*file_metadata_, sink_.get());
+ } else { // Encrypted file
+ CloseEncryptedFile(file_encryption_properties);
+ }
+ }
+ }
+
+ int num_columns() const override { return schema_.num_columns(); }
+
+ int num_row_groups() const override { return num_row_groups_; }
+
+ int64_t num_rows() const override { return num_rows_; }
+
+ const std::shared_ptr<WriterProperties>& properties() const override {
+ return properties_;
+ }
+
+ RowGroupWriter* AppendRowGroup(bool buffered_row_group) {
+ if (row_group_writer_) {
+ row_group_writer_->Close();
+ }
+ num_row_groups_++;
+ auto rg_metadata = metadata_->AppendRowGroup();
+ std::unique_ptr<RowGroupWriter::Contents> contents(new RowGroupSerializer(
+ sink_, rg_metadata, static_cast<int16_t>(num_row_groups_ - 1), properties_.get(),
+ buffered_row_group, file_encryptor_.get()));
+ row_group_writer_.reset(new RowGroupWriter(std::move(contents)));
+ return row_group_writer_.get();
+ }
+
+ RowGroupWriter* AppendRowGroup() override { return AppendRowGroup(false); }
+
+ RowGroupWriter* AppendBufferedRowGroup() override { return AppendRowGroup(true); }
+
+ ~FileSerializer() override {
+ try {
+ Close();
+ } catch (...) {
+ }
+ }
+
+ private:
+ FileSerializer(std::shared_ptr<ArrowOutputStream> sink,
+ std::shared_ptr<GroupNode> schema,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata)
+ : ParquetFileWriter::Contents(std::move(schema), std::move(key_value_metadata)),
+ sink_(std::move(sink)),
+ is_open_(true),
+ properties_(std::move(properties)),
+ num_row_groups_(0),
+ num_rows_(0),
+ metadata_(FileMetaDataBuilder::Make(&schema_, properties_, key_value_metadata_)) {
+ PARQUET_ASSIGN_OR_THROW(int64_t position, sink_->Tell());
+ if (position == 0) {
+ StartFile();
+ } else {
+ throw ParquetException("Appending to file not implemented.");
+ }
+ }
+
+ void CloseEncryptedFile(FileEncryptionProperties* file_encryption_properties) {
+ // Encrypted file with encrypted footer
+ if (file_encryption_properties->encrypted_footer()) {
+ // encrypted footer
+ file_metadata_ = metadata_->Finish();
+
+ PARQUET_ASSIGN_OR_THROW(int64_t position, sink_->Tell());
+ uint64_t metadata_start = static_cast<uint64_t>(position);
+ auto crypto_metadata = metadata_->GetCryptoMetaData();
+ WriteFileCryptoMetaData(*crypto_metadata, sink_.get());
+
+ auto footer_encryptor = file_encryptor_->GetFooterEncryptor();
+ WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_encryptor, true);
+ PARQUET_ASSIGN_OR_THROW(position, sink_->Tell());
+ uint32_t footer_and_crypto_len = static_cast<uint32_t>(position - metadata_start);
+ PARQUET_THROW_NOT_OK(
+ sink_->Write(reinterpret_cast<uint8_t*>(&footer_and_crypto_len), 4));
+ PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4));
+ } else { // Encrypted file with plaintext footer
+ file_metadata_ = metadata_->Finish();
+ auto footer_signing_encryptor = file_encryptor_->GetFooterSigningEncryptor();
+ WriteEncryptedFileMetadata(*file_metadata_, sink_.get(), footer_signing_encryptor,
+ false);
+ }
+ if (file_encryptor_) {
+ file_encryptor_->WipeOutEncryptionKeys();
+ }
+ }
+
+ std::shared_ptr<ArrowOutputStream> sink_;
+ bool is_open_;
+ const std::shared_ptr<WriterProperties> properties_;
+ int num_row_groups_;
+ int64_t num_rows_;
+ std::unique_ptr<FileMetaDataBuilder> metadata_;
+ // Only one of the row group writers is active at a time
+ std::unique_ptr<RowGroupWriter> row_group_writer_;
+
+ std::unique_ptr<InternalFileEncryptor> file_encryptor_;
+
+ void StartFile() {
+ auto file_encryption_properties = properties_->file_encryption_properties();
+ if (file_encryption_properties == nullptr) {
+ // Unencrypted parquet files always start with PAR1
+ PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4));
+ } else {
+ // Check that all columns in columnEncryptionProperties exist in the schema.
+ auto encrypted_columns = file_encryption_properties->encrypted_columns();
+ // If columnEncryptionProperties is empty, every column in the file schema
+ // will be encrypted with the footer key.
+ if (encrypted_columns.size() != 0) {
+ std::vector<std::string> column_path_vec;
+ // First, save all column paths in schema.
+ for (int i = 0; i < num_columns(); i++) {
+ column_path_vec.push_back(schema_.Column(i)->path()->ToDotString());
+ }
+ // Check if column exists in schema.
+ for (const auto& elem : encrypted_columns) {
+ auto it = std::find(column_path_vec.begin(), column_path_vec.end(), elem.first);
+ if (it == column_path_vec.end()) {
+ std::stringstream ss;
+ ss << "Encrypted column " + elem.first + " not in file schema";
+ throw ParquetException(ss.str());
+ }
+ }
+ }
+
+ file_encryptor_.reset(new InternalFileEncryptor(file_encryption_properties,
+ properties_->memory_pool()));
+ if (file_encryption_properties->encrypted_footer()) {
+ PARQUET_THROW_NOT_OK(sink_->Write(kParquetEMagic, 4));
+ } else {
+ // Encrypted file with plaintext footer mode.
+ PARQUET_THROW_NOT_OK(sink_->Write(kParquetMagic, 4));
+ }
+ }
+ }
+};
+
+// ----------------------------------------------------------------------
+// ParquetFileWriter public API
+
+ParquetFileWriter::ParquetFileWriter() {}
+
+ParquetFileWriter::~ParquetFileWriter() {
+ try {
+ Close();
+ } catch (...) {
+ }
+}
+
+std::unique_ptr<ParquetFileWriter> ParquetFileWriter::Open(
+ std::shared_ptr<::arrow::io::OutputStream> sink, std::shared_ptr<GroupNode> schema,
+ std::shared_ptr<WriterProperties> properties,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
+ auto contents =
+ FileSerializer::Open(std::move(sink), std::move(schema), std::move(properties),
+ std::move(key_value_metadata));
+ std::unique_ptr<ParquetFileWriter> result(new ParquetFileWriter());
+ result->Open(std::move(contents));
+ return result;
+}
+
+void WriteFileMetaData(const FileMetaData& file_metadata, ArrowOutputStream* sink) {
+ // Write MetaData
+ PARQUET_ASSIGN_OR_THROW(int64_t position, sink->Tell());
+ uint32_t metadata_len = static_cast<uint32_t>(position);
+
+ file_metadata.WriteTo(sink);
+ PARQUET_ASSIGN_OR_THROW(position, sink->Tell());
+ metadata_len = static_cast<uint32_t>(position) - metadata_len;
+
+ // Write Footer
+ PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4));
+ PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
+}
+
+void WriteMetaDataFile(const FileMetaData& file_metadata, ArrowOutputStream* sink) {
+ PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
+ return WriteFileMetaData(file_metadata, sink);
+}
+
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+ ArrowOutputStream* sink,
+ const std::shared_ptr<Encryptor>& encryptor,
+ bool encrypt_footer) {
+ if (encrypt_footer) { // Encrypted file with encrypted footer
+ // encrypt and write to sink
+ file_metadata.WriteTo(sink, encryptor);
+ } else { // Encrypted file with plaintext footer mode.
+ PARQUET_ASSIGN_OR_THROW(int64_t position, sink->Tell());
+ uint32_t metadata_len = static_cast<uint32_t>(position);
+ file_metadata.WriteTo(sink, encryptor);
+ PARQUET_ASSIGN_OR_THROW(position, sink->Tell());
+ metadata_len = static_cast<uint32_t>(position) - metadata_len;
+
+ PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<uint8_t*>(&metadata_len), 4));
+ PARQUET_THROW_NOT_OK(sink->Write(kParquetMagic, 4));
+ }
+}
+
+void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
+ ArrowOutputStream* sink) {
+ crypto_metadata.WriteTo(sink);
+}
+
+const SchemaDescriptor* ParquetFileWriter::schema() const { return contents_->schema(); }
+
+const ColumnDescriptor* ParquetFileWriter::descr(int i) const {
+ return contents_->schema()->Column(i);
+}
+
+int ParquetFileWriter::num_columns() const { return contents_->num_columns(); }
+
+int64_t ParquetFileWriter::num_rows() const { return contents_->num_rows(); }
+
+int ParquetFileWriter::num_row_groups() const { return contents_->num_row_groups(); }
+
+const std::shared_ptr<const KeyValueMetadata>& ParquetFileWriter::key_value_metadata()
+ const {
+ return contents_->key_value_metadata();
+}
+
+const std::shared_ptr<FileMetaData> ParquetFileWriter::metadata() const {
+ return file_metadata_;
+}
+
+void ParquetFileWriter::Open(std::unique_ptr<ParquetFileWriter::Contents> contents) {
+ contents_ = std::move(contents);
+}
+
+void ParquetFileWriter::Close() {
+ if (contents_) {
+ contents_->Close();
+ file_metadata_ = contents_->metadata();
+ contents_.reset();
+ }
+}
+
+RowGroupWriter* ParquetFileWriter::AppendRowGroup() {
+ return contents_->AppendRowGroup();
+}
+
+RowGroupWriter* ParquetFileWriter::AppendBufferedRowGroup() {
+ return contents_->AppendBufferedRowGroup();
+}
+
+RowGroupWriter* ParquetFileWriter::AppendRowGroup(int64_t num_rows) {
+ return AppendRowGroup();
+}
+
+const std::shared_ptr<WriterProperties>& ParquetFileWriter::properties() const {
+ return contents_->properties();
+}
+
+} // namespace parquet
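
The serialized write path above is easiest to see end to end. The following sketch is illustrative only: the single INT32 column, its name "x", and the in-memory sink are assumptions, not part of this diff. It opens a writer, appends one row group, writes one column chunk through NextColumn(), and closes the file so the footer is serialized.

#include "arrow/io/memory.h"
#include "parquet/column_writer.h"
#include "parquet/file_writer.h"
#include "parquet/schema.h"

int main() {
  using parquet::schema::GroupNode;
  using parquet::schema::PrimitiveNode;

  // Hypothetical schema: one required INT32 column named "x".
  parquet::schema::NodeVector fields = {PrimitiveNode::Make(
      "x", parquet::Repetition::REQUIRED, parquet::Type::INT32)};
  auto schema = std::static_pointer_cast<GroupNode>(
      GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields));

  auto sink = arrow::io::BufferOutputStream::Create().ValueOrDie();
  auto writer = parquet::ParquetFileWriter::Open(sink, schema);

  // In the unbuffered mode, NextColumn() closes the previous column chunk
  // before starting the next one (see RowGroupSerializer::NextColumn()).
  auto* rg = writer->AppendRowGroup();
  auto* col = static_cast<parquet::Int32Writer*>(rg->NextColumn());
  int32_t values[] = {1, 2, 3};
  col->WriteBatch(3, nullptr, nullptr, values);

  writer->Close();  // Finishes the metadata and writes the footer.
  return 0;
}
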
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h b/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h
index dafb2573b2c..4cfc24719a3 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/file_writer.h
@@ -1,234 +1,234 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <utility>
-
-#include "parquet/metadata.h"
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-#include "parquet/schema.h"
-
-namespace parquet {
-
-class ColumnWriter;
-
-// FIXME: copied from reader-internal.cc
-static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
-static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};
-
-class PARQUET_EXPORT RowGroupWriter {
- public:
- // 'Contents' is a virtual inner class that aids dependency injection and
- // makes it easier to create test fixtures.
- // An implementation of the Contents class is defined in the .cc file
- struct Contents {
- virtual ~Contents() = default;
- virtual int num_columns() const = 0;
- virtual int64_t num_rows() const = 0;
-
- // to be used only with ParquetFileWriter::AppendRowGroup
- virtual ColumnWriter* NextColumn() = 0;
- // to be used only with ParquetFileWriter::AppendBufferedRowGroup
- virtual ColumnWriter* column(int i) = 0;
-
- virtual int current_column() const = 0;
- virtual void Close() = 0;
-
- // total bytes written by the page writer
- virtual int64_t total_bytes_written() const = 0;
- // total bytes still compressed but not written
- virtual int64_t total_compressed_bytes() const = 0;
- };
-
- explicit RowGroupWriter(std::unique_ptr<Contents> contents);
-
- /// Construct a ColumnWriter for the indicated row group-relative column.
- ///
- /// To be used only with ParquetFileWriter::AppendRowGroup
- /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
- /// valid until the next call to NextColumn or Close. As the contents are
- /// directly written to the sink, once a new column is started, the contents
- /// of the previous one cannot be modified anymore.
- ColumnWriter* NextColumn();
- /// Index of currently written column. Equal to -1 if NextColumn()
- /// has not been called yet.
- int current_column();
- void Close();
-
- int num_columns() const;
-
- /// Construct a ColumnWriter for the indicated row group column.
- ///
- /// To be used only with ParquetFileWriter::AppendBufferedRowGroup
- /// Ownership is solely within the RowGroupWriter. The ColumnWriter is
- /// valid until Close(). The contents are buffered in memory and written to
- /// the sink on Close().
- ColumnWriter* column(int i);
-
- /// Number of rows that shall be written as part of this RowGroup.
- int64_t num_rows() const;
-
- int64_t total_bytes_written() const;
- int64_t total_compressed_bytes() const;
-
- private:
- // Holds a pointer to an instance of Contents implementation
- std::unique_ptr<Contents> contents_;
-};
-
-PARQUET_EXPORT
-void WriteFileMetaData(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink);
-
-PARQUET_EXPORT
-void WriteMetaDataFile(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink);
-
-PARQUET_EXPORT
-void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
- ArrowOutputStream* sink,
- const std::shared_ptr<Encryptor>& encryptor,
- bool encrypt_footer);
-
-PARQUET_EXPORT
-void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
- ::arrow::io::OutputStream* sink,
- const std::shared_ptr<Encryptor>& encryptor = NULLPTR,
- bool encrypt_footer = false);
-PARQUET_EXPORT
-void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
- ::arrow::io::OutputStream* sink);
-
-class PARQUET_EXPORT ParquetFileWriter {
- public:
- // 'Contents' is a virtual inner class that aids dependency injection and
- // makes it easier to create test fixtures.
- // An implementation of the Contents class is defined in the .cc file
- struct Contents {
- Contents(std::shared_ptr<::parquet::schema::GroupNode> schema,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata)
- : schema_(), key_value_metadata_(std::move(key_value_metadata)) {
- schema_.Init(std::move(schema));
- }
- virtual ~Contents() {}
- // Perform any cleanup associated with the file contents
- virtual void Close() = 0;
-
- /// \note Deprecated since 1.3.0
- RowGroupWriter* AppendRowGroup(int64_t num_rows);
-
- virtual RowGroupWriter* AppendRowGroup() = 0;
- virtual RowGroupWriter* AppendBufferedRowGroup() = 0;
-
- virtual int64_t num_rows() const = 0;
- virtual int num_columns() const = 0;
- virtual int num_row_groups() const = 0;
-
- virtual const std::shared_ptr<WriterProperties>& properties() const = 0;
-
- const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
- return key_value_metadata_;
- }
-
- // Return const-pointer to make it clear that this object is not to be copied
- const SchemaDescriptor* schema() const { return &schema_; }
-
- SchemaDescriptor schema_;
-
- /// This should be the only place this is stored. Everything else is a const reference
- std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
-
- const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; }
- std::shared_ptr<FileMetaData> file_metadata_;
- };
-
- ParquetFileWriter();
- ~ParquetFileWriter();
-
- static std::unique_ptr<ParquetFileWriter> Open(
- std::shared_ptr<::arrow::io::OutputStream> sink,
- std::shared_ptr<schema::GroupNode> schema,
- std::shared_ptr<WriterProperties> properties = default_writer_properties(),
- std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
-
- void Open(std::unique_ptr<Contents> contents);
- void Close();
-
- // Construct a RowGroupWriter for the indicated number of rows.
- //
- // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
- // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
- // @param num_rows The number of rows that are stored in the new RowGroup
- //
- // \deprecated Since 1.3.0
- RowGroupWriter* AppendRowGroup(int64_t num_rows);
-
- /// Construct a RowGroupWriter with an arbitrary number of rows.
- ///
- /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
- /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
- RowGroupWriter* AppendRowGroup();
-
- /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
- /// Use this if you want to write a RowGroup based on a certain size
- ///
- /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
- /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
- RowGroupWriter* AppendBufferedRowGroup();
-
- /// Number of columns.
- ///
- /// This number is fixed during the lifetime of the writer as it is determined via
- /// the schema.
- int num_columns() const;
-
- /// Number of rows written in the RowGroups started so far.
- ///
- /// Changes when a new RowGroup is appended.
- int64_t num_rows() const;
-
- /// Number of started RowGroups.
- int num_row_groups() const;
-
- /// Configuration passed to the writer, e.g. the used Parquet format version.
- const std::shared_ptr<WriterProperties>& properties() const;
-
- /// Returns the file schema descriptor
- const SchemaDescriptor* schema() const;
-
- /// Returns a column descriptor in schema
- const ColumnDescriptor* descr(int i) const;
-
- /// Returns the file custom metadata
- const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
-
- /// Returns the file metadata, only available after calling Close().
- const std::shared_ptr<FileMetaData> metadata() const;
-
- private:
- // Holds a pointer to an instance of Contents implementation
- std::unique_ptr<Contents> contents_;
- std::shared_ptr<FileMetaData> file_metadata_;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+
+class ColumnWriter;
+
+// FIXME: copied from reader-internal.cc
+static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
+static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};
+
+class PARQUET_EXPORT RowGroupWriter {
+ public:
+ // 'Contents' is a virtual inner class that aids dependency injection and
+ // makes it easier to create test fixtures.
+ // An implementation of the Contents class is defined in the .cc file
+ struct Contents {
+ virtual ~Contents() = default;
+ virtual int num_columns() const = 0;
+ virtual int64_t num_rows() const = 0;
+
+ // to be used only with ParquetFileWriter::AppendRowGroup
+ virtual ColumnWriter* NextColumn() = 0;
+ // to be used only with ParquetFileWriter::AppendBufferedRowGroup
+ virtual ColumnWriter* column(int i) = 0;
+
+ virtual int current_column() const = 0;
+ virtual void Close() = 0;
+
+ // total bytes written by the page writer
+ virtual int64_t total_bytes_written() const = 0;
+ // total bytes still compressed but not written
+ virtual int64_t total_compressed_bytes() const = 0;
+ };
+
+ explicit RowGroupWriter(std::unique_ptr<Contents> contents);
+
+ /// Construct a ColumnWriter for the indicated row group-relative column.
+ ///
+ /// To be used only with ParquetFileWriter::AppendRowGroup
+ /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
+ /// valid until the next call to NextColumn or Close. As the contents are
+ /// directly written to the sink, once a new column is started, the contents
+ /// of the previous one cannot be modified anymore.
+ ColumnWriter* NextColumn();
+ /// Index of currently written column. Equal to -1 if NextColumn()
+ /// has not been called yet.
+ int current_column();
+ void Close();
+
+ int num_columns() const;
+
+ /// Construct a ColumnWriter for the indicated row group column.
+ ///
+ /// To be used only with ParquetFileWriter::AppendBufferedRowGroup
+ /// Ownership is solely within the RowGroupWriter. The ColumnWriter is
+ /// valid until Close(). The contents are buffered in memory and written to
+ /// the sink on Close().
+ ColumnWriter* column(int i);
+
+ /// Number of rows that shall be written as part of this RowGroup.
+ int64_t num_rows() const;
+
+ int64_t total_bytes_written() const;
+ int64_t total_compressed_bytes() const;
+
+ private:
+ // Holds a pointer to an instance of Contents implementation
+ std::unique_ptr<Contents> contents_;
+};
+
+PARQUET_EXPORT
+void WriteFileMetaData(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink);
+
+PARQUET_EXPORT
+void WriteMetaDataFile(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink);
+
+PARQUET_EXPORT
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+ ArrowOutputStream* sink,
+ const std::shared_ptr<Encryptor>& encryptor,
+ bool encrypt_footer);
+
+PARQUET_EXPORT
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+ ::arrow::io::OutputStream* sink,
+ const std::shared_ptr<Encryptor>& encryptor = NULLPTR,
+ bool encrypt_footer = false);
+PARQUET_EXPORT
+void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
+ ::arrow::io::OutputStream* sink);
+
+class PARQUET_EXPORT ParquetFileWriter {
+ public:
+ // 'Contents' is a virtual inner class that aids dependency injection and
+ // makes it easier to create test fixtures.
+ // An implementation of the Contents class is defined in the .cc file
+ struct Contents {
+ Contents(std::shared_ptr<::parquet::schema::GroupNode> schema,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata)
+ : schema_(), key_value_metadata_(std::move(key_value_metadata)) {
+ schema_.Init(std::move(schema));
+ }
+ virtual ~Contents() {}
+ // Perform any cleanup associated with the file contents
+ virtual void Close() = 0;
+
+ /// \note Deprecated since 1.3.0
+ RowGroupWriter* AppendRowGroup(int64_t num_rows);
+
+ virtual RowGroupWriter* AppendRowGroup() = 0;
+ virtual RowGroupWriter* AppendBufferedRowGroup() = 0;
+
+ virtual int64_t num_rows() const = 0;
+ virtual int num_columns() const = 0;
+ virtual int num_row_groups() const = 0;
+
+ virtual const std::shared_ptr<WriterProperties>& properties() const = 0;
+
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
+ return key_value_metadata_;
+ }
+
+ // Return const-pointer to make it clear that this object is not to be copied
+ const SchemaDescriptor* schema() const { return &schema_; }
+
+ SchemaDescriptor schema_;
+
+ /// This should be the only place this is stored. Everything else is a const reference
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
+
+ const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; }
+ std::shared_ptr<FileMetaData> file_metadata_;
+ };
+
+ ParquetFileWriter();
+ ~ParquetFileWriter();
+
+ static std::unique_ptr<ParquetFileWriter> Open(
+ std::shared_ptr<::arrow::io::OutputStream> sink,
+ std::shared_ptr<schema::GroupNode> schema,
+ std::shared_ptr<WriterProperties> properties = default_writer_properties(),
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
+
+ void Open(std::unique_ptr<Contents> contents);
+ void Close();
+
+ // Construct a RowGroupWriter for the indicated number of rows.
+ //
+ // Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
+ // until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
+ // @param num_rows The number of rows that are stored in the new RowGroup
+ //
+ // \deprecated Since 1.3.0
+ RowGroupWriter* AppendRowGroup(int64_t num_rows);
+
+ /// Construct a RowGroupWriter with an arbitrary number of rows.
+ ///
+ /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
+ /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
+ RowGroupWriter* AppendRowGroup();
+
+ /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
+ /// Use this if you want to write a RowGroup based on a certain size
+ ///
+ /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
+ /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
+ RowGroupWriter* AppendBufferedRowGroup();
+
+ /// Number of columns.
+ ///
+ /// This number is fixed during the lifetime of the writer as it is determined via
+ /// the schema.
+ int num_columns() const;
+
+ /// Number of rows written in the RowGroups started so far.
+ ///
+ /// Changes when a new RowGroup is appended.
+ int64_t num_rows() const;
+
+ /// Number of started RowGroups.
+ int num_row_groups() const;
+
+ /// Configuration passed to the writer, e.g. the used Parquet format version.
+ const std::shared_ptr<WriterProperties>& properties() const;
+
+ /// Returns the file schema descriptor
+ const SchemaDescriptor* schema() const;
+
+ /// Returns a column descriptor in schema
+ const ColumnDescriptor* descr(int i) const;
+
+ /// Returns the file custom metadata
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
+
+ /// Returns the file metadata, only available after calling Close().
+ const std::shared_ptr<FileMetaData> metadata() const;
+
+ private:
+ // Holds a pointer to an instance of Contents implementation
+ std::unique_ptr<Contents> contents_;
+ std::shared_ptr<FileMetaData> file_metadata_;
+};
+
+} // namespace parquet
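
The header above exposes two row-group modes: AppendRowGroup() with strictly sequential NextColumn() calls, and AppendBufferedRowGroup() with random-access column(i). A minimal sketch of the buffered mode follows; it assumes a writer opened as in the previous example over a hypothetical two-column INT32 schema.

#include "parquet/column_writer.h"
#include "parquet/file_writer.h"

void WriteBufferedRowGroup(parquet::ParquetFileWriter* writer) {
  parquet::RowGroupWriter* rg = writer->AppendBufferedRowGroup();
  auto* a = static_cast<parquet::Int32Writer*>(rg->column(0));
  auto* b = static_cast<parquet::Int32Writer*>(rg->column(1));
  int32_t va[] = {1, 2};
  int32_t vb[] = {3, 4};
  // Unlike NextColumn(), columns may be filled in any order; pages stay
  // buffered in memory and are flushed to the sink when the group closes.
  b->WriteBatch(2, nullptr, nullptr, vb);
  a->WriteBatch(2, nullptr, nullptr, va);
  rg->Close();
}
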
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/hasher.h b/contrib/libs/apache/arrow/cpp/src/parquet/hasher.h
index 7452e39190f..d699356a6c4 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/hasher.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/hasher.h
@@ -1,72 +1,72 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include "parquet/types.h"
-
-namespace parquet {
-// Abstract base class for hashing values via their plain encoding
-class Hasher {
- public:
- /// Compute hash for a 32-bit value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(int32_t value) const = 0;
-
- /// Compute hash for a 64-bit value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(int64_t value) const = 0;
-
- /// Compute hash for float value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(float value) const = 0;
-
- /// Compute hash for double value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(double value) const = 0;
-
- /// Compute hash for Int96 value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(const Int96* value) const = 0;
-
- /// Compute hash for ByteArray value by using its plain encoding result.
- ///
- /// @param value the value to hash.
- /// @return hash result.
- virtual uint64_t Hash(const ByteArray* value) const = 0;
-
- /// Compute hash for fixed byte array value by using its plain encoding result.
- ///
- /// @param value the value address.
- /// @param len the value length.
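- /// @return hash result.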
- virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
-
- virtual ~Hasher() = default;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include "parquet/types.h"
+
+namespace parquet {
+// Abstract base class for hashing values via their plain encoding
+class Hasher {
+ public:
+ /// Compute hash for a 32-bit value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(int32_t value) const = 0;
+
+ /// Compute hash for a 64-bit value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(int64_t value) const = 0;
+
+ /// Compute hash for float value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(float value) const = 0;
+
+ /// Compute hash for double value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(double value) const = 0;
+
+ /// Compute hash for Int96 value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(const Int96* value) const = 0;
+
+ /// Compute hash for ByteArray value by using its plain encoding result.
+ ///
+ /// @param value the value to hash.
+ /// @return hash result.
+ virtual uint64_t Hash(const ByteArray* value) const = 0;
+
+ /// Compute hash for fixed byte array value by using its plain encoding result.
+ ///
+ /// @param value the value address.
+ /// @param len the value length.
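+ /// @return hash result.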
+ virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
+
+ virtual ~Hasher() = default;
+};
+
+} // namespace parquet
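
The Hasher contract above is that every overload hashes the value's plain encoding. Below is a toy FNV-1a sketch for illustration only; it is not the library's production hasher, and it hashes native byte order, which matches the plain encoding only on little-endian machines.

#include <cstdint>

#include "parquet/hasher.h"
#include "parquet/types.h"

class Fnv1aHasher : public parquet::Hasher {
 public:
  uint64_t Hash(int32_t value) const override { return Bytes(&value, 4); }
  uint64_t Hash(int64_t value) const override { return Bytes(&value, 8); }
  uint64_t Hash(float value) const override { return Bytes(&value, 4); }
  uint64_t Hash(double value) const override { return Bytes(&value, 8); }
  uint64_t Hash(const parquet::Int96* value) const override {
    return Bytes(value->value, 12);  // Int96 is three 32-bit words.
  }
  uint64_t Hash(const parquet::ByteArray* value) const override {
    return Bytes(value->ptr, value->len);
  }
  uint64_t Hash(const parquet::FLBA* value, uint32_t len) const override {
    return Bytes(value->ptr, len);
  }

 private:
  static uint64_t Bytes(const void* data, uint32_t len) {
    const auto* p = static_cast<const uint8_t*>(data);
    uint64_t h = 14695981039346656037ULL;  // FNV offset basis.
    for (uint32_t i = 0; i < len; ++i) {
      h = (h ^ p[i]) * 1099511628211ULL;  // FNV prime.
    }
    return h;
  }
};
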
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc
index b0851f5cf1f..30614ae61fb 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.cc
@@ -1,82 +1,82 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/level_comparison.h"
-
-#define PARQUET_IMPL_NAMESPACE standard
-#include "parquet/level_comparison_inc.h"
-#undef PARQUET_IMPL_NAMESPACE
-
-#include <vector>
-
-#include "arrow/util/dispatch.h"
-
-namespace parquet {
-namespace internal {
-
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
-MinMax FindMinMaxAvx2(const int16_t* levels, int64_t num_levels);
-uint64_t GreaterThanBitmapAvx2(const int16_t* levels, int64_t num_levels, int16_t rhs);
-#endif
-
-namespace {
-
-using ::arrow::internal::DispatchLevel;
-using ::arrow::internal::DynamicDispatch;
-
-// The AVX2 variants declared above are defined in level_comparison_avx2.cc
-
-struct GreaterThanDynamicFunction {
- using FunctionType = decltype(&GreaterThanBitmap);
-
- static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
- return {
- { DispatchLevel::NONE, standard::GreaterThanBitmapImpl }
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
- , { DispatchLevel::AVX2, GreaterThanBitmapAvx2 }
-#endif
- };
- }
-};
-
-struct MinMaxDynamicFunction {
- using FunctionType = decltype(&FindMinMax);
-
- static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
- return {
- { DispatchLevel::NONE, standard::FindMinMaxImpl }
-#if defined(ARROW_HAVE_RUNTIME_AVX2)
- , { DispatchLevel::AVX2, FindMinMaxAvx2 }
-#endif
- };
- }
-};
-
-} // namespace
-
-uint64_t GreaterThanBitmap(const int16_t* levels, int64_t num_levels, int16_t rhs) {
- static DynamicDispatch<GreaterThanDynamicFunction> dispatch;
- return dispatch.func(levels, num_levels, rhs);
-}
-
-MinMax FindMinMax(const int16_t* levels, int64_t num_levels) {
- static DynamicDispatch<MinMaxDynamicFunction> dispatch;
- return dispatch.func(levels, num_levels);
-}
-
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/level_comparison.h"
+
+#define PARQUET_IMPL_NAMESPACE standard
+#include "parquet/level_comparison_inc.h"
+#undef PARQUET_IMPL_NAMESPACE
+
+#include <vector>
+
+#include "arrow/util/dispatch.h"
+
+namespace parquet {
+namespace internal {
+
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+MinMax FindMinMaxAvx2(const int16_t* levels, int64_t num_levels);
+uint64_t GreaterThanBitmapAvx2(const int16_t* levels, int64_t num_levels, int16_t rhs);
+#endif
+
+namespace {
+
+using ::arrow::internal::DispatchLevel;
+using ::arrow::internal::DynamicDispatch;
+
+// The AVX2 variants declared above are defined in level_comparison_avx2.cc
+
+struct GreaterThanDynamicFunction {
+ using FunctionType = decltype(&GreaterThanBitmap);
+
+ static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
+ return {
+ { DispatchLevel::NONE, standard::GreaterThanBitmapImpl }
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+ , { DispatchLevel::AVX2, GreaterThanBitmapAvx2 }
+#endif
+ };
+ }
+};
+
+struct MinMaxDynamicFunction {
+ using FunctionType = decltype(&FindMinMax);
+
+ static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
+ return {
+ { DispatchLevel::NONE, standard::FindMinMaxImpl }
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+ , { DispatchLevel::AVX2, FindMinMaxAvx2 }
+#endif
+ };
+ }
+};
+
+} // namespace
+
+uint64_t GreaterThanBitmap(const int16_t* levels, int64_t num_levels, int16_t rhs) {
+ static DynamicDispatch<GreaterThanDynamicFunction> dispatch;
+ return dispatch.func(levels, num_levels, rhs);
+}
+
+MinMax FindMinMax(const int16_t* levels, int64_t num_levels) {
+ static DynamicDispatch<MinMaxDynamicFunction> dispatch;
+ return dispatch.func(levels, num_levels);
+}
+
+} // namespace internal
+} // namespace parquet
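
The exported functions above resolve their implementation exactly once: a function-local static DynamicDispatch walks the candidate list and keeps the highest DispatchLevel the running CPU supports, so every later call is a plain indirect call. The selection machinery itself lives in arrow/util/dispatch.h, which is not part of this diff; the following is a minimal self-contained sketch of the same pattern, with a simplified DispatchLevel enum and a placeholder CPU probe standing in for Arrow's real API.

#include <cstdint>
#include <utility>
#include <vector>

// Stand-in for the dispatch levels in arrow/util/dispatch.h.
enum class DispatchLevel { NONE = 0, AVX2 = 1 };

// Placeholder CPU probe; a real build would consult CpuInfo/CPUID.
inline bool CpuHasAvx2() { return false; }

// Picks the best supported implementation once, at construction.
template <typename DynamicFunction>
struct DynamicDispatchSketch {
  using FunctionType = typename DynamicFunction::FunctionType;
  FunctionType func = nullptr;

  DynamicDispatchSketch() {
    for (const auto& impl : DynamicFunction::implementations()) {
      if (impl.first == DispatchLevel::NONE ||
          (impl.first == DispatchLevel::AVX2 && CpuHasAvx2())) {
        func = impl.second;  // later entries are higher dispatch levels
      }
    }
  }
};

// Scalar fallback with the same shape as standard::GreaterThanBitmapImpl.
inline uint64_t GreaterThanScalar(const int16_t* levels, int64_t n, int16_t rhs) {
  uint64_t mask = 0;
  for (int64_t i = 0; i < n; ++i) {
    mask |= static_cast<uint64_t>(levels[i] > rhs) << i;
  }
  return mask;
}

struct GreaterThanSketch {
  using FunctionType = decltype(&GreaterThanScalar);
  static std::vector<std::pair<DispatchLevel, FunctionType>> implementations() {
    return {{DispatchLevel::NONE, GreaterThanScalar}};
  }
};

// Mirrors the exported functions above: resolve once, then call through.
uint64_t GreaterThanDispatched(const int16_t* levels, int64_t n, int16_t rhs) {
  static DynamicDispatchSketch<GreaterThanSketch> dispatch;
  return dispatch.func(levels, n, rhs);
}
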
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h
index 2097e4db8a0..38e7ef8e2ec 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison.h
@@ -1,40 +1,40 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#pragma once
-
-#include <algorithm>
-#include <cstdint>
-
-#include "parquet/platform.h"
-
-namespace parquet {
-namespace internal {
-
-/// Builds a bitmap where each set bit indicates the corresponding level is greater
-/// than rhs.
-uint64_t PARQUET_EXPORT GreaterThanBitmap(const int16_t* levels, int64_t num_levels,
- int16_t rhs);
-
-struct MinMax {
- int16_t min;
- int16_t max;
-};
-
-MinMax FindMinMax(const int16_t* levels, int64_t num_levels);
-
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+
+#include "parquet/platform.h"
+
+namespace parquet {
+namespace internal {
+
+/// Builds a bitmap where each set bit indicates the corresponding level is greater
+/// than rhs.
+uint64_t PARQUET_EXPORT GreaterThanBitmap(const int16_t* levels, int64_t num_levels,
+ int16_t rhs);
+
+struct MinMax {
+ int16_t min;
+ int16_t max;
+};
+
+MinMax FindMinMax(const int16_t* levels, int64_t num_levels);
+
+} // namespace internal
+} // namespace parquet
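
A usage note for this header: the returned bitmap uses least-significant-bit ordering, so bit i of the result corresponds to levels[i]. A small illustrative check follows; the values are hypothetical, assume linking against the parquet library, and are not taken from any test in this change.

#include <cassert>
#include <cstdint>

#include "parquet/level_comparison.h"

int main() {
  const int16_t levels[] = {0, 2, 1, 3};
  // Bit i is set iff levels[i] > rhs, least-significant bit first:
  // {0, 2, 1, 3} with rhs = 1 gives bits {0, 1, 0, 1}, i.e. 0b1010.
  assert(parquet::internal::GreaterThanBitmap(levels, 4, 1) == 0b1010);

  parquet::internal::MinMax mm = parquet::internal::FindMinMax(levels, 4);
  assert(mm.min == 0 && mm.max == 3);
  return 0;
}
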
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h
index cc6bf382a50..e21c3e5824d 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_comparison_inc.h
@@ -1,65 +1,65 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#pragma once
-
-#include "arrow/util/bit_util.h"
-#include "arrow/util/endian.h"
-#include "parquet/level_comparison.h"
-
-// Used to make sure ODR rule isn't violated.
-#ifndef PARQUET_IMPL_NAMESPACE
-#error "PARQUET_IMPL_NAMESPACE must be defined"
-#endif
-namespace parquet {
-namespace internal {
-namespace PARQUET_IMPL_NAMESPACE {
-/// Builds a bitmap by applying predicate to the level vector provided.
-///
-/// \param[in] levels Rep or def level array.
-/// \param[in] num_levels The number of levels to process (must be [0, 64])
-/// \param[in] predicate The predicate to apply (must have the signature `bool
-/// predicate(int16_t)`.
-/// \returns The bitmap using least significant "bit" ordering.
-///
-template <typename Predicate>
-inline uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels,
- Predicate predicate) {
- // Both clang and GCC can vectorize this automatically with SSE4/AVX2.
- uint64_t mask = 0;
- for (int x = 0; x < num_levels; x++) {
- mask |= static_cast<uint64_t>(predicate(levels[x]) ? 1 : 0) << x;
- }
- return ::arrow::BitUtil::ToLittleEndian(mask);
-}
-
-inline MinMax FindMinMaxImpl(const int16_t* levels, int64_t num_levels) {
- MinMax out{std::numeric_limits<int16_t>::max(), std::numeric_limits<int16_t>::min()};
- for (int x = 0; x < num_levels; x++) {
- out.min = std::min(levels[x], out.min);
- out.max = std::max(levels[x], out.max);
- }
- return out;
-}
-
-inline uint64_t GreaterThanBitmapImpl(const int16_t* levels, int64_t num_levels,
- int16_t rhs) {
- return LevelsToBitmap(levels, num_levels, [rhs](int16_t value) { return value > rhs; });
-}
-
-} // namespace PARQUET_IMPL_NAMESPACE
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
+#include "parquet/level_comparison.h"
+
+// Used to make sure the One Definition Rule (ODR) isn't violated.
+#ifndef PARQUET_IMPL_NAMESPACE
+#error "PARQUET_IMPL_NAMESPACE must be defined"
+#endif
+namespace parquet {
+namespace internal {
+namespace PARQUET_IMPL_NAMESPACE {
+/// Builds a bitmap by applying the predicate to the level vector provided.
+///
+/// \param[in] levels Rep or def level array.
+/// \param[in] num_levels The number of levels to process (must be in [0, 64]).
+/// \param[in] predicate The predicate to apply (must have the signature `bool
+/// predicate(int16_t)`).
+/// \returns The bitmap using least-significant-bit ordering.
+///
+template <typename Predicate>
+inline uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels,
+ Predicate predicate) {
+ // Both clang and GCC can vectorize this automatically with SSE4/AVX2.
+ uint64_t mask = 0;
+ for (int x = 0; x < num_levels; x++) {
+ mask |= static_cast<uint64_t>(predicate(levels[x]) ? 1 : 0) << x;
+ }
+ return ::arrow::BitUtil::ToLittleEndian(mask);
+}
+
+inline MinMax FindMinMaxImpl(const int16_t* levels, int64_t num_levels) {
+ MinMax out{std::numeric_limits<int16_t>::max(), std::numeric_limits<int16_t>::min()};
+ for (int x = 0; x < num_levels; x++) {
+ out.min = std::min(levels[x], out.min);
+ out.max = std::max(levels[x], out.max);
+ }
+ return out;
+}
+
+inline uint64_t GreaterThanBitmapImpl(const int16_t* levels, int64_t num_levels,
+ int16_t rhs) {
+ return LevelsToBitmap(levels, num_levels, [rhs](int16_t value) { return value > rhs; });
+}
+
+} // namespace PARQUET_IMPL_NAMESPACE
+} // namespace internal
+} // namespace parquet
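
LevelsToBitmap is the scalar core that each SIMD translation unit recompiles under its own PARQUET_IMPL_NAMESPACE (hence the ODR guard above). Below is a standalone restatement of its contract, for illustration only; the ::arrow::BitUtil::ToLittleEndian step is omitted because it is a no-op on little-endian hosts.

#include <cassert>
#include <cstdint>

// Same loop shape as the template above; the ToLittleEndian call is dropped,
// which only matters on big-endian hosts.
template <typename Predicate>
uint64_t LevelsToBitmapSketch(const int16_t* levels, int64_t num_levels,
                              Predicate predicate) {
  uint64_t mask = 0;
  for (int64_t x = 0; x < num_levels; x++) {
    mask |= static_cast<uint64_t>(predicate(levels[x]) ? 1 : 0) << x;
  }
  return mask;
}

int main() {
  const int16_t levels[] = {1, 0, 2};
  // The predicate "level == 2" holds only at index 2, so only bit 2 is set.
  assert(LevelsToBitmapSketch(levels, 3, [](int16_t v) { return v == 2; }) == 0b100);
  return 0;
}
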
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc
index 998fd982fd7..ffdca476ddd 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.cc
@@ -1,183 +1,183 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#include "parquet/level_conversion.h"
-
-#include <algorithm>
-#include <limits>
-
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/cpu_info.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/optional.h"
-#include "parquet/exception.h"
-
-#include "parquet/level_comparison.h"
-#define PARQUET_IMPL_NAMESPACE standard
-#include "parquet/level_conversion_inc.h"
-#undef PARQUET_IMPL_NAMESPACE
-
-namespace parquet {
-namespace internal {
-namespace {
-
-using ::arrow::internal::CpuInfo;
-using ::arrow::util::optional;
-
-template <typename OffsetType>
-void DefRepLevelsToListInfo(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_def_levels, LevelInfo level_info,
- ValidityBitmapInputOutput* output, OffsetType* offsets) {
- OffsetType* orig_pos = offsets;
- optional<::arrow::internal::FirstTimeBitmapWriter> valid_bits_writer;
- if (output->valid_bits) {
- valid_bits_writer.emplace(output->valid_bits, output->valid_bits_offset,
- output->values_read_upper_bound);
- }
- for (int x = 0; x < num_def_levels; x++) {
- // Skip items that belong to empty or null ancestor lists and further nested lists.
- if (def_levels[x] < level_info.repeated_ancestor_def_level ||
- rep_levels[x] > level_info.rep_level) {
- continue;
- }
-
- if (rep_levels[x] == level_info.rep_level) {
- // A continuation of an existing list.
- // offsets can be null for structs with repeated children (we don't need to know
- // offsets until we get to the children).
- if (offsets != nullptr) {
- if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
- throw ParquetException("List index overflow.");
- }
- *offsets += 1;
- }
- } else {
- if (ARROW_PREDICT_FALSE(
- (valid_bits_writer.has_value() &&
- valid_bits_writer->position() >= output->values_read_upper_bound) ||
- (offsets - orig_pos) >= output->values_read_upper_bound)) {
- std::stringstream ss;
- ss << "Definition levels exceeded upper bound: "
- << output->values_read_upper_bound;
- throw ParquetException(ss.str());
- }
-
- // current_rep < list rep_level i.e. start of a list (ancestor empty lists are
- // filtered out above).
- // offsets can be null for structs with repeated children (we don't need to know
- // offsets until we get to the children).
- if (offsets != nullptr) {
- ++offsets;
- // Use cumulative offsets because variable size lists are more common then
- // fixed size lists so it should be cheaper to make these cumulative and
- // subtract when validating fixed size lists.
- *offsets = *(offsets - 1);
- if (def_levels[x] >= level_info.def_level) {
- if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
- throw ParquetException("List index overflow.");
- }
- *offsets += 1;
- }
- }
-
- if (valid_bits_writer.has_value()) {
- // the level_info def level for lists reflects element present level.
- // the prior level distinguishes between empty lists.
- if (def_levels[x] >= level_info.def_level - 1) {
- valid_bits_writer->Set();
- } else {
- output->null_count++;
- valid_bits_writer->Clear();
- }
- valid_bits_writer->Next();
- }
- }
- }
- if (valid_bits_writer.has_value()) {
- valid_bits_writer->Finish();
- }
- if (offsets != nullptr) {
- output->values_read = offsets - orig_pos;
- } else if (valid_bits_writer.has_value()) {
- output->values_read = valid_bits_writer->position();
- }
- if (output->null_count > 0 && level_info.null_slot_usage > 1) {
- throw ParquetException(
- "Null values with null_slot_usage > 1 not supported."
- "(i.e. FixedSizeLists with null values are not supported)");
- }
-}
-
-} // namespace
-
-#if defined(ARROW_HAVE_RUNTIME_BMI2)
-// defined in level_conversion_bmi2.cc for dynamic dispatch.
-void DefLevelsToBitmapBmi2WithRepeatedParent(const int16_t* def_levels,
- int64_t num_def_levels, LevelInfo level_info,
- ValidityBitmapInputOutput* output);
-#endif
-
-void DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
- LevelInfo level_info, ValidityBitmapInputOutput* output) {
- // It is simpler to rely on rep_level here until PARQUET-1899 is done and the code
- // is deleted in a follow-up release.
- if (level_info.rep_level > 0) {
-#if defined(ARROW_HAVE_RUNTIME_BMI2)
- if (CpuInfo::GetInstance()->HasEfficientBmi2()) {
- return DefLevelsToBitmapBmi2WithRepeatedParent(def_levels, num_def_levels,
- level_info, output);
- }
-#endif
- standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/true>(
- def_levels, num_def_levels, level_info, output);
- } else {
- standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/false>(
- def_levels, num_def_levels, level_info, output);
- }
-}
-
-uint64_t TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
- return standard::ExtractBitsSoftware(bitmap, select_bitmap);
-}
-
-void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_def_levels, LevelInfo level_info,
- ValidityBitmapInputOutput* output, int32_t* offsets) {
- DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
- output, offsets);
-}
-
-void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_def_levels, LevelInfo level_info,
- ValidityBitmapInputOutput* output, int64_t* offsets) {
- DefRepLevelsToListInfo<int64_t>(def_levels, rep_levels, num_def_levels, level_info,
- output, offsets);
-}
-
-void DefRepLevelsToBitmap(const int16_t* def_levels, const int16_t* rep_levels,
- int64_t num_def_levels, LevelInfo level_info,
- ValidityBitmapInputOutput* output) {
- // DefReplevelsToListInfo assumes it for the actual list method and this
- // method is for parent structs, so we need to bump def and ref level.
- level_info.rep_level += 1;
- level_info.def_level += 1;
- DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
- output, /*offsets=*/nullptr);
-}
-
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "parquet/level_conversion.h"
+
+#include <algorithm>
+#include <limits>
+
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+#include "parquet/exception.h"
+
+#include "parquet/level_comparison.h"
+#define PARQUET_IMPL_NAMESPACE standard
+#include "parquet/level_conversion_inc.h"
+#undef PARQUET_IMPL_NAMESPACE
+
+namespace parquet {
+namespace internal {
+namespace {
+
+using ::arrow::internal::CpuInfo;
+using ::arrow::util::optional;
+
+template <typename OffsetType>
+void DefRepLevelsToListInfo(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output, OffsetType* offsets) {
+ OffsetType* orig_pos = offsets;
+ optional<::arrow::internal::FirstTimeBitmapWriter> valid_bits_writer;
+ if (output->valid_bits) {
+ valid_bits_writer.emplace(output->valid_bits, output->valid_bits_offset,
+ output->values_read_upper_bound);
+ }
+ for (int x = 0; x < num_def_levels; x++) {
+ // Skip items that belong to empty or null ancestor lists and further nested lists.
+ if (def_levels[x] < level_info.repeated_ancestor_def_level ||
+ rep_levels[x] > level_info.rep_level) {
+ continue;
+ }
+
+ if (rep_levels[x] == level_info.rep_level) {
+ // A continuation of an existing list.
+ // offsets can be null for structs with repeated children (we don't need to know
+ // offsets until we get to the children).
+ if (offsets != nullptr) {
+ if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
+ throw ParquetException("List index overflow.");
+ }
+ *offsets += 1;
+ }
+ } else {
+ if (ARROW_PREDICT_FALSE(
+ (valid_bits_writer.has_value() &&
+ valid_bits_writer->position() >= output->values_read_upper_bound) ||
+ (offsets - orig_pos) >= output->values_read_upper_bound)) {
+ std::stringstream ss;
+ ss << "Definition levels exceeded upper bound: "
+ << output->values_read_upper_bound;
+ throw ParquetException(ss.str());
+ }
+
+ // current_rep < list rep_level, i.e. the start of a list (ancestor empty
+ // lists are filtered out above).
+ // offsets can be null for structs with repeated children (we don't need to know
+ // offsets until we get to the children).
+ if (offsets != nullptr) {
+ ++offsets;
+ // Use cumulative offsets because variable size lists are more common than
+ // fixed size lists, so it should be cheaper to make these cumulative and
+ // subtract when validating fixed size lists.
+ *offsets = *(offsets - 1);
+ if (def_levels[x] >= level_info.def_level) {
+ if (ARROW_PREDICT_FALSE(*offsets == std::numeric_limits<OffsetType>::max())) {
+ throw ParquetException("List index overflow.");
+ }
+ *offsets += 1;
+ }
+ }
+
+ if (valid_bits_writer.has_value()) {
+ // The level_info def level for lists reflects the element-present level;
+ // the prior level distinguishes empty from non-empty lists.
+ if (def_levels[x] >= level_info.def_level - 1) {
+ valid_bits_writer->Set();
+ } else {
+ output->null_count++;
+ valid_bits_writer->Clear();
+ }
+ valid_bits_writer->Next();
+ }
+ }
+ }
+ if (valid_bits_writer.has_value()) {
+ valid_bits_writer->Finish();
+ }
+ if (offsets != nullptr) {
+ output->values_read = offsets - orig_pos;
+ } else if (valid_bits_writer.has_value()) {
+ output->values_read = valid_bits_writer->position();
+ }
+ if (output->null_count > 0 && level_info.null_slot_usage > 1) {
+ throw ParquetException(
+ "Null values with null_slot_usage > 1 not supported. "
+ "(i.e. FixedSizeLists with null values are not supported)");
+ }
+}
+
+} // namespace
+
+#if defined(ARROW_HAVE_RUNTIME_BMI2)
+// defined in level_conversion_bmi2.cc for dynamic dispatch.
+void DefLevelsToBitmapBmi2WithRepeatedParent(const int16_t* def_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output);
+#endif
+
+void DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
+ LevelInfo level_info, ValidityBitmapInputOutput* output) {
+ // It is simpler to rely on rep_level here until PARQUET-1899 is done and the code
+ // is deleted in a follow-up release.
+ if (level_info.rep_level > 0) {
+#if defined(ARROW_HAVE_RUNTIME_BMI2)
+ if (CpuInfo::GetInstance()->HasEfficientBmi2()) {
+ return DefLevelsToBitmapBmi2WithRepeatedParent(def_levels, num_def_levels,
+ level_info, output);
+ }
+#endif
+ standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/true>(
+ def_levels, num_def_levels, level_info, output);
+ } else {
+ standard::DefLevelsToBitmapSimd</*has_repeated_parent=*/false>(
+ def_levels, num_def_levels, level_info, output);
+ }
+}
+
+uint64_t TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
+ return standard::ExtractBitsSoftware(bitmap, select_bitmap);
+}
+
+void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output, int32_t* offsets) {
+ DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
+ output, offsets);
+}
+
+void DefRepLevelsToList(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output, int64_t* offsets) {
+ DefRepLevelsToListInfo<int64_t>(def_levels, rep_levels, num_def_levels, level_info,
+ output, offsets);
+}
+
+void DefRepLevelsToBitmap(const int16_t* def_levels, const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output) {
+ // DefRepLevelsToListInfo assumes the levels are for an actual list, while
+ // this method is for parent structs, so we need to bump the def and rep levels.
+ level_info.rep_level += 1;
+ level_info.def_level += 1;
+ DefRepLevelsToListInfo<int32_t>(def_levels, rep_levels, num_def_levels, level_info,
+ output, /*offsets=*/nullptr);
+}
+
+} // namespace internal
+} // namespace parquet
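
A sketch of the calling convention for DefRepLevelsToList, which is easy to get wrong: the caller sizes offsets to 1 + values_read_upper_bound, seeds offsets[0], and provides the validity bitmap. The schema and level values below are an assumed example (an optional list of optional int32 in the usual 3-level encoding: def 0 = null list, 1 = empty list, 2 = null element, 3 = present element), not something exercised by this diff.

#include <cassert>
#include <cstdint>

#include "parquet/level_conversion.h"

int main() {
  using parquet::internal::DefRepLevelsToList;
  using parquet::internal::LevelInfo;
  using parquet::internal::ValidityBitmapInputOutput;

  // Four records, one top-level list each: null, [], [null], [42].
  const int16_t def_levels[] = {0, 1, 2, 3};
  const int16_t rep_levels[] = {0, 0, 0, 0};

  LevelInfo level_info;
  level_info.rep_level = 1;                    // the list's own repetition level
  level_info.def_level = 2;                    // "element present" level for the list
  level_info.repeated_ancestor_def_level = 0;  // top-level field

  uint8_t valid_bits[1] = {0};
  int32_t offsets[5] = {0};  // 1 + values_read_upper_bound; offsets[0] seeded

  ValidityBitmapInputOutput out;
  out.values_read_upper_bound = 4;
  out.valid_bits = valid_bits;

  DefRepLevelsToList(def_levels, rep_levels, /*num_def_levels=*/4, level_info,
                     &out, offsets);

  assert(out.values_read == 4);
  assert(out.null_count == 1);                 // only the def == 0 record is null
  assert(offsets[1] == 0 && offsets[2] == 0);  // null and empty lists add nothing
  assert(offsets[3] == 1 && offsets[4] == 2);  // the two one-element lists
  assert((valid_bits[0] & 0xF) == 0b1110);     // [null, valid, valid, valid]
  return 0;
}
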
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h
index fa6a23c1d9b..e45a288e8c0 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion.h
@@ -1,199 +1,199 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-
-#include "arrow/util/endian.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-
-namespace parquet {
-namespace internal {
-
-struct PARQUET_EXPORT LevelInfo {
- LevelInfo()
- : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {}
- LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level,
- int32_t repeated_ancestor_definition_level)
- : null_slot_usage(null_slots),
- def_level(definition_level),
- rep_level(repetition_level),
- repeated_ancestor_def_level(repeated_ancestor_definition_level) {}
-
- bool operator==(const LevelInfo& b) const {
- return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
- rep_level == b.rep_level &&
- repeated_ancestor_def_level == b.repeated_ancestor_def_level;
- }
-
- bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; }
-
- // How many slots an undefined but present (i.e. null) element in
- // parquet consumes when decoding to Arrow.
- // "Slot" is used in the same context as the Arrow specification
- // (i.e. a value holder).
- // This is only ever >1 for descendents of FixedSizeList.
- int32_t null_slot_usage = 1;
-
- // The definition level at which the value for the field
- // is considered not null (definition levels greater than
- // or equal to this value indicate a not-null
- // value for the field). For list fields definition levels
- // greater than or equal to this field indicate a present,
- // possibly null, child value.
- int16_t def_level = 0;
-
- // The repetition level corresponding to this element
- // or the closest repeated ancestor. Any repetition
- // level less than this indicates either a new list OR
- // an empty list (which is determined in conjunction
- // with definition levels).
- int16_t rep_level = 0;
-
- // The definition level indicating the level at which the closest
- // repeated ancestor is not empty. This is used to discriminate
- // between a value less than |def_level| being null or excluded entirely.
- // For instance if we have an arrow schema like:
- // list(struct(f0: int)). Then then there are the following
- // definition levels:
- // 0 = null list
- // 1 = present but empty list.
- // 2 = a null value in the list
- // 3 = a non null struct but null integer.
- // 4 = a present integer.
- // When reconstructing, the struct and integer arrays'
- // repeated_ancestor_def_level would be 2. Any
- // def_level < 2 indicates that there isn't a corresponding
- // child value in the list.
- // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
- // has the def levels [0, 1, 2, 3, 4]. The actual
- // struct array is only of length 3: [not-set, set, set] and
- // the int array is also of length 3: [N/A, null, 1].
- //
- int16_t repeated_ancestor_def_level = 0;
-
- /// Increments levels according to the cardinality of node.
- void Increment(const schema::Node& node) {
- if (node.is_repeated()) {
- IncrementRepeated();
- return;
- }
- if (node.is_optional()) {
- IncrementOptional();
- return;
- }
- }
-
- /// Incremetns level for a optional node.
- void IncrementOptional() { def_level++; }
-
- /// Increments levels for the repeated node. Returns
- /// the previous ancestor_list_def_level.
- int16_t IncrementRepeated() {
- int16_t last_repeated_ancestor = repeated_ancestor_def_level;
-
- // Repeated fields add both a repetition and definition level. This is used
- // to distinguish between an empty list and a list with an item in it.
- ++rep_level;
- ++def_level;
- // For levels >= repeated_ancenstor_def_level it indicates the list was
- // non-null and had at least one element. This is important
- // for later decoding because we need to add a slot for these
- // values. for levels < current_def_level no slots are added
- // to arrays.
- repeated_ancestor_def_level = def_level;
- return last_repeated_ancestor;
- }
-
- friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) {
- // This print method is to silence valgrind issues. What's printed
- // is not important because all asserts happen directly on
- // members.
- os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
- << ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level;
- if (levels.null_slot_usage > 1) {
- os << ", null_slot_usage=" << levels.null_slot_usage;
- }
- os << "}";
- return os;
- }
-};
-
-// Input/Output structure for reconstructed validity bitmaps.
-struct PARQUET_EXPORT ValidityBitmapInputOutput {
- // Input only.
- // The maximum number of values_read expected (actual
- // values read must be less than or equal to this value).
- // If this number is exceeded methods will throw a
- // ParquetException. Exceeding this limit indicates
- // either a corrupt or incorrectly written file.
- int64_t values_read_upper_bound = 0;
- // Output only. The number of values added to the encountered
- // (this is logically the count of the number of elements
- // for an Arrow array).
- int64_t values_read = 0;
- // Input/Output. The number of nulls encountered.
- int64_t null_count = 0;
- // Output only. The validity bitmap to populate. May be be null only
- // for DefRepLevelsToListInfo (if all that is needed is list offsets).
- uint8_t* valid_bits = NULLPTR;
- // Input only, offset into valid_bits to start at.
- int64_t valid_bits_offset = 0;
-};
-
-// Converts def_levels to validity bitmaps for non-list arrays and structs that have
-// at least one member that is not a list and has no list descendents.
-// For lists use DefRepLevelsToList and structs where all descendants contain
-// a list use DefRepLevelsToBitmap.
-void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
- LevelInfo level_info,
- ValidityBitmapInputOutput* output);
-
-// Reconstructs a validity bitmap and list offsets for a list arrays based on
-// def/rep levels. The first element of offsets will not be modified if rep_levels
-// starts with a new list. The first element of offsets will be used when calculating
-// the next offset. See documentation onf DefLevelsToBitmap for when to use this
-// method vs the other ones in this file for reconstruction.
-//
-// Offsets must be sized to 1 + values_read_upper_bound.
-void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
- const int16_t* rep_levels, int64_t num_def_levels,
- LevelInfo level_info,
- ValidityBitmapInputOutput* output,
- int32_t* offsets);
-void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
- const int16_t* rep_levels, int64_t num_def_levels,
- LevelInfo level_info,
- ValidityBitmapInputOutput* output,
- int64_t* offsets);
-
-// Reconstructs a validity bitmap for a struct every member is a list or has
-// a list descendant. See documentation on DefLevelsToBitmap for when more
-// details on this method compared to the other ones defined above.
-void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels,
- const int16_t* rep_levels,
- int64_t num_def_levels, LevelInfo level_info,
- ValidityBitmapInputOutput* output);
-
-// This is exposed to ensure we can properly test a software simulated pext function
-// (i.e. it isn't hidden by runtime dispatch).
-uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection);
-
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/util/endian.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+namespace internal {
+
+struct PARQUET_EXPORT LevelInfo {
+ LevelInfo()
+ : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {}
+ LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level,
+ int32_t repeated_ancestor_definition_level)
+ : null_slot_usage(null_slots),
+ def_level(definition_level),
+ rep_level(repetition_level),
+ repeated_ancestor_def_level(repeated_ancestor_definition_level) {}
+
+ bool operator==(const LevelInfo& b) const {
+ return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
+ rep_level == b.rep_level &&
+ repeated_ancestor_def_level == b.repeated_ancestor_def_level;
+ }
+
+ bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; }
+
+ // How many slots an undefined but present (i.e. null) element in
+ // parquet consumes when decoding to Arrow.
+ // "Slot" is used in the same context as the Arrow specification
+ // (i.e. a value holder).
+ // This is only ever >1 for descendants of FixedSizeList.
+ int32_t null_slot_usage = 1;
+
+ // The definition level at which the value for the field
+ // is considered not null (definition levels greater than
+ // or equal to this value indicate a not-null
+ // value for the field). For list fields definition levels
+ // greater than or equal to this value indicate a present,
+ // possibly null, child value.
+ int16_t def_level = 0;
+
+ // The repetition level corresponding to this element
+ // or the closest repeated ancestor. Any repetition
+ // level less than this indicates either a new list OR
+ // an empty list (which is determined in conjunction
+ // with definition levels).
+ int16_t rep_level = 0;
+
+ // The definition level indicating the level at which the closest
+ // repeated ancestor is not empty. This is used to discriminate
+ // between a value less than |def_level| being null or excluded entirely.
+ // For instance, if we have an arrow schema like:
+ // list(struct(f0: int)), then there are the following
+ // definition levels:
+ //  0 = a null list.
+ //  1 = a present but empty list.
+ //  2 = a null value in the list.
+ //  3 = a non-null struct but a null integer.
+ //  4 = a present integer.
+ // When reconstructing, the struct and integer arrays'
+ // repeated_ancestor_def_level would be 2. Any
+ // def_level < 2 indicates that there isn't a corresponding
+ // child value in the list.
+ // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
+ // has the def levels [0, 1, 2, 3, 4]. The actual
+ // struct array is only of length 3: [not-set, set, set] and
+ // the int array is also of length 3: [N/A, null, 1].
+ //
+ int16_t repeated_ancestor_def_level = 0;
+
+ /// Increments levels according to the cardinality of the node.
+ void Increment(const schema::Node& node) {
+ if (node.is_repeated()) {
+ IncrementRepeated();
+ return;
+ }
+ if (node.is_optional()) {
+ IncrementOptional();
+ return;
+ }
+ }
+
+ /// Increments the definition level for an optional node.
+ void IncrementOptional() { def_level++; }
+
+ /// Increments levels for the repeated node. Returns
+ /// the previous repeated_ancestor_def_level.
+ int16_t IncrementRepeated() {
+ int16_t last_repeated_ancestor = repeated_ancestor_def_level;
+
+ // Repeated fields add both a repetition and definition level. This is used
+ // to distinguish between an empty list and a list with an item in it.
+ ++rep_level;
+ ++def_level;
+ // Levels >= repeated_ancestor_def_level indicate the list was
+ // non-null and had at least one element. This is important
+ // for later decoding because we need to add a slot for these
+ // values. For levels < the current def_level no slots are added
+ // to arrays.
+ repeated_ancestor_def_level = def_level;
+ return last_repeated_ancestor;
+ }
+
+ friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) {
+ // This print method is to silence valgrind issues. What's printed
+ // is not important because all asserts happen directly on
+ // members.
+ os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+ << ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level;
+ if (levels.null_slot_usage > 1) {
+ os << ", null_slot_usage=" << levels.null_slot_usage;
+ }
+ os << "}";
+ return os;
+ }
+};
+
+// Input/Output structure for reconstructed validity bitmaps.
+struct PARQUET_EXPORT ValidityBitmapInputOutput {
+ // Input only.
+ // The maximum number of values_read expected (actual
+ // values read must be less than or equal to this value).
+ // If this number is exceeded methods will throw a
+ // ParquetException. Exceeding this limit indicates
+ // either a corrupt or incorrectly written file.
+ int64_t values_read_upper_bound = 0;
+ // Output only. The number of values encountered (this is
+ // logically the element count of the corresponding
+ // Arrow array).
+ int64_t values_read = 0;
+ // Input/Output. The number of nulls encountered.
+ int64_t null_count = 0;
+ // Output only. The validity bitmap to populate. May be null only
+ // for DefRepLevelsToListInfo (if all that is needed is list offsets).
+ uint8_t* valid_bits = NULLPTR;
+ // Input only, offset into valid_bits to start at.
+ int64_t valid_bits_offset = 0;
+};
+
+// Converts def_levels to validity bitmaps for non-list arrays and for structs
+// that have at least one member that is not a list and has no list descendants.
+// For lists use DefRepLevelsToList; for structs where all descendants contain
+// a list use DefRepLevelsToBitmap.
+void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
+ LevelInfo level_info,
+ ValidityBitmapInputOutput* output);
+
+// Reconstructs a validity bitmap and list offsets for a list array based on
+// def/rep levels. The first element of offsets will not be modified if rep_levels
+// starts with a new list. The first element of offsets will be used when calculating
+// the next offset. See the documentation of DefLevelsToBitmap for when to use this
+// method vs the other ones in this file for reconstruction.
+//
+// Offsets must be sized to 1 + values_read_upper_bound.
+void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
+ const int16_t* rep_levels, int64_t num_def_levels,
+ LevelInfo level_info,
+ ValidityBitmapInputOutput* output,
+ int32_t* offsets);
+void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
+ const int16_t* rep_levels, int64_t num_def_levels,
+ LevelInfo level_info,
+ ValidityBitmapInputOutput* output,
+ int64_t* offsets);
+
+// Reconstructs a validity bitmap for a struct whose every member is a list or
+// has a list descendant. See the documentation of DefLevelsToBitmap for more
+// details on this method compared to the other ones defined above.
+void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels,
+ const int16_t* rep_levels,
+ int64_t num_def_levels, LevelInfo level_info,
+ ValidityBitmapInputOutput* output);
+
+// This is exposed to ensure we can properly test a software simulated pext function
+// (i.e. it isn't hidden by runtime dispatch).
+uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection);
+
+} // namespace internal
+} // namespace parquet
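
The list(struct(f0: int)) example in the LevelInfo comments can be checked mechanically through DefRepLevelsToBitmap. The LevelInfo values below are the ones that example implies for the struct child; treat this as an illustrative harness under those assumptions, not a test from this change.

#include <cassert>
#include <cstdint>

#include "parquet/level_conversion.h"

int main() {
  using parquet::internal::DefRepLevelsToBitmap;
  using parquet::internal::LevelInfo;
  using parquet::internal::ValidityBitmapInputOutput;

  // [null, [], [null], [{f0: null}], [{f0: 1}]] -> def levels 0..4, no repeats.
  const int16_t def_levels[] = {0, 1, 2, 3, 4};
  const int16_t rep_levels[] = {0, 0, 0, 0, 0};

  LevelInfo struct_info;
  struct_info.rep_level = 1;                    // the enclosing list
  struct_info.def_level = 3;                    // struct present
  struct_info.repeated_ancestor_def_level = 2;  // list present and non-empty

  uint8_t valid_bits[1] = {0};
  ValidityBitmapInputOutput out;
  out.values_read_upper_bound = 5;
  out.valid_bits = valid_bits;

  DefRepLevelsToBitmap(def_levels, rep_levels, /*num_def_levels=*/5, struct_info,
                       &out);

  // Only three struct slots exist; def levels 0 and 1 produce no slot at all.
  assert(out.values_read == 3);
  assert(out.null_count == 1);             // the def == 2 entry: a null struct
  assert((valid_bits[0] & 0x7) == 0b110);  // [not-set, set, set]
  return 0;
}
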
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h
index fd06b7334dd..75c7716c483 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/level_conversion_inc.h
@@ -1,357 +1,357 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-#pragma once
-
-#include "parquet/level_conversion.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <limits>
-
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/bit_util.h"
-#include "arrow/util/bitmap_writer.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/simd.h"
-#include "parquet/exception.h"
-#include "parquet/level_comparison.h"
-
-namespace parquet {
-namespace internal {
-#ifndef PARQUET_IMPL_NAMESPACE
-#error "PARQUET_IMPL_NAMESPACE must be defined"
-#endif
-namespace PARQUET_IMPL_NAMESPACE {
-
-// clang-format off
-/* Python code to generate lookup table:
-
-kLookupBits = 5
-count = 0
-print('constexpr int kLookupBits = {};'.format(kLookupBits))
-print('constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {')
-print(' ', end = '')
-for mask in range(1 << kLookupBits):
- for data in range(1 << kLookupBits):
- bit_value = 0
- bit_len = 0
- for i in range(kLookupBits):
- if mask & (1 << i):
- bit_value |= (((data >> i) & 1) << bit_len)
- bit_len += 1
- out = '0x{:02X},'.format(bit_value)
- count += 1
- if count % (1 << kLookupBits) == 1:
- print(' {')
- if count % 8 == 1:
- print(' ', end = '')
- if count % 8 == 0:
- print(out, end = '\n')
- else:
- print(out, end = ' ')
- if count % (1 << kLookupBits) == 0:
- print(' },', end = '')
-print('\n};')
-
-*/
-// clang-format on
-
-constexpr int kLookupBits = 5;
-constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {
- {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
- 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
- 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
- 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
- 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
- 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
- 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
- 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
- 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
- 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
- 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
- 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
- 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
- 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
- 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
- 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
- 0x03, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
- 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
- 0x03, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
- 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
- 0x07, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
- 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
- 0x02, 0x03, 0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
- 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
- 0x05, 0x06, 0x07, 0x06, 0x07, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
- 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
- 0x05, 0x06, 0x06, 0x07, 0x07, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
- 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
- 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
- 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
- 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
- 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
- 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
- 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02,
- 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
- 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05,
- 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
- 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03,
- 0x03, 0x03, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
- 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07,
- 0x06, 0x07, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
- 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06,
- 0x07, 0x07, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
- 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
- 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
- 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
- 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
- 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05,
- 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
- 0x03, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x04, 0x04,
- 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
- 0x07, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09,
- 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F,
- },
- {
- 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
- 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05,
- 0x05, 0x05, 0x06, 0x06, 0x06, 0x06, 0x07, 0x07, 0x07, 0x07,
- },
- {
- 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
- 0x05, 0x06, 0x07, 0x06, 0x07, 0x08, 0x09, 0x08, 0x09, 0x0A, 0x0B,
- 0x0A, 0x0B, 0x0C, 0x0D, 0x0C, 0x0D, 0x0E, 0x0F, 0x0E, 0x0F,
- },
- {
- 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
- 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0A, 0x0A,
- 0x0B, 0x0B, 0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F,
- },
- {
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
- 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
- 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
- },
-};
-
-inline uint64_t ExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
- // A software emulation of _pext_u64
-
- // These checks should be inline and are likely to be common cases.
- if (select_bitmap == ~uint64_t{0}) {
- return bitmap;
- } else if (select_bitmap == 0) {
- return 0;
- }
-
- // Fallback to lookup table method
- uint64_t bit_value = 0;
- int bit_len = 0;
- constexpr uint8_t kLookupMask = (1U << kLookupBits) - 1;
- while (select_bitmap != 0) {
- const auto mask_len = ARROW_POPCOUNT32(select_bitmap & kLookupMask);
- const uint64_t value = kPextTable[select_bitmap & kLookupMask][bitmap & kLookupMask];
- bit_value |= (value << bit_len);
- bit_len += mask_len;
- bitmap >>= kLookupBits;
- select_bitmap >>= kLookupBits;
- }
- return bit_value;
-}
-
-#ifdef ARROW_HAVE_BMI2
-
-// Use _pext_u64 on 64-bit builds, _pext_u32 on 32-bit builds,
-#if UINTPTR_MAX == 0xFFFFFFFF
-
-using extract_bitmap_t = uint32_t;
-inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
- extract_bitmap_t select_bitmap) {
- return _pext_u32(bitmap, select_bitmap);
-}
-
-#else
-
-using extract_bitmap_t = uint64_t;
-inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
- extract_bitmap_t select_bitmap) {
- return _pext_u64(bitmap, select_bitmap);
-}
-
-#endif
-
-#else // !defined(ARROW_HAVE_BMI2)
-
-// Use 64-bit pext emulation when BMI2 isn't available.
-using extract_bitmap_t = uint64_t;
-inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
- extract_bitmap_t select_bitmap) {
- return ExtractBitsSoftware(bitmap, select_bitmap);
-}
-
-#endif
-
-static constexpr int64_t kExtractBitsSize = 8 * sizeof(extract_bitmap_t);
-
-template <bool has_repeated_parent>
-int64_t DefLevelsBatchToBitmap(const int16_t* def_levels, const int64_t batch_size,
- int64_t upper_bound_remaining, LevelInfo level_info,
- ::arrow::internal::FirstTimeBitmapWriter* writer) {
- DCHECK_LE(batch_size, kExtractBitsSize);
-
- // Greater than level_info.def_level - 1 implies >= the def_level
- auto defined_bitmap = static_cast<extract_bitmap_t>(
- internal::GreaterThanBitmap(def_levels, batch_size, level_info.def_level - 1));
-
- if (has_repeated_parent) {
- // Greater than level_info.repeated_ancestor_def_level - 1 implies >= the
- // repeated_ancestor_def_level
- auto present_bitmap = static_cast<extract_bitmap_t>(internal::GreaterThanBitmap(
- def_levels, batch_size, level_info.repeated_ancestor_def_level - 1));
- auto selected_bits = ExtractBits(defined_bitmap, present_bitmap);
- int64_t selected_count = ::arrow::BitUtil::PopCount(present_bitmap);
- if (ARROW_PREDICT_FALSE(selected_count > upper_bound_remaining)) {
- throw ParquetException("Values read exceeded upper bound");
- }
- writer->AppendWord(selected_bits, selected_count);
- return ::arrow::BitUtil::PopCount(selected_bits);
- } else {
- if (ARROW_PREDICT_FALSE(batch_size > upper_bound_remaining)) {
- std::stringstream ss;
- ss << "Values read exceeded upper bound";
- throw ParquetException(ss.str());
- }
-
- writer->AppendWord(defined_bitmap, batch_size);
- return ::arrow::BitUtil::PopCount(defined_bitmap);
- }
-}
-
-template <bool has_repeated_parent>
-void DefLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels,
- LevelInfo level_info, ValidityBitmapInputOutput* output) {
- ::arrow::internal::FirstTimeBitmapWriter writer(
- output->valid_bits,
- /*start_offset=*/output->valid_bits_offset,
- /*length=*/num_def_levels);
- int64_t set_count = 0;
- output->values_read = 0;
- int64_t values_read_remaining = output->values_read_upper_bound;
- while (num_def_levels > kExtractBitsSize) {
- set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
- def_levels, kExtractBitsSize, values_read_remaining, level_info, &writer);
- def_levels += kExtractBitsSize;
- num_def_levels -= kExtractBitsSize;
- values_read_remaining = output->values_read_upper_bound - writer.position();
- }
- set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
- def_levels, num_def_levels, values_read_remaining, level_info, &writer);
-
- output->values_read = writer.position();
- output->null_count += output->values_read - set_count;
- writer.Finish();
-}
-
-} // namespace PARQUET_IMPL_NAMESPACE
-} // namespace internal
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "parquet/level_conversion.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/simd.h"
+#include "parquet/exception.h"
+#include "parquet/level_comparison.h"
+
+namespace parquet {
+namespace internal {
+#ifndef PARQUET_IMPL_NAMESPACE
+#error "PARQUET_IMPL_NAMESPACE must be defined"
+#endif
+namespace PARQUET_IMPL_NAMESPACE {
+
+// clang-format off
+/* Python code to generate lookup table:
+
+kLookupBits = 5
+count = 0
+print('constexpr int kLookupBits = {};'.format(kLookupBits))
+print('constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {')
+print(' ', end = '')
+for mask in range(1 << kLookupBits):
+ for data in range(1 << kLookupBits):
+ bit_value = 0
+ bit_len = 0
+ for i in range(kLookupBits):
+ if mask & (1 << i):
+ bit_value |= (((data >> i) & 1) << bit_len)
+ bit_len += 1
+ out = '0x{:02X},'.format(bit_value)
+ count += 1
+ if count % (1 << kLookupBits) == 1:
+ print(' {')
+ if count % 8 == 1:
+ print(' ', end = '')
+ if count % 8 == 0:
+ print(out, end = '\n')
+ else:
+ print(out, end = ' ')
+ if count % (1 << kLookupBits) == 0:
+ print(' },', end = '')
+print('\n};')
+
+*/
+// clang-format on
+
+constexpr int kLookupBits = 5;
+constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
+ 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
+ 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
+ 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
+ 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
+ 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
+ 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
+ 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
+ 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
+ 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
+ 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+ 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
+ 0x03, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+ 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
+ 0x03, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
+ 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+ 0x07, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
+ 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
+ 0x02, 0x03, 0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+ 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
+ 0x05, 0x06, 0x07, 0x06, 0x07, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
+ 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
+ 0x05, 0x06, 0x06, 0x07, 0x07, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
+ 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
+ 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+ 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
+ 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
+ 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
+ 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02,
+ 0x03, 0x03, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
+ 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05,
+ 0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03,
+ 0x03, 0x03, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
+ 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07,
+ 0x06, 0x07, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
+ 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06,
+ 0x07, 0x07, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
+ 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+ 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
+ 0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05,
+ 0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
+ 0x03, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x04, 0x04,
+ 0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+ 0x07, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09,
+ 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F,
+ },
+ {
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
+ 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05,
+ 0x05, 0x05, 0x06, 0x06, 0x06, 0x06, 0x07, 0x07, 0x07, 0x07,
+ },
+ {
+ 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
+ 0x05, 0x06, 0x07, 0x06, 0x07, 0x08, 0x09, 0x08, 0x09, 0x0A, 0x0B,
+ 0x0A, 0x0B, 0x0C, 0x0D, 0x0C, 0x0D, 0x0E, 0x0F, 0x0E, 0x0F,
+ },
+ {
+ 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
+ 0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0A, 0x0A,
+ 0x0B, 0x0B, 0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F,
+ },
+ {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
+ 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
+ 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+ },
+};
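+
+// A small worked example of the table's pext semantics (values taken from the
+// rows above): mask 0b00101 selects bits 0 and 2 of the data word 0b00111;
+// both are set, so they pack to 0b11.
+static_assert(kPextTable[0x05][0x07] == 0x03,
+              "pext of 0x07 under mask 0x05 packs to 0x3");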
+
+inline uint64_t ExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
+ // A software emulation of _pext_u64
+
+  // These checks should inline well and cover the likely common cases.
+ if (select_bitmap == ~uint64_t{0}) {
+ return bitmap;
+ } else if (select_bitmap == 0) {
+ return 0;
+ }
+
+  // Fall back to the lookup-table method
+ uint64_t bit_value = 0;
+ int bit_len = 0;
+ constexpr uint8_t kLookupMask = (1U << kLookupBits) - 1;
+ while (select_bitmap != 0) {
+ const auto mask_len = ARROW_POPCOUNT32(select_bitmap & kLookupMask);
+ const uint64_t value = kPextTable[select_bitmap & kLookupMask][bitmap & kLookupMask];
+ bit_value |= (value << bit_len);
+ bit_len += mask_len;
+ bitmap >>= kLookupBits;
+ select_bitmap >>= kLookupBits;
+ }
+ return bit_value;
+}
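+
+// A minimal sketch (illustrative values only) of the pext semantics that
+// ExtractBitsSoftware emulates: the bits of `bitmap` at the positions where
+// `select_bitmap` has a 1 are packed contiguously into the low bits of the
+// result.
+inline void ExtractBitsSoftwareExample() {
+  // bitmap = 0b10110010, select = 0b01010101 -> selected bits (0, 2, 4, 6)
+  // are 0, 0, 1, 0, which pack to 0b0100.
+  DCHECK_EQ(ExtractBitsSoftware(0xB2, 0x55), uint64_t{0x4});
+}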
+
+#ifdef ARROW_HAVE_BMI2
+
+// Use _pext_u32 on 32-bit builds and _pext_u64 on 64-bit builds.
+#if UINTPTR_MAX == 0xFFFFFFFF
+
+using extract_bitmap_t = uint32_t;
+inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
+ extract_bitmap_t select_bitmap) {
+ return _pext_u32(bitmap, select_bitmap);
+}
+
+#else
+
+using extract_bitmap_t = uint64_t;
+inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
+ extract_bitmap_t select_bitmap) {
+ return _pext_u64(bitmap, select_bitmap);
+}
+
+#endif
+
+#else // !defined(ARROW_HAVE_BMI2)
+
+// Use 64-bit pext emulation when BMI2 isn't available.
+using extract_bitmap_t = uint64_t;
+inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
+ extract_bitmap_t select_bitmap) {
+ return ExtractBitsSoftware(bitmap, select_bitmap);
+}
+
+#endif
+
+static constexpr int64_t kExtractBitsSize = 8 * sizeof(extract_bitmap_t);
+
+template <bool has_repeated_parent>
+int64_t DefLevelsBatchToBitmap(const int16_t* def_levels, const int64_t batch_size,
+ int64_t upper_bound_remaining, LevelInfo level_info,
+ ::arrow::internal::FirstTimeBitmapWriter* writer) {
+ DCHECK_LE(batch_size, kExtractBitsSize);
+
+ // Greater than level_info.def_level - 1 implies >= the def_level
+ auto defined_bitmap = static_cast<extract_bitmap_t>(
+ internal::GreaterThanBitmap(def_levels, batch_size, level_info.def_level - 1));
+
+ if (has_repeated_parent) {
+ // Greater than level_info.repeated_ancestor_def_level - 1 implies >= the
+ // repeated_ancestor_def_level
+ auto present_bitmap = static_cast<extract_bitmap_t>(internal::GreaterThanBitmap(
+ def_levels, batch_size, level_info.repeated_ancestor_def_level - 1));
+ auto selected_bits = ExtractBits(defined_bitmap, present_bitmap);
+ int64_t selected_count = ::arrow::BitUtil::PopCount(present_bitmap);
+ if (ARROW_PREDICT_FALSE(selected_count > upper_bound_remaining)) {
+ throw ParquetException("Values read exceeded upper bound");
+ }
+ writer->AppendWord(selected_bits, selected_count);
+ return ::arrow::BitUtil::PopCount(selected_bits);
+ } else {
+ if (ARROW_PREDICT_FALSE(batch_size > upper_bound_remaining)) {
+      throw ParquetException("Values read exceeded upper bound");
+ }
+
+ writer->AppendWord(defined_bitmap, batch_size);
+ return ::arrow::BitUtil::PopCount(defined_bitmap);
+ }
+}
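+
+// A short sketch (hand-picked levels) of the word built above:
+// GreaterThanBitmap packs one bit per def level, so levels {2, 1, 2, 0}
+// compared against def_level - 1 = 1 yield the word 0b0101.
+inline void DefLevelsBatchWordExample() {
+  const int16_t def_levels[] = {2, 1, 2, 0};
+  const auto defined_bitmap = static_cast<extract_bitmap_t>(
+      internal::GreaterThanBitmap(def_levels, /*num_levels=*/4, /*rhs=*/1));
+  DCHECK_EQ(defined_bitmap, extract_bitmap_t{0x5});
+}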
+
+template <bool has_repeated_parent>
+void DefLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels,
+ LevelInfo level_info, ValidityBitmapInputOutput* output) {
+ ::arrow::internal::FirstTimeBitmapWriter writer(
+ output->valid_bits,
+ /*start_offset=*/output->valid_bits_offset,
+ /*length=*/num_def_levels);
+ int64_t set_count = 0;
+ output->values_read = 0;
+ int64_t values_read_remaining = output->values_read_upper_bound;
+ while (num_def_levels > kExtractBitsSize) {
+ set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
+ def_levels, kExtractBitsSize, values_read_remaining, level_info, &writer);
+ def_levels += kExtractBitsSize;
+ num_def_levels -= kExtractBitsSize;
+ values_read_remaining = output->values_read_upper_bound - writer.position();
+ }
+ set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
+ def_levels, num_def_levels, values_read_remaining, level_info, &writer);
+
+ output->values_read = writer.position();
+ output->null_count += output->values_read - set_count;
+ writer.Finish();
+}
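+
+// A minimal usage sketch (hypothetical inputs): with max def_level 1 and no
+// repeated parent, def levels {1, 0, 1, 1} produce the validity bits
+// 1, 0, 1, 1 and a single null.
+inline void DefLevelsToBitmapSimdExample() {
+  const int16_t def_levels[] = {1, 0, 1, 1};
+  uint8_t valid_bits[1] = {0};
+  LevelInfo level_info;
+  level_info.def_level = 1;
+  ValidityBitmapInputOutput output;
+  output.values_read_upper_bound = 4;
+  output.valid_bits = valid_bits;
+  DefLevelsToBitmapSimd</*has_repeated_parent=*/false>(
+      def_levels, /*num_def_levels=*/4, level_info, &output);
+  DCHECK_EQ(output.values_read, 4);
+  DCHECK_EQ(output.null_count, 1);
+}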
+
+} // namespace PARQUET_IMPL_NAMESPACE
+} // namespace internal
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc b/contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc
index 1524333702f..bd9bf77c42d 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/metadata.cc
@@ -1,1783 +1,1783 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/metadata.h"
-
-#include <algorithm>
-#include <cinttypes>
-#include <ostream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "arrow/io/memory.h"
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/string_view.h"
-#include "parquet/encryption/encryption_internal.h"
-#include "parquet/encryption/internal_file_decryptor.h"
-#include "parquet/exception.h"
-#include "parquet/schema.h"
-#include "parquet/schema_internal.h"
-#include "parquet/statistics.h"
-#include "parquet/thrift_internal.h"
-
-namespace parquet {
-
-const ApplicationVersion& ApplicationVersion::PARQUET_251_FIXED_VERSION() {
- static ApplicationVersion version("parquet-mr", 1, 8, 0);
- return version;
-}
-
-const ApplicationVersion& ApplicationVersion::PARQUET_816_FIXED_VERSION() {
- static ApplicationVersion version("parquet-mr", 1, 2, 9);
- return version;
-}
-
-const ApplicationVersion& ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION() {
- static ApplicationVersion version("parquet-cpp", 1, 3, 0);
- return version;
-}
-
-const ApplicationVersion& ApplicationVersion::PARQUET_MR_FIXED_STATS_VERSION() {
- static ApplicationVersion version("parquet-mr", 1, 10, 0);
- return version;
-}
-
-std::string ParquetVersionToString(ParquetVersion::type ver) {
- switch (ver) {
- case ParquetVersion::PARQUET_1_0:
- return "1.0";
- case ParquetVersion::PARQUET_2_0:
- return "2.0";
- }
-
- // This should be unreachable
- return "UNKNOWN";
-}
-
-template <typename DType>
-static std::shared_ptr<Statistics> MakeTypedColumnStats(
- const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
- // If ColumnOrder is defined, return max_value and min_value
- if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
- return MakeStatistics<DType>(
- descr, metadata.statistics.min_value, metadata.statistics.max_value,
- metadata.num_values - metadata.statistics.null_count,
- metadata.statistics.null_count, metadata.statistics.distinct_count,
- metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value,
- metadata.statistics.__isset.null_count,
- metadata.statistics.__isset.distinct_count);
- }
- // Default behavior
- return MakeStatistics<DType>(
- descr, metadata.statistics.min, metadata.statistics.max,
- metadata.num_values - metadata.statistics.null_count,
- metadata.statistics.null_count, metadata.statistics.distinct_count,
- metadata.statistics.__isset.max || metadata.statistics.__isset.min,
- metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count);
-}
-
-std::shared_ptr<Statistics> MakeColumnStats(const format::ColumnMetaData& meta_data,
- const ColumnDescriptor* descr) {
- switch (static_cast<Type::type>(meta_data.type)) {
- case Type::BOOLEAN:
- return MakeTypedColumnStats<BooleanType>(meta_data, descr);
- case Type::INT32:
- return MakeTypedColumnStats<Int32Type>(meta_data, descr);
- case Type::INT64:
- return MakeTypedColumnStats<Int64Type>(meta_data, descr);
- case Type::INT96:
- return MakeTypedColumnStats<Int96Type>(meta_data, descr);
- case Type::DOUBLE:
- return MakeTypedColumnStats<DoubleType>(meta_data, descr);
- case Type::FLOAT:
- return MakeTypedColumnStats<FloatType>(meta_data, descr);
- case Type::BYTE_ARRAY:
- return MakeTypedColumnStats<ByteArrayType>(meta_data, descr);
- case Type::FIXED_LEN_BYTE_ARRAY:
- return MakeTypedColumnStats<FLBAType>(meta_data, descr);
- case Type::UNDEFINED:
- break;
- }
- throw ParquetException("Can't decode page statistics for selected column type");
-}
-
-// MetaData Accessor
-
-// ColumnCryptoMetaData
-class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl {
- public:
- explicit ColumnCryptoMetaDataImpl(const format::ColumnCryptoMetaData* crypto_metadata)
- : crypto_metadata_(crypto_metadata) {}
-
- bool encrypted_with_footer_key() const {
- return crypto_metadata_->__isset.ENCRYPTION_WITH_FOOTER_KEY;
- }
- bool encrypted_with_column_key() const {
- return crypto_metadata_->__isset.ENCRYPTION_WITH_COLUMN_KEY;
- }
- std::shared_ptr<schema::ColumnPath> path_in_schema() const {
- return std::make_shared<schema::ColumnPath>(
- crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.path_in_schema);
- }
- const std::string& key_metadata() const {
- return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
- }
-
- private:
- const format::ColumnCryptoMetaData* crypto_metadata_;
-};
-
-std::unique_ptr<ColumnCryptoMetaData> ColumnCryptoMetaData::Make(
- const uint8_t* metadata) {
- return std::unique_ptr<ColumnCryptoMetaData>(new ColumnCryptoMetaData(metadata));
-}
-
-ColumnCryptoMetaData::ColumnCryptoMetaData(const uint8_t* metadata)
- : impl_(new ColumnCryptoMetaDataImpl(
- reinterpret_cast<const format::ColumnCryptoMetaData*>(metadata))) {}
-
-ColumnCryptoMetaData::~ColumnCryptoMetaData() = default;
-
-std::shared_ptr<schema::ColumnPath> ColumnCryptoMetaData::path_in_schema() const {
- return impl_->path_in_schema();
-}
-bool ColumnCryptoMetaData::encrypted_with_footer_key() const {
- return impl_->encrypted_with_footer_key();
-}
-const std::string& ColumnCryptoMetaData::key_metadata() const {
- return impl_->key_metadata();
-}
-
-// ColumnChunk metadata
-class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
- public:
- explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column,
- const ColumnDescriptor* descr,
- int16_t row_group_ordinal, int16_t column_ordinal,
- const ApplicationVersion* writer_version,
- std::shared_ptr<InternalFileDecryptor> file_decryptor)
- : column_(column), descr_(descr), writer_version_(writer_version) {
- column_metadata_ = &column->meta_data;
- if (column->__isset.crypto_metadata) { // column metadata is encrypted
- format::ColumnCryptoMetaData ccmd = column->crypto_metadata;
-
- if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) {
- if (file_decryptor != nullptr && file_decryptor->properties() != nullptr) {
- // should decrypt metadata
- std::shared_ptr<schema::ColumnPath> path = std::make_shared<schema::ColumnPath>(
- ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema);
- std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
-
- std::string aad_column_metadata = encryption::CreateModuleAad(
- file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal,
- column_ordinal, static_cast<int16_t>(-1));
- auto decryptor = file_decryptor->GetColumnMetaDecryptor(
- path->ToDotString(), key_metadata, aad_column_metadata);
- auto len = static_cast<uint32_t>(column->encrypted_column_metadata.size());
- DeserializeThriftMsg(
- reinterpret_cast<const uint8_t*>(column->encrypted_column_metadata.c_str()),
- &len, &decrypted_metadata_, decryptor);
- column_metadata_ = &decrypted_metadata_;
- } else {
- throw ParquetException(
- "Cannot decrypt ColumnMetadata."
- " FileDecryption is not setup correctly");
- }
- }
- }
- for (const auto& encoding : column_metadata_->encodings) {
- encodings_.push_back(LoadEnumSafe(&encoding));
- }
- for (const auto& encoding_stats : column_metadata_->encoding_stats) {
- encoding_stats_.push_back({LoadEnumSafe(&encoding_stats.page_type),
- LoadEnumSafe(&encoding_stats.encoding),
- encoding_stats.count});
- }
- possible_stats_ = nullptr;
- }
-
- bool Equals(const ColumnChunkMetaDataImpl& other) const {
- return *column_metadata_ == *other.column_metadata_;
- }
-
- // column chunk
- inline int64_t file_offset() const { return column_->file_offset; }
- inline const std::string& file_path() const { return column_->file_path; }
-
- inline Type::type type() const { return LoadEnumSafe(&column_metadata_->type); }
-
- inline int64_t num_values() const { return column_metadata_->num_values; }
-
- std::shared_ptr<schema::ColumnPath> path_in_schema() {
- return std::make_shared<schema::ColumnPath>(column_metadata_->path_in_schema);
- }
-
- // Check if statistics are set and are valid
- // 1) Must be set in the metadata
- // 2) Statistics must not be corrupted
- inline bool is_stats_set() const {
- DCHECK(writer_version_ != nullptr);
- // If the column statistics don't exist or column sort order is unknown
- // we cannot use the column stats
- if (!column_metadata_->__isset.statistics ||
- descr_->sort_order() == SortOrder::UNKNOWN) {
- return false;
- }
- if (possible_stats_ == nullptr) {
- possible_stats_ = MakeColumnStats(*column_metadata_, descr_);
- }
- EncodedStatistics encodedStatistics = possible_stats_->Encode();
- return writer_version_->HasCorrectStatistics(type(), encodedStatistics,
- descr_->sort_order());
- }
-
- inline std::shared_ptr<Statistics> statistics() const {
- return is_stats_set() ? possible_stats_ : nullptr;
- }
-
- inline Compression::type compression() const {
- return LoadEnumSafe(&column_metadata_->codec);
- }
-
- const std::vector<Encoding::type>& encodings() const { return encodings_; }
-
- const std::vector<PageEncodingStats>& encoding_stats() const { return encoding_stats_; }
-
- inline bool has_dictionary_page() const {
- return column_metadata_->__isset.dictionary_page_offset;
- }
-
- inline int64_t dictionary_page_offset() const {
- return column_metadata_->dictionary_page_offset;
- }
-
- inline int64_t data_page_offset() const { return column_metadata_->data_page_offset; }
-
- inline bool has_index_page() const {
- return column_metadata_->__isset.index_page_offset;
- }
-
- inline int64_t index_page_offset() const { return column_metadata_->index_page_offset; }
-
- inline int64_t total_compressed_size() const {
- return column_metadata_->total_compressed_size;
- }
-
- inline int64_t total_uncompressed_size() const {
- return column_metadata_->total_uncompressed_size;
- }
-
- inline std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const {
- if (column_->__isset.crypto_metadata) {
- return ColumnCryptoMetaData::Make(
- reinterpret_cast<const uint8_t*>(&column_->crypto_metadata));
- } else {
- return nullptr;
- }
- }
-
- private:
- mutable std::shared_ptr<Statistics> possible_stats_;
- std::vector<Encoding::type> encodings_;
- std::vector<PageEncodingStats> encoding_stats_;
- const format::ColumnChunk* column_;
- const format::ColumnMetaData* column_metadata_;
- format::ColumnMetaData decrypted_metadata_;
- const ColumnDescriptor* descr_;
- const ApplicationVersion* writer_version_;
-};
-
-std::unique_ptr<ColumnChunkMetaData> ColumnChunkMetaData::Make(
- const void* metadata, const ColumnDescriptor* descr,
- const ApplicationVersion* writer_version, int16_t row_group_ordinal,
- int16_t column_ordinal, std::shared_ptr<InternalFileDecryptor> file_decryptor) {
- return std::unique_ptr<ColumnChunkMetaData>(
- new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal,
- writer_version, std::move(file_decryptor)));
-}
-
-ColumnChunkMetaData::ColumnChunkMetaData(
- const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
- int16_t column_ordinal, const ApplicationVersion* writer_version,
- std::shared_ptr<InternalFileDecryptor> file_decryptor)
- : impl_{new ColumnChunkMetaDataImpl(
- reinterpret_cast<const format::ColumnChunk*>(metadata), descr,
- row_group_ordinal, column_ordinal, writer_version, std::move(file_decryptor))} {
-}
-
-ColumnChunkMetaData::~ColumnChunkMetaData() = default;
-
-// column chunk
-int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); }
-
-const std::string& ColumnChunkMetaData::file_path() const { return impl_->file_path(); }
-
-Type::type ColumnChunkMetaData::type() const { return impl_->type(); }
-
-int64_t ColumnChunkMetaData::num_values() const { return impl_->num_values(); }
-
-std::shared_ptr<schema::ColumnPath> ColumnChunkMetaData::path_in_schema() const {
- return impl_->path_in_schema();
-}
-
-std::shared_ptr<Statistics> ColumnChunkMetaData::statistics() const {
- return impl_->statistics();
-}
-
-bool ColumnChunkMetaData::is_stats_set() const { return impl_->is_stats_set(); }
-
-bool ColumnChunkMetaData::has_dictionary_page() const {
- return impl_->has_dictionary_page();
-}
-
-int64_t ColumnChunkMetaData::dictionary_page_offset() const {
- return impl_->dictionary_page_offset();
-}
-
-int64_t ColumnChunkMetaData::data_page_offset() const {
- return impl_->data_page_offset();
-}
-
-bool ColumnChunkMetaData::has_index_page() const { return impl_->has_index_page(); }
-
-int64_t ColumnChunkMetaData::index_page_offset() const {
- return impl_->index_page_offset();
-}
-
-Compression::type ColumnChunkMetaData::compression() const {
- return impl_->compression();
-}
-
-bool ColumnChunkMetaData::can_decompress() const {
- return ::arrow::util::Codec::IsAvailable(compression());
-}
-
-const std::vector<Encoding::type>& ColumnChunkMetaData::encodings() const {
- return impl_->encodings();
-}
-
-const std::vector<PageEncodingStats>& ColumnChunkMetaData::encoding_stats() const {
- return impl_->encoding_stats();
-}
-
-int64_t ColumnChunkMetaData::total_uncompressed_size() const {
- return impl_->total_uncompressed_size();
-}
-
-int64_t ColumnChunkMetaData::total_compressed_size() const {
- return impl_->total_compressed_size();
-}
-
-std::unique_ptr<ColumnCryptoMetaData> ColumnChunkMetaData::crypto_metadata() const {
- return impl_->crypto_metadata();
-}
-
-bool ColumnChunkMetaData::Equals(const ColumnChunkMetaData& other) const {
- return impl_->Equals(*other.impl_);
-}
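-
-// A minimal usage sketch (the FileMetaData argument is hypothetical, and the
-// file is assumed to have at least one row group and one column) chaining the
-// accessors above: row group -> column chunk -> statistics.
-inline void InspectFirstColumnChunk(const FileMetaData& file_metadata) {
-  std::unique_ptr<RowGroupMetaData> row_group = file_metadata.RowGroup(0);
-  std::unique_ptr<ColumnChunkMetaData> chunk = row_group->ColumnChunk(0);
-  if (chunk->is_stats_set()) {
-    DCHECK(chunk->statistics() != nullptr);
-  }
-}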
-
-// row-group metadata
-class RowGroupMetaData::RowGroupMetaDataImpl {
- public:
- explicit RowGroupMetaDataImpl(const format::RowGroup* row_group,
- const SchemaDescriptor* schema,
- const ApplicationVersion* writer_version,
- std::shared_ptr<InternalFileDecryptor> file_decryptor)
- : row_group_(row_group),
- schema_(schema),
- writer_version_(writer_version),
- file_decryptor_(std::move(file_decryptor)) {}
-
- bool Equals(const RowGroupMetaDataImpl& other) const {
- return *row_group_ == *other.row_group_;
- }
-
- inline int num_columns() const { return static_cast<int>(row_group_->columns.size()); }
-
- inline int64_t num_rows() const { return row_group_->num_rows; }
-
- inline int64_t total_byte_size() const { return row_group_->total_byte_size; }
-
- inline int64_t total_compressed_size() const {
- return row_group_->total_compressed_size;
- }
-
- inline int64_t file_offset() const { return row_group_->file_offset; }
-
- inline const SchemaDescriptor* schema() const { return schema_; }
-
- std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) {
- if (i < num_columns()) {
- return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i),
- writer_version_, row_group_->ordinal,
- static_cast<int16_t>(i), file_decryptor_);
- }
- throw ParquetException("The file only has ", num_columns(),
- " columns, requested metadata for column: ", i);
- }
-
- private:
- const format::RowGroup* row_group_;
- const SchemaDescriptor* schema_;
- const ApplicationVersion* writer_version_;
- std::shared_ptr<InternalFileDecryptor> file_decryptor_;
-};
-
-std::unique_ptr<RowGroupMetaData> RowGroupMetaData::Make(
- const void* metadata, const SchemaDescriptor* schema,
- const ApplicationVersion* writer_version,
- std::shared_ptr<InternalFileDecryptor> file_decryptor) {
- return std::unique_ptr<RowGroupMetaData>(
- new RowGroupMetaData(metadata, schema, writer_version, std::move(file_decryptor)));
-}
-
-RowGroupMetaData::RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema,
- const ApplicationVersion* writer_version,
- std::shared_ptr<InternalFileDecryptor> file_decryptor)
- : impl_{new RowGroupMetaDataImpl(reinterpret_cast<const format::RowGroup*>(metadata),
- schema, writer_version, std::move(file_decryptor))} {
-}
-
-RowGroupMetaData::~RowGroupMetaData() = default;
-
-bool RowGroupMetaData::Equals(const RowGroupMetaData& other) const {
- return impl_->Equals(*other.impl_);
-}
-
-int RowGroupMetaData::num_columns() const { return impl_->num_columns(); }
-
-int64_t RowGroupMetaData::num_rows() const { return impl_->num_rows(); }
-
-int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_size(); }
-
-int64_t RowGroupMetaData::total_compressed_size() const {
- return impl_->total_compressed_size();
-}
-
-int64_t RowGroupMetaData::file_offset() const { return impl_->file_offset(); }
-
-const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); }
-
-std::unique_ptr<ColumnChunkMetaData> RowGroupMetaData::ColumnChunk(int i) const {
- return impl_->ColumnChunk(i);
-}
-
-bool RowGroupMetaData::can_decompress() const {
- int n_columns = num_columns();
- for (int i = 0; i < n_columns; i++) {
- if (!ColumnChunk(i)->can_decompress()) {
- return false;
- }
- }
- return true;
-}
-
-// file metadata
-class FileMetaData::FileMetaDataImpl {
- public:
- FileMetaDataImpl() = default;
-
- explicit FileMetaDataImpl(
- const void* metadata, uint32_t* metadata_len,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = nullptr)
- : file_decryptor_(file_decryptor) {
- metadata_.reset(new format::FileMetaData);
-
- auto footer_decryptor =
- file_decryptor_ != nullptr ? file_decryptor->GetFooterDecryptor() : nullptr;
-
- DeserializeThriftMsg(reinterpret_cast<const uint8_t*>(metadata), metadata_len,
- metadata_.get(), footer_decryptor);
- metadata_len_ = *metadata_len;
-
- if (metadata_->__isset.created_by) {
- writer_version_ = ApplicationVersion(metadata_->created_by);
- } else {
- writer_version_ = ApplicationVersion("unknown 0.0.0");
- }
-
- InitSchema();
- InitColumnOrders();
- InitKeyValueMetadata();
- }
-
- bool VerifySignature(const void* signature) {
- // verify decryption properties are set
- if (file_decryptor_ == nullptr) {
- throw ParquetException("Decryption not set properly. cannot verify signature");
- }
- // serialize the footer
- uint8_t* serialized_data;
- uint32_t serialized_len = metadata_len_;
- ThriftSerializer serializer;
- serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data);
-
- // encrypt with nonce
- auto nonce = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(signature));
- auto tag = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(signature)) +
- encryption::kNonceLength;
-
- std::string key = file_decryptor_->GetFooterKey();
- std::string aad = encryption::CreateFooterAad(file_decryptor_->file_aad());
-
- auto aes_encryptor = encryption::AesEncryptor::Make(
- file_decryptor_->algorithm(), static_cast<int>(key.size()), true, nullptr);
-
- std::shared_ptr<Buffer> encrypted_buffer = std::static_pointer_cast<ResizableBuffer>(
- AllocateBuffer(file_decryptor_->pool(),
- aes_encryptor->CiphertextSizeDelta() + serialized_len));
- uint32_t encrypted_len = aes_encryptor->SignedFooterEncrypt(
- serialized_data, serialized_len, str2bytes(key), static_cast<int>(key.size()),
- str2bytes(aad), static_cast<int>(aad.size()), nonce,
- encrypted_buffer->mutable_data());
- // Delete AES encryptor object. It was created only to verify the footer signature.
- aes_encryptor->WipeOut();
- delete aes_encryptor;
- return 0 ==
- memcmp(encrypted_buffer->data() + encrypted_len - encryption::kGcmTagLength,
- tag, encryption::kGcmTagLength);
- }
-
- inline uint32_t size() const { return metadata_len_; }
- inline int num_columns() const { return schema_.num_columns(); }
- inline int64_t num_rows() const { return metadata_->num_rows; }
- inline int num_row_groups() const {
- return static_cast<int>(metadata_->row_groups.size());
- }
- inline int32_t version() const { return metadata_->version; }
- inline const std::string& created_by() const { return metadata_->created_by; }
- inline int num_schema_elements() const {
- return static_cast<int>(metadata_->schema.size());
- }
-
- inline bool is_encryption_algorithm_set() const {
- return metadata_->__isset.encryption_algorithm;
- }
- inline EncryptionAlgorithm encryption_algorithm() {
- return FromThrift(metadata_->encryption_algorithm);
- }
- inline const std::string& footer_signing_key_metadata() {
- return metadata_->footer_signing_key_metadata;
- }
-
- const ApplicationVersion& writer_version() const { return writer_version_; }
-
- void WriteTo(::arrow::io::OutputStream* dst,
- const std::shared_ptr<Encryptor>& encryptor) const {
- ThriftSerializer serializer;
-    // The encryption_algorithm is only set in the footer of encrypted
-    // files that have plaintext footers
- if (is_encryption_algorithm_set()) {
- uint8_t* serialized_data;
- uint32_t serialized_len;
- serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data);
-
-      // encrypt the serialized footer to produce the signature
- std::vector<uint8_t> encrypted_data(encryptor->CiphertextSizeDelta() +
- serialized_len);
- unsigned encrypted_len =
- encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data());
-
- // write unencrypted footer
- PARQUET_THROW_NOT_OK(dst->Write(serialized_data, serialized_len));
- // Write signature (nonce and tag)
- PARQUET_THROW_NOT_OK(
- dst->Write(encrypted_data.data() + 4, encryption::kNonceLength));
- PARQUET_THROW_NOT_OK(
- dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength,
- encryption::kGcmTagLength));
- } else { // either plaintext file (when encryptor is null)
- // or encrypted file with encrypted footer
- serializer.Serialize(metadata_.get(), dst, encryptor);
- }
- }
-
- std::unique_ptr<RowGroupMetaData> RowGroup(int i) {
- if (!(i < num_row_groups())) {
- std::stringstream ss;
- ss << "The file only has " << num_row_groups()
- << " row groups, requested metadata for row group: " << i;
- throw ParquetException(ss.str());
- }
- return RowGroupMetaData::Make(&metadata_->row_groups[i], &schema_, &writer_version_,
- file_decryptor_);
- }
-
- bool Equals(const FileMetaDataImpl& other) const {
- return *metadata_ == *other.metadata_;
- }
-
- const SchemaDescriptor* schema() const { return &schema_; }
-
- const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
- return key_value_metadata_;
- }
-
- void set_file_path(const std::string& path) {
- for (format::RowGroup& row_group : metadata_->row_groups) {
- for (format::ColumnChunk& chunk : row_group.columns) {
- chunk.__set_file_path(path);
- }
- }
- }
-
- format::RowGroup& row_group(int i) {
- DCHECK_LT(i, num_row_groups());
- return metadata_->row_groups[i];
- }
-
- void AppendRowGroups(const std::unique_ptr<FileMetaDataImpl>& other) {
- if (!schema()->Equals(*other->schema())) {
- throw ParquetException("AppendRowGroups requires equal schemas.");
- }
-
- format::RowGroup other_rg;
- for (int i = 0; i < other->num_row_groups(); i++) {
- other_rg = other->row_group(i);
- metadata_->row_groups.push_back(other_rg);
- metadata_->num_rows += other_rg.num_rows;
- }
- }
-
- std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) {
- for (int i : row_groups) {
- if (i < num_row_groups()) continue;
-
- throw ParquetException(
- "The file only has ", num_row_groups(),
- " row groups, but requested a subset including row group: ", i);
- }
-
- std::shared_ptr<FileMetaData> out(new FileMetaData());
- out->impl_.reset(new FileMetaDataImpl());
- out->impl_->metadata_.reset(new format::FileMetaData());
-
- auto metadata = out->impl_->metadata_.get();
- metadata->version = metadata_->version;
- metadata->schema = metadata_->schema;
-
- metadata->row_groups.resize(row_groups.size());
- int i = 0;
- for (int selected_index : row_groups) {
- metadata->num_rows += row_group(selected_index).num_rows;
- metadata->row_groups[i++] = row_group(selected_index);
- }
-
- metadata->key_value_metadata = metadata_->key_value_metadata;
- metadata->created_by = metadata_->created_by;
- metadata->column_orders = metadata_->column_orders;
- metadata->encryption_algorithm = metadata_->encryption_algorithm;
- metadata->footer_signing_key_metadata = metadata_->footer_signing_key_metadata;
- metadata->__isset = metadata_->__isset;
-
- out->impl_->schema_ = schema_;
- out->impl_->writer_version_ = writer_version_;
- out->impl_->key_value_metadata_ = key_value_metadata_;
- out->impl_->file_decryptor_ = file_decryptor_;
-
- return out;
- }
-
- void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor) {
- file_decryptor_ = file_decryptor;
- }
-
- private:
- friend FileMetaDataBuilder;
- uint32_t metadata_len_ = 0;
- std::unique_ptr<format::FileMetaData> metadata_;
- SchemaDescriptor schema_;
- ApplicationVersion writer_version_;
- std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
- std::shared_ptr<InternalFileDecryptor> file_decryptor_;
-
- void InitSchema() {
- if (metadata_->schema.empty()) {
- throw ParquetException("Empty file schema (no root)");
- }
- schema_.Init(schema::Unflatten(&metadata_->schema[0],
- static_cast<int>(metadata_->schema.size())));
- }
-
- void InitColumnOrders() {
- // update ColumnOrder
- std::vector<parquet::ColumnOrder> column_orders;
- if (metadata_->__isset.column_orders) {
- for (auto column_order : metadata_->column_orders) {
- if (column_order.__isset.TYPE_ORDER) {
- column_orders.push_back(ColumnOrder::type_defined_);
- } else {
- column_orders.push_back(ColumnOrder::undefined_);
- }
- }
- } else {
- column_orders.resize(schema_.num_columns(), ColumnOrder::undefined_);
- }
-
- schema_.updateColumnOrders(column_orders);
- }
-
- void InitKeyValueMetadata() {
- std::shared_ptr<KeyValueMetadata> metadata = nullptr;
- if (metadata_->__isset.key_value_metadata) {
- metadata = std::make_shared<KeyValueMetadata>();
- for (const auto& it : metadata_->key_value_metadata) {
- metadata->Append(it.key, it.value);
- }
- }
- key_value_metadata_ = std::move(metadata);
- }
-};
-
-std::shared_ptr<FileMetaData> FileMetaData::Make(
- const void* metadata, uint32_t* metadata_len,
- std::shared_ptr<InternalFileDecryptor> file_decryptor) {
- // This FileMetaData ctor is private, not compatible with std::make_shared
- return std::shared_ptr<FileMetaData>(
- new FileMetaData(metadata, metadata_len, file_decryptor));
-}
-
-FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len,
- std::shared_ptr<InternalFileDecryptor> file_decryptor)
- : impl_{std::unique_ptr<FileMetaDataImpl>(
- new FileMetaDataImpl(metadata, metadata_len, file_decryptor))} {}
-
-FileMetaData::FileMetaData()
- : impl_{std::unique_ptr<FileMetaDataImpl>(new FileMetaDataImpl())} {}
-
-FileMetaData::~FileMetaData() = default;
-
-bool FileMetaData::Equals(const FileMetaData& other) const {
- return impl_->Equals(*other.impl_);
-}
-
-std::unique_ptr<RowGroupMetaData> FileMetaData::RowGroup(int i) const {
- return impl_->RowGroup(i);
-}
-
-bool FileMetaData::VerifySignature(const void* signature) {
- return impl_->VerifySignature(signature);
-}
-
-uint32_t FileMetaData::size() const { return impl_->size(); }
-
-int FileMetaData::num_columns() const { return impl_->num_columns(); }
-
-int64_t FileMetaData::num_rows() const { return impl_->num_rows(); }
-
-int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); }
-
-bool FileMetaData::can_decompress() const {
- int n_row_groups = num_row_groups();
- for (int i = 0; i < n_row_groups; i++) {
- if (!RowGroup(i)->can_decompress()) {
- return false;
- }
- }
- return true;
-}
-
-bool FileMetaData::is_encryption_algorithm_set() const {
- return impl_->is_encryption_algorithm_set();
-}
-
-EncryptionAlgorithm FileMetaData::encryption_algorithm() const {
- return impl_->encryption_algorithm();
-}
-
-const std::string& FileMetaData::footer_signing_key_metadata() const {
- return impl_->footer_signing_key_metadata();
-}
-
-void FileMetaData::set_file_decryptor(
- std::shared_ptr<InternalFileDecryptor> file_decryptor) {
- impl_->set_file_decryptor(file_decryptor);
-}
-
-ParquetVersion::type FileMetaData::version() const {
- switch (impl_->version()) {
- case 1:
- return ParquetVersion::PARQUET_1_0;
- case 2:
- return ParquetVersion::PARQUET_2_0;
- default:
- // Improperly set version, assuming Parquet 1.0
- break;
- }
- return ParquetVersion::PARQUET_1_0;
-}
-
-const ApplicationVersion& FileMetaData::writer_version() const {
- return impl_->writer_version();
-}
-
-const std::string& FileMetaData::created_by() const { return impl_->created_by(); }
-
-int FileMetaData::num_schema_elements() const { return impl_->num_schema_elements(); }
-
-const SchemaDescriptor* FileMetaData::schema() const { return impl_->schema(); }
-
-const std::shared_ptr<const KeyValueMetadata>& FileMetaData::key_value_metadata() const {
- return impl_->key_value_metadata();
-}
-
-void FileMetaData::set_file_path(const std::string& path) { impl_->set_file_path(path); }
-
-void FileMetaData::AppendRowGroups(const FileMetaData& other) {
- impl_->AppendRowGroups(other.impl_);
-}
-
-std::shared_ptr<FileMetaData> FileMetaData::Subset(
- const std::vector<int>& row_groups) const {
- return impl_->Subset(row_groups);
-}
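-
-// A short sketch (hypothetical metadata, assumed to have at least three row
-// groups) of the subset helper above: only the listed row groups are
-// retained, and num_rows is recomputed from them.
-inline std::shared_ptr<FileMetaData> SubsetExample(
-    const std::shared_ptr<FileMetaData>& metadata) {
-  std::shared_ptr<FileMetaData> subset = metadata->Subset({0, 2});
-  DCHECK_EQ(subset->num_row_groups(), 2);
-  return subset;
-}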
-
-void FileMetaData::WriteTo(::arrow::io::OutputStream* dst,
- const std::shared_ptr<Encryptor>& encryptor) const {
- return impl_->WriteTo(dst, encryptor);
-}
-
-class FileCryptoMetaData::FileCryptoMetaDataImpl {
- public:
- FileCryptoMetaDataImpl() = default;
-
- explicit FileCryptoMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) {
- metadata_.reset(new format::FileCryptoMetaData);
- DeserializeThriftMsg(metadata, metadata_len, metadata_.get());
- metadata_len_ = *metadata_len;
- }
-
- EncryptionAlgorithm encryption_algorithm() {
- return FromThrift(metadata_->encryption_algorithm);
- }
- const std::string& key_metadata() { return metadata_->key_metadata; }
- void WriteTo(::arrow::io::OutputStream* dst) const {
- ThriftSerializer serializer;
- serializer.Serialize(metadata_.get(), dst);
- }
-
- private:
- friend FileMetaDataBuilder;
- std::unique_ptr<format::FileCryptoMetaData> metadata_;
- uint32_t metadata_len_;
-};
-
-EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() const {
- return impl_->encryption_algorithm();
-}
-
-const std::string& FileCryptoMetaData::key_metadata() const {
- return impl_->key_metadata();
-}
-
-std::shared_ptr<FileCryptoMetaData> FileCryptoMetaData::Make(
- const uint8_t* serialized_metadata, uint32_t* metadata_len) {
- return std::shared_ptr<FileCryptoMetaData>(
- new FileCryptoMetaData(serialized_metadata, metadata_len));
-}
-
-FileCryptoMetaData::FileCryptoMetaData(const uint8_t* serialized_metadata,
- uint32_t* metadata_len)
- : impl_(new FileCryptoMetaDataImpl(serialized_metadata, metadata_len)) {}
-
-FileCryptoMetaData::FileCryptoMetaData() : impl_(new FileCryptoMetaDataImpl()) {}
-
-FileCryptoMetaData::~FileCryptoMetaData() = default;
-
-void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const {
- impl_->WriteTo(dst);
-}
-
-std::string FileMetaData::SerializeToString() const {
-  // We need to pass an initial size to Create(); the buffer grows
-  // automatically to hold the metadata, so 0 suffices.
- PARQUET_ASSIGN_OR_THROW(auto serializer, ::arrow::io::BufferOutputStream::Create(0));
- WriteTo(serializer.get());
- PARQUET_ASSIGN_OR_THROW(auto metadata_buffer, serializer->Finish());
- return metadata_buffer->ToString();
-}
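-
-// A minimal round-trip sketch (illustrative only): the serialized footer
-// bytes produced above can be re-parsed with FileMetaData::Make.
-inline std::shared_ptr<FileMetaData> RoundTripExample(const FileMetaData& metadata) {
-  std::string serialized = metadata.SerializeToString();
-  uint32_t metadata_len = static_cast<uint32_t>(serialized.size());
-  return FileMetaData::Make(serialized.data(), &metadata_len,
-                            /*file_decryptor=*/nullptr);
-}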
-
-ApplicationVersion::ApplicationVersion(std::string application, int major, int minor,
- int patch)
- : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
-
-namespace {
-// Parse the application version format and set parsed values to
-// ApplicationVersion.
-//
-// The application version format must be compatible parquet-mr's
-// one. See also:
-// * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
-// * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
-//
-// The application version format:
-// "${APPLICATION_NAME}"
-// "${APPLICATION_NAME} version ${VERSION}"
-// "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
-//
-// Eg:
-// parquet-cpp
-// parquet-cpp version 1.5.0ab-xyz5.5.0+cd
-// parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
-//
-// The VERSION format:
-// "${MAJOR}"
-// "${MAJOR}.${MINOR}"
-// "${MAJOR}.${MINOR}.${PATCH}"
-// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
-// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
-// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
-// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
-// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
-// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
-// "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
-//
-// Eg:
-// 1
-// 1.5
-// 1.5.0
-// 1.5.0ab
-// 1.5.0ab-cdh5.5.0
-// 1.5.0ab-cdh5.5.0+cd
-// 1.5.0ab+cd
-// 1.5.0-cdh5.5.0
-// 1.5.0-cdh5.5.0+cd
-// 1.5.0+cd
-class ApplicationVersionParser {
- public:
- ApplicationVersionParser(const std::string& created_by,
- ApplicationVersion& application_version)
- : created_by_(created_by),
- application_version_(application_version),
- spaces_(" \t\v\r\n\f"),
- digits_("0123456789") {}
-
- void Parse() {
- application_version_.application_ = "unknown";
- application_version_.version = {0, 0, 0, "", "", ""};
-
- if (!ParseApplicationName()) {
- return;
- }
- if (!ParseVersion()) {
- return;
- }
- if (!ParseBuildName()) {
- return;
- }
- }
-
- private:
- bool IsSpace(const std::string& string, const size_t& offset) {
- auto target = ::arrow::util::string_view(string).substr(offset, 1);
- return target.find_first_of(spaces_) != ::arrow::util::string_view::npos;
- }
-
- void RemovePrecedingSpaces(const std::string& string, size_t& start,
- const size_t& end) {
- while (start < end && IsSpace(string, start)) {
- ++start;
- }
- }
-
- void RemoveTrailingSpaces(const std::string& string, const size_t& start, size_t& end) {
- while (start < (end - 1) && (end - 1) < string.size() && IsSpace(string, end - 1)) {
- --end;
- }
- }
-
- bool ParseApplicationName() {
- std::string version_mark(" version ");
- auto version_mark_position = created_by_.find(version_mark);
- size_t application_name_end;
- // No VERSION and BUILD_NAME.
- if (version_mark_position == std::string::npos) {
- version_start_ = std::string::npos;
- application_name_end = created_by_.size();
- } else {
- version_start_ = version_mark_position + version_mark.size();
- application_name_end = version_mark_position;
- }
-
- size_t application_name_start = 0;
- RemovePrecedingSpaces(created_by_, application_name_start, application_name_end);
- RemoveTrailingSpaces(created_by_, application_name_start, application_name_end);
- application_version_.application_ = created_by_.substr(
- application_name_start, application_name_end - application_name_start);
-
- return true;
- }
-
- bool ParseVersion() {
- // No VERSION.
- if (version_start_ == std::string::npos) {
- return false;
- }
-
- RemovePrecedingSpaces(created_by_, version_start_, created_by_.size());
- version_end_ = created_by_.find(" (", version_start_);
- // No BUILD_NAME.
- if (version_end_ == std::string::npos) {
- version_end_ = created_by_.size();
- }
- RemoveTrailingSpaces(created_by_, version_start_, version_end_);
- // No VERSION.
- if (version_start_ == version_end_) {
- return false;
- }
- version_string_ = created_by_.substr(version_start_, version_end_ - version_start_);
-
- if (!ParseVersionMajor()) {
- return false;
- }
- if (!ParseVersionMinor()) {
- return false;
- }
- if (!ParseVersionPatch()) {
- return false;
- }
- if (!ParseVersionUnknown()) {
- return false;
- }
- if (!ParseVersionPreRelease()) {
- return false;
- }
- if (!ParseVersionBuildInfo()) {
- return false;
- }
-
- return true;
- }
-
- bool ParseVersionMajor() {
- size_t version_major_start = 0;
- auto version_major_end = version_string_.find_first_not_of(digits_);
- // MAJOR only.
- if (version_major_end == std::string::npos) {
- version_major_end = version_string_.size();
- version_parsing_position_ = version_major_end;
- } else {
- // No ".".
- if (version_string_[version_major_end] != '.') {
- return false;
- }
- // No MAJOR.
- if (version_major_end == version_major_start) {
- return false;
- }
- version_parsing_position_ = version_major_end + 1; // +1 is for '.'.
- }
- auto version_major_string = version_string_.substr(
- version_major_start, version_major_end - version_major_start);
- application_version_.version.major = atoi(version_major_string.c_str());
- return true;
- }
-
- bool ParseVersionMinor() {
- auto version_minor_start = version_parsing_position_;
- auto version_minor_end =
- version_string_.find_first_not_of(digits_, version_minor_start);
- // MAJOR.MINOR only.
- if (version_minor_end == std::string::npos) {
- version_minor_end = version_string_.size();
- version_parsing_position_ = version_minor_end;
- } else {
- // No ".".
- if (version_string_[version_minor_end] != '.') {
- return false;
- }
- // No MINOR.
- if (version_minor_end == version_minor_start) {
- return false;
- }
- version_parsing_position_ = version_minor_end + 1; // +1 is for '.'.
- }
- auto version_minor_string = version_string_.substr(
- version_minor_start, version_minor_end - version_minor_start);
- application_version_.version.minor = atoi(version_minor_string.c_str());
- return true;
- }
-
- bool ParseVersionPatch() {
- auto version_patch_start = version_parsing_position_;
- auto version_patch_end =
- version_string_.find_first_not_of(digits_, version_patch_start);
- // No UNKNOWN, PRE_RELEASE and BUILD_INFO.
- if (version_patch_end == std::string::npos) {
- version_patch_end = version_string_.size();
- }
- // No PATCH.
- if (version_patch_end == version_patch_start) {
- return false;
- }
- auto version_patch_string = version_string_.substr(
- version_patch_start, version_patch_end - version_patch_start);
- application_version_.version.patch = atoi(version_patch_string.c_str());
- version_parsing_position_ = version_patch_end;
- return true;
- }
-
- bool ParseVersionUnknown() {
- // No UNKNOWN.
- if (version_parsing_position_ == version_string_.size()) {
- return true;
- }
- auto version_unknown_start = version_parsing_position_;
- auto version_unknown_end = version_string_.find_first_of("-+", version_unknown_start);
- // No PRE_RELEASE and BUILD_INFO
- if (version_unknown_end == std::string::npos) {
- version_unknown_end = version_string_.size();
- }
- application_version_.version.unknown = version_string_.substr(
- version_unknown_start, version_unknown_end - version_unknown_start);
- version_parsing_position_ = version_unknown_end;
- return true;
- }
-
- bool ParseVersionPreRelease() {
- // No PRE_RELEASE.
- if (version_parsing_position_ == version_string_.size() ||
- version_string_[version_parsing_position_] != '-') {
- return true;
- }
-
- auto version_pre_release_start = version_parsing_position_ + 1; // +1 is for '-'.
- auto version_pre_release_end =
- version_string_.find_first_of("+", version_pre_release_start);
- // No BUILD_INFO
- if (version_pre_release_end == std::string::npos) {
- version_pre_release_end = version_string_.size();
- }
- application_version_.version.pre_release = version_string_.substr(
- version_pre_release_start, version_pre_release_end - version_pre_release_start);
- version_parsing_position_ = version_pre_release_end;
- return true;
- }
-
- bool ParseVersionBuildInfo() {
- // No BUILD_INFO.
- if (version_parsing_position_ == version_string_.size() ||
- version_string_[version_parsing_position_] != '+') {
- return true;
- }
-
- auto version_build_info_start = version_parsing_position_ + 1; // +1 is for '+'.
- application_version_.version.build_info =
- version_string_.substr(version_build_info_start);
- return true;
- }
-
- bool ParseBuildName() {
- std::string build_mark(" (build ");
- auto build_mark_position = created_by_.find(build_mark, version_end_);
- // No BUILD_NAME.
- if (build_mark_position == std::string::npos) {
- return false;
- }
- auto build_name_start = build_mark_position + build_mark.size();
- RemovePrecedingSpaces(created_by_, build_name_start, created_by_.size());
- auto build_name_end = created_by_.find_first_of(")", build_name_start);
- // No end ")".
- if (build_name_end == std::string::npos) {
- return false;
- }
- RemoveTrailingSpaces(created_by_, build_name_start, build_name_end);
- application_version_.build_ =
- created_by_.substr(build_name_start, build_name_end - build_name_start);
-
- return true;
- }
-
- const std::string& created_by_;
- ApplicationVersion& application_version_;
-
- // For parsing.
- std::string spaces_;
- std::string digits_;
- size_t version_parsing_position_;
- size_t version_start_;
- size_t version_end_;
- std::string version_string_;
-};
-} // namespace
-
-ApplicationVersion::ApplicationVersion(const std::string& created_by) {
- ApplicationVersionParser parser(created_by, *this);
- parser.Parse();
-}
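-
-// A worked example (input taken from the format comment above): parsing
-// "parquet-cpp version 1.5.0ab-cdh5.5.0+cd (build abcd)" yields application
-// "parquet-cpp", version 1.5.0, unknown "ab", pre_release "cdh5.5.0",
-// build_info "cd", and build name "abcd".
-inline void ApplicationVersionParseExample() {
-  ApplicationVersion version("parquet-cpp version 1.5.0ab-cdh5.5.0+cd (build abcd)");
-  DCHECK_EQ(version.version.major, 1);
-  DCHECK_EQ(version.version.minor, 5);
-  DCHECK_EQ(version.version.patch, 0);
-}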
-
-bool ApplicationVersion::VersionLt(const ApplicationVersion& other_version) const {
- if (application_ != other_version.application_) return false;
-
- if (version.major < other_version.version.major) return true;
- if (version.major > other_version.version.major) return false;
- DCHECK_EQ(version.major, other_version.version.major);
- if (version.minor < other_version.version.minor) return true;
- if (version.minor > other_version.version.minor) return false;
- DCHECK_EQ(version.minor, other_version.version.minor);
- return version.patch < other_version.version.patch;
-}
-
-bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) const {
- return application_ == other_version.application_ &&
- version.major == other_version.version.major &&
- version.minor == other_version.version.minor &&
- version.patch == other_version.version.patch;
-}
-
-// Reference:
-// parquet-mr/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java
-// PARQUET-686 has more discussion on statistics
-bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
- EncodedStatistics& statistics,
- SortOrder::type sort_order) const {
- // parquet-cpp version 1.3.0 and parquet-mr 1.10.0 onwards stats are computed
- // correctly for all types
- if ((application_ == "parquet-cpp" && VersionLt(PARQUET_CPP_FIXED_STATS_VERSION())) ||
- (application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) {
-    // Only the SIGNED sort order is valid, unless max and min are equal
-    // (in which case the sort order does not matter)
- bool max_equals_min = statistics.has_min && statistics.has_max
- ? statistics.min() == statistics.max()
- : false;
- if (SortOrder::SIGNED != sort_order && !max_equals_min) {
- return false;
- }
-
- // Statistics of other types are OK
- if (col_type != Type::FIXED_LEN_BYTE_ARRAY && col_type != Type::BYTE_ARRAY) {
- return true;
- }
- }
-  // created_by may be unpopulated; parquet-mr produced such files around
-  // the time of PARQUET-251, see PARQUET-297
- if (application_ == "unknown") {
- return true;
- }
-
- // Unknown sort order has incorrect stats
- if (SortOrder::UNKNOWN == sort_order) {
- return false;
- }
-
- // PARQUET-251
- if (VersionLt(PARQUET_251_FIXED_VERSION())) {
- return false;
- }
-
- return true;
-}
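-
-// A concrete illustration of the rules above (hypothetical writer string):
-// parquet-mr 1.7.0 predates both the PARQUET-251 fix (1.8.0) and the stats
-// fix (1.10.0), so its BYTE_ARRAY statistics are rejected while its INT32
-// statistics with SIGNED sort order remain usable.
-inline void HasCorrectStatisticsExample() {
-  ApplicationVersion writer("parquet-mr version 1.7.0");
-  EncodedStatistics stats;
-  DCHECK(writer.HasCorrectStatistics(Type::INT32, stats, SortOrder::SIGNED));
-  DCHECK(!writer.HasCorrectStatistics(Type::BYTE_ARRAY, stats, SortOrder::SIGNED));
-}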
-
-// MetaData Builders
-// row-group metadata
-class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
- public:
- explicit ColumnChunkMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
- const ColumnDescriptor* column)
- : owned_column_chunk_(new format::ColumnChunk),
- properties_(std::move(props)),
- column_(column) {
- Init(owned_column_chunk_.get());
- }
-
- explicit ColumnChunkMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
- const ColumnDescriptor* column,
- format::ColumnChunk* column_chunk)
- : properties_(std::move(props)), column_(column) {
- Init(column_chunk);
- }
-
- const void* contents() const { return column_chunk_; }
-
- // column chunk
- void set_file_path(const std::string& val) { column_chunk_->__set_file_path(val); }
-
- // column metadata
- void SetStatistics(const EncodedStatistics& val) {
- column_chunk_->meta_data.__set_statistics(ToThrift(val));
- }
-
- void Finish(int64_t num_values, int64_t dictionary_page_offset,
- int64_t index_page_offset, int64_t data_page_offset,
- int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
- bool dictionary_fallback,
- const std::map<Encoding::type, int32_t>& dict_encoding_stats,
- const std::map<Encoding::type, int32_t>& data_encoding_stats,
- const std::shared_ptr<Encryptor>& encryptor) {
- if (dictionary_page_offset > 0) {
- column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset);
- column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size);
- } else {
- column_chunk_->__set_file_offset(data_page_offset + compressed_size);
- }
- column_chunk_->__isset.meta_data = true;
- column_chunk_->meta_data.__set_num_values(num_values);
- if (index_page_offset >= 0) {
- column_chunk_->meta_data.__set_index_page_offset(index_page_offset);
- }
- column_chunk_->meta_data.__set_data_page_offset(data_page_offset);
- column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size);
- column_chunk_->meta_data.__set_total_compressed_size(compressed_size);
-
- std::vector<format::Encoding::type> thrift_encodings;
- if (has_dictionary) {
- thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding()));
- if (properties_->version() == ParquetVersion::PARQUET_1_0) {
- thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
- } else {
- thrift_encodings.push_back(ToThrift(properties_->dictionary_page_encoding()));
- }
- } else { // Dictionary not enabled
- thrift_encodings.push_back(ToThrift(properties_->encoding(column_->path())));
- }
- thrift_encodings.push_back(ToThrift(Encoding::RLE));
-    // Only PLAIN encoding is supported for fallback in V1
-    // TODO(majetideepak): Use the user-specified encoding for V2
- if (dictionary_fallback) {
- thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
- }
- column_chunk_->meta_data.__set_encodings(thrift_encodings);
- std::vector<format::PageEncodingStats> thrift_encoding_stats;
- // Add dictionary page encoding stats
- for (const auto& entry : dict_encoding_stats) {
- format::PageEncodingStats dict_enc_stat;
- dict_enc_stat.__set_page_type(format::PageType::DICTIONARY_PAGE);
- dict_enc_stat.__set_encoding(ToThrift(entry.first));
- dict_enc_stat.__set_count(entry.second);
- thrift_encoding_stats.push_back(dict_enc_stat);
- }
- // Add data page encoding stats
- for (const auto& entry : data_encoding_stats) {
- format::PageEncodingStats data_enc_stat;
- data_enc_stat.__set_page_type(format::PageType::DATA_PAGE);
- data_enc_stat.__set_encoding(ToThrift(entry.first));
- data_enc_stat.__set_count(entry.second);
- thrift_encoding_stats.push_back(data_enc_stat);
- }
- column_chunk_->meta_data.__set_encoding_stats(thrift_encoding_stats);
-
- const auto& encrypt_md =
- properties_->column_encryption_properties(column_->path()->ToDotString());
- // column is encrypted
- if (encrypt_md != nullptr && encrypt_md->is_encrypted()) {
- column_chunk_->__isset.crypto_metadata = true;
- format::ColumnCryptoMetaData ccmd;
- if (encrypt_md->is_encrypted_with_footer_key()) {
- // encrypted with footer key
- ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true;
- ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey());
- } else { // encrypted with column key
- format::EncryptionWithColumnKey eck;
- eck.__set_key_metadata(encrypt_md->key_metadata());
- eck.__set_path_in_schema(column_->path()->ToDotVector());
- ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
- ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck);
- }
- column_chunk_->__set_crypto_metadata(ccmd);
-
- bool encrypted_footer =
- properties_->file_encryption_properties()->encrypted_footer();
- bool encrypt_metadata =
- !encrypted_footer || !encrypt_md->is_encrypted_with_footer_key();
- if (encrypt_metadata) {
- ThriftSerializer serializer;
-        // Serialize and encrypt the ColumnMetaData structure separately:
-        // Thrift-serialize it, encrypt it with the column key, and write the
-        // result to encrypted_column_metadata
- uint8_t* serialized_data;
- uint32_t serialized_len;
-
- serializer.SerializeToBuffer(&column_chunk_->meta_data, &serialized_len,
- &serialized_data);
-
- std::vector<uint8_t> encrypted_data(encryptor->CiphertextSizeDelta() +
- serialized_len);
- unsigned encrypted_len =
- encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data());
-
- const char* temp =
- const_cast<const char*>(reinterpret_cast<char*>(encrypted_data.data()));
- std::string encrypted_column_metadata(temp, encrypted_len);
- column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata);
-
- if (encrypted_footer) {
- column_chunk_->__isset.meta_data = false;
- } else {
- // Keep redacted metadata version for old readers
- column_chunk_->__isset.meta_data = true;
- column_chunk_->meta_data.__isset.statistics = false;
- column_chunk_->meta_data.__isset.encoding_stats = false;
- }
- }
- }
- }
-
- void WriteTo(::arrow::io::OutputStream* sink) {
- ThriftSerializer serializer;
- serializer.Serialize(column_chunk_, sink);
- }
-
- const ColumnDescriptor* descr() const { return column_; }
- int64_t total_compressed_size() const {
- return column_chunk_->meta_data.total_compressed_size;
- }
-
- private:
- void Init(format::ColumnChunk* column_chunk) {
- column_chunk_ = column_chunk;
-
- column_chunk_->meta_data.__set_type(ToThrift(column_->physical_type()));
- column_chunk_->meta_data.__set_path_in_schema(column_->path()->ToDotVector());
- column_chunk_->meta_data.__set_codec(
- ToThrift(properties_->compression(column_->path())));
- }
-
- format::ColumnChunk* column_chunk_;
- std::unique_ptr<format::ColumnChunk> owned_column_chunk_;
- const std::shared_ptr<WriterProperties> properties_;
- const ColumnDescriptor* column_;
-};
-
-std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(
- std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
- void* contents) {
- return std::unique_ptr<ColumnChunkMetaDataBuilder>(
- new ColumnChunkMetaDataBuilder(std::move(props), column, contents));
-}
-
-std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(
- std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column) {
- return std::unique_ptr<ColumnChunkMetaDataBuilder>(
- new ColumnChunkMetaDataBuilder(std::move(props), column));
-}
-
-ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
- std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column)
- : impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
- new ColumnChunkMetaDataBuilderImpl(std::move(props), column))} {}
-
-ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
- std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
- void* contents)
- : impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
- new ColumnChunkMetaDataBuilderImpl(
- std::move(props), column,
- reinterpret_cast<format::ColumnChunk*>(contents)))} {}
-
-ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() = default;
-
-const void* ColumnChunkMetaDataBuilder::contents() const { return impl_->contents(); }
-
-void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) {
- impl_->set_file_path(path);
-}
-
-void ColumnChunkMetaDataBuilder::Finish(
- int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset,
- int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size,
- bool has_dictionary, bool dictionary_fallback,
- const std::map<Encoding::type, int32_t>& dict_encoding_stats,
- const std::map<Encoding::type, int32_t>& data_encoding_stats,
- const std::shared_ptr<Encryptor>& encryptor) {
- impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset,
- compressed_size, uncompressed_size, has_dictionary, dictionary_fallback,
- dict_encoding_stats, data_encoding_stats, encryptor);
-}
-
-void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) {
- impl_->WriteTo(sink);
-}
-
-const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const {
- return impl_->descr();
-}
-
-void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) {
- impl_->SetStatistics(result);
-}
-
-int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const {
- return impl_->total_compressed_size();
-}
-
-class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
- public:
- explicit RowGroupMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
- const SchemaDescriptor* schema, void* contents)
- : properties_(std::move(props)), schema_(schema), next_column_(0) {
- row_group_ = reinterpret_cast<format::RowGroup*>(contents);
- InitializeColumns(schema->num_columns());
- }
-
- ColumnChunkMetaDataBuilder* NextColumnChunk() {
- if (!(next_column_ < num_columns())) {
- std::stringstream ss;
- ss << "The schema only has " << num_columns()
- << " columns, requested metadata for column: " << next_column_;
- throw ParquetException(ss.str());
- }
- auto column = schema_->Column(next_column_);
- auto column_builder = ColumnChunkMetaDataBuilder::Make(
- properties_, column, &row_group_->columns[next_column_++]);
- auto column_builder_ptr = column_builder.get();
- column_builders_.push_back(std::move(column_builder));
- return column_builder_ptr;
- }
-
- int current_column() { return next_column_ - 1; }
-
- void Finish(int64_t total_bytes_written, int16_t row_group_ordinal) {
- if (!(next_column_ == schema_->num_columns())) {
- std::stringstream ss;
- ss << "Only " << next_column_ - 1 << " out of " << schema_->num_columns()
- << " columns are initialized";
- throw ParquetException(ss.str());
- }
-
- int64_t file_offset = 0;
- int64_t total_compressed_size = 0;
- for (int i = 0; i < schema_->num_columns(); i++) {
- if (!(row_group_->columns[i].file_offset >= 0)) {
- std::stringstream ss;
- ss << "Column " << i << " is not complete.";
- throw ParquetException(ss.str());
- }
- if (i == 0) {
- file_offset = row_group_->columns[0].file_offset;
- }
- // sometimes column metadata is encrypted and not available to read,
- // so we must get total_compressed_size from column builder
- total_compressed_size += column_builders_[i]->total_compressed_size();
- }
-
- row_group_->__set_file_offset(file_offset);
- row_group_->__set_total_compressed_size(total_compressed_size);
- row_group_->__set_total_byte_size(total_bytes_written);
- row_group_->__set_ordinal(row_group_ordinal);
- }
-
- void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; }
-
- int num_columns() { return static_cast<int>(row_group_->columns.size()); }
-
- int64_t num_rows() { return row_group_->num_rows; }
-
- private:
- void InitializeColumns(int ncols) { row_group_->columns.resize(ncols); }
-
- format::RowGroup* row_group_;
- const std::shared_ptr<WriterProperties> properties_;
- const SchemaDescriptor* schema_;
- std::vector<std::unique_ptr<ColumnChunkMetaDataBuilder>> column_builders_;
- int next_column_;
-};
-
-std::unique_ptr<RowGroupMetaDataBuilder> RowGroupMetaDataBuilder::Make(
- std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
- void* contents) {
- return std::unique_ptr<RowGroupMetaDataBuilder>(
- new RowGroupMetaDataBuilder(std::move(props), schema_, contents));
-}
-
-RowGroupMetaDataBuilder::RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
- const SchemaDescriptor* schema_,
- void* contents)
- : impl_{new RowGroupMetaDataBuilderImpl(std::move(props), schema_, contents)} {}
-
-RowGroupMetaDataBuilder::~RowGroupMetaDataBuilder() = default;
-
-ColumnChunkMetaDataBuilder* RowGroupMetaDataBuilder::NextColumnChunk() {
- return impl_->NextColumnChunk();
-}
-
-int RowGroupMetaDataBuilder::current_column() const { return impl_->current_column(); }
-
-int RowGroupMetaDataBuilder::num_columns() { return impl_->num_columns(); }
-
-int64_t RowGroupMetaDataBuilder::num_rows() { return impl_->num_rows(); }
-
-void RowGroupMetaDataBuilder::set_num_rows(int64_t num_rows) {
- impl_->set_num_rows(num_rows);
-}
-
-void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written,
- int16_t row_group_ordinal) {
- impl_->Finish(total_bytes_written, row_group_ordinal);
-}
-
-// file metadata
-// TODO(PARQUET-595) Support key_value_metadata
-class FileMetaDataBuilder::FileMetaDataBuilderImpl {
- public:
- explicit FileMetaDataBuilderImpl(
- const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata)
- : metadata_(new format::FileMetaData()),
- properties_(std::move(props)),
- schema_(schema),
- key_value_metadata_(std::move(key_value_metadata)) {
- if (properties_->file_encryption_properties() != nullptr &&
- properties_->file_encryption_properties()->encrypted_footer()) {
- crypto_metadata_.reset(new format::FileCryptoMetaData());
- }
- }
-
- RowGroupMetaDataBuilder* AppendRowGroup() {
- row_groups_.emplace_back();
- current_row_group_builder_ =
- RowGroupMetaDataBuilder::Make(properties_, schema_, &row_groups_.back());
- return current_row_group_builder_.get();
- }
-
- std::unique_ptr<FileMetaData> Finish() {
- int64_t total_rows = 0;
- for (auto row_group : row_groups_) {
- total_rows += row_group.num_rows;
- }
- metadata_->__set_num_rows(total_rows);
- metadata_->__set_row_groups(row_groups_);
-
- if (key_value_metadata_) {
- metadata_->key_value_metadata.clear();
- metadata_->key_value_metadata.reserve(key_value_metadata_->size());
- for (int64_t i = 0; i < key_value_metadata_->size(); ++i) {
- format::KeyValue kv_pair;
- kv_pair.__set_key(key_value_metadata_->key(i));
- kv_pair.__set_value(key_value_metadata_->value(i));
- metadata_->key_value_metadata.push_back(kv_pair);
- }
- metadata_->__isset.key_value_metadata = true;
- }
-
- int32_t file_version = 0;
- switch (properties_->version()) {
- case ParquetVersion::PARQUET_1_0:
- file_version = 1;
- break;
- case ParquetVersion::PARQUET_2_0:
- file_version = 2;
- break;
- default:
- break;
- }
- metadata_->__set_version(file_version);
- metadata_->__set_created_by(properties_->created_by());
-
-    // Users cannot set the `ColumnOrder` since we do not have user-defined
-    // sort orders in the spec yet.
-    // We always default to `TYPE_DEFINED_ORDER`; we can expose it in the API
-    // once user-defined sort orders land in the Parquet format.
-    // TypeDefinedOrder means the SortOrder is chosen based on ConvertedType/PhysicalType.
- format::TypeDefinedOrder type_defined_order;
- format::ColumnOrder column_order;
- column_order.__set_TYPE_ORDER(type_defined_order);
- column_order.__isset.TYPE_ORDER = true;
- metadata_->column_orders.resize(schema_->num_columns(), column_order);
- metadata_->__isset.column_orders = true;
-
- // if plaintext footer, set footer signing algorithm
- auto file_encryption_properties = properties_->file_encryption_properties();
- if (file_encryption_properties && !file_encryption_properties->encrypted_footer()) {
- EncryptionAlgorithm signing_algorithm;
- EncryptionAlgorithm algo = file_encryption_properties->algorithm();
- signing_algorithm.aad.aad_file_unique = algo.aad.aad_file_unique;
- signing_algorithm.aad.supply_aad_prefix = algo.aad.supply_aad_prefix;
- if (!algo.aad.supply_aad_prefix) {
- signing_algorithm.aad.aad_prefix = algo.aad.aad_prefix;
- }
- signing_algorithm.algorithm = ParquetCipher::AES_GCM_V1;
-
- metadata_->__set_encryption_algorithm(ToThrift(signing_algorithm));
- const std::string& footer_signing_key_metadata =
- file_encryption_properties->footer_key_metadata();
- if (footer_signing_key_metadata.size() > 0) {
- metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata);
- }
- }
-
- ToParquet(static_cast<parquet::schema::GroupNode*>(schema_->schema_root().get()),
- &metadata_->schema);
- auto file_meta_data = std::unique_ptr<FileMetaData>(new FileMetaData());
- file_meta_data->impl_->metadata_ = std::move(metadata_);
- file_meta_data->impl_->InitSchema();
- file_meta_data->impl_->InitKeyValueMetadata();
- return file_meta_data;
- }
-
- std::unique_ptr<FileCryptoMetaData> BuildFileCryptoMetaData() {
- if (crypto_metadata_ == nullptr) {
- return nullptr;
- }
-
- auto file_encryption_properties = properties_->file_encryption_properties();
-
- crypto_metadata_->__set_encryption_algorithm(
- ToThrift(file_encryption_properties->algorithm()));
- std::string key_metadata = file_encryption_properties->footer_key_metadata();
-
- if (!key_metadata.empty()) {
- crypto_metadata_->__set_key_metadata(key_metadata);
- }
-
- std::unique_ptr<FileCryptoMetaData> file_crypto_metadata =
- std::unique_ptr<FileCryptoMetaData>(new FileCryptoMetaData());
- file_crypto_metadata->impl_->metadata_ = std::move(crypto_metadata_);
-
- return file_crypto_metadata;
- }
-
- protected:
- std::unique_ptr<format::FileMetaData> metadata_;
- std::unique_ptr<format::FileCryptoMetaData> crypto_metadata_;
-
- private:
- const std::shared_ptr<WriterProperties> properties_;
- std::vector<format::RowGroup> row_groups_;
-
- std::unique_ptr<RowGroupMetaDataBuilder> current_row_group_builder_;
- const SchemaDescriptor* schema_;
- std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
-};
-
-std::unique_ptr<FileMetaDataBuilder> FileMetaDataBuilder::Make(
- const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
- return std::unique_ptr<FileMetaDataBuilder>(
- new FileMetaDataBuilder(schema, std::move(props), std::move(key_value_metadata)));
-}
-
-FileMetaDataBuilder::FileMetaDataBuilder(
- const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata)
- : impl_{std::unique_ptr<FileMetaDataBuilderImpl>(new FileMetaDataBuilderImpl(
- schema, std::move(props), std::move(key_value_metadata)))} {}
-
-FileMetaDataBuilder::~FileMetaDataBuilder() = default;
-
-RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() {
- return impl_->AppendRowGroup();
-}
-
-std::unique_ptr<FileMetaData> FileMetaDataBuilder::Finish() { return impl_->Finish(); }
-
-std::unique_ptr<FileCryptoMetaData> FileMetaDataBuilder::GetCryptoMetaData() {
- return impl_->BuildFileCryptoMetaData();
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/metadata.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/io/memory.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/string_view.h"
+#include "parquet/encryption/encryption_internal.h"
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/exception.h"
+#include "parquet/schema.h"
+#include "parquet/schema_internal.h"
+#include "parquet/statistics.h"
+#include "parquet/thrift_internal.h"
+
+namespace parquet {
+
+const ApplicationVersion& ApplicationVersion::PARQUET_251_FIXED_VERSION() {
+ static ApplicationVersion version("parquet-mr", 1, 8, 0);
+ return version;
+}
+
+const ApplicationVersion& ApplicationVersion::PARQUET_816_FIXED_VERSION() {
+ static ApplicationVersion version("parquet-mr", 1, 2, 9);
+ return version;
+}
+
+const ApplicationVersion& ApplicationVersion::PARQUET_CPP_FIXED_STATS_VERSION() {
+ static ApplicationVersion version("parquet-cpp", 1, 3, 0);
+ return version;
+}
+
+const ApplicationVersion& ApplicationVersion::PARQUET_MR_FIXED_STATS_VERSION() {
+ static ApplicationVersion version("parquet-mr", 1, 10, 0);
+ return version;
+}
+
+std::string ParquetVersionToString(ParquetVersion::type ver) {
+ switch (ver) {
+ case ParquetVersion::PARQUET_1_0:
+ return "1.0";
+ case ParquetVersion::PARQUET_2_0:
+ return "2.0";
+ }
+
+ // This should be unreachable
+ return "UNKNOWN";
+}
+
+template <typename DType>
+static std::shared_ptr<Statistics> MakeTypedColumnStats(
+ const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
+  // If the column order is TYPE_DEFINED_ORDER, use max_value and min_value
+ if (descr->column_order().get_order() == ColumnOrder::TYPE_DEFINED_ORDER) {
+ return MakeStatistics<DType>(
+ descr, metadata.statistics.min_value, metadata.statistics.max_value,
+ metadata.num_values - metadata.statistics.null_count,
+ metadata.statistics.null_count, metadata.statistics.distinct_count,
+ metadata.statistics.__isset.max_value || metadata.statistics.__isset.min_value,
+ metadata.statistics.__isset.null_count,
+ metadata.statistics.__isset.distinct_count);
+ }
+ // Default behavior
+ return MakeStatistics<DType>(
+ descr, metadata.statistics.min, metadata.statistics.max,
+ metadata.num_values - metadata.statistics.null_count,
+ metadata.statistics.null_count, metadata.statistics.distinct_count,
+ metadata.statistics.__isset.max || metadata.statistics.__isset.min,
+ metadata.statistics.__isset.null_count, metadata.statistics.__isset.distinct_count);
+}
+
+std::shared_ptr<Statistics> MakeColumnStats(const format::ColumnMetaData& meta_data,
+ const ColumnDescriptor* descr) {
+ switch (static_cast<Type::type>(meta_data.type)) {
+ case Type::BOOLEAN:
+ return MakeTypedColumnStats<BooleanType>(meta_data, descr);
+ case Type::INT32:
+ return MakeTypedColumnStats<Int32Type>(meta_data, descr);
+ case Type::INT64:
+ return MakeTypedColumnStats<Int64Type>(meta_data, descr);
+ case Type::INT96:
+ return MakeTypedColumnStats<Int96Type>(meta_data, descr);
+ case Type::DOUBLE:
+ return MakeTypedColumnStats<DoubleType>(meta_data, descr);
+ case Type::FLOAT:
+ return MakeTypedColumnStats<FloatType>(meta_data, descr);
+ case Type::BYTE_ARRAY:
+ return MakeTypedColumnStats<ByteArrayType>(meta_data, descr);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return MakeTypedColumnStats<FLBAType>(meta_data, descr);
+ case Type::UNDEFINED:
+ break;
+ }
+ throw ParquetException("Can't decode page statistics for selected column type");
+}
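+
+// Illustrative sketch (not part of the original file; `chunk` and `descr` are
+// hypothetical variables): decoding statistics for a column chunk dispatches
+// on the physical type, e.g. Type::INT32 -> MakeTypedColumnStats<Int32Type>:
+//
+//   std::shared_ptr<Statistics> stats = MakeColumnStats(chunk.meta_data, descr);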
+
+// MetaData Accessor
+
+// ColumnCryptoMetaData
+class ColumnCryptoMetaData::ColumnCryptoMetaDataImpl {
+ public:
+ explicit ColumnCryptoMetaDataImpl(const format::ColumnCryptoMetaData* crypto_metadata)
+ : crypto_metadata_(crypto_metadata) {}
+
+ bool encrypted_with_footer_key() const {
+ return crypto_metadata_->__isset.ENCRYPTION_WITH_FOOTER_KEY;
+ }
+ bool encrypted_with_column_key() const {
+ return crypto_metadata_->__isset.ENCRYPTION_WITH_COLUMN_KEY;
+ }
+ std::shared_ptr<schema::ColumnPath> path_in_schema() const {
+ return std::make_shared<schema::ColumnPath>(
+ crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.path_in_schema);
+ }
+ const std::string& key_metadata() const {
+ return crypto_metadata_->ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
+ }
+
+ private:
+ const format::ColumnCryptoMetaData* crypto_metadata_;
+};
+
+std::unique_ptr<ColumnCryptoMetaData> ColumnCryptoMetaData::Make(
+ const uint8_t* metadata) {
+ return std::unique_ptr<ColumnCryptoMetaData>(new ColumnCryptoMetaData(metadata));
+}
+
+ColumnCryptoMetaData::ColumnCryptoMetaData(const uint8_t* metadata)
+ : impl_(new ColumnCryptoMetaDataImpl(
+ reinterpret_cast<const format::ColumnCryptoMetaData*>(metadata))) {}
+
+ColumnCryptoMetaData::~ColumnCryptoMetaData() = default;
+
+std::shared_ptr<schema::ColumnPath> ColumnCryptoMetaData::path_in_schema() const {
+ return impl_->path_in_schema();
+}
+bool ColumnCryptoMetaData::encrypted_with_footer_key() const {
+ return impl_->encrypted_with_footer_key();
+}
+const std::string& ColumnCryptoMetaData::key_metadata() const {
+ return impl_->key_metadata();
+}
+
+// ColumnChunk metadata
+class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
+ public:
+ explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column,
+ const ColumnDescriptor* descr,
+ int16_t row_group_ordinal, int16_t column_ordinal,
+ const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : column_(column), descr_(descr), writer_version_(writer_version) {
+ column_metadata_ = &column->meta_data;
+ if (column->__isset.crypto_metadata) { // column metadata is encrypted
+ format::ColumnCryptoMetaData ccmd = column->crypto_metadata;
+
+ if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) {
+ if (file_decryptor != nullptr && file_decryptor->properties() != nullptr) {
+ // should decrypt metadata
+ std::shared_ptr<schema::ColumnPath> path = std::make_shared<schema::ColumnPath>(
+ ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema);
+ std::string key_metadata = ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
+
+ std::string aad_column_metadata = encryption::CreateModuleAad(
+ file_decryptor->file_aad(), encryption::kColumnMetaData, row_group_ordinal,
+ column_ordinal, static_cast<int16_t>(-1));
+ auto decryptor = file_decryptor->GetColumnMetaDecryptor(
+ path->ToDotString(), key_metadata, aad_column_metadata);
+ auto len = static_cast<uint32_t>(column->encrypted_column_metadata.size());
+ DeserializeThriftMsg(
+ reinterpret_cast<const uint8_t*>(column->encrypted_column_metadata.c_str()),
+ &len, &decrypted_metadata_, decryptor);
+ column_metadata_ = &decrypted_metadata_;
+ } else {
+ throw ParquetException(
+ "Cannot decrypt ColumnMetadata."
+              " FileDecryption is not set up correctly");
+ }
+ }
+ }
+ for (const auto& encoding : column_metadata_->encodings) {
+ encodings_.push_back(LoadEnumSafe(&encoding));
+ }
+ for (const auto& encoding_stats : column_metadata_->encoding_stats) {
+ encoding_stats_.push_back({LoadEnumSafe(&encoding_stats.page_type),
+ LoadEnumSafe(&encoding_stats.encoding),
+ encoding_stats.count});
+ }
+ possible_stats_ = nullptr;
+ }
+
+ bool Equals(const ColumnChunkMetaDataImpl& other) const {
+ return *column_metadata_ == *other.column_metadata_;
+ }
+
+ // column chunk
+ inline int64_t file_offset() const { return column_->file_offset; }
+ inline const std::string& file_path() const { return column_->file_path; }
+
+ inline Type::type type() const { return LoadEnumSafe(&column_metadata_->type); }
+
+ inline int64_t num_values() const { return column_metadata_->num_values; }
+
+ std::shared_ptr<schema::ColumnPath> path_in_schema() {
+ return std::make_shared<schema::ColumnPath>(column_metadata_->path_in_schema);
+ }
+
+ // Check if statistics are set and are valid
+ // 1) Must be set in the metadata
+ // 2) Statistics must not be corrupted
+ inline bool is_stats_set() const {
+ DCHECK(writer_version_ != nullptr);
+ // If the column statistics don't exist or column sort order is unknown
+ // we cannot use the column stats
+ if (!column_metadata_->__isset.statistics ||
+ descr_->sort_order() == SortOrder::UNKNOWN) {
+ return false;
+ }
+ if (possible_stats_ == nullptr) {
+ possible_stats_ = MakeColumnStats(*column_metadata_, descr_);
+ }
+ EncodedStatistics encodedStatistics = possible_stats_->Encode();
+ return writer_version_->HasCorrectStatistics(type(), encodedStatistics,
+ descr_->sort_order());
+ }
+
+ inline std::shared_ptr<Statistics> statistics() const {
+ return is_stats_set() ? possible_stats_ : nullptr;
+ }
+
+ inline Compression::type compression() const {
+ return LoadEnumSafe(&column_metadata_->codec);
+ }
+
+ const std::vector<Encoding::type>& encodings() const { return encodings_; }
+
+ const std::vector<PageEncodingStats>& encoding_stats() const { return encoding_stats_; }
+
+ inline bool has_dictionary_page() const {
+ return column_metadata_->__isset.dictionary_page_offset;
+ }
+
+ inline int64_t dictionary_page_offset() const {
+ return column_metadata_->dictionary_page_offset;
+ }
+
+ inline int64_t data_page_offset() const { return column_metadata_->data_page_offset; }
+
+ inline bool has_index_page() const {
+ return column_metadata_->__isset.index_page_offset;
+ }
+
+ inline int64_t index_page_offset() const { return column_metadata_->index_page_offset; }
+
+ inline int64_t total_compressed_size() const {
+ return column_metadata_->total_compressed_size;
+ }
+
+ inline int64_t total_uncompressed_size() const {
+ return column_metadata_->total_uncompressed_size;
+ }
+
+ inline std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const {
+ if (column_->__isset.crypto_metadata) {
+ return ColumnCryptoMetaData::Make(
+ reinterpret_cast<const uint8_t*>(&column_->crypto_metadata));
+ } else {
+ return nullptr;
+ }
+ }
+
+ private:
+ mutable std::shared_ptr<Statistics> possible_stats_;
+ std::vector<Encoding::type> encodings_;
+ std::vector<PageEncodingStats> encoding_stats_;
+ const format::ColumnChunk* column_;
+ const format::ColumnMetaData* column_metadata_;
+ format::ColumnMetaData decrypted_metadata_;
+ const ColumnDescriptor* descr_;
+ const ApplicationVersion* writer_version_;
+};
+
+std::unique_ptr<ColumnChunkMetaData> ColumnChunkMetaData::Make(
+ const void* metadata, const ColumnDescriptor* descr,
+ const ApplicationVersion* writer_version, int16_t row_group_ordinal,
+ int16_t column_ordinal, std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ return std::unique_ptr<ColumnChunkMetaData>(
+ new ColumnChunkMetaData(metadata, descr, row_group_ordinal, column_ordinal,
+ writer_version, std::move(file_decryptor)));
+}
+
+ColumnChunkMetaData::ColumnChunkMetaData(
+ const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
+ int16_t column_ordinal, const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : impl_{new ColumnChunkMetaDataImpl(
+ reinterpret_cast<const format::ColumnChunk*>(metadata), descr,
+ row_group_ordinal, column_ordinal, writer_version, std::move(file_decryptor))} {
+}
+
+ColumnChunkMetaData::~ColumnChunkMetaData() = default;
+
+// column chunk
+int64_t ColumnChunkMetaData::file_offset() const { return impl_->file_offset(); }
+
+const std::string& ColumnChunkMetaData::file_path() const { return impl_->file_path(); }
+
+Type::type ColumnChunkMetaData::type() const { return impl_->type(); }
+
+int64_t ColumnChunkMetaData::num_values() const { return impl_->num_values(); }
+
+std::shared_ptr<schema::ColumnPath> ColumnChunkMetaData::path_in_schema() const {
+ return impl_->path_in_schema();
+}
+
+std::shared_ptr<Statistics> ColumnChunkMetaData::statistics() const {
+ return impl_->statistics();
+}
+
+bool ColumnChunkMetaData::is_stats_set() const { return impl_->is_stats_set(); }
+
+bool ColumnChunkMetaData::has_dictionary_page() const {
+ return impl_->has_dictionary_page();
+}
+
+int64_t ColumnChunkMetaData::dictionary_page_offset() const {
+ return impl_->dictionary_page_offset();
+}
+
+int64_t ColumnChunkMetaData::data_page_offset() const {
+ return impl_->data_page_offset();
+}
+
+bool ColumnChunkMetaData::has_index_page() const { return impl_->has_index_page(); }
+
+int64_t ColumnChunkMetaData::index_page_offset() const {
+ return impl_->index_page_offset();
+}
+
+Compression::type ColumnChunkMetaData::compression() const {
+ return impl_->compression();
+}
+
+bool ColumnChunkMetaData::can_decompress() const {
+ return ::arrow::util::Codec::IsAvailable(compression());
+}
+
+const std::vector<Encoding::type>& ColumnChunkMetaData::encodings() const {
+ return impl_->encodings();
+}
+
+const std::vector<PageEncodingStats>& ColumnChunkMetaData::encoding_stats() const {
+ return impl_->encoding_stats();
+}
+
+int64_t ColumnChunkMetaData::total_uncompressed_size() const {
+ return impl_->total_uncompressed_size();
+}
+
+int64_t ColumnChunkMetaData::total_compressed_size() const {
+ return impl_->total_compressed_size();
+}
+
+std::unique_ptr<ColumnCryptoMetaData> ColumnChunkMetaData::crypto_metadata() const {
+ return impl_->crypto_metadata();
+}
+
+bool ColumnChunkMetaData::Equals(const ColumnChunkMetaData& other) const {
+ return impl_->Equals(*other.impl_);
+}
+
+// row-group metadata
+class RowGroupMetaData::RowGroupMetaDataImpl {
+ public:
+ explicit RowGroupMetaDataImpl(const format::RowGroup* row_group,
+ const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : row_group_(row_group),
+ schema_(schema),
+ writer_version_(writer_version),
+ file_decryptor_(std::move(file_decryptor)) {}
+
+ bool Equals(const RowGroupMetaDataImpl& other) const {
+ return *row_group_ == *other.row_group_;
+ }
+
+ inline int num_columns() const { return static_cast<int>(row_group_->columns.size()); }
+
+ inline int64_t num_rows() const { return row_group_->num_rows; }
+
+ inline int64_t total_byte_size() const { return row_group_->total_byte_size; }
+
+ inline int64_t total_compressed_size() const {
+ return row_group_->total_compressed_size;
+ }
+
+ inline int64_t file_offset() const { return row_group_->file_offset; }
+
+ inline const SchemaDescriptor* schema() const { return schema_; }
+
+ std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) {
+ if (i < num_columns()) {
+ return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i),
+ writer_version_, row_group_->ordinal,
+ static_cast<int16_t>(i), file_decryptor_);
+ }
+ throw ParquetException("The file only has ", num_columns(),
+ " columns, requested metadata for column: ", i);
+ }
+
+ private:
+ const format::RowGroup* row_group_;
+ const SchemaDescriptor* schema_;
+ const ApplicationVersion* writer_version_;
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
+};
+
+std::unique_ptr<RowGroupMetaData> RowGroupMetaData::Make(
+ const void* metadata, const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ return std::unique_ptr<RowGroupMetaData>(
+ new RowGroupMetaData(metadata, schema, writer_version, std::move(file_decryptor)));
+}
+
+RowGroupMetaData::RowGroupMetaData(const void* metadata, const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : impl_{new RowGroupMetaDataImpl(reinterpret_cast<const format::RowGroup*>(metadata),
+ schema, writer_version, std::move(file_decryptor))} {
+}
+
+RowGroupMetaData::~RowGroupMetaData() = default;
+
+bool RowGroupMetaData::Equals(const RowGroupMetaData& other) const {
+ return impl_->Equals(*other.impl_);
+}
+
+int RowGroupMetaData::num_columns() const { return impl_->num_columns(); }
+
+int64_t RowGroupMetaData::num_rows() const { return impl_->num_rows(); }
+
+int64_t RowGroupMetaData::total_byte_size() const { return impl_->total_byte_size(); }
+
+int64_t RowGroupMetaData::total_compressed_size() const {
+ return impl_->total_compressed_size();
+}
+
+int64_t RowGroupMetaData::file_offset() const { return impl_->file_offset(); }
+
+const SchemaDescriptor* RowGroupMetaData::schema() const { return impl_->schema(); }
+
+std::unique_ptr<ColumnChunkMetaData> RowGroupMetaData::ColumnChunk(int i) const {
+ return impl_->ColumnChunk(i);
+}
+
+bool RowGroupMetaData::can_decompress() const {
+ int n_columns = num_columns();
+ for (int i = 0; i < n_columns; i++) {
+ if (!ColumnChunk(i)->can_decompress()) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// file metadata
+class FileMetaData::FileMetaDataImpl {
+ public:
+ FileMetaDataImpl() = default;
+
+ explicit FileMetaDataImpl(
+ const void* metadata, uint32_t* metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = nullptr)
+ : file_decryptor_(file_decryptor) {
+ metadata_.reset(new format::FileMetaData);
+
+ auto footer_decryptor =
+ file_decryptor_ != nullptr ? file_decryptor->GetFooterDecryptor() : nullptr;
+
+ DeserializeThriftMsg(reinterpret_cast<const uint8_t*>(metadata), metadata_len,
+ metadata_.get(), footer_decryptor);
+ metadata_len_ = *metadata_len;
+
+ if (metadata_->__isset.created_by) {
+ writer_version_ = ApplicationVersion(metadata_->created_by);
+ } else {
+ writer_version_ = ApplicationVersion("unknown 0.0.0");
+ }
+
+ InitSchema();
+ InitColumnOrders();
+ InitKeyValueMetadata();
+ }
+
+ bool VerifySignature(const void* signature) {
+ // verify decryption properties are set
+ if (file_decryptor_ == nullptr) {
+      throw ParquetException("Decryption not set properly. Cannot verify signature");
+ }
+ // serialize the footer
+ uint8_t* serialized_data;
+ uint32_t serialized_len = metadata_len_;
+ ThriftSerializer serializer;
+ serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data);
+
+ // encrypt with nonce
+ auto nonce = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(signature));
+ auto tag = const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(signature)) +
+ encryption::kNonceLength;
+
+ std::string key = file_decryptor_->GetFooterKey();
+ std::string aad = encryption::CreateFooterAad(file_decryptor_->file_aad());
+
+ auto aes_encryptor = encryption::AesEncryptor::Make(
+ file_decryptor_->algorithm(), static_cast<int>(key.size()), true, nullptr);
+
+ std::shared_ptr<Buffer> encrypted_buffer = std::static_pointer_cast<ResizableBuffer>(
+ AllocateBuffer(file_decryptor_->pool(),
+ aes_encryptor->CiphertextSizeDelta() + serialized_len));
+ uint32_t encrypted_len = aes_encryptor->SignedFooterEncrypt(
+ serialized_data, serialized_len, str2bytes(key), static_cast<int>(key.size()),
+ str2bytes(aad), static_cast<int>(aad.size()), nonce,
+ encrypted_buffer->mutable_data());
+ // Delete AES encryptor object. It was created only to verify the footer signature.
+ aes_encryptor->WipeOut();
+ delete aes_encryptor;
+ return 0 ==
+ memcmp(encrypted_buffer->data() + encrypted_len - encryption::kGcmTagLength,
+ tag, encryption::kGcmTagLength);
+ }
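+
+  // Layout note (inferred from the code above, stated as an assumption): the
+  // `signature` buffer holds the AES-GCM nonce (encryption::kNonceLength
+  // bytes) immediately followed by the GCM tag (encryption::kGcmTagLength
+  // bytes); verification re-encrypts the serialized footer with that nonce
+  // and compares the freshly computed tag against the stored one.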
+
+ inline uint32_t size() const { return metadata_len_; }
+ inline int num_columns() const { return schema_.num_columns(); }
+ inline int64_t num_rows() const { return metadata_->num_rows; }
+ inline int num_row_groups() const {
+ return static_cast<int>(metadata_->row_groups.size());
+ }
+ inline int32_t version() const { return metadata_->version; }
+ inline const std::string& created_by() const { return metadata_->created_by; }
+ inline int num_schema_elements() const {
+ return static_cast<int>(metadata_->schema.size());
+ }
+
+ inline bool is_encryption_algorithm_set() const {
+ return metadata_->__isset.encryption_algorithm;
+ }
+ inline EncryptionAlgorithm encryption_algorithm() {
+ return FromThrift(metadata_->encryption_algorithm);
+ }
+ inline const std::string& footer_signing_key_metadata() {
+ return metadata_->footer_signing_key_metadata;
+ }
+
+ const ApplicationVersion& writer_version() const { return writer_version_; }
+
+ void WriteTo(::arrow::io::OutputStream* dst,
+ const std::shared_ptr<Encryptor>& encryptor) const {
+ ThriftSerializer serializer;
+    // The encryption_algorithm field is only set in the footer of encrypted
+    // files that use plaintext footers
+ if (is_encryption_algorithm_set()) {
+ uint8_t* serialized_data;
+ uint32_t serialized_len;
+ serializer.SerializeToBuffer(metadata_.get(), &serialized_len, &serialized_data);
+
+      // encrypt the serialized footer to produce the signature
+ std::vector<uint8_t> encrypted_data(encryptor->CiphertextSizeDelta() +
+ serialized_len);
+ unsigned encrypted_len =
+ encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data());
+
+ // write unencrypted footer
+ PARQUET_THROW_NOT_OK(dst->Write(serialized_data, serialized_len));
+ // Write signature (nonce and tag)
+ PARQUET_THROW_NOT_OK(
+ dst->Write(encrypted_data.data() + 4, encryption::kNonceLength));
+ PARQUET_THROW_NOT_OK(
+ dst->Write(encrypted_data.data() + encrypted_len - encryption::kGcmTagLength,
+ encryption::kGcmTagLength));
+ } else { // either plaintext file (when encryptor is null)
+ // or encrypted file with encrypted footer
+ serializer.Serialize(metadata_.get(), dst, encryptor);
+ }
+ }
+
+ std::unique_ptr<RowGroupMetaData> RowGroup(int i) {
+ if (!(i < num_row_groups())) {
+ std::stringstream ss;
+ ss << "The file only has " << num_row_groups()
+ << " row groups, requested metadata for row group: " << i;
+ throw ParquetException(ss.str());
+ }
+ return RowGroupMetaData::Make(&metadata_->row_groups[i], &schema_, &writer_version_,
+ file_decryptor_);
+ }
+
+ bool Equals(const FileMetaDataImpl& other) const {
+ return *metadata_ == *other.metadata_;
+ }
+
+ const SchemaDescriptor* schema() const { return &schema_; }
+
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
+ return key_value_metadata_;
+ }
+
+ void set_file_path(const std::string& path) {
+ for (format::RowGroup& row_group : metadata_->row_groups) {
+ for (format::ColumnChunk& chunk : row_group.columns) {
+ chunk.__set_file_path(path);
+ }
+ }
+ }
+
+ format::RowGroup& row_group(int i) {
+ DCHECK_LT(i, num_row_groups());
+ return metadata_->row_groups[i];
+ }
+
+ void AppendRowGroups(const std::unique_ptr<FileMetaDataImpl>& other) {
+ if (!schema()->Equals(*other->schema())) {
+ throw ParquetException("AppendRowGroups requires equal schemas.");
+ }
+
+ format::RowGroup other_rg;
+ for (int i = 0; i < other->num_row_groups(); i++) {
+ other_rg = other->row_group(i);
+ metadata_->row_groups.push_back(other_rg);
+ metadata_->num_rows += other_rg.num_rows;
+ }
+ }
+
+ std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) {
+ for (int i : row_groups) {
+ if (i < num_row_groups()) continue;
+
+ throw ParquetException(
+ "The file only has ", num_row_groups(),
+ " row groups, but requested a subset including row group: ", i);
+ }
+
+ std::shared_ptr<FileMetaData> out(new FileMetaData());
+ out->impl_.reset(new FileMetaDataImpl());
+ out->impl_->metadata_.reset(new format::FileMetaData());
+
+ auto metadata = out->impl_->metadata_.get();
+ metadata->version = metadata_->version;
+ metadata->schema = metadata_->schema;
+
+ metadata->row_groups.resize(row_groups.size());
+ int i = 0;
+ for (int selected_index : row_groups) {
+ metadata->num_rows += row_group(selected_index).num_rows;
+ metadata->row_groups[i++] = row_group(selected_index);
+ }
+
+ metadata->key_value_metadata = metadata_->key_value_metadata;
+ metadata->created_by = metadata_->created_by;
+ metadata->column_orders = metadata_->column_orders;
+ metadata->encryption_algorithm = metadata_->encryption_algorithm;
+ metadata->footer_signing_key_metadata = metadata_->footer_signing_key_metadata;
+ metadata->__isset = metadata_->__isset;
+
+ out->impl_->schema_ = schema_;
+ out->impl_->writer_version_ = writer_version_;
+ out->impl_->key_value_metadata_ = key_value_metadata_;
+ out->impl_->file_decryptor_ = file_decryptor_;
+
+ return out;
+ }
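+
+  // Usage sketch (illustrative assumption): for a file with three row groups,
+  // Subset({2, 0}) returns metadata containing row groups 2 and 0, in that
+  // order, with num_rows recomputed from the selected groups only.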
+
+ void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ file_decryptor_ = file_decryptor;
+ }
+
+ private:
+ friend FileMetaDataBuilder;
+ uint32_t metadata_len_ = 0;
+ std::unique_ptr<format::FileMetaData> metadata_;
+ SchemaDescriptor schema_;
+ ApplicationVersion writer_version_;
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
+ std::shared_ptr<InternalFileDecryptor> file_decryptor_;
+
+ void InitSchema() {
+ if (metadata_->schema.empty()) {
+ throw ParquetException("Empty file schema (no root)");
+ }
+ schema_.Init(schema::Unflatten(&metadata_->schema[0],
+ static_cast<int>(metadata_->schema.size())));
+ }
+
+ void InitColumnOrders() {
+ // update ColumnOrder
+ std::vector<parquet::ColumnOrder> column_orders;
+ if (metadata_->__isset.column_orders) {
+ for (auto column_order : metadata_->column_orders) {
+ if (column_order.__isset.TYPE_ORDER) {
+ column_orders.push_back(ColumnOrder::type_defined_);
+ } else {
+ column_orders.push_back(ColumnOrder::undefined_);
+ }
+ }
+ } else {
+ column_orders.resize(schema_.num_columns(), ColumnOrder::undefined_);
+ }
+
+ schema_.updateColumnOrders(column_orders);
+ }
+
+ void InitKeyValueMetadata() {
+ std::shared_ptr<KeyValueMetadata> metadata = nullptr;
+ if (metadata_->__isset.key_value_metadata) {
+ metadata = std::make_shared<KeyValueMetadata>();
+ for (const auto& it : metadata_->key_value_metadata) {
+ metadata->Append(it.key, it.value);
+ }
+ }
+ key_value_metadata_ = std::move(metadata);
+ }
+};
+
+std::shared_ptr<FileMetaData> FileMetaData::Make(
+ const void* metadata, uint32_t* metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ // This FileMetaData ctor is private, not compatible with std::make_shared
+ return std::shared_ptr<FileMetaData>(
+ new FileMetaData(metadata, metadata_len, file_decryptor));
+}
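+
+// Usage sketch (illustrative; `buf` and `footer_len` are hypothetical): given
+// a Thrift-serialized footer, Make deserializes it and updates the length
+// argument to the number of bytes actually consumed:
+//
+//   uint32_t len = footer_len;
+//   std::shared_ptr<FileMetaData> md = FileMetaData::Make(buf, &len);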
+
+FileMetaData::FileMetaData(const void* metadata, uint32_t* metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor)
+ : impl_{std::unique_ptr<FileMetaDataImpl>(
+ new FileMetaDataImpl(metadata, metadata_len, file_decryptor))} {}
+
+FileMetaData::FileMetaData()
+ : impl_{std::unique_ptr<FileMetaDataImpl>(new FileMetaDataImpl())} {}
+
+FileMetaData::~FileMetaData() = default;
+
+bool FileMetaData::Equals(const FileMetaData& other) const {
+ return impl_->Equals(*other.impl_);
+}
+
+std::unique_ptr<RowGroupMetaData> FileMetaData::RowGroup(int i) const {
+ return impl_->RowGroup(i);
+}
+
+bool FileMetaData::VerifySignature(const void* signature) {
+ return impl_->VerifySignature(signature);
+}
+
+uint32_t FileMetaData::size() const { return impl_->size(); }
+
+int FileMetaData::num_columns() const { return impl_->num_columns(); }
+
+int64_t FileMetaData::num_rows() const { return impl_->num_rows(); }
+
+int FileMetaData::num_row_groups() const { return impl_->num_row_groups(); }
+
+bool FileMetaData::can_decompress() const {
+ int n_row_groups = num_row_groups();
+ for (int i = 0; i < n_row_groups; i++) {
+ if (!RowGroup(i)->can_decompress()) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool FileMetaData::is_encryption_algorithm_set() const {
+ return impl_->is_encryption_algorithm_set();
+}
+
+EncryptionAlgorithm FileMetaData::encryption_algorithm() const {
+ return impl_->encryption_algorithm();
+}
+
+const std::string& FileMetaData::footer_signing_key_metadata() const {
+ return impl_->footer_signing_key_metadata();
+}
+
+void FileMetaData::set_file_decryptor(
+ std::shared_ptr<InternalFileDecryptor> file_decryptor) {
+ impl_->set_file_decryptor(file_decryptor);
+}
+
+ParquetVersion::type FileMetaData::version() const {
+ switch (impl_->version()) {
+ case 1:
+ return ParquetVersion::PARQUET_1_0;
+ case 2:
+ return ParquetVersion::PARQUET_2_0;
+ default:
+ // Improperly set version, assuming Parquet 1.0
+ break;
+ }
+ return ParquetVersion::PARQUET_1_0;
+}
+
+const ApplicationVersion& FileMetaData::writer_version() const {
+ return impl_->writer_version();
+}
+
+const std::string& FileMetaData::created_by() const { return impl_->created_by(); }
+
+int FileMetaData::num_schema_elements() const { return impl_->num_schema_elements(); }
+
+const SchemaDescriptor* FileMetaData::schema() const { return impl_->schema(); }
+
+const std::shared_ptr<const KeyValueMetadata>& FileMetaData::key_value_metadata() const {
+ return impl_->key_value_metadata();
+}
+
+void FileMetaData::set_file_path(const std::string& path) { impl_->set_file_path(path); }
+
+void FileMetaData::AppendRowGroups(const FileMetaData& other) {
+ impl_->AppendRowGroups(other.impl_);
+}
+
+std::shared_ptr<FileMetaData> FileMetaData::Subset(
+ const std::vector<int>& row_groups) const {
+ return impl_->Subset(row_groups);
+}
+
+void FileMetaData::WriteTo(::arrow::io::OutputStream* dst,
+ const std::shared_ptr<Encryptor>& encryptor) const {
+ return impl_->WriteTo(dst, encryptor);
+}
+
+class FileCryptoMetaData::FileCryptoMetaDataImpl {
+ public:
+ FileCryptoMetaDataImpl() = default;
+
+ explicit FileCryptoMetaDataImpl(const uint8_t* metadata, uint32_t* metadata_len) {
+ metadata_.reset(new format::FileCryptoMetaData);
+ DeserializeThriftMsg(metadata, metadata_len, metadata_.get());
+ metadata_len_ = *metadata_len;
+ }
+
+ EncryptionAlgorithm encryption_algorithm() {
+ return FromThrift(metadata_->encryption_algorithm);
+ }
+ const std::string& key_metadata() { return metadata_->key_metadata; }
+ void WriteTo(::arrow::io::OutputStream* dst) const {
+ ThriftSerializer serializer;
+ serializer.Serialize(metadata_.get(), dst);
+ }
+
+ private:
+ friend FileMetaDataBuilder;
+ std::unique_ptr<format::FileCryptoMetaData> metadata_;
+ uint32_t metadata_len_;
+};
+
+EncryptionAlgorithm FileCryptoMetaData::encryption_algorithm() const {
+ return impl_->encryption_algorithm();
+}
+
+const std::string& FileCryptoMetaData::key_metadata() const {
+ return impl_->key_metadata();
+}
+
+std::shared_ptr<FileCryptoMetaData> FileCryptoMetaData::Make(
+ const uint8_t* serialized_metadata, uint32_t* metadata_len) {
+ return std::shared_ptr<FileCryptoMetaData>(
+ new FileCryptoMetaData(serialized_metadata, metadata_len));
+}
+
+FileCryptoMetaData::FileCryptoMetaData(const uint8_t* serialized_metadata,
+ uint32_t* metadata_len)
+ : impl_(new FileCryptoMetaDataImpl(serialized_metadata, metadata_len)) {}
+
+FileCryptoMetaData::FileCryptoMetaData() : impl_(new FileCryptoMetaDataImpl()) {}
+
+FileCryptoMetaData::~FileCryptoMetaData() = default;
+
+void FileCryptoMetaData::WriteTo(::arrow::io::OutputStream* dst) const {
+ impl_->WriteTo(dst);
+}
+
+std::string FileMetaData::SerializeToString() const {
+  // We need to pass in an initial size. Since the stream automatically grows
+  // its buffer to hold the metadata, we just leave it at 0.
+ PARQUET_ASSIGN_OR_THROW(auto serializer, ::arrow::io::BufferOutputStream::Create(0));
+ WriteTo(serializer.get());
+ PARQUET_ASSIGN_OR_THROW(auto metadata_buffer, serializer->Finish());
+ return metadata_buffer->ToString();
+}
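+
+// Round-trip sketch (illustrative assumption, using the accessors above):
+//
+//   std::string s = md->SerializeToString();
+//   uint32_t len = static_cast<uint32_t>(s.size());
+//   std::shared_ptr<FileMetaData> md2 = FileMetaData::Make(s.data(), &len);
+//   // md2->Equals(*md) is expected to hold for unencrypted metadata.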
+
+ApplicationVersion::ApplicationVersion(std::string application, int major, int minor,
+ int patch)
+ : application_(std::move(application)), version{major, minor, patch, "", "", ""} {}
+
+namespace {
+// Parse the application version format and set parsed values to
+// ApplicationVersion.
+//
+// The application version format must be compatible with
+// parquet-mr's. See also:
+// * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/VersionParser.java
+// * https://github.com/apache/parquet-mr/blob/master/parquet-common/src/main/java/org/apache/parquet/SemanticVersion.java
+//
+// The application version format:
+// "${APPLICATION_NAME}"
+// "${APPLICATION_NAME} version ${VERSION}"
+// "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
+//
+// Eg:
+// parquet-cpp
+// parquet-cpp version 1.5.0ab-xyz5.5.0+cd
+// parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
+//
+// The VERSION format:
+// "${MAJOR}"
+// "${MAJOR}.${MINOR}"
+// "${MAJOR}.${MINOR}.${PATCH}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
+// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
+// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
+// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
+// "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
+//
+// Eg:
+// 1
+// 1.5
+// 1.5.0
+// 1.5.0ab
+// 1.5.0ab-cdh5.5.0
+// 1.5.0ab-cdh5.5.0+cd
+// 1.5.0ab+cd
+// 1.5.0-cdh5.5.0
+// 1.5.0-cdh5.5.0+cd
+// 1.5.0+cd
+class ApplicationVersionParser {
+ public:
+ ApplicationVersionParser(const std::string& created_by,
+ ApplicationVersion& application_version)
+ : created_by_(created_by),
+ application_version_(application_version),
+ spaces_(" \t\v\r\n\f"),
+ digits_("0123456789") {}
+
+ void Parse() {
+ application_version_.application_ = "unknown";
+ application_version_.version = {0, 0, 0, "", "", ""};
+
+ if (!ParseApplicationName()) {
+ return;
+ }
+ if (!ParseVersion()) {
+ return;
+ }
+ if (!ParseBuildName()) {
+ return;
+ }
+ }
+
+ private:
+ bool IsSpace(const std::string& string, const size_t& offset) {
+ auto target = ::arrow::util::string_view(string).substr(offset, 1);
+ return target.find_first_of(spaces_) != ::arrow::util::string_view::npos;
+ }
+
+ void RemovePrecedingSpaces(const std::string& string, size_t& start,
+ const size_t& end) {
+ while (start < end && IsSpace(string, start)) {
+ ++start;
+ }
+ }
+
+ void RemoveTrailingSpaces(const std::string& string, const size_t& start, size_t& end) {
+ while (start < (end - 1) && (end - 1) < string.size() && IsSpace(string, end - 1)) {
+ --end;
+ }
+ }
+
+ bool ParseApplicationName() {
+ std::string version_mark(" version ");
+ auto version_mark_position = created_by_.find(version_mark);
+ size_t application_name_end;
+ // No VERSION and BUILD_NAME.
+ if (version_mark_position == std::string::npos) {
+ version_start_ = std::string::npos;
+ application_name_end = created_by_.size();
+ } else {
+ version_start_ = version_mark_position + version_mark.size();
+ application_name_end = version_mark_position;
+ }
+
+ size_t application_name_start = 0;
+ RemovePrecedingSpaces(created_by_, application_name_start, application_name_end);
+ RemoveTrailingSpaces(created_by_, application_name_start, application_name_end);
+ application_version_.application_ = created_by_.substr(
+ application_name_start, application_name_end - application_name_start);
+
+ return true;
+ }
+
+ bool ParseVersion() {
+ // No VERSION.
+ if (version_start_ == std::string::npos) {
+ return false;
+ }
+
+ RemovePrecedingSpaces(created_by_, version_start_, created_by_.size());
+ version_end_ = created_by_.find(" (", version_start_);
+ // No BUILD_NAME.
+ if (version_end_ == std::string::npos) {
+ version_end_ = created_by_.size();
+ }
+ RemoveTrailingSpaces(created_by_, version_start_, version_end_);
+ // No VERSION.
+ if (version_start_ == version_end_) {
+ return false;
+ }
+ version_string_ = created_by_.substr(version_start_, version_end_ - version_start_);
+
+ if (!ParseVersionMajor()) {
+ return false;
+ }
+ if (!ParseVersionMinor()) {
+ return false;
+ }
+ if (!ParseVersionPatch()) {
+ return false;
+ }
+ if (!ParseVersionUnknown()) {
+ return false;
+ }
+ if (!ParseVersionPreRelease()) {
+ return false;
+ }
+ if (!ParseVersionBuildInfo()) {
+ return false;
+ }
+
+ return true;
+ }
+
+ bool ParseVersionMajor() {
+ size_t version_major_start = 0;
+ auto version_major_end = version_string_.find_first_not_of(digits_);
+ // MAJOR only.
+ if (version_major_end == std::string::npos) {
+ version_major_end = version_string_.size();
+ version_parsing_position_ = version_major_end;
+ } else {
+ // No ".".
+ if (version_string_[version_major_end] != '.') {
+ return false;
+ }
+ // No MAJOR.
+ if (version_major_end == version_major_start) {
+ return false;
+ }
+ version_parsing_position_ = version_major_end + 1; // +1 is for '.'.
+ }
+ auto version_major_string = version_string_.substr(
+ version_major_start, version_major_end - version_major_start);
+ application_version_.version.major = atoi(version_major_string.c_str());
+ return true;
+ }
+
+ bool ParseVersionMinor() {
+ auto version_minor_start = version_parsing_position_;
+ auto version_minor_end =
+ version_string_.find_first_not_of(digits_, version_minor_start);
+ // MAJOR.MINOR only.
+ if (version_minor_end == std::string::npos) {
+ version_minor_end = version_string_.size();
+ version_parsing_position_ = version_minor_end;
+ } else {
+ // No ".".
+ if (version_string_[version_minor_end] != '.') {
+ return false;
+ }
+ // No MINOR.
+ if (version_minor_end == version_minor_start) {
+ return false;
+ }
+ version_parsing_position_ = version_minor_end + 1; // +1 is for '.'.
+ }
+ auto version_minor_string = version_string_.substr(
+ version_minor_start, version_minor_end - version_minor_start);
+ application_version_.version.minor = atoi(version_minor_string.c_str());
+ return true;
+ }
+
+ bool ParseVersionPatch() {
+ auto version_patch_start = version_parsing_position_;
+ auto version_patch_end =
+ version_string_.find_first_not_of(digits_, version_patch_start);
+ // No UNKNOWN, PRE_RELEASE and BUILD_INFO.
+ if (version_patch_end == std::string::npos) {
+ version_patch_end = version_string_.size();
+ }
+ // No PATCH.
+ if (version_patch_end == version_patch_start) {
+ return false;
+ }
+ auto version_patch_string = version_string_.substr(
+ version_patch_start, version_patch_end - version_patch_start);
+ application_version_.version.patch = atoi(version_patch_string.c_str());
+ version_parsing_position_ = version_patch_end;
+ return true;
+ }
+
+ bool ParseVersionUnknown() {
+ // No UNKNOWN.
+ if (version_parsing_position_ == version_string_.size()) {
+ return true;
+ }
+ auto version_unknown_start = version_parsing_position_;
+ auto version_unknown_end = version_string_.find_first_of("-+", version_unknown_start);
+ // No PRE_RELEASE and BUILD_INFO
+ if (version_unknown_end == std::string::npos) {
+ version_unknown_end = version_string_.size();
+ }
+ application_version_.version.unknown = version_string_.substr(
+ version_unknown_start, version_unknown_end - version_unknown_start);
+ version_parsing_position_ = version_unknown_end;
+ return true;
+ }
+
+ bool ParseVersionPreRelease() {
+ // No PRE_RELEASE.
+ if (version_parsing_position_ == version_string_.size() ||
+ version_string_[version_parsing_position_] != '-') {
+ return true;
+ }
+
+ auto version_pre_release_start = version_parsing_position_ + 1; // +1 is for '-'.
+ auto version_pre_release_end =
+ version_string_.find_first_of("+", version_pre_release_start);
+ // No BUILD_INFO
+ if (version_pre_release_end == std::string::npos) {
+ version_pre_release_end = version_string_.size();
+ }
+ application_version_.version.pre_release = version_string_.substr(
+ version_pre_release_start, version_pre_release_end - version_pre_release_start);
+ version_parsing_position_ = version_pre_release_end;
+ return true;
+ }
+
+ bool ParseVersionBuildInfo() {
+ // No BUILD_INFO.
+ if (version_parsing_position_ == version_string_.size() ||
+ version_string_[version_parsing_position_] != '+') {
+ return true;
+ }
+
+ auto version_build_info_start = version_parsing_position_ + 1; // +1 is for '+'.
+ application_version_.version.build_info =
+ version_string_.substr(version_build_info_start);
+ return true;
+ }
+
+ bool ParseBuildName() {
+ std::string build_mark(" (build ");
+ auto build_mark_position = created_by_.find(build_mark, version_end_);
+ // No BUILD_NAME.
+ if (build_mark_position == std::string::npos) {
+ return false;
+ }
+ auto build_name_start = build_mark_position + build_mark.size();
+ RemovePrecedingSpaces(created_by_, build_name_start, created_by_.size());
+ auto build_name_end = created_by_.find_first_of(")", build_name_start);
+ // No end ")".
+ if (build_name_end == std::string::npos) {
+ return false;
+ }
+ RemoveTrailingSpaces(created_by_, build_name_start, build_name_end);
+ application_version_.build_ =
+ created_by_.substr(build_name_start, build_name_end - build_name_start);
+
+ return true;
+ }
+
+ const std::string& created_by_;
+ ApplicationVersion& application_version_;
+
+ // For parsing.
+ std::string spaces_;
+ std::string digits_;
+ size_t version_parsing_position_;
+ size_t version_start_;
+ size_t version_end_;
+ std::string version_string_;
+};
+} // namespace
+
+ApplicationVersion::ApplicationVersion(const std::string& created_by) {
+ ApplicationVersionParser parser(created_by, *this);
+ parser.Parse();
+}
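+
+// Parsing sketch (illustrative; the values shown are what the parser above is
+// expected to produce for this input, per the format comment):
+//
+//   ApplicationVersion v("parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)");
+//   // v.application_ == "parquet-cpp", v.version major/minor/patch == 1/5/0,
+//   // v.version.unknown == "ab", v.version.pre_release == "xyz5.5.0",
+//   // v.version.build_info == "cd", v.build_ == "abcd"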
+
+bool ApplicationVersion::VersionLt(const ApplicationVersion& other_version) const {
+ if (application_ != other_version.application_) return false;
+
+ if (version.major < other_version.version.major) return true;
+ if (version.major > other_version.version.major) return false;
+ DCHECK_EQ(version.major, other_version.version.major);
+ if (version.minor < other_version.version.minor) return true;
+ if (version.minor > other_version.version.minor) return false;
+ DCHECK_EQ(version.minor, other_version.version.minor);
+ return version.patch < other_version.version.patch;
+}
+
+bool ApplicationVersion::VersionEq(const ApplicationVersion& other_version) const {
+ return application_ == other_version.application_ &&
+ version.major == other_version.version.major &&
+ version.minor == other_version.version.minor &&
+ version.patch == other_version.version.patch;
+}
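+
+// Comparison sketch (illustrative): VersionLt only orders versions of the
+// same application string:
+//
+//   ApplicationVersion("parquet-mr", 1, 9, 0)
+//       .VersionLt(ApplicationVersion("parquet-mr", 1, 10, 0));   // true
+//   ApplicationVersion("parquet-cpp", 1, 9, 0)
+//       .VersionLt(ApplicationVersion("parquet-mr", 1, 10, 0));   // false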
+
+// Reference:
+// parquet-mr/parquet-column/src/main/java/org/apache/parquet/CorruptStatistics.java
+// PARQUET-686 has more discussion on statistics
+bool ApplicationVersion::HasCorrectStatistics(Type::type col_type,
+ EncodedStatistics& statistics,
+ SortOrder::type sort_order) const {
+  // From parquet-cpp 1.3.0 and parquet-mr 1.10.0 onwards, statistics are
+  // computed correctly for all types.
+ if ((application_ == "parquet-cpp" && VersionLt(PARQUET_CPP_FIXED_STATS_VERSION())) ||
+ (application_ == "parquet-mr" && VersionLt(PARQUET_MR_FIXED_STATS_VERSION()))) {
+    // Only SIGNED sort orders are valid, unless max and min are the same
+    // (in which case the sort order does not matter).
+ bool max_equals_min = statistics.has_min && statistics.has_max
+ ? statistics.min() == statistics.max()
+ : false;
+ if (SortOrder::SIGNED != sort_order && !max_equals_min) {
+ return false;
+ }
+
+ // Statistics of other types are OK
+ if (col_type != Type::FIXED_LEN_BYTE_ARRAY && col_type != Type::BYTE_ARRAY) {
+ return true;
+ }
+ }
+  // created_by may be unpopulated; this could have been caused by parquet-mr
+  // during the same period as PARQUET-251 (see PARQUET-297).
+ if (application_ == "unknown") {
+ return true;
+ }
+
+ // Unknown sort order has incorrect stats
+ if (SortOrder::UNKNOWN == sort_order) {
+ return false;
+ }
+
+ // PARQUET-251
+ if (VersionLt(PARQUET_251_FIXED_VERSION())) {
+ return false;
+ }
+
+ return true;
+}
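+
+// Illustrative outcome of the checks above (a sketch; the exact cutoffs live
+// in the PARQUET_*_FIXED_VERSION() constants): for a BYTE_ARRAY column
+// written by a parquet-mr release older than PARQUET_251_FIXED_VERSION(),
+// with SortOrder::SIGNED and distinct min/max, HasCorrectStatistics()
+// returns false, so readers should ignore the recorded min/max statistics.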
+
+// MetaData Builders
+// row-group metadata
+class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl {
+ public:
+ explicit ColumnChunkMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
+ const ColumnDescriptor* column)
+ : owned_column_chunk_(new format::ColumnChunk),
+ properties_(std::move(props)),
+ column_(column) {
+ Init(owned_column_chunk_.get());
+ }
+
+ explicit ColumnChunkMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
+ const ColumnDescriptor* column,
+ format::ColumnChunk* column_chunk)
+ : properties_(std::move(props)), column_(column) {
+ Init(column_chunk);
+ }
+
+ const void* contents() const { return column_chunk_; }
+
+ // column chunk
+ void set_file_path(const std::string& val) { column_chunk_->__set_file_path(val); }
+
+ // column metadata
+ void SetStatistics(const EncodedStatistics& val) {
+ column_chunk_->meta_data.__set_statistics(ToThrift(val));
+ }
+
+ void Finish(int64_t num_values, int64_t dictionary_page_offset,
+ int64_t index_page_offset, int64_t data_page_offset,
+ int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
+ bool dictionary_fallback,
+ const std::map<Encoding::type, int32_t>& dict_encoding_stats,
+ const std::map<Encoding::type, int32_t>& data_encoding_stats,
+ const std::shared_ptr<Encryptor>& encryptor) {
+ if (dictionary_page_offset > 0) {
+ column_chunk_->meta_data.__set_dictionary_page_offset(dictionary_page_offset);
+ column_chunk_->__set_file_offset(dictionary_page_offset + compressed_size);
+ } else {
+ column_chunk_->__set_file_offset(data_page_offset + compressed_size);
+ }
+ column_chunk_->__isset.meta_data = true;
+ column_chunk_->meta_data.__set_num_values(num_values);
+ if (index_page_offset >= 0) {
+ column_chunk_->meta_data.__set_index_page_offset(index_page_offset);
+ }
+ column_chunk_->meta_data.__set_data_page_offset(data_page_offset);
+ column_chunk_->meta_data.__set_total_uncompressed_size(uncompressed_size);
+ column_chunk_->meta_data.__set_total_compressed_size(compressed_size);
+
+ std::vector<format::Encoding::type> thrift_encodings;
+ if (has_dictionary) {
+ thrift_encodings.push_back(ToThrift(properties_->dictionary_index_encoding()));
+ if (properties_->version() == ParquetVersion::PARQUET_1_0) {
+ thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
+ } else {
+ thrift_encodings.push_back(ToThrift(properties_->dictionary_page_encoding()));
+ }
+ } else { // Dictionary not enabled
+ thrift_encodings.push_back(ToThrift(properties_->encoding(column_->path())));
+ }
+ thrift_encodings.push_back(ToThrift(Encoding::RLE));
+    // Only the PLAIN encoding is supported for fallback in V1.
+    // TODO(majetideepak): Use the user-specified encoding for V2.
+ if (dictionary_fallback) {
+ thrift_encodings.push_back(ToThrift(Encoding::PLAIN));
+ }
+ column_chunk_->meta_data.__set_encodings(thrift_encodings);
+ std::vector<format::PageEncodingStats> thrift_encoding_stats;
+ // Add dictionary page encoding stats
+ for (const auto& entry : dict_encoding_stats) {
+ format::PageEncodingStats dict_enc_stat;
+ dict_enc_stat.__set_page_type(format::PageType::DICTIONARY_PAGE);
+ dict_enc_stat.__set_encoding(ToThrift(entry.first));
+ dict_enc_stat.__set_count(entry.second);
+ thrift_encoding_stats.push_back(dict_enc_stat);
+ }
+ // Add data page encoding stats
+ for (const auto& entry : data_encoding_stats) {
+ format::PageEncodingStats data_enc_stat;
+ data_enc_stat.__set_page_type(format::PageType::DATA_PAGE);
+ data_enc_stat.__set_encoding(ToThrift(entry.first));
+ data_enc_stat.__set_count(entry.second);
+ thrift_encoding_stats.push_back(data_enc_stat);
+ }
+ column_chunk_->meta_data.__set_encoding_stats(thrift_encoding_stats);
+
+ const auto& encrypt_md =
+ properties_->column_encryption_properties(column_->path()->ToDotString());
+ // column is encrypted
+ if (encrypt_md != nullptr && encrypt_md->is_encrypted()) {
+ column_chunk_->__isset.crypto_metadata = true;
+ format::ColumnCryptoMetaData ccmd;
+ if (encrypt_md->is_encrypted_with_footer_key()) {
+ // encrypted with footer key
+ ccmd.__isset.ENCRYPTION_WITH_FOOTER_KEY = true;
+ ccmd.__set_ENCRYPTION_WITH_FOOTER_KEY(format::EncryptionWithFooterKey());
+ } else { // encrypted with column key
+ format::EncryptionWithColumnKey eck;
+ eck.__set_key_metadata(encrypt_md->key_metadata());
+ eck.__set_path_in_schema(column_->path()->ToDotVector());
+ ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true;
+ ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck);
+ }
+ column_chunk_->__set_crypto_metadata(ccmd);
+
+ bool encrypted_footer =
+ properties_->file_encryption_properties()->encrypted_footer();
+ bool encrypt_metadata =
+ !encrypted_footer || !encrypt_md->is_encrypted_with_footer_key();
+ if (encrypt_metadata) {
+ ThriftSerializer serializer;
+        // Serialize and encrypt the ColumnMetaData separately:
+        // Thrift-serialize the structure, encrypt it with the column key,
+        // and write the result to encrypted_column_metadata.
+ uint8_t* serialized_data;
+ uint32_t serialized_len;
+
+ serializer.SerializeToBuffer(&column_chunk_->meta_data, &serialized_len,
+ &serialized_data);
+
+ std::vector<uint8_t> encrypted_data(encryptor->CiphertextSizeDelta() +
+ serialized_len);
+ unsigned encrypted_len =
+ encryptor->Encrypt(serialized_data, serialized_len, encrypted_data.data());
+
+ const char* temp =
+ const_cast<const char*>(reinterpret_cast<char*>(encrypted_data.data()));
+ std::string encrypted_column_metadata(temp, encrypted_len);
+ column_chunk_->__set_encrypted_column_metadata(encrypted_column_metadata);
+
+ if (encrypted_footer) {
+ column_chunk_->__isset.meta_data = false;
+ } else {
+ // Keep redacted metadata version for old readers
+ column_chunk_->__isset.meta_data = true;
+ column_chunk_->meta_data.__isset.statistics = false;
+ column_chunk_->meta_data.__isset.encoding_stats = false;
+ }
+ }
+ }
+ }
+
+ void WriteTo(::arrow::io::OutputStream* sink) {
+ ThriftSerializer serializer;
+ serializer.Serialize(column_chunk_, sink);
+ }
+
+ const ColumnDescriptor* descr() const { return column_; }
+ int64_t total_compressed_size() const {
+ return column_chunk_->meta_data.total_compressed_size;
+ }
+
+ private:
+ void Init(format::ColumnChunk* column_chunk) {
+ column_chunk_ = column_chunk;
+
+ column_chunk_->meta_data.__set_type(ToThrift(column_->physical_type()));
+ column_chunk_->meta_data.__set_path_in_schema(column_->path()->ToDotVector());
+ column_chunk_->meta_data.__set_codec(
+ ToThrift(properties_->compression(column_->path())));
+ }
+
+ format::ColumnChunk* column_chunk_;
+ std::unique_ptr<format::ColumnChunk> owned_column_chunk_;
+ const std::shared_ptr<WriterProperties> properties_;
+ const ColumnDescriptor* column_;
+};
+
+std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
+ void* contents) {
+ return std::unique_ptr<ColumnChunkMetaDataBuilder>(
+ new ColumnChunkMetaDataBuilder(std::move(props), column, contents));
+}
+
+std::unique_ptr<ColumnChunkMetaDataBuilder> ColumnChunkMetaDataBuilder::Make(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column) {
+ return std::unique_ptr<ColumnChunkMetaDataBuilder>(
+ new ColumnChunkMetaDataBuilder(std::move(props), column));
+}
+
+ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column)
+ : impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
+ new ColumnChunkMetaDataBuilderImpl(std::move(props), column))} {}
+
+ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
+ void* contents)
+ : impl_{std::unique_ptr<ColumnChunkMetaDataBuilderImpl>(
+ new ColumnChunkMetaDataBuilderImpl(
+ std::move(props), column,
+ reinterpret_cast<format::ColumnChunk*>(contents)))} {}
+
+ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() = default;
+
+const void* ColumnChunkMetaDataBuilder::contents() const { return impl_->contents(); }
+
+void ColumnChunkMetaDataBuilder::set_file_path(const std::string& path) {
+ impl_->set_file_path(path);
+}
+
+void ColumnChunkMetaDataBuilder::Finish(
+ int64_t num_values, int64_t dictionary_page_offset, int64_t index_page_offset,
+ int64_t data_page_offset, int64_t compressed_size, int64_t uncompressed_size,
+ bool has_dictionary, bool dictionary_fallback,
+ const std::map<Encoding::type, int32_t>& dict_encoding_stats,
+ const std::map<Encoding::type, int32_t>& data_encoding_stats,
+ const std::shared_ptr<Encryptor>& encryptor) {
+ impl_->Finish(num_values, dictionary_page_offset, index_page_offset, data_page_offset,
+ compressed_size, uncompressed_size, has_dictionary, dictionary_fallback,
+ dict_encoding_stats, data_encoding_stats, encryptor);
+}
+
+void ColumnChunkMetaDataBuilder::WriteTo(::arrow::io::OutputStream* sink) {
+ impl_->WriteTo(sink);
+}
+
+const ColumnDescriptor* ColumnChunkMetaDataBuilder::descr() const {
+ return impl_->descr();
+}
+
+void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) {
+ impl_->SetStatistics(result);
+}
+
+int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const {
+ return impl_->total_compressed_size();
+}
+
+class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
+ public:
+ explicit RowGroupMetaDataBuilderImpl(std::shared_ptr<WriterProperties> props,
+ const SchemaDescriptor* schema, void* contents)
+ : properties_(std::move(props)), schema_(schema), next_column_(0) {
+ row_group_ = reinterpret_cast<format::RowGroup*>(contents);
+ InitializeColumns(schema->num_columns());
+ }
+
+ ColumnChunkMetaDataBuilder* NextColumnChunk() {
+ if (!(next_column_ < num_columns())) {
+ std::stringstream ss;
+ ss << "The schema only has " << num_columns()
+ << " columns, requested metadata for column: " << next_column_;
+ throw ParquetException(ss.str());
+ }
+ auto column = schema_->Column(next_column_);
+ auto column_builder = ColumnChunkMetaDataBuilder::Make(
+ properties_, column, &row_group_->columns[next_column_++]);
+ auto column_builder_ptr = column_builder.get();
+ column_builders_.push_back(std::move(column_builder));
+ return column_builder_ptr;
+ }
+
+ int current_column() { return next_column_ - 1; }
+
+ void Finish(int64_t total_bytes_written, int16_t row_group_ordinal) {
+ if (!(next_column_ == schema_->num_columns())) {
+ std::stringstream ss;
+ ss << "Only " << next_column_ - 1 << " out of " << schema_->num_columns()
+ << " columns are initialized";
+ throw ParquetException(ss.str());
+ }
+
+ int64_t file_offset = 0;
+ int64_t total_compressed_size = 0;
+ for (int i = 0; i < schema_->num_columns(); i++) {
+ if (!(row_group_->columns[i].file_offset >= 0)) {
+ std::stringstream ss;
+ ss << "Column " << i << " is not complete.";
+ throw ParquetException(ss.str());
+ }
+ if (i == 0) {
+ file_offset = row_group_->columns[0].file_offset;
+ }
+ // sometimes column metadata is encrypted and not available to read,
+ // so we must get total_compressed_size from column builder
+ total_compressed_size += column_builders_[i]->total_compressed_size();
+ }
+
+ row_group_->__set_file_offset(file_offset);
+ row_group_->__set_total_compressed_size(total_compressed_size);
+ row_group_->__set_total_byte_size(total_bytes_written);
+ row_group_->__set_ordinal(row_group_ordinal);
+ }
+
+ void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; }
+
+ int num_columns() { return static_cast<int>(row_group_->columns.size()); }
+
+ int64_t num_rows() { return row_group_->num_rows; }
+
+ private:
+ void InitializeColumns(int ncols) { row_group_->columns.resize(ncols); }
+
+ format::RowGroup* row_group_;
+ const std::shared_ptr<WriterProperties> properties_;
+ const SchemaDescriptor* schema_;
+ std::vector<std::unique_ptr<ColumnChunkMetaDataBuilder>> column_builders_;
+ int next_column_;
+};
+
+std::unique_ptr<RowGroupMetaDataBuilder> RowGroupMetaDataBuilder::Make(
+ std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
+ void* contents) {
+ return std::unique_ptr<RowGroupMetaDataBuilder>(
+ new RowGroupMetaDataBuilder(std::move(props), schema_, contents));
+}
+
+RowGroupMetaDataBuilder::RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+ const SchemaDescriptor* schema_,
+ void* contents)
+ : impl_{new RowGroupMetaDataBuilderImpl(std::move(props), schema_, contents)} {}
+
+RowGroupMetaDataBuilder::~RowGroupMetaDataBuilder() = default;
+
+ColumnChunkMetaDataBuilder* RowGroupMetaDataBuilder::NextColumnChunk() {
+ return impl_->NextColumnChunk();
+}
+
+int RowGroupMetaDataBuilder::current_column() const { return impl_->current_column(); }
+
+int RowGroupMetaDataBuilder::num_columns() { return impl_->num_columns(); }
+
+int64_t RowGroupMetaDataBuilder::num_rows() { return impl_->num_rows(); }
+
+void RowGroupMetaDataBuilder::set_num_rows(int64_t num_rows) {
+ impl_->set_num_rows(num_rows);
+}
+
+void RowGroupMetaDataBuilder::Finish(int64_t total_bytes_written,
+ int16_t row_group_ordinal) {
+ impl_->Finish(total_bytes_written, row_group_ordinal);
+}
+
+// file metadata
+// TODO(PARQUET-595) Support key_value_metadata
+class FileMetaDataBuilder::FileMetaDataBuilderImpl {
+ public:
+ explicit FileMetaDataBuilderImpl(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata)
+ : metadata_(new format::FileMetaData()),
+ properties_(std::move(props)),
+ schema_(schema),
+ key_value_metadata_(std::move(key_value_metadata)) {
+ if (properties_->file_encryption_properties() != nullptr &&
+ properties_->file_encryption_properties()->encrypted_footer()) {
+ crypto_metadata_.reset(new format::FileCryptoMetaData());
+ }
+ }
+
+ RowGroupMetaDataBuilder* AppendRowGroup() {
+ row_groups_.emplace_back();
+ current_row_group_builder_ =
+ RowGroupMetaDataBuilder::Make(properties_, schema_, &row_groups_.back());
+ return current_row_group_builder_.get();
+ }
+
+ std::unique_ptr<FileMetaData> Finish() {
+ int64_t total_rows = 0;
+ for (auto row_group : row_groups_) {
+ total_rows += row_group.num_rows;
+ }
+ metadata_->__set_num_rows(total_rows);
+ metadata_->__set_row_groups(row_groups_);
+
+ if (key_value_metadata_) {
+ metadata_->key_value_metadata.clear();
+ metadata_->key_value_metadata.reserve(key_value_metadata_->size());
+ for (int64_t i = 0; i < key_value_metadata_->size(); ++i) {
+ format::KeyValue kv_pair;
+ kv_pair.__set_key(key_value_metadata_->key(i));
+ kv_pair.__set_value(key_value_metadata_->value(i));
+ metadata_->key_value_metadata.push_back(kv_pair);
+ }
+ metadata_->__isset.key_value_metadata = true;
+ }
+
+ int32_t file_version = 0;
+ switch (properties_->version()) {
+ case ParquetVersion::PARQUET_1_0:
+ file_version = 1;
+ break;
+ case ParquetVersion::PARQUET_2_0:
+ file_version = 2;
+ break;
+ default:
+ break;
+ }
+ metadata_->__set_version(file_version);
+ metadata_->__set_created_by(properties_->created_by());
+
+    // Users cannot set the `ColumnOrder` since we do not have user-defined
+    // sort orders in the spec yet.
+    // We always default to `TYPE_DEFINED_ORDER`. We can expose it in the API
+    // once we have user-defined sort orders in the Parquet format.
+    // TypeDefinedOrder implies choosing the SortOrder based on
+    // ConvertedType/PhysicalType.
+ format::TypeDefinedOrder type_defined_order;
+ format::ColumnOrder column_order;
+ column_order.__set_TYPE_ORDER(type_defined_order);
+ column_order.__isset.TYPE_ORDER = true;
+ metadata_->column_orders.resize(schema_->num_columns(), column_order);
+ metadata_->__isset.column_orders = true;
+
+    // If the footer is in plaintext, set the footer signing algorithm.
+ auto file_encryption_properties = properties_->file_encryption_properties();
+ if (file_encryption_properties && !file_encryption_properties->encrypted_footer()) {
+ EncryptionAlgorithm signing_algorithm;
+ EncryptionAlgorithm algo = file_encryption_properties->algorithm();
+ signing_algorithm.aad.aad_file_unique = algo.aad.aad_file_unique;
+ signing_algorithm.aad.supply_aad_prefix = algo.aad.supply_aad_prefix;
+ if (!algo.aad.supply_aad_prefix) {
+ signing_algorithm.aad.aad_prefix = algo.aad.aad_prefix;
+ }
+ signing_algorithm.algorithm = ParquetCipher::AES_GCM_V1;
+
+ metadata_->__set_encryption_algorithm(ToThrift(signing_algorithm));
+ const std::string& footer_signing_key_metadata =
+ file_encryption_properties->footer_key_metadata();
+ if (footer_signing_key_metadata.size() > 0) {
+ metadata_->__set_footer_signing_key_metadata(footer_signing_key_metadata);
+ }
+ }
+
+ ToParquet(static_cast<parquet::schema::GroupNode*>(schema_->schema_root().get()),
+ &metadata_->schema);
+ auto file_meta_data = std::unique_ptr<FileMetaData>(new FileMetaData());
+ file_meta_data->impl_->metadata_ = std::move(metadata_);
+ file_meta_data->impl_->InitSchema();
+ file_meta_data->impl_->InitKeyValueMetadata();
+ return file_meta_data;
+ }
+
+ std::unique_ptr<FileCryptoMetaData> BuildFileCryptoMetaData() {
+ if (crypto_metadata_ == nullptr) {
+ return nullptr;
+ }
+
+ auto file_encryption_properties = properties_->file_encryption_properties();
+
+ crypto_metadata_->__set_encryption_algorithm(
+ ToThrift(file_encryption_properties->algorithm()));
+ std::string key_metadata = file_encryption_properties->footer_key_metadata();
+
+ if (!key_metadata.empty()) {
+ crypto_metadata_->__set_key_metadata(key_metadata);
+ }
+
+ std::unique_ptr<FileCryptoMetaData> file_crypto_metadata =
+ std::unique_ptr<FileCryptoMetaData>(new FileCryptoMetaData());
+ file_crypto_metadata->impl_->metadata_ = std::move(crypto_metadata_);
+
+ return file_crypto_metadata;
+ }
+
+ protected:
+ std::unique_ptr<format::FileMetaData> metadata_;
+ std::unique_ptr<format::FileCryptoMetaData> crypto_metadata_;
+
+ private:
+ const std::shared_ptr<WriterProperties> properties_;
+ std::vector<format::RowGroup> row_groups_;
+
+ std::unique_ptr<RowGroupMetaDataBuilder> current_row_group_builder_;
+ const SchemaDescriptor* schema_;
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
+};
+
+std::unique_ptr<FileMetaDataBuilder> FileMetaDataBuilder::Make(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata) {
+ return std::unique_ptr<FileMetaDataBuilder>(
+ new FileMetaDataBuilder(schema, std::move(props), std::move(key_value_metadata)));
+}
+
+FileMetaDataBuilder::FileMetaDataBuilder(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata)
+ : impl_{std::unique_ptr<FileMetaDataBuilderImpl>(new FileMetaDataBuilderImpl(
+ schema, std::move(props), std::move(key_value_metadata)))} {}
+
+FileMetaDataBuilder::~FileMetaDataBuilder() = default;
+
+RowGroupMetaDataBuilder* FileMetaDataBuilder::AppendRowGroup() {
+ return impl_->AppendRowGroup();
+}
+
+std::unique_ptr<FileMetaData> FileMetaDataBuilder::Finish() { return impl_->Finish(); }
+
+std::unique_ptr<FileCryptoMetaData> FileMetaDataBuilder::GetCryptoMetaData() {
+ return impl_->BuildFileCryptoMetaData();
+}
+
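+// A minimal sketch of the writer-side builder flow defined above (the names
+// schema, props, and the Finish() argument values are assumptions):
+//
+//   auto builder = FileMetaDataBuilder::Make(schema, props);
+//   RowGroupMetaDataBuilder* rg = builder->AppendRowGroup();
+//   ColumnChunkMetaDataBuilder* col = rg->NextColumnChunk();
+//   col->Finish(num_values, dict_page_offset, index_page_offset,
+//               data_page_offset, compressed_size, uncompressed_size,
+//               has_dictionary, dictionary_fallback, dict_stats, data_stats);
+//   rg->set_num_rows(num_rows);
+//   rg->Finish(total_bytes_written, /*row_group_ordinal=*/0);
+//   std::unique_ptr<FileMetaData> metadata = builder->Finish();
+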
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/metadata.h b/contrib/libs/apache/arrow/cpp/src/parquet/metadata.h
index b432c20cf64..1865115e423 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/metadata.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/metadata.h
@@ -1,484 +1,484 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <map>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "parquet/platform.h"
-#include "parquet/properties.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-class ColumnDescriptor;
-class EncodedStatistics;
-class Statistics;
-class SchemaDescriptor;
-
-class FileCryptoMetaData;
-class InternalFileDecryptor;
-class Decryptor;
-class Encryptor;
-class FooterSigningEncryptor;
-
-namespace schema {
-
-class ColumnPath;
-
-} // namespace schema
-
-using KeyValueMetadata = ::arrow::KeyValueMetadata;
-
-class PARQUET_EXPORT ApplicationVersion {
- public:
- // Known Versions with Issues
- static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
- static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
- static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
- static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
-
- // Application that wrote the file. e.g. "IMPALA"
- std::string application_;
- // Build name
- std::string build_;
-
- // Version of the application that wrote the file, expressed as
- // (<major>.<minor>.<patch>). Unmatched parts default to 0.
- // "1.2.3" => {1, 2, 3}
- // "1.2" => {1, 2, 0}
- // "1.2-cdh5" => {1, 2, 0}
- struct {
- int major;
- int minor;
- int patch;
- std::string unknown;
- std::string pre_release;
- std::string build_info;
- } version;
-
- ApplicationVersion() = default;
- explicit ApplicationVersion(const std::string& created_by);
- ApplicationVersion(std::string application, int major, int minor, int patch);
-
- // Returns true if version is strictly less than other_version
- bool VersionLt(const ApplicationVersion& other_version) const;
-
-  // Returns true if version is equal to other_version
- bool VersionEq(const ApplicationVersion& other_version) const;
-
- // Checks if the Version has the correct statistics for a given column
- bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics,
- SortOrder::type sort_order = SortOrder::SIGNED) const;
-};
-
-class PARQUET_EXPORT ColumnCryptoMetaData {
- public:
- static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata);
- ~ColumnCryptoMetaData();
-
- bool Equals(const ColumnCryptoMetaData& other) const;
-
- std::shared_ptr<schema::ColumnPath> path_in_schema() const;
- bool encrypted_with_footer_key() const;
- const std::string& key_metadata() const;
-
- private:
- explicit ColumnCryptoMetaData(const uint8_t* metadata);
-
- class ColumnCryptoMetaDataImpl;
- std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
-};
-
-/// \brief Public struct for Thrift PageEncodingStats in ColumnChunkMetaData
-struct PageEncodingStats {
- PageType::type page_type;
- Encoding::type encoding;
- int32_t count;
-};
-
-/// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
-class PARQUET_EXPORT ColumnChunkMetaData {
- public:
- // API convenience to get a MetaData accessor
- static std::unique_ptr<ColumnChunkMetaData> Make(
- const void* metadata, const ColumnDescriptor* descr,
- const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1,
- int16_t column_ordinal = -1,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
-
- ~ColumnChunkMetaData();
-
- bool Equals(const ColumnChunkMetaData& other) const;
-
- // column chunk
- int64_t file_offset() const;
-
- // parameter is only used when a dataset is spread across multiple files
- const std::string& file_path() const;
-
- // column metadata
- bool is_metadata_set() const;
- Type::type type() const;
- int64_t num_values() const;
- std::shared_ptr<schema::ColumnPath> path_in_schema() const;
- bool is_stats_set() const;
- std::shared_ptr<Statistics> statistics() const;
-
- Compression::type compression() const;
- // Indicate if the ColumnChunk compression is supported by the current
- // compiled parquet library.
- bool can_decompress() const;
-
- const std::vector<Encoding::type>& encodings() const;
- const std::vector<PageEncodingStats>& encoding_stats() const;
- bool has_dictionary_page() const;
- int64_t dictionary_page_offset() const;
- int64_t data_page_offset() const;
- bool has_index_page() const;
- int64_t index_page_offset() const;
- int64_t total_compressed_size() const;
- int64_t total_uncompressed_size() const;
- std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
-
- private:
- explicit ColumnChunkMetaData(
- const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
- int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
- // PIMPL Idiom
- class ColumnChunkMetaDataImpl;
- std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
-};
-
-/// \brief RowGroupMetaData is a proxy around format::RowGroupMetaData.
-class PARQUET_EXPORT RowGroupMetaData {
- public:
- /// \brief Create a RowGroupMetaData from a serialized thrift message.
- static std::unique_ptr<RowGroupMetaData> Make(
- const void* metadata, const SchemaDescriptor* schema,
- const ApplicationVersion* writer_version = NULLPTR,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
-
- ~RowGroupMetaData();
-
- bool Equals(const RowGroupMetaData& other) const;
-
- /// \brief The number of columns in this row group. The order must match the
- /// parent's column ordering.
- int num_columns() const;
-
- /// \brief Return the ColumnChunkMetaData of the corresponding column ordinal.
- ///
-  /// WARNING: the returned object references a memory location in its parent
-  /// (RowGroupMetaData) object. Hence, the parent must outlive the returned
-  /// object.
- ///
- /// \param[in] index of the ColumnChunkMetaData to retrieve.
- ///
-  /// \throws ParquetException if the index is out of bounds.
- std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int index) const;
-
- /// \brief Number of rows in this row group.
- int64_t num_rows() const;
-
- /// \brief Total byte size of all the uncompressed column data in this row group.
- int64_t total_byte_size() const;
-
- /// \brief Total byte size of all the compressed (and potentially encrypted)
- /// column data in this row group.
- ///
- /// This information is optional and may be 0 if omitted.
- int64_t total_compressed_size() const;
-
- /// \brief Byte offset from beginning of file to first page (data or
- /// dictionary) in this row group
- ///
- /// The file_offset field that this method exposes is optional. This method
- /// will return 0 if that field is not set to a meaningful value.
- int64_t file_offset() const;
- // Return const-pointer to make it clear that this object is not to be copied
- const SchemaDescriptor* schema() const;
- // Indicate if all of the RowGroup's ColumnChunks can be decompressed.
- bool can_decompress() const;
-
- private:
- explicit RowGroupMetaData(
- const void* metadata, const SchemaDescriptor* schema,
- const ApplicationVersion* writer_version = NULLPTR,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
- // PIMPL Idiom
- class RowGroupMetaDataImpl;
- std::unique_ptr<RowGroupMetaDataImpl> impl_;
-};
-
-class FileMetaDataBuilder;
-
-/// \brief FileMetaData is a proxy around format::FileMetaData.
-class PARQUET_EXPORT FileMetaData {
- public:
- /// \brief Create a FileMetaData from a serialized thrift message.
- static std::shared_ptr<FileMetaData> Make(
- const void* serialized_metadata, uint32_t* inout_metadata_len,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
-
- ~FileMetaData();
-
- bool Equals(const FileMetaData& other) const;
-
- /// \brief The number of top-level columns in the schema.
- ///
- /// Parquet thrift definition requires that nested schema elements are
- /// flattened. This method returns the number of columns in the un-flattened
- /// version.
- int num_columns() const;
-
- /// \brief The number of flattened schema elements.
- ///
- /// Parquet thrift definition requires that nested schema elements are
- /// flattened. This method returns the total number of elements in the
- /// flattened list.
- int num_schema_elements() const;
-
- /// \brief The total number of rows.
- int64_t num_rows() const;
-
- /// \brief The number of row groups in the file.
- int num_row_groups() const;
-
- /// \brief Return the RowGroupMetaData of the corresponding row group ordinal.
- ///
-  /// WARNING: the returned object references a memory location in its parent
-  /// (FileMetaData) object. Hence, the parent must outlive the returned object.
- ///
- /// \param[in] index of the RowGroup to retrieve.
- ///
-  /// \throws ParquetException if the index is out of bounds.
- std::unique_ptr<RowGroupMetaData> RowGroup(int index) const;
-
- /// \brief Return the version of the file.
- ParquetVersion::type version() const;
-
- /// \brief Return the application's user-agent string of the writer.
- const std::string& created_by() const;
-
- /// \brief Return the application's version of the writer.
- const ApplicationVersion& writer_version() const;
-
- /// \brief Size of the original thrift encoded metadata footer.
- uint32_t size() const;
-
- /// \brief Indicate if all of the FileMetadata's RowGroups can be decompressed.
- ///
- /// This will return false if any of the RowGroup's page is compressed with a
- /// compression format which is not compiled in the current parquet library.
- bool can_decompress() const;
-
- bool is_encryption_algorithm_set() const;
- EncryptionAlgorithm encryption_algorithm() const;
- const std::string& footer_signing_key_metadata() const;
-
- /// \brief Verify signature of FileMetaData when file is encrypted but footer
- /// is not encrypted (plaintext footer).
- bool VerifySignature(const void* signature);
-
- void WriteTo(::arrow::io::OutputStream* dst,
- const std::shared_ptr<Encryptor>& encryptor = NULLPTR) const;
-
- /// \brief Return Thrift-serialized representation of the metadata as a
- /// string
- std::string SerializeToString() const;
-
- // Return const-pointer to make it clear that this object is not to be copied
- const SchemaDescriptor* schema() const;
-
- const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
-
-  /// \brief Set the file path for all ColumnChunks in all RowGroups.
-  ///
-  /// Commonly used by systems (e.g. Dask, Spark) that generate a metadata-only
-  /// Parquet file. The path is usually relative to that index file.
- ///
- /// \param[in] path to set.
- void set_file_path(const std::string& path);
-
- /// \brief Merge row groups from another metadata file into this one.
- ///
- /// The schema of the input FileMetaData must be equal to the
- /// schema of this object.
- ///
-  /// This is used by systems that create an aggregate metadata-only file by
- /// concatenating the row groups of multiple files. This newly created
- /// metadata file acts as an index of all available row groups.
- ///
- /// \param[in] other FileMetaData to merge the row groups from.
- ///
- /// \throws ParquetException if schemas are not equal.
- void AppendRowGroups(const FileMetaData& other);
-
- /// \brief Return a FileMetaData containing a subset of the row groups in this
- /// FileMetaData.
- std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;
-
- private:
- friend FileMetaDataBuilder;
- friend class SerializedFile;
-
- explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len,
- std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
-
- void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
-
- // PIMPL Idiom
- FileMetaData();
- class FileMetaDataImpl;
- std::unique_ptr<FileMetaDataImpl> impl_;
-};
-
-class PARQUET_EXPORT FileCryptoMetaData {
- public:
- // API convenience to get a MetaData accessor
- static std::shared_ptr<FileCryptoMetaData> Make(const uint8_t* serialized_metadata,
- uint32_t* metadata_len);
- ~FileCryptoMetaData();
-
- EncryptionAlgorithm encryption_algorithm() const;
- const std::string& key_metadata() const;
-
- void WriteTo(::arrow::io::OutputStream* dst) const;
-
- private:
- friend FileMetaDataBuilder;
- FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len);
-
- // PIMPL Idiom
- FileCryptoMetaData();
- class FileCryptoMetaDataImpl;
- std::unique_ptr<FileCryptoMetaDataImpl> impl_;
-};
-
-// Builder API
-class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
- public:
-  // API convenience to get a MetaData builder
- static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
- std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column);
-
- static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
- std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
- void* contents);
-
- ~ColumnChunkMetaDataBuilder();
-
- // column chunk
- // Used when a dataset is spread across multiple files
- void set_file_path(const std::string& path);
- // column metadata
- void SetStatistics(const EncodedStatistics& stats);
- // get the column descriptor
- const ColumnDescriptor* descr() const;
-
- int64_t total_compressed_size() const;
- // commit the metadata
-
- void Finish(int64_t num_values, int64_t dictionary_page_offset,
- int64_t index_page_offset, int64_t data_page_offset,
- int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
- bool dictionary_fallback,
- const std::map<Encoding::type, int32_t>& dict_encoding_stats_,
- const std::map<Encoding::type, int32_t>& data_encoding_stats_,
- const std::shared_ptr<Encryptor>& encryptor = NULLPTR);
-
- // The metadata contents, suitable for passing to ColumnChunkMetaData::Make
- const void* contents() const;
-
- // For writing metadata at end of column chunk
- void WriteTo(::arrow::io::OutputStream* sink);
-
- private:
- explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
- const ColumnDescriptor* column);
- explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
- const ColumnDescriptor* column, void* contents);
- // PIMPL Idiom
- class ColumnChunkMetaDataBuilderImpl;
- std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
-};
-
-class PARQUET_EXPORT RowGroupMetaDataBuilder {
- public:
-  // API convenience to get a MetaData builder
- static std::unique_ptr<RowGroupMetaDataBuilder> Make(
- std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
- void* contents);
-
- ~RowGroupMetaDataBuilder();
-
- ColumnChunkMetaDataBuilder* NextColumnChunk();
- int num_columns();
- int64_t num_rows();
- int current_column() const;
-
- void set_num_rows(int64_t num_rows);
-
- // commit the metadata
- void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1);
-
- private:
- explicit RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
- const SchemaDescriptor* schema_, void* contents);
- // PIMPL Idiom
- class RowGroupMetaDataBuilderImpl;
- std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
-};
-
-class PARQUET_EXPORT FileMetaDataBuilder {
- public:
-  // API convenience to get a MetaData builder
- static std::unique_ptr<FileMetaDataBuilder> Make(
- const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
-
- ~FileMetaDataBuilder();
-
- // The prior RowGroupMetaDataBuilder (if any) is destroyed
- RowGroupMetaDataBuilder* AppendRowGroup();
-
- // Complete the Thrift structure
- std::unique_ptr<FileMetaData> Finish();
-
- // crypto metadata
- std::unique_ptr<FileCryptoMetaData> GetCryptoMetaData();
-
- private:
- explicit FileMetaDataBuilder(
- const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
- std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
- // PIMPL Idiom
- class FileMetaDataBuilderImpl;
- std::unique_ptr<FileMetaDataBuilderImpl> impl_;
-};
-
-PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+class ColumnDescriptor;
+class EncodedStatistics;
+class Statistics;
+class SchemaDescriptor;
+
+class FileCryptoMetaData;
+class InternalFileDecryptor;
+class Decryptor;
+class Encryptor;
+class FooterSigningEncryptor;
+
+namespace schema {
+
+class ColumnPath;
+
+} // namespace schema
+
+using KeyValueMetadata = ::arrow::KeyValueMetadata;
+
+class PARQUET_EXPORT ApplicationVersion {
+ public:
+ // Known Versions with Issues
+ static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
+ static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
+ static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
+ static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
+
+ // Application that wrote the file. e.g. "IMPALA"
+ std::string application_;
+ // Build name
+ std::string build_;
+
+ // Version of the application that wrote the file, expressed as
+ // (<major>.<minor>.<patch>). Unmatched parts default to 0.
+ // "1.2.3" => {1, 2, 3}
+ // "1.2" => {1, 2, 0}
+ // "1.2-cdh5" => {1, 2, 0}
+ struct {
+ int major;
+ int minor;
+ int patch;
+ std::string unknown;
+ std::string pre_release;
+ std::string build_info;
+ } version;
+
+ ApplicationVersion() = default;
+ explicit ApplicationVersion(const std::string& created_by);
+ ApplicationVersion(std::string application, int major, int minor, int patch);
+
+ // Returns true if version is strictly less than other_version
+ bool VersionLt(const ApplicationVersion& other_version) const;
+
+  // Returns true if version is equal to other_version
+ bool VersionEq(const ApplicationVersion& other_version) const;
+
+ // Checks if the Version has the correct statistics for a given column
+ bool HasCorrectStatistics(Type::type primitive, EncodedStatistics& statistics,
+ SortOrder::type sort_order = SortOrder::SIGNED) const;
+};
+
+class PARQUET_EXPORT ColumnCryptoMetaData {
+ public:
+ static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata);
+ ~ColumnCryptoMetaData();
+
+ bool Equals(const ColumnCryptoMetaData& other) const;
+
+ std::shared_ptr<schema::ColumnPath> path_in_schema() const;
+ bool encrypted_with_footer_key() const;
+ const std::string& key_metadata() const;
+
+ private:
+ explicit ColumnCryptoMetaData(const uint8_t* metadata);
+
+ class ColumnCryptoMetaDataImpl;
+ std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
+};
+
+/// \brief Public struct for Thrift PageEncodingStats in ColumnChunkMetaData
+struct PageEncodingStats {
+ PageType::type page_type;
+ Encoding::type encoding;
+ int32_t count;
+};
+
+/// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
+class PARQUET_EXPORT ColumnChunkMetaData {
+ public:
+ // API convenience to get a MetaData accessor
+ static std::unique_ptr<ColumnChunkMetaData> Make(
+ const void* metadata, const ColumnDescriptor* descr,
+ const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1,
+ int16_t column_ordinal = -1,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+ ~ColumnChunkMetaData();
+
+ bool Equals(const ColumnChunkMetaData& other) const;
+
+ // column chunk
+ int64_t file_offset() const;
+
+ // parameter is only used when a dataset is spread across multiple files
+ const std::string& file_path() const;
+
+ // column metadata
+ bool is_metadata_set() const;
+ Type::type type() const;
+ int64_t num_values() const;
+ std::shared_ptr<schema::ColumnPath> path_in_schema() const;
+ bool is_stats_set() const;
+ std::shared_ptr<Statistics> statistics() const;
+
+ Compression::type compression() const;
+ // Indicate if the ColumnChunk compression is supported by the current
+ // compiled parquet library.
+ bool can_decompress() const;
+
+ const std::vector<Encoding::type>& encodings() const;
+ const std::vector<PageEncodingStats>& encoding_stats() const;
+ bool has_dictionary_page() const;
+ int64_t dictionary_page_offset() const;
+ int64_t data_page_offset() const;
+ bool has_index_page() const;
+ int64_t index_page_offset() const;
+ int64_t total_compressed_size() const;
+ int64_t total_uncompressed_size() const;
+ std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
+
+ private:
+ explicit ColumnChunkMetaData(
+ const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
+ int16_t column_ordinal, const ApplicationVersion* writer_version = NULLPTR,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+ // PIMPL Idiom
+ class ColumnChunkMetaDataImpl;
+ std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
+};
+
+/// \brief RowGroupMetaData is a proxy around format::RowGroupMetaData.
+class PARQUET_EXPORT RowGroupMetaData {
+ public:
+ /// \brief Create a RowGroupMetaData from a serialized thrift message.
+ static std::unique_ptr<RowGroupMetaData> Make(
+ const void* metadata, const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version = NULLPTR,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+ ~RowGroupMetaData();
+
+ bool Equals(const RowGroupMetaData& other) const;
+
+ /// \brief The number of columns in this row group. The order must match the
+ /// parent's column ordering.
+ int num_columns() const;
+
+ /// \brief Return the ColumnChunkMetaData of the corresponding column ordinal.
+ ///
+  /// WARNING: the returned object references a memory location in its parent
+  /// (RowGroupMetaData) object. Hence, the parent must outlive the returned
+  /// object.
+ ///
+ /// \param[in] index of the ColumnChunkMetaData to retrieve.
+ ///
+  /// \throws ParquetException if the index is out of bounds.
+ std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int index) const;
+
+ /// \brief Number of rows in this row group.
+ int64_t num_rows() const;
+
+ /// \brief Total byte size of all the uncompressed column data in this row group.
+ int64_t total_byte_size() const;
+
+ /// \brief Total byte size of all the compressed (and potentially encrypted)
+ /// column data in this row group.
+ ///
+ /// This information is optional and may be 0 if omitted.
+ int64_t total_compressed_size() const;
+
+ /// \brief Byte offset from beginning of file to first page (data or
+ /// dictionary) in this row group
+ ///
+ /// The file_offset field that this method exposes is optional. This method
+ /// will return 0 if that field is not set to a meaningful value.
+ int64_t file_offset() const;
+ // Return const-pointer to make it clear that this object is not to be copied
+ const SchemaDescriptor* schema() const;
+ // Indicate if all of the RowGroup's ColumnChunks can be decompressed.
+ bool can_decompress() const;
+
+ private:
+ explicit RowGroupMetaData(
+ const void* metadata, const SchemaDescriptor* schema,
+ const ApplicationVersion* writer_version = NULLPTR,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+ // PIMPL Idiom
+ class RowGroupMetaDataImpl;
+ std::unique_ptr<RowGroupMetaDataImpl> impl_;
+};
+
+class FileMetaDataBuilder;
+
+/// \brief FileMetaData is a proxy around format::FileMetaData.
+class PARQUET_EXPORT FileMetaData {
+ public:
+ /// \brief Create a FileMetaData from a serialized thrift message.
+ static std::shared_ptr<FileMetaData> Make(
+ const void* serialized_metadata, uint32_t* inout_metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+ ~FileMetaData();
+
+ bool Equals(const FileMetaData& other) const;
+
+ /// \brief The number of top-level columns in the schema.
+ ///
+ /// Parquet thrift definition requires that nested schema elements are
+ /// flattened. This method returns the number of columns in the un-flattened
+ /// version.
+ int num_columns() const;
+
+ /// \brief The number of flattened schema elements.
+ ///
+ /// Parquet thrift definition requires that nested schema elements are
+ /// flattened. This method returns the total number of elements in the
+ /// flattened list.
+ int num_schema_elements() const;
+
+ /// \brief The total number of rows.
+ int64_t num_rows() const;
+
+ /// \brief The number of row groups in the file.
+ int num_row_groups() const;
+
+ /// \brief Return the RowGroupMetaData of the corresponding row group ordinal.
+ ///
+  /// WARNING: the returned object references a memory location in its parent
+  /// (FileMetaData) object. Hence, the parent must outlive the returned object.
+ ///
+ /// \param[in] index of the RowGroup to retrieve.
+ ///
+  /// \throws ParquetException if the index is out of bounds.
+ std::unique_ptr<RowGroupMetaData> RowGroup(int index) const;
+
+ /// \brief Return the version of the file.
+ ParquetVersion::type version() const;
+
+ /// \brief Return the application's user-agent string of the writer.
+ const std::string& created_by() const;
+
+ /// \brief Return the application's version of the writer.
+ const ApplicationVersion& writer_version() const;
+
+ /// \brief Size of the original thrift encoded metadata footer.
+ uint32_t size() const;
+
+ /// \brief Indicate if all of the FileMetadata's RowGroups can be decompressed.
+ ///
+ /// This will return false if any of the RowGroup's page is compressed with a
+ /// compression format which is not compiled in the current parquet library.
+ bool can_decompress() const;
+
+ bool is_encryption_algorithm_set() const;
+ EncryptionAlgorithm encryption_algorithm() const;
+ const std::string& footer_signing_key_metadata() const;
+
+ /// \brief Verify signature of FileMetaData when file is encrypted but footer
+ /// is not encrypted (plaintext footer).
+ bool VerifySignature(const void* signature);
+
+ void WriteTo(::arrow::io::OutputStream* dst,
+ const std::shared_ptr<Encryptor>& encryptor = NULLPTR) const;
+
+ /// \brief Return Thrift-serialized representation of the metadata as a
+ /// string
+ std::string SerializeToString() const;
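+
+  // Illustrative round trip (a sketch): serialize, then re-parse.
+  //   std::string blob = md->SerializeToString();
+  //   uint32_t len = static_cast<uint32_t>(blob.size());
+  //   auto md2 = FileMetaData::Make(blob.data(), &len);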
+
+ // Return const-pointer to make it clear that this object is not to be copied
+ const SchemaDescriptor* schema() const;
+
+ const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
+
+  /// \brief Set the file path for all ColumnChunks in all RowGroups.
+  ///
+  /// Commonly used by systems (e.g. Dask, Spark) that generate a metadata-only
+  /// Parquet file. The path is usually relative to that index file.
+ ///
+ /// \param[in] path to set.
+ void set_file_path(const std::string& path);
+
+ /// \brief Merge row groups from another metadata file into this one.
+ ///
+ /// The schema of the input FileMetaData must be equal to the
+ /// schema of this object.
+ ///
+  /// This is used by systems that create an aggregate metadata-only file by
+ /// concatenating the row groups of multiple files. This newly created
+ /// metadata file acts as an index of all available row groups.
+ ///
+ /// \param[in] other FileMetaData to merge the row groups from.
+ ///
+ /// \throws ParquetException if schemas are not equal.
+ void AppendRowGroups(const FileMetaData& other);
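+
+  // A minimal sketch of building an aggregate metadata-only index (the file
+  // names are assumptions):
+  //   md_a->set_file_path("part-0.parquet");
+  //   md_b->set_file_path("part-1.parquet");
+  //   md_a->AppendRowGroups(*md_b);  // md_a now indexes both files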
+
+ /// \brief Return a FileMetaData containing a subset of the row groups in this
+ /// FileMetaData.
+ std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;
+
+ private:
+ friend FileMetaDataBuilder;
+ friend class SerializedFile;
+
+ explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len,
+ std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+ void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
+
+ // PIMPL Idiom
+ FileMetaData();
+ class FileMetaDataImpl;
+ std::unique_ptr<FileMetaDataImpl> impl_;
+};
+
+class PARQUET_EXPORT FileCryptoMetaData {
+ public:
+ // API convenience to get a MetaData accessor
+ static std::shared_ptr<FileCryptoMetaData> Make(const uint8_t* serialized_metadata,
+ uint32_t* metadata_len);
+ ~FileCryptoMetaData();
+
+ EncryptionAlgorithm encryption_algorithm() const;
+ const std::string& key_metadata() const;
+
+ void WriteTo(::arrow::io::OutputStream* dst) const;
+
+ private:
+ friend FileMetaDataBuilder;
+ FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len);
+
+ // PIMPL Idiom
+ FileCryptoMetaData();
+ class FileCryptoMetaDataImpl;
+ std::unique_ptr<FileCryptoMetaDataImpl> impl_;
+};
+
+// Builder API
+class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
+ public:
+  // API convenience to get a MetaData builder
+ static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column);
+
+ static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
+ std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
+ void* contents);
+
+ ~ColumnChunkMetaDataBuilder();
+
+ // column chunk
+ // Used when a dataset is spread across multiple files
+ void set_file_path(const std::string& path);
+ // column metadata
+ void SetStatistics(const EncodedStatistics& stats);
+ // get the column descriptor
+ const ColumnDescriptor* descr() const;
+
+ int64_t total_compressed_size() const;
+ // commit the metadata
+
+ void Finish(int64_t num_values, int64_t dictionary_page_offset,
+ int64_t index_page_offset, int64_t data_page_offset,
+ int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
+ bool dictionary_fallback,
+ const std::map<Encoding::type, int32_t>& dict_encoding_stats_,
+ const std::map<Encoding::type, int32_t>& data_encoding_stats_,
+ const std::shared_ptr<Encryptor>& encryptor = NULLPTR);
+
+ // The metadata contents, suitable for passing to ColumnChunkMetaData::Make
+ const void* contents() const;
+
+ // For writing metadata at end of column chunk
+ void WriteTo(::arrow::io::OutputStream* sink);
+
+ private:
+ explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+ const ColumnDescriptor* column);
+ explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+ const ColumnDescriptor* column, void* contents);
+ // PIMPL Idiom
+ class ColumnChunkMetaDataBuilderImpl;
+ std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
+};
+
+class PARQUET_EXPORT RowGroupMetaDataBuilder {
+ public:
+  // API convenience to get a MetaData builder
+ static std::unique_ptr<RowGroupMetaDataBuilder> Make(
+ std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
+ void* contents);
+
+ ~RowGroupMetaDataBuilder();
+
+ ColumnChunkMetaDataBuilder* NextColumnChunk();
+ int num_columns();
+ int64_t num_rows();
+ int current_column() const;
+
+ void set_num_rows(int64_t num_rows);
+
+ // commit the metadata
+ void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1);
+
+ private:
+ explicit RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+ const SchemaDescriptor* schema_, void* contents);
+ // PIMPL Idiom
+ class RowGroupMetaDataBuilderImpl;
+ std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
+};
+
+class PARQUET_EXPORT FileMetaDataBuilder {
+ public:
+  // API convenience to get a MetaData builder
+ static std::unique_ptr<FileMetaDataBuilder> Make(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
+
+ ~FileMetaDataBuilder();
+
+ // The prior RowGroupMetaDataBuilder (if any) is destroyed
+ RowGroupMetaDataBuilder* AppendRowGroup();
+
+ // Complete the Thrift structure
+ std::unique_ptr<FileMetaData> Finish();
+
+ // crypto metadata
+ std::unique_ptr<FileCryptoMetaData> GetCryptoMetaData();
+
+ private:
+ explicit FileMetaDataBuilder(
+ const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props,
+ std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
+ // PIMPL Idiom
+ class FileMetaDataBuilderImpl;
+ std::unique_ptr<FileMetaDataBuilderImpl> impl_;
+};
+
+PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc b/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc
index 69b38478172..07a936e0412 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.cc
@@ -1,222 +1,222 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//-----------------------------------------------------------------------------
-// MurmurHash3 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
-
-// Note - The x86 and x64 versions do _not_ produce the same results, as the
-// algorithms are optimized for their respective platforms. You can still
-// compile and run any of them on any platform, but your performance with the
-// non-native version will be less than optimal.
-
-#include "parquet/murmur3.h"
-
-namespace parquet {
-
-#if defined(_MSC_VER)
-
-#define FORCE_INLINE __forceinline
-#define ROTL64(x, y) _rotl64(x, y)
-
-#else // defined(_MSC_VER)
-
-#define FORCE_INLINE inline __attribute__((always_inline))
-inline uint64_t rotl64(uint64_t x, int8_t r) { return (x << r) | (x >> (64 - r)); }
-#define ROTL64(x, y) rotl64(x, y)
-
-#endif // !defined(_MSC_VER)
-
-#define BIG_CONSTANT(x) (x##LLU)
-
-//-----------------------------------------------------------------------------
-// Block read - if your platform needs to do endian-swapping or can only
-// handle aligned reads, do the conversion here
-
-FORCE_INLINE uint32_t getblock32(const uint32_t* p, int i) { return p[i]; }
-
-FORCE_INLINE uint64_t getblock64(const uint64_t* p, int i) { return p[i]; }
-
-//-----------------------------------------------------------------------------
-// Finalization mix - force all bits of a hash block to avalanche
-
-FORCE_INLINE uint32_t fmix32(uint32_t h) {
- h ^= h >> 16;
- h *= 0x85ebca6b;
- h ^= h >> 13;
- h *= 0xc2b2ae35;
- h ^= h >> 16;
-
- return h;
-}
-
-//----------
-
-FORCE_INLINE uint64_t fmix64(uint64_t k) {
- k ^= k >> 33;
- k *= BIG_CONSTANT(0xff51afd7ed558ccd);
- k ^= k >> 33;
- k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
- k ^= k >> 33;
-
- return k;
-}
-
-//-----------------------------------------------------------------------------
-
-void Hash_x64_128(const void* key, const int len, const uint32_t seed, uint64_t out[2]) {
- const uint8_t* data = (const uint8_t*)key;
- const int nblocks = len / 16;
-
- uint64_t h1 = seed;
- uint64_t h2 = seed;
-
- const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
- const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
-
- //----------
- // body
-
- const uint64_t* blocks = (const uint64_t*)(data);
-
- for (int i = 0; i < nblocks; i++) {
- uint64_t k1 = getblock64(blocks, i * 2 + 0);
- uint64_t k2 = getblock64(blocks, i * 2 + 1);
-
- k1 *= c1;
- k1 = ROTL64(k1, 31);
- k1 *= c2;
- h1 ^= k1;
-
- h1 = ROTL64(h1, 27);
- h1 += h2;
- h1 = h1 * 5 + 0x52dce729;
-
- k2 *= c2;
- k2 = ROTL64(k2, 33);
- k2 *= c1;
- h2 ^= k2;
-
- h2 = ROTL64(h2, 31);
- h2 += h1;
- h2 = h2 * 5 + 0x38495ab5;
- }
-
- //----------
- // tail
-
- const uint8_t* tail = (const uint8_t*)(data + nblocks * 16);
-
- uint64_t k1 = 0;
- uint64_t k2 = 0;
-
- switch (len & 15) {
- case 15:
- k2 ^= ((uint64_t)tail[14]) << 48; // fall through
- case 14:
- k2 ^= ((uint64_t)tail[13]) << 40; // fall through
- case 13:
- k2 ^= ((uint64_t)tail[12]) << 32; // fall through
- case 12:
- k2 ^= ((uint64_t)tail[11]) << 24; // fall through
- case 11:
- k2 ^= ((uint64_t)tail[10]) << 16; // fall through
- case 10:
- k2 ^= ((uint64_t)tail[9]) << 8; // fall through
- case 9:
- k2 ^= ((uint64_t)tail[8]) << 0;
- k2 *= c2;
- k2 = ROTL64(k2, 33);
- k2 *= c1;
- h2 ^= k2; // fall through
-
- case 8:
- k1 ^= ((uint64_t)tail[7]) << 56; // fall through
- case 7:
- k1 ^= ((uint64_t)tail[6]) << 48; // fall through
- case 6:
- k1 ^= ((uint64_t)tail[5]) << 40; // fall through
- case 5:
- k1 ^= ((uint64_t)tail[4]) << 32; // fall through
- case 4:
- k1 ^= ((uint64_t)tail[3]) << 24; // fall through
- case 3:
- k1 ^= ((uint64_t)tail[2]) << 16; // fall through
- case 2:
- k1 ^= ((uint64_t)tail[1]) << 8; // fall through
- case 1:
- k1 ^= ((uint64_t)tail[0]) << 0;
- k1 *= c1;
- k1 = ROTL64(k1, 31);
- k1 *= c2;
- h1 ^= k1;
- }
-
- //----------
- // finalization
-
- h1 ^= len;
- h2 ^= len;
-
- h1 += h2;
- h2 += h1;
-
- h1 = fmix64(h1);
- h2 = fmix64(h2);
-
- h1 += h2;
- h2 += h1;
-
- reinterpret_cast<uint64_t*>(out)[0] = h1;
- reinterpret_cast<uint64_t*>(out)[1] = h2;
-}
-
-template <typename T>
-uint64_t HashHelper(T value, uint32_t seed) {
- uint64_t output[2];
- Hash_x64_128(reinterpret_cast<void*>(&value), sizeof(T), seed, output);
- return output[0];
-}
-
-uint64_t MurmurHash3::Hash(int32_t value) const { return HashHelper(value, seed_); }
-
-uint64_t MurmurHash3::Hash(int64_t value) const { return HashHelper(value, seed_); }
-
-uint64_t MurmurHash3::Hash(float value) const { return HashHelper(value, seed_); }
-
-uint64_t MurmurHash3::Hash(double value) const { return HashHelper(value, seed_); }
-
-uint64_t MurmurHash3::Hash(const FLBA* value, uint32_t len) const {
- uint64_t out[2];
- Hash_x64_128(reinterpret_cast<const void*>(value->ptr), len, seed_, out);
- return out[0];
-}
-
-uint64_t MurmurHash3::Hash(const Int96* value) const {
- uint64_t out[2];
- Hash_x64_128(reinterpret_cast<const void*>(value->value), sizeof(value->value), seed_,
- out);
- return out[0];
-}
-
-uint64_t MurmurHash3::Hash(const ByteArray* value) const {
- uint64_t out[2];
- Hash_x64_128(reinterpret_cast<const void*>(value->ptr), value->len, seed_, out);
- return out[0];
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+
+#include "parquet/murmur3.h"
+
+namespace parquet {
+
+#if defined(_MSC_VER)
+
+#define FORCE_INLINE __forceinline
+#define ROTL64(x, y) _rotl64(x, y)
+
+#else // defined(_MSC_VER)
+
+#define FORCE_INLINE inline __attribute__((always_inline))
+inline uint64_t rotl64(uint64_t x, int8_t r) { return (x << r) | (x >> (64 - r)); }
+#define ROTL64(x, y) rotl64(x, y)
+
+#endif // !defined(_MSC_VER)
+
+#define BIG_CONSTANT(x) (x##LLU)
+
+//-----------------------------------------------------------------------------
+// Block read - if your platform needs to do endian-swapping or can only
+// handle aligned reads, do the conversion here
+
+FORCE_INLINE uint32_t getblock32(const uint32_t* p, int i) { return p[i]; }
+
+FORCE_INLINE uint64_t getblock64(const uint64_t* p, int i) { return p[i]; }
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+FORCE_INLINE uint32_t fmix32(uint32_t h) {
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+
+ return h;
+}
+
+//----------
+
+FORCE_INLINE uint64_t fmix64(uint64_t k) {
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xff51afd7ed558ccd);
+ k ^= k >> 33;
+ k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+ k ^= k >> 33;
+
+ return k;
+}
+
+//-----------------------------------------------------------------------------
+
+void Hash_x64_128(const void* key, const int len, const uint32_t seed, uint64_t out[2]) {
+ const uint8_t* data = (const uint8_t*)key;
+ const int nblocks = len / 16;
+
+ uint64_t h1 = seed;
+ uint64_t h2 = seed;
+
+ const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
+ const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+
+ //----------
+ // body
+
+ const uint64_t* blocks = (const uint64_t*)(data);
+
+ for (int i = 0; i < nblocks; i++) {
+ uint64_t k1 = getblock64(blocks, i * 2 + 0);
+ uint64_t k2 = getblock64(blocks, i * 2 + 1);
+
+ k1 *= c1;
+ k1 = ROTL64(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+
+ h1 = ROTL64(h1, 27);
+ h1 += h2;
+ h1 = h1 * 5 + 0x52dce729;
+
+ k2 *= c2;
+ k2 = ROTL64(k2, 33);
+ k2 *= c1;
+ h2 ^= k2;
+
+ h2 = ROTL64(h2, 31);
+ h2 += h1;
+ h2 = h2 * 5 + 0x38495ab5;
+ }
+
+ //----------
+ // tail
+
+ const uint8_t* tail = (const uint8_t*)(data + nblocks * 16);
+
+ uint64_t k1 = 0;
+ uint64_t k2 = 0;
+
+ switch (len & 15) {
+ case 15:
+ k2 ^= ((uint64_t)tail[14]) << 48; // fall through
+ case 14:
+ k2 ^= ((uint64_t)tail[13]) << 40; // fall through
+ case 13:
+ k2 ^= ((uint64_t)tail[12]) << 32; // fall through
+ case 12:
+ k2 ^= ((uint64_t)tail[11]) << 24; // fall through
+ case 11:
+ k2 ^= ((uint64_t)tail[10]) << 16; // fall through
+ case 10:
+ k2 ^= ((uint64_t)tail[9]) << 8; // fall through
+ case 9:
+ k2 ^= ((uint64_t)tail[8]) << 0;
+ k2 *= c2;
+ k2 = ROTL64(k2, 33);
+ k2 *= c1;
+ h2 ^= k2; // fall through
+
+ case 8:
+ k1 ^= ((uint64_t)tail[7]) << 56; // fall through
+ case 7:
+ k1 ^= ((uint64_t)tail[6]) << 48; // fall through
+ case 6:
+ k1 ^= ((uint64_t)tail[5]) << 40; // fall through
+ case 5:
+ k1 ^= ((uint64_t)tail[4]) << 32; // fall through
+ case 4:
+ k1 ^= ((uint64_t)tail[3]) << 24; // fall through
+ case 3:
+ k1 ^= ((uint64_t)tail[2]) << 16; // fall through
+ case 2:
+ k1 ^= ((uint64_t)tail[1]) << 8; // fall through
+ case 1:
+ k1 ^= ((uint64_t)tail[0]) << 0;
+ k1 *= c1;
+ k1 = ROTL64(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+ }
+
+ //----------
+ // finalization
+
+ h1 ^= len;
+ h2 ^= len;
+
+ h1 += h2;
+ h2 += h1;
+
+ h1 = fmix64(h1);
+ h2 = fmix64(h2);
+
+ h1 += h2;
+ h2 += h1;
+
+ reinterpret_cast<uint64_t*>(out)[0] = h1;
+ reinterpret_cast<uint64_t*>(out)[1] = h2;
+}
+
+template <typename T>
+uint64_t HashHelper(T value, uint32_t seed) {
+ uint64_t output[2];
+ Hash_x64_128(reinterpret_cast<void*>(&value), sizeof(T), seed, output);
+ return output[0];
+}
+
+uint64_t MurmurHash3::Hash(int32_t value) const { return HashHelper(value, seed_); }
+
+uint64_t MurmurHash3::Hash(int64_t value) const { return HashHelper(value, seed_); }
+
+uint64_t MurmurHash3::Hash(float value) const { return HashHelper(value, seed_); }
+
+uint64_t MurmurHash3::Hash(double value) const { return HashHelper(value, seed_); }
+
+uint64_t MurmurHash3::Hash(const FLBA* value, uint32_t len) const {
+ uint64_t out[2];
+ Hash_x64_128(reinterpret_cast<const void*>(value->ptr), len, seed_, out);
+ return out[0];
+}
+
+uint64_t MurmurHash3::Hash(const Int96* value) const {
+ uint64_t out[2];
+ Hash_x64_128(reinterpret_cast<const void*>(value->value), sizeof(value->value), seed_,
+ out);
+ return out[0];
+}
+
+uint64_t MurmurHash3::Hash(const ByteArray* value) const {
+ uint64_t out[2];
+ Hash_x64_128(reinterpret_cast<const void*>(value->ptr), value->len, seed_, out);
+ return out[0];
+}
+
+} // namespace parquet
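
The fmix64 finalizer above is what forces every input bit to affect every output bit; a self-contained copy (same constants, outside the library) makes the avalanche behavior easy to poke at — an illustrative sketch, not part of the parquet API:

#include <cstdint>
#include <cstdio>

// Standalone copy of the fmix64 finalizer from murmur3.cc, for illustration.
static uint64_t fmix64_demo(uint64_t k) {
  k ^= k >> 33;
  k *= 0xff51afd7ed558ccdULL;
  k ^= k >> 33;
  k *= 0xc4ceb9fe1a85ec53ULL;
  k ^= k >> 33;
  return k;
}

int main() {
  // Inputs that differ in a single bit should produce unrelated outputs.
  std::printf("%016llx\n", static_cast<unsigned long long>(fmix64_demo(1)));
  std::printf("%016llx\n", static_cast<unsigned long long>(fmix64_demo(3)));
  return 0;
}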
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h b/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h
index 2dcb8b5bffa..acf7088e44b 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/murmur3.h
@@ -1,54 +1,54 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//-----------------------------------------------------------------------------
-// MurmurHash3 was written by Austin Appleby, and is placed in the public
-// domain. The author hereby disclaims copyright to this source code.
-
-#pragma once
-
-#include <cstdint>
-
-#include "parquet/hasher.h"
-#include "parquet/platform.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-/// Source:
-/// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
-/// (Modified to adapt to coding conventions and to inherit the Hasher abstract class)
-class PARQUET_EXPORT MurmurHash3 : public Hasher {
- public:
- MurmurHash3() : seed_(DEFAULT_SEED) {}
- uint64_t Hash(int32_t value) const override;
- uint64_t Hash(int64_t value) const override;
- uint64_t Hash(float value) const override;
- uint64_t Hash(double value) const override;
- uint64_t Hash(const Int96* value) const override;
- uint64_t Hash(const ByteArray* value) const override;
- uint64_t Hash(const FLBA* val, uint32_t len) const override;
-
- private:
- // Default seed for the hash; it comes from the Bloom filter in parquet-mr and
- // was generated by Java's System.nanoTime().
- static constexpr int DEFAULT_SEED = 1361930890;
-
- uint32_t seed_;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+#pragma once
+
+#include <cstdint>
+
+#include "parquet/hasher.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// Source:
+/// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
+/// (Modified to adapt to coding conventions and to inherit the Hasher abstract class)
+class PARQUET_EXPORT MurmurHash3 : public Hasher {
+ public:
+ MurmurHash3() : seed_(DEFAULT_SEED) {}
+ uint64_t Hash(int32_t value) const override;
+ uint64_t Hash(int64_t value) const override;
+ uint64_t Hash(float value) const override;
+ uint64_t Hash(double value) const override;
+ uint64_t Hash(const Int96* value) const override;
+ uint64_t Hash(const ByteArray* value) const override;
+ uint64_t Hash(const FLBA* val, uint32_t len) const override;
+
+ private:
+ // Default seed for the hash; it comes from the Bloom filter in parquet-mr and
+ // was generated by Java's System.nanoTime().
+ static constexpr int DEFAULT_SEED = 1361930890;
+
+ uint32_t seed_;
+};
+
+} // namespace parquet
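
A short usage sketch of the hasher declared above, assuming the (len, ptr) ByteArray constructor from parquet/types.h; all values here are arbitrary:

#include <cstdint>

#include "parquet/murmur3.h"
#include "parquet/types.h"

uint64_t HashExamples() {
  parquet::MurmurHash3 hasher;  // seeded with DEFAULT_SEED
  uint64_t h_int = hasher.Hash(int64_t{42});
  static const uint8_t bytes[] = {'a', 'b', 'c'};
  parquet::ByteArray ba(3, bytes);
  uint64_t h_bytes = hasher.Hash(&ba);
  return h_int ^ h_bytes;  // combine just to exercise both overloads
}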
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/platform.cc b/contrib/libs/apache/arrow/cpp/src/parquet/platform.cc
index 70ed6f73df3..5c355c28be1 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/platform.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/platform.cc
@@ -1,41 +1,41 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/platform.h"
-
-#include <cstdint>
-#include <memory>
-#include <utility>
-
-#include "arrow/io/memory.h"
-
-#include "parquet/exception.h"
-
-namespace parquet {
-
-std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(MemoryPool* pool) {
- PARQUET_ASSIGN_OR_THROW(auto stream, ::arrow::io::BufferOutputStream::Create(
- kDefaultOutputStreamSize, pool));
- return stream;
-}
-
-std::shared_ptr<ResizableBuffer> AllocateBuffer(MemoryPool* pool, int64_t size) {
- PARQUET_ASSIGN_OR_THROW(auto result, ::arrow::AllocateResizableBuffer(size, pool));
- return std::move(result);
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/platform.h"
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "arrow/io/memory.h"
+
+#include "parquet/exception.h"
+
+namespace parquet {
+
+std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(MemoryPool* pool) {
+ PARQUET_ASSIGN_OR_THROW(auto stream, ::arrow::io::BufferOutputStream::Create(
+ kDefaultOutputStreamSize, pool));
+ return stream;
+}
+
+std::shared_ptr<ResizableBuffer> AllocateBuffer(MemoryPool* pool, int64_t size) {
+ PARQUET_ASSIGN_OR_THROW(auto result, ::arrow::AllocateResizableBuffer(size, pool));
+ return std::move(result);
+}
+
+} // namespace parquet
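
Both helpers above throw ParquetException on failure instead of returning a Status; a minimal round trip through them might look like this (a sketch using only the APIs shown in this file plus parquet/exception.h):

#include <memory>

#include "arrow/buffer.h"
#include "arrow/io/memory.h"
#include "parquet/exception.h"
#include "parquet/platform.h"

std::shared_ptr<arrow::Buffer> WriteAndFinish() {
  // The stream starts at kDefaultOutputStreamSize bytes and grows as needed.
  std::shared_ptr<arrow::io::BufferOutputStream> sink =
      parquet::CreateOutputStream();
  PARQUET_THROW_NOT_OK(sink->Write("hello", 5));

  // Scratch space from the default pool; resizable later via Resize().
  std::shared_ptr<parquet::ResizableBuffer> scratch =
      parquet::AllocateBuffer(arrow::default_memory_pool(), 64);
  (void)scratch;

  PARQUET_ASSIGN_OR_THROW(auto buffer, sink->Finish());
  return buffer;
}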
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/platform.h b/contrib/libs/apache/arrow/cpp/src/parquet/platform.h
index cd41aa7f5c6..00a193f144a 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/platform.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/platform.h
@@ -1,110 +1,110 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-
-#include "arrow/buffer.h" // IWYU pragma: export
-#include "arrow/io/interfaces.h" // IWYU pragma: export
-#include "arrow/status.h" // IWYU pragma: export
-#include "arrow/type_fwd.h" // IWYU pragma: export
-#include "arrow/util/macros.h" // IWYU pragma: export
-
-#if defined(_WIN32) || defined(__CYGWIN__)
-
-#if defined(_MSC_VER)
-#pragma warning(push)
-// Disable warning for STL types usage in DLL interface
-// https://web.archive.org/web/20130317015847/http://connect.microsoft.com/VisualStudio/feedback/details/696593/vc-10-vs-2010-basic-string-exports
-#pragma warning(disable : 4275 4251)
-// Disable diamond inheritance warnings
-#pragma warning(disable : 4250)
-// Disable macro redefinition warnings
-#pragma warning(disable : 4005)
-// Disable extern before exported template warnings
-#pragma warning(disable : 4910)
-#else
-#pragma GCC diagnostic ignored "-Wattributes"
-#endif
-
-#ifdef PARQUET_STATIC
-#define PARQUET_EXPORT
-#elif defined(PARQUET_EXPORTING)
-#define PARQUET_EXPORT __declspec(dllexport)
-#else
-#define PARQUET_EXPORT __declspec(dllimport)
-#endif
-
-#define PARQUET_NO_EXPORT
-
-#else // Not Windows
-#ifndef PARQUET_EXPORT
-#define PARQUET_EXPORT __attribute__((visibility("default")))
-#endif
-#ifndef PARQUET_NO_EXPORT
-#define PARQUET_NO_EXPORT __attribute__((visibility("hidden")))
-#endif
-#endif // Non-Windows
-
-// This is a complicated topic; some reading on it:
-// http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/
-#if defined(_MSC_VER) || defined(__clang__)
-#define PARQUET_TEMPLATE_CLASS_EXPORT
-#define PARQUET_TEMPLATE_EXPORT PARQUET_EXPORT
-#else
-#define PARQUET_TEMPLATE_CLASS_EXPORT PARQUET_EXPORT
-#define PARQUET_TEMPLATE_EXPORT
-#endif
-
-#define PARQUET_DISALLOW_COPY_AND_ASSIGN ARROW_DISALLOW_COPY_AND_ASSIGN
-
-#define PARQUET_NORETURN ARROW_NORETURN
-#define PARQUET_DEPRECATED ARROW_DEPRECATED
-
-// If ARROW_VALGRIND is set when compiling unit tests, also define
-// PARQUET_VALGRIND
-#ifdef ARROW_VALGRIND
-#define PARQUET_VALGRIND
-#endif
-
-namespace parquet {
-
-using Buffer = ::arrow::Buffer;
-using Codec = ::arrow::util::Codec;
-using Compression = ::arrow::Compression;
-using MemoryPool = ::arrow::MemoryPool;
-using MutableBuffer = ::arrow::MutableBuffer;
-using ResizableBuffer = ::arrow::ResizableBuffer;
-using ArrowInputFile = ::arrow::io::RandomAccessFile;
-using ArrowInputStream = ::arrow::io::InputStream;
-using ArrowOutputStream = ::arrow::io::OutputStream;
-
-constexpr int64_t kDefaultOutputStreamSize = 1024;
-
-constexpr int16_t kNonPageOrdinal = static_cast<int16_t>(-1);
-
-PARQUET_EXPORT
-std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
-PARQUET_EXPORT
-std::shared_ptr<ResizableBuffer> AllocateBuffer(
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 0);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/buffer.h" // IWYU pragma: export
+#include "arrow/io/interfaces.h" // IWYU pragma: export
+#include "arrow/status.h" // IWYU pragma: export
+#include "arrow/type_fwd.h" // IWYU pragma: export
+#include "arrow/util/macros.h" // IWYU pragma: export
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+// Disable warning for STL types usage in DLL interface
+// https://web.archive.org/web/20130317015847/http://connect.microsoft.com/VisualStudio/feedback/details/696593/vc-10-vs-2010-basic-string-exports
+#pragma warning(disable : 4275 4251)
+// Disable diamond inheritance warnings
+#pragma warning(disable : 4250)
+// Disable macro redefinition warnings
+#pragma warning(disable : 4005)
+// Disable extern before exported template warnings
+#pragma warning(disable : 4910)
+#else
+#pragma GCC diagnostic ignored "-Wattributes"
+#endif
+
+#ifdef PARQUET_STATIC
+#define PARQUET_EXPORT
+#elif defined(PARQUET_EXPORTING)
+#define PARQUET_EXPORT __declspec(dllexport)
+#else
+#define PARQUET_EXPORT __declspec(dllimport)
+#endif
+
+#define PARQUET_NO_EXPORT
+
+#else // Not Windows
+#ifndef PARQUET_EXPORT
+#define PARQUET_EXPORT __attribute__((visibility("default")))
+#endif
+#ifndef PARQUET_NO_EXPORT
+#define PARQUET_NO_EXPORT __attribute__((visibility("hidden")))
+#endif
+#endif // Non-Windows
+
+// This is a complicated topic; some reading on it:
+// http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/
+#if defined(_MSC_VER) || defined(__clang__)
+#define PARQUET_TEMPLATE_CLASS_EXPORT
+#define PARQUET_TEMPLATE_EXPORT PARQUET_EXPORT
+#else
+#define PARQUET_TEMPLATE_CLASS_EXPORT PARQUET_EXPORT
+#define PARQUET_TEMPLATE_EXPORT
+#endif
+
+#define PARQUET_DISALLOW_COPY_AND_ASSIGN ARROW_DISALLOW_COPY_AND_ASSIGN
+
+#define PARQUET_NORETURN ARROW_NORETURN
+#define PARQUET_DEPRECATED ARROW_DEPRECATED
+
+// If ARROW_VALGRIND is set when compiling unit tests, also define
+// PARQUET_VALGRIND
+#ifdef ARROW_VALGRIND
+#define PARQUET_VALGRIND
+#endif
+
+namespace parquet {
+
+using Buffer = ::arrow::Buffer;
+using Codec = ::arrow::util::Codec;
+using Compression = ::arrow::Compression;
+using MemoryPool = ::arrow::MemoryPool;
+using MutableBuffer = ::arrow::MutableBuffer;
+using ResizableBuffer = ::arrow::ResizableBuffer;
+using ArrowInputFile = ::arrow::io::RandomAccessFile;
+using ArrowInputStream = ::arrow::io::InputStream;
+using ArrowOutputStream = ::arrow::io::OutputStream;
+
+constexpr int64_t kDefaultOutputStreamSize = 1024;
+
+constexpr int16_t kNonPageOrdinal = static_cast<int16_t>(-1);
+
+PARQUET_EXPORT
+std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+PARQUET_EXPORT
+std::shared_ptr<ResizableBuffer> AllocateBuffer(
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 0);
+
+} // namespace parquet
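
How the export macros above are meant to be applied — `PageIndexLocator` is a made-up class name used purely to illustrate placement; on Windows builds PARQUET_EXPORT expands to dllexport/dllimport, elsewhere to default visibility:

#include <cstdint>

#include "parquet/platform.h"

// Part of the shared-library interface: annotate with PARQUET_EXPORT.
class PARQUET_EXPORT PageIndexLocator {
 public:
  int64_t Find(int64_t row_index) const;
};

// Internal helper: PARQUET_NO_EXPORT keeps it hidden on non-Windows builds.
class PARQUET_NO_EXPORT PageIndexLocatorImpl {};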
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc
index df2b4c50b5d..dfd4bd802ee 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/printer.cc
@@ -1,297 +1,297 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/printer.h"
-
-#include <cstdint>
-#include <cstdio>
-#include <memory>
-#include <ostream>
-#include <string>
-#include <vector>
-
-#include "arrow/util/key_value_metadata.h"
-#include "arrow/util/string.h"
-
-#include "parquet/column_scanner.h"
-#include "parquet/exception.h"
-#include "parquet/file_reader.h"
-#include "parquet/metadata.h"
-#include "parquet/schema.h"
-#include "parquet/statistics.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-class ColumnReader;
-
-// ----------------------------------------------------------------------
-// ParquetFilePrinter::DebugPrint
-
-// The fixed column width below is just an example value
-#define COL_WIDTH 30
-
-void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
- bool print_values, bool format_dump,
- bool print_key_value_metadata, const char* filename) {
- const FileMetaData* file_metadata = fileReader->metadata().get();
-
- stream << "File Name: " << filename << "\n";
- stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n";
- stream << "Created By: " << file_metadata->created_by() << "\n";
- stream << "Total rows: " << file_metadata->num_rows() << "\n";
-
- if (print_key_value_metadata && file_metadata->key_value_metadata()) {
- auto key_value_metadata = file_metadata->key_value_metadata();
- int64_t size_of_key_value_metadata = key_value_metadata->size();
- stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
- for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
- stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": "
- << key_value_metadata->value(i) << "\n";
- }
- }
-
- stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
- stream << "Number of Real Columns: "
- << file_metadata->schema()->group_node()->field_count() << "\n";
-
- if (selected_columns.size() == 0) {
- for (int i = 0; i < file_metadata->num_columns(); i++) {
- selected_columns.push_back(i);
- }
- } else {
- for (auto i : selected_columns) {
- if (i < 0 || i >= file_metadata->num_columns()) {
- throw ParquetException("Selected column is out of range");
- }
- }
- }
-
- stream << "Number of Columns: " << file_metadata->num_columns() << "\n";
- stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
- for (auto i : selected_columns) {
- const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
- stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
- << TypeToString(descr->physical_type());
- const auto& logical_type = descr->logical_type();
- if (!logical_type->is_none()) {
- stream << " / " << logical_type->ToString();
- }
- if (descr->converted_type() != ConvertedType::NONE) {
- stream << " / " << ConvertedTypeToString(descr->converted_type());
- if (descr->converted_type() == ConvertedType::DECIMAL) {
- stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
- }
- }
- stream << ")" << std::endl;
- }
-
- for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
- stream << "--- Row Group: " << r << " ---\n";
-
- auto group_reader = fileReader->RowGroup(r);
- std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
-
- stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n";
- stream << "--- Total Compressed Bytes: " << group_metadata->total_compressed_size()
- << " ---\n";
- stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";
-
- // Print column metadata
- for (auto i : selected_columns) {
- auto column_chunk = group_metadata->ColumnChunk(i);
- std::shared_ptr<Statistics> stats = column_chunk->statistics();
-
- const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
- stream << "Column " << i << std::endl << " Values: " << column_chunk->num_values();
- if (column_chunk->is_stats_set()) {
- std::string min = stats->EncodeMin(), max = stats->EncodeMax();
- stream << ", Null Values: " << stats->null_count()
- << ", Distinct Values: " << stats->distinct_count() << std::endl
- << " Max: " << FormatStatValue(descr->physical_type(), max)
- << ", Min: " << FormatStatValue(descr->physical_type(), min);
- } else {
- stream << " Statistics Not Set";
- }
- stream << std::endl
- << " Compression: "
- << ::arrow::internal::AsciiToUpper(
- Codec::GetCodecAsString(column_chunk->compression()))
- << ", Encodings:";
- for (auto encoding : column_chunk->encodings()) {
- stream << " " << EncodingToString(encoding);
- }
- stream << std::endl
- << " Uncompressed Size: " << column_chunk->total_uncompressed_size()
- << ", Compressed Size: " << column_chunk->total_compressed_size()
- << std::endl;
- }
-
- if (!print_values) {
- continue;
- }
- stream << "--- Values ---\n";
-
- static constexpr int bufsize = COL_WIDTH + 1;
- char buffer[bufsize];
-
- // Create readers for selected columns and print contents
- std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr);
- int j = 0;
- for (auto i : selected_columns) {
- std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
- // This is OK in this method as long as the RowGroupReader does not get
- // deleted
- auto& scanner = scanners[j++] = Scanner::Make(col_reader);
-
- if (format_dump) {
- stream << "Column " << i << std::endl;
- while (scanner->HasNext()) {
- scanner->PrintNext(stream, 0, true);
- stream << "\n";
- }
- continue;
- }
-
- snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
- file_metadata->schema()->Column(i)->name().c_str());
- stream << buffer << '|';
- }
- if (format_dump) {
- continue;
- }
- stream << "\n";
-
- bool hasRow;
- do {
- hasRow = false;
- for (auto scanner : scanners) {
- if (scanner->HasNext()) {
- hasRow = true;
- scanner->PrintNext(stream, COL_WIDTH);
- stream << '|';
- }
- }
- stream << "\n";
- } while (hasRow);
- }
-}
-
-void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
- const char* filename) {
- const FileMetaData* file_metadata = fileReader->metadata().get();
- stream << "{\n";
- stream << " \"FileName\": \"" << filename << "\",\n";
- stream << " \"Version\": \"" << ParquetVersionToString(file_metadata->version())
- << "\",\n";
- stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
- stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
- stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
- stream << " \"NumberOfRealColumns\": \""
- << file_metadata->schema()->group_node()->field_count() << "\",\n";
- stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";
-
- if (selected_columns.size() == 0) {
- for (int i = 0; i < file_metadata->num_columns(); i++) {
- selected_columns.push_back(i);
- }
- } else {
- for (auto i : selected_columns) {
- if (i < 0 || i >= file_metadata->num_columns()) {
- throw ParquetException("Selected column is out of range");
- }
- }
- }
-
- stream << " \"Columns\": [\n";
- int c = 0;
- for (auto i : selected_columns) {
- const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
- stream << " { \"Id\": \"" << i << "\","
- << " \"Name\": \"" << descr->path()->ToDotString() << "\","
- << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
- << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type())
- << "\","
- << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }";
- c++;
- if (c != static_cast<int>(selected_columns.size())) {
- stream << ",\n";
- }
- }
-
- stream << "\n ],\n \"RowGroups\": [\n";
- for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
- stream << " {\n \"Id\": \"" << r << "\", ";
-
- auto group_reader = fileReader->RowGroup(r);
- std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
-
- stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
- stream << " \"TotalCompressedBytes\": \"" << group_metadata->total_compressed_size()
- << "\", ";
- stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";
-
- // Print column metadata
- stream << " \"ColumnChunks\": [\n";
- int c1 = 0;
- for (auto i : selected_columns) {
- auto column_chunk = group_metadata->ColumnChunk(i);
- std::shared_ptr<Statistics> stats = column_chunk->statistics();
-
- const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
- stream << " {\"Id\": \"" << i << "\", \"Values\": \""
- << column_chunk->num_values() << "\", "
- << "\"StatsSet\": ";
- if (column_chunk->is_stats_set()) {
- stream << "\"True\", \"Stats\": {";
- std::string min = stats->EncodeMin(), max = stats->EncodeMax();
- stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
- << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
- << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
- << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
- << "\" },";
- } else {
- stream << "\"False\",";
- }
- stream << "\n \"Compression\": \""
- << ::arrow::internal::AsciiToUpper(
- Codec::GetCodecAsString(column_chunk->compression()))
- << "\", \"Encodings\": \"";
- for (auto encoding : column_chunk->encodings()) {
- stream << EncodingToString(encoding) << " ";
- }
- stream << "\", "
- << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
- << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
-
- // end of a ColumnChunk
- stream << "\" }";
- c1++;
- if (c1 != static_cast<int>(selected_columns.size())) {
- stream << ",\n";
- }
- }
-
- stream << "\n ]\n }";
- if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) {
- stream << ",\n";
- }
- }
- stream << "\n ]\n}\n";
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/printer.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <vector>
+
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/string.h"
+
+#include "parquet/column_scanner.h"
+#include "parquet/exception.h"
+#include "parquet/file_reader.h"
+#include "parquet/metadata.h"
+#include "parquet/schema.h"
+#include "parquet/statistics.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+class ColumnReader;
+
+// ----------------------------------------------------------------------
+// ParquetFilePrinter::DebugPrint
+
+// The fixed column width below is just an example value
+#define COL_WIDTH 30
+
+void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
+ bool print_values, bool format_dump,
+ bool print_key_value_metadata, const char* filename) {
+ const FileMetaData* file_metadata = fileReader->metadata().get();
+
+ stream << "File Name: " << filename << "\n";
+ stream << "Version: " << ParquetVersionToString(file_metadata->version()) << "\n";
+ stream << "Created By: " << file_metadata->created_by() << "\n";
+ stream << "Total rows: " << file_metadata->num_rows() << "\n";
+
+ if (print_key_value_metadata && file_metadata->key_value_metadata()) {
+ auto key_value_metadata = file_metadata->key_value_metadata();
+ int64_t size_of_key_value_metadata = key_value_metadata->size();
+ stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n";
+ for (int64_t i = 0; i < size_of_key_value_metadata; i++) {
+ stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": "
+ << key_value_metadata->value(i) << "\n";
+ }
+ }
+
+ stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n";
+ stream << "Number of Real Columns: "
+ << file_metadata->schema()->group_node()->field_count() << "\n";
+
+ if (selected_columns.size() == 0) {
+ for (int i = 0; i < file_metadata->num_columns(); i++) {
+ selected_columns.push_back(i);
+ }
+ } else {
+ for (auto i : selected_columns) {
+ if (i < 0 || i >= file_metadata->num_columns()) {
+ throw ParquetException("Selected column is out of range");
+ }
+ }
+ }
+
+ stream << "Number of Columns: " << file_metadata->num_columns() << "\n";
+ stream << "Number of Selected Columns: " << selected_columns.size() << "\n";
+ for (auto i : selected_columns) {
+ const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+ stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
+ << TypeToString(descr->physical_type());
+ const auto& logical_type = descr->logical_type();
+ if (!logical_type->is_none()) {
+ stream << " / " << logical_type->ToString();
+ }
+ if (descr->converted_type() != ConvertedType::NONE) {
+ stream << " / " << ConvertedTypeToString(descr->converted_type());
+ if (descr->converted_type() == ConvertedType::DECIMAL) {
+ stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
+ }
+ }
+ stream << ")" << std::endl;
+ }
+
+ for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
+ stream << "--- Row Group: " << r << " ---\n";
+
+ auto group_reader = fileReader->RowGroup(r);
+ std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
+
+ stream << "--- Total Bytes: " << group_metadata->total_byte_size() << " ---\n";
+ stream << "--- Total Compressed Bytes: " << group_metadata->total_compressed_size()
+ << " ---\n";
+ stream << "--- Rows: " << group_metadata->num_rows() << " ---\n";
+
+ // Print column metadata
+ for (auto i : selected_columns) {
+ auto column_chunk = group_metadata->ColumnChunk(i);
+ std::shared_ptr<Statistics> stats = column_chunk->statistics();
+
+ const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+ stream << "Column " << i << std::endl << " Values: " << column_chunk->num_values();
+ if (column_chunk->is_stats_set()) {
+ std::string min = stats->EncodeMin(), max = stats->EncodeMax();
+ stream << ", Null Values: " << stats->null_count()
+ << ", Distinct Values: " << stats->distinct_count() << std::endl
+ << " Max: " << FormatStatValue(descr->physical_type(), max)
+ << ", Min: " << FormatStatValue(descr->physical_type(), min);
+ } else {
+ stream << " Statistics Not Set";
+ }
+ stream << std::endl
+ << " Compression: "
+ << ::arrow::internal::AsciiToUpper(
+ Codec::GetCodecAsString(column_chunk->compression()))
+ << ", Encodings:";
+ for (auto encoding : column_chunk->encodings()) {
+ stream << " " << EncodingToString(encoding);
+ }
+ stream << std::endl
+ << " Uncompressed Size: " << column_chunk->total_uncompressed_size()
+ << ", Compressed Size: " << column_chunk->total_compressed_size()
+ << std::endl;
+ }
+
+ if (!print_values) {
+ continue;
+ }
+ stream << "--- Values ---\n";
+
+ static constexpr int bufsize = COL_WIDTH + 1;
+ char buffer[bufsize];
+
+ // Create readers for selected columns and print contents
+ std::vector<std::shared_ptr<Scanner>> scanners(selected_columns.size(), nullptr);
+ int j = 0;
+ for (auto i : selected_columns) {
+ std::shared_ptr<ColumnReader> col_reader = group_reader->Column(i);
+ // This is OK in this method as long as the RowGroupReader does not get
+ // deleted
+ auto& scanner = scanners[j++] = Scanner::Make(col_reader);
+
+ if (format_dump) {
+ stream << "Column " << i << std::endl;
+ while (scanner->HasNext()) {
+ scanner->PrintNext(stream, 0, true);
+ stream << "\n";
+ }
+ continue;
+ }
+
+ snprintf(buffer, bufsize, "%-*s", COL_WIDTH,
+ file_metadata->schema()->Column(i)->name().c_str());
+ stream << buffer << '|';
+ }
+ if (format_dump) {
+ continue;
+ }
+ stream << "\n";
+
+ bool hasRow;
+ do {
+ hasRow = false;
+ for (auto scanner : scanners) {
+ if (scanner->HasNext()) {
+ hasRow = true;
+ scanner->PrintNext(stream, COL_WIDTH);
+ stream << '|';
+ }
+ }
+ stream << "\n";
+ } while (hasRow);
+ }
+}
+
+void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
+ const char* filename) {
+ const FileMetaData* file_metadata = fileReader->metadata().get();
+ stream << "{\n";
+ stream << " \"FileName\": \"" << filename << "\",\n";
+ stream << " \"Version\": \"" << ParquetVersionToString(file_metadata->version())
+ << "\",\n";
+ stream << " \"CreatedBy\": \"" << file_metadata->created_by() << "\",\n";
+ stream << " \"TotalRows\": \"" << file_metadata->num_rows() << "\",\n";
+ stream << " \"NumberOfRowGroups\": \"" << file_metadata->num_row_groups() << "\",\n";
+ stream << " \"NumberOfRealColumns\": \""
+ << file_metadata->schema()->group_node()->field_count() << "\",\n";
+ stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";
+
+ if (selected_columns.size() == 0) {
+ for (int i = 0; i < file_metadata->num_columns(); i++) {
+ selected_columns.push_back(i);
+ }
+ } else {
+ for (auto i : selected_columns) {
+ if (i < 0 || i >= file_metadata->num_columns()) {
+ throw ParquetException("Selected column is out of range");
+ }
+ }
+ }
+
+ stream << " \"Columns\": [\n";
+ int c = 0;
+ for (auto i : selected_columns) {
+ const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+ stream << " { \"Id\": \"" << i << "\","
+ << " \"Name\": \"" << descr->path()->ToDotString() << "\","
+ << " \"PhysicalType\": \"" << TypeToString(descr->physical_type()) << "\","
+ << " \"ConvertedType\": \"" << ConvertedTypeToString(descr->converted_type())
+ << "\","
+ << " \"LogicalType\": " << (descr->logical_type())->ToJSON() << " }";
+ c++;
+ if (c != static_cast<int>(selected_columns.size())) {
+ stream << ",\n";
+ }
+ }
+
+ stream << "\n ],\n \"RowGroups\": [\n";
+ for (int r = 0; r < file_metadata->num_row_groups(); ++r) {
+ stream << " {\n \"Id\": \"" << r << "\", ";
+
+ auto group_reader = fileReader->RowGroup(r);
+ std::unique_ptr<RowGroupMetaData> group_metadata = file_metadata->RowGroup(r);
+
+ stream << " \"TotalBytes\": \"" << group_metadata->total_byte_size() << "\", ";
+ stream << " \"TotalCompressedBytes\": \"" << group_metadata->total_compressed_size()
+ << "\", ";
+ stream << " \"Rows\": \"" << group_metadata->num_rows() << "\",\n";
+
+ // Print column metadata
+ stream << " \"ColumnChunks\": [\n";
+ int c1 = 0;
+ for (auto i : selected_columns) {
+ auto column_chunk = group_metadata->ColumnChunk(i);
+ std::shared_ptr<Statistics> stats = column_chunk->statistics();
+
+ const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
+ stream << " {\"Id\": \"" << i << "\", \"Values\": \""
+ << column_chunk->num_values() << "\", "
+ << "\"StatsSet\": ";
+ if (column_chunk->is_stats_set()) {
+ stream << "\"True\", \"Stats\": {";
+ std::string min = stats->EncodeMin(), max = stats->EncodeMax();
+ stream << "\"NumNulls\": \"" << stats->null_count() << "\", "
+ << "\"DistinctValues\": \"" << stats->distinct_count() << "\", "
+ << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max) << "\", "
+ << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min)
+ << "\" },";
+ } else {
+ stream << "\"False\",";
+ }
+ stream << "\n \"Compression\": \""
+ << ::arrow::internal::AsciiToUpper(
+ Codec::GetCodecAsString(column_chunk->compression()))
+ << "\", \"Encodings\": \"";
+ for (auto encoding : column_chunk->encodings()) {
+ stream << EncodingToString(encoding) << " ";
+ }
+ stream << "\", "
+ << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
+ << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
+
+ // end of a ColumnChunk
+ stream << "\" }";
+ c1++;
+ if (c1 != static_cast<int>(selected_columns.size())) {
+ stream << ",\n";
+ }
+ }
+
+ stream << "\n ]\n }";
+ if ((r + 1) != static_cast<int>(file_metadata->num_row_groups())) {
+ stream << ",\n";
+ }
+ }
+ stream << "\n ]\n}\n";
+}
+
+} // namespace parquet
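
A sketch of driving JSONPrint, assuming ParquetFileReader::OpenFile from parquet/file_reader.h; passing an empty column list selects every column, per the fallback loop above:

#include <list>
#include <memory>
#include <sstream>
#include <string>

#include "parquet/file_reader.h"
#include "parquet/printer.h"

std::string DumpJson(const std::string& path) {
  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::OpenFile(path);
  parquet::ParquetFilePrinter printer(reader.get());
  std::ostringstream out;
  printer.JSONPrint(out, std::list<int>{}, path.c_str());
  return out.str();
}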
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/printer.h b/contrib/libs/apache/arrow/cpp/src/parquet/printer.h
index b29b1bd6d7a..6bdf5b456fa 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/printer.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/printer.h
@@ -1,46 +1,46 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <iosfwd>
-#include <list>
-
-#include "parquet/platform.h"
-
-namespace parquet {
-
-class ParquetFileReader;
-
-class PARQUET_EXPORT ParquetFilePrinter {
- private:
- ParquetFileReader* fileReader;
-
- public:
- explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
- ~ParquetFilePrinter() {}
-
- void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
- bool print_values = false, bool format_dump = false,
- bool print_key_value_metadata = false,
- const char* filename = "No Name");
-
- void JSONPrint(std::ostream& stream, std::list<int> selected_columns,
- const char* filename = "No Name");
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <iosfwd>
+#include <list>
+
+#include "parquet/platform.h"
+
+namespace parquet {
+
+class ParquetFileReader;
+
+class PARQUET_EXPORT ParquetFilePrinter {
+ private:
+ ParquetFileReader* fileReader;
+
+ public:
+ explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
+ ~ParquetFilePrinter() {}
+
+ void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
+ bool print_values = false, bool format_dump = false,
+ bool print_key_value_metadata = false,
+ const char* filename = "No Name");
+
+ void JSONPrint(std::ostream& stream, std::list<int> selected_columns,
+ const char* filename = "No Name");
+};
+
+} // namespace parquet
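
And the debug-printing side of the same class, again assuming ParquetFileReader::OpenFile; the defaults print metadata only, so print_values is set explicitly here:

#include <iostream>
#include <list>

#include "parquet/file_reader.h"
#include "parquet/printer.h"

void DumpFirstTwoColumns(const char* path) {
  auto reader = parquet::ParquetFileReader::OpenFile(path);
  parquet::ParquetFilePrinter printer(reader.get());
  // Metadata, key-value metadata, and values for columns 0 and 1 only.
  printer.DebugPrint(std::cout, std::list<int>{0, 1},
                     /*print_values=*/true, /*format_dump=*/false,
                     /*print_key_value_metadata=*/true, path);
}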
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/properties.cc b/contrib/libs/apache/arrow/cpp/src/parquet/properties.cc
index 1a28fb81e40..93638dbe28a 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/properties.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/properties.cc
@@ -1,64 +1,64 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <sstream>
-#include <utility>
-
-#include "parquet/properties.h"
-
-#include "arrow/io/buffered.h"
-#include "arrow/io/memory.h"
-#include "arrow/util/logging.h"
-
-namespace parquet {
-
-std::shared_ptr<ArrowInputStream> ReaderProperties::GetStream(
- std::shared_ptr<ArrowInputFile> source, int64_t start, int64_t num_bytes) {
- if (buffered_stream_enabled_) {
- // ARROW-6180 / PARQUET-1636: Create an isolated reader that references a
- // segment of the source
- std::shared_ptr<::arrow::io::InputStream> safe_stream =
- ::arrow::io::RandomAccessFile::GetStream(source, start, num_bytes);
- PARQUET_ASSIGN_OR_THROW(
- auto stream, ::arrow::io::BufferedInputStream::Create(buffer_size_, pool_,
- safe_stream, num_bytes));
- return std::move(stream);
- } else {
- PARQUET_ASSIGN_OR_THROW(auto data, source->ReadAt(start, num_bytes));
-
- if (data->size() != num_bytes) {
- std::stringstream ss;
- ss << "Tried reading " << num_bytes << " bytes starting at position " << start
- << " from file but only got " << data->size();
- throw ParquetException(ss.str());
- }
- return std::make_shared<::arrow::io::BufferReader>(data);
- }
-}
-
-ArrowReaderProperties default_arrow_reader_properties() {
- static ArrowReaderProperties default_reader_props;
- return default_reader_props;
-}
-
-std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties() {
- static std::shared_ptr<ArrowWriterProperties> default_writer_properties =
- ArrowWriterProperties::Builder().build();
- return default_writer_properties;
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <sstream>
+#include <utility>
+
+#include "parquet/properties.h"
+
+#include "arrow/io/buffered.h"
+#include "arrow/io/memory.h"
+#include "arrow/util/logging.h"
+
+namespace parquet {
+
+std::shared_ptr<ArrowInputStream> ReaderProperties::GetStream(
+ std::shared_ptr<ArrowInputFile> source, int64_t start, int64_t num_bytes) {
+ if (buffered_stream_enabled_) {
+ // ARROW-6180 / PARQUET-1636: Create an isolated reader that references a
+ // segment of the source
+ std::shared_ptr<::arrow::io::InputStream> safe_stream =
+ ::arrow::io::RandomAccessFile::GetStream(source, start, num_bytes);
+ PARQUET_ASSIGN_OR_THROW(
+ auto stream, ::arrow::io::BufferedInputStream::Create(buffer_size_, pool_,
+ safe_stream, num_bytes));
+ return std::move(stream);
+ } else {
+ PARQUET_ASSIGN_OR_THROW(auto data, source->ReadAt(start, num_bytes));
+
+ if (data->size() != num_bytes) {
+ std::stringstream ss;
+ ss << "Tried reading " << num_bytes << " bytes starting at position " << start
+ << " from file but only got " << data->size();
+ throw ParquetException(ss.str());
+ }
+ return std::make_shared<::arrow::io::BufferReader>(data);
+ }
+}
+
+ArrowReaderProperties default_arrow_reader_properties() {
+ static ArrowReaderProperties default_reader_props;
+ return default_reader_props;
+}
+
+std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties() {
+ static std::shared_ptr<ArrowWriterProperties> default_writer_properties =
+ ArrowWriterProperties::Builder().build();
+ return default_writer_properties;
+}
+
+} // namespace parquet
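
A sketch of the buffered-read path in GetStream above, using an in-memory BufferReader as the ArrowInputFile; the sizes are arbitrary:

#include <memory>

#include "arrow/io/memory.h"
#include "parquet/properties.h"

std::shared_ptr<parquet::ArrowInputStream> ReadSlice(
    std::shared_ptr<arrow::Buffer> file_contents) {
  parquet::ReaderProperties props;   // buffered reads are off by default
  props.enable_buffered_stream();    // cap per-read memory at buffer_size()
  props.set_buffer_size(1 << 16);
  auto source = std::make_shared<arrow::io::BufferReader>(file_contents);
  // Returns a BufferedInputStream over bytes [128, 128 + 4096) of the source.
  return props.GetStream(source, /*start=*/128, /*num_bytes=*/4096);
}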
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/properties.h b/contrib/libs/apache/arrow/cpp/src/parquet/properties.h
index bc86f98ef7f..d217b8efa52 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/properties.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/properties.h
@@ -1,813 +1,813 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-
-#include "arrow/io/caching.h"
-#include "arrow/type.h"
-#include "arrow/util/compression.h"
-#include "parquet/encryption/encryption.h"
-#include "parquet/exception.h"
-#include "parquet/parquet_version.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-#include "parquet/type_fwd.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-/// Determines use of Parquet Format version >= 2.0.0 logical types. For
-/// example, when writing from Arrow data structures, PARQUET_2_0 will enable
-/// use of INT_* and UINT_* converted types as well as nanosecond timestamps
-/// stored physically as INT64. Since some Parquet implementations do not
-/// support the logical types added in the 2.0.0 format version, if you want to
-/// maximize compatibility of your files you may want to use PARQUET_1_0.
-///
-/// Note that the 2.x format version series also introduced new serialized
-/// data page metadata and an on-disk data page layout. To enable this, use
-/// ParquetDataPageVersion.
-struct ParquetVersion;
-
-/// Controls serialization format of data pages. parquet-format v2.0.0
-/// introduced a new data page metadata type DataPageV2 and serialized page
-/// structure (for example, encoded levels are no longer compressed). Prior to
-/// the completion of PARQUET-457 in 2020, this library did not implement
-/// DataPageV2 correctly, so if you use the V2 data page format, you may have
-/// forward compatibility issues (older versions of the library will be unable
-/// to read the files). Note that some Parquet implementations do not implement
-/// DataPageV2 at all.
-enum class ParquetDataPageVersion { V1, V2 };
-
-/// Align the default buffer size to a small multiple of a page size.
-constexpr int64_t kDefaultBufferSize = 4096 * 4;
-
-class PARQUET_EXPORT ReaderProperties {
- public:
- explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
- : pool_(pool) {}
-
- MemoryPool* memory_pool() const { return pool_; }
-
- std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source,
- int64_t start, int64_t num_bytes);
-
- /// Buffered stream reading allows the user to control the memory usage of
- /// parquet readers. This ensures that all `RandomAccessFile::ReadAt` calls
- /// are wrapped in a buffered reader that uses a fixed-size buffer (of size
- /// `buffer_size()`) instead of reading the full extent of the ReadAt call.
- ///
- /// The primary reason for this control knob is resource control, not
- /// performance.
- bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }
- void enable_buffered_stream() { buffered_stream_enabled_ = true; }
- void disable_buffered_stream() { buffered_stream_enabled_ = false; }
-
- int64_t buffer_size() const { return buffer_size_; }
- void set_buffer_size(int64_t size) { buffer_size_ = size; }
-
- void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) {
- file_decryption_properties_ = std::move(decryption);
- }
-
- const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties() const {
- return file_decryption_properties_;
- }
-
- private:
- MemoryPool* pool_;
- int64_t buffer_size_ = kDefaultBufferSize;
- bool buffered_stream_enabled_ = false;
- std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
-};
-
-ReaderProperties PARQUET_EXPORT default_reader_properties();
-
-static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
-static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
-static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
-static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
-static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024;
-static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
-static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
-static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
-static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
-static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
-
-class PARQUET_EXPORT ColumnProperties {
- public:
- ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
- Compression::type codec = DEFAULT_COMPRESSION_TYPE,
- bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
- bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
- size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE)
- : encoding_(encoding),
- codec_(codec),
- dictionary_enabled_(dictionary_enabled),
- statistics_enabled_(statistics_enabled),
- max_stats_size_(max_stats_size),
- compression_level_(Codec::UseDefaultCompressionLevel()) {}
-
- void set_encoding(Encoding::type encoding) { encoding_ = encoding; }
-
- void set_compression(Compression::type codec) { codec_ = codec; }
-
- void set_dictionary_enabled(bool dictionary_enabled) {
- dictionary_enabled_ = dictionary_enabled;
- }
-
- void set_statistics_enabled(bool statistics_enabled) {
- statistics_enabled_ = statistics_enabled;
- }
-
- void set_max_statistics_size(size_t max_stats_size) {
- max_stats_size_ = max_stats_size;
- }
-
- void set_compression_level(int compression_level) {
- compression_level_ = compression_level;
- }
-
- Encoding::type encoding() const { return encoding_; }
-
- Compression::type compression() const { return codec_; }
-
- bool dictionary_enabled() const { return dictionary_enabled_; }
-
- bool statistics_enabled() const { return statistics_enabled_; }
-
- size_t max_statistics_size() const { return max_stats_size_; }
-
- int compression_level() const { return compression_level_; }
-
- private:
- Encoding::type encoding_;
- Compression::type codec_;
- bool dictionary_enabled_;
- bool statistics_enabled_;
- size_t max_stats_size_;
- int compression_level_;
-};
-
-class PARQUET_EXPORT WriterProperties {
- public:
- class Builder {
- public:
- Builder()
- : pool_(::arrow::default_memory_pool()),
- dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
- write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
- max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
- pagesize_(kDefaultDataPageSize),
- version_(ParquetVersion::PARQUET_1_0),
- data_page_version_(ParquetDataPageVersion::V1),
- created_by_(DEFAULT_CREATED_BY) {}
- virtual ~Builder() {}
-
- Builder* memory_pool(MemoryPool* pool) {
- pool_ = pool;
- return this;
- }
-
- Builder* enable_dictionary() {
- default_column_properties_.set_dictionary_enabled(true);
- return this;
- }
-
- Builder* disable_dictionary() {
- default_column_properties_.set_dictionary_enabled(false);
- return this;
- }
-
- Builder* enable_dictionary(const std::string& path) {
- dictionary_enabled_[path] = true;
- return this;
- }
-
- Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
- return this->enable_dictionary(path->ToDotString());
- }
-
- Builder* disable_dictionary(const std::string& path) {
- dictionary_enabled_[path] = false;
- return this;
- }
-
- Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
- return this->disable_dictionary(path->ToDotString());
- }
-
- Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) {
- dictionary_pagesize_limit_ = dictionary_psize_limit;
- return this;
- }
-
- Builder* write_batch_size(int64_t write_batch_size) {
- write_batch_size_ = write_batch_size;
- return this;
- }
-
- Builder* max_row_group_length(int64_t max_row_group_length) {
- max_row_group_length_ = max_row_group_length;
- return this;
- }
-
- Builder* data_pagesize(int64_t pg_size) {
- pagesize_ = pg_size;
- return this;
- }
-
- Builder* data_page_version(ParquetDataPageVersion data_page_version) {
- data_page_version_ = data_page_version;
- return this;
- }
-
- Builder* version(ParquetVersion::type version) {
- version_ = version;
- return this;
- }
-
- Builder* created_by(const std::string& created_by) {
- created_by_ = created_by;
- return this;
- }
-
- /**
- * Define the encoding that is used when we don't use dictionary encoding.
- *
- * This applies either if dictionary encoding is disabled or if we fall back
- * because the dictionary grew too large.
- */
- Builder* encoding(Encoding::type encoding_type) {
- if (encoding_type == Encoding::PLAIN_DICTIONARY ||
- encoding_type == Encoding::RLE_DICTIONARY) {
- throw ParquetException("Can't use dictionary encoding as fallback encoding");
- }
-
- default_column_properties_.set_encoding(encoding_type);
- return this;
- }
-
- /**
- * Define the encoding that is used when we don't use dictionary encoding.
- *
- * This applies either if dictionary encoding is disabled or if we fall back
- * because the dictionary grew too large.
- */
- Builder* encoding(const std::string& path, Encoding::type encoding_type) {
- if (encoding_type == Encoding::PLAIN_DICTIONARY ||
- encoding_type == Encoding::RLE_DICTIONARY) {
- throw ParquetException("Can't use dictionary encoding as fallback encoding");
- }
-
- encodings_[path] = encoding_type;
- return this;
- }
-
- /**
- * Define the encoding that is used when we don't use dictionary encoding.
- *
- * This applies either if dictionary encoding is disabled or if we fall back
- * because the dictionary grew too large.
- */
- Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path,
- Encoding::type encoding_type) {
- return this->encoding(path->ToDotString(), encoding_type);
- }
-
- Builder* compression(Compression::type codec) {
- default_column_properties_.set_compression(codec);
- return this;
- }
-
- Builder* max_statistics_size(size_t max_stats_sz) {
- default_column_properties_.set_max_statistics_size(max_stats_sz);
- return this;
- }
-
- Builder* compression(const std::string& path, Compression::type codec) {
- codecs_[path] = codec;
- return this;
- }
-
- Builder* compression(const std::shared_ptr<schema::ColumnPath>& path,
- Compression::type codec) {
- return this->compression(path->ToDotString(), codec);
- }
-
- /// \brief Specify the default compression level for the compressor in
- /// every column. If a column does not have an explicitly specified
- /// compression level, this default is used.
- ///
- /// The provided compression level is compressor specific. Users have to
- /// familiarize themselves with the available levels for the selected
- /// compressor. If the compressor does not allow selecting different
- /// compression levels, calling this function has no effect.
- /// Parquet and Arrow do not validate the passed compression level. If no
- /// level is selected by the user, or if the special
- /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
- /// compression level.
- Builder* compression_level(int compression_level) {
- default_column_properties_.set_compression_level(compression_level);
- return this;
- }
-
- /// \brief Specify a compression level for the compressor for the column
- /// described by path.
- ///
- /// The provided compression level is compressor specific. Users have to
- /// familiarize themselves with the available levels for the selected
- /// compressor. If the compressor does not allow selecting different
- /// compression levels, calling this function has no effect.
- /// Parquet and Arrow do not validate the passed compression level. If no
- /// level is selected by the user, or if the special
- /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
- /// compression level.
- Builder* compression_level(const std::string& path, int compression_level) {
- codecs_compression_level_[path] = compression_level;
- return this;
- }
-
- /// \brief Specify a compression level for the compressor for the column
- /// described by path.
- ///
- /// The provided compression level is compressor specific. Users have to
- /// familiarize themselves with the available levels for the selected
- /// compressor. If the compressor does not allow selecting different
- /// compression levels, calling this function has no effect.
- /// Parquet and Arrow do not validate the passed compression level. If no
- /// level is selected by the user, or if the special
- /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
- /// compression level.
- Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path,
- int compression_level) {
- return this->compression_level(path->ToDotString(), compression_level);
- }
-
- Builder* encryption(
- std::shared_ptr<FileEncryptionProperties> file_encryption_properties) {
- file_encryption_properties_ = std::move(file_encryption_properties);
- return this;
- }
-
- Builder* enable_statistics() {
- default_column_properties_.set_statistics_enabled(true);
- return this;
- }
-
- Builder* disable_statistics() {
- default_column_properties_.set_statistics_enabled(false);
- return this;
- }
-
- Builder* enable_statistics(const std::string& path) {
- statistics_enabled_[path] = true;
- return this;
- }
-
- Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
- return this->enable_statistics(path->ToDotString());
- }
-
- Builder* disable_statistics(const std::string& path) {
- statistics_enabled_[path] = false;
- return this;
- }
-
- Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
- return this->disable_statistics(path->ToDotString());
- }
-
- std::shared_ptr<WriterProperties> build() {
- std::unordered_map<std::string, ColumnProperties> column_properties;
- auto get = [&](const std::string& key) -> ColumnProperties& {
- auto it = column_properties.find(key);
- if (it == column_properties.end())
- return column_properties[key] = default_column_properties_;
- else
- return it->second;
- };
-
- for (const auto& item : encodings_) get(item.first).set_encoding(item.second);
- for (const auto& item : codecs_) get(item.first).set_compression(item.second);
- for (const auto& item : codecs_compression_level_)
- get(item.first).set_compression_level(item.second);
- for (const auto& item : dictionary_enabled_)
- get(item.first).set_dictionary_enabled(item.second);
- for (const auto& item : statistics_enabled_)
- get(item.first).set_statistics_enabled(item.second);
-
- return std::shared_ptr<WriterProperties>(new WriterProperties(
- pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
- pagesize_, version_, created_by_, std::move(file_encryption_properties_),
- default_column_properties_, column_properties, data_page_version_));
- }
-
- private:
- MemoryPool* pool_;
- int64_t dictionary_pagesize_limit_;
- int64_t write_batch_size_;
- int64_t max_row_group_length_;
- int64_t pagesize_;
- ParquetVersion::type version_;
- ParquetDataPageVersion data_page_version_;
- std::string created_by_;
-
- std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
-
- // Settings used for each column unless overridden in any of the maps below
- ColumnProperties default_column_properties_;
- std::unordered_map<std::string, Encoding::type> encodings_;
- std::unordered_map<std::string, Compression::type> codecs_;
- std::unordered_map<std::string, int32_t> codecs_compression_level_;
- std::unordered_map<std::string, bool> dictionary_enabled_;
- std::unordered_map<std::string, bool> statistics_enabled_;
- };
-
- inline MemoryPool* memory_pool() const { return pool_; }
-
- inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; }
-
- inline int64_t write_batch_size() const { return write_batch_size_; }
-
- inline int64_t max_row_group_length() const { return max_row_group_length_; }
-
- inline int64_t data_pagesize() const { return pagesize_; }
-
- inline ParquetDataPageVersion data_page_version() const {
- return parquet_data_page_version_;
- }
-
- inline ParquetVersion::type version() const { return parquet_version_; }
-
- inline std::string created_by() const { return parquet_created_by_; }
-
- inline Encoding::type dictionary_index_encoding() const {
- if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
- return Encoding::PLAIN_DICTIONARY;
- } else {
- return Encoding::RLE_DICTIONARY;
- }
- }
-
- inline Encoding::type dictionary_page_encoding() const {
- if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
- return Encoding::PLAIN_DICTIONARY;
- } else {
- return Encoding::PLAIN;
- }
- }
-
- const ColumnProperties& column_properties(
- const std::shared_ptr<schema::ColumnPath>& path) const {
- auto it = column_properties_.find(path->ToDotString());
- if (it != column_properties_.end()) return it->second;
- return default_column_properties_;
- }
-
- Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
- return column_properties(path).encoding();
- }
-
- Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
- return column_properties(path).compression();
- }
-
- int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const {
- return column_properties(path).compression_level();
- }
-
- bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
- return column_properties(path).dictionary_enabled();
- }
-
- bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
- return column_properties(path).statistics_enabled();
- }
-
- size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const {
- return column_properties(path).max_statistics_size();
- }
-
- inline FileEncryptionProperties* file_encryption_properties() const {
- return file_encryption_properties_.get();
- }
-
- std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
- const std::string& path) const {
- if (file_encryption_properties_) {
- return file_encryption_properties_->column_encryption_properties(path);
- } else {
- return NULLPTR;
- }
- }
-
- private:
- explicit WriterProperties(
- MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size,
- int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version,
- const std::string& created_by,
- std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
- const ColumnProperties& default_column_properties,
- const std::unordered_map<std::string, ColumnProperties>& column_properties,
- ParquetDataPageVersion data_page_version)
- : pool_(pool),
- dictionary_pagesize_limit_(dictionary_pagesize_limit),
- write_batch_size_(write_batch_size),
- max_row_group_length_(max_row_group_length),
- pagesize_(pagesize),
- parquet_data_page_version_(data_page_version),
- parquet_version_(version),
- parquet_created_by_(created_by),
- file_encryption_properties_(file_encryption_properties),
- default_column_properties_(default_column_properties),
- column_properties_(column_properties) {}
-
- MemoryPool* pool_;
- int64_t dictionary_pagesize_limit_;
- int64_t write_batch_size_;
- int64_t max_row_group_length_;
- int64_t pagesize_;
- ParquetDataPageVersion parquet_data_page_version_;
- ParquetVersion::type parquet_version_;
- std::string parquet_created_by_;
-
- std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
-
- ColumnProperties default_column_properties_;
- std::unordered_map<std::string, ColumnProperties> column_properties_;
-};
-
-PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties();
-
-// ----------------------------------------------------------------------
-// Properties specific to Apache Arrow columnar read and write
-
-static constexpr bool kArrowDefaultUseThreads = false;
-
-// Default number of rows to read when using ::arrow::RecordBatchReader
-static constexpr int64_t kArrowDefaultBatchSize = 64 * 1024;
-
-/// EXPERIMENTAL: Properties for configuring FileReader behavior.
-class PARQUET_EXPORT ArrowReaderProperties {
- public:
- explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads)
- : use_threads_(use_threads),
- read_dict_indices_(),
- batch_size_(kArrowDefaultBatchSize),
- pre_buffer_(false),
- cache_options_(::arrow::io::CacheOptions::Defaults()),
- coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {}
-
- void set_use_threads(bool use_threads) { use_threads_ = use_threads; }
-
- bool use_threads() const { return use_threads_; }
-
- void set_read_dictionary(int column_index, bool read_dict) {
- if (read_dict) {
- read_dict_indices_.insert(column_index);
- } else {
- read_dict_indices_.erase(column_index);
- }
- }
- bool read_dictionary(int column_index) const {
- return read_dict_indices_.find(column_index) != read_dict_indices_.end();
- }
-
- void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; }
-
- int64_t batch_size() const { return batch_size_; }
-
- /// Enable read coalescing.
- ///
- /// When enabled, the Arrow reader will pre-buffer necessary regions
- /// of the file in-memory. This is intended to improve performance on
- /// high-latency filesystems (e.g. Amazon S3).
- void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; }
-
- bool pre_buffer() const { return pre_buffer_; }
-
- /// Set options for read coalescing. This can be used to tune the
- /// implementation for characteristics of different filesystems.
- void set_cache_options(::arrow::io::CacheOptions options) { cache_options_ = options; }
-
- const ::arrow::io::CacheOptions& cache_options() const { return cache_options_; }
-
- /// Set execution context for read coalescing.
- void set_io_context(const ::arrow::io::IOContext& ctx) { io_context_ = ctx; }
-
- const ::arrow::io::IOContext& io_context() const { return io_context_; }
-
- /// Set timestamp unit to use for deprecated INT96-encoded timestamps
- /// (default is NANO).
- void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) {
- coerce_int96_timestamp_unit_ = unit;
- }
-
- ::arrow::TimeUnit::type coerce_int96_timestamp_unit() const {
- return coerce_int96_timestamp_unit_;
- }
-
- private:
- bool use_threads_;
- std::unordered_set<int> read_dict_indices_;
- int64_t batch_size_;
- bool pre_buffer_;
- ::arrow::io::IOContext io_context_;
- ::arrow::io::CacheOptions cache_options_;
- ::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
-};
-
-/// EXPERIMENTAL: Constructs the default ArrowReaderProperties
-PARQUET_EXPORT
-ArrowReaderProperties default_arrow_reader_properties();
-
-class PARQUET_EXPORT ArrowWriterProperties {
- public:
- enum EngineVersion {
- V1, // Supports only nested lists.
- V2 // Full support for all nesting combinations
- };
- class Builder {
- public:
- Builder()
- : write_timestamps_as_int96_(false),
- coerce_timestamps_enabled_(false),
- coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
- truncated_timestamps_allowed_(false),
- store_schema_(false),
- // TODO: At some point we should flip this.
- compliant_nested_types_(false),
- engine_version_(V2) {}
- virtual ~Builder() = default;
-
- Builder* disable_deprecated_int96_timestamps() {
- write_timestamps_as_int96_ = false;
- return this;
- }
-
- Builder* enable_deprecated_int96_timestamps() {
- write_timestamps_as_int96_ = true;
- return this;
- }
-
- Builder* coerce_timestamps(::arrow::TimeUnit::type unit) {
- coerce_timestamps_enabled_ = true;
- coerce_timestamps_unit_ = unit;
- return this;
- }
-
- Builder* allow_truncated_timestamps() {
- truncated_timestamps_allowed_ = true;
- return this;
- }
-
- Builder* disallow_truncated_timestamps() {
- truncated_timestamps_allowed_ = false;
- return this;
- }
-
- /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file,
- /// to enable certain read options (like "read_dictionary") to be set
- /// automatically
- Builder* store_schema() {
- store_schema_ = true;
- return this;
- }
-
- Builder* enable_compliant_nested_types() {
- compliant_nested_types_ = true;
- return this;
- }
-
- Builder* disable_compliant_nested_types() {
- compliant_nested_types_ = false;
- return this;
- }
-
- Builder* set_engine_version(EngineVersion version) {
- engine_version_ = version;
- return this;
- }
-
- std::shared_ptr<ArrowWriterProperties> build() {
- return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
- write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
- truncated_timestamps_allowed_, store_schema_, compliant_nested_types_,
- engine_version_));
- }
-
- private:
- bool write_timestamps_as_int96_;
-
- bool coerce_timestamps_enabled_;
- ::arrow::TimeUnit::type coerce_timestamps_unit_;
- bool truncated_timestamps_allowed_;
-
- bool store_schema_;
- bool compliant_nested_types_;
- EngineVersion engine_version_;
- };
-
- bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }
-
- bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }
- ::arrow::TimeUnit::type coerce_timestamps_unit() const {
- return coerce_timestamps_unit_;
- }
-
- bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; }
-
- bool store_schema() const { return store_schema_; }
-
- /// \brief Enable nested type naming according to the parquet specification.
- ///
- /// Older versions of Arrow wrote out field names for nested lists based on
- /// the name of the field. According to the Parquet specification, they
- /// should always be "element".
- bool compliant_nested_types() const { return compliant_nested_types_; }
-
- /// \brief The underlying engine version to use when writing Arrow data.
- ///
- /// V2 is currently the latest; V1 is considered deprecated but left in
- /// place in case bugs are detected in V2.
- EngineVersion engine_version() const { return engine_version_; }
-
- private:
- explicit ArrowWriterProperties(bool write_nanos_as_int96,
- bool coerce_timestamps_enabled,
- ::arrow::TimeUnit::type coerce_timestamps_unit,
- bool truncated_timestamps_allowed, bool store_schema,
- bool compliant_nested_types,
- EngineVersion engine_version)
- : write_timestamps_as_int96_(write_nanos_as_int96),
- coerce_timestamps_enabled_(coerce_timestamps_enabled),
- coerce_timestamps_unit_(coerce_timestamps_unit),
- truncated_timestamps_allowed_(truncated_timestamps_allowed),
- store_schema_(store_schema),
- compliant_nested_types_(compliant_nested_types),
- engine_version_(engine_version) {}
-
- const bool write_timestamps_as_int96_;
- const bool coerce_timestamps_enabled_;
- const ::arrow::TimeUnit::type coerce_timestamps_unit_;
- const bool truncated_timestamps_allowed_;
- const bool store_schema_;
- const bool compliant_nested_types_;
- const EngineVersion engine_version_;
-};
-
-/// \brief State object used for writing Arrow data directly to a Parquet
-/// column chunk. This API is possibly not stable.
-struct ArrowWriteContext {
- ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties)
- : memory_pool(memory_pool),
- properties(properties),
- data_buffer(AllocateBuffer(memory_pool)),
- def_levels_buffer(AllocateBuffer(memory_pool)) {}
-
- template <typename T>
- ::arrow::Status GetScratchData(const int64_t num_values, T** out) {
- ARROW_RETURN_NOT_OK(this->data_buffer->Resize(num_values * sizeof(T), false));
- *out = reinterpret_cast<T*>(this->data_buffer->mutable_data());
- return ::arrow::Status::OK();
- }
-
- MemoryPool* memory_pool;
- const ArrowWriterProperties* properties;
-
- // Buffer used for storing the data of an array converted to the physical type
- // as expected by parquet-cpp.
- std::shared_ptr<ResizableBuffer> data_buffer;
-
- // We use shared ownership of this buffer.
- std::shared_ptr<ResizableBuffer> def_levels_buffer;
-};
-
-PARQUET_EXPORT
-std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+#include "arrow/io/caching.h"
+#include "arrow/type.h"
+#include "arrow/util/compression.h"
+#include "parquet/encryption/encryption.h"
+#include "parquet/exception.h"
+#include "parquet/parquet_version.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// Determines use of Parquet Format version >= 2.0.0 logical types. For
+/// example, when writing from Arrow data structures, PARQUET_2_0 will enable
+/// use of INT_* and UINT_* converted types as well as nanosecond timestamps
+/// stored physically as INT64. Since some Parquet implementations do not
+/// support the logical types added in the 2.0.0 format version, if you want to
+/// maximize compatibility of your files you may want to use PARQUET_1_0.
+///
+/// Note that the 2.x format version series also introduced new serialized
+/// data page metadata and an on-disk data page layout. To enable this, use
+/// ParquetDataPageVersion.
+struct ParquetVersion;
+
+/// Controls serialization format of data pages. parquet-format v2.0.0
+/// introduced a new data page metadata type DataPageV2 and serialized page
+/// structure (for example, encoded levels are no longer compressed). Prior to
+/// the completion of PARQUET-457 in 2020, this library did not implement
+/// DataPageV2 correctly, so if you use the V2 data page format, you may have
+/// forward compatibility issues (older versions of the library will be unable
+/// to read the files). Note that some Parquet implementations do not implement
+/// DataPageV2 at all.
+enum class ParquetDataPageVersion { V1, V2 };
+
+/// Align the default buffer size to a small multiple of a page size.
+constexpr int64_t kDefaultBufferSize = 4096 * 4;
+
+class PARQUET_EXPORT ReaderProperties {
+ public:
+ explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
+ : pool_(pool) {}
+
+ MemoryPool* memory_pool() const { return pool_; }
+
+ std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source,
+ int64_t start, int64_t num_bytes);
+
+ /// Buffered stream reading allows the user to control the memory usage of
+ /// parquet readers. This ensures that all `RandomAccessFile::ReadAt` calls
+ /// are wrapped in a buffered reader that uses a fixed-size buffer (of size
+ /// `buffer_size()`) instead of reading the full extent of the ReadAt call.
+ ///
+ /// The primary reason for this control knob is resource control, not
+ /// performance.
+ bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }
+ void enable_buffered_stream() { buffered_stream_enabled_ = true; }
+ void disable_buffered_stream() { buffered_stream_enabled_ = false; }
+
+ int64_t buffer_size() const { return buffer_size_; }
+ void set_buffer_size(int64_t size) { buffer_size_ = size; }
+
+ void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) {
+ file_decryption_properties_ = std::move(decryption);
+ }
+
+ const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties() const {
+ return file_decryption_properties_;
+ }
+
+ private:
+ MemoryPool* pool_;
+ int64_t buffer_size_ = kDefaultBufferSize;
+ bool buffered_stream_enabled_ = false;
+ std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
+};
+
+ReaderProperties PARQUET_EXPORT default_reader_properties();
+
+static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
+static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
+static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
+static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
+static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 64 * 1024 * 1024;
+static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
+static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
+static constexpr Encoding::type DEFAULT_ENCODING = Encoding::PLAIN;
+static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
+static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
+
+class PARQUET_EXPORT ColumnProperties {
+ public:
+ ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
+ Compression::type codec = DEFAULT_COMPRESSION_TYPE,
+ bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
+ bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
+ size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE)
+ : encoding_(encoding),
+ codec_(codec),
+ dictionary_enabled_(dictionary_enabled),
+ statistics_enabled_(statistics_enabled),
+ max_stats_size_(max_stats_size),
+ compression_level_(Codec::UseDefaultCompressionLevel()) {}
+
+ void set_encoding(Encoding::type encoding) { encoding_ = encoding; }
+
+ void set_compression(Compression::type codec) { codec_ = codec; }
+
+ void set_dictionary_enabled(bool dictionary_enabled) {
+ dictionary_enabled_ = dictionary_enabled;
+ }
+
+ void set_statistics_enabled(bool statistics_enabled) {
+ statistics_enabled_ = statistics_enabled;
+ }
+
+ void set_max_statistics_size(size_t max_stats_size) {
+ max_stats_size_ = max_stats_size;
+ }
+
+ void set_compression_level(int compression_level) {
+ compression_level_ = compression_level;
+ }
+
+ Encoding::type encoding() const { return encoding_; }
+
+ Compression::type compression() const { return codec_; }
+
+ bool dictionary_enabled() const { return dictionary_enabled_; }
+
+ bool statistics_enabled() const { return statistics_enabled_; }
+
+ size_t max_statistics_size() const { return max_stats_size_; }
+
+ int compression_level() const { return compression_level_; }
+
+ private:
+ Encoding::type encoding_;
+ Compression::type codec_;
+ bool dictionary_enabled_;
+ bool statistics_enabled_;
+ size_t max_stats_size_;
+ int compression_level_;
+};
+
+class PARQUET_EXPORT WriterProperties {
+ public:
+ class Builder {
+ public:
+ Builder()
+ : pool_(::arrow::default_memory_pool()),
+ dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
+ write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
+ max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
+ pagesize_(kDefaultDataPageSize),
+ version_(ParquetVersion::PARQUET_1_0),
+ data_page_version_(ParquetDataPageVersion::V1),
+ created_by_(DEFAULT_CREATED_BY) {}
+ virtual ~Builder() {}
+
+ Builder* memory_pool(MemoryPool* pool) {
+ pool_ = pool;
+ return this;
+ }
+
+ Builder* enable_dictionary() {
+ default_column_properties_.set_dictionary_enabled(true);
+ return this;
+ }
+
+ Builder* disable_dictionary() {
+ default_column_properties_.set_dictionary_enabled(false);
+ return this;
+ }
+
+ Builder* enable_dictionary(const std::string& path) {
+ dictionary_enabled_[path] = true;
+ return this;
+ }
+
+ Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->enable_dictionary(path->ToDotString());
+ }
+
+ Builder* disable_dictionary(const std::string& path) {
+ dictionary_enabled_[path] = false;
+ return this;
+ }
+
+ Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->disable_dictionary(path->ToDotString());
+ }
+
+ Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) {
+ dictionary_pagesize_limit_ = dictionary_psize_limit;
+ return this;
+ }
+
+ Builder* write_batch_size(int64_t write_batch_size) {
+ write_batch_size_ = write_batch_size;
+ return this;
+ }
+
+ Builder* max_row_group_length(int64_t max_row_group_length) {
+ max_row_group_length_ = max_row_group_length;
+ return this;
+ }
+
+ Builder* data_pagesize(int64_t pg_size) {
+ pagesize_ = pg_size;
+ return this;
+ }
+
+ Builder* data_page_version(ParquetDataPageVersion data_page_version) {
+ data_page_version_ = data_page_version;
+ return this;
+ }
+
+ Builder* version(ParquetVersion::type version) {
+ version_ = version;
+ return this;
+ }
+
+ Builder* created_by(const std::string& created_by) {
+ created_by_ = created_by;
+ return this;
+ }
+
+ /**
+ * Define the encoding that is used when we don't use dictionary encoding.
+ *
+ * This applies either if dictionary encoding is disabled or if we fall back
+ * because the dictionary grew too large.
+ */
+ Builder* encoding(Encoding::type encoding_type) {
+ if (encoding_type == Encoding::PLAIN_DICTIONARY ||
+ encoding_type == Encoding::RLE_DICTIONARY) {
+ throw ParquetException("Can't use dictionary encoding as fallback encoding");
+ }
+
+ default_column_properties_.set_encoding(encoding_type);
+ return this;
+ }
+
+ /**
+ * Define the encoding that is used when we don't use dictionary encoding.
+ *
+ * This applies either if dictionary encoding is disabled or if we fall back
+ * because the dictionary grew too large.
+ */
+ Builder* encoding(const std::string& path, Encoding::type encoding_type) {
+ if (encoding_type == Encoding::PLAIN_DICTIONARY ||
+ encoding_type == Encoding::RLE_DICTIONARY) {
+ throw ParquetException("Can't use dictionary encoding as fallback encoding");
+ }
+
+ encodings_[path] = encoding_type;
+ return this;
+ }
+
+ /**
+ * Define the encoding that is used when we don't use dictionary encoding.
+ *
+ * This applies either if dictionary encoding is disabled or if we fall back
+ * because the dictionary grew too large.
+ */
+ Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path,
+ Encoding::type encoding_type) {
+ return this->encoding(path->ToDotString(), encoding_type);
+ }
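+
+ // Example (illustrative sketch; "a.ints" is a hypothetical path to an
+ // integer column): pick a non-dictionary fallback encoding per column.
+ //
+ //   builder.encoding("a.ints", Encoding::DELTA_BINARY_PACKED);
+ //
+ // PLAIN_DICTIONARY and RLE_DICTIONARY are rejected here, as checked above.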
+
+ Builder* compression(Compression::type codec) {
+ default_column_properties_.set_compression(codec);
+ return this;
+ }
+
+ Builder* max_statistics_size(size_t max_stats_sz) {
+ default_column_properties_.set_max_statistics_size(max_stats_sz);
+ return this;
+ }
+
+ Builder* compression(const std::string& path, Compression::type codec) {
+ codecs_[path] = codec;
+ return this;
+ }
+
+ Builder* compression(const std::shared_ptr<schema::ColumnPath>& path,
+ Compression::type codec) {
+ return this->compression(path->ToDotString(), codec);
+ }
+
+ /// \brief Specify the default compression level for the compressor in
+ /// every column. If a column does not have an explicitly specified
+ /// compression level, this default is used.
+ ///
+ /// The provided compression level is compressor specific. Users have to
+ /// familiarize themselves with the available levels for the selected
+ /// compressor. If the compressor does not allow selecting different
+ /// compression levels, calling this function has no effect.
+ /// Parquet and Arrow do not validate the passed compression level. If no
+ /// level is selected by the user, or if the special
+ /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+ /// compression level.
+ Builder* compression_level(int compression_level) {
+ default_column_properties_.set_compression_level(compression_level);
+ return this;
+ }
+
+ /// \brief Specify a compression level for the compressor for the column
+ /// described by path.
+ ///
+ /// The provided compression level is compressor specific. Users have to
+ /// familiarize themselves with the available levels for the selected
+ /// compressor. If the compressor does not allow selecting different
+ /// compression levels, calling this function has no effect.
+ /// Parquet and Arrow do not validate the passed compression level. If no
+ /// level is selected by the user, or if the special
+ /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+ /// compression level.
+ Builder* compression_level(const std::string& path, int compression_level) {
+ codecs_compression_level_[path] = compression_level;
+ return this;
+ }
+
+ /// \brief Specify a compression level for the compressor for the column
+ /// described by path.
+ ///
+ /// The provided compression level is compressor specific. Users have to
+ /// familiarize themselves with the available levels for the selected
+ /// compressor. If the compressor does not allow selecting different
+ /// compression levels, calling this function has no effect.
+ /// Parquet and Arrow do not validate the passed compression level. If no
+ /// level is selected by the user, or if the special
+ /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+ /// compression level.
+ Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path,
+ int compression_level) {
+ return this->compression_level(path->ToDotString(), compression_level);
+ }
+
+ Builder* encryption(
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties) {
+ file_encryption_properties_ = std::move(file_encryption_properties);
+ return this;
+ }
+
+ Builder* enable_statistics() {
+ default_column_properties_.set_statistics_enabled(true);
+ return this;
+ }
+
+ Builder* disable_statistics() {
+ default_column_properties_.set_statistics_enabled(false);
+ return this;
+ }
+
+ Builder* enable_statistics(const std::string& path) {
+ statistics_enabled_[path] = true;
+ return this;
+ }
+
+ Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->enable_statistics(path->ToDotString());
+ }
+
+ Builder* disable_statistics(const std::string& path) {
+ statistics_enabled_[path] = false;
+ return this;
+ }
+
+ Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
+ return this->disable_statistics(path->ToDotString());
+ }
+
+ std::shared_ptr<WriterProperties> build() {
+ std::unordered_map<std::string, ColumnProperties> column_properties;
+ auto get = [&](const std::string& key) -> ColumnProperties& {
+ auto it = column_properties.find(key);
+ if (it == column_properties.end())
+ return column_properties[key] = default_column_properties_;
+ else
+ return it->second;
+ };
+
+ for (const auto& item : encodings_) get(item.first).set_encoding(item.second);
+ for (const auto& item : codecs_) get(item.first).set_compression(item.second);
+ for (const auto& item : codecs_compression_level_)
+ get(item.first).set_compression_level(item.second);
+ for (const auto& item : dictionary_enabled_)
+ get(item.first).set_dictionary_enabled(item.second);
+ for (const auto& item : statistics_enabled_)
+ get(item.first).set_statistics_enabled(item.second);
+
+ return std::shared_ptr<WriterProperties>(new WriterProperties(
+ pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
+ pagesize_, version_, created_by_, std::move(file_encryption_properties_),
+ default_column_properties_, column_properties, data_page_version_));
+ }
+
+ private:
+ MemoryPool* pool_;
+ int64_t dictionary_pagesize_limit_;
+ int64_t write_batch_size_;
+ int64_t max_row_group_length_;
+ int64_t pagesize_;
+ ParquetVersion::type version_;
+ ParquetDataPageVersion data_page_version_;
+ std::string created_by_;
+
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
+
+ // Settings used for each column unless overridden in any of the maps below
+ ColumnProperties default_column_properties_;
+ std::unordered_map<std::string, Encoding::type> encodings_;
+ std::unordered_map<std::string, Compression::type> codecs_;
+ std::unordered_map<std::string, int32_t> codecs_compression_level_;
+ std::unordered_map<std::string, bool> dictionary_enabled_;
+ std::unordered_map<std::string, bool> statistics_enabled_;
+ };
+
+ inline MemoryPool* memory_pool() const { return pool_; }
+
+ inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; }
+
+ inline int64_t write_batch_size() const { return write_batch_size_; }
+
+ inline int64_t max_row_group_length() const { return max_row_group_length_; }
+
+ inline int64_t data_pagesize() const { return pagesize_; }
+
+ inline ParquetDataPageVersion data_page_version() const {
+ return parquet_data_page_version_;
+ }
+
+ inline ParquetVersion::type version() const { return parquet_version_; }
+
+ inline std::string created_by() const { return parquet_created_by_; }
+
+ inline Encoding::type dictionary_index_encoding() const {
+ if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
+ return Encoding::PLAIN_DICTIONARY;
+ } else {
+ return Encoding::RLE_DICTIONARY;
+ }
+ }
+
+ inline Encoding::type dictionary_page_encoding() const {
+ if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
+ return Encoding::PLAIN_DICTIONARY;
+ } else {
+ return Encoding::PLAIN;
+ }
+ }
+
+ const ColumnProperties& column_properties(
+ const std::shared_ptr<schema::ColumnPath>& path) const {
+ auto it = column_properties_.find(path->ToDotString());
+ if (it != column_properties_.end()) return it->second;
+ return default_column_properties_;
+ }
+
+ Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).encoding();
+ }
+
+ Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).compression();
+ }
+
+ int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).compression_level();
+ }
+
+ bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).dictionary_enabled();
+ }
+
+ bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).statistics_enabled();
+ }
+
+ size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const {
+ return column_properties(path).max_statistics_size();
+ }
+
+ inline FileEncryptionProperties* file_encryption_properties() const {
+ return file_encryption_properties_.get();
+ }
+
+ std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
+ const std::string& path) const {
+ if (file_encryption_properties_) {
+ return file_encryption_properties_->column_encryption_properties(path);
+ } else {
+ return NULLPTR;
+ }
+ }
+
+ private:
+ explicit WriterProperties(
+ MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size,
+ int64_t max_row_group_length, int64_t pagesize, ParquetVersion::type version,
+ const std::string& created_by,
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
+ const ColumnProperties& default_column_properties,
+ const std::unordered_map<std::string, ColumnProperties>& column_properties,
+ ParquetDataPageVersion data_page_version)
+ : pool_(pool),
+ dictionary_pagesize_limit_(dictionary_pagesize_limit),
+ write_batch_size_(write_batch_size),
+ max_row_group_length_(max_row_group_length),
+ pagesize_(pagesize),
+ parquet_data_page_version_(data_page_version),
+ parquet_version_(version),
+ parquet_created_by_(created_by),
+ file_encryption_properties_(file_encryption_properties),
+ default_column_properties_(default_column_properties),
+ column_properties_(column_properties) {}
+
+ MemoryPool* pool_;
+ int64_t dictionary_pagesize_limit_;
+ int64_t write_batch_size_;
+ int64_t max_row_group_length_;
+ int64_t pagesize_;
+ ParquetDataPageVersion parquet_data_page_version_;
+ ParquetVersion::type parquet_version_;
+ std::string parquet_created_by_;
+
+ std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
+
+ ColumnProperties default_column_properties_;
+ std::unordered_map<std::string, ColumnProperties> column_properties_;
+};
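+
+// Example (illustrative sketch; "part.color" is a hypothetical column path):
+//
+//   WriterProperties::Builder builder;
+//   builder.compression(Compression::SNAPPY)  // default for all columns
+//       ->compression("part.color", Compression::ZSTD)
+//       ->compression_level("part.color", 6)  // codec-specific level
+//       ->disable_dictionary("part.color")
+//       ->max_row_group_length(1 << 20);
+//   std::shared_ptr<WriterProperties> props = builder.build();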
+
+PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties();
+
+// ----------------------------------------------------------------------
+// Properties specific to Apache Arrow columnar read and write
+
+static constexpr bool kArrowDefaultUseThreads = false;
+
+// Default number of rows to read when using ::arrow::RecordBatchReader
+static constexpr int64_t kArrowDefaultBatchSize = 64 * 1024;
+
+/// EXPERIMENTAL: Properties for configuring FileReader behavior.
+class PARQUET_EXPORT ArrowReaderProperties {
+ public:
+ explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads)
+ : use_threads_(use_threads),
+ read_dict_indices_(),
+ batch_size_(kArrowDefaultBatchSize),
+ pre_buffer_(false),
+ cache_options_(::arrow::io::CacheOptions::Defaults()),
+ coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO) {}
+
+ void set_use_threads(bool use_threads) { use_threads_ = use_threads; }
+
+ bool use_threads() const { return use_threads_; }
+
+ void set_read_dictionary(int column_index, bool read_dict) {
+ if (read_dict) {
+ read_dict_indices_.insert(column_index);
+ } else {
+ read_dict_indices_.erase(column_index);
+ }
+ }
+ bool read_dictionary(int column_index) const {
+ return read_dict_indices_.find(column_index) != read_dict_indices_.end();
+ }
+
+ void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; }
+
+ int64_t batch_size() const { return batch_size_; }
+
+ /// Enable read coalescing.
+ ///
+ /// When enabled, the Arrow reader will pre-buffer necessary regions
+ /// of the file in-memory. This is intended to improve performance on
+ /// high-latency filesystems (e.g. Amazon S3).
+ void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; }
+
+ bool pre_buffer() const { return pre_buffer_; }
+
+ /// Set options for read coalescing. This can be used to tune the
+ /// implementation for characteristics of different filesystems.
+ void set_cache_options(::arrow::io::CacheOptions options) { cache_options_ = options; }
+
+ const ::arrow::io::CacheOptions& cache_options() const { return cache_options_; }
+
+ /// Set execution context for read coalescing.
+ void set_io_context(const ::arrow::io::IOContext& ctx) { io_context_ = ctx; }
+
+ const ::arrow::io::IOContext& io_context() const { return io_context_; }
+
+ /// Set timestamp unit to use for deprecated INT96-encoded timestamps
+ /// (default is NANO).
+ void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) {
+ coerce_int96_timestamp_unit_ = unit;
+ }
+
+ ::arrow::TimeUnit::type coerce_int96_timestamp_unit() const {
+ return coerce_int96_timestamp_unit_;
+ }
+
+ private:
+ bool use_threads_;
+ std::unordered_set<int> read_dict_indices_;
+ int64_t batch_size_;
+ bool pre_buffer_;
+ ::arrow::io::IOContext io_context_;
+ ::arrow::io::CacheOptions cache_options_;
+ ::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
+};
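+
+// Example (illustrative sketch): tuning the reader for a high-latency
+// filesystem by enabling read coalescing, and decoding column 0 directly
+// to a dictionary-encoded Arrow array:
+//
+//   ArrowReaderProperties props(/*use_threads=*/true);
+//   props.set_pre_buffer(true);
+//   props.set_batch_size(128 * 1024);  // rows per record batch
+//   props.set_read_dictionary(0, true);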
+
+/// EXPERIMENTAL: Constructs the default ArrowReaderProperties
+PARQUET_EXPORT
+ArrowReaderProperties default_arrow_reader_properties();
+
+class PARQUET_EXPORT ArrowWriterProperties {
+ public:
+ enum EngineVersion {
+ V1, // Supports only nested lists.
+ V2 // Full support for all nesting combinations
+ };
+ class Builder {
+ public:
+ Builder()
+ : write_timestamps_as_int96_(false),
+ coerce_timestamps_enabled_(false),
+ coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
+ truncated_timestamps_allowed_(false),
+ store_schema_(false),
+ // TODO: At some point we should flip this.
+ compliant_nested_types_(false),
+ engine_version_(V2) {}
+ virtual ~Builder() = default;
+
+ Builder* disable_deprecated_int96_timestamps() {
+ write_timestamps_as_int96_ = false;
+ return this;
+ }
+
+ Builder* enable_deprecated_int96_timestamps() {
+ write_timestamps_as_int96_ = true;
+ return this;
+ }
+
+ Builder* coerce_timestamps(::arrow::TimeUnit::type unit) {
+ coerce_timestamps_enabled_ = true;
+ coerce_timestamps_unit_ = unit;
+ return this;
+ }
+
+ Builder* allow_truncated_timestamps() {
+ truncated_timestamps_allowed_ = true;
+ return this;
+ }
+
+ Builder* disallow_truncated_timestamps() {
+ truncated_timestamps_allowed_ = false;
+ return this;
+ }
+
+ /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file,
+ /// to enable certain read options (like "read_dictionary") to be set
+ /// automatically
+ Builder* store_schema() {
+ store_schema_ = true;
+ return this;
+ }
+
+ Builder* enable_compliant_nested_types() {
+ compliant_nested_types_ = true;
+ return this;
+ }
+
+ Builder* disable_compliant_nested_types() {
+ compliant_nested_types_ = false;
+ return this;
+ }
+
+ Builder* set_engine_version(EngineVersion version) {
+ engine_version_ = version;
+ return this;
+ }
+
+ std::shared_ptr<ArrowWriterProperties> build() {
+ return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
+ write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
+ truncated_timestamps_allowed_, store_schema_, compliant_nested_types_,
+ engine_version_));
+ }
+
+ private:
+ bool write_timestamps_as_int96_;
+
+ bool coerce_timestamps_enabled_;
+ ::arrow::TimeUnit::type coerce_timestamps_unit_;
+ bool truncated_timestamps_allowed_;
+
+ bool store_schema_;
+ bool compliant_nested_types_;
+ EngineVersion engine_version_;
+ };
+
+ bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }
+
+ bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }
+ ::arrow::TimeUnit::type coerce_timestamps_unit() const {
+ return coerce_timestamps_unit_;
+ }
+
+ bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; }
+
+ bool store_schema() const { return store_schema_; }
+
+ /// \brief Enable nested type naming according to the parquet specification.
+ ///
+ /// Older versions of Arrow wrote out field names for nested lists based on
+ /// the name of the field. According to the Parquet specification, they
+ /// should always be "element".
+ bool compliant_nested_types() const { return compliant_nested_types_; }
+
+ /// \brief The underlying engine version to use when writing Arrow data.
+ ///
+ /// V2 is currently the latest; V1 is considered deprecated but left in
+ /// place in case bugs are detected in V2.
+ EngineVersion engine_version() const { return engine_version_; }
+
+ private:
+ explicit ArrowWriterProperties(bool write_nanos_as_int96,
+ bool coerce_timestamps_enabled,
+ ::arrow::TimeUnit::type coerce_timestamps_unit,
+ bool truncated_timestamps_allowed, bool store_schema,
+ bool compliant_nested_types,
+ EngineVersion engine_version)
+ : write_timestamps_as_int96_(write_nanos_as_int96),
+ coerce_timestamps_enabled_(coerce_timestamps_enabled),
+ coerce_timestamps_unit_(coerce_timestamps_unit),
+ truncated_timestamps_allowed_(truncated_timestamps_allowed),
+ store_schema_(store_schema),
+ compliant_nested_types_(compliant_nested_types),
+ engine_version_(engine_version) {}
+
+ const bool write_timestamps_as_int96_;
+ const bool coerce_timestamps_enabled_;
+ const ::arrow::TimeUnit::type coerce_timestamps_unit_;
+ const bool truncated_timestamps_allowed_;
+ const bool store_schema_;
+ const bool compliant_nested_types_;
+ const EngineVersion engine_version_;
+};
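+
+// Example (illustrative sketch): coerce timestamps to milliseconds, tolerate
+// the resulting truncation, and store the Arrow schema for round-tripping:
+//
+//   std::shared_ptr<ArrowWriterProperties> arrow_props =
+//       ArrowWriterProperties::Builder()
+//           .coerce_timestamps(::arrow::TimeUnit::MILLI)
+//           ->allow_truncated_timestamps()
+//           ->store_schema()
+//           ->build();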
+
+/// \brief State object used for writing Arrow data directly to a Parquet
+/// column chunk. This API is possibly not stable.
+struct ArrowWriteContext {
+ ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties)
+ : memory_pool(memory_pool),
+ properties(properties),
+ data_buffer(AllocateBuffer(memory_pool)),
+ def_levels_buffer(AllocateBuffer(memory_pool)) {}
+
+ template <typename T>
+ ::arrow::Status GetScratchData(const int64_t num_values, T** out) {
+ ARROW_RETURN_NOT_OK(this->data_buffer->Resize(num_values * sizeof(T), false));
+ *out = reinterpret_cast<T*>(this->data_buffer->mutable_data());
+ return ::arrow::Status::OK();
+ }
+
+ MemoryPool* memory_pool;
+ const ArrowWriterProperties* properties;
+
+ // Buffer used for storing the data of an array converted to the physical type
+ // as expected by parquet-cpp.
+ std::shared_ptr<ResizableBuffer> data_buffer;
+
+ // We use shared ownership of this buffer.
+ std::shared_ptr<ResizableBuffer> def_levels_buffer;
+};
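+
+// Example (illustrative sketch; `ctx` is a hypothetical ArrowWriteContext*):
+//
+//   int64_t* scratch = NULLPTR;
+//   PARQUET_THROW_NOT_OK(ctx->GetScratchData<int64_t>(num_values, &scratch));
+//
+// `scratch` then points at num_values writable int64_t slots backed by
+// ctx->data_buffer; a subsequent GetScratchData call may invalidate it.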
+
+PARQUET_EXPORT
+std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/schema.cc b/contrib/libs/apache/arrow/cpp/src/parquet/schema.cc
index fe4e10d8514..cfa6bdb2912 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/schema.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/schema.cc
@@ -1,945 +1,945 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/schema.h"
-
-#include <algorithm>
-#include <cstring>
-#include <memory>
-#include <string>
-#include <type_traits>
-#include <utility>
-
-#include "arrow/util/logging.h"
-#include "parquet/exception.h"
-#include "parquet/schema_internal.h"
-#include "parquet/thrift_internal.h"
-
-using parquet::format::SchemaElement;
-
-namespace parquet {
-
-namespace schema {
-
-namespace {
-
-void ThrowInvalidLogicalType(const LogicalType& logical_type) {
- std::stringstream ss;
- ss << "Invalid logical type: " << logical_type.ToString();
- throw ParquetException(ss.str());
-}
-
-} // namespace
-
-// ----------------------------------------------------------------------
-// ColumnPath
-
-std::shared_ptr<ColumnPath> ColumnPath::FromDotString(const std::string& dotstring) {
- std::stringstream ss(dotstring);
- std::string item;
- std::vector<std::string> path;
- while (std::getline(ss, item, '.')) {
- path.push_back(item);
- }
- return std::make_shared<ColumnPath>(std::move(path));
-}
-
-std::shared_ptr<ColumnPath> ColumnPath::FromNode(const Node& node) {
- // Build the path in reverse order as we traverse the nodes to the top
- std::vector<std::string> rpath_;
- const Node* cursor = &node;
- // The schema node is not part of the ColumnPath
- while (cursor->parent()) {
- rpath_.push_back(cursor->name());
- cursor = cursor->parent();
- }
-
- // Build ColumnPath in correct order
- std::vector<std::string> path(rpath_.crbegin(), rpath_.crend());
- return std::make_shared<ColumnPath>(std::move(path));
-}
-
-std::shared_ptr<ColumnPath> ColumnPath::extend(const std::string& node_name) const {
-  std::vector<std::string> path;
-  path.reserve(path_.size() + 1);
-  path.assign(path_.cbegin(), path_.cend());
-  path.push_back(node_name);
-
- return std::make_shared<ColumnPath>(std::move(path));
-}
-
-std::string ColumnPath::ToDotString() const {
- std::stringstream ss;
- for (auto it = path_.cbegin(); it != path_.cend(); ++it) {
- if (it != path_.cbegin()) {
- ss << ".";
- }
- ss << *it;
- }
- return ss.str();
-}
-
-const std::vector<std::string>& ColumnPath::ToDotVector() const { return path_; }
-
-// ----------------------------------------------------------------------
-// Base node
-
-const std::shared_ptr<ColumnPath> Node::path() const {
- // TODO(itaiin): Cache the result, or more precisely, cache ->ToDotString()
- // since it is being used to access the leaf nodes
- return ColumnPath::FromNode(*this);
-}
-
-bool Node::EqualsInternal(const Node* other) const {
- return type_ == other->type_ && name_ == other->name_ &&
- repetition_ == other->repetition_ && converted_type_ == other->converted_type_ &&
- field_id_ == other->field_id() &&
- logical_type_->Equals(*(other->logical_type()));
-}
-
-void Node::SetParent(const Node* parent) { parent_ = parent; }
-
-// ----------------------------------------------------------------------
-// Primitive node
-
-PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
- Type::type type, ConvertedType::type converted_type,
- int length, int precision, int scale, int id)
- : Node(Node::PRIMITIVE, name, repetition, converted_type, id),
- physical_type_(type),
- type_length_(length) {
- std::stringstream ss;
-
- // PARQUET-842: In an earlier revision, decimal_metadata_.isset was being
-  // set to true, but Impala would raise an "incompatible metadata" error in such cases.
- memset(&decimal_metadata_, 0, sizeof(decimal_metadata_));
-
-  // Check that the physical type and converted type are compatible.
-  // Mapping taken from Apache parquet-mr as of 2016-02-22.
- switch (converted_type) {
- case ConvertedType::NONE:
- // Logical type not set
- break;
- case ConvertedType::UTF8:
- case ConvertedType::JSON:
- case ConvertedType::BSON:
- if (type != Type::BYTE_ARRAY) {
- ss << ConvertedTypeToString(converted_type);
- ss << " can only annotate BYTE_ARRAY fields";
- throw ParquetException(ss.str());
- }
- break;
- case ConvertedType::DECIMAL:
- if ((type != Type::INT32) && (type != Type::INT64) && (type != Type::BYTE_ARRAY) &&
- (type != Type::FIXED_LEN_BYTE_ARRAY)) {
- ss << "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY, and FIXED";
- throw ParquetException(ss.str());
- }
- if (precision <= 0) {
- ss << "Invalid DECIMAL precision: " << precision
- << ". Precision must be a number between 1 and 38 inclusive";
- throw ParquetException(ss.str());
- }
- if (scale < 0) {
- ss << "Invalid DECIMAL scale: " << scale
- << ". Scale must be a number between 0 and precision inclusive";
- throw ParquetException(ss.str());
- }
- if (scale > precision) {
- ss << "Invalid DECIMAL scale " << scale;
- ss << " cannot be greater than precision " << precision;
- throw ParquetException(ss.str());
- }
- decimal_metadata_.isset = true;
- decimal_metadata_.precision = precision;
- decimal_metadata_.scale = scale;
- break;
- case ConvertedType::DATE:
- case ConvertedType::TIME_MILLIS:
- case ConvertedType::UINT_8:
- case ConvertedType::UINT_16:
- case ConvertedType::UINT_32:
- case ConvertedType::INT_8:
- case ConvertedType::INT_16:
- case ConvertedType::INT_32:
- if (type != Type::INT32) {
- ss << ConvertedTypeToString(converted_type);
- ss << " can only annotate INT32";
- throw ParquetException(ss.str());
- }
- break;
- case ConvertedType::TIME_MICROS:
- case ConvertedType::TIMESTAMP_MILLIS:
- case ConvertedType::TIMESTAMP_MICROS:
- case ConvertedType::UINT_64:
- case ConvertedType::INT_64:
- if (type != Type::INT64) {
- ss << ConvertedTypeToString(converted_type);
- ss << " can only annotate INT64";
- throw ParquetException(ss.str());
- }
- break;
- case ConvertedType::INTERVAL:
- if ((type != Type::FIXED_LEN_BYTE_ARRAY) || (length != 12)) {
- ss << "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)";
- throw ParquetException(ss.str());
- }
- break;
- case ConvertedType::ENUM:
- if (type != Type::BYTE_ARRAY) {
- ss << "ENUM can only annotate BYTE_ARRAY fields";
- throw ParquetException(ss.str());
- }
- break;
- case ConvertedType::NA:
- // NA can annotate any type
- break;
- default:
- ss << ConvertedTypeToString(converted_type);
- ss << " cannot be applied to a primitive type";
- throw ParquetException(ss.str());
- }
- // For forward compatibility, create an equivalent logical type
- logical_type_ = LogicalType::FromConvertedType(converted_type_, decimal_metadata_);
- if (!(logical_type_ && !logical_type_->is_nested() &&
- logical_type_->is_compatible(converted_type_, decimal_metadata_))) {
- ThrowInvalidLogicalType(*logical_type_);
- }
-
- if (type == Type::FIXED_LEN_BYTE_ARRAY) {
- if (length <= 0) {
- ss << "Invalid FIXED_LEN_BYTE_ARRAY length: " << length;
- throw ParquetException(ss.str());
- }
- type_length_ = length;
- }
-}
-
-PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
- std::shared_ptr<const LogicalType> logical_type,
- Type::type physical_type, int physical_length, int id)
- : Node(Node::PRIMITIVE, name, repetition, std::move(logical_type), id),
- physical_type_(physical_type),
- type_length_(physical_length) {
- std::stringstream error;
- if (logical_type_) {
- // Check for logical type <=> node type consistency
- if (!logical_type_->is_nested()) {
- // Check for logical type <=> physical type consistency
- if (logical_type_->is_applicable(physical_type, physical_length)) {
- // For backward compatibility, assign equivalent legacy
- // converted type (if possible)
- converted_type_ = logical_type_->ToConvertedType(&decimal_metadata_);
- } else {
- error << logical_type_->ToString();
- error << " can not be applied to primitive type ";
- error << TypeToString(physical_type);
- throw ParquetException(error.str());
- }
- } else {
- error << "Nested logical type ";
- error << logical_type_->ToString();
- error << " can not be applied to non-group node";
- throw ParquetException(error.str());
- }
- } else {
- logical_type_ = NoLogicalType::Make();
- converted_type_ = logical_type_->ToConvertedType(&decimal_metadata_);
- }
- if (!(logical_type_ && !logical_type_->is_nested() &&
- logical_type_->is_compatible(converted_type_, decimal_metadata_))) {
- ThrowInvalidLogicalType(*logical_type_);
- }
-
- if (physical_type == Type::FIXED_LEN_BYTE_ARRAY) {
- if (physical_length <= 0) {
- error << "Invalid FIXED_LEN_BYTE_ARRAY length: " << physical_length;
- throw ParquetException(error.str());
- }
- }
-}
-
-bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const {
- bool is_equal = true;
- if (physical_type_ != other->physical_type_) {
- return false;
- }
- if (converted_type_ == ConvertedType::DECIMAL) {
- is_equal &= (decimal_metadata_.precision == other->decimal_metadata_.precision) &&
- (decimal_metadata_.scale == other->decimal_metadata_.scale);
- }
- if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
- is_equal &= (type_length_ == other->type_length_);
- }
- return is_equal;
-}
-
-bool PrimitiveNode::Equals(const Node* other) const {
- if (!Node::EqualsInternal(other)) {
- return false;
- }
- return EqualsInternal(static_cast<const PrimitiveNode*>(other));
-}
-
-void PrimitiveNode::Visit(Node::Visitor* visitor) { visitor->Visit(this); }
-
-void PrimitiveNode::VisitConst(Node::ConstVisitor* visitor) const {
- visitor->Visit(this);
-}
-
-// ----------------------------------------------------------------------
-// Group node
-
-GroupNode::GroupNode(const std::string& name, Repetition::type repetition,
- const NodeVector& fields, ConvertedType::type converted_type, int id)
- : Node(Node::GROUP, name, repetition, converted_type, id), fields_(fields) {
- // For forward compatibility, create an equivalent logical type
- logical_type_ = LogicalType::FromConvertedType(converted_type_);
- if (!(logical_type_ && (logical_type_->is_nested() || logical_type_->is_none()) &&
- logical_type_->is_compatible(converted_type_))) {
- ThrowInvalidLogicalType(*logical_type_);
- }
-
- field_name_to_idx_.clear();
- auto field_idx = 0;
- for (NodePtr& field : fields_) {
- field->SetParent(this);
- field_name_to_idx_.emplace(field->name(), field_idx++);
- }
-}
-
-GroupNode::GroupNode(const std::string& name, Repetition::type repetition,
- const NodeVector& fields,
- std::shared_ptr<const LogicalType> logical_type, int id)
- : Node(Node::GROUP, name, repetition, std::move(logical_type), id), fields_(fields) {
- if (logical_type_) {
- // Check for logical type <=> node type consistency
- if (logical_type_->is_nested()) {
- // For backward compatibility, assign equivalent legacy converted type (if possible)
- converted_type_ = logical_type_->ToConvertedType(nullptr);
- } else {
- std::stringstream error;
- error << "Logical type ";
- error << logical_type_->ToString();
- error << " can not be applied to group node";
- throw ParquetException(error.str());
- }
- } else {
- logical_type_ = NoLogicalType::Make();
- converted_type_ = logical_type_->ToConvertedType(nullptr);
- }
- if (!(logical_type_ && (logical_type_->is_nested() || logical_type_->is_none()) &&
- logical_type_->is_compatible(converted_type_))) {
- ThrowInvalidLogicalType(*logical_type_);
- }
-
- field_name_to_idx_.clear();
- auto field_idx = 0;
- for (NodePtr& field : fields_) {
- field->SetParent(this);
- field_name_to_idx_.emplace(field->name(), field_idx++);
- }
-}
-
-bool GroupNode::EqualsInternal(const GroupNode* other) const {
- if (this == other) {
- return true;
- }
- if (this->field_count() != other->field_count()) {
- return false;
- }
- for (int i = 0; i < this->field_count(); ++i) {
- if (!this->field(i)->Equals(other->field(i).get())) {
- return false;
- }
- }
- return true;
-}
-
-bool GroupNode::Equals(const Node* other) const {
- if (!Node::EqualsInternal(other)) {
- return false;
- }
- return EqualsInternal(static_cast<const GroupNode*>(other));
-}
-
-int GroupNode::FieldIndex(const std::string& name) const {
- auto search = field_name_to_idx_.find(name);
- if (search == field_name_to_idx_.end()) {
- // Not found
- return -1;
- }
- return search->second;
-}
-
-int GroupNode::FieldIndex(const Node& node) const {
- auto search = field_name_to_idx_.equal_range(node.name());
- for (auto it = search.first; it != search.second; ++it) {
- const int idx = it->second;
- if (&node == field(idx).get()) {
- return idx;
- }
- }
- return -1;
-}
-
-void GroupNode::Visit(Node::Visitor* visitor) { visitor->Visit(this); }
-
-void GroupNode::VisitConst(Node::ConstVisitor* visitor) const { visitor->Visit(this); }
-
-// ----------------------------------------------------------------------
-// Node construction from Parquet metadata
-
-std::unique_ptr<Node> GroupNode::FromParquet(const void* opaque_element,
- NodeVector fields) {
- const format::SchemaElement* element =
- static_cast<const format::SchemaElement*>(opaque_element);
-
- int field_id = -1;
- if (element->__isset.field_id) {
- field_id = element->field_id;
- }
-
- std::unique_ptr<GroupNode> group_node;
- if (element->__isset.logicalType) {
- // updated writer with logical type present
- group_node = std::unique_ptr<GroupNode>(
- new GroupNode(element->name, LoadEnumSafe(&element->repetition_type), fields,
- LogicalType::FromThrift(element->logicalType), field_id));
- } else {
- group_node = std::unique_ptr<GroupNode>(new GroupNode(
- element->name, LoadEnumSafe(&element->repetition_type), fields,
- (element->__isset.converted_type ? LoadEnumSafe(&element->converted_type)
- : ConvertedType::NONE),
- field_id));
- }
-
- return std::unique_ptr<Node>(group_node.release());
-}
-
-std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element) {
- const format::SchemaElement* element =
- static_cast<const format::SchemaElement*>(opaque_element);
-
- int field_id = -1;
- if (element->__isset.field_id) {
- field_id = element->field_id;
- }
-
- std::unique_ptr<PrimitiveNode> primitive_node;
- if (element->__isset.logicalType) {
- // updated writer with logical type present
- primitive_node = std::unique_ptr<PrimitiveNode>(
- new PrimitiveNode(element->name, LoadEnumSafe(&element->repetition_type),
- LogicalType::FromThrift(element->logicalType),
- LoadEnumSafe(&element->type), element->type_length, field_id));
- } else if (element->__isset.converted_type) {
- // legacy writer with converted type present
- primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
- element->name, LoadEnumSafe(&element->repetition_type),
- LoadEnumSafe(&element->type), LoadEnumSafe(&element->converted_type),
- element->type_length, element->precision, element->scale, field_id));
- } else {
- // logical type not present
- primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
- element->name, LoadEnumSafe(&element->repetition_type), NoLogicalType::Make(),
- LoadEnumSafe(&element->type), element->type_length, field_id));
- }
-
- // Return as unique_ptr to the base type
- return std::unique_ptr<Node>(primitive_node.release());
-}
-
-bool GroupNode::HasRepeatedFields() const {
- for (int i = 0; i < this->field_count(); ++i) {
- auto field = this->field(i);
- if (field->repetition() == Repetition::REPEATED) {
- return true;
- }
-    if (field->is_group()) {
-      const auto& group = static_cast<const GroupNode&>(*field);
-      // Recurse, but keep scanning the remaining sibling fields when the
-      // subgroup has no repeated fields.
-      if (group.HasRepeatedFields()) {
-        return true;
-      }
-    }
- }
- return false;
-}
-
-void GroupNode::ToParquet(void* opaque_element) const {
- format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element);
- element->__set_name(name_);
- element->__set_num_children(field_count());
- element->__set_repetition_type(ToThrift(repetition_));
- if (converted_type_ != ConvertedType::NONE) {
- element->__set_converted_type(ToThrift(converted_type_));
- }
- if (field_id_ >= 0) {
- element->__set_field_id(field_id_);
- }
- if (logical_type_ && logical_type_->is_serialized()) {
- element->__set_logicalType(logical_type_->ToThrift());
- }
- return;
-}
-
-void PrimitiveNode::ToParquet(void* opaque_element) const {
- format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element);
- element->__set_name(name_);
- element->__set_repetition_type(ToThrift(repetition_));
- if (converted_type_ != ConvertedType::NONE) {
- if (converted_type_ != ConvertedType::NA) {
- element->__set_converted_type(ToThrift(converted_type_));
- } else {
- // ConvertedType::NA is an unreleased, obsolete synonym for LogicalType::Null.
- // Never emit it (see PARQUET-1990 for discussion).
- if (!logical_type_ || !logical_type_->is_null()) {
- throw ParquetException(
- "ConvertedType::NA is obsolete, please use LogicalType::Null instead");
- }
- }
- }
- if (field_id_ >= 0) {
- element->__set_field_id(field_id_);
- }
- if (logical_type_ && logical_type_->is_serialized() &&
- // TODO(tpboudreau): remove the following conjunct to enable serialization
- // of IntervalTypes after parquet.thrift recognizes them
- !logical_type_->is_interval()) {
- element->__set_logicalType(logical_type_->ToThrift());
- }
- element->__set_type(ToThrift(physical_type_));
- if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
- element->__set_type_length(type_length_);
- }
- if (decimal_metadata_.isset) {
- element->__set_precision(decimal_metadata_.precision);
- element->__set_scale(decimal_metadata_.scale);
- }
- return;
-}
-
-// ----------------------------------------------------------------------
-// Schema converters
-
-std::unique_ptr<Node> Unflatten(const format::SchemaElement* elements, int length) {
- if (elements[0].num_children == 0) {
- if (length == 1) {
- // Degenerate case of Parquet file with no columns
- return GroupNode::FromParquet(elements, {});
- } else {
- throw ParquetException(
- "Parquet schema had multiple nodes but root had no children");
- }
- }
-
- // We don't check that the root node is repeated since this is not
- // consistently set by implementations
-
- int pos = 0;
-
- std::function<std::unique_ptr<Node>()> NextNode = [&]() {
- if (pos == length) {
- throw ParquetException("Malformed schema: not enough elements");
- }
- const SchemaElement& element = elements[pos++];
- const void* opaque_element = static_cast<const void*>(&element);
-
- if (element.num_children == 0 && element.__isset.type) {
- // Leaf (primitive) node: always has a type
- return PrimitiveNode::FromParquet(opaque_element);
- } else {
- // Group node (may have 0 children, but cannot have a type)
- NodeVector fields;
- for (int i = 0; i < element.num_children; ++i) {
- std::unique_ptr<Node> field = NextNode();
- fields.push_back(NodePtr(field.release()));
- }
- return GroupNode::FromParquet(opaque_element, std::move(fields));
- }
- };
- return NextNode();
-}
-
-std::shared_ptr<SchemaDescriptor> FromParquet(const std::vector<SchemaElement>& schema) {
- if (schema.empty()) {
- throw ParquetException("Empty file schema (no root)");
- }
- std::unique_ptr<Node> root = Unflatten(&schema[0], static_cast<int>(schema.size()));
- std::shared_ptr<SchemaDescriptor> descr = std::make_shared<SchemaDescriptor>();
- descr->Init(std::shared_ptr<GroupNode>(static_cast<GroupNode*>(root.release())));
- return descr;
-}
-
-class SchemaVisitor : public Node::ConstVisitor {
- public:
- explicit SchemaVisitor(std::vector<format::SchemaElement>* elements)
- : elements_(elements) {}
-
- void Visit(const Node* node) override {
- format::SchemaElement element;
- node->ToParquet(&element);
- elements_->push_back(element);
-
- if (node->is_group()) {
- const GroupNode* group_node = static_cast<const GroupNode*>(node);
- for (int i = 0; i < group_node->field_count(); ++i) {
- group_node->field(i)->VisitConst(this);
- }
- }
- }
-
- private:
- std::vector<format::SchemaElement>* elements_;
-};
-
-void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out) {
- SchemaVisitor visitor(out);
- schema->VisitConst(&visitor);
-}
-
-// ----------------------------------------------------------------------
-// Schema printing
-
-static void PrintRepLevel(Repetition::type repetition, std::ostream& stream) {
- switch (repetition) {
- case Repetition::REQUIRED:
- stream << "required";
- break;
- case Repetition::OPTIONAL:
- stream << "optional";
- break;
- case Repetition::REPEATED:
- stream << "repeated";
- break;
- default:
- break;
- }
-}
-
-static void PrintType(const PrimitiveNode* node, std::ostream& stream) {
- switch (node->physical_type()) {
- case Type::BOOLEAN:
- stream << "boolean";
- break;
- case Type::INT32:
- stream << "int32";
- break;
- case Type::INT64:
- stream << "int64";
- break;
- case Type::INT96:
- stream << "int96";
- break;
- case Type::FLOAT:
- stream << "float";
- break;
- case Type::DOUBLE:
- stream << "double";
- break;
- case Type::BYTE_ARRAY:
- stream << "binary";
- break;
- case Type::FIXED_LEN_BYTE_ARRAY:
- stream << "fixed_len_byte_array(" << node->type_length() << ")";
- break;
- default:
- break;
- }
-}
-
-static void PrintConvertedType(const PrimitiveNode* node, std::ostream& stream) {
- auto lt = node->converted_type();
- auto la = node->logical_type();
- if (la && la->is_valid() && !la->is_none()) {
- stream << " (" << la->ToString() << ")";
- } else if (lt == ConvertedType::DECIMAL) {
- stream << " (" << ConvertedTypeToString(lt) << "("
- << node->decimal_metadata().precision << "," << node->decimal_metadata().scale
- << "))";
- } else if (lt != ConvertedType::NONE) {
- stream << " (" << ConvertedTypeToString(lt) << ")";
- }
-}
-
-struct SchemaPrinter : public Node::ConstVisitor {
- explicit SchemaPrinter(std::ostream& stream, int indent_width)
-      : stream_(stream), indent_(0), indent_width_(indent_width) {}
-
- void Indent() {
- if (indent_ > 0) {
- std::string spaces(indent_, ' ');
- stream_ << spaces;
- }
- }
-
- void Visit(const Node* node) {
- Indent();
- if (node->is_group()) {
- Visit(static_cast<const GroupNode*>(node));
- } else {
- // Primitive
- Visit(static_cast<const PrimitiveNode*>(node));
- }
- }
-
- void Visit(const PrimitiveNode* node) {
- PrintRepLevel(node->repetition(), stream_);
- stream_ << " ";
- PrintType(node, stream_);
- stream_ << " field_id=" << node->field_id() << " " << node->name();
- PrintConvertedType(node, stream_);
- stream_ << ";" << std::endl;
- }
-
- void Visit(const GroupNode* node) {
- PrintRepLevel(node->repetition(), stream_);
- stream_ << " group "
- << "field_id=" << node->field_id() << " " << node->name();
- auto lt = node->converted_type();
- auto la = node->logical_type();
- if (la && la->is_valid() && !la->is_none()) {
- stream_ << " (" << la->ToString() << ")";
- } else if (lt != ConvertedType::NONE) {
- stream_ << " (" << ConvertedTypeToString(lt) << ")";
- }
- stream_ << " {" << std::endl;
-
- indent_ += indent_width_;
- for (int i = 0; i < node->field_count(); ++i) {
- node->field(i)->VisitConst(this);
- }
- indent_ -= indent_width_;
- Indent();
- stream_ << "}" << std::endl;
- }
-
- std::ostream& stream_;
- int indent_;
- int indent_width_;
-};
-
-void PrintSchema(const Node* schema, std::ostream& stream, int indent_width) {
- SchemaPrinter printer(stream, indent_width);
- printer.Visit(schema);
-}
-
-} // namespace schema
-
-using schema::ColumnPath;
-using schema::GroupNode;
-using schema::Node;
-using schema::NodePtr;
-using schema::PrimitiveNode;
-
-void SchemaDescriptor::Init(std::unique_ptr<schema::Node> schema) {
- Init(NodePtr(schema.release()));
-}
-
-class SchemaUpdater : public Node::Visitor {
- public:
- explicit SchemaUpdater(const std::vector<ColumnOrder>& column_orders)
- : column_orders_(column_orders), leaf_count_(0) {}
-
- void Visit(Node* node) override {
- if (node->is_group()) {
- GroupNode* group_node = static_cast<GroupNode*>(node);
- for (int i = 0; i < group_node->field_count(); ++i) {
- group_node->field(i)->Visit(this);
- }
- } else { // leaf node
- PrimitiveNode* leaf_node = static_cast<PrimitiveNode*>(node);
- leaf_node->SetColumnOrder(column_orders_[leaf_count_++]);
- }
- }
-
- private:
- const std::vector<ColumnOrder>& column_orders_;
- int leaf_count_;
-};
-
-void SchemaDescriptor::updateColumnOrders(const std::vector<ColumnOrder>& column_orders) {
- if (static_cast<int>(column_orders.size()) != num_columns()) {
- throw ParquetException("Malformed schema: not enough ColumnOrder values");
- }
- SchemaUpdater visitor(column_orders);
- const_cast<GroupNode*>(group_node_)->Visit(&visitor);
-}
-
-void SchemaDescriptor::Init(NodePtr schema) {
- schema_ = std::move(schema);
-
- if (!schema_->is_group()) {
- throw ParquetException("Must initialize with a schema group");
- }
-
- group_node_ = static_cast<const GroupNode*>(schema_.get());
- leaves_.clear();
-
- for (int i = 0; i < group_node_->field_count(); ++i) {
- BuildTree(group_node_->field(i), 0, 0, group_node_->field(i));
- }
-}
-
-bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const {
- if (this->num_columns() != other.num_columns()) {
- return false;
- }
-
- for (int i = 0; i < this->num_columns(); ++i) {
- if (!this->Column(i)->Equals(*other.Column(i))) {
- return false;
- }
- }
-
- return true;
-}
-
-void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level,
- int16_t max_rep_level, const NodePtr& base) {
- if (node->is_optional()) {
- ++max_def_level;
- } else if (node->is_repeated()) {
- // Repeated fields add a definition level. This is used to distinguish
- // between an empty list and a list with an item in it.
- ++max_rep_level;
- ++max_def_level;
- }
-
- // Now, walk the schema and create a ColumnDescriptor for each leaf node
- if (node->is_group()) {
- const GroupNode* group = static_cast<const GroupNode*>(node.get());
- for (int i = 0; i < group->field_count(); ++i) {
- BuildTree(group->field(i), max_def_level, max_rep_level, base);
- }
- } else {
- node_to_leaf_index_[static_cast<const PrimitiveNode*>(node.get())] =
- static_cast<int>(leaves_.size());
-
- // Primitive node, append to leaves
- leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this));
- leaf_to_base_.emplace(static_cast<int>(leaves_.size()) - 1, base);
- leaf_to_idx_.emplace(node->path()->ToDotString(),
- static_cast<int>(leaves_.size()) - 1);
- }
-}
-
-int SchemaDescriptor::GetColumnIndex(const PrimitiveNode& node) const {
- auto it = node_to_leaf_index_.find(&node);
- if (it == node_to_leaf_index_.end()) {
- return -1;
- }
- return it->second;
-}
-
-ColumnDescriptor::ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
- int16_t max_repetition_level,
- const SchemaDescriptor* schema_descr)
- : node_(std::move(node)),
- max_definition_level_(max_definition_level),
- max_repetition_level_(max_repetition_level) {
- if (!node_->is_primitive()) {
- throw ParquetException("Must be a primitive type");
- }
- primitive_node_ = static_cast<const PrimitiveNode*>(node_.get());
-}
-
-bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const {
- return primitive_node_->Equals(other.primitive_node_) &&
- max_repetition_level() == other.max_repetition_level() &&
- max_definition_level() == other.max_definition_level();
-}
-
-const ColumnDescriptor* SchemaDescriptor::Column(int i) const {
- DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
- return &leaves_[i];
-}
-
-int SchemaDescriptor::ColumnIndex(const std::string& node_path) const {
- auto search = leaf_to_idx_.find(node_path);
- if (search == leaf_to_idx_.end()) {
- // Not found
- return -1;
- }
- return search->second;
-}
-
-int SchemaDescriptor::ColumnIndex(const Node& node) const {
- auto search = leaf_to_idx_.equal_range(node.path()->ToDotString());
- for (auto it = search.first; it != search.second; ++it) {
- const int idx = it->second;
- if (&node == Column(idx)->schema_node().get()) {
- return idx;
- }
- }
- return -1;
-}
-
-const schema::Node* SchemaDescriptor::GetColumnRoot(int i) const {
- DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
- return leaf_to_base_.find(i)->second.get();
-}
-
-bool SchemaDescriptor::HasRepeatedFields() const {
- return group_node_->HasRepeatedFields();
-}
-
-std::string SchemaDescriptor::ToString() const {
- std::ostringstream ss;
- PrintSchema(schema_.get(), ss);
- return ss.str();
-}
-
-std::string ColumnDescriptor::ToString() const {
- std::ostringstream ss;
- ss << "column descriptor = {" << std::endl
- << " name: " << name() << "," << std::endl
- << " path: " << path()->ToDotString() << "," << std::endl
- << " physical_type: " << TypeToString(physical_type()) << "," << std::endl
- << " converted_type: " << ConvertedTypeToString(converted_type()) << ","
- << std::endl
- << " logical_type: " << logical_type()->ToString() << "," << std::endl
- << " max_definition_level: " << max_definition_level() << "," << std::endl
- << " max_repetition_level: " << max_repetition_level() << "," << std::endl;
-
- if (physical_type() == ::parquet::Type::FIXED_LEN_BYTE_ARRAY) {
- ss << " length: " << type_length() << "," << std::endl;
- }
-
- if (converted_type() == parquet::ConvertedType::DECIMAL) {
- ss << " precision: " << type_precision() << "," << std::endl
- << " scale: " << type_scale() << "," << std::endl;
- }
-
- ss << "}";
- return ss.str();
-}
-
-int ColumnDescriptor::type_scale() const {
- return primitive_node_->decimal_metadata().scale;
-}
-
-int ColumnDescriptor::type_precision() const {
- return primitive_node_->decimal_metadata().precision;
-}
-
-int ColumnDescriptor::type_length() const { return primitive_node_->type_length(); }
-
-const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const {
- return primitive_node_->path();
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/schema.h"
+
+#include <algorithm>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/util/logging.h"
+#include "parquet/exception.h"
+#include "parquet/schema_internal.h"
+#include "parquet/thrift_internal.h"
+
+using parquet::format::SchemaElement;
+
+namespace parquet {
+
+namespace schema {
+
+namespace {
+
+void ThrowInvalidLogicalType(const LogicalType& logical_type) {
+ std::stringstream ss;
+ ss << "Invalid logical type: " << logical_type.ToString();
+ throw ParquetException(ss.str());
+}
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// ColumnPath
+
+std::shared_ptr<ColumnPath> ColumnPath::FromDotString(const std::string& dotstring) {
+ std::stringstream ss(dotstring);
+ std::string item;
+ std::vector<std::string> path;
+ while (std::getline(ss, item, '.')) {
+ path.push_back(item);
+ }
+ return std::make_shared<ColumnPath>(std::move(path));
+}
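+
+// Illustrative behavior (sketch):
+//   auto p = ColumnPath::FromDotString("a.b.c");
+//   p->ToDotVector();  // {"a", "b", "c"}
+//   p->ToDotString();  // "a.b.c" (round-trips)
+// An empty dotstring yields an empty path; interior empty segments ("a..b")
+// are preserved as empty strings by std::getline.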
+
+std::shared_ptr<ColumnPath> ColumnPath::FromNode(const Node& node) {
+ // Build the path in reverse order as we traverse the nodes to the top
+ std::vector<std::string> rpath_;
+ const Node* cursor = &node;
+ // The schema node is not part of the ColumnPath
+ while (cursor->parent()) {
+ rpath_.push_back(cursor->name());
+ cursor = cursor->parent();
+ }
+
+ // Build ColumnPath in correct order
+ std::vector<std::string> path(rpath_.crbegin(), rpath_.crend());
+ return std::make_shared<ColumnPath>(std::move(path));
+}
+
+std::shared_ptr<ColumnPath> ColumnPath::extend(const std::string& node_name) const {
+  std::vector<std::string> path;
+  path.reserve(path_.size() + 1);
+  path.assign(path_.cbegin(), path_.cend());
+  path.push_back(node_name);
+
+ return std::make_shared<ColumnPath>(std::move(path));
+}
+
+std::string ColumnPath::ToDotString() const {
+ std::stringstream ss;
+ for (auto it = path_.cbegin(); it != path_.cend(); ++it) {
+ if (it != path_.cbegin()) {
+ ss << ".";
+ }
+ ss << *it;
+ }
+ return ss.str();
+}
+
+const std::vector<std::string>& ColumnPath::ToDotVector() const { return path_; }
+
+// ----------------------------------------------------------------------
+// Base node
+
+const std::shared_ptr<ColumnPath> Node::path() const {
+ // TODO(itaiin): Cache the result, or more precisely, cache ->ToDotString()
+ // since it is being used to access the leaf nodes
+ return ColumnPath::FromNode(*this);
+}
+
+bool Node::EqualsInternal(const Node* other) const {
+ return type_ == other->type_ && name_ == other->name_ &&
+ repetition_ == other->repetition_ && converted_type_ == other->converted_type_ &&
+ field_id_ == other->field_id() &&
+ logical_type_->Equals(*(other->logical_type()));
+}
+
+void Node::SetParent(const Node* parent) { parent_ = parent; }
+
+// ----------------------------------------------------------------------
+// Primitive node
+
+PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
+ Type::type type, ConvertedType::type converted_type,
+ int length, int precision, int scale, int id)
+ : Node(Node::PRIMITIVE, name, repetition, converted_type, id),
+ physical_type_(type),
+ type_length_(length) {
+ std::stringstream ss;
+
+ // PARQUET-842: In an earlier revision, decimal_metadata_.isset was being
+  // set to true, but Impala would raise an "incompatible metadata" error in such cases.
+ memset(&decimal_metadata_, 0, sizeof(decimal_metadata_));
+
+  // Check that the physical type and converted type are compatible.
+  // Mapping taken from Apache parquet-mr as of 2016-02-22.
+ switch (converted_type) {
+ case ConvertedType::NONE:
+ // Logical type not set
+ break;
+ case ConvertedType::UTF8:
+ case ConvertedType::JSON:
+ case ConvertedType::BSON:
+ if (type != Type::BYTE_ARRAY) {
+ ss << ConvertedTypeToString(converted_type);
+ ss << " can only annotate BYTE_ARRAY fields";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::DECIMAL:
+ if ((type != Type::INT32) && (type != Type::INT64) && (type != Type::BYTE_ARRAY) &&
+ (type != Type::FIXED_LEN_BYTE_ARRAY)) {
+ ss << "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY, and FIXED";
+ throw ParquetException(ss.str());
+ }
+ if (precision <= 0) {
+ ss << "Invalid DECIMAL precision: " << precision
+ << ". Precision must be a number between 1 and 38 inclusive";
+ throw ParquetException(ss.str());
+ }
+ if (scale < 0) {
+ ss << "Invalid DECIMAL scale: " << scale
+ << ". Scale must be a number between 0 and precision inclusive";
+ throw ParquetException(ss.str());
+ }
+ if (scale > precision) {
+ ss << "Invalid DECIMAL scale " << scale;
+ ss << " cannot be greater than precision " << precision;
+ throw ParquetException(ss.str());
+ }
+ decimal_metadata_.isset = true;
+ decimal_metadata_.precision = precision;
+ decimal_metadata_.scale = scale;
+ break;
+ case ConvertedType::DATE:
+ case ConvertedType::TIME_MILLIS:
+ case ConvertedType::UINT_8:
+ case ConvertedType::UINT_16:
+ case ConvertedType::UINT_32:
+ case ConvertedType::INT_8:
+ case ConvertedType::INT_16:
+ case ConvertedType::INT_32:
+ if (type != Type::INT32) {
+ ss << ConvertedTypeToString(converted_type);
+ ss << " can only annotate INT32";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::TIME_MICROS:
+ case ConvertedType::TIMESTAMP_MILLIS:
+ case ConvertedType::TIMESTAMP_MICROS:
+ case ConvertedType::UINT_64:
+ case ConvertedType::INT_64:
+ if (type != Type::INT64) {
+ ss << ConvertedTypeToString(converted_type);
+ ss << " can only annotate INT64";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::INTERVAL:
+ if ((type != Type::FIXED_LEN_BYTE_ARRAY) || (length != 12)) {
+ ss << "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::ENUM:
+ if (type != Type::BYTE_ARRAY) {
+ ss << "ENUM can only annotate BYTE_ARRAY fields";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case ConvertedType::NA:
+ // NA can annotate any type
+ break;
+ default:
+ ss << ConvertedTypeToString(converted_type);
+ ss << " cannot be applied to a primitive type";
+ throw ParquetException(ss.str());
+ }
+ // For forward compatibility, create an equivalent logical type
+ logical_type_ = LogicalType::FromConvertedType(converted_type_, decimal_metadata_);
+ if (!(logical_type_ && !logical_type_->is_nested() &&
+ logical_type_->is_compatible(converted_type_, decimal_metadata_))) {
+ ThrowInvalidLogicalType(*logical_type_);
+ }
+
+ if (type == Type::FIXED_LEN_BYTE_ARRAY) {
+ if (length <= 0) {
+ ss << "Invalid FIXED_LEN_BYTE_ARRAY length: " << length;
+ throw ParquetException(ss.str());
+ }
+ type_length_ = length;
+ }
+}
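+
+// Example of the checks above (sketch, via the PrimitiveNode::Make factory
+// declared in schema.h): a DECIMAL annotation is accepted only on the listed
+// physical types and only with a valid precision/scale pair, so
+//
+//   PrimitiveNode::Make("price", Repetition::REQUIRED, Type::INT64,
+//                       ConvertedType::DECIMAL, /*length=*/-1,
+//                       /*precision=*/10, /*scale=*/2);
+//
+// succeeds, while the same call with Type::BOOLEAN (or with scale > precision)
+// throws ParquetException.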
+
+PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
+ std::shared_ptr<const LogicalType> logical_type,
+ Type::type physical_type, int physical_length, int id)
+ : Node(Node::PRIMITIVE, name, repetition, std::move(logical_type), id),
+ physical_type_(physical_type),
+ type_length_(physical_length) {
+ std::stringstream error;
+ if (logical_type_) {
+ // Check for logical type <=> node type consistency
+ if (!logical_type_->is_nested()) {
+ // Check for logical type <=> physical type consistency
+ if (logical_type_->is_applicable(physical_type, physical_length)) {
+ // For backward compatibility, assign equivalent legacy
+ // converted type (if possible)
+ converted_type_ = logical_type_->ToConvertedType(&decimal_metadata_);
+ } else {
+ error << logical_type_->ToString();
+ error << " can not be applied to primitive type ";
+ error << TypeToString(physical_type);
+ throw ParquetException(error.str());
+ }
+ } else {
+ error << "Nested logical type ";
+ error << logical_type_->ToString();
+ error << " can not be applied to non-group node";
+ throw ParquetException(error.str());
+ }
+ } else {
+ logical_type_ = NoLogicalType::Make();
+ converted_type_ = logical_type_->ToConvertedType(&decimal_metadata_);
+ }
+ if (!(logical_type_ && !logical_type_->is_nested() &&
+ logical_type_->is_compatible(converted_type_, decimal_metadata_))) {
+ ThrowInvalidLogicalType(*logical_type_);
+ }
+
+ if (physical_type == Type::FIXED_LEN_BYTE_ARRAY) {
+ if (physical_length <= 0) {
+ error << "Invalid FIXED_LEN_BYTE_ARRAY length: " << physical_length;
+ throw ParquetException(error.str());
+ }
+ }
+}
+
+bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const {
+ bool is_equal = true;
+ if (physical_type_ != other->physical_type_) {
+ return false;
+ }
+ if (converted_type_ == ConvertedType::DECIMAL) {
+ is_equal &= (decimal_metadata_.precision == other->decimal_metadata_.precision) &&
+ (decimal_metadata_.scale == other->decimal_metadata_.scale);
+ }
+ if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
+ is_equal &= (type_length_ == other->type_length_);
+ }
+ return is_equal;
+}
+
+bool PrimitiveNode::Equals(const Node* other) const {
+ if (!Node::EqualsInternal(other)) {
+ return false;
+ }
+ return EqualsInternal(static_cast<const PrimitiveNode*>(other));
+}
+
+void PrimitiveNode::Visit(Node::Visitor* visitor) { visitor->Visit(this); }
+
+void PrimitiveNode::VisitConst(Node::ConstVisitor* visitor) const {
+ visitor->Visit(this);
+}
+
+// ----------------------------------------------------------------------
+// Group node
+
+GroupNode::GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields, ConvertedType::type converted_type, int id)
+ : Node(Node::GROUP, name, repetition, converted_type, id), fields_(fields) {
+ // For forward compatibility, create an equivalent logical type
+ logical_type_ = LogicalType::FromConvertedType(converted_type_);
+ if (!(logical_type_ && (logical_type_->is_nested() || logical_type_->is_none()) &&
+ logical_type_->is_compatible(converted_type_))) {
+ ThrowInvalidLogicalType(*logical_type_);
+ }
+
+ field_name_to_idx_.clear();
+ auto field_idx = 0;
+ for (NodePtr& field : fields_) {
+ field->SetParent(this);
+ field_name_to_idx_.emplace(field->name(), field_idx++);
+ }
+}
+
+GroupNode::GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields,
+ std::shared_ptr<const LogicalType> logical_type, int id)
+ : Node(Node::GROUP, name, repetition, std::move(logical_type), id), fields_(fields) {
+ if (logical_type_) {
+ // Check for logical type <=> node type consistency
+ if (logical_type_->is_nested()) {
+ // For backward compatibility, assign equivalent legacy converted type (if possible)
+ converted_type_ = logical_type_->ToConvertedType(nullptr);
+ } else {
+ std::stringstream error;
+ error << "Logical type ";
+ error << logical_type_->ToString();
+ error << " can not be applied to group node";
+ throw ParquetException(error.str());
+ }
+ } else {
+ logical_type_ = NoLogicalType::Make();
+ converted_type_ = logical_type_->ToConvertedType(nullptr);
+ }
+ if (!(logical_type_ && (logical_type_->is_nested() || logical_type_->is_none()) &&
+ logical_type_->is_compatible(converted_type_))) {
+ ThrowInvalidLogicalType(*logical_type_);
+ }
+
+ field_name_to_idx_.clear();
+ auto field_idx = 0;
+ for (NodePtr& field : fields_) {
+ field->SetParent(this);
+ field_name_to_idx_.emplace(field->name(), field_idx++);
+ }
+}
+
+bool GroupNode::EqualsInternal(const GroupNode* other) const {
+ if (this == other) {
+ return true;
+ }
+ if (this->field_count() != other->field_count()) {
+ return false;
+ }
+ for (int i = 0; i < this->field_count(); ++i) {
+ if (!this->field(i)->Equals(other->field(i).get())) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool GroupNode::Equals(const Node* other) const {
+ if (!Node::EqualsInternal(other)) {
+ return false;
+ }
+ return EqualsInternal(static_cast<const GroupNode*>(other));
+}
+
+int GroupNode::FieldIndex(const std::string& name) const {
+ auto search = field_name_to_idx_.find(name);
+ if (search == field_name_to_idx_.end()) {
+ // Not found
+ return -1;
+ }
+ return search->second;
+}
+
+int GroupNode::FieldIndex(const Node& node) const {
+ auto search = field_name_to_idx_.equal_range(node.name());
+ for (auto it = search.first; it != search.second; ++it) {
+ const int idx = it->second;
+ if (&node == field(idx).get()) {
+ return idx;
+ }
+ }
+ return -1;
+}
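+
+// Note on duplicate names (sketch): field_name_to_idx_ behaves as a multimap
+// (hence the equal_range above), so a group may contain several fields with
+// the same name. This overload disambiguates by node identity, e.g.
+//   group->FieldIndex(*group->field(1)) == 1
+// even when field(0) and field(1) share a name, whereas the string overload
+// returns a single matching index.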
+
+void GroupNode::Visit(Node::Visitor* visitor) { visitor->Visit(this); }
+
+void GroupNode::VisitConst(Node::ConstVisitor* visitor) const { visitor->Visit(this); }
+
+// ----------------------------------------------------------------------
+// Node construction from Parquet metadata
+
+std::unique_ptr<Node> GroupNode::FromParquet(const void* opaque_element,
+ NodeVector fields) {
+ const format::SchemaElement* element =
+ static_cast<const format::SchemaElement*>(opaque_element);
+
+ int field_id = -1;
+ if (element->__isset.field_id) {
+ field_id = element->field_id;
+ }
+
+ std::unique_ptr<GroupNode> group_node;
+ if (element->__isset.logicalType) {
+ // updated writer with logical type present
+ group_node = std::unique_ptr<GroupNode>(
+ new GroupNode(element->name, LoadEnumSafe(&element->repetition_type), fields,
+ LogicalType::FromThrift(element->logicalType), field_id));
+ } else {
+ group_node = std::unique_ptr<GroupNode>(new GroupNode(
+ element->name, LoadEnumSafe(&element->repetition_type), fields,
+ (element->__isset.converted_type ? LoadEnumSafe(&element->converted_type)
+ : ConvertedType::NONE),
+ field_id));
+ }
+
+ return std::unique_ptr<Node>(group_node.release());
+}
+
+std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element) {
+ const format::SchemaElement* element =
+ static_cast<const format::SchemaElement*>(opaque_element);
+
+ int field_id = -1;
+ if (element->__isset.field_id) {
+ field_id = element->field_id;
+ }
+
+ std::unique_ptr<PrimitiveNode> primitive_node;
+ if (element->__isset.logicalType) {
+ // updated writer with logical type present
+ primitive_node = std::unique_ptr<PrimitiveNode>(
+ new PrimitiveNode(element->name, LoadEnumSafe(&element->repetition_type),
+ LogicalType::FromThrift(element->logicalType),
+ LoadEnumSafe(&element->type), element->type_length, field_id));
+ } else if (element->__isset.converted_type) {
+ // legacy writer with converted type present
+ primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
+ element->name, LoadEnumSafe(&element->repetition_type),
+ LoadEnumSafe(&element->type), LoadEnumSafe(&element->converted_type),
+ element->type_length, element->precision, element->scale, field_id));
+ } else {
+ // logical type not present
+ primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
+ element->name, LoadEnumSafe(&element->repetition_type), NoLogicalType::Make(),
+ LoadEnumSafe(&element->type), element->type_length, field_id));
+ }
+
+ // Return as unique_ptr to the base type
+ return std::unique_ptr<Node>(primitive_node.release());
+}
+
+bool GroupNode::HasRepeatedFields() const {
+ for (int i = 0; i < this->field_count(); ++i) {
+ auto field = this->field(i);
+ if (field->repetition() == Repetition::REPEATED) {
+ return true;
+ }
+    if (field->is_group()) {
+      const auto& group = static_cast<const GroupNode&>(*field);
+      // Recurse, but keep scanning the remaining sibling fields when the
+      // subgroup has no repeated fields.
+      if (group.HasRepeatedFields()) {
+        return true;
+      }
+    }
+ }
+ return false;
+}
+
+void GroupNode::ToParquet(void* opaque_element) const {
+ format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element);
+ element->__set_name(name_);
+ element->__set_num_children(field_count());
+ element->__set_repetition_type(ToThrift(repetition_));
+ if (converted_type_ != ConvertedType::NONE) {
+ element->__set_converted_type(ToThrift(converted_type_));
+ }
+ if (field_id_ >= 0) {
+ element->__set_field_id(field_id_);
+ }
+ if (logical_type_ && logical_type_->is_serialized()) {
+ element->__set_logicalType(logical_type_->ToThrift());
+ }
+ return;
+}
+
+void PrimitiveNode::ToParquet(void* opaque_element) const {
+ format::SchemaElement* element = static_cast<format::SchemaElement*>(opaque_element);
+ element->__set_name(name_);
+ element->__set_repetition_type(ToThrift(repetition_));
+ if (converted_type_ != ConvertedType::NONE) {
+ if (converted_type_ != ConvertedType::NA) {
+ element->__set_converted_type(ToThrift(converted_type_));
+ } else {
+ // ConvertedType::NA is an unreleased, obsolete synonym for LogicalType::Null.
+ // Never emit it (see PARQUET-1990 for discussion).
+ if (!logical_type_ || !logical_type_->is_null()) {
+ throw ParquetException(
+ "ConvertedType::NA is obsolete, please use LogicalType::Null instead");
+ }
+ }
+ }
+ if (field_id_ >= 0) {
+ element->__set_field_id(field_id_);
+ }
+ if (logical_type_ && logical_type_->is_serialized() &&
+ // TODO(tpboudreau): remove the following conjunct to enable serialization
+ // of IntervalTypes after parquet.thrift recognizes them
+ !logical_type_->is_interval()) {
+ element->__set_logicalType(logical_type_->ToThrift());
+ }
+ element->__set_type(ToThrift(physical_type_));
+ if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
+ element->__set_type_length(type_length_);
+ }
+ if (decimal_metadata_.isset) {
+ element->__set_precision(decimal_metadata_.precision);
+ element->__set_scale(decimal_metadata_.scale);
+ }
+ return;
+}
+
+// ----------------------------------------------------------------------
+// Schema converters
+
+std::unique_ptr<Node> Unflatten(const format::SchemaElement* elements, int length) {
+ if (elements[0].num_children == 0) {
+ if (length == 1) {
+ // Degenerate case of Parquet file with no columns
+ return GroupNode::FromParquet(elements, {});
+ } else {
+ throw ParquetException(
+ "Parquet schema had multiple nodes but root had no children");
+ }
+ }
+
+ // We don't check that the root node is repeated since this is not
+ // consistently set by implementations
+
+ int pos = 0;
+
+ std::function<std::unique_ptr<Node>()> NextNode = [&]() {
+ if (pos == length) {
+ throw ParquetException("Malformed schema: not enough elements");
+ }
+ const SchemaElement& element = elements[pos++];
+ const void* opaque_element = static_cast<const void*>(&element);
+
+ if (element.num_children == 0 && element.__isset.type) {
+ // Leaf (primitive) node: always has a type
+ return PrimitiveNode::FromParquet(opaque_element);
+ } else {
+ // Group node (may have 0 children, but cannot have a type)
+ NodeVector fields;
+ for (int i = 0; i < element.num_children; ++i) {
+ std::unique_ptr<Node> field = NextNode();
+ fields.push_back(NodePtr(field.release()));
+ }
+ return GroupNode::FromParquet(opaque_element, std::move(fields));
+ }
+ };
+ return NextNode();
+}
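+
+// Worked example (sketch): the flattened element sequence
+//
+//   {name: "schema", num_children: 2}
+//   {name: "id",     type: INT64}
+//   {name: "tags",   repetition: REPEATED, num_children: 1}
+//   {name: "tag",    type: BYTE_ARRAY}
+//
+// is consumed depth-first: "schema" claims the next two nodes, and "tags"
+// (a group, since it has children and no type) claims "tag", reconstructing
+// schema -> { id, tags -> { tag } }.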
+
+std::shared_ptr<SchemaDescriptor> FromParquet(const std::vector<SchemaElement>& schema) {
+ if (schema.empty()) {
+ throw ParquetException("Empty file schema (no root)");
+ }
+ std::unique_ptr<Node> root = Unflatten(&schema[0], static_cast<int>(schema.size()));
+ std::shared_ptr<SchemaDescriptor> descr = std::make_shared<SchemaDescriptor>();
+ descr->Init(std::shared_ptr<GroupNode>(static_cast<GroupNode*>(root.release())));
+ return descr;
+}
+
+class SchemaVisitor : public Node::ConstVisitor {
+ public:
+ explicit SchemaVisitor(std::vector<format::SchemaElement>* elements)
+ : elements_(elements) {}
+
+ void Visit(const Node* node) override {
+ format::SchemaElement element;
+ node->ToParquet(&element);
+ elements_->push_back(element);
+
+ if (node->is_group()) {
+ const GroupNode* group_node = static_cast<const GroupNode*>(node);
+ for (int i = 0; i < group_node->field_count(); ++i) {
+ group_node->field(i)->VisitConst(this);
+ }
+ }
+ }
+
+ private:
+ std::vector<format::SchemaElement>* elements_;
+};
+
+void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out) {
+ SchemaVisitor visitor(out);
+ schema->VisitConst(&visitor);
+}
+
+// ----------------------------------------------------------------------
+// Schema printing
+
+static void PrintRepLevel(Repetition::type repetition, std::ostream& stream) {
+ switch (repetition) {
+ case Repetition::REQUIRED:
+ stream << "required";
+ break;
+ case Repetition::OPTIONAL:
+ stream << "optional";
+ break;
+ case Repetition::REPEATED:
+ stream << "repeated";
+ break;
+ default:
+ break;
+ }
+}
+
+static void PrintType(const PrimitiveNode* node, std::ostream& stream) {
+ switch (node->physical_type()) {
+ case Type::BOOLEAN:
+ stream << "boolean";
+ break;
+ case Type::INT32:
+ stream << "int32";
+ break;
+ case Type::INT64:
+ stream << "int64";
+ break;
+ case Type::INT96:
+ stream << "int96";
+ break;
+ case Type::FLOAT:
+ stream << "float";
+ break;
+ case Type::DOUBLE:
+ stream << "double";
+ break;
+ case Type::BYTE_ARRAY:
+ stream << "binary";
+ break;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ stream << "fixed_len_byte_array(" << node->type_length() << ")";
+ break;
+ default:
+ break;
+ }
+}
+
+static void PrintConvertedType(const PrimitiveNode* node, std::ostream& stream) {
+ auto lt = node->converted_type();
+ auto la = node->logical_type();
+ if (la && la->is_valid() && !la->is_none()) {
+ stream << " (" << la->ToString() << ")";
+ } else if (lt == ConvertedType::DECIMAL) {
+ stream << " (" << ConvertedTypeToString(lt) << "("
+ << node->decimal_metadata().precision << "," << node->decimal_metadata().scale
+ << "))";
+ } else if (lt != ConvertedType::NONE) {
+ stream << " (" << ConvertedTypeToString(lt) << ")";
+ }
+}
+
+struct SchemaPrinter : public Node::ConstVisitor {
+ explicit SchemaPrinter(std::ostream& stream, int indent_width)
+      : stream_(stream), indent_(0), indent_width_(indent_width) {}
+
+ void Indent() {
+ if (indent_ > 0) {
+ std::string spaces(indent_, ' ');
+ stream_ << spaces;
+ }
+ }
+
+ void Visit(const Node* node) {
+ Indent();
+ if (node->is_group()) {
+ Visit(static_cast<const GroupNode*>(node));
+ } else {
+ // Primitive
+ Visit(static_cast<const PrimitiveNode*>(node));
+ }
+ }
+
+ void Visit(const PrimitiveNode* node) {
+ PrintRepLevel(node->repetition(), stream_);
+ stream_ << " ";
+ PrintType(node, stream_);
+ stream_ << " field_id=" << node->field_id() << " " << node->name();
+ PrintConvertedType(node, stream_);
+ stream_ << ";" << std::endl;
+ }
+
+ void Visit(const GroupNode* node) {
+ PrintRepLevel(node->repetition(), stream_);
+ stream_ << " group "
+ << "field_id=" << node->field_id() << " " << node->name();
+ auto lt = node->converted_type();
+ auto la = node->logical_type();
+ if (la && la->is_valid() && !la->is_none()) {
+ stream_ << " (" << la->ToString() << ")";
+ } else if (lt != ConvertedType::NONE) {
+ stream_ << " (" << ConvertedTypeToString(lt) << ")";
+ }
+ stream_ << " {" << std::endl;
+
+ indent_ += indent_width_;
+ for (int i = 0; i < node->field_count(); ++i) {
+ node->field(i)->VisitConst(this);
+ }
+ indent_ -= indent_width_;
+ Indent();
+ stream_ << "}" << std::endl;
+ }
+
+ std::ostream& stream_;
+ int indent_;
+ int indent_width_;
+};
+
+void PrintSchema(const Node* schema, std::ostream& stream, int indent_width) {
+ SchemaPrinter printer(stream, indent_width);
+ printer.Visit(schema);
+}
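+
+// For the small schema sketched above Unflatten, PrintSchema emits output of
+// the form (field_id is -1 when the source metadata carried none):
+//
+//   required group field_id=-1 schema {
+//     required int64 field_id=-1 id;
+//     repeated group field_id=-1 tags {
+//       required binary field_id=-1 tag;
+//     }
+//   }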
+
+} // namespace schema
+
+using schema::ColumnPath;
+using schema::GroupNode;
+using schema::Node;
+using schema::NodePtr;
+using schema::PrimitiveNode;
+
+void SchemaDescriptor::Init(std::unique_ptr<schema::Node> schema) {
+ Init(NodePtr(schema.release()));
+}
+
+class SchemaUpdater : public Node::Visitor {
+ public:
+ explicit SchemaUpdater(const std::vector<ColumnOrder>& column_orders)
+ : column_orders_(column_orders), leaf_count_(0) {}
+
+ void Visit(Node* node) override {
+ if (node->is_group()) {
+ GroupNode* group_node = static_cast<GroupNode*>(node);
+ for (int i = 0; i < group_node->field_count(); ++i) {
+ group_node->field(i)->Visit(this);
+ }
+ } else { // leaf node
+ PrimitiveNode* leaf_node = static_cast<PrimitiveNode*>(node);
+ leaf_node->SetColumnOrder(column_orders_[leaf_count_++]);
+ }
+ }
+
+ private:
+ const std::vector<ColumnOrder>& column_orders_;
+ int leaf_count_;
+};
+
+void SchemaDescriptor::updateColumnOrders(const std::vector<ColumnOrder>& column_orders) {
+ if (static_cast<int>(column_orders.size()) != num_columns()) {
+ throw ParquetException("Malformed schema: not enough ColumnOrder values");
+ }
+ SchemaUpdater visitor(column_orders);
+ const_cast<GroupNode*>(group_node_)->Visit(&visitor);
+}
+
+void SchemaDescriptor::Init(NodePtr schema) {
+ schema_ = std::move(schema);
+
+ if (!schema_->is_group()) {
+ throw ParquetException("Must initialize with a schema group");
+ }
+
+ group_node_ = static_cast<const GroupNode*>(schema_.get());
+ leaves_.clear();
+
+ for (int i = 0; i < group_node_->field_count(); ++i) {
+ BuildTree(group_node_->field(i), 0, 0, group_node_->field(i));
+ }
+}
+
+bool SchemaDescriptor::Equals(const SchemaDescriptor& other) const {
+ if (this->num_columns() != other.num_columns()) {
+ return false;
+ }
+
+ for (int i = 0; i < this->num_columns(); ++i) {
+ if (!this->Column(i)->Equals(*other.Column(i))) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void SchemaDescriptor::BuildTree(const NodePtr& node, int16_t max_def_level,
+ int16_t max_rep_level, const NodePtr& base) {
+ if (node->is_optional()) {
+ ++max_def_level;
+ } else if (node->is_repeated()) {
+ // Repeated fields add a definition level. This is used to distinguish
+ // between an empty list and a list with an item in it.
+ ++max_rep_level;
+ ++max_def_level;
+ }
+
+ // Now, walk the schema and create a ColumnDescriptor for each leaf node
+ if (node->is_group()) {
+ const GroupNode* group = static_cast<const GroupNode*>(node.get());
+ for (int i = 0; i < group->field_count(); ++i) {
+ BuildTree(group->field(i), max_def_level, max_rep_level, base);
+ }
+ } else {
+ node_to_leaf_index_[static_cast<const PrimitiveNode*>(node.get())] =
+ static_cast<int>(leaves_.size());
+
+ // Primitive node, append to leaves
+ leaves_.push_back(ColumnDescriptor(node, max_def_level, max_rep_level, this));
+ leaf_to_base_.emplace(static_cast<int>(leaves_.size()) - 1, base);
+ leaf_to_idx_.emplace(node->path()->ToDotString(),
+ static_cast<int>(leaves_.size()) - 1);
+ }
+}
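+
+// Worked example of the level arithmetic (sketch): for
+//
+//   required group schema {
+//     optional group user {          // +1 definition level
+//       repeated group addresses {   // +1 repetition and +1 definition level
+//         required binary city;      // leaf: levels unchanged
+//       }
+//     }
+//   }
+//
+// the leaf "user.addresses.city" gets max_definition_level == 2 and
+// max_repetition_level == 1, and its base (top-level ancestor) is "user".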
+
+int SchemaDescriptor::GetColumnIndex(const PrimitiveNode& node) const {
+ auto it = node_to_leaf_index_.find(&node);
+ if (it == node_to_leaf_index_.end()) {
+ return -1;
+ }
+ return it->second;
+}
+
+ColumnDescriptor::ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
+ int16_t max_repetition_level,
+ const SchemaDescriptor* schema_descr)
+ : node_(std::move(node)),
+ max_definition_level_(max_definition_level),
+ max_repetition_level_(max_repetition_level) {
+ if (!node_->is_primitive()) {
+ throw ParquetException("Must be a primitive type");
+ }
+ primitive_node_ = static_cast<const PrimitiveNode*>(node_.get());
+}
+
+bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const {
+ return primitive_node_->Equals(other.primitive_node_) &&
+ max_repetition_level() == other.max_repetition_level() &&
+ max_definition_level() == other.max_definition_level();
+}
+
+const ColumnDescriptor* SchemaDescriptor::Column(int i) const {
+ DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
+ return &leaves_[i];
+}
+
+int SchemaDescriptor::ColumnIndex(const std::string& node_path) const {
+ auto search = leaf_to_idx_.find(node_path);
+ if (search == leaf_to_idx_.end()) {
+ // Not found
+ return -1;
+ }
+ return search->second;
+}
+
+int SchemaDescriptor::ColumnIndex(const Node& node) const {
+ auto search = leaf_to_idx_.equal_range(node.path()->ToDotString());
+ for (auto it = search.first; it != search.second; ++it) {
+ const int idx = it->second;
+ if (&node == Column(idx)->schema_node().get()) {
+ return idx;
+ }
+ }
+ return -1;
+}
+
+const schema::Node* SchemaDescriptor::GetColumnRoot(int i) const {
+ DCHECK(i >= 0 && i < static_cast<int>(leaves_.size()));
+ return leaf_to_base_.find(i)->second.get();
+}
+
+bool SchemaDescriptor::HasRepeatedFields() const {
+ return group_node_->HasRepeatedFields();
+}
+
+std::string SchemaDescriptor::ToString() const {
+ std::ostringstream ss;
+ PrintSchema(schema_.get(), ss);
+ return ss.str();
+}
+
+std::string ColumnDescriptor::ToString() const {
+ std::ostringstream ss;
+ ss << "column descriptor = {" << std::endl
+ << " name: " << name() << "," << std::endl
+ << " path: " << path()->ToDotString() << "," << std::endl
+ << " physical_type: " << TypeToString(physical_type()) << "," << std::endl
+ << " converted_type: " << ConvertedTypeToString(converted_type()) << ","
+ << std::endl
+ << " logical_type: " << logical_type()->ToString() << "," << std::endl
+ << " max_definition_level: " << max_definition_level() << "," << std::endl
+ << " max_repetition_level: " << max_repetition_level() << "," << std::endl;
+
+ if (physical_type() == ::parquet::Type::FIXED_LEN_BYTE_ARRAY) {
+ ss << " length: " << type_length() << "," << std::endl;
+ }
+
+ if (converted_type() == parquet::ConvertedType::DECIMAL) {
+ ss << " precision: " << type_precision() << "," << std::endl
+ << " scale: " << type_scale() << "," << std::endl;
+ }
+
+ ss << "}";
+ return ss.str();
+}
+
+int ColumnDescriptor::type_scale() const {
+ return primitive_node_->decimal_metadata().scale;
+}
+
+int ColumnDescriptor::type_precision() const {
+ return primitive_node_->decimal_metadata().precision;
+}
+
+int ColumnDescriptor::type_length() const { return primitive_node_->type_length(); }
+
+const std::shared_ptr<ColumnPath> ColumnDescriptor::path() const {
+ return primitive_node_->path();
+}
+
+} // namespace parquet
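
The BuildTree() walk above is what assigns every leaf its maximum definition and repetition levels: each optional ancestor adds one definition level, and each repeated ancestor adds one repetition level plus one definition level. A minimal sketch (an editorial illustration, not part of the diff) of how that plays out for a conventional three-level LIST schema, using only the factories declared in parquet/schema.h:

#include <iostream>
#include "parquet/schema.h"

int main() {
  using namespace parquet;
  using namespace parquet::schema;

  // optional group bag (LIST) { repeated group list { optional int32 item } }
  NodePtr item = PrimitiveNode::Make("item", Repetition::OPTIONAL, Type::INT32);
  NodePtr list = GroupNode::Make("list", Repetition::REPEATED, {item});
  NodePtr bag =
      GroupNode::Make("bag", Repetition::OPTIONAL, {list}, ConvertedType::LIST);

  SchemaDescriptor descr;
  descr.Init(GroupNode::Make("schema", Repetition::REQUIRED, {bag}));

  const ColumnDescriptor* col = descr.Column(0);
  // optional bag: +1 def; repeated list: +1 rep, +1 def; optional item: +1 def
  std::cout << col->path()->ToDotString() << "\n";   // bag.list.item
  std::cout << col->max_definition_level() << "\n";  // 3
  std::cout << col->max_repetition_level() << "\n";  // 1
  return 0;
}
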
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/schema.h b/contrib/libs/apache/arrow/cpp/src/parquet/schema.h
index 63fc4706c7e..7dcfa7d144e 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/schema.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/schema.h
@@ -1,494 +1,494 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// This module contains the logical parquet-cpp types (independent of Thrift
-// structures), schema nodes, and related type tools
-
-#pragma once
-
-#include <cstdint>
-#include <memory>
-#include <ostream>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "parquet/platform.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-class SchemaDescriptor;
-
-namespace schema {
-
-class Node;
-
-// List encodings: using the terminology from Impala to define different styles
-// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
-// the converted type named in the Parquet metadata is ConvertedType::LIST we
-// use that terminology here. It also helps distinguish from the *_ARRAY
-// primitive types.
-//
-// One-level encoding: Only allows required lists with required cells
-// repeated value_type name
-//
-// Two-level encoding: Enables optional lists with only required cells
-// <required/optional> group list
-// repeated value_type item
-//
-// Three-level encoding: Enables optional lists with optional cells
-// <required/optional> group bag
-// repeated group list
-// <required/optional> value_type item
-//
-// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
-// the non-repeated nodes set to required.
-//
-// The "official" encoding recommended in the Parquet spec is the 3-level, and
-// we use that as the default when creating list types. For semantic completeness
-// we allow the other two. Since all types of encodings will occur "in the
-// wild" we need to be able to interpret the associated definition levels in
-// the context of the actual encoding used in the file.
-//
-// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
-// SchemaElement, which could make things challenging if we are trying to infer
-// that a sequence of nodes semantically represents an array according to one
-// of these encodings (versus a struct containing an array). We should refuse
-// the temptation to guess, as they say.
-struct ListEncoding {
- enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL };
-};
-
-class PARQUET_EXPORT ColumnPath {
- public:
- ColumnPath() : path_() {}
- explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
- explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
-
- static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
- static std::shared_ptr<ColumnPath> FromNode(const Node& node);
-
- std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
- std::string ToDotString() const;
- const std::vector<std::string>& ToDotVector() const;
-
- protected:
- std::vector<std::string> path_;
-};
-
-// Base class for logical schema types. A type has a name, repetition level,
-// and optionally a logical type (ConvertedType in Parquet metadata parlance)
-class PARQUET_EXPORT Node {
- public:
- enum type { PRIMITIVE, GROUP };
-
- virtual ~Node() {}
-
- bool is_primitive() const { return type_ == Node::PRIMITIVE; }
-
- bool is_group() const { return type_ == Node::GROUP; }
-
- bool is_optional() const { return repetition_ == Repetition::OPTIONAL; }
-
- bool is_repeated() const { return repetition_ == Repetition::REPEATED; }
-
- bool is_required() const { return repetition_ == Repetition::REQUIRED; }
-
- virtual bool Equals(const Node* other) const = 0;
-
- const std::string& name() const { return name_; }
-
- Node::type node_type() const { return type_; }
-
- Repetition::type repetition() const { return repetition_; }
-
- ConvertedType::type converted_type() const { return converted_type_; }
-
- const std::shared_ptr<const LogicalType>& logical_type() const { return logical_type_; }
-
- /// \brief The field_id value for the serialized SchemaElement. If the
- /// field_id is less than 0 (e.g. -1), it will not be set when serialized to
- /// Thrift.
- int field_id() const { return field_id_; }
-
- PARQUET_DEPRECATED("id() is deprecated. Use field_id() instead")
- int id() const { return field_id_; }
-
- const Node* parent() const { return parent_; }
-
- const std::shared_ptr<ColumnPath> path() const;
-
- virtual void ToParquet(void* element) const = 0;
-
- // Node::Visitor abstract class for walking schemas with the visitor pattern
- class Visitor {
- public:
- virtual ~Visitor() {}
-
- virtual void Visit(Node* node) = 0;
- };
- class ConstVisitor {
- public:
- virtual ~ConstVisitor() {}
-
- virtual void Visit(const Node* node) = 0;
- };
-
- virtual void Visit(Visitor* visitor) = 0;
- virtual void VisitConst(ConstVisitor* visitor) const = 0;
-
- protected:
- friend class GroupNode;
-
- Node(Node::type type, const std::string& name, Repetition::type repetition,
- ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1)
- : type_(type),
- name_(name),
- repetition_(repetition),
- converted_type_(converted_type),
- field_id_(field_id),
- parent_(NULLPTR) {}
-
- Node(Node::type type, const std::string& name, Repetition::type repetition,
- std::shared_ptr<const LogicalType> logical_type, int field_id = -1)
- : type_(type),
- name_(name),
- repetition_(repetition),
- logical_type_(std::move(logical_type)),
- field_id_(field_id),
- parent_(NULLPTR) {}
-
- Node::type type_;
- std::string name_;
- Repetition::type repetition_;
- ConvertedType::type converted_type_;
- std::shared_ptr<const LogicalType> logical_type_;
- int field_id_;
-  // Nodes should not be shared; they have a single parent.
- const Node* parent_;
-
- bool EqualsInternal(const Node* other) const;
- void SetParent(const Node* p_parent);
-
- private:
- PARQUET_DISALLOW_COPY_AND_ASSIGN(Node);
-};
-
-// Save our breath all over the place with these typedefs
-typedef std::shared_ptr<Node> NodePtr;
-typedef std::vector<NodePtr> NodeVector;
-
-// A type that is one of the primitive Parquet storage types. In addition to
-// the other type metadata (name, repetition level, logical type), it also has
-// the physical storage type and its type-specific metadata (byte width,
-// decimal parameters)
-class PARQUET_EXPORT PrimitiveNode : public Node {
- public:
- static std::unique_ptr<Node> FromParquet(const void* opaque_element);
-
- // A field_id -1 (or any negative value) will be serialized as null in Thrift
- static inline NodePtr Make(const std::string& name, Repetition::type repetition,
- Type::type type,
- ConvertedType::type converted_type = ConvertedType::NONE,
- int length = -1, int precision = -1, int scale = -1,
- int field_id = -1) {
- return NodePtr(new PrimitiveNode(name, repetition, type, converted_type, length,
- precision, scale, field_id));
- }
-
- // If no logical type, pass LogicalType::None() or nullptr
- // A field_id -1 (or any negative value) will be serialized as null in Thrift
- static inline NodePtr Make(const std::string& name, Repetition::type repetition,
- std::shared_ptr<const LogicalType> logical_type,
- Type::type primitive_type, int primitive_length = -1,
- int field_id = -1) {
- return NodePtr(new PrimitiveNode(name, repetition, logical_type, primitive_type,
- primitive_length, field_id));
- }
-
- bool Equals(const Node* other) const override;
-
- Type::type physical_type() const { return physical_type_; }
-
- ColumnOrder column_order() const { return column_order_; }
-
- void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; }
-
- int32_t type_length() const { return type_length_; }
-
- const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }
-
- void ToParquet(void* element) const override;
- void Visit(Visitor* visitor) override;
- void VisitConst(ConstVisitor* visitor) const override;
-
- private:
- PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type,
- ConvertedType::type converted_type = ConvertedType::NONE, int length = -1,
- int precision = -1, int scale = -1, int field_id = -1);
-
- PrimitiveNode(const std::string& name, Repetition::type repetition,
- std::shared_ptr<const LogicalType> logical_type,
- Type::type primitive_type, int primitive_length = -1, int field_id = -1);
-
- Type::type physical_type_;
- int32_t type_length_;
- DecimalMetadata decimal_metadata_;
- ColumnOrder column_order_;
-
- // For FIXED_LEN_BYTE_ARRAY
- void SetTypeLength(int32_t length) { type_length_ = length; }
-
- bool EqualsInternal(const PrimitiveNode* other) const;
-
- FRIEND_TEST(TestPrimitiveNode, Attrs);
- FRIEND_TEST(TestPrimitiveNode, Equals);
- FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
- FRIEND_TEST(TestPrimitiveNode, FromParquet);
-};
-
-class PARQUET_EXPORT GroupNode : public Node {
- public:
- static std::unique_ptr<Node> FromParquet(const void* opaque_element,
- NodeVector fields = {});
-
- // A field_id -1 (or any negative value) will be serialized as null in Thrift
- static inline NodePtr Make(const std::string& name, Repetition::type repetition,
- const NodeVector& fields,
- ConvertedType::type converted_type = ConvertedType::NONE,
- int field_id = -1) {
- return NodePtr(new GroupNode(name, repetition, fields, converted_type, field_id));
- }
-
- // If no logical type, pass nullptr
- // A field_id -1 (or any negative value) will be serialized as null in Thrift
- static inline NodePtr Make(const std::string& name, Repetition::type repetition,
- const NodeVector& fields,
- std::shared_ptr<const LogicalType> logical_type,
- int field_id = -1) {
- return NodePtr(new GroupNode(name, repetition, fields, logical_type, field_id));
- }
-
- bool Equals(const Node* other) const override;
-
- NodePtr field(int i) const { return fields_[i]; }
- // Get the index of a field by its name, or negative value if not found.
- // If several fields share the same name, it is unspecified which one
- // is returned.
- int FieldIndex(const std::string& name) const;
- // Get the index of a field by its node, or negative value if not found.
- int FieldIndex(const Node& node) const;
-
- int field_count() const { return static_cast<int>(fields_.size()); }
-
- void ToParquet(void* element) const override;
- void Visit(Visitor* visitor) override;
- void VisitConst(ConstVisitor* visitor) const override;
-
- /// \brief Return true if this node or any child node has REPEATED repetition
- /// type
- bool HasRepeatedFields() const;
-
- private:
- GroupNode(const std::string& name, Repetition::type repetition,
- const NodeVector& fields,
- ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1);
-
- GroupNode(const std::string& name, Repetition::type repetition,
- const NodeVector& fields, std::shared_ptr<const LogicalType> logical_type,
- int field_id = -1);
-
- NodeVector fields_;
- bool EqualsInternal(const GroupNode* other) const;
-
-  // Mapping from field name to the field index
- std::unordered_multimap<std::string, int> field_name_to_idx_;
-
- FRIEND_TEST(TestGroupNode, Attrs);
- FRIEND_TEST(TestGroupNode, Equals);
- FRIEND_TEST(TestGroupNode, FieldIndex);
- FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName);
-};
-
-// ----------------------------------------------------------------------
-// Convenience primitive type factory functions
-
-#define PRIMITIVE_FACTORY(FuncName, TYPE) \
- static inline NodePtr FuncName(const std::string& name, \
- Repetition::type repetition = Repetition::OPTIONAL, \
- int field_id = -1) { \
- return PrimitiveNode::Make(name, repetition, Type::TYPE, ConvertedType::NONE, \
- /*length=*/-1, /*precision=*/-1, /*scale=*/-1, field_id); \
- }
-
-PRIMITIVE_FACTORY(Boolean, BOOLEAN)
-PRIMITIVE_FACTORY(Int32, INT32)
-PRIMITIVE_FACTORY(Int64, INT64)
-PRIMITIVE_FACTORY(Int96, INT96)
-PRIMITIVE_FACTORY(Float, FLOAT)
-PRIMITIVE_FACTORY(Double, DOUBLE)
-PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY)
-
-void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream,
- int indent_width = 2);
-
-} // namespace schema
-
-// The ColumnDescriptor encapsulates information necessary to interpret
-// primitive column data in the context of a particular schema. We have to
-// examine the node structure of a column's path to the root in the schema tree
-// to be able to reassemble the nested structure from the repetition and
-// definition levels.
-class PARQUET_EXPORT ColumnDescriptor {
- public:
- ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
- int16_t max_repetition_level,
- const SchemaDescriptor* schema_descr = NULLPTR);
-
- bool Equals(const ColumnDescriptor& other) const;
-
- int16_t max_definition_level() const { return max_definition_level_; }
-
- int16_t max_repetition_level() const { return max_repetition_level_; }
-
- Type::type physical_type() const { return primitive_node_->physical_type(); }
-
- ConvertedType::type converted_type() const { return primitive_node_->converted_type(); }
-
- const std::shared_ptr<const LogicalType>& logical_type() const {
- return primitive_node_->logical_type();
- }
-
- ColumnOrder column_order() const { return primitive_node_->column_order(); }
-
- SortOrder::type sort_order() const {
- auto la = logical_type();
- auto pt = physical_type();
- return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt);
- }
-
- const std::string& name() const { return primitive_node_->name(); }
-
- const std::shared_ptr<schema::ColumnPath> path() const;
-
- const schema::NodePtr& schema_node() const { return node_; }
-
- std::string ToString() const;
-
- int type_length() const;
-
- int type_precision() const;
-
- int type_scale() const;
-
- private:
- schema::NodePtr node_;
- const schema::PrimitiveNode* primitive_node_;
-
- int16_t max_definition_level_;
- int16_t max_repetition_level_;
-};
-
-// Container for the converted Parquet schema, with computed information from
-// the schema analysis needed for file reading
-//
-// * Column index to Node
-// * Max repetition / definition levels for each primitive node
-//
-// The ColumnDescriptor objects produced by this class can be used to assist in
-// the reconstruction of fully materialized data structures from the
-// repetition-definition level encoding of nested data
-//
-// TODO(wesm): this object can be recomputed from a Schema
-class PARQUET_EXPORT SchemaDescriptor {
- public:
- SchemaDescriptor() {}
- ~SchemaDescriptor() {}
-
- // Analyze the schema
- void Init(std::unique_ptr<schema::Node> schema);
- void Init(schema::NodePtr schema);
-
- const ColumnDescriptor* Column(int i) const;
-
- // Get the index of a column by its dotstring path, or negative value if not found.
- // If several columns share the same dotstring path, it is unspecified which one
- // is returned.
- int ColumnIndex(const std::string& node_path) const;
- // Get the index of a column by its node, or negative value if not found.
- int ColumnIndex(const schema::Node& node) const;
-
- bool Equals(const SchemaDescriptor& other) const;
-
- // The number of physical columns appearing in the file
- int num_columns() const { return static_cast<int>(leaves_.size()); }
-
- const schema::NodePtr& schema_root() const { return schema_; }
-
- const schema::GroupNode* group_node() const { return group_node_; }
-
-  // Returns the root node (child of the schema root) of the given leaf (column) node
- const schema::Node* GetColumnRoot(int i) const;
-
- const std::string& name() const { return group_node_->name(); }
-
- std::string ToString() const;
-
- void updateColumnOrders(const std::vector<ColumnOrder>& column_orders);
-
- /// \brief Return column index corresponding to a particular
- /// PrimitiveNode. Returns -1 if not found
- int GetColumnIndex(const schema::PrimitiveNode& node) const;
-
-  /// \brief Return true if any field, or any of its children, has REPEATED
-  /// repetition type
- bool HasRepeatedFields() const;
-
- private:
- friend class ColumnDescriptor;
-
- // Root Node
- schema::NodePtr schema_;
-  // Root node, cast to a GroupNode
- const schema::GroupNode* group_node_;
-
- void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
- int16_t max_rep_level, const schema::NodePtr& base);
-
- // Result of leaf node / tree analysis
- std::vector<ColumnDescriptor> leaves_;
-
- std::unordered_map<const schema::PrimitiveNode*, int> node_to_leaf_index_;
-
- // Mapping between leaf nodes and root group of leaf (first node
- // below the schema's root group)
- //
- // For example, the leaf `a.b.c.d` would have a link back to `a`
- //
- // -- a <------
- // -- -- b |
- // -- -- -- c |
- // -- -- -- -- d
- std::unordered_map<int, schema::NodePtr> leaf_to_base_;
-
-  // Mapping from ColumnPath dotstring to the leaf index
- std::unordered_multimap<std::string, int> leaf_to_idx_;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module contains the logical parquet-cpp types (independent of Thrift
+// structures), schema nodes, and related type tools
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+class SchemaDescriptor;
+
+namespace schema {
+
+class Node;
+
+// List encodings: using the terminology from Impala to define different styles
+// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
+// the converted type named in the Parquet metadata is ConvertedType::LIST we
+// use that terminology here. It also helps distinguish from the *_ARRAY
+// primitive types.
+//
+// One-level encoding: Only allows required lists with required cells
+// repeated value_type name
+//
+// Two-level encoding: Enables optional lists with only required cells
+// <required/optional> group list
+// repeated value_type item
+//
+// Three-level encoding: Enables optional lists with optional cells
+// <required/optional> group bag
+// repeated group list
+// <required/optional> value_type item
+//
+// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
+// the non-repeated nodes set to required.
+//
+// The "official" encoding recommended in the Parquet spec is the 3-level, and
+// we use that as the default when creating list types. For semantic completeness
+// we allow the other two. Since all types of encodings will occur "in the
+// wild" we need to be able to interpret the associated definition levels in
+// the context of the actual encoding used in the file.
+//
+// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
+// SchemaElement, which could make things challenging if we are trying to infer
+// that a sequence of nodes semantically represents an array according to one
+// of these encodings (versus a struct containing an array). We should refuse
+// the temptation to guess, as they say.
+struct ListEncoding {
+ enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL };
+};
+
+class PARQUET_EXPORT ColumnPath {
+ public:
+ ColumnPath() : path_() {}
+ explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
+ explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
+
+ static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
+ static std::shared_ptr<ColumnPath> FromNode(const Node& node);
+
+ std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
+ std::string ToDotString() const;
+ const std::vector<std::string>& ToDotVector() const;
+
+ protected:
+ std::vector<std::string> path_;
+};
+
+// Base class for logical schema types. A type has a name, repetition level,
+// and optionally a logical type (ConvertedType in Parquet metadata parlance)
+class PARQUET_EXPORT Node {
+ public:
+ enum type { PRIMITIVE, GROUP };
+
+ virtual ~Node() {}
+
+ bool is_primitive() const { return type_ == Node::PRIMITIVE; }
+
+ bool is_group() const { return type_ == Node::GROUP; }
+
+ bool is_optional() const { return repetition_ == Repetition::OPTIONAL; }
+
+ bool is_repeated() const { return repetition_ == Repetition::REPEATED; }
+
+ bool is_required() const { return repetition_ == Repetition::REQUIRED; }
+
+ virtual bool Equals(const Node* other) const = 0;
+
+ const std::string& name() const { return name_; }
+
+ Node::type node_type() const { return type_; }
+
+ Repetition::type repetition() const { return repetition_; }
+
+ ConvertedType::type converted_type() const { return converted_type_; }
+
+ const std::shared_ptr<const LogicalType>& logical_type() const { return logical_type_; }
+
+ /// \brief The field_id value for the serialized SchemaElement. If the
+ /// field_id is less than 0 (e.g. -1), it will not be set when serialized to
+ /// Thrift.
+ int field_id() const { return field_id_; }
+
+ PARQUET_DEPRECATED("id() is deprecated. Use field_id() instead")
+ int id() const { return field_id_; }
+
+ const Node* parent() const { return parent_; }
+
+ const std::shared_ptr<ColumnPath> path() const;
+
+ virtual void ToParquet(void* element) const = 0;
+
+ // Node::Visitor abstract class for walking schemas with the visitor pattern
+ class Visitor {
+ public:
+ virtual ~Visitor() {}
+
+ virtual void Visit(Node* node) = 0;
+ };
+ class ConstVisitor {
+ public:
+ virtual ~ConstVisitor() {}
+
+ virtual void Visit(const Node* node) = 0;
+ };
+
+ virtual void Visit(Visitor* visitor) = 0;
+ virtual void VisitConst(ConstVisitor* visitor) const = 0;
+
+ protected:
+ friend class GroupNode;
+
+ Node(Node::type type, const std::string& name, Repetition::type repetition,
+ ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1)
+ : type_(type),
+ name_(name),
+ repetition_(repetition),
+ converted_type_(converted_type),
+ field_id_(field_id),
+ parent_(NULLPTR) {}
+
+ Node(Node::type type, const std::string& name, Repetition::type repetition,
+ std::shared_ptr<const LogicalType> logical_type, int field_id = -1)
+ : type_(type),
+ name_(name),
+ repetition_(repetition),
+ logical_type_(std::move(logical_type)),
+ field_id_(field_id),
+ parent_(NULLPTR) {}
+
+ Node::type type_;
+ std::string name_;
+ Repetition::type repetition_;
+ ConvertedType::type converted_type_;
+ std::shared_ptr<const LogicalType> logical_type_;
+ int field_id_;
+  // Nodes should not be shared; they have a single parent.
+ const Node* parent_;
+
+ bool EqualsInternal(const Node* other) const;
+ void SetParent(const Node* p_parent);
+
+ private:
+ PARQUET_DISALLOW_COPY_AND_ASSIGN(Node);
+};
+
+// Save our breath all over the place with these typedefs
+typedef std::shared_ptr<Node> NodePtr;
+typedef std::vector<NodePtr> NodeVector;
+
+// A type that is one of the primitive Parquet storage types. In addition to
+// the other type metadata (name, repetition level, logical type), it also has
+// the physical storage type and its type-specific metadata (byte width,
+// decimal parameters)
+class PARQUET_EXPORT PrimitiveNode : public Node {
+ public:
+ static std::unique_ptr<Node> FromParquet(const void* opaque_element);
+
+ // A field_id -1 (or any negative value) will be serialized as null in Thrift
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ Type::type type,
+ ConvertedType::type converted_type = ConvertedType::NONE,
+ int length = -1, int precision = -1, int scale = -1,
+ int field_id = -1) {
+ return NodePtr(new PrimitiveNode(name, repetition, type, converted_type, length,
+ precision, scale, field_id));
+ }
+
+ // If no logical type, pass LogicalType::None() or nullptr
+ // A field_id -1 (or any negative value) will be serialized as null in Thrift
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ std::shared_ptr<const LogicalType> logical_type,
+ Type::type primitive_type, int primitive_length = -1,
+ int field_id = -1) {
+ return NodePtr(new PrimitiveNode(name, repetition, logical_type, primitive_type,
+ primitive_length, field_id));
+ }
+
+ bool Equals(const Node* other) const override;
+
+ Type::type physical_type() const { return physical_type_; }
+
+ ColumnOrder column_order() const { return column_order_; }
+
+ void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; }
+
+ int32_t type_length() const { return type_length_; }
+
+ const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }
+
+ void ToParquet(void* element) const override;
+ void Visit(Visitor* visitor) override;
+ void VisitConst(ConstVisitor* visitor) const override;
+
+ private:
+ PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type,
+ ConvertedType::type converted_type = ConvertedType::NONE, int length = -1,
+ int precision = -1, int scale = -1, int field_id = -1);
+
+ PrimitiveNode(const std::string& name, Repetition::type repetition,
+ std::shared_ptr<const LogicalType> logical_type,
+ Type::type primitive_type, int primitive_length = -1, int field_id = -1);
+
+ Type::type physical_type_;
+ int32_t type_length_;
+ DecimalMetadata decimal_metadata_;
+ ColumnOrder column_order_;
+
+ // For FIXED_LEN_BYTE_ARRAY
+ void SetTypeLength(int32_t length) { type_length_ = length; }
+
+ bool EqualsInternal(const PrimitiveNode* other) const;
+
+ FRIEND_TEST(TestPrimitiveNode, Attrs);
+ FRIEND_TEST(TestPrimitiveNode, Equals);
+ FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
+ FRIEND_TEST(TestPrimitiveNode, FromParquet);
+};
+
+class PARQUET_EXPORT GroupNode : public Node {
+ public:
+ static std::unique_ptr<Node> FromParquet(const void* opaque_element,
+ NodeVector fields = {});
+
+ // A field_id -1 (or any negative value) will be serialized as null in Thrift
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields,
+ ConvertedType::type converted_type = ConvertedType::NONE,
+ int field_id = -1) {
+ return NodePtr(new GroupNode(name, repetition, fields, converted_type, field_id));
+ }
+
+ // If no logical type, pass nullptr
+ // A field_id -1 (or any negative value) will be serialized as null in Thrift
+ static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields,
+ std::shared_ptr<const LogicalType> logical_type,
+ int field_id = -1) {
+ return NodePtr(new GroupNode(name, repetition, fields, logical_type, field_id));
+ }
+
+ bool Equals(const Node* other) const override;
+
+ NodePtr field(int i) const { return fields_[i]; }
+ // Get the index of a field by its name, or negative value if not found.
+ // If several fields share the same name, it is unspecified which one
+ // is returned.
+ int FieldIndex(const std::string& name) const;
+ // Get the index of a field by its node, or negative value if not found.
+ int FieldIndex(const Node& node) const;
+
+ int field_count() const { return static_cast<int>(fields_.size()); }
+
+ void ToParquet(void* element) const override;
+ void Visit(Visitor* visitor) override;
+ void VisitConst(ConstVisitor* visitor) const override;
+
+ /// \brief Return true if this node or any child node has REPEATED repetition
+ /// type
+ bool HasRepeatedFields() const;
+
+ private:
+ GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields,
+ ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1);
+
+ GroupNode(const std::string& name, Repetition::type repetition,
+ const NodeVector& fields, std::shared_ptr<const LogicalType> logical_type,
+ int field_id = -1);
+
+ NodeVector fields_;
+ bool EqualsInternal(const GroupNode* other) const;
+
+  // Mapping from field name to the field index
+ std::unordered_multimap<std::string, int> field_name_to_idx_;
+
+ FRIEND_TEST(TestGroupNode, Attrs);
+ FRIEND_TEST(TestGroupNode, Equals);
+ FRIEND_TEST(TestGroupNode, FieldIndex);
+ FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName);
+};
+
+// ----------------------------------------------------------------------
+// Convenience primitive type factory functions
+
+#define PRIMITIVE_FACTORY(FuncName, TYPE) \
+ static inline NodePtr FuncName(const std::string& name, \
+ Repetition::type repetition = Repetition::OPTIONAL, \
+ int field_id = -1) { \
+ return PrimitiveNode::Make(name, repetition, Type::TYPE, ConvertedType::NONE, \
+ /*length=*/-1, /*precision=*/-1, /*scale=*/-1, field_id); \
+ }
+
+PRIMITIVE_FACTORY(Boolean, BOOLEAN)
+PRIMITIVE_FACTORY(Int32, INT32)
+PRIMITIVE_FACTORY(Int64, INT64)
+PRIMITIVE_FACTORY(Int96, INT96)
+PRIMITIVE_FACTORY(Float, FLOAT)
+PRIMITIVE_FACTORY(Double, DOUBLE)
+PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY)
+
+void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream,
+ int indent_width = 2);
+
+} // namespace schema
+
+// The ColumnDescriptor encapsulates information necessary to interpret
+// primitive column data in the context of a particular schema. We have to
+// examine the node structure of a column's path to the root in the schema tree
+// to be able to reassemble the nested structure from the repetition and
+// definition levels.
+class PARQUET_EXPORT ColumnDescriptor {
+ public:
+ ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
+ int16_t max_repetition_level,
+ const SchemaDescriptor* schema_descr = NULLPTR);
+
+ bool Equals(const ColumnDescriptor& other) const;
+
+ int16_t max_definition_level() const { return max_definition_level_; }
+
+ int16_t max_repetition_level() const { return max_repetition_level_; }
+
+ Type::type physical_type() const { return primitive_node_->physical_type(); }
+
+ ConvertedType::type converted_type() const { return primitive_node_->converted_type(); }
+
+ const std::shared_ptr<const LogicalType>& logical_type() const {
+ return primitive_node_->logical_type();
+ }
+
+ ColumnOrder column_order() const { return primitive_node_->column_order(); }
+
+ SortOrder::type sort_order() const {
+ auto la = logical_type();
+ auto pt = physical_type();
+ return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt);
+ }
+
+ const std::string& name() const { return primitive_node_->name(); }
+
+ const std::shared_ptr<schema::ColumnPath> path() const;
+
+ const schema::NodePtr& schema_node() const { return node_; }
+
+ std::string ToString() const;
+
+ int type_length() const;
+
+ int type_precision() const;
+
+ int type_scale() const;
+
+ private:
+ schema::NodePtr node_;
+ const schema::PrimitiveNode* primitive_node_;
+
+ int16_t max_definition_level_;
+ int16_t max_repetition_level_;
+};
+
+// Container for the converted Parquet schema, with computed information from
+// the schema analysis needed for file reading
+//
+// * Column index to Node
+// * Max repetition / definition levels for each primitive node
+//
+// The ColumnDescriptor objects produced by this class can be used to assist in
+// the reconstruction of fully materialized data structures from the
+// repetition-definition level encoding of nested data
+//
+// TODO(wesm): this object can be recomputed from a Schema
+class PARQUET_EXPORT SchemaDescriptor {
+ public:
+ SchemaDescriptor() {}
+ ~SchemaDescriptor() {}
+
+ // Analyze the schema
+ void Init(std::unique_ptr<schema::Node> schema);
+ void Init(schema::NodePtr schema);
+
+ const ColumnDescriptor* Column(int i) const;
+
+ // Get the index of a column by its dotstring path, or negative value if not found.
+ // If several columns share the same dotstring path, it is unspecified which one
+ // is returned.
+ int ColumnIndex(const std::string& node_path) const;
+ // Get the index of a column by its node, or negative value if not found.
+ int ColumnIndex(const schema::Node& node) const;
+
+ bool Equals(const SchemaDescriptor& other) const;
+
+ // The number of physical columns appearing in the file
+ int num_columns() const { return static_cast<int>(leaves_.size()); }
+
+ const schema::NodePtr& schema_root() const { return schema_; }
+
+ const schema::GroupNode* group_node() const { return group_node_; }
+
+  // Returns the root node (child of the schema root) of the given leaf (column) node
+ const schema::Node* GetColumnRoot(int i) const;
+
+ const std::string& name() const { return group_node_->name(); }
+
+ std::string ToString() const;
+
+ void updateColumnOrders(const std::vector<ColumnOrder>& column_orders);
+
+ /// \brief Return column index corresponding to a particular
+ /// PrimitiveNode. Returns -1 if not found
+ int GetColumnIndex(const schema::PrimitiveNode& node) const;
+
+  /// \brief Return true if any field, or any of its children, has REPEATED
+  /// repetition type
+ bool HasRepeatedFields() const;
+
+ private:
+ friend class ColumnDescriptor;
+
+ // Root Node
+ schema::NodePtr schema_;
+  // Root node, cast to a GroupNode
+ const schema::GroupNode* group_node_;
+
+ void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
+ int16_t max_rep_level, const schema::NodePtr& base);
+
+ // Result of leaf node / tree analysis
+ std::vector<ColumnDescriptor> leaves_;
+
+ std::unordered_map<const schema::PrimitiveNode*, int> node_to_leaf_index_;
+
+ // Mapping between leaf nodes and root group of leaf (first node
+ // below the schema's root group)
+ //
+ // For example, the leaf `a.b.c.d` would have a link back to `a`
+ //
+ // -- a <------
+ // -- -- b |
+ // -- -- -- c |
+ // -- -- -- -- d
+ std::unordered_map<int, schema::NodePtr> leaf_to_base_;
+
+  // Mapping from ColumnPath dotstring to the leaf index
+ std::unordered_multimap<std::string, int> leaf_to_idx_;
+};
+
+} // namespace parquet
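
As a quick illustration of the header above (again an editorial sketch, not part of the diff): the PRIMITIVE_FACTORY shorthands default to Repetition::OPTIONAL, and the SchemaDescriptor lookups return a negative value when a path is not found.

#include <cassert>
#include "parquet/schema.h"

int main() {
  using namespace parquet;
  using namespace parquet::schema;

  // Int32/ByteArray/Double expand to PrimitiveNode::Make (see PRIMITIVE_FACTORY).
  NodePtr root = GroupNode::Make(
      "schema", Repetition::REQUIRED,
      {Int32("id", Repetition::REQUIRED), ByteArray("name"), Double("score")});

  SchemaDescriptor descr;
  descr.Init(root);

  assert(descr.num_columns() == 3);
  assert(descr.ColumnIndex("name") == 1);      // leaf found by its dotstring path
  assert(descr.ColumnIndex("missing") == -1);  // not found -> negative value
  assert(descr.GetColumnRoot(1)->name() == "name");  // a flat leaf is its own root
  return 0;
}
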
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h
index 42102884bb0..c0cfffc87e2 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/schema_internal.h
@@ -1,54 +1,54 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// Non-public Thrift schema serialization utilities
-
-#pragma once
-
-#include <memory>
-#include <vector>
-
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-#include "parquet/types.h"
-
-namespace parquet {
-
-namespace format {
-class SchemaElement;
-}
-
-namespace schema {
-
-// ----------------------------------------------------------------------
-// Conversion from Parquet Thrift metadata
-
-PARQUET_EXPORT
-std::shared_ptr<SchemaDescriptor> FromParquet(
- const std::vector<format::SchemaElement>& schema);
-
-PARQUET_EXPORT
-std::unique_ptr<Node> Unflatten(const format::SchemaElement* elements, int length);
-
-// ----------------------------------------------------------------------
-// Conversion to Parquet Thrift metadata
-
-PARQUET_EXPORT
-void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out);
-
-} // namespace schema
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Non-public Thrift schema serialization utilities
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+namespace format {
+class SchemaElement;
+}
+
+namespace schema {
+
+// ----------------------------------------------------------------------
+// Conversion from Parquet Thrift metadata
+
+PARQUET_EXPORT
+std::shared_ptr<SchemaDescriptor> FromParquet(
+ const std::vector<format::SchemaElement>& schema);
+
+PARQUET_EXPORT
+std::unique_ptr<Node> Unflatten(const format::SchemaElement* elements, int length);
+
+// ----------------------------------------------------------------------
+// Conversion to Parquet Thrift metadata
+
+PARQUET_EXPORT
+void ToParquet(const GroupNode* schema, std::vector<format::SchemaElement>* out);
+
+} // namespace schema
+} // namespace parquet
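
The two declarations above round-trip a schema through its Thrift representation. A hedged sketch of that round trip follows; the include path for the Thrift-generated types ("generated/parquet_types.h") is an assumption about the internal build tree, since schema_internal.h only forward-declares format::SchemaElement.

#include <cassert>
#include <memory>
#include <vector>
#include "generated/parquet_types.h"  // assumed location of format::SchemaElement
#include "parquet/schema.h"
#include "parquet/schema_internal.h"

int main() {
  using namespace parquet;
  using namespace parquet::schema;

  NodePtr root = GroupNode::Make("schema", Repetition::REQUIRED,
                                 {Int64("ts"), ByteArray("payload")});

  // Flatten the node tree into Thrift SchemaElements (root element first)...
  std::vector<format::SchemaElement> elements;
  ToParquet(static_cast<const GroupNode*>(root.get()), &elements);
  assert(elements.size() == 3);  // root group plus two leaves

  // ...then rebuild the tree and compare the logical structure.
  std::unique_ptr<Node> rebuilt =
      Unflatten(elements.data(), static_cast<int>(elements.size()));
  assert(rebuilt->Equals(root.get()));
  return 0;
}
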
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc b/contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc
index 3b037ac74bf..72341590e75 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/statistics.cc
@@ -1,885 +1,885 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/statistics.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstring>
-#include <limits>
-#include <type_traits>
-#include <utility>
-
-#include "arrow/array.h"
-#include "arrow/type.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/bit_run_reader.h"
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/logging.h"
-#include "arrow/util/optional.h"
-#include "arrow/util/ubsan.h"
-#include "arrow/visitor_inline.h"
-#include "parquet/encoding.h"
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/schema.h"
-
-using arrow::default_memory_pool;
-using arrow::MemoryPool;
-using arrow::internal::checked_cast;
-using arrow::util::SafeCopy;
-
-namespace parquet {
-namespace {
-
-// ----------------------------------------------------------------------
-// Comparator implementations
-
-constexpr int value_length(int value_length, const ByteArray& value) { return value.len; }
-constexpr int value_length(int type_length, const FLBA& value) { return type_length; }
-
-template <typename DType, bool is_signed>
-struct CompareHelper {
- using T = typename DType::c_type;
-
- static_assert(!std::is_unsigned<T>::value || std::is_same<T, bool>::value,
- "T is an unsigned numeric");
-
- constexpr static T DefaultMin() { return std::numeric_limits<T>::max(); }
- constexpr static T DefaultMax() { return std::numeric_limits<T>::lowest(); }
-
-  // MSVC17 fix: std::isnan is not overloaded for integral types, as per C++11
-  // standard requirements.
- template <typename T1 = T>
- static ::arrow::enable_if_t<std::is_floating_point<T1>::value, T> Coalesce(T val,
- T fallback) {
- return std::isnan(val) ? fallback : val;
- }
-
- template <typename T1 = T>
- static ::arrow::enable_if_t<!std::is_floating_point<T1>::value, T> Coalesce(
- T val, T fallback) {
- return val;
- }
-
- static inline bool Compare(int type_length, const T& a, const T& b) { return a < b; }
-
- static T Min(int type_length, T a, T b) { return a < b ? a : b; }
- static T Max(int type_length, T a, T b) { return a < b ? b : a; }
-};
-
-template <typename DType>
-struct UnsignedCompareHelperBase {
- using T = typename DType::c_type;
- using UCType = typename std::make_unsigned<T>::type;
-
- static_assert(!std::is_same<T, UCType>::value, "T is unsigned");
- static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size");
-
- // NOTE: according to the C++ spec, unsigned-to-signed conversion is
- // implementation-defined if the original value does not fit in the signed type
- // (i.e., two's complement cannot be assumed even on mainstream machines,
- // because the compiler may decide otherwise). Hence the use of `SafeCopy`
- // below for deterministic bit-casting.
- // (see "Integer conversions" in
- // https://en.cppreference.com/w/cpp/language/implicit_conversion)
-
- static const T DefaultMin() { return SafeCopy<T>(std::numeric_limits<UCType>::max()); }
- static const T DefaultMax() { return 0; }
-
- static T Coalesce(T val, T fallback) { return val; }
-
- static bool Compare(int type_length, T a, T b) {
- return SafeCopy<UCType>(a) < SafeCopy<UCType>(b);
- }
-
- static T Min(int type_length, T a, T b) { return Compare(type_length, a, b) ? a : b; }
- static T Max(int type_length, T a, T b) { return Compare(type_length, a, b) ? b : a; }
-};
-
-template <>
-struct CompareHelper<Int32Type, false> : public UnsignedCompareHelperBase<Int32Type> {};
-
-template <>
-struct CompareHelper<Int64Type, false> : public UnsignedCompareHelperBase<Int64Type> {};
-
-template <bool is_signed>
-struct CompareHelper<Int96Type, is_signed> {
- using T = typename Int96Type::c_type;
- using msb_type = typename std::conditional<is_signed, int32_t, uint32_t>::type;
-
- static T DefaultMin() {
- uint32_t kMsbMax = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::max());
- uint32_t kMax = std::numeric_limits<uint32_t>::max();
- return {kMax, kMax, kMsbMax};
- }
- static T DefaultMax() {
- uint32_t kMsbMin = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::min());
- uint32_t kMin = std::numeric_limits<uint32_t>::min();
- return {kMin, kMin, kMsbMin};
- }
- static T Coalesce(T val, T fallback) { return val; }
-
- static inline bool Compare(int type_length, const T& a, const T& b) {
- if (a.value[2] != b.value[2]) {
-      // Only the most significant word uses signed comparison. For little-endian,
-      // this is the last 32-bit word of the Int96 value.
- return SafeCopy<msb_type>(a.value[2]) < SafeCopy<msb_type>(b.value[2]);
- } else if (a.value[1] != b.value[1]) {
- return (a.value[1] < b.value[1]);
- }
- return (a.value[0] < b.value[0]);
- }
-
- static T Min(int type_length, const T& a, const T& b) {
- return Compare(0, a, b) ? a : b;
- }
- static T Max(int type_length, const T& a, const T& b) {
- return Compare(0, a, b) ? b : a;
- }
-};
-
-template <typename T, bool is_signed>
-struct BinaryLikeComparer {};
-
-template <typename T>
-struct BinaryLikeComparer<T, /*is_signed=*/false> {
- static bool Compare(int type_length, const T& a, const T& b) {
- int a_length = value_length(type_length, a);
- int b_length = value_length(type_length, b);
-    // Unsigned comparison is used for non-numeric types, so straight
-    // lexicographic comparison makes sense (a.ptr is always unsigned).
- return std::lexicographical_compare(a.ptr, a.ptr + a_length, b.ptr, b.ptr + b_length);
- }
-};
-
-template <typename T>
-struct BinaryLikeComparer<T, /*is_signed=*/true> {
- static bool Compare(int type_length, const T& a, const T& b) {
-    // Signed comparison is used for integers encoded as big-endian two's
-    // complement integers (e.g. decimals).
- int a_length = value_length(type_length, a);
- int b_length = value_length(type_length, b);
-
-    // At least one of the lengths is zero.
- if (a_length == 0 || b_length == 0) {
- return a_length == 0 && b_length > 0;
- }
-
- int8_t first_a = *a.ptr;
- int8_t first_b = *b.ptr;
-    // We can short-circuit for numbers of different sign, or for equal-length
-    // byte arrays that have different first bytes. The equality requirement is
-    // necessary for sign-extension cases: 0xFF10 should be equal to 0x10
-    // (due to big-endian sign extension).
- if ((0x80 & first_a) != (0x80 & first_b) ||
- (a_length == b_length && first_a != first_b)) {
- return first_a < first_b;
- }
-    // When the lengths are unequal and the numbers are of the same sign, we
-    // need to compare by sign-extending the shorter value first; once we get
-    // to equal-sized arrays, lexicographical unsigned comparison of everything
-    // but the first byte is sufficient.
- const uint8_t* a_start = a.ptr;
- const uint8_t* b_start = b.ptr;
- if (a_length != b_length) {
- const uint8_t* lead_start = nullptr;
- const uint8_t* lead_end = nullptr;
- if (a_length > b_length) {
- int lead_length = a_length - b_length;
- lead_start = a.ptr;
- lead_end = a.ptr + lead_length;
- a_start += lead_length;
- } else {
- DCHECK_LT(a_length, b_length);
- int lead_length = b_length - a_length;
- lead_start = b.ptr;
- lead_end = b.ptr + lead_length;
- b_start += lead_length;
- }
- // Compare extra bytes to the sign extension of the first
- // byte of the other number.
- uint8_t extension = first_a < 0 ? 0xFF : 0;
- bool not_equal = std::any_of(lead_start, lead_end,
- [extension](uint8_t a) { return extension != a; });
- if (not_equal) {
-        // Since sign-extension bytes are extrema values for unsigned bytes:
- //
- // Four cases exist:
- // negative values:
- // b is the longer value.
- // b must be the lesser value: return false
- // else:
- // a must be the lesser value: return true
- //
- // positive values:
- // b is the longer value.
- // values in b must be greater than a: return true
- // else:
- // values in a must be greater than b: return false
- bool negative_values = first_a < 0;
- bool b_longer = a_length < b_length;
- return negative_values != b_longer;
- }
- } else {
- a_start++;
- b_start++;
- }
- return std::lexicographical_compare(a_start, a.ptr + a_length, b_start,
- b.ptr + b_length);
- }
-};
-
-template <typename DType, bool is_signed>
-struct BinaryLikeCompareHelperBase {
- using T = typename DType::c_type;
-
- static T DefaultMin() { return {}; }
- static T DefaultMax() { return {}; }
- static T Coalesce(T val, T fallback) { return val; }
-
- static inline bool Compare(int type_length, const T& a, const T& b) {
- return BinaryLikeComparer<T, is_signed>::Compare(type_length, a, b);
- }
- static T Min(int type_length, const T& a, const T& b) {
- if (a.ptr == nullptr) return b;
- if (b.ptr == nullptr) return a;
- return Compare(type_length, a, b) ? a : b;
- }
-
- static T Max(int type_length, const T& a, const T& b) {
- if (a.ptr == nullptr) return b;
- if (b.ptr == nullptr) return a;
- return Compare(type_length, a, b) ? b : a;
- }
-};
-
-template <bool is_signed>
-struct CompareHelper<ByteArrayType, is_signed>
- : public BinaryLikeCompareHelperBase<ByteArrayType, is_signed> {};
-
-template <bool is_signed>
-struct CompareHelper<FLBAType, is_signed>
- : public BinaryLikeCompareHelperBase<FLBAType, is_signed> {};
-
-using ::arrow::util::optional;
-
-template <typename T>
-::arrow::enable_if_t<std::is_integral<T>::value, optional<std::pair<T, T>>>
-CleanStatistic(std::pair<T, T> min_max) {
- return min_max;
-}
-
-// In case of floating point types, the following rules are applied (as per
-// upstream parquet-mr):
-// - If any of min/max is NaN, return nothing.
-// - If min is 0.0f, replace with -0.0f
-// - If max is -0.0f, replace with 0.0f
-template <typename T>
-::arrow::enable_if_t<std::is_floating_point<T>::value, optional<std::pair<T, T>>>
-CleanStatistic(std::pair<T, T> min_max) {
- T min = min_max.first;
- T max = min_max.second;
-
-  // Ignore if one of the values is NaN.
- if (std::isnan(min) || std::isnan(max)) {
- return ::arrow::util::nullopt;
- }
-
- if (min == std::numeric_limits<T>::max() && max == std::numeric_limits<T>::lowest()) {
- return ::arrow::util::nullopt;
- }
-
- T zero{};
-
- if (min == zero && !std::signbit(min)) {
- min = -min;
- }
-
- if (max == zero && std::signbit(max)) {
- max = -max;
- }
-
- return {{min, max}};
-}
-
-optional<std::pair<FLBA, FLBA>> CleanStatistic(std::pair<FLBA, FLBA> min_max) {
- if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
- return ::arrow::util::nullopt;
- }
- return min_max;
-}
-
-optional<std::pair<ByteArray, ByteArray>> CleanStatistic(
- std::pair<ByteArray, ByteArray> min_max) {
- if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
- return ::arrow::util::nullopt;
- }
- return min_max;
-}
-
-template <bool is_signed, typename DType>
-class TypedComparatorImpl : virtual public TypedComparator<DType> {
- public:
- using T = typename DType::c_type;
- using Helper = CompareHelper<DType, is_signed>;
-
- explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {}
-
- bool CompareInline(const T& a, const T& b) const {
- return Helper::Compare(type_length_, a, b);
- }
-
- bool Compare(const T& a, const T& b) override { return CompareInline(a, b); }
-
- std::pair<T, T> GetMinMax(const T* values, int64_t length) override {
- DCHECK_GT(length, 0);
-
- T min = Helper::DefaultMin();
- T max = Helper::DefaultMax();
-
- for (int64_t i = 0; i < length; i++) {
- auto val = values[i];
- min = Helper::Min(type_length_, min, Helper::Coalesce(val, Helper::DefaultMin()));
- max = Helper::Max(type_length_, max, Helper::Coalesce(val, Helper::DefaultMax()));
- }
-
- return {min, max};
- }
-
- std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
- const uint8_t* valid_bits,
- int64_t valid_bits_offset) override {
- DCHECK_GT(length, 0);
-
- T min = Helper::DefaultMin();
- T max = Helper::DefaultMax();
-
- ::arrow::internal::VisitSetBitRunsVoid(
- valid_bits, valid_bits_offset, length, [&](int64_t position, int64_t length) {
- for (int64_t i = 0; i < length; i++) {
- const auto val = values[i + position];
- min = Helper::Min(type_length_, min,
- Helper::Coalesce(val, Helper::DefaultMin()));
- max = Helper::Max(type_length_, max,
- Helper::Coalesce(val, Helper::DefaultMax()));
- }
- });
-
- return {min, max};
- }
-
- std::pair<T, T> GetMinMax(const ::arrow::Array& values) override;
-
- private:
- int type_length_;
-};
-
-// ARROW-11675: A hand-written version of GetMinMax(), to work around
-// what looks like an MSVC code generation bug.
-// This does not seem to be required for GetMinMaxSpaced().
-template <>
-std::pair<int32_t, int32_t>
-TypedComparatorImpl</*is_signed=*/false, Int32Type>::GetMinMax(const int32_t* values,
- int64_t length) {
- DCHECK_GT(length, 0);
-
- const uint32_t* unsigned_values = reinterpret_cast<const uint32_t*>(values);
- uint32_t min = std::numeric_limits<uint32_t>::max();
- uint32_t max = std::numeric_limits<uint32_t>::lowest();
-
- for (int64_t i = 0; i < length; i++) {
- const auto val = unsigned_values[i];
- min = std::min<uint32_t>(min, val);
- max = std::max<uint32_t>(max, val);
- }
-
- return {SafeCopy<int32_t>(min), SafeCopy<int32_t>(max)};
-}
-
-template <bool is_signed, typename DType>
-std::pair<typename DType::c_type, typename DType::c_type>
-TypedComparatorImpl<is_signed, DType>::GetMinMax(const ::arrow::Array& values) {
- ParquetException::NYI(values.type()->ToString());
-}
-
-template <bool is_signed>
-std::pair<ByteArray, ByteArray> GetMinMaxBinaryHelper(
- const TypedComparatorImpl<is_signed, ByteArrayType>& comparator,
- const ::arrow::Array& values) {
- using Helper = CompareHelper<ByteArrayType, is_signed>;
-
- ByteArray min = Helper::DefaultMin();
- ByteArray max = Helper::DefaultMax();
- constexpr int type_length = -1;
-
- const auto valid_func = [&](ByteArray val) {
- min = Helper::Min(type_length, val, min);
- max = Helper::Max(type_length, val, max);
- };
- const auto null_func = [&]() {};
-
- if (::arrow::is_binary_like(values.type_id())) {
- ::arrow::VisitArrayDataInline<::arrow::BinaryType>(
- *values.data(), std::move(valid_func), std::move(null_func));
- } else {
- DCHECK(::arrow::is_large_binary_like(values.type_id()));
- ::arrow::VisitArrayDataInline<::arrow::LargeBinaryType>(
- *values.data(), std::move(valid_func), std::move(null_func));
- }
-
- return {min, max};
-}
-
-template <>
-std::pair<ByteArray, ByteArray> TypedComparatorImpl<true, ByteArrayType>::GetMinMax(
- const ::arrow::Array& values) {
- return GetMinMaxBinaryHelper<true>(*this, values);
-}
-
-template <>
-std::pair<ByteArray, ByteArray> TypedComparatorImpl<false, ByteArrayType>::GetMinMax(
- const ::arrow::Array& values) {
- return GetMinMaxBinaryHelper<false>(*this, values);
-}
-
-template <typename DType>
-class TypedStatisticsImpl : public TypedStatistics<DType> {
- public:
- using T = typename DType::c_type;
-
- TypedStatisticsImpl(const ColumnDescriptor* descr, MemoryPool* pool)
- : descr_(descr),
- pool_(pool),
- min_buffer_(AllocateBuffer(pool_, 0)),
- max_buffer_(AllocateBuffer(pool_, 0)) {
- auto comp = Comparator::Make(descr);
- comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
- Reset();
- has_null_count_ = true;
- has_distinct_count_ = true;
- }
-
- TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t null_count,
- int64_t distinct_count)
- : pool_(default_memory_pool()),
- min_buffer_(AllocateBuffer(pool_, 0)),
- max_buffer_(AllocateBuffer(pool_, 0)) {
- IncrementNumValues(num_values);
- IncrementNullCount(null_count);
- IncrementDistinctCount(distinct_count);
-
- Copy(min, &min_, min_buffer_.get());
- Copy(max, &max_, max_buffer_.get());
- has_min_max_ = true;
- }
-
- TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min,
- const std::string& encoded_max, int64_t num_values,
- int64_t null_count, int64_t distinct_count, bool has_min_max,
- bool has_null_count, bool has_distinct_count, MemoryPool* pool)
- : TypedStatisticsImpl(descr, pool) {
- IncrementNumValues(num_values);
- if (has_null_count_) {
- IncrementNullCount(null_count);
- }
- if (has_distinct_count) {
- IncrementDistinctCount(distinct_count);
- }
-
- if (!encoded_min.empty()) {
- PlainDecode(encoded_min, &min_);
- }
- if (!encoded_max.empty()) {
- PlainDecode(encoded_max, &max_);
- }
- has_min_max_ = has_min_max;
- }
-
- bool HasDistinctCount() const override { return has_distinct_count_; }
- bool HasMinMax() const override { return has_min_max_; }
- bool HasNullCount() const override { return has_null_count_; }
-
- bool Equals(const Statistics& raw_other) const override {
- if (physical_type() != raw_other.physical_type()) return false;
-
- const auto& other = checked_cast<const TypedStatisticsImpl&>(raw_other);
-
- if (has_min_max_ != other.has_min_max_) return false;
-
- return (!has_min_max_ || MinMaxEqual(other)) && null_count() == other.null_count() &&
- distinct_count() == other.distinct_count() &&
- num_values() == other.num_values();
- }
-
- bool MinMaxEqual(const TypedStatisticsImpl& other) const;
-
- void Reset() override {
- ResetCounts();
- has_min_max_ = false;
- has_distinct_count_ = false;
- has_null_count_ = false;
- }
-
- void SetMinMax(const T& arg_min, const T& arg_max) override {
- SetMinMaxPair({arg_min, arg_max});
- }
-
- void Merge(const TypedStatistics<DType>& other) override {
- this->num_values_ += other.num_values();
- if (other.HasNullCount()) {
- this->statistics_.null_count += other.null_count();
- }
- if (other.HasDistinctCount()) {
- this->statistics_.distinct_count += other.distinct_count();
- }
- if (other.HasMinMax()) {
- SetMinMax(other.min(), other.max());
- }
- }
-
- void Update(const T* values, int64_t num_not_null, int64_t num_null) override;
- void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_offset,
- int64_t num_not_null, int64_t num_null) override;
-
- void Update(const ::arrow::Array& values) override {
- IncrementNullCount(values.null_count());
- IncrementNumValues(values.length() - values.null_count());
-
- if (values.null_count() == values.length()) {
- return;
- }
-
- SetMinMaxPair(comparator_->GetMinMax(values));
- }
-
- const T& min() const override { return min_; }
-
- const T& max() const override { return max_; }
-
- Type::type physical_type() const override { return descr_->physical_type(); }
-
- const ColumnDescriptor* descr() const override { return descr_; }
-
- std::string EncodeMin() const override {
- std::string s;
- if (HasMinMax()) this->PlainEncode(min_, &s);
- return s;
- }
-
- std::string EncodeMax() const override {
- std::string s;
- if (HasMinMax()) this->PlainEncode(max_, &s);
- return s;
- }
-
- EncodedStatistics Encode() override {
- EncodedStatistics s;
- if (HasMinMax()) {
- s.set_min(this->EncodeMin());
- s.set_max(this->EncodeMax());
- }
- if (HasNullCount()) {
- s.set_null_count(this->null_count());
- }
- return s;
- }
-
- int64_t null_count() const override { return statistics_.null_count; }
- int64_t distinct_count() const override { return statistics_.distinct_count; }
- int64_t num_values() const override { return num_values_; }
-
- private:
- const ColumnDescriptor* descr_;
- bool has_min_max_ = false;
- bool has_null_count_ = false;
- bool has_distinct_count_ = false;
- T min_;
- T max_;
- ::arrow::MemoryPool* pool_;
- int64_t num_values_ = 0;
- EncodedStatistics statistics_;
- std::shared_ptr<TypedComparator<DType>> comparator_;
- std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;
-
- void PlainEncode(const T& src, std::string* dst) const;
- void PlainDecode(const std::string& src, T* dst) const;
-
- void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; }
-
- void IncrementNullCount(int64_t n) {
- statistics_.null_count += n;
- has_null_count_ = true;
- }
-
- void IncrementNumValues(int64_t n) { num_values_ += n; }
-
- void IncrementDistinctCount(int64_t n) {
- statistics_.distinct_count += n;
- has_distinct_count_ = true;
- }
-
- void ResetCounts() {
- this->statistics_.null_count = 0;
- this->statistics_.distinct_count = 0;
- this->num_values_ = 0;
- }
-
- void SetMinMaxPair(std::pair<T, T> min_max) {
- // CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN
- auto maybe_min_max = CleanStatistic(min_max);
- if (!maybe_min_max) return;
-
- auto min = maybe_min_max.value().first;
- auto max = maybe_min_max.value().second;
-
- if (!has_min_max_) {
- has_min_max_ = true;
- Copy(min, &min_, min_buffer_.get());
- Copy(max, &max_, max_buffer_.get());
- } else {
- Copy(comparator_->Compare(min_, min) ? min_ : min, &min_, min_buffer_.get());
- Copy(comparator_->Compare(max_, max) ? max : max_, &max_, max_buffer_.get());
- }
- }
-};
-
-template <>
-inline bool TypedStatisticsImpl<FLBAType>::MinMaxEqual(
- const TypedStatisticsImpl<FLBAType>& other) const {
- uint32_t len = descr_->type_length();
- return std::memcmp(min_.ptr, other.min_.ptr, len) == 0 &&
- std::memcmp(max_.ptr, other.max_.ptr, len) == 0;
-}
-
-template <typename DType>
-bool TypedStatisticsImpl<DType>::MinMaxEqual(
- const TypedStatisticsImpl<DType>& other) const {
- return min_ == other.min_ && max_ == other.max_;
-}
-
-template <>
-inline void TypedStatisticsImpl<FLBAType>::Copy(const FLBA& src, FLBA* dst,
- ResizableBuffer* buffer) {
- if (dst->ptr == src.ptr) return;
- uint32_t len = descr_->type_length();
- PARQUET_THROW_NOT_OK(buffer->Resize(len, false));
- std::memcpy(buffer->mutable_data(), src.ptr, len);
- *dst = FLBA(buffer->data());
-}
-
-template <>
-inline void TypedStatisticsImpl<ByteArrayType>::Copy(const ByteArray& src, ByteArray* dst,
- ResizableBuffer* buffer) {
- if (dst->ptr == src.ptr) return;
- PARQUET_THROW_NOT_OK(buffer->Resize(src.len, false));
- std::memcpy(buffer->mutable_data(), src.ptr, src.len);
- *dst = ByteArray(src.len, buffer->data());
-}
-
-template <typename DType>
-void TypedStatisticsImpl<DType>::Update(const T* values, int64_t num_not_null,
- int64_t num_null) {
- DCHECK_GE(num_not_null, 0);
- DCHECK_GE(num_null, 0);
-
- IncrementNullCount(num_null);
- IncrementNumValues(num_not_null);
-
- if (num_not_null == 0) return;
- SetMinMaxPair(comparator_->GetMinMax(values, num_not_null));
-}
-
-template <typename DType>
-void TypedStatisticsImpl<DType>::UpdateSpaced(const T* values, const uint8_t* valid_bits,
- int64_t valid_bits_offset,
- int64_t num_not_null, int64_t num_null) {
- DCHECK_GE(num_not_null, 0);
- DCHECK_GE(num_null, 0);
-
- IncrementNullCount(num_null);
- IncrementNumValues(num_not_null);
-
- if (num_not_null == 0) return;
-
- int64_t length = num_null + num_not_null;
- SetMinMaxPair(
- comparator_->GetMinMaxSpaced(values, length, valid_bits, valid_bits_offset));
-}
-
-template <typename DType>
-void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst) const {
- auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, pool_);
- encoder->Put(&src, 1);
- auto buffer = encoder->FlushValues();
- auto ptr = reinterpret_cast<const char*>(buffer->data());
- dst->assign(ptr, buffer->size());
-}
-
-template <typename DType>
-void TypedStatisticsImpl<DType>::PlainDecode(const std::string& src, T* dst) const {
- auto decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
- decoder->SetData(1, reinterpret_cast<const uint8_t*>(src.c_str()),
- static_cast<int>(src.size()));
- decoder->Decode(dst, 1);
-}
-
-template <>
-void TypedStatisticsImpl<ByteArrayType>::PlainEncode(const T& src,
- std::string* dst) const {
- dst->assign(reinterpret_cast<const char*>(src.ptr), src.len);
-}
-
-template <>
-void TypedStatisticsImpl<ByteArrayType>::PlainDecode(const std::string& src,
- T* dst) const {
- dst->len = static_cast<uint32_t>(src.size());
- dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str());
-}
-
-} // namespace
-
-// ----------------------------------------------------------------------
-// Public factory functions
-
-std::shared_ptr<Comparator> Comparator::Make(Type::type physical_type,
- SortOrder::type sort_order,
- int type_length) {
- if (SortOrder::SIGNED == sort_order) {
- switch (physical_type) {
- case Type::BOOLEAN:
- return std::make_shared<TypedComparatorImpl<true, BooleanType>>();
- case Type::INT32:
- return std::make_shared<TypedComparatorImpl<true, Int32Type>>();
- case Type::INT64:
- return std::make_shared<TypedComparatorImpl<true, Int64Type>>();
- case Type::INT96:
- return std::make_shared<TypedComparatorImpl<true, Int96Type>>();
- case Type::FLOAT:
- return std::make_shared<TypedComparatorImpl<true, FloatType>>();
- case Type::DOUBLE:
- return std::make_shared<TypedComparatorImpl<true, DoubleType>>();
- case Type::BYTE_ARRAY:
- return std::make_shared<TypedComparatorImpl<true, ByteArrayType>>();
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<TypedComparatorImpl<true, FLBAType>>(type_length);
- default:
- ParquetException::NYI("Signed Compare not implemented");
- }
- } else if (SortOrder::UNSIGNED == sort_order) {
- switch (physical_type) {
- case Type::INT32:
- return std::make_shared<TypedComparatorImpl<false, Int32Type>>();
- case Type::INT64:
- return std::make_shared<TypedComparatorImpl<false, Int64Type>>();
- case Type::INT96:
- return std::make_shared<TypedComparatorImpl<false, Int96Type>>();
- case Type::BYTE_ARRAY:
- return std::make_shared<TypedComparatorImpl<false, ByteArrayType>>();
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<TypedComparatorImpl<false, FLBAType>>(type_length);
- default:
- ParquetException::NYI("Unsigned Compare not implemented");
- }
- } else {
- throw ParquetException("UNKNOWN Sort Order");
- }
- return nullptr;
-}
-
-std::shared_ptr<Comparator> Comparator::Make(const ColumnDescriptor* descr) {
- return Make(descr->physical_type(), descr->sort_order(), descr->type_length());
-}
-
-std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
- ::arrow::MemoryPool* pool) {
- switch (descr->physical_type()) {
- case Type::BOOLEAN:
- return std::make_shared<TypedStatisticsImpl<BooleanType>>(descr, pool);
- case Type::INT32:
- return std::make_shared<TypedStatisticsImpl<Int32Type>>(descr, pool);
- case Type::INT64:
- return std::make_shared<TypedStatisticsImpl<Int64Type>>(descr, pool);
- case Type::FLOAT:
- return std::make_shared<TypedStatisticsImpl<FloatType>>(descr, pool);
- case Type::DOUBLE:
- return std::make_shared<TypedStatisticsImpl<DoubleType>>(descr, pool);
- case Type::BYTE_ARRAY:
- return std::make_shared<TypedStatisticsImpl<ByteArrayType>>(descr, pool);
- case Type::FIXED_LEN_BYTE_ARRAY:
- return std::make_shared<TypedStatisticsImpl<FLBAType>>(descr, pool);
- default:
- ParquetException::NYI("Statistics not implemented");
- }
-}
-
-std::shared_ptr<Statistics> Statistics::Make(Type::type physical_type, const void* min,
- const void* max, int64_t num_values,
- int64_t null_count, int64_t distinct_count) {
-#define MAKE_STATS(CAP_TYPE, KLASS) \
- case Type::CAP_TYPE: \
- return std::make_shared<TypedStatisticsImpl<KLASS>>( \
- *reinterpret_cast<const typename KLASS::c_type*>(min), \
- *reinterpret_cast<const typename KLASS::c_type*>(max), num_values, null_count, \
- distinct_count)
-
- switch (physical_type) {
- MAKE_STATS(BOOLEAN, BooleanType);
- MAKE_STATS(INT32, Int32Type);
- MAKE_STATS(INT64, Int64Type);
- MAKE_STATS(FLOAT, FloatType);
- MAKE_STATS(DOUBLE, DoubleType);
- MAKE_STATS(BYTE_ARRAY, ByteArrayType);
- MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType);
- default:
- break;
- }
-#undef MAKE_STATS
- DCHECK(false) << "Cannot reach here";
- return nullptr;
-}
-
-std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
- const std::string& encoded_min,
- const std::string& encoded_max,
- int64_t num_values, int64_t null_count,
- int64_t distinct_count, bool has_min_max,
- bool has_null_count, bool has_distinct_count,
- ::arrow::MemoryPool* pool) {
-#define MAKE_STATS(CAP_TYPE, KLASS) \
- case Type::CAP_TYPE: \
- return std::make_shared<TypedStatisticsImpl<KLASS>>( \
- descr, encoded_min, encoded_max, num_values, null_count, distinct_count, \
- has_min_max, has_null_count, has_distinct_count, pool)
-
- switch (descr->physical_type()) {
- MAKE_STATS(BOOLEAN, BooleanType);
- MAKE_STATS(INT32, Int32Type);
- MAKE_STATS(INT64, Int64Type);
- MAKE_STATS(FLOAT, FloatType);
- MAKE_STATS(DOUBLE, DoubleType);
- MAKE_STATS(BYTE_ARRAY, ByteArrayType);
- MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType);
- default:
- break;
- }
-#undef MAKE_STATS
- DCHECK(false) << "Cannot reach here";
- return nullptr;
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/statistics.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/array.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/optional.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/visitor_inline.h"
+#include "parquet/encoding.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+using arrow::default_memory_pool;
+using arrow::MemoryPool;
+using arrow::internal::checked_cast;
+using arrow::util::SafeCopy;
+
+namespace parquet {
+namespace {
+
+// ----------------------------------------------------------------------
+// Comparator implementations
+
+constexpr int value_length(int value_length, const ByteArray& value) { return value.len; }
+constexpr int value_length(int type_length, const FLBA& value) { return type_length; }
+
+template <typename DType, bool is_signed>
+struct CompareHelper {
+ using T = typename DType::c_type;
+
+ static_assert(!std::is_unsigned<T>::value || std::is_same<T, bool>::value,
+ "T is an unsigned numeric");
+
+ constexpr static T DefaultMin() { return std::numeric_limits<T>::max(); }
+ constexpr static T DefaultMax() { return std::numeric_limits<T>::lowest(); }
+
+ // MSVC 2017 workaround: std::isnan is not overloaded for integral types,
+ // contrary to the C++11 standard requirements.
+ template <typename T1 = T>
+ static ::arrow::enable_if_t<std::is_floating_point<T1>::value, T> Coalesce(T val,
+ T fallback) {
+ return std::isnan(val) ? fallback : val;
+ }
+
+ template <typename T1 = T>
+ static ::arrow::enable_if_t<!std::is_floating_point<T1>::value, T> Coalesce(
+ T val, T fallback) {
+ return val;
+ }
+
+ static inline bool Compare(int type_length, const T& a, const T& b) { return a < b; }
+
+ static T Min(int type_length, T a, T b) { return a < b ? a : b; }
+ static T Max(int type_length, T a, T b) { return a < b ? b : a; }
+};
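+
+// Editorial sketch (not part of the upstream sources): how Coalesce() keeps
+// NaN from poisoning the min/max accumulation in GetMinMax() further below.
+// With the float values {1.0f, NAN, -2.0f}:
+//
+//   using Helper = CompareHelper<FloatType, /*is_signed=*/true>;
+//   float min = Helper::DefaultMin();  // +FLT_MAX
+//   float max = Helper::DefaultMax();  // -FLT_MAX
+//   for (float v : {1.0f, NAN, -2.0f}) {
+//     min = Helper::Min(-1, min, Helper::Coalesce(v, Helper::DefaultMin()));
+//     max = Helper::Max(-1, max, Helper::Coalesce(v, Helper::DefaultMax()));
+//   }
+//   // min == -2.0f, max == 1.0f: each NaN was replaced by a neutral element.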
+
+template <typename DType>
+struct UnsignedCompareHelperBase {
+ using T = typename DType::c_type;
+ using UCType = typename std::make_unsigned<T>::type;
+
+ static_assert(!std::is_same<T, UCType>::value, "T is unsigned");
+ static_assert(sizeof(T) == sizeof(UCType), "T and UCType not the same size");
+
+ // NOTE: according to the C++ spec, unsigned-to-signed conversion is
+ // implementation-defined if the original value does not fit in the signed type
+ // (i.e., two's complement cannot be assumed even on mainstream machines,
+ // because the compiler may decide otherwise). Hence the use of `SafeCopy`
+ // below for deterministic bit-casting.
+ // (see "Integer conversions" in
+ // https://en.cppreference.com/w/cpp/language/implicit_conversion)
+
+ static const T DefaultMin() { return SafeCopy<T>(std::numeric_limits<UCType>::max()); }
+ static const T DefaultMax() { return 0; }
+
+ static T Coalesce(T val, T fallback) { return val; }
+
+ static bool Compare(int type_length, T a, T b) {
+ return SafeCopy<UCType>(a) < SafeCopy<UCType>(b);
+ }
+
+ static T Min(int type_length, T a, T b) { return Compare(type_length, a, b) ? a : b; }
+ static T Max(int type_length, T a, T b) { return Compare(type_length, a, b) ? b : a; }
+};
+
+template <>
+struct CompareHelper<Int32Type, false> : public UnsignedCompareHelperBase<Int32Type> {};
+
+template <>
+struct CompareHelper<Int64Type, false> : public UnsignedCompareHelperBase<Int64Type> {};
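+
+// Worked example (editorial): under the unsigned helpers, values that are
+// negative when viewed as signed sort above all non-negative values, because
+// the comparison happens on the bit-cast unsigned representation:
+//
+//   using Helper = CompareHelper<Int32Type, /*is_signed=*/false>;
+//   Helper::Compare(/*type_length=*/-1, -1, 1);  // false: 0xFFFFFFFFu > 1u
+//   Helper::Min(/*type_length=*/-1, -1, 1);      // 1
+//   Helper::Max(/*type_length=*/-1, -1, 1);      // -1, i.e. 0xFFFFFFFFu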
+
+template <bool is_signed>
+struct CompareHelper<Int96Type, is_signed> {
+ using T = typename Int96Type::c_type;
+ using msb_type = typename std::conditional<is_signed, int32_t, uint32_t>::type;
+
+ static T DefaultMin() {
+ uint32_t kMsbMax = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::max());
+ uint32_t kMax = std::numeric_limits<uint32_t>::max();
+ return {kMax, kMax, kMsbMax};
+ }
+ static T DefaultMax() {
+ uint32_t kMsbMin = SafeCopy<uint32_t>(std::numeric_limits<msb_type>::min());
+ uint32_t kMin = std::numeric_limits<uint32_t>::min();
+ return {kMin, kMin, kMsbMin};
+ }
+ static T Coalesce(T val, T fallback) { return val; }
+
+ static inline bool Compare(int type_length, const T& a, const T& b) {
+ if (a.value[2] != b.value[2]) {
+ // Only the most significant of the three 32-bit words is compared as
+ // signed; for little-endian Int96 this is the last word.
+ return SafeCopy<msb_type>(a.value[2]) < SafeCopy<msb_type>(b.value[2]);
+ } else if (a.value[1] != b.value[1]) {
+ return (a.value[1] < b.value[1]);
+ }
+ return (a.value[0] < b.value[0]);
+ }
+
+ static T Min(int type_length, const T& a, const T& b) {
+ return Compare(0, a, b) ? a : b;
+ }
+ static T Max(int type_length, const T& a, const T& b) {
+ return Compare(0, a, b) ? b : a;
+ }
+};
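+
+// Worked example (editorial): Int96 is compared word by word from the most
+// significant 32-bit word (value[2]) down. With
+//
+//   Int96 a{{0u, 0u, 1u}};    // high word 1
+//   Int96 b{{~0u, ~0u, 0u}};  // high word 0
+//
+// Compare(0, a, b) is false under both sort orders: a.value[2] (1) already
+// exceeds b.value[2] (0), so the lower words are never inspected.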
+
+template <typename T, bool is_signed>
+struct BinaryLikeComparer {};
+
+template <typename T>
+struct BinaryLikeComparer<T, /*is_signed=*/false> {
+ static bool Compare(int type_length, const T& a, const T& b) {
+ int a_length = value_length(type_length, a);
+ int b_length = value_length(type_length, b);
+ // Unsigned comparison is used for non-numeric types, so a straight
+ // lexicographic comparison makes sense (a.ptr is always unsigned).
+ return std::lexicographical_compare(a.ptr, a.ptr + a_length, b.ptr, b.ptr + b_length);
+ }
+};
+
+template <typename T>
+struct BinaryLikeComparer<T, /*is_signed=*/true> {
+ static bool Compare(int type_length, const T& a, const T& b) {
+ // Signed comparison is used for integers encoded as big-endian
+ // two's complement (e.g. decimals).
+ int a_length = value_length(type_length, a);
+ int b_length = value_length(type_length, b);
+
+ // At least one of the lengths is zero.
+ if (a_length == 0 || b_length == 0) {
+ return a_length == 0 && b_length > 0;
+ }
+
+ int8_t first_a = *a.ptr;
+ int8_t first_b = *b.ptr;
+ // We can short circuit for numbers of different sign, or for
+ // equal-length byte arrays that have different first bytes.
+ // The equal-length requirement matters for sign extension cases:
+ // 0xFF80 should compare equal to 0x80 (big-endian sign extension
+ // of a negative value), even though their first bytes differ.
+ if ((0x80 & first_a) != (0x80 & first_b) ||
+ (a_length == b_length && first_a != first_b)) {
+ return first_a < first_b;
+ }
+ // When the lengths are unequal and the numbers are of the same
+ // sign we need to do comparison by sign extending the shorter
+ // value first, and once we get to equal sized arrays, lexicographical
+ // unsigned comparison of everything but the first byte is sufficient.
+ const uint8_t* a_start = a.ptr;
+ const uint8_t* b_start = b.ptr;
+ if (a_length != b_length) {
+ const uint8_t* lead_start = nullptr;
+ const uint8_t* lead_end = nullptr;
+ if (a_length > b_length) {
+ int lead_length = a_length - b_length;
+ lead_start = a.ptr;
+ lead_end = a.ptr + lead_length;
+ a_start += lead_length;
+ } else {
+ DCHECK_LT(a_length, b_length);
+ int lead_length = b_length - a_length;
+ lead_start = b.ptr;
+ lead_end = b.ptr + lead_length;
+ b_start += lead_length;
+ }
+ // Compare extra bytes to the sign extension of the first
+ // byte of the other number.
+ uint8_t extension = first_a < 0 ? 0xFF : 0;
+ bool not_equal = std::any_of(lead_start, lead_end,
+ [extension](uint8_t a) { return extension != a; });
+ if (not_equal) {
+ // Since sign-extension bytes are extrema values for unsigned bytes:
+ //
+ // Four cases exist:
+ // negative values:
+ // b is the longer value.
+ // b must be the lesser value: return false
+ // else:
+ // a must be the lesser value: return true
+ //
+ // positive values:
+ // b is the longer value.
+ // values in b must be greater than a: return true
+ // else:
+ // values in a must be greater than b: return false
+ bool negative_values = first_a < 0;
+ bool b_longer = a_length < b_length;
+ return negative_values != b_longer;
+ }
+ } else {
+ a_start++;
+ b_start++;
+ }
+ return std::lexicographical_compare(a_start, a.ptr + a_length, b_start,
+ b.ptr + b_length);
+ }
+};
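+
+// Worked example (editorial): under the signed (big-endian two's complement)
+// order with variable-length values:
+//
+//   const uint8_t kA[] = {0xFF, 0x80};  // -128, encoded on two bytes
+//   const uint8_t kB[] = {0x80};        // -128, encoded on one byte
+//   ByteArray a(2, kA);
+//   ByteArray b(1, kB);
+//   using Cmp = BinaryLikeComparer<ByteArray, /*is_signed=*/true>;
+//   Cmp::Compare(/*type_length=*/-1, a, b);  // false
+//   Cmp::Compare(/*type_length=*/-1, b, a);  // false, so a and b are equal
+//
+// The extra 0xFF lead byte of `a` matches the sign extension of `b`, so the
+// comparison falls through to the identical suffixes {0x80} and {0x80}.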
+
+template <typename DType, bool is_signed>
+struct BinaryLikeCompareHelperBase {
+ using T = typename DType::c_type;
+
+ static T DefaultMin() { return {}; }
+ static T DefaultMax() { return {}; }
+ static T Coalesce(T val, T fallback) { return val; }
+
+ static inline bool Compare(int type_length, const T& a, const T& b) {
+ return BinaryLikeComparer<T, is_signed>::Compare(type_length, a, b);
+ }
+ static T Min(int type_length, const T& a, const T& b) {
+ if (a.ptr == nullptr) return b;
+ if (b.ptr == nullptr) return a;
+ return Compare(type_length, a, b) ? a : b;
+ }
+
+ static T Max(int type_length, const T& a, const T& b) {
+ if (a.ptr == nullptr) return b;
+ if (b.ptr == nullptr) return a;
+ return Compare(type_length, a, b) ? b : a;
+ }
+};
+
+template <bool is_signed>
+struct CompareHelper<ByteArrayType, is_signed>
+ : public BinaryLikeCompareHelperBase<ByteArrayType, is_signed> {};
+
+template <bool is_signed>
+struct CompareHelper<FLBAType, is_signed>
+ : public BinaryLikeCompareHelperBase<FLBAType, is_signed> {};
+
+using ::arrow::util::optional;
+
+template <typename T>
+::arrow::enable_if_t<std::is_integral<T>::value, optional<std::pair<T, T>>>
+CleanStatistic(std::pair<T, T> min_max) {
+ return min_max;
+}
+
+// In case of floating point types, the following rules are applied (as per
+// upstream parquet-mr):
+// - If any of min/max is NaN, return nothing.
+// - If min is 0.0f, replace with -0.0f
+// - If max is -0.0f, replace with 0.0f
+template <typename T>
+::arrow::enable_if_t<std::is_floating_point<T>::value, optional<std::pair<T, T>>>
+CleanStatistic(std::pair<T, T> min_max) {
+ T min = min_max.first;
+ T max = min_max.second;
+
+ // Ignore the pair if either value is NaN.
+ if (std::isnan(min) || std::isnan(max)) {
+ return ::arrow::util::nullopt;
+ }
+
+ if (min == std::numeric_limits<T>::max() && max == std::numeric_limits<T>::lowest()) {
+ return ::arrow::util::nullopt;
+ }
+
+ T zero{};
+
+ if (min == zero && !std::signbit(min)) {
+ min = -min;
+ }
+
+ if (max == zero && std::signbit(max)) {
+ max = -max;
+ }
+
+ return {{min, max}};
+}
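+
+// Editorial sketch of the rules above:
+//
+//   CleanStatistic<double>({0.0, 1.0});   // -> {-0.0, 1.0}  (min +0.0 -> -0.0)
+//   CleanStatistic<double>({-1.0, -0.0}); // -> {-1.0, 0.0}  (max -0.0 -> +0.0)
+//   CleanStatistic<double>({NAN, 1.0});   // -> nullopt, the pair is dropped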
+
+optional<std::pair<FLBA, FLBA>> CleanStatistic(std::pair<FLBA, FLBA> min_max) {
+ if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
+ return ::arrow::util::nullopt;
+ }
+ return min_max;
+}
+
+optional<std::pair<ByteArray, ByteArray>> CleanStatistic(
+ std::pair<ByteArray, ByteArray> min_max) {
+ if (min_max.first.ptr == nullptr || min_max.second.ptr == nullptr) {
+ return ::arrow::util::nullopt;
+ }
+ return min_max;
+}
+
+template <bool is_signed, typename DType>
+class TypedComparatorImpl : virtual public TypedComparator<DType> {
+ public:
+ using T = typename DType::c_type;
+ using Helper = CompareHelper<DType, is_signed>;
+
+ explicit TypedComparatorImpl(int type_length = -1) : type_length_(type_length) {}
+
+ bool CompareInline(const T& a, const T& b) const {
+ return Helper::Compare(type_length_, a, b);
+ }
+
+ bool Compare(const T& a, const T& b) override { return CompareInline(a, b); }
+
+ std::pair<T, T> GetMinMax(const T* values, int64_t length) override {
+ DCHECK_GT(length, 0);
+
+ T min = Helper::DefaultMin();
+ T max = Helper::DefaultMax();
+
+ for (int64_t i = 0; i < length; i++) {
+ auto val = values[i];
+ min = Helper::Min(type_length_, min, Helper::Coalesce(val, Helper::DefaultMin()));
+ max = Helper::Max(type_length_, max, Helper::Coalesce(val, Helper::DefaultMax()));
+ }
+
+ return {min, max};
+ }
+
+ std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ DCHECK_GT(length, 0);
+
+ T min = Helper::DefaultMin();
+ T max = Helper::DefaultMax();
+
+ ::arrow::internal::VisitSetBitRunsVoid(
+ valid_bits, valid_bits_offset, length, [&](int64_t position, int64_t length) {
+ for (int64_t i = 0; i < length; i++) {
+ const auto val = values[i + position];
+ min = Helper::Min(type_length_, min,
+ Helper::Coalesce(val, Helper::DefaultMin()));
+ max = Helper::Max(type_length_, max,
+ Helper::Coalesce(val, Helper::DefaultMax()));
+ }
+ });
+
+ return {min, max};
+ }
+
+ std::pair<T, T> GetMinMax(const ::arrow::Array& values) override;
+
+ private:
+ int type_length_;
+};
+
+// ARROW-11675: A hand-written version of GetMinMax(), to work around
+// what looks like an MSVC code generation bug.
+// This does not seem to be required for GetMinMaxSpaced().
+template <>
+std::pair<int32_t, int32_t>
+TypedComparatorImpl</*is_signed=*/false, Int32Type>::GetMinMax(const int32_t* values,
+ int64_t length) {
+ DCHECK_GT(length, 0);
+
+ const uint32_t* unsigned_values = reinterpret_cast<const uint32_t*>(values);
+ uint32_t min = std::numeric_limits<uint32_t>::max();
+ uint32_t max = std::numeric_limits<uint32_t>::lowest();
+
+ for (int64_t i = 0; i < length; i++) {
+ const auto val = unsigned_values[i];
+ min = std::min<uint32_t>(min, val);
+ max = std::max<uint32_t>(max, val);
+ }
+
+ return {SafeCopy<int32_t>(min), SafeCopy<int32_t>(max)};
+}
+
+template <bool is_signed, typename DType>
+std::pair<typename DType::c_type, typename DType::c_type>
+TypedComparatorImpl<is_signed, DType>::GetMinMax(const ::arrow::Array& values) {
+ ParquetException::NYI(values.type()->ToString());
+}
+
+template <bool is_signed>
+std::pair<ByteArray, ByteArray> GetMinMaxBinaryHelper(
+ const TypedComparatorImpl<is_signed, ByteArrayType>& comparator,
+ const ::arrow::Array& values) {
+ using Helper = CompareHelper<ByteArrayType, is_signed>;
+
+ ByteArray min = Helper::DefaultMin();
+ ByteArray max = Helper::DefaultMax();
+ constexpr int type_length = -1;
+
+ const auto valid_func = [&](ByteArray val) {
+ min = Helper::Min(type_length, val, min);
+ max = Helper::Max(type_length, val, max);
+ };
+ const auto null_func = [&]() {};
+
+ if (::arrow::is_binary_like(values.type_id())) {
+ ::arrow::VisitArrayDataInline<::arrow::BinaryType>(
+ *values.data(), std::move(valid_func), std::move(null_func));
+ } else {
+ DCHECK(::arrow::is_large_binary_like(values.type_id()));
+ ::arrow::VisitArrayDataInline<::arrow::LargeBinaryType>(
+ *values.data(), std::move(valid_func), std::move(null_func));
+ }
+
+ return {min, max};
+}
+
+template <>
+std::pair<ByteArray, ByteArray> TypedComparatorImpl<true, ByteArrayType>::GetMinMax(
+ const ::arrow::Array& values) {
+ return GetMinMaxBinaryHelper<true>(*this, values);
+}
+
+template <>
+std::pair<ByteArray, ByteArray> TypedComparatorImpl<false, ByteArrayType>::GetMinMax(
+ const ::arrow::Array& values) {
+ return GetMinMaxBinaryHelper<false>(*this, values);
+}
+
+template <typename DType>
+class TypedStatisticsImpl : public TypedStatistics<DType> {
+ public:
+ using T = typename DType::c_type;
+
+ TypedStatisticsImpl(const ColumnDescriptor* descr, MemoryPool* pool)
+ : descr_(descr),
+ pool_(pool),
+ min_buffer_(AllocateBuffer(pool_, 0)),
+ max_buffer_(AllocateBuffer(pool_, 0)) {
+ auto comp = Comparator::Make(descr);
+ comparator_ = std::static_pointer_cast<TypedComparator<DType>>(comp);
+ Reset();
+ has_null_count_ = true;
+ has_distinct_count_ = true;
+ }
+
+ TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t null_count,
+ int64_t distinct_count)
+ : pool_(default_memory_pool()),
+ min_buffer_(AllocateBuffer(pool_, 0)),
+ max_buffer_(AllocateBuffer(pool_, 0)) {
+ IncrementNumValues(num_values);
+ IncrementNullCount(null_count);
+ IncrementDistinctCount(distinct_count);
+
+ Copy(min, &min_, min_buffer_.get());
+ Copy(max, &max_, max_buffer_.get());
+ has_min_max_ = true;
+ }
+
+ TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min,
+ const std::string& encoded_max, int64_t num_values,
+ int64_t null_count, int64_t distinct_count, bool has_min_max,
+ bool has_null_count, bool has_distinct_count, MemoryPool* pool)
+ : TypedStatisticsImpl(descr, pool) {
+ IncrementNumValues(num_values);
+ if (has_null_count_) {
+ IncrementNullCount(null_count);
+ }
+ if (has_distinct_count) {
+ IncrementDistinctCount(distinct_count);
+ }
+
+ if (!encoded_min.empty()) {
+ PlainDecode(encoded_min, &min_);
+ }
+ if (!encoded_max.empty()) {
+ PlainDecode(encoded_max, &max_);
+ }
+ has_min_max_ = has_min_max;
+ }
+
+ bool HasDistinctCount() const override { return has_distinct_count_; }
+ bool HasMinMax() const override { return has_min_max_; }
+ bool HasNullCount() const override { return has_null_count_; }
+
+ bool Equals(const Statistics& raw_other) const override {
+ if (physical_type() != raw_other.physical_type()) return false;
+
+ const auto& other = checked_cast<const TypedStatisticsImpl&>(raw_other);
+
+ if (has_min_max_ != other.has_min_max_) return false;
+
+ return (!has_min_max_ || MinMaxEqual(other)) && null_count() == other.null_count() &&
+ distinct_count() == other.distinct_count() &&
+ num_values() == other.num_values();
+ }
+
+ bool MinMaxEqual(const TypedStatisticsImpl& other) const;
+
+ void Reset() override {
+ ResetCounts();
+ has_min_max_ = false;
+ has_distinct_count_ = false;
+ has_null_count_ = false;
+ }
+
+ void SetMinMax(const T& arg_min, const T& arg_max) override {
+ SetMinMaxPair({arg_min, arg_max});
+ }
+
+ void Merge(const TypedStatistics<DType>& other) override {
+ this->num_values_ += other.num_values();
+ if (other.HasNullCount()) {
+ this->statistics_.null_count += other.null_count();
+ }
+ if (other.HasDistinctCount()) {
+ this->statistics_.distinct_count += other.distinct_count();
+ }
+ if (other.HasMinMax()) {
+ SetMinMax(other.min(), other.max());
+ }
+ }
+
+ void Update(const T* values, int64_t num_not_null, int64_t num_null) override;
+ void UpdateSpaced(const T* values, const uint8_t* valid_bits, int64_t valid_bits_offset,
+ int64_t num_not_null, int64_t num_null) override;
+
+ void Update(const ::arrow::Array& values) override {
+ IncrementNullCount(values.null_count());
+ IncrementNumValues(values.length() - values.null_count());
+
+ if (values.null_count() == values.length()) {
+ return;
+ }
+
+ SetMinMaxPair(comparator_->GetMinMax(values));
+ }
+
+ const T& min() const override { return min_; }
+
+ const T& max() const override { return max_; }
+
+ Type::type physical_type() const override { return descr_->physical_type(); }
+
+ const ColumnDescriptor* descr() const override { return descr_; }
+
+ std::string EncodeMin() const override {
+ std::string s;
+ if (HasMinMax()) this->PlainEncode(min_, &s);
+ return s;
+ }
+
+ std::string EncodeMax() const override {
+ std::string s;
+ if (HasMinMax()) this->PlainEncode(max_, &s);
+ return s;
+ }
+
+ EncodedStatistics Encode() override {
+ EncodedStatistics s;
+ if (HasMinMax()) {
+ s.set_min(this->EncodeMin());
+ s.set_max(this->EncodeMax());
+ }
+ if (HasNullCount()) {
+ s.set_null_count(this->null_count());
+ }
+ return s;
+ }
+
+ int64_t null_count() const override { return statistics_.null_count; }
+ int64_t distinct_count() const override { return statistics_.distinct_count; }
+ int64_t num_values() const override { return num_values_; }
+
+ private:
+ const ColumnDescriptor* descr_;
+ bool has_min_max_ = false;
+ bool has_null_count_ = false;
+ bool has_distinct_count_ = false;
+ T min_;
+ T max_;
+ ::arrow::MemoryPool* pool_;
+ int64_t num_values_ = 0;
+ EncodedStatistics statistics_;
+ std::shared_ptr<TypedComparator<DType>> comparator_;
+ std::shared_ptr<ResizableBuffer> min_buffer_, max_buffer_;
+
+ void PlainEncode(const T& src, std::string* dst) const;
+ void PlainDecode(const std::string& src, T* dst) const;
+
+ void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; }
+
+ void IncrementNullCount(int64_t n) {
+ statistics_.null_count += n;
+ has_null_count_ = true;
+ }
+
+ void IncrementNumValues(int64_t n) { num_values_ += n; }
+
+ void IncrementDistinctCount(int64_t n) {
+ statistics_.distinct_count += n;
+ has_distinct_count_ = true;
+ }
+
+ void ResetCounts() {
+ this->statistics_.null_count = 0;
+ this->statistics_.distinct_count = 0;
+ this->num_values_ = 0;
+ }
+
+ void SetMinMaxPair(std::pair<T, T> min_max) {
+ // CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN
+ auto maybe_min_max = CleanStatistic(min_max);
+ if (!maybe_min_max) return;
+
+ auto min = maybe_min_max.value().first;
+ auto max = maybe_min_max.value().second;
+
+ if (!has_min_max_) {
+ has_min_max_ = true;
+ Copy(min, &min_, min_buffer_.get());
+ Copy(max, &max_, max_buffer_.get());
+ } else {
+ Copy(comparator_->Compare(min_, min) ? min_ : min, &min_, min_buffer_.get());
+ Copy(comparator_->Compare(max_, max) ? max : max_, &max_, max_buffer_.get());
+ }
+ }
+};
+
+template <>
+inline bool TypedStatisticsImpl<FLBAType>::MinMaxEqual(
+ const TypedStatisticsImpl<FLBAType>& other) const {
+ uint32_t len = descr_->type_length();
+ return std::memcmp(min_.ptr, other.min_.ptr, len) == 0 &&
+ std::memcmp(max_.ptr, other.max_.ptr, len) == 0;
+}
+
+template <typename DType>
+bool TypedStatisticsImpl<DType>::MinMaxEqual(
+ const TypedStatisticsImpl<DType>& other) const {
+ return min_ == other.min_ && max_ == other.max_;
+}
+
+template <>
+inline void TypedStatisticsImpl<FLBAType>::Copy(const FLBA& src, FLBA* dst,
+ ResizableBuffer* buffer) {
+ if (dst->ptr == src.ptr) return;
+ uint32_t len = descr_->type_length();
+ PARQUET_THROW_NOT_OK(buffer->Resize(len, false));
+ std::memcpy(buffer->mutable_data(), src.ptr, len);
+ *dst = FLBA(buffer->data());
+}
+
+template <>
+inline void TypedStatisticsImpl<ByteArrayType>::Copy(const ByteArray& src, ByteArray* dst,
+ ResizableBuffer* buffer) {
+ if (dst->ptr == src.ptr) return;
+ PARQUET_THROW_NOT_OK(buffer->Resize(src.len, false));
+ std::memcpy(buffer->mutable_data(), src.ptr, src.len);
+ *dst = ByteArray(src.len, buffer->data());
+}
+
+template <typename DType>
+void TypedStatisticsImpl<DType>::Update(const T* values, int64_t num_not_null,
+ int64_t num_null) {
+ DCHECK_GE(num_not_null, 0);
+ DCHECK_GE(num_null, 0);
+
+ IncrementNullCount(num_null);
+ IncrementNumValues(num_not_null);
+
+ if (num_not_null == 0) return;
+ SetMinMaxPair(comparator_->GetMinMax(values, num_not_null));
+}
+
+template <typename DType>
+void TypedStatisticsImpl<DType>::UpdateSpaced(const T* values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset,
+ int64_t num_not_null, int64_t num_null) {
+ DCHECK_GE(num_not_null, 0);
+ DCHECK_GE(num_null, 0);
+
+ IncrementNullCount(num_null);
+ IncrementNumValues(num_not_null);
+
+ if (num_not_null == 0) return;
+
+ int64_t length = num_null + num_not_null;
+ SetMinMaxPair(
+ comparator_->GetMinMaxSpaced(values, length, valid_bits, valid_bits_offset));
+}
+
+template <typename DType>
+void TypedStatisticsImpl<DType>::PlainEncode(const T& src, std::string* dst) const {
+ auto encoder = MakeTypedEncoder<DType>(Encoding::PLAIN, false, descr_, pool_);
+ encoder->Put(&src, 1);
+ auto buffer = encoder->FlushValues();
+ auto ptr = reinterpret_cast<const char*>(buffer->data());
+ dst->assign(ptr, buffer->size());
+}
+
+template <typename DType>
+void TypedStatisticsImpl<DType>::PlainDecode(const std::string& src, T* dst) const {
+ auto decoder = MakeTypedDecoder<DType>(Encoding::PLAIN, descr_);
+ decoder->SetData(1, reinterpret_cast<const uint8_t*>(src.c_str()),
+ static_cast<int>(src.size()));
+ decoder->Decode(dst, 1);
+}
+
+template <>
+void TypedStatisticsImpl<ByteArrayType>::PlainEncode(const T& src,
+ std::string* dst) const {
+ dst->assign(reinterpret_cast<const char*>(src.ptr), src.len);
+}
+
+template <>
+void TypedStatisticsImpl<ByteArrayType>::PlainDecode(const std::string& src,
+ T* dst) const {
+ dst->len = static_cast<uint32_t>(src.size());
+ dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str());
+}
+
+} // namespace
+
+// ----------------------------------------------------------------------
+// Public factory functions
+
+std::shared_ptr<Comparator> Comparator::Make(Type::type physical_type,
+ SortOrder::type sort_order,
+ int type_length) {
+ if (SortOrder::SIGNED == sort_order) {
+ switch (physical_type) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedComparatorImpl<true, BooleanType>>();
+ case Type::INT32:
+ return std::make_shared<TypedComparatorImpl<true, Int32Type>>();
+ case Type::INT64:
+ return std::make_shared<TypedComparatorImpl<true, Int64Type>>();
+ case Type::INT96:
+ return std::make_shared<TypedComparatorImpl<true, Int96Type>>();
+ case Type::FLOAT:
+ return std::make_shared<TypedComparatorImpl<true, FloatType>>();
+ case Type::DOUBLE:
+ return std::make_shared<TypedComparatorImpl<true, DoubleType>>();
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedComparatorImpl<true, ByteArrayType>>();
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedComparatorImpl<true, FLBAType>>(type_length);
+ default:
+ ParquetException::NYI("Signed Compare not implemented");
+ }
+ } else if (SortOrder::UNSIGNED == sort_order) {
+ switch (physical_type) {
+ case Type::INT32:
+ return std::make_shared<TypedComparatorImpl<false, Int32Type>>();
+ case Type::INT64:
+ return std::make_shared<TypedComparatorImpl<false, Int64Type>>();
+ case Type::INT96:
+ return std::make_shared<TypedComparatorImpl<false, Int96Type>>();
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedComparatorImpl<false, ByteArrayType>>();
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedComparatorImpl<false, FLBAType>>(type_length);
+ default:
+ ParquetException::NYI("Unsigned Compare not implemented");
+ }
+ } else {
+ throw ParquetException("UNKNOWN Sort Order");
+ }
+ return nullptr;
+}
+
+std::shared_ptr<Comparator> Comparator::Make(const ColumnDescriptor* descr) {
+ return Make(descr->physical_type(), descr->sort_order(), descr->type_length());
+}
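+
+// Usage sketch (editorial, not part of the upstream sources): obtaining a
+// typed comparator and reducing a batch to its min/max pair.
+//
+//   auto comp = MakeComparator<Int32Type>(Type::INT32, SortOrder::UNSIGNED);
+//   const int32_t values[] = {3, -1, 7};
+//   std::pair<int32_t, int32_t> mm = comp->GetMinMax(values, 3);
+//   // mm.first == 3 and mm.second == -1: under the unsigned sort order,
+//   // -1 is 0xFFFFFFFFu and therefore the largest value in the batch.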
+
+std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool) {
+ switch (descr->physical_type()) {
+ case Type::BOOLEAN:
+ return std::make_shared<TypedStatisticsImpl<BooleanType>>(descr, pool);
+ case Type::INT32:
+ return std::make_shared<TypedStatisticsImpl<Int32Type>>(descr, pool);
+ case Type::INT64:
+ return std::make_shared<TypedStatisticsImpl<Int64Type>>(descr, pool);
+ case Type::FLOAT:
+ return std::make_shared<TypedStatisticsImpl<FloatType>>(descr, pool);
+ case Type::DOUBLE:
+ return std::make_shared<TypedStatisticsImpl<DoubleType>>(descr, pool);
+ case Type::BYTE_ARRAY:
+ return std::make_shared<TypedStatisticsImpl<ByteArrayType>>(descr, pool);
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return std::make_shared<TypedStatisticsImpl<FLBAType>>(descr, pool);
+ default:
+ ParquetException::NYI("Statistics not implemented");
+ }
+}
+
+std::shared_ptr<Statistics> Statistics::Make(Type::type physical_type, const void* min,
+ const void* max, int64_t num_values,
+ int64_t null_count, int64_t distinct_count) {
+#define MAKE_STATS(CAP_TYPE, KLASS) \
+ case Type::CAP_TYPE: \
+ return std::make_shared<TypedStatisticsImpl<KLASS>>( \
+ *reinterpret_cast<const typename KLASS::c_type*>(min), \
+ *reinterpret_cast<const typename KLASS::c_type*>(max), num_values, null_count, \
+ distinct_count)
+
+ switch (physical_type) {
+ MAKE_STATS(BOOLEAN, BooleanType);
+ MAKE_STATS(INT32, Int32Type);
+ MAKE_STATS(INT64, Int64Type);
+ MAKE_STATS(FLOAT, FloatType);
+ MAKE_STATS(DOUBLE, DoubleType);
+ MAKE_STATS(BYTE_ARRAY, ByteArrayType);
+ MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType);
+ default:
+ break;
+ }
+#undef MAKE_STATS
+ DCHECK(false) << "Cannot reach here";
+ return nullptr;
+}
+
+std::shared_ptr<Statistics> Statistics::Make(const ColumnDescriptor* descr,
+ const std::string& encoded_min,
+ const std::string& encoded_max,
+ int64_t num_values, int64_t null_count,
+ int64_t distinct_count, bool has_min_max,
+ bool has_null_count, bool has_distinct_count,
+ ::arrow::MemoryPool* pool) {
+#define MAKE_STATS(CAP_TYPE, KLASS) \
+ case Type::CAP_TYPE: \
+ return std::make_shared<TypedStatisticsImpl<KLASS>>( \
+ descr, encoded_min, encoded_max, num_values, null_count, distinct_count, \
+ has_min_max, has_null_count, has_distinct_count, pool)
+
+ switch (descr->physical_type()) {
+ MAKE_STATS(BOOLEAN, BooleanType);
+ MAKE_STATS(INT32, Int32Type);
+ MAKE_STATS(INT64, Int64Type);
+ MAKE_STATS(FLOAT, FloatType);
+ MAKE_STATS(DOUBLE, DoubleType);
+ MAKE_STATS(BYTE_ARRAY, ByteArrayType);
+ MAKE_STATS(FIXED_LEN_BYTE_ARRAY, FLBAType);
+ default:
+ break;
+ }
+#undef MAKE_STATS
+ DCHECK(false) << "Cannot reach here";
+ return nullptr;
+}
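+
+// Usage sketch (editorial): a typical write-side round trip through the
+// factories above, assuming `descr` points at the ColumnDescriptor of an
+// INT64 column.
+//
+//   auto stats = MakeStatistics<Int64Type>(descr);
+//   const int64_t values[] = {5, 2, 9};
+//   stats->Update(values, /*num_not_null=*/3, /*num_null=*/1);
+//   EncodedStatistics encoded = stats->Encode();
+//   // encoded.has_min and encoded.has_max are set (min 2, max 9) and
+//   // encoded.null_count == 1.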
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/statistics.h b/contrib/libs/apache/arrow/cpp/src/parquet/statistics.h
index 1242180000c..18f68f21b87 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/statistics.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/statistics.h
@@ -1,342 +1,342 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <utility>
-
-#include "parquet/platform.h"
-#include "parquet/types.h"
-
-namespace arrow {
-
-class Array;
-class BinaryArray;
-
-} // namespace arrow
-
-namespace parquet {
-
-class ColumnDescriptor;
-
-// ----------------------------------------------------------------------
-// Value comparator interfaces
-
-/// \brief Base class for value comparators. Generally used with
-/// TypedComparator<T>
-class PARQUET_EXPORT Comparator {
- public:
- virtual ~Comparator() {}
-
- /// \brief Create a comparator explicitly from physical type and
- /// sort order
- /// \param[in] physical_type the physical type for the typed
- /// comparator
- /// \param[in] sort_order either SortOrder::SIGNED or
- /// SortOrder::UNSIGNED
- /// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only
- static std::shared_ptr<Comparator> Make(Type::type physical_type,
- SortOrder::type sort_order,
- int type_length = -1);
-
- /// \brief Create typed comparator inferring default sort order from
- /// ColumnDescriptor
- /// \param[in] descr the Parquet column schema
- static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr);
-};
-
-/// \brief Interface for comparison of physical types according to the
-/// semantics of a particular logical type.
-template <typename DType>
-class TypedComparator : public Comparator {
- public:
- using T = typename DType::c_type;
-
- /// \brief Scalar comparison of two elements, return true if first
- /// is strictly less than the second
- virtual bool Compare(const T& a, const T& b) = 0;
-
- /// \brief Compute maximum and minimum elements in a batch of
- /// elements without any nulls
- virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) = 0;
-
- /// \brief Compute minimum and maximum elements from an Arrow array. Only
- /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY
- /// / arrow::BinaryArray
- virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) = 0;
-
- /// \brief Compute maximum and minimum elements in a batch of
- /// elements with accompanying bitmap indicating which elements are
- /// included (bit set) and excluded (bit not set)
- ///
- /// \param[in] values the sequence of values
- /// \param[in] length the length of the sequence
- /// \param[in] valid_bits a bitmap indicating which elements are
- /// included (1) or excluded (0)
- /// \param[in] valid_bits_offset the bit offset into the bitmap of
- /// the first element in the sequence
- virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
- const uint8_t* valid_bits,
- int64_t valid_bits_offset) = 0;
-};
-
-/// \brief Typed version of Comparator::Make
-template <typename DType>
-std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type,
- SortOrder::type sort_order,
- int type_length = -1) {
- return std::static_pointer_cast<TypedComparator<DType>>(
- Comparator::Make(physical_type, sort_order, type_length));
-}
-
-/// \brief Typed version of Comparator::Make
-template <typename DType>
-std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) {
- return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
-}
-
-// ----------------------------------------------------------------------
-
-/// \brief Structure representing encoded statistics to be written to
-/// and read from Parquet serialized metadata
-class PARQUET_EXPORT EncodedStatistics {
- std::shared_ptr<std::string> max_, min_;
- bool is_signed_ = false;
-
- public:
- EncodedStatistics()
- : max_(std::make_shared<std::string>()), min_(std::make_shared<std::string>()) {}
-
- const std::string& max() const { return *max_; }
- const std::string& min() const { return *min_; }
-
- int64_t null_count = 0;
- int64_t distinct_count = 0;
-
- bool has_min = false;
- bool has_max = false;
- bool has_null_count = false;
- bool has_distinct_count = false;
-
- // From parquet-mr:
- // Rather than truncating, don't write stats larger than the max size at
- // all. The rationale is that some engines may use the minimum value in
- // the page as the true minimum for aggregations, and there is no way to
- // mark a truncated value as only a lower bound rather than a value that
- // is actually present in the page.
- void ApplyStatSizeLimits(size_t length) {
- if (max_->length() > length) {
- has_max = false;
- }
- if (min_->length() > length) {
- has_min = false;
- }
- }
-
- bool is_set() const {
- return has_min || has_max || has_null_count || has_distinct_count;
- }
-
- bool is_signed() const { return is_signed_; }
-
- void set_is_signed(bool is_signed) { is_signed_ = is_signed; }
-
- EncodedStatistics& set_max(const std::string& value) {
- *max_ = value;
- has_max = true;
- return *this;
- }
-
- EncodedStatistics& set_min(const std::string& value) {
- *min_ = value;
- has_min = true;
- return *this;
- }
-
- EncodedStatistics& set_null_count(int64_t value) {
- null_count = value;
- has_null_count = true;
- return *this;
- }
-
- EncodedStatistics& set_distinct_count(int64_t value) {
- distinct_count = value;
- has_distinct_count = true;
- return *this;
- }
-};
-
-/// \brief Base type for computing column statistics while writing a file
-class PARQUET_EXPORT Statistics {
- public:
- virtual ~Statistics() {}
-
- /// \brief Create a new statistics instance given a column schema
- /// definition
- /// \param[in] descr the column schema
- /// \param[in] pool a memory pool to use for any memory allocations, optional
- static std::shared_ptr<Statistics> Make(
- const ColumnDescriptor* descr,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
- /// \brief Create a new statistics instance given a column schema
- /// definition and pre-existing state
- /// \param[in] descr the column schema
- /// \param[in] encoded_min the encoded minimum value
- /// \param[in] encoded_max the encoded maximum value
- /// \param[in] num_values total number of values
- /// \param[in] null_count number of null values
- /// \param[in] distinct_count number of distinct values
- /// \param[in] has_min_max whether the min/max statistics are set
- /// \param[in] has_null_count whether the null_count statistics are set
- /// \param[in] has_distinct_count whether the distinct_count statistics are set
- /// \param[in] pool a memory pool to use for any memory allocations, optional
- static std::shared_ptr<Statistics> Make(
- const ColumnDescriptor* descr, const std::string& encoded_min,
- const std::string& encoded_max, int64_t num_values, int64_t null_count,
- int64_t distinct_count, bool has_min_max, bool has_null_count,
- bool has_distinct_count,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
-
- /// \brief Return true if the count of null values is set
- virtual bool HasNullCount() const = 0;
-
- /// \brief The number of null values, may not be set
- virtual int64_t null_count() const = 0;
-
- /// \brief Return true if the count of distinct values is set
- virtual bool HasDistinctCount() const = 0;
-
- /// \brief The number of distinct values, may not be set
- virtual int64_t distinct_count() const = 0;
-
- /// \brief The total number of values in the column
- virtual int64_t num_values() const = 0;
-
- /// \brief Return true if the min and max statistics are set. Obtain
- /// with TypedStatistics<T>::min and max
- virtual bool HasMinMax() const = 0;
-
- /// \brief Reset state of object to initial (no data observed) state
- virtual void Reset() = 0;
-
- /// \brief Plain-encoded minimum value
- virtual std::string EncodeMin() const = 0;
-
- /// \brief Plain-encoded maximum value
- virtual std::string EncodeMax() const = 0;
-
- /// \brief The finalized encoded form of the statistics for transport
- virtual EncodedStatistics Encode() = 0;
-
- /// \brief The physical type of the column schema
- virtual Type::type physical_type() const = 0;
-
- /// \brief The full type descriptor from the column schema
- virtual const ColumnDescriptor* descr() const = 0;
-
- /// \brief Check two Statistics for equality
- virtual bool Equals(const Statistics& other) const = 0;
-
- protected:
- static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min,
- const void* max, int64_t num_values,
- int64_t null_count, int64_t distinct_count);
-};
-
-/// \brief A typed implementation of Statistics
-template <typename DType>
-class TypedStatistics : public Statistics {
- public:
- using T = typename DType::c_type;
-
- /// \brief The current minimum value
- virtual const T& min() const = 0;
-
- /// \brief The current maximum value
- virtual const T& max() const = 0;
-
- /// \brief Update state with state of another Statistics object
- virtual void Merge(const TypedStatistics<DType>& other) = 0;
-
- /// \brief Batch statistics update
- virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0;
-
- /// \brief Batch statistics update with supplied validity bitmap
- virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
- int64_t valid_bits_offset, int64_t num_not_null,
- int64_t num_null) = 0;
-
- /// \brief EXPERIMENTAL: Update statistics with an Arrow array without
- /// conversion to a primitive Parquet C type. Only implemented for certain
- /// Parquet type / Arrow type combinations like BYTE_ARRAY /
- /// arrow::BinaryArray
- virtual void Update(const ::arrow::Array& values) = 0;
-
- /// \brief Set min and max values to particular values
- virtual void SetMinMax(const T& min, const T& max) = 0;
-};
-
-using BoolStatistics = TypedStatistics<BooleanType>;
-using Int32Statistics = TypedStatistics<Int32Type>;
-using Int64Statistics = TypedStatistics<Int64Type>;
-using FloatStatistics = TypedStatistics<FloatType>;
-using DoubleStatistics = TypedStatistics<DoubleType>;
-using ByteArrayStatistics = TypedStatistics<ByteArrayType>;
-using FLBAStatistics = TypedStatistics<FLBAType>;
-
-/// \brief Typed version of Statistics::Make
-template <typename DType>
-std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
- const ColumnDescriptor* descr,
- ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
- return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool));
-}
-
-/// \brief Create Statistics initialized to a particular state
-/// \param[in] min the minimum value
-/// \param[in] max the maximum value
-/// \param[in] num_values number of values
-/// \param[in] null_count number of null values
-/// \param[in] distinct_count number of distinct values
-template <typename DType>
-std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min,
- const typename DType::c_type& max,
- int64_t num_values,
- int64_t null_count,
- int64_t distinct_count) {
- return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
- DType::type_num, &min, &max, num_values, null_count, distinct_count));
-}
-
-/// \brief Typed version of Statistics::Make
-template <typename DType>
-std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
- const ColumnDescriptor* descr, const std::string& encoded_min,
- const std::string& encoded_max, int64_t num_values, int64_t null_count,
- int64_t distinct_count, bool has_min_max, bool has_null_count,
- bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
- return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
- descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
- has_min_max, has_null_count, has_distinct_count, pool));
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+class BinaryArray;
+
+} // namespace arrow
+
+namespace parquet {
+
+class ColumnDescriptor;
+
+// ----------------------------------------------------------------------
+// Value comparator interfaces
+
+/// \brief Base class for value comparators. Generally used with
+/// TypedComparator<T>
+class PARQUET_EXPORT Comparator {
+ public:
+ virtual ~Comparator() {}
+
+ /// \brief Create a comparator explicitly from physical type and
+ /// sort order
+ /// \param[in] physical_type the physical type for the typed
+ /// comparator
+ /// \param[in] sort_order either SortOrder::SIGNED or
+ /// SortOrder::UNSIGNED
+ /// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only
+ static std::shared_ptr<Comparator> Make(Type::type physical_type,
+ SortOrder::type sort_order,
+ int type_length = -1);
+
+ /// \brief Create typed comparator inferring default sort order from
+ /// ColumnDescriptor
+ /// \param[in] descr the Parquet column schema
+ static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr);
+};
+
+/// \brief Interface for comparison of physical types according to the
+/// semantics of a particular logical type.
+template <typename DType>
+class TypedComparator : public Comparator {
+ public:
+ using T = typename DType::c_type;
+
+ /// \brief Scalar comparison of two elements; returns true if the first
+ /// is strictly less than the second
+ virtual bool Compare(const T& a, const T& b) = 0;
+
+ /// \brief Compute the minimum and maximum elements in a batch of
+ /// elements without any nulls
+ virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) = 0;
+
+ /// \brief Compute minimum and maximum elements from an Arrow array. Only
+ /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY
+ /// / arrow::BinaryArray
+ virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) = 0;
+
+ /// \brief Compute the minimum and maximum elements in a batch of
+ /// elements with an accompanying bitmap indicating which elements are
+ /// included (bit set) and excluded (bit not set)
+ ///
+ /// \param[in] values the sequence of values
+ /// \param[in] length the length of the sequence
+ /// \param[in] valid_bits a bitmap indicating which elements are
+ /// included (1) or excluded (0)
+ /// \param[in] valid_bits_offset the bit offset into the bitmap of
+ /// the first element in the sequence
+ virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
+ const uint8_t* valid_bits,
+ int64_t valid_bits_offset) = 0;
+};
+
+/// \brief Typed version of Comparator::Make
+template <typename DType>
+std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type,
+ SortOrder::type sort_order,
+ int type_length = -1) {
+ return std::static_pointer_cast<TypedComparator<DType>>(
+ Comparator::Make(physical_type, sort_order, type_length));
+}
+
+/// \brief Typed version of Comparator::Make
+template <typename DType>
+std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) {
+ return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
+}
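+
+// Illustrative usage sketch (not part of the upstream header): building a
+// typed comparator for a signed INT32 column and querying it. The `values`
+// buffer is a hypothetical caller-provided array.
+//
+//   auto cmp = MakeComparator<Int32Type>(Type::INT32, SortOrder::SIGNED);
+//   bool less = cmp->Compare(3, 7);  // true: 3 < 7 under signed ordering
+//   std::vector<int32_t> values = {5, 1, 9};
+//   std::pair<int32_t, int32_t> min_max =
+//       cmp->GetMinMax(values.data(), values.size());  // {1, 9}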
+
+// ----------------------------------------------------------------------
+
+/// \brief Structure representing encoded statistics to be written to
+/// and read from Parquet serialized metadata
+class PARQUET_EXPORT EncodedStatistics {
+ std::shared_ptr<std::string> max_, min_;
+ bool is_signed_ = false;
+
+ public:
+ EncodedStatistics()
+ : max_(std::make_shared<std::string>()), min_(std::make_shared<std::string>()) {}
+
+ const std::string& max() const { return *max_; }
+ const std::string& min() const { return *min_; }
+
+ int64_t null_count = 0;
+ int64_t distinct_count = 0;
+
+ bool has_min = false;
+ bool has_max = false;
+ bool has_null_count = false;
+ bool has_distinct_count = false;
+
+ // From parquet-mr:
+ // Rather than truncating, don't write stats larger than the max size. The
+ // rationale is that some engines may use the minimum value in the page as
+ // the true minimum for aggregations and there is no way to mark that a
+ // value has been truncated and is a lower bound and not in the page.
+ void ApplyStatSizeLimits(size_t length) {
+ if (max_->length() > length) {
+ has_max = false;
+ }
+ if (min_->length() > length) {
+ has_min = false;
+ }
+ }
+
+ bool is_set() const {
+ return has_min || has_max || has_null_count || has_distinct_count;
+ }
+
+ bool is_signed() const { return is_signed_; }
+
+ void set_is_signed(bool is_signed) { is_signed_ = is_signed; }
+
+ EncodedStatistics& set_max(const std::string& value) {
+ *max_ = value;
+ has_max = true;
+ return *this;
+ }
+
+ EncodedStatistics& set_min(const std::string& value) {
+ *min_ = value;
+ has_min = true;
+ return *this;
+ }
+
+ EncodedStatistics& set_null_count(int64_t value) {
+ null_count = value;
+ has_null_count = true;
+ return *this;
+ }
+
+ EncodedStatistics& set_distinct_count(int64_t value) {
+ distinct_count = value;
+ has_distinct_count = true;
+ return *this;
+ }
+};
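+
+// Illustrative usage sketch (not part of the upstream header): populating an
+// EncodedStatistics by hand and enforcing a hypothetical 16-byte stat size
+// limit. Over-long values are dropped rather than truncated, per the
+// rationale documented in ApplyStatSizeLimits above.
+//
+//   EncodedStatistics stats;
+//   stats.set_min("aardvark").set_max("zebra").set_null_count(0);
+//   stats.ApplyStatSizeLimits(16);  // both values fit; has_min/has_max stay true
+//   bool usable = stats.is_set();   // true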
+
+/// \brief Base type for computing column statistics while writing a file
+class PARQUET_EXPORT Statistics {
+ public:
+ virtual ~Statistics() {}
+
+ /// \brief Create a new statistics instance given a column schema
+ /// definition
+ /// \param[in] descr the column schema
+ /// \param[in] pool a memory pool to use for any memory allocations, optional
+ static std::shared_ptr<Statistics> Make(
+ const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ /// \brief Create a new statistics instance given a column schema
+ /// definition and pre-existing state
+ /// \param[in] descr the column schema
+ /// \param[in] encoded_min the encoded minimum value
+ /// \param[in] encoded_max the encoded maximum value
+ /// \param[in] num_values total number of values
+ /// \param[in] null_count number of null values
+ /// \param[in] distinct_count number of distinct values
+ /// \param[in] has_min_max whether the min/max statistics are set
+ /// \param[in] has_null_count whether the null_count statistics are set
+ /// \param[in] has_distinct_count whether the distinct_count statistics are set
+ /// \param[in] pool a memory pool to use for any memory allocations, optional
+ static std::shared_ptr<Statistics> Make(
+ const ColumnDescriptor* descr, const std::string& encoded_min,
+ const std::string& encoded_max, int64_t num_values, int64_t null_count,
+ int64_t distinct_count, bool has_min_max, bool has_null_count,
+ bool has_distinct_count,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+ /// \brief Return true if the count of null values is set
+ virtual bool HasNullCount() const = 0;
+
+ /// \brief The number of null values, may not be set
+ virtual int64_t null_count() const = 0;
+
+ /// \brief Return true if the count of distinct values is set
+ virtual bool HasDistinctCount() const = 0;
+
+ /// \brief The number of distinct values, may not be set
+ virtual int64_t distinct_count() const = 0;
+
+ /// \brief The total number of values in the column
+ virtual int64_t num_values() const = 0;
+
+ /// \brief Return true if the min and max statistics are set. Obtain
+ /// with TypedStatistics<T>::min and max
+ virtual bool HasMinMax() const = 0;
+
+ /// \brief Reset state of object to initial (no data observed) state
+ virtual void Reset() = 0;
+
+ /// \brief Plain-encoded minimum value
+ virtual std::string EncodeMin() const = 0;
+
+ /// \brief Plain-encoded maximum value
+ virtual std::string EncodeMax() const = 0;
+
+ /// \brief The finalized encoded form of the statistics for transport
+ virtual EncodedStatistics Encode() = 0;
+
+ /// \brief The physical type of the column schema
+ virtual Type::type physical_type() const = 0;
+
+ /// \brief The full type descriptor from the column schema
+ virtual const ColumnDescriptor* descr() const = 0;
+
+ /// \brief Check two Statistics for equality
+ virtual bool Equals(const Statistics& other) const = 0;
+
+ protected:
+ static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min,
+ const void* max, int64_t num_values,
+ int64_t null_count, int64_t distinct_count);
+};
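+
+// Illustrative sketch (not part of the upstream header): reconstructing
+// statistics from pre-existing encoded state, e.g. when metadata is read
+// back. `descr`, `enc_min` and `enc_max` are hypothetical inputs.
+//
+//   std::shared_ptr<Statistics> stats = Statistics::Make(
+//       descr, enc_min, enc_max, /*num_values=*/100, /*null_count=*/2,
+//       /*distinct_count=*/0, /*has_min_max=*/true, /*has_null_count=*/true,
+//       /*has_distinct_count=*/false);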
+
+/// \brief A typed implementation of Statistics
+template <typename DType>
+class TypedStatistics : public Statistics {
+ public:
+ using T = typename DType::c_type;
+
+ /// \brief The current minimum value
+ virtual const T& min() const = 0;
+
+ /// \brief The current maximum value
+ virtual const T& max() const = 0;
+
+ /// \brief Update state with state of another Statistics object
+ virtual void Merge(const TypedStatistics<DType>& other) = 0;
+
+ /// \brief Batch statistics update
+ virtual void Update(const T* values, int64_t num_not_null, int64_t num_null) = 0;
+
+ /// \brief Batch statistics update with supplied validity bitmap
+ virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
+ int64_t valid_bits_offset, int64_t num_not_null,
+ int64_t num_null) = 0;
+
+ /// \brief EXPERIMENTAL: Update statistics with an Arrow array without
+ /// conversion to a primitive Parquet C type. Only implemented for certain
+ /// Parquet type / Arrow type combinations like BYTE_ARRAY /
+ /// arrow::BinaryArray
+ virtual void Update(const ::arrow::Array& values) = 0;
+
+ /// \brief Set min and max values to particular values
+ virtual void SetMinMax(const T& min, const T& max) = 0;
+};
+
+using BoolStatistics = TypedStatistics<BooleanType>;
+using Int32Statistics = TypedStatistics<Int32Type>;
+using Int64Statistics = TypedStatistics<Int64Type>;
+using FloatStatistics = TypedStatistics<FloatType>;
+using DoubleStatistics = TypedStatistics<DoubleType>;
+using ByteArrayStatistics = TypedStatistics<ByteArrayType>;
+using FLBAStatistics = TypedStatistics<FLBAType>;
+
+/// \brief Typed version of Statistics::Make
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
+ const ColumnDescriptor* descr,
+ ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+ return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool));
+}
+
+/// \brief Create Statistics initialized to a particular state
+/// \param[in] min the minimum value
+/// \param[in] max the maximum value
+/// \param[in] num_values number of values
+/// \param[in] null_count number of null values
+/// \param[in] distinct_count number of distinct values
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min,
+ const typename DType::c_type& max,
+ int64_t num_values,
+ int64_t null_count,
+ int64_t distinct_count) {
+ return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
+ DType::type_num, &min, &max, num_values, null_count, distinct_count));
+}
+
+/// \brief Typed version of Statistics::Make
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
+ const ColumnDescriptor* descr, const std::string& encoded_min,
+ const std::string& encoded_max, int64_t num_values, int64_t null_count,
+ int64_t distinct_count, bool has_min_max, bool has_null_count,
+ bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+ return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
+ descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
+ has_min_max, has_null_count, has_distinct_count, pool));
+}
+
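+// Illustrative usage sketch (not part of the upstream header): accumulating
+// typed statistics while writing a column. `descr` is a hypothetical
+// ColumnDescriptor* describing a required INT64 column.
+//
+//   std::shared_ptr<Int64Statistics> stats = MakeStatistics<Int64Type>(descr);
+//   int64_t batch[] = {4, 8, 15};
+//   stats->Update(batch, /*num_not_null=*/3, /*num_null=*/0);
+//   EncodedStatistics encoded = stats->Encode();  // ready for file metadata
+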
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc b/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc
index af7a35ddbc1..9a7cc8cdf86 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.cc
@@ -1,521 +1,521 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/stream_reader.h"
-
-#include <set>
-#include <utility>
-
-namespace parquet {
-
-constexpr int64_t StreamReader::kBatchSizeOne;
-
-// The converted type expected by the stream reader does not always
-// exactly match the schema in the Parquet file. The following
-// is a list of converted types which are allowed instead of the
-// expected converted type.
-// Each pair given is:
-// {<StreamReader expected type>, <Parquet file converted type>}
-// So for example {ConvertedType::INT_32, ConvertedType::NONE} means
-// that if the StreamReader was expecting the converted type INT_32,
-// then it will allow the Parquet file to use the converted type
-// NONE.
-//
-static const std::set<std::pair<ConvertedType::type, ConvertedType::type> >
- converted_type_exceptions = {{ConvertedType::INT_32, ConvertedType::NONE},
- {ConvertedType::INT_64, ConvertedType::NONE},
- {ConvertedType::INT_32, ConvertedType::DECIMAL},
- {ConvertedType::INT_64, ConvertedType::DECIMAL},
- {ConvertedType::UTF8, ConvertedType::NONE}};
-
-StreamReader::StreamReader(std::unique_ptr<ParquetFileReader> reader)
- : file_reader_{std::move(reader)}, eof_{false} {
- file_metadata_ = file_reader_->metadata();
-
- auto schema = file_metadata_->schema();
- auto group_node = schema->group_node();
-
- nodes_.resize(schema->num_columns());
-
- for (auto i = 0; i < schema->num_columns(); ++i) {
- nodes_[i] = std::static_pointer_cast<schema::PrimitiveNode>(group_node->field(i));
- }
- NextRowGroup();
-}
-
-int StreamReader::num_columns() const {
- // Check for file metadata, i.e. the object was not default constructed.
- if (file_metadata_) {
- return file_metadata_->num_columns();
- }
- return 0;
-}
-
-int64_t StreamReader::num_rows() const {
- // Check for file metadata, i.e. the object was not default constructed.
- if (file_metadata_) {
- return file_metadata_->num_rows();
- }
- return 0;
-}
-
-StreamReader& StreamReader::operator>>(bool& v) {
- CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
- Read<BoolReader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(int8_t& v) {
- CheckColumn(Type::INT32, ConvertedType::INT_8);
- Read<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(uint8_t& v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_8);
- Read<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(int16_t& v) {
- CheckColumn(Type::INT32, ConvertedType::INT_16);
- Read<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(uint16_t& v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_16);
- Read<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(int32_t& v) {
- CheckColumn(Type::INT32, ConvertedType::INT_32);
- Read<Int32Reader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(uint32_t& v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_32);
- Read<Int32Reader>(reinterpret_cast<int32_t*>(&v));
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(int64_t& v) {
- CheckColumn(Type::INT64, ConvertedType::INT_64);
- Read<Int64Reader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(uint64_t& v) {
- CheckColumn(Type::INT64, ConvertedType::UINT_64);
- Read<Int64Reader>(reinterpret_cast<int64_t*>(&v));
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(std::chrono::milliseconds& v) {
- CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
- int64_t tmp;
- Read<Int64Reader>(&tmp);
- v = std::chrono::milliseconds{tmp};
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(std::chrono::microseconds& v) {
- CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
- int64_t tmp;
- Read<Int64Reader>(&tmp);
- v = std::chrono::microseconds{tmp};
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(float& v) {
- CheckColumn(Type::FLOAT, ConvertedType::NONE);
- Read<FloatReader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(double& v) {
- CheckColumn(Type::DOUBLE, ConvertedType::NONE);
- Read<DoubleReader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(char& v) {
- CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, 1);
- FixedLenByteArray flba;
-
- Read(&flba);
- v = static_cast<char>(flba.ptr[0]);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(std::string& v) {
- CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
- ByteArray ba;
-
- Read(&ba);
- v = std::string(reinterpret_cast<const char*>(ba.ptr), ba.len);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<bool>& v) {
- CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
- ReadOptional<BoolReader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<int8_t>& v) {
- CheckColumn(Type::INT32, ConvertedType::INT_8);
- ReadOptional<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<uint8_t>& v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_8);
- ReadOptional<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<int16_t>& v) {
- CheckColumn(Type::INT32, ConvertedType::INT_16);
- ReadOptional<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<uint16_t>& v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_16);
- ReadOptional<Int32Reader, int32_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<int32_t>& v) {
- CheckColumn(Type::INT32, ConvertedType::INT_32);
- ReadOptional<Int32Reader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<uint32_t>& v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_32);
- ReadOptional<Int32Reader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<int64_t>& v) {
- CheckColumn(Type::INT64, ConvertedType::INT_64);
- ReadOptional<Int64Reader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<uint64_t>& v) {
- CheckColumn(Type::INT64, ConvertedType::UINT_64);
- ReadOptional<Int64Reader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<float>& v) {
- CheckColumn(Type::FLOAT, ConvertedType::NONE);
- ReadOptional<FloatReader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<double>& v) {
- CheckColumn(Type::DOUBLE, ConvertedType::NONE);
- ReadOptional<DoubleReader>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<std::chrono::milliseconds>& v) {
- CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
- ReadOptional<Int64Reader, int64_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<std::chrono::microseconds>& v) {
- CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
- ReadOptional<Int64Reader, int64_t>(&v);
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<char>& v) {
- CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, 1);
- FixedLenByteArray flba;
-
- if (ReadOptional(&flba)) {
- v = static_cast<char>(flba.ptr[0]);
- } else {
- v.reset();
- }
- return *this;
-}
-
-StreamReader& StreamReader::operator>>(optional<std::string>& v) {
- CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
- ByteArray ba;
-
- if (ReadOptional(&ba)) {
- v = std::string(reinterpret_cast<const char*>(ba.ptr), ba.len);
- } else {
- v.reset();
- }
- return *this;
-}
-
-void StreamReader::ReadFixedLength(char* ptr, int len) {
- CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, len);
- FixedLenByteArray flba;
- Read(&flba);
- std::memcpy(ptr, flba.ptr, len);
-}
-
-void StreamReader::Read(ByteArray* v) {
- const auto& node = nodes_[column_index_];
- auto reader = static_cast<ByteArrayReader*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
-
- if (values_read != 1) {
- ThrowReadFailedException(node);
- }
-}
-
-bool StreamReader::ReadOptional(ByteArray* v) {
- const auto& node = nodes_[column_index_];
- auto reader = static_cast<ByteArrayReader*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
-
- if (values_read == 1) {
- return true;
- } else if ((values_read == 0) && (def_level == 0)) {
- return false;
- }
- ThrowReadFailedException(node);
-}
-
-void StreamReader::Read(FixedLenByteArray* v) {
- const auto& node = nodes_[column_index_];
- auto reader =
- static_cast<FixedLenByteArrayReader*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
-
- if (values_read != 1) {
- ThrowReadFailedException(node);
- }
-}
-
-bool StreamReader::ReadOptional(FixedLenByteArray* v) {
- const auto& node = nodes_[column_index_];
- auto reader =
- static_cast<FixedLenByteArrayReader*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
-
- if (values_read == 1) {
- return true;
- } else if ((values_read == 0) && (def_level == 0)) {
- return false;
- }
- ThrowReadFailedException(node);
-}
-
-void StreamReader::EndRow() {
- if (!file_reader_) {
- throw ParquetException("StreamReader not initialized");
- }
- if (static_cast<std::size_t>(column_index_) < nodes_.size()) {
- throw ParquetException("Cannot end row with " + std::to_string(column_index_) +
- " of " + std::to_string(nodes_.size()) + " columns read");
- }
- column_index_ = 0;
- ++current_row_;
-
- if (!column_readers_[0]->HasNext()) {
- NextRowGroup();
- }
-}
-
-void StreamReader::NextRowGroup() {
- // Find the next non-empty row group.
- while (row_group_index_ < file_metadata_->num_row_groups()) {
- row_group_reader_ = file_reader_->RowGroup(row_group_index_);
- ++row_group_index_;
-
- column_readers_.resize(file_metadata_->num_columns());
-
- for (int i = 0; i < file_metadata_->num_columns(); ++i) {
- column_readers_[i] = row_group_reader_->Column(i);
- }
- if (column_readers_[0]->HasNext()) {
- row_group_row_offset_ = current_row_;
- return;
- }
- }
- // No more row groups found.
- SetEof();
-}
-
-void StreamReader::SetEof() {
- // Do not reset file_metadata_ to ensure queries on the number of
- // rows/columns still function.
- eof_ = true;
- file_reader_.reset();
- row_group_reader_.reset();
- column_readers_.clear();
- nodes_.clear();
-}
-
-int64_t StreamReader::SkipRows(int64_t num_rows_to_skip) {
- if (0 != column_index_) {
- throw ParquetException("Must finish reading current row before skipping rows.");
- }
- int64_t num_rows_remaining_to_skip = num_rows_to_skip;
-
- while (!eof_ && (num_rows_remaining_to_skip > 0)) {
- int64_t num_rows_in_row_group = row_group_reader_->metadata()->num_rows();
- int64_t num_rows_remaining_in_row_group =
- num_rows_in_row_group - current_row_ - row_group_row_offset_;
-
- if (num_rows_remaining_in_row_group > num_rows_remaining_to_skip) {
- for (auto reader : column_readers_) {
- SkipRowsInColumn(reader.get(), num_rows_remaining_to_skip);
- }
- current_row_ += num_rows_remaining_to_skip;
- num_rows_remaining_to_skip = 0;
- } else {
- num_rows_remaining_to_skip -= num_rows_remaining_in_row_group;
- current_row_ += num_rows_remaining_in_row_group;
- NextRowGroup();
- }
- }
- return num_rows_to_skip - num_rows_remaining_to_skip;
-}
-
-int64_t StreamReader::SkipColumns(int64_t num_columns_to_skip) {
- int64_t num_columns_skipped = 0;
-
- if (!eof_) {
- for (; (num_columns_to_skip > num_columns_skipped) &&
- static_cast<std::size_t>(column_index_) < nodes_.size();
- ++column_index_) {
- SkipRowsInColumn(column_readers_[column_index_].get(), 1);
- ++num_columns_skipped;
- }
- }
- return num_columns_skipped;
-}
-
-void StreamReader::SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip) {
- int64_t num_skipped = 0;
-
- switch (reader->type()) {
- case Type::BOOLEAN:
- num_skipped = static_cast<BoolReader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::INT32:
- num_skipped = static_cast<Int32Reader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::INT64:
- num_skipped = static_cast<Int64Reader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::BYTE_ARRAY:
- num_skipped = static_cast<ByteArrayReader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::FIXED_LEN_BYTE_ARRAY:
- num_skipped = static_cast<FixedLenByteArrayReader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::FLOAT:
- num_skipped = static_cast<FloatReader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::DOUBLE:
- num_skipped = static_cast<DoubleReader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::INT96:
- num_skipped = static_cast<Int96Reader*>(reader)->Skip(num_rows_to_skip);
- break;
- case Type::UNDEFINED:
- throw ParquetException("Unexpected type: " + TypeToString(reader->type()));
- break;
- }
- if (num_rows_to_skip != num_skipped) {
- throw ParquetException("Skipped " + std::to_string(num_skipped) + "/" +
- std::to_string(num_rows_to_skip) + " rows in column " +
- reader->descr()->name());
- }
-}
-
-void StreamReader::CheckColumn(Type::type physical_type,
- ConvertedType::type converted_type, int length) {
- if (static_cast<std::size_t>(column_index_) >= nodes_.size()) {
- if (eof_) {
- ParquetException::EofException();
- }
- throw ParquetException("Column index out-of-bounds. Index " +
- std::to_string(column_index_) + " is invalid for " +
- std::to_string(nodes_.size()) + " columns");
- }
- const auto& node = nodes_[column_index_];
-
- if (physical_type != node->physical_type()) {
- throw ParquetException("Column physical type mismatch. Column '" + node->name() +
- "' has physical type '" + TypeToString(node->physical_type()) +
- "' not '" + TypeToString(physical_type) + "'");
- }
- if (converted_type != node->converted_type()) {
- // The converted type does not always match with the value
- // provided so check the set of exceptions.
- if (converted_type_exceptions.find({converted_type, node->converted_type()}) ==
- converted_type_exceptions.end()) {
- throw ParquetException("Column converted type mismatch. Column '" + node->name() +
- "' has converted type '" +
- ConvertedTypeToString(node->converted_type()) + "' not '" +
- ConvertedTypeToString(converted_type) + "'");
- }
- }
- // Length must be exact.
- if (length != node->type_length()) {
- throw ParquetException("Column length mismatch. Column '" + node->name() +
- "' has length " + std::to_string(node->type_length()) +
- "] not " + std::to_string(length));
- }
-}
-
-void StreamReader::ThrowReadFailedException(
- const std::shared_ptr<schema::PrimitiveNode>& node) {
- throw ParquetException("Failed to read value for column '" + node->name() +
- "' on row " + std::to_string(current_row_));
-}
-
-StreamReader& operator>>(StreamReader& os, EndRowType) {
- os.EndRow();
- return os;
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/stream_reader.h"
+
+#include <set>
+#include <utility>
+
+namespace parquet {
+
+constexpr int64_t StreamReader::kBatchSizeOne;
+
+// The converted type expected by the stream reader does not always
+// exactly match the schema in the Parquet file. The following
+// is a list of converted types which are allowed instead of the
+// expected converted type.
+// Each pair given is:
+// {<StreamReader expected type>, <Parquet file converted type>}
+// So for example {ConvertedType::INT_32, ConvertedType::NONE} means
+// that if the StreamReader was expecting the converted type INT_32,
+// then it will allow the Parquet file to use the converted type
+// NONE.
+//
+static const std::set<std::pair<ConvertedType::type, ConvertedType::type> >
+ converted_type_exceptions = {{ConvertedType::INT_32, ConvertedType::NONE},
+ {ConvertedType::INT_64, ConvertedType::NONE},
+ {ConvertedType::INT_32, ConvertedType::DECIMAL},
+ {ConvertedType::INT_64, ConvertedType::DECIMAL},
+ {ConvertedType::UTF8, ConvertedType::NONE}};
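+
+// Illustrative example (not part of the upstream source): a file column
+// declared as plain INT32 (converted type NONE) can still be read with
+//
+//   int32_t v;
+//   stream_reader >> v;  // reader expects {INT_32, ...}; NONE is allowed
+//
+// because the pair {ConvertedType::INT_32, ConvertedType::NONE} appears in
+// the exception set above.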
+
+StreamReader::StreamReader(std::unique_ptr<ParquetFileReader> reader)
+ : file_reader_{std::move(reader)}, eof_{false} {
+ file_metadata_ = file_reader_->metadata();
+
+ auto schema = file_metadata_->schema();
+ auto group_node = schema->group_node();
+
+ nodes_.resize(schema->num_columns());
+
+ for (auto i = 0; i < schema->num_columns(); ++i) {
+ nodes_[i] = std::static_pointer_cast<schema::PrimitiveNode>(group_node->field(i));
+ }
+ NextRowGroup();
+}
+
+int StreamReader::num_columns() const {
+ // Check for file metadata, i.e. the object was not default constructed.
+ if (file_metadata_) {
+ return file_metadata_->num_columns();
+ }
+ return 0;
+}
+
+int64_t StreamReader::num_rows() const {
+ // Check for file metadata, i.e. the object was not default constructed.
+ if (file_metadata_) {
+ return file_metadata_->num_rows();
+ }
+ return 0;
+}
+
+StreamReader& StreamReader::operator>>(bool& v) {
+ CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
+ Read<BoolReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(int8_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_8);
+ Read<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(uint8_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_8);
+ Read<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(int16_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_16);
+ Read<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(uint16_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_16);
+ Read<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(int32_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_32);
+ Read<Int32Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(uint32_t& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_32);
+ Read<Int32Reader>(reinterpret_cast<int32_t*>(&v));
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(int64_t& v) {
+ CheckColumn(Type::INT64, ConvertedType::INT_64);
+ Read<Int64Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(uint64_t& v) {
+ CheckColumn(Type::INT64, ConvertedType::UINT_64);
+ Read<Int64Reader>(reinterpret_cast<int64_t*>(&v));
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(std::chrono::milliseconds& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
+ int64_t tmp;
+ Read<Int64Reader>(&tmp);
+ v = std::chrono::milliseconds{tmp};
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(std::chrono::microseconds& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
+ int64_t tmp;
+ Read<Int64Reader>(&tmp);
+ v = std::chrono::microseconds{tmp};
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(float& v) {
+ CheckColumn(Type::FLOAT, ConvertedType::NONE);
+ Read<FloatReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(double& v) {
+ CheckColumn(Type::DOUBLE, ConvertedType::NONE);
+ Read<DoubleReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(char& v) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, 1);
+ FixedLenByteArray flba;
+
+ Read(&flba);
+ v = static_cast<char>(flba.ptr[0]);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(std::string& v) {
+ CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
+ ByteArray ba;
+
+ Read(&ba);
+ v = std::string(reinterpret_cast<const char*>(ba.ptr), ba.len);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<bool>& v) {
+ CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
+ ReadOptional<BoolReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<int8_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_8);
+ ReadOptional<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<uint8_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_8);
+ ReadOptional<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<int16_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_16);
+ ReadOptional<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<uint16_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_16);
+ ReadOptional<Int32Reader, int32_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<int32_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_32);
+ ReadOptional<Int32Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<uint32_t>& v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_32);
+ ReadOptional<Int32Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<int64_t>& v) {
+ CheckColumn(Type::INT64, ConvertedType::INT_64);
+ ReadOptional<Int64Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<uint64_t>& v) {
+ CheckColumn(Type::INT64, ConvertedType::UINT_64);
+ ReadOptional<Int64Reader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<float>& v) {
+ CheckColumn(Type::FLOAT, ConvertedType::NONE);
+ ReadOptional<FloatReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<double>& v) {
+ CheckColumn(Type::DOUBLE, ConvertedType::NONE);
+ ReadOptional<DoubleReader>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<std::chrono::milliseconds>& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
+ ReadOptional<Int64Reader, int64_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<std::chrono::microseconds>& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
+ ReadOptional<Int64Reader, int64_t>(&v);
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<char>& v) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, 1);
+ FixedLenByteArray flba;
+
+ if (ReadOptional(&flba)) {
+ v = static_cast<char>(flba.ptr[0]);
+ } else {
+ v.reset();
+ }
+ return *this;
+}
+
+StreamReader& StreamReader::operator>>(optional<std::string>& v) {
+ CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
+ ByteArray ba;
+
+ if (ReadOptional(&ba)) {
+ v = std::string(reinterpret_cast<const char*>(ba.ptr), ba.len);
+ } else {
+ v.reset();
+ }
+ return *this;
+}
+
+void StreamReader::ReadFixedLength(char* ptr, int len) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, len);
+ FixedLenByteArray flba;
+ Read(&flba);
+ std::memcpy(ptr, flba.ptr, len);
+}
+
+void StreamReader::Read(ByteArray* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ByteArrayReader*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read != 1) {
+ ThrowReadFailedException(node);
+ }
+}
+
+bool StreamReader::ReadOptional(ByteArray* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ByteArrayReader*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read == 1) {
+ return true;
+ } else if ((values_read == 0) && (def_level == 0)) {
+ return false;
+ }
+ ThrowReadFailedException(node);
+}
+
+void StreamReader::Read(FixedLenByteArray* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader =
+ static_cast<FixedLenByteArrayReader*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read != 1) {
+ ThrowReadFailedException(node);
+ }
+}
+
+bool StreamReader::ReadOptional(FixedLenByteArray* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader =
+ static_cast<FixedLenByteArrayReader*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read == 1) {
+ return true;
+ } else if ((values_read == 0) && (def_level == 0)) {
+ return false;
+ }
+ ThrowReadFailedException(node);
+}
+
+void StreamReader::EndRow() {
+ if (!file_reader_) {
+ throw ParquetException("StreamReader not initialized");
+ }
+ if (static_cast<std::size_t>(column_index_) < nodes_.size()) {
+ throw ParquetException("Cannot end row with " + std::to_string(column_index_) +
+ " of " + std::to_string(nodes_.size()) + " columns read");
+ }
+ column_index_ = 0;
+ ++current_row_;
+
+ if (!column_readers_[0]->HasNext()) {
+ NextRowGroup();
+ }
+}
+
+void StreamReader::NextRowGroup() {
+ // Find the next non-empty row group.
+ while (row_group_index_ < file_metadata_->num_row_groups()) {
+ row_group_reader_ = file_reader_->RowGroup(row_group_index_);
+ ++row_group_index_;
+
+ column_readers_.resize(file_metadata_->num_columns());
+
+ for (int i = 0; i < file_metadata_->num_columns(); ++i) {
+ column_readers_[i] = row_group_reader_->Column(i);
+ }
+ if (column_readers_[0]->HasNext()) {
+ row_group_row_offset_ = current_row_;
+ return;
+ }
+ }
+ // No more row groups found.
+ SetEof();
+}
+
+void StreamReader::SetEof() {
+ // Do not reset file_metadata_ to ensure queries on the number of
+ // rows/columns still function.
+ eof_ = true;
+ file_reader_.reset();
+ row_group_reader_.reset();
+ column_readers_.clear();
+ nodes_.clear();
+}
+
+int64_t StreamReader::SkipRows(int64_t num_rows_to_skip) {
+ if (0 != column_index_) {
+ throw ParquetException("Must finish reading current row before skipping rows.");
+ }
+ int64_t num_rows_remaining_to_skip = num_rows_to_skip;
+
+ while (!eof_ && (num_rows_remaining_to_skip > 0)) {
+ int64_t num_rows_in_row_group = row_group_reader_->metadata()->num_rows();
+ int64_t num_rows_remaining_in_row_group =
+ num_rows_in_row_group - current_row_ - row_group_row_offset_;
+
+ if (num_rows_remaining_in_row_group > num_rows_remaining_to_skip) {
+ for (auto reader : column_readers_) {
+ SkipRowsInColumn(reader.get(), num_rows_remaining_to_skip);
+ }
+ current_row_ += num_rows_remaining_to_skip;
+ num_rows_remaining_to_skip = 0;
+ } else {
+ num_rows_remaining_to_skip -= num_rows_remaining_in_row_group;
+ current_row_ += num_rows_remaining_in_row_group;
+ NextRowGroup();
+ }
+ }
+ return num_rows_to_skip - num_rows_remaining_to_skip;
+}
+
+int64_t StreamReader::SkipColumns(int64_t num_columns_to_skip) {
+ int64_t num_columns_skipped = 0;
+
+ if (!eof_) {
+ for (; (num_columns_to_skip > num_columns_skipped) &&
+ static_cast<std::size_t>(column_index_) < nodes_.size();
+ ++column_index_) {
+ SkipRowsInColumn(column_readers_[column_index_].get(), 1);
+ ++num_columns_skipped;
+ }
+ }
+ return num_columns_skipped;
+}
+
+void StreamReader::SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip) {
+ int64_t num_skipped = 0;
+
+ switch (reader->type()) {
+ case Type::BOOLEAN:
+ num_skipped = static_cast<BoolReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::INT32:
+ num_skipped = static_cast<Int32Reader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::INT64:
+ num_skipped = static_cast<Int64Reader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::BYTE_ARRAY:
+ num_skipped = static_cast<ByteArrayReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ num_skipped = static_cast<FixedLenByteArrayReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::FLOAT:
+ num_skipped = static_cast<FloatReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::DOUBLE:
+ num_skipped = static_cast<DoubleReader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::INT96:
+ num_skipped = static_cast<Int96Reader*>(reader)->Skip(num_rows_to_skip);
+ break;
+ case Type::UNDEFINED:
+ throw ParquetException("Unexpected type: " + TypeToString(reader->type()));
+ break;
+ }
+ if (num_rows_to_skip != num_skipped) {
+ throw ParquetException("Skipped " + std::to_string(num_skipped) + "/" +
+ std::to_string(num_rows_to_skip) + " rows in column " +
+ reader->descr()->name());
+ }
+}
+
+void StreamReader::CheckColumn(Type::type physical_type,
+ ConvertedType::type converted_type, int length) {
+ if (static_cast<std::size_t>(column_index_) >= nodes_.size()) {
+ if (eof_) {
+ ParquetException::EofException();
+ }
+ throw ParquetException("Column index out-of-bounds. Index " +
+ std::to_string(column_index_) + " is invalid for " +
+ std::to_string(nodes_.size()) + " columns");
+ }
+ const auto& node = nodes_[column_index_];
+
+ if (physical_type != node->physical_type()) {
+ throw ParquetException("Column physical type mismatch. Column '" + node->name() +
+ "' has physical type '" + TypeToString(node->physical_type()) +
+ "' not '" + TypeToString(physical_type) + "'");
+ }
+ if (converted_type != node->converted_type()) {
+ // The converted type does not always match with the value
+ // provided so check the set of exceptions.
+ if (converted_type_exceptions.find({converted_type, node->converted_type()}) ==
+ converted_type_exceptions.end()) {
+ throw ParquetException("Column converted type mismatch. Column '" + node->name() +
+ "' has converted type '" +
+ ConvertedTypeToString(node->converted_type()) + "' not '" +
+ ConvertedTypeToString(converted_type) + "'");
+ }
+ }
+ // Length must be exact.
+ if (length != node->type_length()) {
+ throw ParquetException("Column length mismatch. Column '" + node->name() +
+ "' has length " + std::to_string(node->type_length()) +
+ "] not " + std::to_string(length));
+ }
+}
+
+void StreamReader::ThrowReadFailedException(
+ const std::shared_ptr<schema::PrimitiveNode>& node) {
+ throw ParquetException("Failed to read value for column '" + node->name() +
+ "' on row " + std::to_string(current_row_));
+}
+
+StreamReader& operator>>(StreamReader& os, EndRowType) {
+ os.EndRow();
+ return os;
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h b/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h
index 3dfebb27146..806b0e8ad9a 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/stream_reader.h
@@ -1,299 +1,299 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <array>
-#include <chrono>
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/util/optional.h"
-#include "parquet/column_reader.h"
-#include "parquet/file_reader.h"
-#include "parquet/stream_writer.h"
-
-namespace parquet {
-
-/// \brief A class for reading Parquet files using an input stream type API.
-///
-/// The values given must be of the correct type, i.e. the type must
-/// match the file schema exactly, otherwise a ParquetException will be
-/// thrown.
-///
-/// The user must explicitly advance to the next row using the
-/// EndRow() function or EndRow input manipulator.
-///
-/// Required and optional fields are supported:
-/// - Required fields are read using operator>>(T)
-/// - Optional fields are read with
-/// operator>>(arrow::util::optional<T>)
-///
-/// Note that operator>>(arrow::util::optional<T>) can be used to read
-/// required fields.
-///
-/// Similarly operator>>(T) can be used to read optional fields.
-/// However, if the value is not present then a ParquetException will
-/// be raised.
-///
-/// Currently there is no support for repeated fields.
-///
-class PARQUET_EXPORT StreamReader {
- public:
- template <typename T>
- using optional = ::arrow::util::optional<T>;
-
- // N.B. Default constructed objects are not usable. This
- // constructor is provided so that the object may be move
- // assigned afterwards.
- StreamReader() = default;
-
- explicit StreamReader(std::unique_ptr<ParquetFileReader> reader);
-
- ~StreamReader() = default;
-
- bool eof() const { return eof_; }
-
- int current_column() const { return column_index_; }
-
- int64_t current_row() const { return current_row_; }
-
- int num_columns() const;
-
- int64_t num_rows() const;
-
- // Moving is possible.
- StreamReader(StreamReader&&) = default;
- StreamReader& operator=(StreamReader&&) = default;
-
- // Copying is not allowed.
- StreamReader(const StreamReader&) = delete;
- StreamReader& operator=(const StreamReader&) = delete;
-
- StreamReader& operator>>(bool& v);
-
- StreamReader& operator>>(int8_t& v);
-
- StreamReader& operator>>(uint8_t& v);
-
- StreamReader& operator>>(int16_t& v);
-
- StreamReader& operator>>(uint16_t& v);
-
- StreamReader& operator>>(int32_t& v);
-
- StreamReader& operator>>(uint32_t& v);
-
- StreamReader& operator>>(int64_t& v);
-
- StreamReader& operator>>(uint64_t& v);
-
- StreamReader& operator>>(std::chrono::milliseconds& v);
-
- StreamReader& operator>>(std::chrono::microseconds& v);
-
- StreamReader& operator>>(float& v);
-
- StreamReader& operator>>(double& v);
-
- StreamReader& operator>>(char& v);
-
- template <int N>
- StreamReader& operator>>(char (&v)[N]) {
- ReadFixedLength(v, N);
- return *this;
- }
-
- template <std::size_t N>
- StreamReader& operator>>(std::array<char, N>& v) {
- ReadFixedLength(v.data(), static_cast<int>(N));
- return *this;
- }
-
- // N.B. Cannot allow reading into an arbitrary char pointer as the
- // length cannot be verified. Also it would overshadow the
- // char[N] input operator.
- // StreamReader& operator>>(char * v);
-
- StreamReader& operator>>(std::string& v);
-
- // Input operators for optional fields.
-
- StreamReader& operator>>(optional<bool>& v);
-
- StreamReader& operator>>(optional<int8_t>& v);
-
- StreamReader& operator>>(optional<uint8_t>& v);
-
- StreamReader& operator>>(optional<int16_t>& v);
-
- StreamReader& operator>>(optional<uint16_t>& v);
-
- StreamReader& operator>>(optional<int32_t>& v);
-
- StreamReader& operator>>(optional<uint32_t>& v);
-
- StreamReader& operator>>(optional<int64_t>& v);
-
- StreamReader& operator>>(optional<uint64_t>& v);
-
- StreamReader& operator>>(optional<float>& v);
-
- StreamReader& operator>>(optional<double>& v);
-
- StreamReader& operator>>(optional<std::chrono::milliseconds>& v);
-
- StreamReader& operator>>(optional<std::chrono::microseconds>& v);
-
- StreamReader& operator>>(optional<char>& v);
-
- StreamReader& operator>>(optional<std::string>& v);
-
- template <std::size_t N>
- StreamReader& operator>>(optional<std::array<char, N>>& v) {
- CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, N);
- FixedLenByteArray flba;
- if (ReadOptional(&flba)) {
- v = std::array<char, N>{};
- std::memcpy(v->data(), flba.ptr, N);
- } else {
- v.reset();
- }
- return *this;
- }
-
- /// \brief Terminate current row and advance to next one.
- /// \throws ParquetException if all columns in the row were not
- /// read or skipped.
- void EndRow();
-
- /// \brief Skip the data in the next columns.
- /// If the number of columns exceeds the columns remaining on the
- /// current row then skipping is terminated - it does _not_ continue
- /// skipping columns on the next row.
- /// Skipping of columns still requires the use of 'EndRow' even if all
- /// remaining columns were skipped.
- /// \return Number of columns actually skipped.
- int64_t SkipColumns(int64_t num_columns_to_skip);
-
- /// \brief Skip the data in the next rows.
- /// Skipping of rows is not allowed if reading of data for the
- /// current row is not finished.
- /// Skipping of rows will be terminated if the end of file is
- /// reached.
- /// \return Number of rows actually skipped.
- int64_t SkipRows(int64_t num_rows_to_skip);
-
- protected:
- [[noreturn]] void ThrowReadFailedException(
- const std::shared_ptr<schema::PrimitiveNode>& node);
-
- template <typename ReaderType, typename T>
- void Read(T* v) {
- const auto& node = nodes_[column_index_];
- auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
-
- if (values_read != 1) {
- ThrowReadFailedException(node);
- }
- }
-
- template <typename ReaderType, typename ReadType, typename T>
- void Read(T* v) {
- const auto& node = nodes_[column_index_];
- auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- ReadType tmp;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
-
- if (values_read == 1) {
- *v = tmp;
- } else {
- ThrowReadFailedException(node);
- }
- }
-
- template <typename ReaderType, typename ReadType = typename ReaderType::T, typename T>
- void ReadOptional(optional<T>* v) {
- const auto& node = nodes_[column_index_];
- auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
- int16_t def_level;
- int16_t rep_level;
- ReadType tmp;
- int64_t values_read;
-
- reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
-
- if (values_read == 1) {
- *v = T(tmp);
- } else if ((values_read == 0) && (def_level == 0)) {
- v->reset();
- } else {
- ThrowReadFailedException(node);
- }
- }
-
- void ReadFixedLength(char* ptr, int len);
-
- void Read(ByteArray* v);
-
- void Read(FixedLenByteArray* v);
-
- bool ReadOptional(ByteArray* v);
-
- bool ReadOptional(FixedLenByteArray* v);
-
- void NextRowGroup();
-
- void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
- int length = 0);
-
- void SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip);
-
- void SetEof();
-
- private:
- std::unique_ptr<ParquetFileReader> file_reader_;
- std::shared_ptr<FileMetaData> file_metadata_;
- std::shared_ptr<RowGroupReader> row_group_reader_;
- std::vector<std::shared_ptr<ColumnReader>> column_readers_;
- std::vector<std::shared_ptr<schema::PrimitiveNode>> nodes_;
-
- bool eof_{true};
- int row_group_index_{0};
- int column_index_{0};
- int64_t current_row_{0};
- int64_t row_group_row_offset_{0};
-
- static constexpr int64_t kBatchSizeOne = 1;
-};
-
-PARQUET_EXPORT
-StreamReader& operator>>(StreamReader&, EndRowType);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/util/optional.h"
+#include "parquet/column_reader.h"
+#include "parquet/file_reader.h"
+#include "parquet/stream_writer.h"
+
+namespace parquet {
+
+/// \brief A class for reading Parquet files using an input stream type API.
+///
+/// The values given must be of the correct type, i.e. the type must
+/// match the file schema exactly, otherwise a ParquetException will be
+/// thrown.
+///
+/// The user must explicitly advance to the next row using the
+/// EndRow() function or EndRow input manipulator.
+///
+/// Required and optional fields are supported:
+/// - Required fields are read using operator>>(T)
+/// - Optional fields are read with
+/// operator>>(arrow::util::optional<T>)
+///
+/// Note that operator>>(arrow::util::optional<T>) can be used to read
+/// required fields.
+///
+/// Similarly operator>>(T) can be used to read optional fields.
+/// However, if the value is not present then a ParquetException will
+/// be raised.
+///
+/// Currently there is no support for repeated fields.
+///
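+/// A minimal usage sketch (illustrative, not upstream documentation); the
+/// file name and the two-column layout are hypothetical:
+///
+///   StreamReader reader{ParquetFileReader::OpenFile("example.parquet")};
+///   std::string name;
+///   int32_t age;
+///   while (!reader.eof()) {
+///     reader >> name >> age >> EndRow;
+///   }
+///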
+class PARQUET_EXPORT StreamReader {
+ public:
+ template <typename T>
+ using optional = ::arrow::util::optional<T>;
+
+ // N.B. Default constructed objects are not usable. This
+ // constructor is provided so that the object may be move
+ // assigned afterwards.
+ StreamReader() = default;
+
+ explicit StreamReader(std::unique_ptr<ParquetFileReader> reader);
+
+ ~StreamReader() = default;
+
+ bool eof() const { return eof_; }
+
+ int current_column() const { return column_index_; }
+
+ int64_t current_row() const { return current_row_; }
+
+ int num_columns() const;
+
+ int64_t num_rows() const;
+
+ // Moving is possible.
+ StreamReader(StreamReader&&) = default;
+ StreamReader& operator=(StreamReader&&) = default;
+
+ // Copying is not allowed.
+ StreamReader(const StreamReader&) = delete;
+ StreamReader& operator=(const StreamReader&) = delete;
+
+ StreamReader& operator>>(bool& v);
+
+ StreamReader& operator>>(int8_t& v);
+
+ StreamReader& operator>>(uint8_t& v);
+
+ StreamReader& operator>>(int16_t& v);
+
+ StreamReader& operator>>(uint16_t& v);
+
+ StreamReader& operator>>(int32_t& v);
+
+ StreamReader& operator>>(uint32_t& v);
+
+ StreamReader& operator>>(int64_t& v);
+
+ StreamReader& operator>>(uint64_t& v);
+
+ StreamReader& operator>>(std::chrono::milliseconds& v);
+
+ StreamReader& operator>>(std::chrono::microseconds& v);
+
+ StreamReader& operator>>(float& v);
+
+ StreamReader& operator>>(double& v);
+
+ StreamReader& operator>>(char& v);
+
+ template <int N>
+ StreamReader& operator>>(char (&v)[N]) {
+ ReadFixedLength(v, N);
+ return *this;
+ }
+
+ template <std::size_t N>
+ StreamReader& operator>>(std::array<char, N>& v) {
+ ReadFixedLength(v.data(), static_cast<int>(N));
+ return *this;
+ }
+
+ // N.B. Cannot allow reading into an arbitrary char pointer as the
+ // length cannot be verified. Also it would overshadow the
+ // char[N] input operator.
+ // StreamReader& operator>>(char * v);
+
+ StreamReader& operator>>(std::string& v);
+
+ // Input operators for optional fields.
+
+ StreamReader& operator>>(optional<bool>& v);
+
+ StreamReader& operator>>(optional<int8_t>& v);
+
+ StreamReader& operator>>(optional<uint8_t>& v);
+
+ StreamReader& operator>>(optional<int16_t>& v);
+
+ StreamReader& operator>>(optional<uint16_t>& v);
+
+ StreamReader& operator>>(optional<int32_t>& v);
+
+ StreamReader& operator>>(optional<uint32_t>& v);
+
+ StreamReader& operator>>(optional<int64_t>& v);
+
+ StreamReader& operator>>(optional<uint64_t>& v);
+
+ StreamReader& operator>>(optional<float>& v);
+
+ StreamReader& operator>>(optional<double>& v);
+
+ StreamReader& operator>>(optional<std::chrono::milliseconds>& v);
+
+ StreamReader& operator>>(optional<std::chrono::microseconds>& v);
+
+ StreamReader& operator>>(optional<char>& v);
+
+ StreamReader& operator>>(optional<std::string>& v);
+
+ template <std::size_t N>
+ StreamReader& operator>>(optional<std::array<char, N>>& v) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, N);
+ FixedLenByteArray flba;
+ if (ReadOptional(&flba)) {
+ v = std::array<char, N>{};
+ std::memcpy(v->data(), flba.ptr, N);
+ } else {
+ v.reset();
+ }
+ return *this;
+ }
+
+  /// \brief Terminate the current row and advance to the next one.
+  /// \throws ParquetException if not all columns in the row were
+  /// read or skipped.
+ void EndRow();
+
+  /// \brief Skip the data in the next columns.
+  /// If the number of columns to skip exceeds the columns remaining
+  /// in the current row, skipping stops at the end of the row - it
+  /// does _not_ continue skipping columns on the next row.
+  /// 'EndRow' must still be called even if all remaining columns
+  /// were skipped.
+  /// \return Number of columns actually skipped.
+ int64_t SkipColumns(int64_t num_columns_to_skip);
+
+  /// \brief Skip the data in the next rows.
+  /// Rows may only be skipped once reading of data for the current
+  /// row has finished.
+  /// Skipping stops if the end of the file is reached.
+  /// \return Number of rows actually skipped.
+ int64_t SkipRows(int64_t num_rows_to_skip);
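+
+  // A usage sketch (the counts are illustrative): skip whatever
+  // remains of the current row, finish it, then drop the next 10 rows:
+  //
+  //   os.SkipColumns(os.num_columns() - os.current_column());
+  //   os.EndRow();
+  //   os.SkipRows(10);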
+
+ protected:
+ [[noreturn]] void ThrowReadFailedException(
+ const std::shared_ptr<schema::PrimitiveNode>& node);
+
+ template <typename ReaderType, typename T>
+ void Read(T* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+ if (values_read != 1) {
+ ThrowReadFailedException(node);
+ }
+ }
+
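+  // As above, but for columns whose physical type (ReadType) differs
+  // from the caller's type T (e.g. an INT32 column read into an
+  // int16_t): the value is read into a temporary and then converted.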
+ template <typename ReaderType, typename ReadType, typename T>
+ void Read(T* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ ReadType tmp;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
+
+ if (values_read == 1) {
+ *v = tmp;
+ } else {
+ ThrowReadFailedException(node);
+ }
+ }
+
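+  // Reads one value into *v. An absent value (values_read == 0 with
+  // def_level == 0) resets the optional instead of throwing.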
+ template <typename ReaderType, typename ReadType = typename ReaderType::T, typename T>
+ void ReadOptional(optional<T>* v) {
+ const auto& node = nodes_[column_index_];
+ auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
+ int16_t def_level;
+ int16_t rep_level;
+ ReadType tmp;
+ int64_t values_read;
+
+ reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
+
+ if (values_read == 1) {
+ *v = T(tmp);
+ } else if ((values_read == 0) && (def_level == 0)) {
+ v->reset();
+ } else {
+ ThrowReadFailedException(node);
+ }
+ }
+
+ void ReadFixedLength(char* ptr, int len);
+
+ void Read(ByteArray* v);
+
+ void Read(FixedLenByteArray* v);
+
+ bool ReadOptional(ByteArray* v);
+
+ bool ReadOptional(FixedLenByteArray* v);
+
+ void NextRowGroup();
+
+ void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
+ int length = 0);
+
+ void SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip);
+
+ void SetEof();
+
+ private:
+ std::unique_ptr<ParquetFileReader> file_reader_;
+ std::shared_ptr<FileMetaData> file_metadata_;
+ std::shared_ptr<RowGroupReader> row_group_reader_;
+ std::vector<std::shared_ptr<ColumnReader>> column_readers_;
+ std::vector<std::shared_ptr<schema::PrimitiveNode>> nodes_;
+
+ bool eof_{true};
+ int row_group_index_{0};
+ int column_index_{0};
+ int64_t current_row_{0};
+ int64_t row_group_row_offset_{0};
+
+ static constexpr int64_t kBatchSizeOne = 1;
+};
+
+PARQUET_EXPORT
+StreamReader& operator>>(StreamReader&, EndRowType);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc b/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc
index 2ebbd3c5e23..253ebf1bc91 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.cc
@@ -1,324 +1,324 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "parquet/stream_writer.h"
-
-#include <utility>
-
-namespace parquet {
-
-int64_t StreamWriter::default_row_group_size_{512 * 1024 * 1024}; // 512MB
-
-constexpr int16_t StreamWriter::kDefLevelZero;
-constexpr int16_t StreamWriter::kDefLevelOne;
-constexpr int16_t StreamWriter::kRepLevelZero;
-constexpr int64_t StreamWriter::kBatchSizeOne;
-
-StreamWriter::FixedStringView::FixedStringView(const char* data_ptr)
- : data{data_ptr}, size{std::strlen(data_ptr)} {}
-
-StreamWriter::FixedStringView::FixedStringView(const char* data_ptr, std::size_t data_len)
- : data{data_ptr}, size{data_len} {}
-
-StreamWriter::StreamWriter(std::unique_ptr<ParquetFileWriter> writer)
- : file_writer_{std::move(writer)},
- row_group_writer_{file_writer_->AppendBufferedRowGroup()} {
- auto schema = file_writer_->schema();
- auto group_node = schema->group_node();
-
- nodes_.resize(schema->num_columns());
-
- for (auto i = 0; i < schema->num_columns(); ++i) {
- nodes_[i] = std::static_pointer_cast<schema::PrimitiveNode>(group_node->field(i));
- }
-}
-
-void StreamWriter::SetDefaultMaxRowGroupSize(int64_t max_size) {
- default_row_group_size_ = max_size;
-}
-
-void StreamWriter::SetMaxRowGroupSize(int64_t max_size) {
- max_row_group_size_ = max_size;
-}
-
-int StreamWriter::num_columns() const { return static_cast<int>(nodes_.size()); }
-
-StreamWriter& StreamWriter::operator<<(bool v) {
- CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
- return Write<BoolWriter>(v);
-}
-
-StreamWriter& StreamWriter::operator<<(int8_t v) {
- CheckColumn(Type::INT32, ConvertedType::INT_8);
- return Write<Int32Writer>(static_cast<int32_t>(v));
-}
-
-StreamWriter& StreamWriter::operator<<(uint8_t v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_8);
- return Write<Int32Writer>(static_cast<int32_t>(v));
-}
-
-StreamWriter& StreamWriter::operator<<(int16_t v) {
- CheckColumn(Type::INT32, ConvertedType::INT_16);
- return Write<Int32Writer>(static_cast<int32_t>(v));
-}
-
-StreamWriter& StreamWriter::operator<<(uint16_t v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_16);
- return Write<Int32Writer>(static_cast<int32_t>(v));
-}
-
-StreamWriter& StreamWriter::operator<<(int32_t v) {
- CheckColumn(Type::INT32, ConvertedType::INT_32);
- return Write<Int32Writer>(v);
-}
-
-StreamWriter& StreamWriter::operator<<(uint32_t v) {
- CheckColumn(Type::INT32, ConvertedType::UINT_32);
- return Write<Int32Writer>(static_cast<int32_t>(v));
-}
-
-StreamWriter& StreamWriter::operator<<(int64_t v) {
- CheckColumn(Type::INT64, ConvertedType::INT_64);
- return Write<Int64Writer>(v);
-}
-
-StreamWriter& StreamWriter::operator<<(uint64_t v) {
- CheckColumn(Type::INT64, ConvertedType::UINT_64);
- return Write<Int64Writer>(static_cast<int64_t>(v));
-}
-
-StreamWriter& StreamWriter::operator<<(const std::chrono::milliseconds& v) {
- CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
- return Write<Int64Writer>(static_cast<int64_t>(v.count()));
-}
-
-StreamWriter& StreamWriter::operator<<(const std::chrono::microseconds& v) {
- CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
- return Write<Int64Writer>(static_cast<int64_t>(v.count()));
-}
-
-StreamWriter& StreamWriter::operator<<(float v) {
- CheckColumn(Type::FLOAT, ConvertedType::NONE);
- return Write<FloatWriter>(v);
-}
-
-StreamWriter& StreamWriter::operator<<(double v) {
- CheckColumn(Type::DOUBLE, ConvertedType::NONE);
- return Write<DoubleWriter>(v);
-}
-
-StreamWriter& StreamWriter::operator<<(char v) { return WriteFixedLength(&v, 1); }
-
-StreamWriter& StreamWriter::operator<<(FixedStringView v) {
- return WriteFixedLength(v.data, v.size);
-}
-
-StreamWriter& StreamWriter::operator<<(const char* v) {
- return WriteVariableLength(v, std::strlen(v));
-}
-
-StreamWriter& StreamWriter::operator<<(const std::string& v) {
- return WriteVariableLength(v.data(), v.size());
-}
-
-StreamWriter& StreamWriter::operator<<(::arrow::util::string_view v) {
- return WriteVariableLength(v.data(), v.size());
-}
-
-StreamWriter& StreamWriter::WriteVariableLength(const char* data_ptr,
- std::size_t data_len) {
- CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
-
- auto writer = static_cast<ByteArrayWriter*>(row_group_writer_->column(column_index_++));
-
- if (data_ptr != nullptr) {
- ByteArray ba_value;
-
- ba_value.ptr = reinterpret_cast<const uint8_t*>(data_ptr);
- ba_value.len = static_cast<uint32_t>(data_len);
-
- writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &ba_value);
- } else {
- writer->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
- }
- if (max_row_group_size_ > 0) {
- row_group_size_ += writer->EstimatedBufferedValueBytes();
- }
- return *this;
-}
-
-StreamWriter& StreamWriter::WriteFixedLength(const char* data_ptr, std::size_t data_len) {
- CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE,
- static_cast<int>(data_len));
-
- auto writer =
- static_cast<FixedLenByteArrayWriter*>(row_group_writer_->column(column_index_++));
-
- if (data_ptr != nullptr) {
- FixedLenByteArray flba_value;
-
- flba_value.ptr = reinterpret_cast<const uint8_t*>(data_ptr);
- writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &flba_value);
- } else {
- writer->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
- }
- if (max_row_group_size_ > 0) {
- row_group_size_ += writer->EstimatedBufferedValueBytes();
- }
- return *this;
-}
-
-void StreamWriter::CheckColumn(Type::type physical_type,
- ConvertedType::type converted_type, int length) {
- if (static_cast<std::size_t>(column_index_) >= nodes_.size()) {
- throw ParquetException("Column index out-of-bounds. Index " +
- std::to_string(column_index_) + " is invalid for " +
- std::to_string(nodes_.size()) + " columns");
- }
- const auto& node = nodes_[column_index_];
-
- if (physical_type != node->physical_type()) {
- throw ParquetException("Column physical type mismatch. Column '" + node->name() +
- "' has physical type '" + TypeToString(node->physical_type()) +
- "' not '" + TypeToString(physical_type) + "'");
- }
-  if (converted_type != node->converted_type()) {
-    throw ParquetException("Column converted type mismatch. Column '" + node->name() +
-                           "' has converted type '" +
-                           ConvertedTypeToString(node->converted_type()) + "' not '" +
-                           ConvertedTypeToString(converted_type) + "'");
-  }
-  // Length must be exact.
-  // A shorter-length fixed array is not acceptable as it would
-  // result in array bounds read errors.
-  //
- if (length != node->type_length()) {
- throw ParquetException("Column length mismatch. Column '" + node->name() +
- "' has length " + std::to_string(node->type_length()) +
- " not " + std::to_string(length));
- }
-}
-
-int64_t StreamWriter::SkipColumns(int num_columns_to_skip) {
- int num_columns_skipped = 0;
-
- for (; (num_columns_to_skip > num_columns_skipped) &&
- static_cast<std::size_t>(column_index_) < nodes_.size();
- ++num_columns_skipped) {
- const auto& node = nodes_[column_index_];
-
- if (node->is_required()) {
- throw ParquetException("Cannot skip column '" + node->name() +
- "' as it is required.");
- }
- auto writer = row_group_writer_->column(column_index_++);
-
- WriteNullValue(writer);
- }
- return num_columns_skipped;
-}
-
-void StreamWriter::WriteNullValue(ColumnWriter* writer) {
- switch (writer->type()) {
- case Type::BOOLEAN:
- static_cast<BoolWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
- &kRepLevelZero, nullptr);
- break;
- case Type::INT32:
- static_cast<Int32Writer*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
- &kRepLevelZero, nullptr);
- break;
- case Type::INT64:
- static_cast<Int64Writer*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
- &kRepLevelZero, nullptr);
- break;
- case Type::BYTE_ARRAY:
- static_cast<ByteArrayWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
- &kRepLevelZero, nullptr);
- break;
- case Type::FIXED_LEN_BYTE_ARRAY:
- static_cast<FixedLenByteArrayWriter*>(writer)->WriteBatch(
- kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
- break;
- case Type::FLOAT:
- static_cast<FloatWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
- &kRepLevelZero, nullptr);
- break;
- case Type::DOUBLE:
- static_cast<DoubleWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
- &kRepLevelZero, nullptr);
- break;
- case Type::INT96:
- case Type::UNDEFINED:
- throw ParquetException("Unexpected type: " + TypeToString(writer->type()));
- break;
- }
-}
-
-void StreamWriter::SkipOptionalColumn() {
- if (SkipColumns(1) != 1) {
- throw ParquetException("Failed to skip optional column at column index " +
- std::to_string(column_index_));
- }
-}
-
-void StreamWriter::EndRow() {
- if (!file_writer_) {
- throw ParquetException("StreamWriter not initialized");
- }
- if (static_cast<std::size_t>(column_index_) < nodes_.size()) {
- throw ParquetException("Cannot end row with " + std::to_string(column_index_) +
- " of " + std::to_string(nodes_.size()) + " columns written");
- }
- column_index_ = 0;
- ++current_row_;
-
- if (max_row_group_size_ > 0) {
- if (row_group_size_ > max_row_group_size_) {
- EndRowGroup();
- }
-    // Initialize for each row with the size already written
- // (compressed + uncompressed).
- //
- row_group_size_ = row_group_writer_->total_bytes_written() +
- row_group_writer_->total_compressed_bytes();
- }
-}
-
-void StreamWriter::EndRowGroup() {
- if (!file_writer_) {
- throw ParquetException("StreamWriter not initialized");
- }
- // Avoid creating empty row groups.
- if (row_group_writer_->num_rows() > 0) {
- row_group_writer_->Close();
- row_group_writer_.reset(file_writer_->AppendBufferedRowGroup());
- }
-}
-
-StreamWriter& operator<<(StreamWriter& os, EndRowType) {
- os.EndRow();
- return os;
-}
-
-StreamWriter& operator<<(StreamWriter& os, EndRowGroupType) {
- os.EndRowGroup();
- return os;
-}
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/stream_writer.h"
+
+#include <utility>
+
+namespace parquet {
+
+int64_t StreamWriter::default_row_group_size_{512 * 1024 * 1024}; // 512MB
+
+constexpr int16_t StreamWriter::kDefLevelZero;
+constexpr int16_t StreamWriter::kDefLevelOne;
+constexpr int16_t StreamWriter::kRepLevelZero;
+constexpr int64_t StreamWriter::kBatchSizeOne;
+
+StreamWriter::FixedStringView::FixedStringView(const char* data_ptr)
+ : data{data_ptr}, size{std::strlen(data_ptr)} {}
+
+StreamWriter::FixedStringView::FixedStringView(const char* data_ptr, std::size_t data_len)
+ : data{data_ptr}, size{data_len} {}
+
+StreamWriter::StreamWriter(std::unique_ptr<ParquetFileWriter> writer)
+ : file_writer_{std::move(writer)},
+ row_group_writer_{file_writer_->AppendBufferedRowGroup()} {
+ auto schema = file_writer_->schema();
+ auto group_node = schema->group_node();
+
+ nodes_.resize(schema->num_columns());
+
+ for (auto i = 0; i < schema->num_columns(); ++i) {
+ nodes_[i] = std::static_pointer_cast<schema::PrimitiveNode>(group_node->field(i));
+ }
+}
+
+void StreamWriter::SetDefaultMaxRowGroupSize(int64_t max_size) {
+ default_row_group_size_ = max_size;
+}
+
+void StreamWriter::SetMaxRowGroupSize(int64_t max_size) {
+ max_row_group_size_ = max_size;
+}
+
+int StreamWriter::num_columns() const { return static_cast<int>(nodes_.size()); }
+
+StreamWriter& StreamWriter::operator<<(bool v) {
+ CheckColumn(Type::BOOLEAN, ConvertedType::NONE);
+ return Write<BoolWriter>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(int8_t v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_8);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(uint8_t v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_8);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(int16_t v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_16);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(uint16_t v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_16);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(int32_t v) {
+ CheckColumn(Type::INT32, ConvertedType::INT_32);
+ return Write<Int32Writer>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(uint32_t v) {
+ CheckColumn(Type::INT32, ConvertedType::UINT_32);
+ return Write<Int32Writer>(static_cast<int32_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(int64_t v) {
+ CheckColumn(Type::INT64, ConvertedType::INT_64);
+ return Write<Int64Writer>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(uint64_t v) {
+ CheckColumn(Type::INT64, ConvertedType::UINT_64);
+ return Write<Int64Writer>(static_cast<int64_t>(v));
+}
+
+StreamWriter& StreamWriter::operator<<(const std::chrono::milliseconds& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MILLIS);
+ return Write<Int64Writer>(static_cast<int64_t>(v.count()));
+}
+
+StreamWriter& StreamWriter::operator<<(const std::chrono::microseconds& v) {
+ CheckColumn(Type::INT64, ConvertedType::TIMESTAMP_MICROS);
+ return Write<Int64Writer>(static_cast<int64_t>(v.count()));
+}
+
+StreamWriter& StreamWriter::operator<<(float v) {
+ CheckColumn(Type::FLOAT, ConvertedType::NONE);
+ return Write<FloatWriter>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(double v) {
+ CheckColumn(Type::DOUBLE, ConvertedType::NONE);
+ return Write<DoubleWriter>(v);
+}
+
+StreamWriter& StreamWriter::operator<<(char v) { return WriteFixedLength(&v, 1); }
+
+StreamWriter& StreamWriter::operator<<(FixedStringView v) {
+ return WriteFixedLength(v.data, v.size);
+}
+
+StreamWriter& StreamWriter::operator<<(const char* v) {
+ return WriteVariableLength(v, std::strlen(v));
+}
+
+StreamWriter& StreamWriter::operator<<(const std::string& v) {
+ return WriteVariableLength(v.data(), v.size());
+}
+
+StreamWriter& StreamWriter::operator<<(::arrow::util::string_view v) {
+ return WriteVariableLength(v.data(), v.size());
+}
+
+StreamWriter& StreamWriter::WriteVariableLength(const char* data_ptr,
+ std::size_t data_len) {
+ CheckColumn(Type::BYTE_ARRAY, ConvertedType::UTF8);
+
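+  // A present value is written with def_level 1 (kDefLevelOne), a null
+  // with def_level 0 (kDefLevelZero); rep_level is always 0 since
+  // repeated fields are not supported.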
+ auto writer = static_cast<ByteArrayWriter*>(row_group_writer_->column(column_index_++));
+
+ if (data_ptr != nullptr) {
+ ByteArray ba_value;
+
+ ba_value.ptr = reinterpret_cast<const uint8_t*>(data_ptr);
+ ba_value.len = static_cast<uint32_t>(data_len);
+
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &ba_value);
+ } else {
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
+ }
+ if (max_row_group_size_ > 0) {
+ row_group_size_ += writer->EstimatedBufferedValueBytes();
+ }
+ return *this;
+}
+
+StreamWriter& StreamWriter::WriteFixedLength(const char* data_ptr, std::size_t data_len) {
+ CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE,
+ static_cast<int>(data_len));
+
+ auto writer =
+ static_cast<FixedLenByteArrayWriter*>(row_group_writer_->column(column_index_++));
+
+ if (data_ptr != nullptr) {
+ FixedLenByteArray flba_value;
+
+ flba_value.ptr = reinterpret_cast<const uint8_t*>(data_ptr);
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &flba_value);
+ } else {
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
+ }
+ if (max_row_group_size_ > 0) {
+ row_group_size_ += writer->EstimatedBufferedValueBytes();
+ }
+ return *this;
+}
+
+void StreamWriter::CheckColumn(Type::type physical_type,
+ ConvertedType::type converted_type, int length) {
+ if (static_cast<std::size_t>(column_index_) >= nodes_.size()) {
+ throw ParquetException("Column index out-of-bounds. Index " +
+ std::to_string(column_index_) + " is invalid for " +
+ std::to_string(nodes_.size()) + " columns");
+ }
+ const auto& node = nodes_[column_index_];
+
+ if (physical_type != node->physical_type()) {
+ throw ParquetException("Column physical type mismatch. Column '" + node->name() +
+ "' has physical type '" + TypeToString(node->physical_type()) +
+ "' not '" + TypeToString(physical_type) + "'");
+ }
+  if (converted_type != node->converted_type()) {
+    throw ParquetException("Column converted type mismatch. Column '" + node->name() +
+                           "' has converted type '" +
+                           ConvertedTypeToString(node->converted_type()) + "' not '" +
+                           ConvertedTypeToString(converted_type) + "'");
+  }
+  // Length must be exact.
+  // A shorter-length fixed array is not acceptable as it would
+  // result in array bounds read errors.
+  //
+ if (length != node->type_length()) {
+ throw ParquetException("Column length mismatch. Column '" + node->name() +
+ "' has length " + std::to_string(node->type_length()) +
+ " not " + std::to_string(length));
+ }
+}
+
+int64_t StreamWriter::SkipColumns(int num_columns_to_skip) {
+ int num_columns_skipped = 0;
+
+ for (; (num_columns_to_skip > num_columns_skipped) &&
+ static_cast<std::size_t>(column_index_) < nodes_.size();
+ ++num_columns_skipped) {
+ const auto& node = nodes_[column_index_];
+
+ if (node->is_required()) {
+ throw ParquetException("Cannot skip column '" + node->name() +
+ "' as it is required.");
+ }
+ auto writer = row_group_writer_->column(column_index_++);
+
+ WriteNullValue(writer);
+ }
+ return num_columns_skipped;
+}
+
+void StreamWriter::WriteNullValue(ColumnWriter* writer) {
+ switch (writer->type()) {
+ case Type::BOOLEAN:
+ static_cast<BoolWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::INT32:
+ static_cast<Int32Writer*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::INT64:
+ static_cast<Int64Writer*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::BYTE_ARRAY:
+ static_cast<ByteArrayWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ static_cast<FixedLenByteArrayWriter*>(writer)->WriteBatch(
+ kBatchSizeOne, &kDefLevelZero, &kRepLevelZero, nullptr);
+ break;
+ case Type::FLOAT:
+ static_cast<FloatWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::DOUBLE:
+ static_cast<DoubleWriter*>(writer)->WriteBatch(kBatchSizeOne, &kDefLevelZero,
+ &kRepLevelZero, nullptr);
+ break;
+ case Type::INT96:
+ case Type::UNDEFINED:
+ throw ParquetException("Unexpected type: " + TypeToString(writer->type()));
+ break;
+ }
+}
+
+void StreamWriter::SkipOptionalColumn() {
+ if (SkipColumns(1) != 1) {
+ throw ParquetException("Failed to skip optional column at column index " +
+ std::to_string(column_index_));
+ }
+}
+
+void StreamWriter::EndRow() {
+ if (!file_writer_) {
+ throw ParquetException("StreamWriter not initialized");
+ }
+ if (static_cast<std::size_t>(column_index_) < nodes_.size()) {
+ throw ParquetException("Cannot end row with " + std::to_string(column_index_) +
+ " of " + std::to_string(nodes_.size()) + " columns written");
+ }
+ column_index_ = 0;
+ ++current_row_;
+
+ if (max_row_group_size_ > 0) {
+ if (row_group_size_ > max_row_group_size_) {
+ EndRowGroup();
+ }
+    // Initialize for each row with the size already written
+ // (compressed + uncompressed).
+ //
+ row_group_size_ = row_group_writer_->total_bytes_written() +
+ row_group_writer_->total_compressed_bytes();
+ }
+}
+
+void StreamWriter::EndRowGroup() {
+ if (!file_writer_) {
+ throw ParquetException("StreamWriter not initialized");
+ }
+ // Avoid creating empty row groups.
+ if (row_group_writer_->num_rows() > 0) {
+ row_group_writer_->Close();
+ row_group_writer_.reset(file_writer_->AppendBufferedRowGroup());
+ }
+}
+
+StreamWriter& operator<<(StreamWriter& os, EndRowType) {
+ os.EndRow();
+ return os;
+}
+
+StreamWriter& operator<<(StreamWriter& os, EndRowGroupType) {
+ os.EndRowGroup();
+ return os;
+}
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h b/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h
index ebd9a278a2b..d0db850c341 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/stream_writer.h
@@ -1,243 +1,243 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <array>
-#include <chrono>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "arrow/util/optional.h"
-#include "arrow/util/string_view.h"
-#include "parquet/column_writer.h"
-#include "parquet/file_writer.h"
-
-namespace parquet {
-
-/// \brief A class for writing Parquet files using an output stream type API.
-///
-/// The values given must be of the correct type, i.e. the type must
-/// match the file schema exactly; otherwise a ParquetException will
-/// be thrown.
-///
-/// The user must explicitly indicate the end of the row using the
-/// EndRow() function or EndRow output manipulator.
-///
-/// A maximum row group size can be configured; the default size is
-/// 512MB. Alternatively, the row group size can be set to zero and the
-/// user can create new row groups by calling the EndRowGroup()
-/// function or using the EndRowGroup output manipulator.
-///
-/// Required and optional fields are supported:
-/// - Required fields are written using operator<<(T)
-/// - Optional fields are written using
-/// operator<<(arrow::util::optional<T>).
-///
-/// Note that operator<<(T) can be used to write optional fields.
-///
-/// Similarly, operator<<(arrow::util::optional<T>) can be used to
-/// write required fields. However, if the optional parameter does not
-/// have a value (i.e. it is nullopt), a ParquetException will be
-/// raised.
-///
-/// Currently there is no support for repeated fields.
-///
-class PARQUET_EXPORT StreamWriter {
- public:
- template <typename T>
- using optional = ::arrow::util::optional<T>;
-
- // N.B. Default constructed objects are not usable. This
- // constructor is provided so that the object may be move
- // assigned afterwards.
- StreamWriter() = default;
-
- explicit StreamWriter(std::unique_ptr<ParquetFileWriter> writer);
-
- ~StreamWriter() = default;
-
- static void SetDefaultMaxRowGroupSize(int64_t max_size);
-
- void SetMaxRowGroupSize(int64_t max_size);
-
- int current_column() const { return column_index_; }
-
- int64_t current_row() const { return current_row_; }
-
- int num_columns() const;
-
- // Moving is possible.
- StreamWriter(StreamWriter&&) = default;
- StreamWriter& operator=(StreamWriter&&) = default;
-
- // Copying is not allowed.
- StreamWriter(const StreamWriter&) = delete;
- StreamWriter& operator=(const StreamWriter&) = delete;
-
- /// \brief Output operators for required fields.
- /// These can also be used for optional fields when a value must be set.
- StreamWriter& operator<<(bool v);
-
- StreamWriter& operator<<(int8_t v);
-
- StreamWriter& operator<<(uint8_t v);
-
- StreamWriter& operator<<(int16_t v);
-
- StreamWriter& operator<<(uint16_t v);
-
- StreamWriter& operator<<(int32_t v);
-
- StreamWriter& operator<<(uint32_t v);
-
- StreamWriter& operator<<(int64_t v);
-
- StreamWriter& operator<<(uint64_t v);
-
- StreamWriter& operator<<(const std::chrono::milliseconds& v);
-
- StreamWriter& operator<<(const std::chrono::microseconds& v);
-
- StreamWriter& operator<<(float v);
-
- StreamWriter& operator<<(double v);
-
- StreamWriter& operator<<(char v);
-
- /// \brief Helper class to write fixed length strings.
- /// This is useful as the standard string view (such as
- /// arrow::util::string_view) is for variable length data.
- struct PARQUET_EXPORT FixedStringView {
- FixedStringView() = default;
-
- explicit FixedStringView(const char* data_ptr);
-
- FixedStringView(const char* data_ptr, std::size_t data_len);
-
- const char* data{NULLPTR};
- std::size_t size{0};
- };
-
- /// \brief Output operators for fixed length strings.
- template <int N>
- StreamWriter& operator<<(const char (&v)[N]) {
- return WriteFixedLength(v, N);
- }
- template <std::size_t N>
- StreamWriter& operator<<(const std::array<char, N>& v) {
- return WriteFixedLength(v.data(), N);
- }
- StreamWriter& operator<<(FixedStringView v);
-
- /// \brief Output operators for variable length strings.
- StreamWriter& operator<<(const char* v);
- StreamWriter& operator<<(const std::string& v);
- StreamWriter& operator<<(::arrow::util::string_view v);
-
- /// \brief Output operator for optional fields.
- template <typename T>
- StreamWriter& operator<<(const optional<T>& v) {
- if (v) {
- return operator<<(*v);
- }
- SkipOptionalColumn();
- return *this;
- }
-
-  /// \brief Skip the next N columns of optional data. If there are
-  /// fewer than N columns remaining, the excess columns are
-  /// ignored.
- /// \throws ParquetException if there is an attempt to skip any
- /// required column.
- /// \return Number of columns actually skipped.
- int64_t SkipColumns(int num_columns_to_skip);
-
-  /// \brief Terminate the current row and advance to the next one.
-  /// \throws ParquetException if not all columns in the row were
-  /// written or skipped.
- void EndRow();
-
-  /// \brief Terminate the current row group and create a new one.
- void EndRowGroup();
-
- protected:
- template <typename WriterType, typename T>
- StreamWriter& Write(const T v) {
- auto writer = static_cast<WriterType*>(row_group_writer_->column(column_index_++));
-
- writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &v);
-
- if (max_row_group_size_ > 0) {
- row_group_size_ += writer->EstimatedBufferedValueBytes();
- }
- return *this;
- }
-
- StreamWriter& WriteVariableLength(const char* data_ptr, std::size_t data_len);
-
- StreamWriter& WriteFixedLength(const char* data_ptr, std::size_t data_len);
-
- void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
- int length = -1);
-
- /// \brief Skip the next column which must be optional.
- /// \throws ParquetException if the next column does not exist or is
- /// not optional.
- void SkipOptionalColumn();
-
- void WriteNullValue(ColumnWriter* writer);
-
- private:
- using node_ptr_type = std::shared_ptr<schema::PrimitiveNode>;
-
- struct null_deleter {
- void operator()(void*) {}
- };
-
- int32_t column_index_{0};
- int64_t current_row_{0};
- int64_t row_group_size_{0};
- int64_t max_row_group_size_{default_row_group_size_};
-
- std::unique_ptr<ParquetFileWriter> file_writer_;
- std::unique_ptr<RowGroupWriter, null_deleter> row_group_writer_;
- std::vector<node_ptr_type> nodes_;
-
- static constexpr int16_t kDefLevelZero = 0;
- static constexpr int16_t kDefLevelOne = 1;
- static constexpr int16_t kRepLevelZero = 0;
- static constexpr int64_t kBatchSizeOne = 1;
-
- static int64_t default_row_group_size_;
-};
-
-struct PARQUET_EXPORT EndRowType {};
-constexpr EndRowType EndRow = {};
-
-struct PARQUET_EXPORT EndRowGroupType {};
-constexpr EndRowGroupType EndRowGroup = {};
-
-PARQUET_EXPORT
-StreamWriter& operator<<(StreamWriter&, EndRowType);
-
-PARQUET_EXPORT
-StreamWriter& operator<<(StreamWriter&, EndRowGroupType);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/util/optional.h"
+#include "arrow/util/string_view.h"
+#include "parquet/column_writer.h"
+#include "parquet/file_writer.h"
+
+namespace parquet {
+
+/// \brief A class for writing Parquet files using an output stream type API.
+///
+/// The values given must be of the correct type, i.e. the type must
+/// match the file schema exactly; otherwise a ParquetException will
+/// be thrown.
+///
+/// The user must explicitly indicate the end of the row using the
+/// EndRow() function or EndRow output manipulator.
+///
+/// A maximum row group size can be configured; the default size is
+/// 512MB. Alternatively, the row group size can be set to zero and the
+/// user can create new row groups by calling the EndRowGroup()
+/// function or using the EndRowGroup output manipulator.
+///
+/// Required and optional fields are supported:
+/// - Required fields are written using operator<<(T)
+/// - Optional fields are written using
+/// operator<<(arrow::util::optional<T>).
+///
+/// Note that operator<<(T) can be used to write optional fields.
+///
+/// Similarly, operator<<(arrow::util::optional<T>) can be used to
+/// write required fields. However, if the optional parameter does not
+/// have a value (i.e. it is nullopt), a ParquetException will be
+/// raised.
+///
+/// Currently there is no support for repeated fields.
+///
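+/// A minimal usage sketch (assumed schema: a required int32 column
+/// followed by an optional UTF8 string column; "outfile" is a
+/// hypothetical output stream and "schema" the matching GroupNode,
+/// both provided by the caller):
+///
+///   parquet::StreamWriter os{
+///       parquet::ParquetFileWriter::Open(outfile, schema)};
+///   os << int32_t{42} << "fred" << parquet::EndRow;
+///   os << int32_t{43} << parquet::StreamWriter::optional<std::string>{}
+///      << parquet::EndRow;
+///   os << parquet::EndRowGroup;
+///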
+class PARQUET_EXPORT StreamWriter {
+ public:
+ template <typename T>
+ using optional = ::arrow::util::optional<T>;
+
+ // N.B. Default constructed objects are not usable. This
+ // constructor is provided so that the object may be move
+ // assigned afterwards.
+ StreamWriter() = default;
+
+ explicit StreamWriter(std::unique_ptr<ParquetFileWriter> writer);
+
+ ~StreamWriter() = default;
+
+ static void SetDefaultMaxRowGroupSize(int64_t max_size);
+
+ void SetMaxRowGroupSize(int64_t max_size);
+
+ int current_column() const { return column_index_; }
+
+ int64_t current_row() const { return current_row_; }
+
+ int num_columns() const;
+
+ // Moving is possible.
+ StreamWriter(StreamWriter&&) = default;
+ StreamWriter& operator=(StreamWriter&&) = default;
+
+ // Copying is not allowed.
+ StreamWriter(const StreamWriter&) = delete;
+ StreamWriter& operator=(const StreamWriter&) = delete;
+
+ /// \brief Output operators for required fields.
+ /// These can also be used for optional fields when a value must be set.
+ StreamWriter& operator<<(bool v);
+
+ StreamWriter& operator<<(int8_t v);
+
+ StreamWriter& operator<<(uint8_t v);
+
+ StreamWriter& operator<<(int16_t v);
+
+ StreamWriter& operator<<(uint16_t v);
+
+ StreamWriter& operator<<(int32_t v);
+
+ StreamWriter& operator<<(uint32_t v);
+
+ StreamWriter& operator<<(int64_t v);
+
+ StreamWriter& operator<<(uint64_t v);
+
+ StreamWriter& operator<<(const std::chrono::milliseconds& v);
+
+ StreamWriter& operator<<(const std::chrono::microseconds& v);
+
+ StreamWriter& operator<<(float v);
+
+ StreamWriter& operator<<(double v);
+
+ StreamWriter& operator<<(char v);
+
+ /// \brief Helper class to write fixed length strings.
+ /// This is useful as the standard string view (such as
+ /// arrow::util::string_view) is for variable length data.
+ struct PARQUET_EXPORT FixedStringView {
+ FixedStringView() = default;
+
+ explicit FixedStringView(const char* data_ptr);
+
+ FixedStringView(const char* data_ptr, std::size_t data_len);
+
+ const char* data{NULLPTR};
+ std::size_t size{0};
+ };
+
+ /// \brief Output operators for fixed length strings.
+ template <int N>
+ StreamWriter& operator<<(const char (&v)[N]) {
+ return WriteFixedLength(v, N);
+ }
+ template <std::size_t N>
+ StreamWriter& operator<<(const std::array<char, N>& v) {
+ return WriteFixedLength(v.data(), N);
+ }
+ StreamWriter& operator<<(FixedStringView v);
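+
+  // A sketch (the buffer name "buf" is hypothetical): write a 4-byte
+  // FIXED_LEN_BYTE_ARRAY value taken from a longer buffer:
+  //
+  //   os << parquet::StreamWriter::FixedStringView{buf, 4};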
+
+ /// \brief Output operators for variable length strings.
+ StreamWriter& operator<<(const char* v);
+ StreamWriter& operator<<(const std::string& v);
+ StreamWriter& operator<<(::arrow::util::string_view v);
+
+ /// \brief Output operator for optional fields.
+ template <typename T>
+ StreamWriter& operator<<(const optional<T>& v) {
+ if (v) {
+ return operator<<(*v);
+ }
+ SkipOptionalColumn();
+ return *this;
+ }
+
+  /// \brief Skip the next N columns of optional data. If there are
+  /// fewer than N columns remaining, the excess columns are
+  /// ignored.
+ /// \throws ParquetException if there is an attempt to skip any
+ /// required column.
+ /// \return Number of columns actually skipped.
+ int64_t SkipColumns(int num_columns_to_skip);
+
+  /// \brief Terminate the current row and advance to the next one.
+  /// \throws ParquetException if not all columns in the row were
+  /// written or skipped.
+ void EndRow();
+
+  /// \brief Terminate the current row group and create a new one.
+ void EndRowGroup();
+
+ protected:
+ template <typename WriterType, typename T>
+ StreamWriter& Write(const T v) {
+ auto writer = static_cast<WriterType*>(row_group_writer_->column(column_index_++));
+
+ writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &v);
+
+ if (max_row_group_size_ > 0) {
+ row_group_size_ += writer->EstimatedBufferedValueBytes();
+ }
+ return *this;
+ }
+
+ StreamWriter& WriteVariableLength(const char* data_ptr, std::size_t data_len);
+
+ StreamWriter& WriteFixedLength(const char* data_ptr, std::size_t data_len);
+
+ void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
+ int length = -1);
+
+ /// \brief Skip the next column which must be optional.
+ /// \throws ParquetException if the next column does not exist or is
+ /// not optional.
+ void SkipOptionalColumn();
+
+ void WriteNullValue(ColumnWriter* writer);
+
+ private:
+ using node_ptr_type = std::shared_ptr<schema::PrimitiveNode>;
+
+ struct null_deleter {
+ void operator()(void*) {}
+ };
+
+ int32_t column_index_{0};
+ int64_t current_row_{0};
+ int64_t row_group_size_{0};
+ int64_t max_row_group_size_{default_row_group_size_};
+
+ std::unique_ptr<ParquetFileWriter> file_writer_;
+ std::unique_ptr<RowGroupWriter, null_deleter> row_group_writer_;
+ std::vector<node_ptr_type> nodes_;
+
+ static constexpr int16_t kDefLevelZero = 0;
+ static constexpr int16_t kDefLevelOne = 1;
+ static constexpr int16_t kRepLevelZero = 0;
+ static constexpr int64_t kBatchSizeOne = 1;
+
+ static int64_t default_row_group_size_;
+};
+
+struct PARQUET_EXPORT EndRowType {};
+constexpr EndRowType EndRow = {};
+
+struct PARQUET_EXPORT EndRowGroupType {};
+constexpr EndRowGroupType EndRowGroup = {};
+
+PARQUET_EXPORT
+StreamWriter& operator<<(StreamWriter&, EndRowType);
+
+PARQUET_EXPORT
+StreamWriter& operator<<(StreamWriter&, EndRowGroupType);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/symbols.map b/contrib/libs/apache/arrow/cpp/src/parquet/symbols.map
index 9df019e5fcd..4bf032dd584 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/symbols.map
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/symbols.map
@@ -1,40 +1,40 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-{
- # Symbols marked as 'local' are not exported by the DSO and thus may not
- # be used by client applications.
- local:
- # devtoolset / static-libstdc++ symbols
- __cxa_*;
- __once_proxy;
-
- extern "C++" {
- # boost
- boost::*;
-
- # thrift
- apache::thrift::*;
-
- # devtoolset or -static-libstdc++ - the Red Hat devtoolset statically
- # links c++11 symbols into binaries so that the result may be executed on
- # a system with an older libstdc++ which doesn't include the necessary
- # c++11 symbols.
- std::*;
- *std::__once_call*;
- };
-};
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+{
+ # Symbols marked as 'local' are not exported by the DSO and thus may not
+ # be used by client applications.
+ local:
+ # devtoolset / static-libstdc++ symbols
+ __cxa_*;
+ __once_proxy;
+
+ extern "C++" {
+ # boost
+ boost::*;
+
+ # thrift
+ apache::thrift::*;
+
+ # devtoolset or -static-libstdc++ - the Red Hat devtoolset statically
+ # links c++11 symbols into binaries so that the result may be executed on
+ # a system with an older libstdc++ which doesn't include the necessary
+ # c++11 symbols.
+ std::*;
+ *std::__once_call*;
+ };
+};
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h b/contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h
index 443d948e30a..ea7df209621 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/thrift_internal.h
@@ -1,494 +1,494 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/util/windows_compatibility.h"
-
-#include <cstdint>
-// Check if thrift version < 0.11.0
-// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp
-#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR)
-#include <boost/shared_ptr.hpp>
-#else
-#include <memory>
-#endif
-#include <string>
-#include <vector>
-
-// TCompactProtocol requires some #defines to work right.
-#define SIGNED_RIGHT_SHIFT_IS 1
-#define ARITHMETIC_RIGHT_SHIFT 1
-#include <thrift/TApplicationException.h>
-#include <thrift/protocol/TCompactProtocol.h>
-#include <thrift/protocol/TDebugProtocol.h>
-
-#include <thrift/protocol/TBinaryProtocol.h>
-#include <thrift/transport/TBufferTransports.h>
-#include <sstream>
-
-#include "arrow/util/logging.h"
-
-#include "parquet/encryption/internal_file_decryptor.h"
-#include "parquet/encryption/internal_file_encryptor.h"
-#include "parquet/exception.h"
-#include "parquet/platform.h"
-#include "parquet/statistics.h"
-#include "parquet/types.h"
-
-#include "generated/parquet_types.h" // IYWU pragma: export
-
-namespace parquet {
-
-// Check if thrift version < 0.11.0
-// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp
-#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR)
-using ::boost::shared_ptr;
-#else
-using ::std::shared_ptr;
-#endif
-
-// ----------------------------------------------------------------------
-// Convert Thrift enums to Parquet enums
-
-// Unsafe enum converters (input is not checked for validity)
-
-static inline Type::type FromThriftUnsafe(format::Type::type type) {
- return static_cast<Type::type>(type);
-}
-
-static inline ConvertedType::type FromThriftUnsafe(format::ConvertedType::type type) {
- // item 0 is NONE
- return static_cast<ConvertedType::type>(static_cast<int>(type) + 1);
-}
-
-static inline Repetition::type FromThriftUnsafe(format::FieldRepetitionType::type type) {
- return static_cast<Repetition::type>(type);
-}
-
-static inline Encoding::type FromThriftUnsafe(format::Encoding::type type) {
- return static_cast<Encoding::type>(type);
-}
-
-static inline PageType::type FromThriftUnsafe(format::PageType::type type) {
- return static_cast<PageType::type>(type);
-}
-
-static inline Compression::type FromThriftUnsafe(format::CompressionCodec::type type) {
- switch (type) {
- case format::CompressionCodec::UNCOMPRESSED:
- return Compression::UNCOMPRESSED;
- case format::CompressionCodec::SNAPPY:
- return Compression::SNAPPY;
- case format::CompressionCodec::GZIP:
- return Compression::GZIP;
- case format::CompressionCodec::LZO:
- return Compression::LZO;
- case format::CompressionCodec::BROTLI:
- return Compression::BROTLI;
- case format::CompressionCodec::LZ4:
- return Compression::LZ4_HADOOP;
- case format::CompressionCodec::LZ4_RAW:
- return Compression::LZ4;
- case format::CompressionCodec::ZSTD:
- return Compression::ZSTD;
- default:
- DCHECK(false) << "Cannot reach here";
- return Compression::UNCOMPRESSED;
- }
-}
-
-namespace internal {
-
-template <typename T>
-struct ThriftEnumTypeTraits {};
-
-template <>
-struct ThriftEnumTypeTraits<::parquet::format::Type::type> {
- using ParquetEnum = Type;
-};
-
-template <>
-struct ThriftEnumTypeTraits<::parquet::format::ConvertedType::type> {
- using ParquetEnum = ConvertedType;
-};
-
-template <>
-struct ThriftEnumTypeTraits<::parquet::format::FieldRepetitionType::type> {
- using ParquetEnum = Repetition;
-};
-
-template <>
-struct ThriftEnumTypeTraits<::parquet::format::Encoding::type> {
- using ParquetEnum = Encoding;
-};
-
-template <>
-struct ThriftEnumTypeTraits<::parquet::format::PageType::type> {
- using ParquetEnum = PageType;
-};
-
-// If the parquet file is corrupted it is possible the enum value decoded
-// will not be in the range of defined values, which is undefined behaviour.
-// This facility prevents this by loading the value as the underlying type
-// and checking to make sure it is in range.
-
-template <typename EnumType,
- typename EnumTypeRaw = typename std::underlying_type<EnumType>::type>
-inline static EnumTypeRaw LoadEnumRaw(const EnumType* in) {
- EnumTypeRaw raw_value;
- // Use memcpy(), as a regular cast would be undefined behaviour on invalid values
- memcpy(&raw_value, in, sizeof(EnumType));
- return raw_value;
-}
-
-template <typename ApiType>
-struct SafeLoader {
- using ApiTypeEnum = typename ApiType::type;
- using ApiTypeRawEnum = typename std::underlying_type<ApiTypeEnum>::type;
-
- template <typename ThriftType>
- inline static ApiTypeRawEnum LoadRaw(const ThriftType* in) {
- static_assert(sizeof(ApiTypeEnum) == sizeof(ThriftType),
- "parquet type should always be the same size as thrift type");
- return static_cast<ApiTypeRawEnum>(LoadEnumRaw(in));
- }
-
- template <typename ThriftType, bool IsUnsigned = true>
- inline static ApiTypeEnum LoadChecked(
- const typename std::enable_if<IsUnsigned, ThriftType>::type* in) {
- auto raw_value = LoadRaw(in);
- if (ARROW_PREDICT_FALSE(raw_value >=
- static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED))) {
- return ApiType::UNDEFINED;
- }
- return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
- }
-
- template <typename ThriftType, bool IsUnsigned = false>
- inline static ApiTypeEnum LoadChecked(
- const typename std::enable_if<!IsUnsigned, ThriftType>::type* in) {
- auto raw_value = LoadRaw(in);
- if (ARROW_PREDICT_FALSE(raw_value >=
- static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED) ||
- raw_value < 0)) {
- return ApiType::UNDEFINED;
- }
- return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
- }
-
- template <typename ThriftType>
- inline static ApiTypeEnum Load(const ThriftType* in) {
- return LoadChecked<ThriftType, std::is_unsigned<ApiTypeRawEnum>::value>(in);
- }
-};
-
-} // namespace internal
-
-// Safe enum loader: will check for invalid enum value before converting
-
-template <typename ThriftType,
- typename ParquetEnum =
- typename internal::ThriftEnumTypeTraits<ThriftType>::ParquetEnum>
-inline typename ParquetEnum::type LoadEnumSafe(const ThriftType* in) {
- return internal::SafeLoader<ParquetEnum>::Load(in);
-}
-
-inline typename Compression::type LoadEnumSafe(const format::CompressionCodec::type* in) {
- const auto raw_value = internal::LoadEnumRaw(in);
- // Check bounds manually, as Compression::type doesn't have the same values
- // as format::CompressionCodec.
- const auto min_value =
- static_cast<decltype(raw_value)>(format::CompressionCodec::UNCOMPRESSED);
- const auto max_value =
- static_cast<decltype(raw_value)>(format::CompressionCodec::LZ4_RAW);
- if (raw_value < min_value || raw_value > max_value) {
- return Compression::UNCOMPRESSED;
- }
- return FromThriftUnsafe(*in);
-}
-
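-// A usage sketch (the field access is hypothetical): safely load a
-// thrift enum decoded from a possibly corrupt file; out-of-range raw
-// values map to UNDEFINED instead of invoking undefined behaviour:
-//
-//   format::Encoding::type raw = thrift_chunk_meta.encodings[0];
-//   Encoding::type encoding = LoadEnumSafe(&raw);
-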
-// Safe non-enum converters
-
-static inline AadMetadata FromThrift(format::AesGcmV1 aesGcmV1) {
- return AadMetadata{aesGcmV1.aad_prefix, aesGcmV1.aad_file_unique,
- aesGcmV1.supply_aad_prefix};
-}
-
-static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) {
- return AadMetadata{aesGcmCtrV1.aad_prefix, aesGcmCtrV1.aad_file_unique,
- aesGcmCtrV1.supply_aad_prefix};
-}
-
-static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encryption) {
- EncryptionAlgorithm encryption_algorithm;
-
- if (encryption.__isset.AES_GCM_V1) {
- encryption_algorithm.algorithm = ParquetCipher::AES_GCM_V1;
- encryption_algorithm.aad = FromThrift(encryption.AES_GCM_V1);
- } else if (encryption.__isset.AES_GCM_CTR_V1) {
- encryption_algorithm.algorithm = ParquetCipher::AES_GCM_CTR_V1;
- encryption_algorithm.aad = FromThrift(encryption.AES_GCM_CTR_V1);
- } else {
- throw ParquetException("Unsupported algorithm");
- }
- return encryption_algorithm;
-}
-
-// ----------------------------------------------------------------------
-// Convert Thrift enums from Parquet enums
-
-static inline format::Type::type ToThrift(Type::type type) {
- return static_cast<format::Type::type>(type);
-}
-
-static inline format::ConvertedType::type ToThrift(ConvertedType::type type) {
- // item 0 is NONE
- DCHECK_NE(type, ConvertedType::NONE);
- // it is forbidden to emit "NA" (PARQUET-1990)
- DCHECK_NE(type, ConvertedType::NA);
- DCHECK_NE(type, ConvertedType::UNDEFINED);
- return static_cast<format::ConvertedType::type>(static_cast<int>(type) - 1);
-}
-
-static inline format::FieldRepetitionType::type ToThrift(Repetition::type type) {
- return static_cast<format::FieldRepetitionType::type>(type);
-}
-
-static inline format::Encoding::type ToThrift(Encoding::type type) {
- return static_cast<format::Encoding::type>(type);
-}
-
-static inline format::CompressionCodec::type ToThrift(Compression::type type) {
- switch (type) {
- case Compression::UNCOMPRESSED:
- return format::CompressionCodec::UNCOMPRESSED;
- case Compression::SNAPPY:
- return format::CompressionCodec::SNAPPY;
- case Compression::GZIP:
- return format::CompressionCodec::GZIP;
- case Compression::LZO:
- return format::CompressionCodec::LZO;
- case Compression::BROTLI:
- return format::CompressionCodec::BROTLI;
- case Compression::LZ4:
- return format::CompressionCodec::LZ4_RAW;
- case Compression::LZ4_HADOOP:
- // Deprecated "LZ4" Parquet compression has Hadoop-specific framing
- return format::CompressionCodec::LZ4;
- case Compression::ZSTD:
- return format::CompressionCodec::ZSTD;
- default:
- DCHECK(false) << "Cannot reach here";
- return format::CompressionCodec::UNCOMPRESSED;
- }
-}
-
-static inline format::Statistics ToThrift(const EncodedStatistics& stats) {
- format::Statistics statistics;
- if (stats.has_min) {
- statistics.__set_min_value(stats.min());
- // If the order is SIGNED, then the old min value must be set too.
-    // This is for backward compatibility.
- if (stats.is_signed()) {
- statistics.__set_min(stats.min());
- }
- }
- if (stats.has_max) {
- statistics.__set_max_value(stats.max());
- // If the order is SIGNED, then the old max value must be set too.
-    // This is for backward compatibility.
- if (stats.is_signed()) {
- statistics.__set_max(stats.max());
- }
- }
- if (stats.has_null_count) {
- statistics.__set_null_count(stats.null_count);
- }
- if (stats.has_distinct_count) {
- statistics.__set_distinct_count(stats.distinct_count);
- }
-
- return statistics;
-}
-
-static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) {
- format::AesGcmV1 aesGcmV1;
- // aad_file_unique is always set
- aesGcmV1.__set_aad_file_unique(aad.aad_file_unique);
- aesGcmV1.__set_supply_aad_prefix(aad.supply_aad_prefix);
- if (!aad.aad_prefix.empty()) {
- aesGcmV1.__set_aad_prefix(aad.aad_prefix);
- }
- return aesGcmV1;
-}
-
-static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) {
- format::AesGcmCtrV1 aesGcmCtrV1;
- // aad_file_unique is always set
- aesGcmCtrV1.__set_aad_file_unique(aad.aad_file_unique);
- aesGcmCtrV1.__set_supply_aad_prefix(aad.supply_aad_prefix);
- if (!aad.aad_prefix.empty()) {
- aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix);
- }
- return aesGcmCtrV1;
-}
-
-static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) {
- format::EncryptionAlgorithm encryption_algorithm;
- if (encryption.algorithm == ParquetCipher::AES_GCM_V1) {
- encryption_algorithm.__set_AES_GCM_V1(ToAesGcmV1Thrift(encryption.aad));
- } else {
- encryption_algorithm.__set_AES_GCM_CTR_V1(ToAesGcmCtrV1Thrift(encryption.aad));
- }
- return encryption_algorithm;
-}
-
-// ----------------------------------------------------------------------
-// Thrift struct serialization / deserialization utilities
-
-using ThriftBuffer = apache::thrift::transport::TMemoryBuffer;
-
-template <class T>
-inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len,
- T* deserialized_msg) {
- // Deserialize msg bytes into c++ thrift msg using memory transport.
- shared_ptr<ThriftBuffer> tmem_transport(
- new ThriftBuffer(const_cast<uint8_t*>(buf), *len));
- apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> tproto_factory;
- // Protect against CPU and memory bombs
- tproto_factory.setStringSizeLimit(100 * 1000 * 1000);
- // Structs in the thrift definition are relatively large (at least 300 bytes).
- // This limits total memory to the same order of magnitude as stringSize.
- tproto_factory.setContainerSizeLimit(1000 * 1000);
- shared_ptr<apache::thrift::protocol::TProtocol> tproto = //
- tproto_factory.getProtocol(tmem_transport);
- try {
- deserialized_msg->read(tproto.get());
- } catch (std::exception& e) {
- std::stringstream ss;
- ss << "Couldn't deserialize thrift: " << e.what() << "\n";
- throw ParquetException(ss.str());
- }
- uint32_t bytes_left = tmem_transport->available_read();
- *len = *len - bytes_left;
-}
-
-// Deserialize a thrift message from buf/len. buf/len must at least contain
-// all the bytes needed to store the thrift message. On return, len will be
-// set to the actual length of the header.
-template <class T>
-inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg,
- const std::shared_ptr<Decryptor>& decryptor = NULLPTR) {
- // thrift message is not encrypted
- if (decryptor == NULLPTR) {
- DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg);
- } else { // thrift message is encrypted
- uint32_t clen = *len;
- // decrypt
- std::shared_ptr<ResizableBuffer> decrypted_buffer =
- std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(
- decryptor->pool(),
- static_cast<int64_t>(clen - decryptor->CiphertextSizeDelta())));
- const uint8_t* cipher_buf = buf;
- uint32_t decrypted_buffer_len =
- decryptor->Decrypt(cipher_buf, 0, decrypted_buffer->mutable_data());
- if (decrypted_buffer_len <= 0) {
- throw ParquetException("Couldn't decrypt buffer\n");
- }
- *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta();
- DeserializeThriftMsg(decrypted_buffer->data(), &decrypted_buffer_len,
- deserialized_msg);
- }
-}
-
-/// Utility class to serialize thrift objects to a binary format. Reuse this
-/// object where possible so that its underlying memory is reused as well.
-/// Note: thrift may encode NUL bytes into the serialized buffer, so it is not
-/// safe to treat the result as a C string.
-class ThriftSerializer {
- public:
- explicit ThriftSerializer(int initial_buffer_size = 1024)
- : mem_buffer_(new ThriftBuffer(initial_buffer_size)) {
- apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> factory;
- protocol_ = factory.getProtocol(mem_buffer_);
- }
-
- /// Serialize obj into a memory buffer. The result is returned in buffer/len. The
- /// memory returned is owned by this object and will be invalid when another object
- /// is serialized.
- template <class T>
- void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer) {
- SerializeObject(obj);
- mem_buffer_->getBuffer(buffer, len);
- }
-
- template <class T>
- void SerializeToString(const T* obj, std::string* result) {
- SerializeObject(obj);
- *result = mem_buffer_->getBufferAsString();
- }
-
- template <class T>
- int64_t Serialize(const T* obj, ArrowOutputStream* out,
- const std::shared_ptr<Encryptor>& encryptor = NULLPTR) {
- uint8_t* out_buffer;
- uint32_t out_length;
- SerializeToBuffer(obj, &out_length, &out_buffer);
-
- // obj is not encrypted
- if (encryptor == NULLPTR) {
- PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length));
- return static_cast<int64_t>(out_length);
- } else { // obj is encrypted
- return SerializeEncryptedObj(out, out_buffer, out_length, encryptor);
- }
- }
-
- private:
- template <class T>
- void SerializeObject(const T* obj) {
- try {
- mem_buffer_->resetBuffer();
- obj->write(protocol_.get());
- } catch (std::exception& e) {
- std::stringstream ss;
- ss << "Couldn't serialize thrift: " << e.what() << "\n";
- throw ParquetException(ss.str());
- }
- }
-
- int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer,
- uint32_t out_length,
- const std::shared_ptr<Encryptor>& encryptor) {
- std::shared_ptr<ResizableBuffer> cipher_buffer =
- std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(
- encryptor->pool(),
- static_cast<int64_t>(encryptor->CiphertextSizeDelta() + out_length)));
- int cipher_buffer_len =
- encryptor->Encrypt(out_buffer, out_length, cipher_buffer->mutable_data());
-
- PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len));
- return static_cast<int64_t>(cipher_buffer_len);
- }
-
- shared_ptr<ThriftBuffer> mem_buffer_;
- shared_ptr<apache::thrift::protocol::TProtocol> protocol_;
-};
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/windows_compatibility.h"
+
+#include <cstdint>
+// Check if thrift version < 0.11.0
+// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp
+#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR)
+#include <boost/shared_ptr.hpp>
+#else
+#include <memory>
+#endif
+#include <string>
+#include <vector>
+
+// TCompactProtocol requires some #defines to work right.
+#define SIGNED_RIGHT_SHIFT_IS 1
+#define ARITHMETIC_RIGHT_SHIFT 1
+#include <thrift/TApplicationException.h>
+#include <thrift/protocol/TCompactProtocol.h>
+#include <thrift/protocol/TDebugProtocol.h>
+
+#include <thrift/protocol/TBinaryProtocol.h>
+#include <thrift/transport/TBufferTransports.h>
+#include <sstream>
+
+#include "arrow/util/logging.h"
+
+#include "parquet/encryption/internal_file_decryptor.h"
+#include "parquet/encryption/internal_file_encryptor.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/statistics.h"
+#include "parquet/types.h"
+
+#include "generated/parquet_types.h" // IYWU pragma: export
+
+namespace parquet {
+
+// Check if thrift version < 0.11.0
+// or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp
+#if defined(PARQUET_THRIFT_USE_BOOST) || defined(FORCE_BOOST_SMART_PTR)
+using ::boost::shared_ptr;
+#else
+using ::std::shared_ptr;
+#endif
+
+// ----------------------------------------------------------------------
+// Convert Thrift enums to Parquet enums
+
+// Unsafe enum converters (input is not checked for validity)
+
+static inline Type::type FromThriftUnsafe(format::Type::type type) {
+ return static_cast<Type::type>(type);
+}
+
+static inline ConvertedType::type FromThriftUnsafe(format::ConvertedType::type type) {
+ // item 0 is NONE
+ return static_cast<ConvertedType::type>(static_cast<int>(type) + 1);
+}
+
+static inline Repetition::type FromThriftUnsafe(format::FieldRepetitionType::type type) {
+ return static_cast<Repetition::type>(type);
+}
+
+static inline Encoding::type FromThriftUnsafe(format::Encoding::type type) {
+ return static_cast<Encoding::type>(type);
+}
+
+static inline PageType::type FromThriftUnsafe(format::PageType::type type) {
+ return static_cast<PageType::type>(type);
+}
+
+static inline Compression::type FromThriftUnsafe(format::CompressionCodec::type type) {
+ switch (type) {
+ case format::CompressionCodec::UNCOMPRESSED:
+ return Compression::UNCOMPRESSED;
+ case format::CompressionCodec::SNAPPY:
+ return Compression::SNAPPY;
+ case format::CompressionCodec::GZIP:
+ return Compression::GZIP;
+ case format::CompressionCodec::LZO:
+ return Compression::LZO;
+ case format::CompressionCodec::BROTLI:
+ return Compression::BROTLI;
+ case format::CompressionCodec::LZ4:
+ return Compression::LZ4_HADOOP;
+ case format::CompressionCodec::LZ4_RAW:
+ return Compression::LZ4;
+ case format::CompressionCodec::ZSTD:
+ return Compression::ZSTD;
+ default:
+ DCHECK(false) << "Cannot reach here";
+ return Compression::UNCOMPRESSED;
+ }
+}
+
+namespace internal {
+
+template <typename T>
+struct ThriftEnumTypeTraits {};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::Type::type> {
+ using ParquetEnum = Type;
+};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::ConvertedType::type> {
+ using ParquetEnum = ConvertedType;
+};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::FieldRepetitionType::type> {
+ using ParquetEnum = Repetition;
+};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::Encoding::type> {
+ using ParquetEnum = Encoding;
+};
+
+template <>
+struct ThriftEnumTypeTraits<::parquet::format::PageType::type> {
+ using ParquetEnum = PageType;
+};
+
+// If the parquet file is corrupted, the decoded enum value may fall outside
+// the range of defined values, and interpreting it as the enum type would be
+// undefined behaviour. This facility avoids that by loading the value as the
+// underlying integer type and checking that it is in range.
+
+template <typename EnumType,
+ typename EnumTypeRaw = typename std::underlying_type<EnumType>::type>
+inline static EnumTypeRaw LoadEnumRaw(const EnumType* in) {
+ EnumTypeRaw raw_value;
+ // Use memcpy(), as a regular cast would be undefined behaviour on invalid values
+ memcpy(&raw_value, in, sizeof(EnumType));
+ return raw_value;
+}
+
+template <typename ApiType>
+struct SafeLoader {
+ using ApiTypeEnum = typename ApiType::type;
+ using ApiTypeRawEnum = typename std::underlying_type<ApiTypeEnum>::type;
+
+ template <typename ThriftType>
+ inline static ApiTypeRawEnum LoadRaw(const ThriftType* in) {
+ static_assert(sizeof(ApiTypeEnum) == sizeof(ThriftType),
+ "parquet type should always be the same size as thrift type");
+ return static_cast<ApiTypeRawEnum>(LoadEnumRaw(in));
+ }
+
+ template <typename ThriftType, bool IsUnsigned = true>
+ inline static ApiTypeEnum LoadChecked(
+ const typename std::enable_if<IsUnsigned, ThriftType>::type* in) {
+ auto raw_value = LoadRaw(in);
+ if (ARROW_PREDICT_FALSE(raw_value >=
+ static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED))) {
+ return ApiType::UNDEFINED;
+ }
+ return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
+ }
+
+ template <typename ThriftType, bool IsUnsigned = false>
+ inline static ApiTypeEnum LoadChecked(
+ const typename std::enable_if<!IsUnsigned, ThriftType>::type* in) {
+ auto raw_value = LoadRaw(in);
+ if (ARROW_PREDICT_FALSE(raw_value >=
+ static_cast<ApiTypeRawEnum>(ApiType::UNDEFINED) ||
+ raw_value < 0)) {
+ return ApiType::UNDEFINED;
+ }
+ return FromThriftUnsafe(static_cast<ThriftType>(raw_value));
+ }
+
+ template <typename ThriftType>
+ inline static ApiTypeEnum Load(const ThriftType* in) {
+ return LoadChecked<ThriftType, std::is_unsigned<ApiTypeRawEnum>::value>(in);
+ }
+};
+
+} // namespace internal
+
+// Safe enum loader: will check for invalid enum value before converting
+
+template <typename ThriftType,
+ typename ParquetEnum =
+ typename internal::ThriftEnumTypeTraits<ThriftType>::ParquetEnum>
+inline typename ParquetEnum::type LoadEnumSafe(const ThriftType* in) {
+ return internal::SafeLoader<ParquetEnum>::Load(in);
+}
+
+inline typename Compression::type LoadEnumSafe(const format::CompressionCodec::type* in) {
+ const auto raw_value = internal::LoadEnumRaw(in);
+ // Check bounds manually, as Compression::type doesn't have the same values
+ // as format::CompressionCodec.
+ const auto min_value =
+ static_cast<decltype(raw_value)>(format::CompressionCodec::UNCOMPRESSED);
+ const auto max_value =
+ static_cast<decltype(raw_value)>(format::CompressionCodec::LZ4_RAW);
+ if (raw_value < min_value || raw_value > max_value) {
+ return Compression::UNCOMPRESSED;
+ }
+ return FromThriftUnsafe(*in);
+}
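
To make the intent concrete, here is a minimal sketch of these safe loaders at a metadata-reading call site. The format::ColumnMetaData struct comes from the generated parquet.thrift bindings; the surrounding function and the iostream output are purely illustrative:

#include <iostream>

// Sketch (inside namespace parquet): "column" was deserialized from an
// untrusted file, so its enum slots may hold arbitrary integers.
void InspectColumn(const format::ColumnMetaData& column) {
  // LoadEnumSafe reads the raw underlying integer and range-checks it,
  // so a corrupt value degrades to UNDEFINED instead of invoking UB.
  Type::type physical_type = LoadEnumSafe(&column.type);
  // Out-of-range codecs degrade to UNCOMPRESSED per the overload above.
  Compression::type codec = LoadEnumSafe(&column.codec);
  (void)codec;
  if (physical_type == Type::UNDEFINED) {
    throw ParquetException("Corrupt column metadata: unrecognized physical type");
  }
  std::cout << TypeToString(physical_type) << std::endl;
}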
+
+// Safe non-enum converters
+
+static inline AadMetadata FromThrift(format::AesGcmV1 aesGcmV1) {
+ return AadMetadata{aesGcmV1.aad_prefix, aesGcmV1.aad_file_unique,
+ aesGcmV1.supply_aad_prefix};
+}
+
+static inline AadMetadata FromThrift(format::AesGcmCtrV1 aesGcmCtrV1) {
+ return AadMetadata{aesGcmCtrV1.aad_prefix, aesGcmCtrV1.aad_file_unique,
+ aesGcmCtrV1.supply_aad_prefix};
+}
+
+static inline EncryptionAlgorithm FromThrift(format::EncryptionAlgorithm encryption) {
+ EncryptionAlgorithm encryption_algorithm;
+
+ if (encryption.__isset.AES_GCM_V1) {
+ encryption_algorithm.algorithm = ParquetCipher::AES_GCM_V1;
+ encryption_algorithm.aad = FromThrift(encryption.AES_GCM_V1);
+ } else if (encryption.__isset.AES_GCM_CTR_V1) {
+ encryption_algorithm.algorithm = ParquetCipher::AES_GCM_CTR_V1;
+ encryption_algorithm.aad = FromThrift(encryption.AES_GCM_CTR_V1);
+ } else {
+ throw ParquetException("Unsupported algorithm");
+ }
+ return encryption_algorithm;
+}
+
+// ----------------------------------------------------------------------
+// Convert Parquet enums to Thrift enums
+
+static inline format::Type::type ToThrift(Type::type type) {
+ return static_cast<format::Type::type>(type);
+}
+
+static inline format::ConvertedType::type ToThrift(ConvertedType::type type) {
+ // item 0 is NONE
+ DCHECK_NE(type, ConvertedType::NONE);
+ // it is forbidden to emit "NA" (PARQUET-1990)
+ DCHECK_NE(type, ConvertedType::NA);
+ DCHECK_NE(type, ConvertedType::UNDEFINED);
+ return static_cast<format::ConvertedType::type>(static_cast<int>(type) - 1);
+}
+
+static inline format::FieldRepetitionType::type ToThrift(Repetition::type type) {
+ return static_cast<format::FieldRepetitionType::type>(type);
+}
+
+static inline format::Encoding::type ToThrift(Encoding::type type) {
+ return static_cast<format::Encoding::type>(type);
+}
+
+static inline format::CompressionCodec::type ToThrift(Compression::type type) {
+ switch (type) {
+ case Compression::UNCOMPRESSED:
+ return format::CompressionCodec::UNCOMPRESSED;
+ case Compression::SNAPPY:
+ return format::CompressionCodec::SNAPPY;
+ case Compression::GZIP:
+ return format::CompressionCodec::GZIP;
+ case Compression::LZO:
+ return format::CompressionCodec::LZO;
+ case Compression::BROTLI:
+ return format::CompressionCodec::BROTLI;
+ case Compression::LZ4:
+ return format::CompressionCodec::LZ4_RAW;
+ case Compression::LZ4_HADOOP:
+ // Deprecated "LZ4" Parquet compression has Hadoop-specific framing
+ return format::CompressionCodec::LZ4;
+ case Compression::ZSTD:
+ return format::CompressionCodec::ZSTD;
+ default:
+ DCHECK(false) << "Cannot reach here";
+ return format::CompressionCodec::UNCOMPRESSED;
+ }
+}
+
+static inline format::Statistics ToThrift(const EncodedStatistics& stats) {
+ format::Statistics statistics;
+ if (stats.has_min) {
+ statistics.__set_min_value(stats.min());
+ // If the order is SIGNED, then the old min value must be set too.
+ // This is for backward compatibility
+ if (stats.is_signed()) {
+ statistics.__set_min(stats.min());
+ }
+ }
+ if (stats.has_max) {
+ statistics.__set_max_value(stats.max());
+ // If the order is SIGNED, then the old max value must be set too.
+ // This is for backward compatibility
+ if (stats.is_signed()) {
+ statistics.__set_max(stats.max());
+ }
+ }
+ if (stats.has_null_count) {
+ statistics.__set_null_count(stats.null_count);
+ }
+ if (stats.has_distinct_count) {
+ statistics.__set_distinct_count(stats.distinct_count);
+ }
+
+ return statistics;
+}
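
A short sketch of the signed/unsigned behaviour above, assuming the chaining setters (set_min/set_max/set_null_count/set_is_signed) that EncodedStatistics exposes in parquet/statistics.h:

// Sketch: with a SIGNED sort order, ToThrift fills both the current
// min_value/max_value fields and the deprecated min/max fields.
EncodedStatistics encoded;
encoded.set_min("apple").set_max("zebra");  // assumption: setters chain
encoded.set_null_count(0);
encoded.set_is_signed(true);
format::Statistics thrift_stats = ToThrift(encoded);
// Here thrift_stats.__isset.min_value and thrift_stats.__isset.min are both
// true; with set_is_signed(false), only the *_value fields would be set.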
+
+static inline format::AesGcmV1 ToAesGcmV1Thrift(AadMetadata aad) {
+ format::AesGcmV1 aesGcmV1;
+ // aad_file_unique is always set
+ aesGcmV1.__set_aad_file_unique(aad.aad_file_unique);
+ aesGcmV1.__set_supply_aad_prefix(aad.supply_aad_prefix);
+ if (!aad.aad_prefix.empty()) {
+ aesGcmV1.__set_aad_prefix(aad.aad_prefix);
+ }
+ return aesGcmV1;
+}
+
+static inline format::AesGcmCtrV1 ToAesGcmCtrV1Thrift(AadMetadata aad) {
+ format::AesGcmCtrV1 aesGcmCtrV1;
+ // aad_file_unique is always set
+ aesGcmCtrV1.__set_aad_file_unique(aad.aad_file_unique);
+ aesGcmCtrV1.__set_supply_aad_prefix(aad.supply_aad_prefix);
+ if (!aad.aad_prefix.empty()) {
+ aesGcmCtrV1.__set_aad_prefix(aad.aad_prefix);
+ }
+ return aesGcmCtrV1;
+}
+
+static inline format::EncryptionAlgorithm ToThrift(EncryptionAlgorithm encryption) {
+ format::EncryptionAlgorithm encryption_algorithm;
+ if (encryption.algorithm == ParquetCipher::AES_GCM_V1) {
+ encryption_algorithm.__set_AES_GCM_V1(ToAesGcmV1Thrift(encryption.aad));
+ } else {
+ encryption_algorithm.__set_AES_GCM_CTR_V1(ToAesGcmCtrV1Thrift(encryption.aad));
+ }
+ return encryption_algorithm;
+}
+
+// ----------------------------------------------------------------------
+// Thrift struct serialization / deserialization utilities
+
+using ThriftBuffer = apache::thrift::transport::TMemoryBuffer;
+
+template <class T>
+inline void DeserializeThriftUnencryptedMsg(const uint8_t* buf, uint32_t* len,
+ T* deserialized_msg) {
+ // Deserialize msg bytes into c++ thrift msg using memory transport.
+ shared_ptr<ThriftBuffer> tmem_transport(
+ new ThriftBuffer(const_cast<uint8_t*>(buf), *len));
+ apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> tproto_factory;
+ // Protect against CPU and memory bombs
+ tproto_factory.setStringSizeLimit(100 * 1000 * 1000);
+ // Structs in the thrift definition are relatively large (at least 300 bytes).
+ // This limits total memory to the same order of magnitude as stringSize.
+ tproto_factory.setContainerSizeLimit(1000 * 1000);
+ shared_ptr<apache::thrift::protocol::TProtocol> tproto = //
+ tproto_factory.getProtocol(tmem_transport);
+ try {
+ deserialized_msg->read(tproto.get());
+ } catch (std::exception& e) {
+ std::stringstream ss;
+ ss << "Couldn't deserialize thrift: " << e.what() << "\n";
+ throw ParquetException(ss.str());
+ }
+ uint32_t bytes_left = tmem_transport->available_read();
+ *len = *len - bytes_left;
+}
+
+// Deserialize a thrift message from buf/len. buf/len must at least contain
+// all the bytes needed to store the thrift message. On return, len will be
+// set to the actual length of the header.
+template <class T>
+inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deserialized_msg,
+ const std::shared_ptr<Decryptor>& decryptor = NULLPTR) {
+ // thrift message is not encrypted
+ if (decryptor == NULLPTR) {
+ DeserializeThriftUnencryptedMsg(buf, len, deserialized_msg);
+ } else { // thrift message is encrypted
+ uint32_t clen = *len;
+ // decrypt
+ std::shared_ptr<ResizableBuffer> decrypted_buffer =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(
+ decryptor->pool(),
+ static_cast<int64_t>(clen - decryptor->CiphertextSizeDelta())));
+ const uint8_t* cipher_buf = buf;
+ uint32_t decrypted_buffer_len =
+ decryptor->Decrypt(cipher_buf, 0, decrypted_buffer->mutable_data());
+ if (decrypted_buffer_len <= 0) {
+ throw ParquetException("Couldn't decrypt buffer\n");
+ }
+ *len = decrypted_buffer_len + decryptor->CiphertextSizeDelta();
+ DeserializeThriftMsg(decrypted_buffer->data(), &decrypted_buffer_len,
+ deserialized_msg);
+ }
+}
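
As a usage sketch, parsing a data page header from a raw byte range follows directly from the contract above (format::PageHeader is the generated Thrift struct; data/size are placeholders for bytes already read from the file):

// Sketch: on entry, header_len says how many bytes are available; on
// return it says how many the Thrift header actually consumed.
format::PageHeader ParsePageHeader(const uint8_t* data, uint32_t size) {
  format::PageHeader page_header;
  uint32_t header_len = size;
  DeserializeThriftMsg(data, &header_len, &page_header);
  // The (possibly compressed) page payload begins at data + header_len.
  return page_header;
}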
+
+/// Utility class to serialize thrift objects to a binary format. Reuse this
+/// object where possible so that its underlying memory is reused as well.
+/// Note: thrift may encode NUL bytes into the serialized buffer, so it is not
+/// safe to treat the result as a C string.
+class ThriftSerializer {
+ public:
+ explicit ThriftSerializer(int initial_buffer_size = 1024)
+ : mem_buffer_(new ThriftBuffer(initial_buffer_size)) {
+ apache::thrift::protocol::TCompactProtocolFactoryT<ThriftBuffer> factory;
+ protocol_ = factory.getProtocol(mem_buffer_);
+ }
+
+ /// Serialize obj into a memory buffer. The result is returned in buffer/len. The
+ /// memory returned is owned by this object and will be invalid when another object
+ /// is serialized.
+ template <class T>
+ void SerializeToBuffer(const T* obj, uint32_t* len, uint8_t** buffer) {
+ SerializeObject(obj);
+ mem_buffer_->getBuffer(buffer, len);
+ }
+
+ template <class T>
+ void SerializeToString(const T* obj, std::string* result) {
+ SerializeObject(obj);
+ *result = mem_buffer_->getBufferAsString();
+ }
+
+ template <class T>
+ int64_t Serialize(const T* obj, ArrowOutputStream* out,
+ const std::shared_ptr<Encryptor>& encryptor = NULLPTR) {
+ uint8_t* out_buffer;
+ uint32_t out_length;
+ SerializeToBuffer(obj, &out_length, &out_buffer);
+
+ // obj is not encrypted
+ if (encryptor == NULLPTR) {
+ PARQUET_THROW_NOT_OK(out->Write(out_buffer, out_length));
+ return static_cast<int64_t>(out_length);
+ } else { // obj is encrypted
+ return SerializeEncryptedObj(out, out_buffer, out_length, encryptor);
+ }
+ }
+
+ private:
+ template <class T>
+ void SerializeObject(const T* obj) {
+ try {
+ mem_buffer_->resetBuffer();
+ obj->write(protocol_.get());
+ } catch (std::exception& e) {
+ std::stringstream ss;
+ ss << "Couldn't serialize thrift: " << e.what() << "\n";
+ throw ParquetException(ss.str());
+ }
+ }
+
+ int64_t SerializeEncryptedObj(ArrowOutputStream* out, uint8_t* out_buffer,
+ uint32_t out_length,
+ const std::shared_ptr<Encryptor>& encryptor) {
+ std::shared_ptr<ResizableBuffer> cipher_buffer =
+ std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(
+ encryptor->pool(),
+ static_cast<int64_t>(encryptor->CiphertextSizeDelta() + out_length)));
+ int cipher_buffer_len =
+ encryptor->Encrypt(out_buffer, out_length, cipher_buffer->mutable_data());
+
+ PARQUET_THROW_NOT_OK(out->Write(cipher_buffer->data(), cipher_buffer_len));
+ return static_cast<int64_t>(cipher_buffer_len);
+ }
+
+ shared_ptr<ThriftBuffer> mem_buffer_;
+ shared_ptr<apache::thrift::protocol::TProtocol> protocol_;
+};
+
+} // namespace parquet
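
And the matching write-path sketch with ThriftSerializer; the sink is assumed to be any parquet::ArrowOutputStream, and keeping one serializer around amortizes its buffer allocation as the class comment suggests:

// Sketch: serialize a Thrift FileMetaData struct to an output stream.
int64_t WriteThriftMetadata(const parquet::format::FileMetaData& metadata,
                            parquet::ArrowOutputStream* sink) {
  parquet::ThriftSerializer serializer;  // reuse across calls where possible
  // Returns the number of bytes written; passing an Encryptor instead of
  // the default NULLPTR would write ciphertext.
  return serializer.Serialize(&metadata, sink);
}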
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h b/contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h
index 2153ea63efb..a427f5a9591 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/type_fwd.h
@@ -1,43 +1,43 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-namespace parquet {
-
-struct ParquetVersion {
- enum type { PARQUET_1_0, PARQUET_2_0 };
-};
-
-class FileMetaData;
-class SchemaDescriptor;
-
-class ReaderProperties;
-class ArrowReaderProperties;
-
-class WriterProperties;
-class WriterPropertiesBuilder;
-class ArrowWriterProperties;
-class ArrowWriterPropertiesBuilder;
-
-namespace arrow {
-
-class FileWriter;
-class FileReader;
-
-} // namespace arrow
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace parquet {
+
+struct ParquetVersion {
+ enum type { PARQUET_1_0, PARQUET_2_0 };
+};
+
+class FileMetaData;
+class SchemaDescriptor;
+
+class ReaderProperties;
+class ArrowReaderProperties;
+
+class WriterProperties;
+class WriterPropertiesBuilder;
+class ArrowWriterProperties;
+class ArrowWriterPropertiesBuilder;
+
+namespace arrow {
+
+class FileWriter;
+class FileReader;
+
+} // namespace arrow
+} // namespace parquet
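
A small sketch of what this forward-declaration header buys consumers: interfaces can mention these types without including the heavier metadata and properties headers (the myapp namespace and functions are hypothetical):

#include "parquet/type_fwd.h"

namespace myapp {

// Only the implementing translation unit needs the full definitions;
// these declarations compile against the forward declarations alone.
void LogRowGroupCount(const parquet::FileMetaData& metadata);
parquet::ParquetVersion::type DefaultFormatVersion();

}  // namespace myapp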
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/types.cc b/contrib/libs/apache/arrow/cpp/src/parquet/types.cc
index 35cc43639b8..ef23c40662b 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/types.cc
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/types.cc
@@ -1,1567 +1,1567 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include <cmath>
-#include <cstdint>
-#include <memory>
-#include <sstream>
-#include <string>
-
-#include "arrow/util/checked_cast.h"
-#include "arrow/util/compression.h"
-#include "arrow/util/logging.h"
-
-#include "parquet/exception.h"
-#include "parquet/types.h"
-
-#include "generated/parquet_types.h"
-
-using arrow::internal::checked_cast;
-using arrow::util::Codec;
-
-namespace parquet {
-
-bool IsCodecSupported(Compression::type codec) {
- switch (codec) {
- case Compression::UNCOMPRESSED:
- case Compression::SNAPPY:
- case Compression::GZIP:
- case Compression::BROTLI:
- case Compression::ZSTD:
- case Compression::LZ4:
- case Compression::LZ4_HADOOP:
- return true;
- default:
- return false;
- }
-}
-
-std::unique_ptr<Codec> GetCodec(Compression::type codec) {
- return GetCodec(codec, Codec::UseDefaultCompressionLevel());
-}
-
-std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level) {
- std::unique_ptr<Codec> result;
- if (codec == Compression::LZO) {
- throw ParquetException(
- "While LZO compression is supported by the Parquet format in "
- "general, it is currently not supported by the C++ implementation.");
- }
-
- if (!IsCodecSupported(codec)) {
- std::stringstream ss;
- ss << "Codec type " << Codec::GetCodecAsString(codec)
- << " not supported in Parquet format";
- throw ParquetException(ss.str());
- }
-
- PARQUET_ASSIGN_OR_THROW(result, Codec::Create(codec, compression_level));
- return result;
-}
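
A round-trip sketch using a codec obtained from GetCodec; MaxCompressedLen, Compress, and Decompress follow the arrow::util::Codec interface this file already uses, and PARQUET_ASSIGN_OR_THROW converts arrow::Result errors into ParquetException:

#include <vector>

// Sketch: compress a buffer with ZSTD and decompress it again.
void RoundTripZstd(const uint8_t* input, int64_t input_len) {
  std::unique_ptr<Codec> codec = GetCodec(Compression::ZSTD);
  std::vector<uint8_t> compressed(codec->MaxCompressedLen(input_len, input));
  int64_t compressed_len;
  PARQUET_ASSIGN_OR_THROW(
      compressed_len,
      codec->Compress(input_len, input, compressed.size(), compressed.data()));
  std::vector<uint8_t> decompressed(input_len);
  int64_t decompressed_len;
  PARQUET_ASSIGN_OR_THROW(
      decompressed_len,
      codec->Decompress(compressed_len, compressed.data(),
                        decompressed.size(), decompressed.data()));
  DCHECK_EQ(decompressed_len, input_len);
}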
-
-std::string FormatStatValue(Type::type parquet_type, ::arrow::util::string_view val) {
- std::stringstream result;
-
- const char* bytes = val.data();
- switch (parquet_type) {
- case Type::BOOLEAN:
- result << reinterpret_cast<const bool*>(bytes)[0];
- break;
- case Type::INT32:
- result << reinterpret_cast<const int32_t*>(bytes)[0];
- break;
- case Type::INT64:
- result << reinterpret_cast<const int64_t*>(bytes)[0];
- break;
- case Type::DOUBLE:
- result << reinterpret_cast<const double*>(bytes)[0];
- break;
- case Type::FLOAT:
- result << reinterpret_cast<const float*>(bytes)[0];
- break;
- case Type::INT96: {
- auto const i32_val = reinterpret_cast<const int32_t*>(bytes);
- result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2];
- break;
- }
- case Type::BYTE_ARRAY: {
- return std::string(val);
- }
- case Type::FIXED_LEN_BYTE_ARRAY: {
- return std::string(val);
- }
- case Type::UNDEFINED:
- default:
- break;
- }
- return result.str();
-}
-
-std::string EncodingToString(Encoding::type t) {
- switch (t) {
- case Encoding::PLAIN:
- return "PLAIN";
- case Encoding::PLAIN_DICTIONARY:
- return "PLAIN_DICTIONARY";
- case Encoding::RLE:
- return "RLE";
- case Encoding::BIT_PACKED:
- return "BIT_PACKED";
- case Encoding::DELTA_BINARY_PACKED:
- return "DELTA_BINARY_PACKED";
- case Encoding::DELTA_LENGTH_BYTE_ARRAY:
- return "DELTA_LENGTH_BYTE_ARRAY";
- case Encoding::DELTA_BYTE_ARRAY:
- return "DELTA_BYTE_ARRAY";
- case Encoding::RLE_DICTIONARY:
- return "RLE_DICTIONARY";
- case Encoding::BYTE_STREAM_SPLIT:
- return "BYTE_STREAM_SPLIT";
- default:
- return "UNKNOWN";
- }
-}
-
-std::string TypeToString(Type::type t) {
- switch (t) {
- case Type::BOOLEAN:
- return "BOOLEAN";
- case Type::INT32:
- return "INT32";
- case Type::INT64:
- return "INT64";
- case Type::INT96:
- return "INT96";
- case Type::FLOAT:
- return "FLOAT";
- case Type::DOUBLE:
- return "DOUBLE";
- case Type::BYTE_ARRAY:
- return "BYTE_ARRAY";
- case Type::FIXED_LEN_BYTE_ARRAY:
- return "FIXED_LEN_BYTE_ARRAY";
- case Type::UNDEFINED:
- default:
- return "UNKNOWN";
- }
-}
-
-std::string ConvertedTypeToString(ConvertedType::type t) {
- switch (t) {
- case ConvertedType::NONE:
- return "NONE";
- case ConvertedType::UTF8:
- return "UTF8";
- case ConvertedType::MAP:
- return "MAP";
- case ConvertedType::MAP_KEY_VALUE:
- return "MAP_KEY_VALUE";
- case ConvertedType::LIST:
- return "LIST";
- case ConvertedType::ENUM:
- return "ENUM";
- case ConvertedType::DECIMAL:
- return "DECIMAL";
- case ConvertedType::DATE:
- return "DATE";
- case ConvertedType::TIME_MILLIS:
- return "TIME_MILLIS";
- case ConvertedType::TIME_MICROS:
- return "TIME_MICROS";
- case ConvertedType::TIMESTAMP_MILLIS:
- return "TIMESTAMP_MILLIS";
- case ConvertedType::TIMESTAMP_MICROS:
- return "TIMESTAMP_MICROS";
- case ConvertedType::UINT_8:
- return "UINT_8";
- case ConvertedType::UINT_16:
- return "UINT_16";
- case ConvertedType::UINT_32:
- return "UINT_32";
- case ConvertedType::UINT_64:
- return "UINT_64";
- case ConvertedType::INT_8:
- return "INT_8";
- case ConvertedType::INT_16:
- return "INT_16";
- case ConvertedType::INT_32:
- return "INT_32";
- case ConvertedType::INT_64:
- return "INT_64";
- case ConvertedType::JSON:
- return "JSON";
- case ConvertedType::BSON:
- return "BSON";
- case ConvertedType::INTERVAL:
- return "INTERVAL";
- case ConvertedType::UNDEFINED:
- default:
- return "UNKNOWN";
- }
-}
-
-int GetTypeByteSize(Type::type parquet_type) {
- switch (parquet_type) {
- case Type::BOOLEAN:
- return type_traits<BooleanType::type_num>::value_byte_size;
- case Type::INT32:
- return type_traits<Int32Type::type_num>::value_byte_size;
- case Type::INT64:
- return type_traits<Int64Type::type_num>::value_byte_size;
- case Type::INT96:
- return type_traits<Int96Type::type_num>::value_byte_size;
- case Type::DOUBLE:
- return type_traits<DoubleType::type_num>::value_byte_size;
- case Type::FLOAT:
- return type_traits<FloatType::type_num>::value_byte_size;
- case Type::BYTE_ARRAY:
- return type_traits<ByteArrayType::type_num>::value_byte_size;
- case Type::FIXED_LEN_BYTE_ARRAY:
- return type_traits<FLBAType::type_num>::value_byte_size;
- case Type::UNDEFINED:
- default:
- return 0;
- }
- return 0;
-}
-
-// Return the Sort Order of the Parquet Physical Types
-SortOrder::type DefaultSortOrder(Type::type primitive) {
- switch (primitive) {
- case Type::BOOLEAN:
- case Type::INT32:
- case Type::INT64:
- case Type::FLOAT:
- case Type::DOUBLE:
- return SortOrder::SIGNED;
- case Type::BYTE_ARRAY:
- case Type::FIXED_LEN_BYTE_ARRAY:
- return SortOrder::UNSIGNED;
- case Type::INT96:
- case Type::UNDEFINED:
- return SortOrder::UNKNOWN;
- }
- return SortOrder::UNKNOWN;
-}
-
-// Return the SortOrder of the Parquet Types using Logical or Physical Types
-SortOrder::type GetSortOrder(ConvertedType::type converted, Type::type primitive) {
- if (converted == ConvertedType::NONE) return DefaultSortOrder(primitive);
- switch (converted) {
- case ConvertedType::INT_8:
- case ConvertedType::INT_16:
- case ConvertedType::INT_32:
- case ConvertedType::INT_64:
- case ConvertedType::DATE:
- case ConvertedType::TIME_MICROS:
- case ConvertedType::TIME_MILLIS:
- case ConvertedType::TIMESTAMP_MICROS:
- case ConvertedType::TIMESTAMP_MILLIS:
- return SortOrder::SIGNED;
- case ConvertedType::UINT_8:
- case ConvertedType::UINT_16:
- case ConvertedType::UINT_32:
- case ConvertedType::UINT_64:
- case ConvertedType::ENUM:
- case ConvertedType::UTF8:
- case ConvertedType::BSON:
- case ConvertedType::JSON:
- return SortOrder::UNSIGNED;
- case ConvertedType::DECIMAL:
- case ConvertedType::LIST:
- case ConvertedType::MAP:
- case ConvertedType::MAP_KEY_VALUE:
- case ConvertedType::INTERVAL:
- case ConvertedType::NONE: // required instead of default
- case ConvertedType::NA: // required instead of default
- case ConvertedType::UNDEFINED:
- return SortOrder::UNKNOWN;
- }
- return SortOrder::UNKNOWN;
-}
-
-SortOrder::type GetSortOrder(const std::shared_ptr<const LogicalType>& logical_type,
- Type::type primitive) {
- SortOrder::type o = SortOrder::UNKNOWN;
- if (logical_type && logical_type->is_valid()) {
- o = (logical_type->is_none() ? DefaultSortOrder(primitive)
- : logical_type->sort_order());
- }
- return o;
-}
-
-ColumnOrder ColumnOrder::undefined_ = ColumnOrder(ColumnOrder::UNDEFINED);
-ColumnOrder ColumnOrder::type_defined_ = ColumnOrder(ColumnOrder::TYPE_DEFINED_ORDER);
-
-// Static methods for LogicalType class
-
-std::shared_ptr<const LogicalType> LogicalType::FromConvertedType(
- const ConvertedType::type converted_type,
- const schema::DecimalMetadata converted_decimal_metadata) {
- switch (converted_type) {
- case ConvertedType::UTF8:
- return StringLogicalType::Make();
- case ConvertedType::MAP_KEY_VALUE:
- case ConvertedType::MAP:
- return MapLogicalType::Make();
- case ConvertedType::LIST:
- return ListLogicalType::Make();
- case ConvertedType::ENUM:
- return EnumLogicalType::Make();
- case ConvertedType::DECIMAL:
- return DecimalLogicalType::Make(converted_decimal_metadata.precision,
- converted_decimal_metadata.scale);
- case ConvertedType::DATE:
- return DateLogicalType::Make();
- case ConvertedType::TIME_MILLIS:
- return TimeLogicalType::Make(true, LogicalType::TimeUnit::MILLIS);
- case ConvertedType::TIME_MICROS:
- return TimeLogicalType::Make(true, LogicalType::TimeUnit::MICROS);
- case ConvertedType::TIMESTAMP_MILLIS:
- return TimestampLogicalType::Make(true, LogicalType::TimeUnit::MILLIS,
- /*is_from_converted_type=*/true,
- /*force_set_converted_type=*/false);
- case ConvertedType::TIMESTAMP_MICROS:
- return TimestampLogicalType::Make(true, LogicalType::TimeUnit::MICROS,
- /*is_from_converted_type=*/true,
- /*force_set_converted_type=*/false);
- case ConvertedType::INTERVAL:
- return IntervalLogicalType::Make();
- case ConvertedType::INT_8:
- return IntLogicalType::Make(8, true);
- case ConvertedType::INT_16:
- return IntLogicalType::Make(16, true);
- case ConvertedType::INT_32:
- return IntLogicalType::Make(32, true);
- case ConvertedType::INT_64:
- return IntLogicalType::Make(64, true);
- case ConvertedType::UINT_8:
- return IntLogicalType::Make(8, false);
- case ConvertedType::UINT_16:
- return IntLogicalType::Make(16, false);
- case ConvertedType::UINT_32:
- return IntLogicalType::Make(32, false);
- case ConvertedType::UINT_64:
- return IntLogicalType::Make(64, false);
- case ConvertedType::JSON:
- return JSONLogicalType::Make();
- case ConvertedType::BSON:
- return BSONLogicalType::Make();
- case ConvertedType::NA:
- return NullLogicalType::Make();
- case ConvertedType::NONE:
- return NoLogicalType::Make();
- case ConvertedType::UNDEFINED:
- return UndefinedLogicalType::Make();
- }
- return UndefinedLogicalType::Make();
-}
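
Two quick illustrations of this legacy mapping (a sketch; the decimal metadata argument only matters for ConvertedType::DECIMAL):

// Sketch: pre-2.4 converted types reconstructed as logical types.
schema::DecimalMetadata no_decimal{};  // zero-initialized; ignored here
auto ts = LogicalType::FromConvertedType(ConvertedType::TIMESTAMP_MICROS, no_decimal);
// ts->is_timestamp() is true: adjusted to UTC, microsecond unit,
// flagged as originating from a converted type.

schema::DecimalMetadata dec;
dec.isset = true;
dec.precision = 10;
dec.scale = 2;
auto decimal = LogicalType::FromConvertedType(ConvertedType::DECIMAL, dec);
// decimal->ToString() yields "Decimal(precision=10, scale=2)".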
-
-std::shared_ptr<const LogicalType> LogicalType::FromThrift(
- const format::LogicalType& type) {
- if (type.__isset.STRING) {
- return StringLogicalType::Make();
- } else if (type.__isset.MAP) {
- return MapLogicalType::Make();
- } else if (type.__isset.LIST) {
- return ListLogicalType::Make();
- } else if (type.__isset.ENUM) {
- return EnumLogicalType::Make();
- } else if (type.__isset.DECIMAL) {
- return DecimalLogicalType::Make(type.DECIMAL.precision, type.DECIMAL.scale);
- } else if (type.__isset.DATE) {
- return DateLogicalType::Make();
- } else if (type.__isset.TIME) {
- LogicalType::TimeUnit::unit unit;
- if (type.TIME.unit.__isset.MILLIS) {
- unit = LogicalType::TimeUnit::MILLIS;
- } else if (type.TIME.unit.__isset.MICROS) {
- unit = LogicalType::TimeUnit::MICROS;
- } else if (type.TIME.unit.__isset.NANOS) {
- unit = LogicalType::TimeUnit::NANOS;
- } else {
- unit = LogicalType::TimeUnit::UNKNOWN;
- }
- return TimeLogicalType::Make(type.TIME.isAdjustedToUTC, unit);
- } else if (type.__isset.TIMESTAMP) {
- LogicalType::TimeUnit::unit unit;
- if (type.TIMESTAMP.unit.__isset.MILLIS) {
- unit = LogicalType::TimeUnit::MILLIS;
- } else if (type.TIMESTAMP.unit.__isset.MICROS) {
- unit = LogicalType::TimeUnit::MICROS;
- } else if (type.TIMESTAMP.unit.__isset.NANOS) {
- unit = LogicalType::TimeUnit::NANOS;
- } else {
- unit = LogicalType::TimeUnit::UNKNOWN;
- }
- return TimestampLogicalType::Make(type.TIMESTAMP.isAdjustedToUTC, unit);
- // TODO(tpboudreau): activate the commented code after parquet.thrift
- // recognizes IntervalType as a LogicalType
- //} else if (type.__isset.INTERVAL) {
- // return IntervalLogicalType::Make();
- } else if (type.__isset.INTEGER) {
- return IntLogicalType::Make(static_cast<int>(type.INTEGER.bitWidth),
- type.INTEGER.isSigned);
- } else if (type.__isset.UNKNOWN) {
- return NullLogicalType::Make();
- } else if (type.__isset.JSON) {
- return JSONLogicalType::Make();
- } else if (type.__isset.BSON) {
- return BSONLogicalType::Make();
- } else if (type.__isset.UUID) {
- return UUIDLogicalType::Make();
- } else {
- throw ParquetException("Metadata contains Thrift LogicalType that is not recognized");
- }
-}
-
-std::shared_ptr<const LogicalType> LogicalType::String() {
- return StringLogicalType::Make();
-}
-
-std::shared_ptr<const LogicalType> LogicalType::Map() { return MapLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::List() { return ListLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::Enum() { return EnumLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::Decimal(int32_t precision,
- int32_t scale) {
- return DecimalLogicalType::Make(precision, scale);
-}
-
-std::shared_ptr<const LogicalType> LogicalType::Date() { return DateLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::Time(
- bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit) {
- DCHECK(time_unit != LogicalType::TimeUnit::UNKNOWN);
- return TimeLogicalType::Make(is_adjusted_to_utc, time_unit);
-}
-
-std::shared_ptr<const LogicalType> LogicalType::Timestamp(
- bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
- bool is_from_converted_type, bool force_set_converted_type) {
- DCHECK(time_unit != LogicalType::TimeUnit::UNKNOWN);
- return TimestampLogicalType::Make(is_adjusted_to_utc, time_unit, is_from_converted_type,
- force_set_converted_type);
-}
-
-std::shared_ptr<const LogicalType> LogicalType::Interval() {
- return IntervalLogicalType::Make();
-}
-
-std::shared_ptr<const LogicalType> LogicalType::Int(int bit_width, bool is_signed) {
- DCHECK(bit_width == 64 || bit_width == 32 || bit_width == 16 || bit_width == 8);
- return IntLogicalType::Make(bit_width, is_signed);
-}
-
-std::shared_ptr<const LogicalType> LogicalType::Null() { return NullLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::JSON() { return JSONLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::BSON() { return BSONLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::UUID() { return UUIDLogicalType::Make(); }
-
-std::shared_ptr<const LogicalType> LogicalType::None() { return NoLogicalType::Make(); }
-
-/*
- * The logical type implementation classes are built in four layers: (1) the base
- * layer, which establishes the interface and provides generally reusable implementations
- * for the ToJSON() and Equals() methods; (2) an intermediate derived layer for the
- * "compatibility" methods, which provides implementations for is_compatible() and
- * ToConvertedType(); (3) another intermediate layer for the "applicability" methods
- * that provides several implementations for the is_applicable() method; and (4) the
- * final derived classes, one for each logical type, which supply implementations
- * for those methods that remain virtual (usually just ToString() and ToThrift()) or
- * otherwise need to be overridden.
- */
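
A brief sketch of this layered interface in action, using the Date type defined later in this file (DCHECK/DCHECK_EQ are the Arrow logging macros already included here):

// Sketch: each layer of LogicalType::Impl surfaces through LogicalType.
auto date = LogicalType::Date();
// Applicability layer: DATE annotates only INT32 storage.
DCHECK(date->is_applicable(parquet::Type::INT32));
DCHECK(!date->is_applicable(parquet::Type::INT64));
// Compatibility layer: it converts to ConvertedType::DATE.
schema::DecimalMetadata decimal_metadata;
DCHECK_EQ(date->ToConvertedType(&decimal_metadata), ConvertedType::DATE);
// Base layer: generic renderings, e.g. ToJSON() gives {"Type": "Date"}.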
-
-// LogicalTypeImpl base class
-
-class LogicalType::Impl {
- public:
- virtual bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const = 0;
-
- virtual bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata = {
- false, -1, -1}) const = 0;
-
- virtual ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const = 0;
-
- virtual std::string ToString() const = 0;
-
- virtual bool is_serialized() const {
- return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNDEFINED);
- }
-
- virtual std::string ToJSON() const {
- std::stringstream json;
- json << R"({"Type": ")" << ToString() << R"("})";
- return json.str();
- }
-
- virtual format::LogicalType ToThrift() const {
- // logical types inheriting this method should never be serialized
- std::stringstream ss;
- ss << "Logical type " << ToString() << " should not be serialized";
- throw ParquetException(ss.str());
- }
-
- virtual bool Equals(const LogicalType& other) const { return other.type() == type_; }
-
- LogicalType::Type::type type() const { return type_; }
-
- SortOrder::type sort_order() const { return order_; }
-
- Impl(const Impl&) = delete;
- Impl& operator=(const Impl&) = delete;
- virtual ~Impl() noexcept {}
-
- class Compatible;
- class SimpleCompatible;
- class Incompatible;
-
- class Applicable;
- class SimpleApplicable;
- class TypeLengthApplicable;
- class UniversalApplicable;
- class Inapplicable;
-
- class String;
- class Map;
- class List;
- class Enum;
- class Decimal;
- class Date;
- class Time;
- class Timestamp;
- class Interval;
- class Int;
- class Null;
- class JSON;
- class BSON;
- class UUID;
- class No;
- class Undefined;
-
- protected:
- Impl(LogicalType::Type::type t, SortOrder::type o) : type_(t), order_(o) {}
- Impl() = default;
-
- private:
- LogicalType::Type::type type_ = LogicalType::Type::UNDEFINED;
- SortOrder::type order_ = SortOrder::UNKNOWN;
-};
-
-// Special methods for public LogicalType class
-
-LogicalType::LogicalType() = default;
-LogicalType::~LogicalType() noexcept = default;
-
-// Delegating methods for public LogicalType class
-
-bool LogicalType::is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length) const {
- return impl_->is_applicable(primitive_type, primitive_length);
-}
-
-bool LogicalType::is_compatible(
- ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const {
- return impl_->is_compatible(converted_type, converted_decimal_metadata);
-}
-
-ConvertedType::type LogicalType::ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const {
- return impl_->ToConvertedType(out_decimal_metadata);
-}
-
-std::string LogicalType::ToString() const { return impl_->ToString(); }
-
-std::string LogicalType::ToJSON() const { return impl_->ToJSON(); }
-
-format::LogicalType LogicalType::ToThrift() const { return impl_->ToThrift(); }
-
-bool LogicalType::Equals(const LogicalType& other) const { return impl_->Equals(other); }
-
-LogicalType::Type::type LogicalType::type() const { return impl_->type(); }
-
-SortOrder::type LogicalType::sort_order() const { return impl_->sort_order(); }
-
-// Type checks for public LogicalType class
-
-bool LogicalType::is_string() const { return impl_->type() == LogicalType::Type::STRING; }
-bool LogicalType::is_map() const { return impl_->type() == LogicalType::Type::MAP; }
-bool LogicalType::is_list() const { return impl_->type() == LogicalType::Type::LIST; }
-bool LogicalType::is_enum() const { return impl_->type() == LogicalType::Type::ENUM; }
-bool LogicalType::is_decimal() const {
- return impl_->type() == LogicalType::Type::DECIMAL;
-}
-bool LogicalType::is_date() const { return impl_->type() == LogicalType::Type::DATE; }
-bool LogicalType::is_time() const { return impl_->type() == LogicalType::Type::TIME; }
-bool LogicalType::is_timestamp() const {
- return impl_->type() == LogicalType::Type::TIMESTAMP;
-}
-bool LogicalType::is_interval() const {
- return impl_->type() == LogicalType::Type::INTERVAL;
-}
-bool LogicalType::is_int() const { return impl_->type() == LogicalType::Type::INT; }
-bool LogicalType::is_null() const { return impl_->type() == LogicalType::Type::NIL; }
-bool LogicalType::is_JSON() const { return impl_->type() == LogicalType::Type::JSON; }
-bool LogicalType::is_BSON() const { return impl_->type() == LogicalType::Type::BSON; }
-bool LogicalType::is_UUID() const { return impl_->type() == LogicalType::Type::UUID; }
-bool LogicalType::is_none() const { return impl_->type() == LogicalType::Type::NONE; }
-bool LogicalType::is_valid() const {
- return impl_->type() != LogicalType::Type::UNDEFINED;
-}
-bool LogicalType::is_invalid() const { return !is_valid(); }
-bool LogicalType::is_nested() const {
- return (impl_->type() == LogicalType::Type::LIST) ||
- (impl_->type() == LogicalType::Type::MAP);
-}
-bool LogicalType::is_nonnested() const { return !is_nested(); }
-bool LogicalType::is_serialized() const { return impl_->is_serialized(); }
-
-// LogicalTypeImpl intermediate "compatibility" classes
-
-class LogicalType::Impl::Compatible : public virtual LogicalType::Impl {
- protected:
- Compatible() = default;
-};
-
-#define set_decimal_metadata(m___, i___, p___, s___) \
- { \
- if (m___) { \
- (m___)->isset = (i___); \
- (m___)->scale = (s___); \
- (m___)->precision = (p___); \
- } \
- }
-
-#define reset_decimal_metadata(m___) \
- { set_decimal_metadata(m___, false, -1, -1); }
-
-// For logical types that always translate to the same converted type
-class LogicalType::Impl::SimpleCompatible : public virtual LogicalType::Impl::Compatible {
- public:
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override {
- return (converted_type == converted_type_) && !converted_decimal_metadata.isset;
- }
-
- ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const override {
- reset_decimal_metadata(out_decimal_metadata);
- return converted_type_;
- }
-
- protected:
- explicit SimpleCompatible(ConvertedType::type c) : converted_type_(c) {}
-
- private:
- ConvertedType::type converted_type_ = ConvertedType::NA;
-};
-
-// For logical types that have no corresponding converted type
-class LogicalType::Impl::Incompatible : public virtual LogicalType::Impl {
- public:
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override {
- return (converted_type == ConvertedType::NONE ||
- converted_type == ConvertedType::NA) &&
- !converted_decimal_metadata.isset;
- }
-
- ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const override {
- reset_decimal_metadata(out_decimal_metadata);
- return ConvertedType::NONE;
- }
-
- protected:
- Incompatible() = default;
-};
-
-// LogicalTypeImpl intermediate "applicability" classes
-
-class LogicalType::Impl::Applicable : public virtual LogicalType::Impl {
- protected:
- Applicable() = default;
-};
-
-// For logical types that can apply only to a single
-// physical type
-class LogicalType::Impl::SimpleApplicable : public virtual LogicalType::Impl::Applicable {
- public:
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override {
- return primitive_type == type_;
- }
-
- protected:
- explicit SimpleApplicable(parquet::Type::type t) : type_(t) {}
-
- private:
- parquet::Type::type type_;
-};
-
-// For logical types that can apply only to a particular
-// physical type and physical length combination
-class LogicalType::Impl::TypeLengthApplicable
- : public virtual LogicalType::Impl::Applicable {
- public:
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override {
- return primitive_type == type_ && primitive_length == length_;
- }
-
- protected:
- TypeLengthApplicable(parquet::Type::type t, int32_t l) : type_(t), length_(l) {}
-
- private:
- parquet::Type::type type_;
- int32_t length_;
-};
-
-// For logical types that can apply to any physical type
-class LogicalType::Impl::UniversalApplicable
- : public virtual LogicalType::Impl::Applicable {
- public:
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override {
- return true;
- }
-
- protected:
- UniversalApplicable() = default;
-};
-
-// For logical types that can never apply to any primitive
-// physical type
-class LogicalType::Impl::Inapplicable : public virtual LogicalType::Impl {
- public:
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override {
- return false;
- }
-
- protected:
- Inapplicable() = default;
-};
-
-// LogicalType implementation final classes
-
-#define OVERRIDE_TOSTRING(n___) \
- std::string ToString() const override { return #n___; }
-
-#define OVERRIDE_TOTHRIFT(t___, s___) \
- format::LogicalType ToThrift() const override { \
- format::LogicalType type; \
- format::t___ subtype; \
- type.__set_##s___(subtype); \
- return type; \
- }
-
-class LogicalType::Impl::String final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::SimpleApplicable {
- public:
- friend class StringLogicalType;
-
- OVERRIDE_TOSTRING(String)
- OVERRIDE_TOTHRIFT(StringType, STRING)
-
- private:
- String()
- : LogicalType::Impl(LogicalType::Type::STRING, SortOrder::UNSIGNED),
- LogicalType::Impl::SimpleCompatible(ConvertedType::UTF8),
- LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
-};
-
-// Each public logical type class's Make() creation method instantiates a corresponding
-// LogicalType::Impl::* object and installs that implementation in the logical type
-// it returns.
-
-#define GENERATE_MAKE(a___) \
- std::shared_ptr<const LogicalType> a___##LogicalType::Make() { \
- auto* logical_type = new a___##LogicalType(); \
- logical_type->impl_.reset(new LogicalType::Impl::a___()); \
- return std::shared_ptr<const LogicalType>(logical_type); \
- }
-
-GENERATE_MAKE(String)
-
-class LogicalType::Impl::Map final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::Inapplicable {
- public:
- friend class MapLogicalType;
-
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override {
- return (converted_type == ConvertedType::MAP ||
- converted_type == ConvertedType::MAP_KEY_VALUE) &&
- !converted_decimal_metadata.isset;
- }
-
- OVERRIDE_TOSTRING(Map)
- OVERRIDE_TOTHRIFT(MapType, MAP)
-
- private:
- Map()
- : LogicalType::Impl(LogicalType::Type::MAP, SortOrder::UNKNOWN),
- LogicalType::Impl::SimpleCompatible(ConvertedType::MAP) {}
-};
-
-GENERATE_MAKE(Map)
-
-class LogicalType::Impl::List final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::Inapplicable {
- public:
- friend class ListLogicalType;
-
- OVERRIDE_TOSTRING(List)
- OVERRIDE_TOTHRIFT(ListType, LIST)
-
- private:
- List()
- : LogicalType::Impl(LogicalType::Type::LIST, SortOrder::UNKNOWN),
- LogicalType::Impl::SimpleCompatible(ConvertedType::LIST) {}
-};
-
-GENERATE_MAKE(List)
-
-class LogicalType::Impl::Enum final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::SimpleApplicable {
- public:
- friend class EnumLogicalType;
-
- OVERRIDE_TOSTRING(Enum)
- OVERRIDE_TOTHRIFT(EnumType, ENUM)
-
- private:
- Enum()
- : LogicalType::Impl(LogicalType::Type::ENUM, SortOrder::UNSIGNED),
- LogicalType::Impl::SimpleCompatible(ConvertedType::ENUM),
- LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
-};
-
-GENERATE_MAKE(Enum)
-
-// The parameterized logical types (currently Decimal, Time, Timestamp, and Int)
-// generally can't reuse the simple method implementations available in the base and
-// intermediate classes and must (re)implement them all
-
-class LogicalType::Impl::Decimal final : public LogicalType::Impl::Compatible,
- public LogicalType::Impl::Applicable {
- public:
- friend class DecimalLogicalType;
-
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override;
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override;
- ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const override;
- std::string ToString() const override;
- std::string ToJSON() const override;
- format::LogicalType ToThrift() const override;
- bool Equals(const LogicalType& other) const override;
-
- int32_t precision() const { return precision_; }
- int32_t scale() const { return scale_; }
-
- private:
- Decimal(int32_t p, int32_t s)
- : LogicalType::Impl(LogicalType::Type::DECIMAL, SortOrder::SIGNED),
- precision_(p),
- scale_(s) {}
- int32_t precision_ = -1;
- int32_t scale_ = -1;
-};
-
-bool LogicalType::Impl::Decimal::is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length) const {
- bool ok = false;
- switch (primitive_type) {
- case parquet::Type::INT32: {
- ok = (1 <= precision_) && (precision_ <= 9);
- } break;
- case parquet::Type::INT64: {
- ok = (1 <= precision_) && (precision_ <= 18);
- if (precision_ < 10) {
- // FIXME(tpb): warn that INT32 could be used
- }
- } break;
- case parquet::Type::FIXED_LEN_BYTE_ARRAY: {
- ok = precision_ <= static_cast<int32_t>(std::floor(
- std::log10(std::pow(2.0, (8.0 * primitive_length) - 1.0))));
- } break;
- case parquet::Type::BYTE_ARRAY: {
- ok = true;
- } break;
- default: {
- } break;
- }
- return ok;
-}
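
A quick check of the FIXED_LEN_BYTE_ARRAY bound above: an N-byte two's-complement value carries 8N - 1 magnitude bits, so the largest representable decimal precision is floor(log10(2^(8N-1))). For N = 4 this gives floor(31 * log10 2) = 9 and for N = 8 it gives floor(63 * log10 2) = 18, matching the INT32 and INT64 branches; N = 16 yields 38, the familiar ceiling for 16-byte decimals.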
-
-bool LogicalType::Impl::Decimal::is_compatible(
- ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const {
- return converted_type == ConvertedType::DECIMAL &&
- (converted_decimal_metadata.isset &&
- converted_decimal_metadata.scale == scale_ &&
- converted_decimal_metadata.precision == precision_);
-}
-
-ConvertedType::type LogicalType::Impl::Decimal::ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const {
- set_decimal_metadata(out_decimal_metadata, true, precision_, scale_);
- return ConvertedType::DECIMAL;
-}
-
-std::string LogicalType::Impl::Decimal::ToString() const {
- std::stringstream type;
- type << "Decimal(precision=" << precision_ << ", scale=" << scale_ << ")";
- return type.str();
-}
-
-std::string LogicalType::Impl::Decimal::ToJSON() const {
- std::stringstream json;
- json << R"({"Type": "Decimal", "precision": )" << precision_ << R"(, "scale": )"
- << scale_ << "}";
- return json.str();
-}
-
-format::LogicalType LogicalType::Impl::Decimal::ToThrift() const {
- format::LogicalType type;
- format::DecimalType decimal_type;
- decimal_type.__set_precision(precision_);
- decimal_type.__set_scale(scale_);
- type.__set_DECIMAL(decimal_type);
- return type;
-}
-
-bool LogicalType::Impl::Decimal::Equals(const LogicalType& other) const {
- bool eq = false;
- if (other.is_decimal()) {
- const auto& other_decimal = checked_cast<const DecimalLogicalType&>(other);
- eq = (precision_ == other_decimal.precision() && scale_ == other_decimal.scale());
- }
- return eq;
-}
-
-std::shared_ptr<const LogicalType> DecimalLogicalType::Make(int32_t precision,
- int32_t scale) {
- if (precision < 1) {
- throw ParquetException(
- "Precision must be greater than or equal to 1 for Decimal logical type");
- }
- if (scale < 0 || scale > precision) {
- throw ParquetException(
- "Scale must be a non-negative integer that does not exceed precision for "
- "Decimal logical type");
- }
- auto* logical_type = new DecimalLogicalType();
- logical_type->impl_.reset(new LogicalType::Impl::Decimal(precision, scale));
- return std::shared_ptr<const LogicalType>(logical_type);
-}
-
-int32_t DecimalLogicalType::precision() const {
- return (dynamic_cast<const LogicalType::Impl::Decimal&>(*impl_)).precision();
-}
-
-int32_t DecimalLogicalType::scale() const {
- return (dynamic_cast<const LogicalType::Impl::Decimal&>(*impl_)).scale();
-}
-
-class LogicalType::Impl::Date final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::SimpleApplicable {
- public:
- friend class DateLogicalType;
-
- OVERRIDE_TOSTRING(Date)
- OVERRIDE_TOTHRIFT(DateType, DATE)
-
- private:
- Date()
- : LogicalType::Impl(LogicalType::Type::DATE, SortOrder::SIGNED),
- LogicalType::Impl::SimpleCompatible(ConvertedType::DATE),
- LogicalType::Impl::SimpleApplicable(parquet::Type::INT32) {}
-};
-
-GENERATE_MAKE(Date)
-
-#define time_unit_string(u___) \
- ((u___) == LogicalType::TimeUnit::MILLIS \
- ? "milliseconds" \
- : ((u___) == LogicalType::TimeUnit::MICROS \
- ? "microseconds" \
- : ((u___) == LogicalType::TimeUnit::NANOS ? "nanoseconds" : "unknown")))
-
-class LogicalType::Impl::Time final : public LogicalType::Impl::Compatible,
- public LogicalType::Impl::Applicable {
- public:
- friend class TimeLogicalType;
-
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override;
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override;
- ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const override;
- std::string ToString() const override;
- std::string ToJSON() const override;
- format::LogicalType ToThrift() const override;
- bool Equals(const LogicalType& other) const override;
-
- bool is_adjusted_to_utc() const { return adjusted_; }
- LogicalType::TimeUnit::unit time_unit() const { return unit_; }
-
- private:
- Time(bool a, LogicalType::TimeUnit::unit u)
- : LogicalType::Impl(LogicalType::Type::TIME, SortOrder::SIGNED),
- adjusted_(a),
- unit_(u) {}
- bool adjusted_ = false;
- LogicalType::TimeUnit::unit unit_;
-};
-
-bool LogicalType::Impl::Time::is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length) const {
- return (primitive_type == parquet::Type::INT32 &&
- unit_ == LogicalType::TimeUnit::MILLIS) ||
- (primitive_type == parquet::Type::INT64 &&
- (unit_ == LogicalType::TimeUnit::MICROS ||
- unit_ == LogicalType::TimeUnit::NANOS));
-}
-
-bool LogicalType::Impl::Time::is_compatible(
- ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const {
- if (converted_decimal_metadata.isset) {
- return false;
- } else if (adjusted_ && unit_ == LogicalType::TimeUnit::MILLIS) {
- return converted_type == ConvertedType::TIME_MILLIS;
- } else if (adjusted_ && unit_ == LogicalType::TimeUnit::MICROS) {
- return converted_type == ConvertedType::TIME_MICROS;
- } else {
- return (converted_type == ConvertedType::NONE) ||
- (converted_type == ConvertedType::NA);
- }
-}
-
-ConvertedType::type LogicalType::Impl::Time::ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const {
- reset_decimal_metadata(out_decimal_metadata);
- if (adjusted_) {
- if (unit_ == LogicalType::TimeUnit::MILLIS) {
- return ConvertedType::TIME_MILLIS;
- } else if (unit_ == LogicalType::TimeUnit::MICROS) {
- return ConvertedType::TIME_MICROS;
- }
- }
- return ConvertedType::NONE;
-}
-
-std::string LogicalType::Impl::Time::ToString() const {
- std::stringstream type;
- type << "Time(isAdjustedToUTC=" << std::boolalpha << adjusted_
- << ", timeUnit=" << time_unit_string(unit_) << ")";
- return type.str();
-}
-
-std::string LogicalType::Impl::Time::ToJSON() const {
- std::stringstream json;
- json << R"({"Type": "Time", "isAdjustedToUTC": )" << std::boolalpha << adjusted_
- << R"(, "timeUnit": ")" << time_unit_string(unit_) << R"("})";
- return json.str();
-}
-
-format::LogicalType LogicalType::Impl::Time::ToThrift() const {
- format::LogicalType type;
- format::TimeType time_type;
- format::TimeUnit time_unit;
- DCHECK(unit_ != LogicalType::TimeUnit::UNKNOWN);
- if (unit_ == LogicalType::TimeUnit::MILLIS) {
- format::MilliSeconds millis;
- time_unit.__set_MILLIS(millis);
- } else if (unit_ == LogicalType::TimeUnit::MICROS) {
- format::MicroSeconds micros;
- time_unit.__set_MICROS(micros);
- } else if (unit_ == LogicalType::TimeUnit::NANOS) {
- format::NanoSeconds nanos;
- time_unit.__set_NANOS(nanos);
- }
- time_type.__set_isAdjustedToUTC(adjusted_);
- time_type.__set_unit(time_unit);
- type.__set_TIME(time_type);
- return type;
-}
-
-bool LogicalType::Impl::Time::Equals(const LogicalType& other) const {
- bool eq = false;
- if (other.is_time()) {
- const auto& other_time = checked_cast<const TimeLogicalType&>(other);
- eq =
- (adjusted_ == other_time.is_adjusted_to_utc() && unit_ == other_time.time_unit());
- }
- return eq;
-}
-
-std::shared_ptr<const LogicalType> TimeLogicalType::Make(
- bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit) {
- if (time_unit == LogicalType::TimeUnit::MILLIS ||
- time_unit == LogicalType::TimeUnit::MICROS ||
- time_unit == LogicalType::TimeUnit::NANOS) {
- auto* logical_type = new TimeLogicalType();
- logical_type->impl_.reset(new LogicalType::Impl::Time(is_adjusted_to_utc, time_unit));
- return std::shared_ptr<const LogicalType>(logical_type);
- } else {
- throw ParquetException(
- "TimeUnit must be one of MILLIS, MICROS, or NANOS for Time logical type");
- }
-}
-
-bool TimeLogicalType::is_adjusted_to_utc() const {
- return (dynamic_cast<const LogicalType::Impl::Time&>(*impl_)).is_adjusted_to_utc();
-}
-
-LogicalType::TimeUnit::unit TimeLogicalType::time_unit() const {
- return (dynamic_cast<const LogicalType::Impl::Time&>(*impl_)).time_unit();
-}
-
-class LogicalType::Impl::Timestamp final : public LogicalType::Impl::Compatible,
- public LogicalType::Impl::SimpleApplicable {
- public:
- friend class TimestampLogicalType;
-
- bool is_serialized() const override;
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override;
- ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const override;
- std::string ToString() const override;
- std::string ToJSON() const override;
- format::LogicalType ToThrift() const override;
- bool Equals(const LogicalType& other) const override;
-
- bool is_adjusted_to_utc() const { return adjusted_; }
- LogicalType::TimeUnit::unit time_unit() const { return unit_; }
-
- bool is_from_converted_type() const { return is_from_converted_type_; }
- bool force_set_converted_type() const { return force_set_converted_type_; }
-
- private:
- Timestamp(bool adjusted, LogicalType::TimeUnit::unit unit, bool is_from_converted_type,
- bool force_set_converted_type)
- : LogicalType::Impl(LogicalType::Type::TIMESTAMP, SortOrder::SIGNED),
- LogicalType::Impl::SimpleApplicable(parquet::Type::INT64),
- adjusted_(adjusted),
- unit_(unit),
- is_from_converted_type_(is_from_converted_type),
- force_set_converted_type_(force_set_converted_type) {}
- bool adjusted_ = false;
- LogicalType::TimeUnit::unit unit_;
- bool is_from_converted_type_ = false;
- bool force_set_converted_type_ = false;
-};
-
-bool LogicalType::Impl::Timestamp::is_serialized() const {
- return !is_from_converted_type_;
-}
-
-bool LogicalType::Impl::Timestamp::is_compatible(
- ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const {
- if (converted_decimal_metadata.isset) {
- return false;
- } else if (unit_ == LogicalType::TimeUnit::MILLIS) {
- if (adjusted_ || force_set_converted_type_) {
- return converted_type == ConvertedType::TIMESTAMP_MILLIS;
- } else {
- return (converted_type == ConvertedType::NONE) ||
- (converted_type == ConvertedType::NA);
- }
- } else if (unit_ == LogicalType::TimeUnit::MICROS) {
- if (adjusted_ || force_set_converted_type_) {
- return converted_type == ConvertedType::TIMESTAMP_MICROS;
- } else {
- return (converted_type == ConvertedType::NONE) ||
- (converted_type == ConvertedType::NA);
- }
- } else {
- return (converted_type == ConvertedType::NONE) ||
- (converted_type == ConvertedType::NA);
- }
-}
-
-ConvertedType::type LogicalType::Impl::Timestamp::ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const {
- reset_decimal_metadata(out_decimal_metadata);
- if (adjusted_ || force_set_converted_type_) {
- if (unit_ == LogicalType::TimeUnit::MILLIS) {
- return ConvertedType::TIMESTAMP_MILLIS;
- } else if (unit_ == LogicalType::TimeUnit::MICROS) {
- return ConvertedType::TIMESTAMP_MICROS;
- }
- }
- return ConvertedType::NONE;
-}
-
-std::string LogicalType::Impl::Timestamp::ToString() const {
- std::stringstream type;
- type << "Timestamp(isAdjustedToUTC=" << std::boolalpha << adjusted_
- << ", timeUnit=" << time_unit_string(unit_)
- << ", is_from_converted_type=" << is_from_converted_type_
- << ", force_set_converted_type=" << force_set_converted_type_ << ")";
- return type.str();
-}
-
-std::string LogicalType::Impl::Timestamp::ToJSON() const {
- std::stringstream json;
- json << R"({"Type": "Timestamp", "isAdjustedToUTC": )" << std::boolalpha << adjusted_
- << R"(, "timeUnit": ")" << time_unit_string(unit_) << R"(")"
- << R"(, "is_from_converted_type": )" << is_from_converted_type_
- << R"(, "force_set_converted_type": )" << force_set_converted_type_ << R"(})";
- return json.str();
-}
-
-format::LogicalType LogicalType::Impl::Timestamp::ToThrift() const {
- format::LogicalType type;
- format::TimestampType timestamp_type;
- format::TimeUnit time_unit;
- DCHECK(unit_ != LogicalType::TimeUnit::UNKNOWN);
- if (unit_ == LogicalType::TimeUnit::MILLIS) {
- format::MilliSeconds millis;
- time_unit.__set_MILLIS(millis);
- } else if (unit_ == LogicalType::TimeUnit::MICROS) {
- format::MicroSeconds micros;
- time_unit.__set_MICROS(micros);
- } else if (unit_ == LogicalType::TimeUnit::NANOS) {
- format::NanoSeconds nanos;
- time_unit.__set_NANOS(nanos);
- }
- timestamp_type.__set_isAdjustedToUTC(adjusted_);
- timestamp_type.__set_unit(time_unit);
- type.__set_TIMESTAMP(timestamp_type);
- return type;
-}
-
-bool LogicalType::Impl::Timestamp::Equals(const LogicalType& other) const {
- bool eq = false;
- if (other.is_timestamp()) {
- const auto& other_timestamp = checked_cast<const TimestampLogicalType&>(other);
- eq = (adjusted_ == other_timestamp.is_adjusted_to_utc() &&
- unit_ == other_timestamp.time_unit());
- }
- return eq;
-}
-
-std::shared_ptr<const LogicalType> TimestampLogicalType::Make(
- bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
- bool is_from_converted_type, bool force_set_converted_type) {
- if (time_unit == LogicalType::TimeUnit::MILLIS ||
- time_unit == LogicalType::TimeUnit::MICROS ||
- time_unit == LogicalType::TimeUnit::NANOS) {
- auto* logical_type = new TimestampLogicalType();
- logical_type->impl_.reset(new LogicalType::Impl::Timestamp(
- is_adjusted_to_utc, time_unit, is_from_converted_type, force_set_converted_type));
- return std::shared_ptr<const LogicalType>(logical_type);
- } else {
- throw ParquetException(
- "TimeUnit must be one of MILLIS, MICROS, or NANOS for Timestamp logical type");
- }
-}
-
-bool TimestampLogicalType::is_adjusted_to_utc() const {
- return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_)).is_adjusted_to_utc();
-}
-
-LogicalType::TimeUnit::unit TimestampLogicalType::time_unit() const {
- return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_)).time_unit();
-}
-
-bool TimestampLogicalType::is_from_converted_type() const {
- return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_))
- .is_from_converted_type();
-}
-
-bool TimestampLogicalType::force_set_converted_type() const {
- return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_))
- .force_set_converted_type();
-}
-
-class LogicalType::Impl::Interval final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::TypeLengthApplicable {
- public:
- friend class IntervalLogicalType;
-
- OVERRIDE_TOSTRING(Interval)
- // TODO(tpboudreau): uncomment the following line to enable serialization after
- // parquet.thrift recognizes IntervalType as a LogicalType
- // OVERRIDE_TOTHRIFT(IntervalType, INTERVAL)
-
- private:
- Interval()
- : LogicalType::Impl(LogicalType::Type::INTERVAL, SortOrder::UNKNOWN),
- LogicalType::Impl::SimpleCompatible(ConvertedType::INTERVAL),
- LogicalType::Impl::TypeLengthApplicable(parquet::Type::FIXED_LEN_BYTE_ARRAY, 12) {
- }
-};
-
-GENERATE_MAKE(Interval)
-
-class LogicalType::Impl::Int final : public LogicalType::Impl::Compatible,
- public LogicalType::Impl::Applicable {
- public:
- friend class IntLogicalType;
-
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const override;
- bool is_compatible(ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const override;
- ConvertedType::type ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const override;
- std::string ToString() const override;
- std::string ToJSON() const override;
- format::LogicalType ToThrift() const override;
- bool Equals(const LogicalType& other) const override;
-
- int bit_width() const { return width_; }
- bool is_signed() const { return signed_; }
-
- private:
- Int(int w, bool s)
- : LogicalType::Impl(LogicalType::Type::INT,
- (s ? SortOrder::SIGNED : SortOrder::UNSIGNED)),
- width_(w),
- signed_(s) {}
- int width_ = 0;
- bool signed_ = false;
-};
-
-bool LogicalType::Impl::Int::is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length) const {
- return (primitive_type == parquet::Type::INT32 && width_ <= 32) ||
- (primitive_type == parquet::Type::INT64 && width_ == 64);
-}
-
-bool LogicalType::Impl::Int::is_compatible(
- ConvertedType::type converted_type,
- schema::DecimalMetadata converted_decimal_metadata) const {
- if (converted_decimal_metadata.isset) {
- return false;
- } else if (signed_ && width_ == 8) {
- return converted_type == ConvertedType::INT_8;
- } else if (signed_ && width_ == 16) {
- return converted_type == ConvertedType::INT_16;
- } else if (signed_ && width_ == 32) {
- return converted_type == ConvertedType::INT_32;
- } else if (signed_ && width_ == 64) {
- return converted_type == ConvertedType::INT_64;
- } else if (!signed_ && width_ == 8) {
- return converted_type == ConvertedType::UINT_8;
- } else if (!signed_ && width_ == 16) {
- return converted_type == ConvertedType::UINT_16;
- } else if (!signed_ && width_ == 32) {
- return converted_type == ConvertedType::UINT_32;
- } else if (!signed_ && width_ == 64) {
- return converted_type == ConvertedType::UINT_64;
- } else {
- return false;
- }
-}
-
-ConvertedType::type LogicalType::Impl::Int::ToConvertedType(
- schema::DecimalMetadata* out_decimal_metadata) const {
- reset_decimal_metadata(out_decimal_metadata);
- if (signed_) {
- switch (width_) {
- case 8:
- return ConvertedType::INT_8;
- case 16:
- return ConvertedType::INT_16;
- case 32:
- return ConvertedType::INT_32;
- case 64:
- return ConvertedType::INT_64;
- }
- } else { // unsigned
- switch (width_) {
- case 8:
- return ConvertedType::UINT_8;
- case 16:
- return ConvertedType::UINT_16;
- case 32:
- return ConvertedType::UINT_32;
- case 64:
- return ConvertedType::UINT_64;
- }
- }
- return ConvertedType::NONE;
-}
-
-std::string LogicalType::Impl::Int::ToString() const {
- std::stringstream type;
- type << "Int(bitWidth=" << width_ << ", isSigned=" << std::boolalpha << signed_ << ")";
- return type.str();
-}
-
-std::string LogicalType::Impl::Int::ToJSON() const {
- std::stringstream json;
- json << R"({"Type": "Int", "bitWidth": )" << width_ << R"(, "isSigned": )"
- << std::boolalpha << signed_ << "}";
- return json.str();
-}
-
-format::LogicalType LogicalType::Impl::Int::ToThrift() const {
- format::LogicalType type;
- format::IntType int_type;
- DCHECK(width_ == 64 || width_ == 32 || width_ == 16 || width_ == 8);
- int_type.__set_bitWidth(static_cast<int8_t>(width_));
- int_type.__set_isSigned(signed_);
- type.__set_INTEGER(int_type);
- return type;
-}
-
-bool LogicalType::Impl::Int::Equals(const LogicalType& other) const {
- bool eq = false;
- if (other.is_int()) {
- const auto& other_int = checked_cast<const IntLogicalType&>(other);
- eq = (width_ == other_int.bit_width() && signed_ == other_int.is_signed());
- }
- return eq;
-}
-
-std::shared_ptr<const LogicalType> IntLogicalType::Make(int bit_width, bool is_signed) {
- if (bit_width == 8 || bit_width == 16 || bit_width == 32 || bit_width == 64) {
- auto* logical_type = new IntLogicalType();
- logical_type->impl_.reset(new LogicalType::Impl::Int(bit_width, is_signed));
- return std::shared_ptr<const LogicalType>(logical_type);
- } else {
- throw ParquetException(
- "Bit width must be exactly 8, 16, 32, or 64 for Int logical type");
- }
-}
-
-int IntLogicalType::bit_width() const {
- return (dynamic_cast<const LogicalType::Impl::Int&>(*impl_)).bit_width();
-}
-
-bool IntLogicalType::is_signed() const {
- return (dynamic_cast<const LogicalType::Impl::Int&>(*impl_)).is_signed();
-}
-
-class LogicalType::Impl::Null final : public LogicalType::Impl::Incompatible,
- public LogicalType::Impl::UniversalApplicable {
- public:
- friend class NullLogicalType;
-
- OVERRIDE_TOSTRING(Null)
- OVERRIDE_TOTHRIFT(NullType, UNKNOWN)
-
- private:
- Null() : LogicalType::Impl(LogicalType::Type::NIL, SortOrder::UNKNOWN) {}
-};
-
-GENERATE_MAKE(Null)
-
-class LogicalType::Impl::JSON final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::SimpleApplicable {
- public:
- friend class JSONLogicalType;
-
- OVERRIDE_TOSTRING(JSON)
- OVERRIDE_TOTHRIFT(JsonType, JSON)
-
- private:
- JSON()
- : LogicalType::Impl(LogicalType::Type::JSON, SortOrder::UNSIGNED),
- LogicalType::Impl::SimpleCompatible(ConvertedType::JSON),
- LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
-};
-
-GENERATE_MAKE(JSON)
-
-class LogicalType::Impl::BSON final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::SimpleApplicable {
- public:
- friend class BSONLogicalType;
-
- OVERRIDE_TOSTRING(BSON)
- OVERRIDE_TOTHRIFT(BsonType, BSON)
-
- private:
- BSON()
- : LogicalType::Impl(LogicalType::Type::BSON, SortOrder::UNSIGNED),
- LogicalType::Impl::SimpleCompatible(ConvertedType::BSON),
- LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
-};
-
-GENERATE_MAKE(BSON)
-
-class LogicalType::Impl::UUID final : public LogicalType::Impl::Incompatible,
- public LogicalType::Impl::TypeLengthApplicable {
- public:
- friend class UUIDLogicalType;
-
- OVERRIDE_TOSTRING(UUID)
- OVERRIDE_TOTHRIFT(UUIDType, UUID)
-
- private:
- UUID()
- : LogicalType::Impl(LogicalType::Type::UUID, SortOrder::UNSIGNED),
- LogicalType::Impl::TypeLengthApplicable(parquet::Type::FIXED_LEN_BYTE_ARRAY, 16) {
- }
-};
-
-GENERATE_MAKE(UUID)
-
-class LogicalType::Impl::No final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::UniversalApplicable {
- public:
- friend class NoLogicalType;
-
- OVERRIDE_TOSTRING(None)
-
- private:
- No()
- : LogicalType::Impl(LogicalType::Type::NONE, SortOrder::UNKNOWN),
- LogicalType::Impl::SimpleCompatible(ConvertedType::NONE) {}
-};
-
-GENERATE_MAKE(No)
-
-class LogicalType::Impl::Undefined final : public LogicalType::Impl::SimpleCompatible,
- public LogicalType::Impl::UniversalApplicable {
- public:
- friend class UndefinedLogicalType;
-
- OVERRIDE_TOSTRING(Undefined)
-
- private:
- Undefined()
- : LogicalType::Impl(LogicalType::Type::UNDEFINED, SortOrder::UNKNOWN),
- LogicalType::Impl::SimpleCompatible(ConvertedType::UNDEFINED) {}
-};
-
-GENERATE_MAKE(Undefined)
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cmath>
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/logging.h"
+
+#include "parquet/exception.h"
+#include "parquet/types.h"
+
+#include "generated/parquet_types.h"
+
+using arrow::internal::checked_cast;
+using arrow::util::Codec;
+
+namespace parquet {
+
+bool IsCodecSupported(Compression::type codec) {
+ switch (codec) {
+ case Compression::UNCOMPRESSED:
+ case Compression::SNAPPY:
+ case Compression::GZIP:
+ case Compression::BROTLI:
+ case Compression::ZSTD:
+ case Compression::LZ4:
+ case Compression::LZ4_HADOOP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+std::unique_ptr<Codec> GetCodec(Compression::type codec) {
+ return GetCodec(codec, Codec::UseDefaultCompressionLevel());
+}
+
+std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level) {
+ std::unique_ptr<Codec> result;
+ if (codec == Compression::LZO) {
+ throw ParquetException(
+ "While LZO compression is supported by the Parquet format in "
+ "general, it is currently not supported by the C++ implementation.");
+ }
+
+ if (!IsCodecSupported(codec)) {
+ std::stringstream ss;
+ ss << "Codec type " << Codec::GetCodecAsString(codec)
+ << " not supported in Parquet format";
+ throw ParquetException(ss.str());
+ }
+
+ PARQUET_ASSIGN_OR_THROW(result, Codec::Create(codec, compression_level));
+ return result;
+}
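+
+// Example (illustrative sketch, not part of the library source): a typical call
+// site. Unsupported codecs (including LZO) make GetCodec() throw
+// ParquetException, so callers handling arbitrary codec values may want to
+// check IsCodecSupported() first:
+//
+//   if (parquet::IsCodecSupported(Compression::ZSTD)) {
+//     std::unique_ptr<Codec> codec =
+//         parquet::GetCodec(Compression::ZSTD, /*compression_level=*/3);
+//   }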
+
+std::string FormatStatValue(Type::type parquet_type, ::arrow::util::string_view val) {
+ std::stringstream result;
+
+ const char* bytes = val.data();
+ switch (parquet_type) {
+ case Type::BOOLEAN:
+ result << reinterpret_cast<const bool*>(bytes)[0];
+ break;
+ case Type::INT32:
+ result << reinterpret_cast<const int32_t*>(bytes)[0];
+ break;
+ case Type::INT64:
+ result << reinterpret_cast<const int64_t*>(bytes)[0];
+ break;
+ case Type::DOUBLE:
+ result << reinterpret_cast<const double*>(bytes)[0];
+ break;
+ case Type::FLOAT:
+ result << reinterpret_cast<const float*>(bytes)[0];
+ break;
+ case Type::INT96: {
+ auto const i32_val = reinterpret_cast<const int32_t*>(bytes);
+ result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2];
+ break;
+ }
+ case Type::BYTE_ARRAY: {
+ return std::string(val);
+ }
+ case Type::FIXED_LEN_BYTE_ARRAY: {
+ return std::string(val);
+ }
+ case Type::UNDEFINED:
+ default:
+ break;
+ }
+ return result.str();
+}
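+
+// Example (illustrative sketch): FormatStatValue() expects the raw bytes of the
+// value viewed as a string_view, not a textual representation:
+//
+//   int32_t v = 42;
+//   std::string s = FormatStatValue(
+//       Type::INT32,
+//       ::arrow::util::string_view(reinterpret_cast<const char*>(&v), sizeof(v)));
+//   // s == "42"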
+
+std::string EncodingToString(Encoding::type t) {
+ switch (t) {
+ case Encoding::PLAIN:
+ return "PLAIN";
+ case Encoding::PLAIN_DICTIONARY:
+ return "PLAIN_DICTIONARY";
+ case Encoding::RLE:
+ return "RLE";
+ case Encoding::BIT_PACKED:
+ return "BIT_PACKED";
+ case Encoding::DELTA_BINARY_PACKED:
+ return "DELTA_BINARY_PACKED";
+ case Encoding::DELTA_LENGTH_BYTE_ARRAY:
+ return "DELTA_LENGTH_BYTE_ARRAY";
+ case Encoding::DELTA_BYTE_ARRAY:
+ return "DELTA_BYTE_ARRAY";
+ case Encoding::RLE_DICTIONARY:
+ return "RLE_DICTIONARY";
+ case Encoding::BYTE_STREAM_SPLIT:
+ return "BYTE_STREAM_SPLIT";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+std::string TypeToString(Type::type t) {
+ switch (t) {
+ case Type::BOOLEAN:
+ return "BOOLEAN";
+ case Type::INT32:
+ return "INT32";
+ case Type::INT64:
+ return "INT64";
+ case Type::INT96:
+ return "INT96";
+ case Type::FLOAT:
+ return "FLOAT";
+ case Type::DOUBLE:
+ return "DOUBLE";
+ case Type::BYTE_ARRAY:
+ return "BYTE_ARRAY";
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return "FIXED_LEN_BYTE_ARRAY";
+ case Type::UNDEFINED:
+ default:
+ return "UNKNOWN";
+ }
+}
+
+std::string ConvertedTypeToString(ConvertedType::type t) {
+ switch (t) {
+ case ConvertedType::NONE:
+ return "NONE";
+ case ConvertedType::UTF8:
+ return "UTF8";
+ case ConvertedType::MAP:
+ return "MAP";
+ case ConvertedType::MAP_KEY_VALUE:
+ return "MAP_KEY_VALUE";
+ case ConvertedType::LIST:
+ return "LIST";
+ case ConvertedType::ENUM:
+ return "ENUM";
+ case ConvertedType::DECIMAL:
+ return "DECIMAL";
+ case ConvertedType::DATE:
+ return "DATE";
+ case ConvertedType::TIME_MILLIS:
+ return "TIME_MILLIS";
+ case ConvertedType::TIME_MICROS:
+ return "TIME_MICROS";
+ case ConvertedType::TIMESTAMP_MILLIS:
+ return "TIMESTAMP_MILLIS";
+ case ConvertedType::TIMESTAMP_MICROS:
+ return "TIMESTAMP_MICROS";
+ case ConvertedType::UINT_8:
+ return "UINT_8";
+ case ConvertedType::UINT_16:
+ return "UINT_16";
+ case ConvertedType::UINT_32:
+ return "UINT_32";
+ case ConvertedType::UINT_64:
+ return "UINT_64";
+ case ConvertedType::INT_8:
+ return "INT_8";
+ case ConvertedType::INT_16:
+ return "INT_16";
+ case ConvertedType::INT_32:
+ return "INT_32";
+ case ConvertedType::INT_64:
+ return "INT_64";
+ case ConvertedType::JSON:
+ return "JSON";
+ case ConvertedType::BSON:
+ return "BSON";
+ case ConvertedType::INTERVAL:
+ return "INTERVAL";
+ case ConvertedType::UNDEFINED:
+ default:
+ return "UNKNOWN";
+ }
+}
+
+int GetTypeByteSize(Type::type parquet_type) {
+ switch (parquet_type) {
+ case Type::BOOLEAN:
+ return type_traits<BooleanType::type_num>::value_byte_size;
+ case Type::INT32:
+ return type_traits<Int32Type::type_num>::value_byte_size;
+ case Type::INT64:
+ return type_traits<Int64Type::type_num>::value_byte_size;
+ case Type::INT96:
+ return type_traits<Int96Type::type_num>::value_byte_size;
+ case Type::DOUBLE:
+ return type_traits<DoubleType::type_num>::value_byte_size;
+ case Type::FLOAT:
+ return type_traits<FloatType::type_num>::value_byte_size;
+ case Type::BYTE_ARRAY:
+ return type_traits<ByteArrayType::type_num>::value_byte_size;
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return type_traits<FLBAType::type_num>::value_byte_size;
+ case Type::UNDEFINED:
+ default:
+ return 0;
+ }
+ return 0;
+}
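+
+// Example (illustrative sketch): fixed-width physical types report their
+// plain-encoded width, e.g. GetTypeByteSize(Type::INT32) == 4 and
+// GetTypeByteSize(Type::INT96) == 12, while Type::UNDEFINED reports 0.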
+
+// Returns the default sort order for a Parquet physical type
+SortOrder::type DefaultSortOrder(Type::type primitive) {
+ switch (primitive) {
+ case Type::BOOLEAN:
+ case Type::INT32:
+ case Type::INT64:
+ case Type::FLOAT:
+ case Type::DOUBLE:
+ return SortOrder::SIGNED;
+ case Type::BYTE_ARRAY:
+ case Type::FIXED_LEN_BYTE_ARRAY:
+ return SortOrder::UNSIGNED;
+ case Type::INT96:
+ case Type::UNDEFINED:
+ return SortOrder::UNKNOWN;
+ }
+ return SortOrder::UNKNOWN;
+}
+
+// Returns the sort order for a Parquet type, using the converted (logical)
+// type when one is set and falling back to the physical type's default otherwise
+SortOrder::type GetSortOrder(ConvertedType::type converted, Type::type primitive) {
+ if (converted == ConvertedType::NONE) return DefaultSortOrder(primitive);
+ switch (converted) {
+ case ConvertedType::INT_8:
+ case ConvertedType::INT_16:
+ case ConvertedType::INT_32:
+ case ConvertedType::INT_64:
+ case ConvertedType::DATE:
+ case ConvertedType::TIME_MICROS:
+ case ConvertedType::TIME_MILLIS:
+ case ConvertedType::TIMESTAMP_MICROS:
+ case ConvertedType::TIMESTAMP_MILLIS:
+ return SortOrder::SIGNED;
+ case ConvertedType::UINT_8:
+ case ConvertedType::UINT_16:
+ case ConvertedType::UINT_32:
+ case ConvertedType::UINT_64:
+ case ConvertedType::ENUM:
+ case ConvertedType::UTF8:
+ case ConvertedType::BSON:
+ case ConvertedType::JSON:
+ return SortOrder::UNSIGNED;
+ case ConvertedType::DECIMAL:
+ case ConvertedType::LIST:
+ case ConvertedType::MAP:
+ case ConvertedType::MAP_KEY_VALUE:
+ case ConvertedType::INTERVAL:
+ case ConvertedType::NONE: // required instead of default
+ case ConvertedType::NA: // required instead of default
+ case ConvertedType::UNDEFINED:
+ return SortOrder::UNKNOWN;
+ }
+ return SortOrder::UNKNOWN;
+}
+
+SortOrder::type GetSortOrder(const std::shared_ptr<const LogicalType>& logical_type,
+ Type::type primitive) {
+ SortOrder::type o = SortOrder::UNKNOWN;
+ if (logical_type && logical_type->is_valid()) {
+ o = (logical_type->is_none() ? DefaultSortOrder(primitive)
+ : logical_type->sort_order());
+ }
+ return o;
+}
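+
+// Example (illustrative sketch): how the overloads above resolve.
+//
+//   GetSortOrder(ConvertedType::UINT_32, Type::INT32);      // SortOrder::UNSIGNED
+//   GetSortOrder(ConvertedType::NONE, Type::INT32);         // DefaultSortOrder
+//                                                           // -> SortOrder::SIGNED
+//   GetSortOrder(LogicalType::String(), Type::BYTE_ARRAY);  // SortOrder::UNSIGNED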
+
+ColumnOrder ColumnOrder::undefined_ = ColumnOrder(ColumnOrder::UNDEFINED);
+ColumnOrder ColumnOrder::type_defined_ = ColumnOrder(ColumnOrder::TYPE_DEFINED_ORDER);
+
+// Static methods for LogicalType class
+
+std::shared_ptr<const LogicalType> LogicalType::FromConvertedType(
+ const ConvertedType::type converted_type,
+ const schema::DecimalMetadata converted_decimal_metadata) {
+ switch (converted_type) {
+ case ConvertedType::UTF8:
+ return StringLogicalType::Make();
+ case ConvertedType::MAP_KEY_VALUE:
+ case ConvertedType::MAP:
+ return MapLogicalType::Make();
+ case ConvertedType::LIST:
+ return ListLogicalType::Make();
+ case ConvertedType::ENUM:
+ return EnumLogicalType::Make();
+ case ConvertedType::DECIMAL:
+ return DecimalLogicalType::Make(converted_decimal_metadata.precision,
+ converted_decimal_metadata.scale);
+ case ConvertedType::DATE:
+ return DateLogicalType::Make();
+ case ConvertedType::TIME_MILLIS:
+ return TimeLogicalType::Make(true, LogicalType::TimeUnit::MILLIS);
+ case ConvertedType::TIME_MICROS:
+ return TimeLogicalType::Make(true, LogicalType::TimeUnit::MICROS);
+ case ConvertedType::TIMESTAMP_MILLIS:
+ return TimestampLogicalType::Make(true, LogicalType::TimeUnit::MILLIS,
+ /*is_from_converted_type=*/true,
+ /*force_set_converted_type=*/false);
+ case ConvertedType::TIMESTAMP_MICROS:
+ return TimestampLogicalType::Make(true, LogicalType::TimeUnit::MICROS,
+ /*is_from_converted_type=*/true,
+ /*force_set_converted_type=*/false);
+ case ConvertedType::INTERVAL:
+ return IntervalLogicalType::Make();
+ case ConvertedType::INT_8:
+ return IntLogicalType::Make(8, true);
+ case ConvertedType::INT_16:
+ return IntLogicalType::Make(16, true);
+ case ConvertedType::INT_32:
+ return IntLogicalType::Make(32, true);
+ case ConvertedType::INT_64:
+ return IntLogicalType::Make(64, true);
+ case ConvertedType::UINT_8:
+ return IntLogicalType::Make(8, false);
+ case ConvertedType::UINT_16:
+ return IntLogicalType::Make(16, false);
+ case ConvertedType::UINT_32:
+ return IntLogicalType::Make(32, false);
+ case ConvertedType::UINT_64:
+ return IntLogicalType::Make(64, false);
+ case ConvertedType::JSON:
+ return JSONLogicalType::Make();
+ case ConvertedType::BSON:
+ return BSONLogicalType::Make();
+ case ConvertedType::NA:
+ return NullLogicalType::Make();
+ case ConvertedType::NONE:
+ return NoLogicalType::Make();
+ case ConvertedType::UNDEFINED:
+ return UndefinedLogicalType::Make();
+ }
+ return UndefinedLogicalType::Make();
+}
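+
+// Example (illustrative sketch): recovering a logical type from legacy
+// converted-type metadata; decimal parameters travel in DecimalMetadata:
+//
+//   schema::DecimalMetadata md;
+//   md.isset = true;
+//   md.precision = 10;
+//   md.scale = 2;
+//   auto lt = LogicalType::FromConvertedType(ConvertedType::DECIMAL, md);
+//   // lt->ToString() == "Decimal(precision=10, scale=2)"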
+
+std::shared_ptr<const LogicalType> LogicalType::FromThrift(
+ const format::LogicalType& type) {
+ if (type.__isset.STRING) {
+ return StringLogicalType::Make();
+ } else if (type.__isset.MAP) {
+ return MapLogicalType::Make();
+ } else if (type.__isset.LIST) {
+ return ListLogicalType::Make();
+ } else if (type.__isset.ENUM) {
+ return EnumLogicalType::Make();
+ } else if (type.__isset.DECIMAL) {
+ return DecimalLogicalType::Make(type.DECIMAL.precision, type.DECIMAL.scale);
+ } else if (type.__isset.DATE) {
+ return DateLogicalType::Make();
+ } else if (type.__isset.TIME) {
+ LogicalType::TimeUnit::unit unit;
+ if (type.TIME.unit.__isset.MILLIS) {
+ unit = LogicalType::TimeUnit::MILLIS;
+ } else if (type.TIME.unit.__isset.MICROS) {
+ unit = LogicalType::TimeUnit::MICROS;
+ } else if (type.TIME.unit.__isset.NANOS) {
+ unit = LogicalType::TimeUnit::NANOS;
+ } else {
+ unit = LogicalType::TimeUnit::UNKNOWN;
+ }
+ return TimeLogicalType::Make(type.TIME.isAdjustedToUTC, unit);
+ } else if (type.__isset.TIMESTAMP) {
+ LogicalType::TimeUnit::unit unit;
+ if (type.TIMESTAMP.unit.__isset.MILLIS) {
+ unit = LogicalType::TimeUnit::MILLIS;
+ } else if (type.TIMESTAMP.unit.__isset.MICROS) {
+ unit = LogicalType::TimeUnit::MICROS;
+ } else if (type.TIMESTAMP.unit.__isset.NANOS) {
+ unit = LogicalType::TimeUnit::NANOS;
+ } else {
+ unit = LogicalType::TimeUnit::UNKNOWN;
+ }
+ return TimestampLogicalType::Make(type.TIMESTAMP.isAdjustedToUTC, unit);
+ // TODO(tpboudreau): activate the commented code after parquet.thrift
+ // recognizes IntervalType as a LogicalType
+ //} else if (type.__isset.INTERVAL) {
+ // return IntervalLogicalType::Make();
+ } else if (type.__isset.INTEGER) {
+ return IntLogicalType::Make(static_cast<int>(type.INTEGER.bitWidth),
+ type.INTEGER.isSigned);
+ } else if (type.__isset.UNKNOWN) {
+ return NullLogicalType::Make();
+ } else if (type.__isset.JSON) {
+ return JSONLogicalType::Make();
+ } else if (type.__isset.BSON) {
+ return BSONLogicalType::Make();
+ } else if (type.__isset.UUID) {
+ return UUIDLogicalType::Make();
+ } else {
+ throw ParquetException("Metadata contains Thrift LogicalType that is not recognized");
+ }
+}
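+
+// Example (illustrative sketch): FromThrift() inverts ToThrift() for every
+// serializable logical type:
+//
+//   format::LogicalType t = LogicalType::Date()->ToThrift();
+//   auto lt = LogicalType::FromThrift(t);
+//   // lt->is_date() == true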
+
+std::shared_ptr<const LogicalType> LogicalType::String() {
+ return StringLogicalType::Make();
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Map() { return MapLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::List() { return ListLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::Enum() { return EnumLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::Decimal(int32_t precision,
+ int32_t scale) {
+ return DecimalLogicalType::Make(precision, scale);
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Date() { return DateLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::Time(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit) {
+ DCHECK(time_unit != LogicalType::TimeUnit::UNKNOWN);
+ return TimeLogicalType::Make(is_adjusted_to_utc, time_unit);
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Timestamp(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
+ bool is_from_converted_type, bool force_set_converted_type) {
+ DCHECK(time_unit != LogicalType::TimeUnit::UNKNOWN);
+ return TimestampLogicalType::Make(is_adjusted_to_utc, time_unit, is_from_converted_type,
+ force_set_converted_type);
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Interval() {
+ return IntervalLogicalType::Make();
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Int(int bit_width, bool is_signed) {
+ DCHECK(bit_width == 64 || bit_width == 32 || bit_width == 16 || bit_width == 8);
+ return IntLogicalType::Make(bit_width, is_signed);
+}
+
+std::shared_ptr<const LogicalType> LogicalType::Null() { return NullLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::JSON() { return JSONLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::BSON() { return BSONLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::UUID() { return UUIDLogicalType::Make(); }
+
+std::shared_ptr<const LogicalType> LogicalType::None() { return NoLogicalType::Make(); }
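+
+// Example (illustrative sketch): the factory shortcuts above are the usual way
+// to build logical types when declaring a schema:
+//
+//   auto decimal_col = LogicalType::Decimal(/*precision=*/10, /*scale=*/2);
+//   auto int_col = LogicalType::Int(/*bit_width=*/32, /*is_signed=*/true);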
+
+/*
+ * The logical type implementation classes are built in four layers:
+ *   (1) the base layer, which establishes the interface and provides generally
+ *       reusable implementations for the ToJSON() and Equals() methods;
+ *   (2) an intermediate derived layer for the "compatibility" methods, which
+ *       provides implementations for is_compatible() and ToConvertedType();
+ *   (3) another intermediate layer for the "applicability" methods, which
+ *       provides several implementations for the is_applicable() method; and
+ *   (4) the final derived classes, one for each logical type, which supply
+ *       implementations for those methods that remain virtual (usually just
+ *       ToString() and ToThrift()) or otherwise need to be overridden.
+ */
+
+// LogicalTypeImpl base class
+
+class LogicalType::Impl {
+ public:
+ virtual bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const = 0;
+
+ virtual bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata = {
+ false, -1, -1}) const = 0;
+
+ virtual ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const = 0;
+
+ virtual std::string ToString() const = 0;
+
+ virtual bool is_serialized() const {
+ return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNDEFINED);
+ }
+
+ virtual std::string ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": ")" << ToString() << R"("})";
+ return json.str();
+ }
+
+ virtual format::LogicalType ToThrift() const {
+ // logical types inheriting this method should never be serialized
+ std::stringstream ss;
+ ss << "Logical type " << ToString() << " should not be serialized";
+ throw ParquetException(ss.str());
+ }
+
+ virtual bool Equals(const LogicalType& other) const { return other.type() == type_; }
+
+ LogicalType::Type::type type() const { return type_; }
+
+ SortOrder::type sort_order() const { return order_; }
+
+ Impl(const Impl&) = delete;
+ Impl& operator=(const Impl&) = delete;
+ virtual ~Impl() noexcept {}
+
+ class Compatible;
+ class SimpleCompatible;
+ class Incompatible;
+
+ class Applicable;
+ class SimpleApplicable;
+ class TypeLengthApplicable;
+ class UniversalApplicable;
+ class Inapplicable;
+
+ class String;
+ class Map;
+ class List;
+ class Enum;
+ class Decimal;
+ class Date;
+ class Time;
+ class Timestamp;
+ class Interval;
+ class Int;
+ class Null;
+ class JSON;
+ class BSON;
+ class UUID;
+ class No;
+ class Undefined;
+
+ protected:
+ Impl(LogicalType::Type::type t, SortOrder::type o) : type_(t), order_(o) {}
+ Impl() = default;
+
+ private:
+ LogicalType::Type::type type_ = LogicalType::Type::UNDEFINED;
+ SortOrder::type order_ = SortOrder::UNKNOWN;
+};
+
+// Special methods for public LogicalType class
+
+LogicalType::LogicalType() = default;
+LogicalType::~LogicalType() noexcept = default;
+
+// Delegating methods for public LogicalType class
+
+bool LogicalType::is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length) const {
+ return impl_->is_applicable(primitive_type, primitive_length);
+}
+
+bool LogicalType::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ return impl_->is_compatible(converted_type, converted_decimal_metadata);
+}
+
+ConvertedType::type LogicalType::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ return impl_->ToConvertedType(out_decimal_metadata);
+}
+
+std::string LogicalType::ToString() const { return impl_->ToString(); }
+
+std::string LogicalType::ToJSON() const { return impl_->ToJSON(); }
+
+format::LogicalType LogicalType::ToThrift() const { return impl_->ToThrift(); }
+
+bool LogicalType::Equals(const LogicalType& other) const { return impl_->Equals(other); }
+
+LogicalType::Type::type LogicalType::type() const { return impl_->type(); }
+
+SortOrder::type LogicalType::sort_order() const { return impl_->sort_order(); }
+
+// Type checks for public LogicalType class
+
+bool LogicalType::is_string() const { return impl_->type() == LogicalType::Type::STRING; }
+bool LogicalType::is_map() const { return impl_->type() == LogicalType::Type::MAP; }
+bool LogicalType::is_list() const { return impl_->type() == LogicalType::Type::LIST; }
+bool LogicalType::is_enum() const { return impl_->type() == LogicalType::Type::ENUM; }
+bool LogicalType::is_decimal() const {
+ return impl_->type() == LogicalType::Type::DECIMAL;
+}
+bool LogicalType::is_date() const { return impl_->type() == LogicalType::Type::DATE; }
+bool LogicalType::is_time() const { return impl_->type() == LogicalType::Type::TIME; }
+bool LogicalType::is_timestamp() const {
+ return impl_->type() == LogicalType::Type::TIMESTAMP;
+}
+bool LogicalType::is_interval() const {
+ return impl_->type() == LogicalType::Type::INTERVAL;
+}
+bool LogicalType::is_int() const { return impl_->type() == LogicalType::Type::INT; }
+bool LogicalType::is_null() const { return impl_->type() == LogicalType::Type::NIL; }
+bool LogicalType::is_JSON() const { return impl_->type() == LogicalType::Type::JSON; }
+bool LogicalType::is_BSON() const { return impl_->type() == LogicalType::Type::BSON; }
+bool LogicalType::is_UUID() const { return impl_->type() == LogicalType::Type::UUID; }
+bool LogicalType::is_none() const { return impl_->type() == LogicalType::Type::NONE; }
+bool LogicalType::is_valid() const {
+ return impl_->type() != LogicalType::Type::UNDEFINED;
+}
+bool LogicalType::is_invalid() const { return !is_valid(); }
+bool LogicalType::is_nested() const {
+ return (impl_->type() == LogicalType::Type::LIST) ||
+ (impl_->type() == LogicalType::Type::MAP);
+}
+bool LogicalType::is_nonnested() const { return !is_nested(); }
+bool LogicalType::is_serialized() const { return impl_->is_serialized(); }
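+
+// Example (illustrative sketch): the predicates above let callers branch on a
+// type-erased LogicalType without downcasting:
+//
+//   auto lt = LogicalType::Timestamp(/*is_adjusted_to_utc=*/true,
+//                                    LogicalType::TimeUnit::MICROS,
+//                                    /*is_from_converted_type=*/false,
+//                                    /*force_set_converted_type=*/false);
+//   // lt->is_timestamp() == true, lt->is_nested() == false,
+//   // lt->sort_order() == SortOrder::SIGNED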
+
+// LogicalTypeImpl intermediate "compatibility" classes
+
+class LogicalType::Impl::Compatible : public virtual LogicalType::Impl {
+ protected:
+ Compatible() = default;
+};
+
+#define set_decimal_metadata(m___, i___, p___, s___) \
+ { \
+ if (m___) { \
+ (m___)->isset = (i___); \
+ (m___)->scale = (s___); \
+ (m___)->precision = (p___); \
+ } \
+ }
+
+#define reset_decimal_metadata(m___) \
+ { set_decimal_metadata(m___, false, -1, -1); }
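+
+// Example (illustrative sketch): these macros write through an optional output
+// pointer, so passing nullptr is a safe no-op:
+//
+//   schema::DecimalMetadata md;
+//   set_decimal_metadata(&md, true, /*precision=*/10, /*scale=*/2);
+//   reset_decimal_metadata(&md);  // md back to {isset=false, -1, -1}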
+
+// For logical types that always translate to the same converted type
+class LogicalType::Impl::SimpleCompatible : public virtual LogicalType::Impl::Compatible {
+ public:
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override {
+ return (converted_type == converted_type_) && !converted_decimal_metadata.isset;
+ }
+
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override {
+ reset_decimal_metadata(out_decimal_metadata);
+ return converted_type_;
+ }
+
+ protected:
+ explicit SimpleCompatible(ConvertedType::type c) : converted_type_(c) {}
+
+ private:
+ ConvertedType::type converted_type_ = ConvertedType::NA;
+};
+
+// For logical types that have no corresponding converted type
+class LogicalType::Impl::Incompatible : public virtual LogicalType::Impl {
+ public:
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override {
+ return (converted_type == ConvertedType::NONE ||
+ converted_type == ConvertedType::NA) &&
+ !converted_decimal_metadata.isset;
+ }
+
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override {
+ reset_decimal_metadata(out_decimal_metadata);
+ return ConvertedType::NONE;
+ }
+
+ protected:
+ Incompatible() = default;
+};
+
+// LogicalTypeImpl intermediate "applicability" classes
+
+class LogicalType::Impl::Applicable : public virtual LogicalType::Impl {
+ protected:
+ Applicable() = default;
+};
+
+// For logical types that can apply only to a single
+// physical type
+class LogicalType::Impl::SimpleApplicable : public virtual LogicalType::Impl::Applicable {
+ public:
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override {
+ return primitive_type == type_;
+ }
+
+ protected:
+ explicit SimpleApplicable(parquet::Type::type t) : type_(t) {}
+
+ private:
+ parquet::Type::type type_;
+};
+
+// For logical types that can apply only to a particular
+// physical type and physical length combination
+class LogicalType::Impl::TypeLengthApplicable
+ : public virtual LogicalType::Impl::Applicable {
+ public:
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override {
+ return primitive_type == type_ && primitive_length == length_;
+ }
+
+ protected:
+ TypeLengthApplicable(parquet::Type::type t, int32_t l) : type_(t), length_(l) {}
+
+ private:
+ parquet::Type::type type_;
+ int32_t length_;
+};
+
+// For logical types that can apply to any physical type
+class LogicalType::Impl::UniversalApplicable
+ : public virtual LogicalType::Impl::Applicable {
+ public:
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override {
+ return true;
+ }
+
+ protected:
+ UniversalApplicable() = default;
+};
+
+// For logical types that can never apply to any primitive
+// physical type
+class LogicalType::Impl::Inapplicable : public virtual LogicalType::Impl {
+ public:
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override {
+ return false;
+ }
+
+ protected:
+ Inapplicable() = default;
+};
+
+// LogicalType implementation final classes
+
+#define OVERRIDE_TOSTRING(n___) \
+ std::string ToString() const override { return #n___; }
+
+#define OVERRIDE_TOTHRIFT(t___, s___) \
+ format::LogicalType ToThrift() const override { \
+ format::LogicalType type; \
+ format::t___ subtype; \
+ type.__set_##s___(subtype); \
+ return type; \
+ }
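+
+// Example (illustrative sketch): OVERRIDE_TOTHRIFT(StringType, STRING) expands to
+//
+//   format::LogicalType ToThrift() const override {
+//     format::LogicalType type;
+//     format::StringType subtype;
+//     type.__set_STRING(subtype);
+//     return type;
+//   }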
+
+class LogicalType::Impl::String final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class StringLogicalType;
+
+ OVERRIDE_TOSTRING(String)
+ OVERRIDE_TOTHRIFT(StringType, STRING)
+
+ private:
+ String()
+ : LogicalType::Impl(LogicalType::Type::STRING, SortOrder::UNSIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::UTF8),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
+};
+
+// Each public logical type class's Make() creation method instantiates a corresponding
+// LogicalType::Impl::* object and installs that implementation in the logical type
+// it returns.
+
+#define GENERATE_MAKE(a___) \
+ std::shared_ptr<const LogicalType> a___##LogicalType::Make() { \
+ auto* logical_type = new a___##LogicalType(); \
+ logical_type->impl_.reset(new LogicalType::Impl::a___()); \
+ return std::shared_ptr<const LogicalType>(logical_type); \
+ }
+
+GENERATE_MAKE(String)
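+
+// Example (illustrative sketch): GENERATE_MAKE(String) expands to
+//
+//   std::shared_ptr<const LogicalType> StringLogicalType::Make() {
+//     auto* logical_type = new StringLogicalType();
+//     logical_type->impl_.reset(new LogicalType::Impl::String());
+//     return std::shared_ptr<const LogicalType>(logical_type);
+//   }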
+
+class LogicalType::Impl::Map final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::Inapplicable {
+ public:
+ friend class MapLogicalType;
+
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override {
+ return (converted_type == ConvertedType::MAP ||
+ converted_type == ConvertedType::MAP_KEY_VALUE) &&
+ !converted_decimal_metadata.isset;
+ }
+
+ OVERRIDE_TOSTRING(Map)
+ OVERRIDE_TOTHRIFT(MapType, MAP)
+
+ private:
+ Map()
+ : LogicalType::Impl(LogicalType::Type::MAP, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::MAP) {}
+};
+
+GENERATE_MAKE(Map)
+
+class LogicalType::Impl::List final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::Inapplicable {
+ public:
+ friend class ListLogicalType;
+
+ OVERRIDE_TOSTRING(List)
+ OVERRIDE_TOTHRIFT(ListType, LIST)
+
+ private:
+ List()
+ : LogicalType::Impl(LogicalType::Type::LIST, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::LIST) {}
+};
+
+GENERATE_MAKE(List)
+
+class LogicalType::Impl::Enum final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class EnumLogicalType;
+
+ OVERRIDE_TOSTRING(Enum)
+ OVERRIDE_TOTHRIFT(EnumType, ENUM)
+
+ private:
+ Enum()
+ : LogicalType::Impl(LogicalType::Type::ENUM, SortOrder::UNSIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::ENUM),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
+};
+
+GENERATE_MAKE(Enum)
+
+// The parameterized logical types (currently Decimal, Time, Timestamp, and Int)
+// generally can't reuse the simple method implementations available in the base and
+// intermediate classes and must (re)implement them all
+
+class LogicalType::Impl::Decimal final : public LogicalType::Impl::Compatible,
+ public LogicalType::Impl::Applicable {
+ public:
+ friend class DecimalLogicalType;
+
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override;
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override;
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override;
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ int32_t precision() const { return precision_; }
+ int32_t scale() const { return scale_; }
+
+ private:
+ Decimal(int32_t p, int32_t s)
+ : LogicalType::Impl(LogicalType::Type::DECIMAL, SortOrder::SIGNED),
+ precision_(p),
+ scale_(s) {}
+ int32_t precision_ = -1;
+ int32_t scale_ = -1;
+};
+
+bool LogicalType::Impl::Decimal::is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length) const {
+ bool ok = false;
+ switch (primitive_type) {
+ case parquet::Type::INT32: {
+ ok = (1 <= precision_) && (precision_ <= 9);
+ } break;
+ case parquet::Type::INT64: {
+ ok = (1 <= precision_) && (precision_ <= 18);
+ if (precision_ < 10) {
+ // FIXME(tpb): warn that INT32 could be used
+ }
+ } break;
+ case parquet::Type::FIXED_LEN_BYTE_ARRAY: {
+ ok = precision_ <= static_cast<int32_t>(std::floor(
+ std::log10(std::pow(2.0, (8.0 * primitive_length) - 1.0))));
+ } break;
+ case parquet::Type::BYTE_ARRAY: {
+ ok = true;
+ } break;
+ default: {
+ } break;
+ }
+ return ok;
+}
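+
+// Example (illustrative sketch): for FIXED_LEN_BYTE_ARRAY the bound above is
+// the largest decimal precision a two's-complement value of that width can
+// hold, floor(log10(2^(8 * length - 1))):
+//
+//   length ==  4  ->  max precision  9
+//   length == 16  ->  max precision 38  (the usual decimal128 limit)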
+
+bool LogicalType::Impl::Decimal::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ return converted_type == ConvertedType::DECIMAL &&
+ (converted_decimal_metadata.isset &&
+ converted_decimal_metadata.scale == scale_ &&
+ converted_decimal_metadata.precision == precision_);
+}
+
+ConvertedType::type LogicalType::Impl::Decimal::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ set_decimal_metadata(out_decimal_metadata, true, precision_, scale_);
+ return ConvertedType::DECIMAL;
+}
+
+std::string LogicalType::Impl::Decimal::ToString() const {
+ std::stringstream type;
+ type << "Decimal(precision=" << precision_ << ", scale=" << scale_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Decimal::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Decimal", "precision": )" << precision_ << R"(, "scale": )"
+ << scale_ << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Decimal::ToThrift() const {
+ format::LogicalType type;
+ format::DecimalType decimal_type;
+ decimal_type.__set_precision(precision_);
+ decimal_type.__set_scale(scale_);
+ type.__set_DECIMAL(decimal_type);
+ return type;
+}
+
+bool LogicalType::Impl::Decimal::Equals(const LogicalType& other) const {
+ bool eq = false;
+ if (other.is_decimal()) {
+ const auto& other_decimal = checked_cast<const DecimalLogicalType&>(other);
+ eq = (precision_ == other_decimal.precision() && scale_ == other_decimal.scale());
+ }
+ return eq;
+}
+
+std::shared_ptr<const LogicalType> DecimalLogicalType::Make(int32_t precision,
+ int32_t scale) {
+ if (precision < 1) {
+ throw ParquetException(
+ "Precision must be greater than or equal to 1 for Decimal logical type");
+ }
+ if (scale < 0 || scale > precision) {
+ throw ParquetException(
+ "Scale must be a non-negative integer that does not exceed precision for "
+ "Decimal logical type");
+ }
+ auto* logical_type = new DecimalLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Decimal(precision, scale));
+ return std::shared_ptr<const LogicalType>(logical_type);
+}
+
+int32_t DecimalLogicalType::precision() const {
+ return (dynamic_cast<const LogicalType::Impl::Decimal&>(*impl_)).precision();
+}
+
+int32_t DecimalLogicalType::scale() const {
+ return (dynamic_cast<const LogicalType::Impl::Decimal&>(*impl_)).scale();
+}
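+
+// Example (illustrative sketch): Make() validates its arguments, so invalid
+// combinations fail eagerly instead of producing a bad schema:
+//
+//   auto ok = DecimalLogicalType::Make(/*precision=*/10, /*scale=*/2);
+//   // DecimalLogicalType::Make(0, 0) throws ParquetException
+//   // DecimalLogicalType::Make(5, 6) throws ParquetException (scale > precision)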
+
+class LogicalType::Impl::Date final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class DateLogicalType;
+
+ OVERRIDE_TOSTRING(Date)
+ OVERRIDE_TOTHRIFT(DateType, DATE)
+
+ private:
+ Date()
+ : LogicalType::Impl(LogicalType::Type::DATE, SortOrder::SIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::DATE),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::INT32) {}
+};
+
+GENERATE_MAKE(Date)
+
+#define time_unit_string(u___) \
+ ((u___) == LogicalType::TimeUnit::MILLIS \
+ ? "milliseconds" \
+ : ((u___) == LogicalType::TimeUnit::MICROS \
+ ? "microseconds" \
+ : ((u___) == LogicalType::TimeUnit::NANOS ? "nanoseconds" : "unknown")))
+
+class LogicalType::Impl::Time final : public LogicalType::Impl::Compatible,
+ public LogicalType::Impl::Applicable {
+ public:
+ friend class TimeLogicalType;
+
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override;
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override;
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override;
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ bool is_adjusted_to_utc() const { return adjusted_; }
+ LogicalType::TimeUnit::unit time_unit() const { return unit_; }
+
+ private:
+ Time(bool a, LogicalType::TimeUnit::unit u)
+ : LogicalType::Impl(LogicalType::Type::TIME, SortOrder::SIGNED),
+ adjusted_(a),
+ unit_(u) {}
+ bool adjusted_ = false;
+ LogicalType::TimeUnit::unit unit_;
+};
+
+bool LogicalType::Impl::Time::is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length) const {
+ return (primitive_type == parquet::Type::INT32 &&
+ unit_ == LogicalType::TimeUnit::MILLIS) ||
+ (primitive_type == parquet::Type::INT64 &&
+ (unit_ == LogicalType::TimeUnit::MICROS ||
+ unit_ == LogicalType::TimeUnit::NANOS));
+}
+
+bool LogicalType::Impl::Time::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ if (converted_decimal_metadata.isset) {
+ return false;
+ } else if (adjusted_ && unit_ == LogicalType::TimeUnit::MILLIS) {
+ return converted_type == ConvertedType::TIME_MILLIS;
+ } else if (adjusted_ && unit_ == LogicalType::TimeUnit::MICROS) {
+ return converted_type == ConvertedType::TIME_MICROS;
+ } else {
+ return (converted_type == ConvertedType::NONE) ||
+ (converted_type == ConvertedType::NA);
+ }
+}
+
+ConvertedType::type LogicalType::Impl::Time::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ reset_decimal_metadata(out_decimal_metadata);
+ if (adjusted_) {
+ if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ return ConvertedType::TIME_MILLIS;
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ return ConvertedType::TIME_MICROS;
+ }
+ }
+ return ConvertedType::NONE;
+}
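+
+// Example (illustrative sketch): only UTC-adjusted millisecond and microsecond
+// times have converted-type equivalents; everything else maps to NONE:
+//
+//   Time(true, MILLIS)  -> ConvertedType::TIME_MILLIS
+//   Time(true, NANOS)   -> ConvertedType::NONE
+//   Time(false, MICROS) -> ConvertedType::NONE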
+
+std::string LogicalType::Impl::Time::ToString() const {
+ std::stringstream type;
+ type << "Time(isAdjustedToUTC=" << std::boolalpha << adjusted_
+ << ", timeUnit=" << time_unit_string(unit_) << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Time::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Time", "isAdjustedToUTC": )" << std::boolalpha << adjusted_
+ << R"(, "timeUnit": ")" << time_unit_string(unit_) << R"("})";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Time::ToThrift() const {
+ format::LogicalType type;
+ format::TimeType time_type;
+ format::TimeUnit time_unit;
+ DCHECK(unit_ != LogicalType::TimeUnit::UNKNOWN);
+ if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ format::MilliSeconds millis;
+ time_unit.__set_MILLIS(millis);
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ format::MicroSeconds micros;
+ time_unit.__set_MICROS(micros);
+ } else if (unit_ == LogicalType::TimeUnit::NANOS) {
+ format::NanoSeconds nanos;
+ time_unit.__set_NANOS(nanos);
+ }
+ time_type.__set_isAdjustedToUTC(adjusted_);
+ time_type.__set_unit(time_unit);
+ type.__set_TIME(time_type);
+ return type;
+}
+
+bool LogicalType::Impl::Time::Equals(const LogicalType& other) const {
+ bool eq = false;
+ if (other.is_time()) {
+ const auto& other_time = checked_cast<const TimeLogicalType&>(other);
+ eq =
+ (adjusted_ == other_time.is_adjusted_to_utc() && unit_ == other_time.time_unit());
+ }
+ return eq;
+}
+
+std::shared_ptr<const LogicalType> TimeLogicalType::Make(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit) {
+ if (time_unit == LogicalType::TimeUnit::MILLIS ||
+ time_unit == LogicalType::TimeUnit::MICROS ||
+ time_unit == LogicalType::TimeUnit::NANOS) {
+ auto* logical_type = new TimeLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Time(is_adjusted_to_utc, time_unit));
+ return std::shared_ptr<const LogicalType>(logical_type);
+ } else {
+ throw ParquetException(
+ "TimeUnit must be one of MILLIS, MICROS, or NANOS for Time logical type");
+ }
+}
+
+bool TimeLogicalType::is_adjusted_to_utc() const {
+ return (dynamic_cast<const LogicalType::Impl::Time&>(*impl_)).is_adjusted_to_utc();
+}
+
+LogicalType::TimeUnit::unit TimeLogicalType::time_unit() const {
+ return (dynamic_cast<const LogicalType::Impl::Time&>(*impl_)).time_unit();
+}
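+
+// Example (illustrative sketch): the physical type a Time column requires
+// depends on its unit, as encoded in is_applicable() above:
+//
+//   auto t = TimeLogicalType::Make(/*is_adjusted_to_utc=*/true,
+//                                  LogicalType::TimeUnit::NANOS);
+//   // t->is_applicable(parquet::Type::INT64) == true
+//   // t->is_applicable(parquet::Type::INT32) == false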
+
+class LogicalType::Impl::Timestamp final : public LogicalType::Impl::Compatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class TimestampLogicalType;
+
+ bool is_serialized() const override;
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override;
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override;
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ bool is_adjusted_to_utc() const { return adjusted_; }
+ LogicalType::TimeUnit::unit time_unit() const { return unit_; }
+
+ bool is_from_converted_type() const { return is_from_converted_type_; }
+ bool force_set_converted_type() const { return force_set_converted_type_; }
+
+ private:
+ Timestamp(bool adjusted, LogicalType::TimeUnit::unit unit, bool is_from_converted_type,
+ bool force_set_converted_type)
+ : LogicalType::Impl(LogicalType::Type::TIMESTAMP, SortOrder::SIGNED),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::INT64),
+ adjusted_(adjusted),
+ unit_(unit),
+ is_from_converted_type_(is_from_converted_type),
+ force_set_converted_type_(force_set_converted_type) {}
+ bool adjusted_ = false;
+ LogicalType::TimeUnit::unit unit_;
+ bool is_from_converted_type_ = false;
+ bool force_set_converted_type_ = false;
+};
+
+bool LogicalType::Impl::Timestamp::is_serialized() const {
+ return !is_from_converted_type_;
+}
+
+bool LogicalType::Impl::Timestamp::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ if (converted_decimal_metadata.isset) {
+ return false;
+ } else if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ if (adjusted_ || force_set_converted_type_) {
+ return converted_type == ConvertedType::TIMESTAMP_MILLIS;
+ } else {
+ return (converted_type == ConvertedType::NONE) ||
+ (converted_type == ConvertedType::NA);
+ }
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ if (adjusted_ || force_set_converted_type_) {
+ return converted_type == ConvertedType::TIMESTAMP_MICROS;
+ } else {
+ return (converted_type == ConvertedType::NONE) ||
+ (converted_type == ConvertedType::NA);
+ }
+ } else {
+ return (converted_type == ConvertedType::NONE) ||
+ (converted_type == ConvertedType::NA);
+ }
+}
+
+ConvertedType::type LogicalType::Impl::Timestamp::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ reset_decimal_metadata(out_decimal_metadata);
+ if (adjusted_ || force_set_converted_type_) {
+ if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ return ConvertedType::TIMESTAMP_MILLIS;
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ return ConvertedType::TIMESTAMP_MICROS;
+ }
+ }
+ return ConvertedType::NONE;
+}
+
+std::string LogicalType::Impl::Timestamp::ToString() const {
+ std::stringstream type;
+ type << "Timestamp(isAdjustedToUTC=" << std::boolalpha << adjusted_
+ << ", timeUnit=" << time_unit_string(unit_)
+ << ", is_from_converted_type=" << is_from_converted_type_
+ << ", force_set_converted_type=" << force_set_converted_type_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Timestamp::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Timestamp", "isAdjustedToUTC": )" << std::boolalpha << adjusted_
+ << R"(, "timeUnit": ")" << time_unit_string(unit_) << R"(")"
+ << R"(, "is_from_converted_type": )" << is_from_converted_type_
+ << R"(, "force_set_converted_type": )" << force_set_converted_type_ << R"(})";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Timestamp::ToThrift() const {
+ format::LogicalType type;
+ format::TimestampType timestamp_type;
+ format::TimeUnit time_unit;
+ DCHECK(unit_ != LogicalType::TimeUnit::UNKNOWN);
+ if (unit_ == LogicalType::TimeUnit::MILLIS) {
+ format::MilliSeconds millis;
+ time_unit.__set_MILLIS(millis);
+ } else if (unit_ == LogicalType::TimeUnit::MICROS) {
+ format::MicroSeconds micros;
+ time_unit.__set_MICROS(micros);
+ } else if (unit_ == LogicalType::TimeUnit::NANOS) {
+ format::NanoSeconds nanos;
+ time_unit.__set_NANOS(nanos);
+ }
+ timestamp_type.__set_isAdjustedToUTC(adjusted_);
+ timestamp_type.__set_unit(time_unit);
+ type.__set_TIMESTAMP(timestamp_type);
+ return type;
+}
+
+bool LogicalType::Impl::Timestamp::Equals(const LogicalType& other) const {
+ bool eq = false;
+ if (other.is_timestamp()) {
+ const auto& other_timestamp = checked_cast<const TimestampLogicalType&>(other);
+ eq = (adjusted_ == other_timestamp.is_adjusted_to_utc() &&
+ unit_ == other_timestamp.time_unit());
+ }
+ return eq;
+}
+
+std::shared_ptr<const LogicalType> TimestampLogicalType::Make(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
+ bool is_from_converted_type, bool force_set_converted_type) {
+ if (time_unit == LogicalType::TimeUnit::MILLIS ||
+ time_unit == LogicalType::TimeUnit::MICROS ||
+ time_unit == LogicalType::TimeUnit::NANOS) {
+ auto* logical_type = new TimestampLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Timestamp(
+ is_adjusted_to_utc, time_unit, is_from_converted_type, force_set_converted_type));
+ return std::shared_ptr<const LogicalType>(logical_type);
+ } else {
+ throw ParquetException(
+ "TimeUnit must be one of MILLIS, MICROS, or NANOS for Timestamp logical type");
+ }
+}
+
+bool TimestampLogicalType::is_adjusted_to_utc() const {
+ return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_)).is_adjusted_to_utc();
+}
+
+LogicalType::TimeUnit::unit TimestampLogicalType::time_unit() const {
+ return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_)).time_unit();
+}
+
+bool TimestampLogicalType::is_from_converted_type() const {
+ return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_))
+ .is_from_converted_type();
+}
+
+bool TimestampLogicalType::force_set_converted_type() const {
+ return (dynamic_cast<const LogicalType::Impl::Timestamp&>(*impl_))
+ .force_set_converted_type();
+}
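+
+// Example (illustrative): a non-UTC microsecond timestamp maps back to a
+// legacy ConvertedType only when force_set_converted_type is set.
+//
+//   auto ts = TimestampLogicalType::Make(/*is_adjusted_to_utc=*/false,
+//                                        LogicalType::TimeUnit::MICROS,
+//                                        /*is_from_converted_type=*/false,
+//                                        /*force_set_converted_type=*/true);
+//   schema::DecimalMetadata md;
+//   // ts->ToConvertedType(&md) returns ConvertedType::TIMESTAMP_MICROS;
+//   // with the force flag left false it would return ConvertedType::NONE.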
+
+class LogicalType::Impl::Interval final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::TypeLengthApplicable {
+ public:
+ friend class IntervalLogicalType;
+
+ OVERRIDE_TOSTRING(Interval)
+ // TODO(tpboudreau): uncomment the following line to enable serialization after
+ // parquet.thrift recognizes IntervalType as a ConvertedType
+ // OVERRIDE_TOTHRIFT(IntervalType, INTERVAL)
+
+ private:
+ Interval()
+ : LogicalType::Impl(LogicalType::Type::INTERVAL, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::INTERVAL),
+ LogicalType::Impl::TypeLengthApplicable(parquet::Type::FIXED_LEN_BYTE_ARRAY, 12) {
+ }
+};
+
+GENERATE_MAKE(Interval)
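+
+// Note: per the Parquet format specification, the INTERVAL annotation applies
+// to a FIXED_LEN_BYTE_ARRAY of length 12 holding three little-endian unsigned
+// 32-bit integers: months, days, and milliseconds, in that order.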
+
+class LogicalType::Impl::Int final : public LogicalType::Impl::Compatible,
+ public LogicalType::Impl::Applicable {
+ public:
+ friend class IntLogicalType;
+
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const override;
+ bool is_compatible(ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const override;
+ ConvertedType::type ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const override;
+ std::string ToString() const override;
+ std::string ToJSON() const override;
+ format::LogicalType ToThrift() const override;
+ bool Equals(const LogicalType& other) const override;
+
+ int bit_width() const { return width_; }
+ bool is_signed() const { return signed_; }
+
+ private:
+ Int(int w, bool s)
+ : LogicalType::Impl(LogicalType::Type::INT,
+ (s ? SortOrder::SIGNED : SortOrder::UNSIGNED)),
+ width_(w),
+ signed_(s) {}
+ int width_ = 0;
+ bool signed_ = false;
+};
+
+bool LogicalType::Impl::Int::is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length) const {
+ return (primitive_type == parquet::Type::INT32 && width_ <= 32) ||
+ (primitive_type == parquet::Type::INT64 && width_ == 64);
+}
+
+bool LogicalType::Impl::Int::is_compatible(
+ ConvertedType::type converted_type,
+ schema::DecimalMetadata converted_decimal_metadata) const {
+ if (converted_decimal_metadata.isset) {
+ return false;
+ } else if (signed_ && width_ == 8) {
+ return converted_type == ConvertedType::INT_8;
+ } else if (signed_ && width_ == 16) {
+ return converted_type == ConvertedType::INT_16;
+ } else if (signed_ && width_ == 32) {
+ return converted_type == ConvertedType::INT_32;
+ } else if (signed_ && width_ == 64) {
+ return converted_type == ConvertedType::INT_64;
+ } else if (!signed_ && width_ == 8) {
+ return converted_type == ConvertedType::UINT_8;
+ } else if (!signed_ && width_ == 16) {
+ return converted_type == ConvertedType::UINT_16;
+ } else if (!signed_ && width_ == 32) {
+ return converted_type == ConvertedType::UINT_32;
+ } else if (!signed_ && width_ == 64) {
+ return converted_type == ConvertedType::UINT_64;
+ } else {
+ return false;
+ }
+}
+
+ConvertedType::type LogicalType::Impl::Int::ToConvertedType(
+ schema::DecimalMetadata* out_decimal_metadata) const {
+ reset_decimal_metadata(out_decimal_metadata);
+ if (signed_) {
+ switch (width_) {
+ case 8:
+ return ConvertedType::INT_8;
+ case 16:
+ return ConvertedType::INT_16;
+ case 32:
+ return ConvertedType::INT_32;
+ case 64:
+ return ConvertedType::INT_64;
+ }
+ } else { // unsigned
+ switch (width_) {
+ case 8:
+ return ConvertedType::UINT_8;
+ case 16:
+ return ConvertedType::UINT_16;
+ case 32:
+ return ConvertedType::UINT_32;
+ case 64:
+ return ConvertedType::UINT_64;
+ }
+ }
+ return ConvertedType::NONE;
+}
+
+std::string LogicalType::Impl::Int::ToString() const {
+ std::stringstream type;
+ type << "Int(bitWidth=" << width_ << ", isSigned=" << std::boolalpha << signed_ << ")";
+ return type.str();
+}
+
+std::string LogicalType::Impl::Int::ToJSON() const {
+ std::stringstream json;
+ json << R"({"Type": "Int", "bitWidth": )" << width_ << R"(, "isSigned": )"
+ << std::boolalpha << signed_ << "}";
+ return json.str();
+}
+
+format::LogicalType LogicalType::Impl::Int::ToThrift() const {
+ format::LogicalType type;
+ format::IntType int_type;
+ DCHECK(width_ == 64 || width_ == 32 || width_ == 16 || width_ == 8);
+ int_type.__set_bitWidth(static_cast<int8_t>(width_));
+ int_type.__set_isSigned(signed_);
+ type.__set_INTEGER(int_type);
+ return type;
+}
+
+bool LogicalType::Impl::Int::Equals(const LogicalType& other) const {
+ bool eq = false;
+ if (other.is_int()) {
+ const auto& other_int = checked_cast<const IntLogicalType&>(other);
+ eq = (width_ == other_int.bit_width() && signed_ == other_int.is_signed());
+ }
+ return eq;
+}
+
+std::shared_ptr<const LogicalType> IntLogicalType::Make(int bit_width, bool is_signed) {
+ if (bit_width == 8 || bit_width == 16 || bit_width == 32 || bit_width == 64) {
+ auto* logical_type = new IntLogicalType();
+ logical_type->impl_.reset(new LogicalType::Impl::Int(bit_width, is_signed));
+ return std::shared_ptr<const LogicalType>(logical_type);
+ } else {
+ throw ParquetException(
+ "Bit width must be exactly 8, 16, 32, or 64 for Int logical type");
+ }
+}
+
+int IntLogicalType::bit_width() const {
+ return (dynamic_cast<const LogicalType::Impl::Int&>(*impl_)).bit_width();
+}
+
+bool IntLogicalType::is_signed() const {
+ return (dynamic_cast<const LogicalType::Impl::Int&>(*impl_)).is_signed();
+}
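+
+// Example (illustrative): Int(16, /*is_signed=*/false) corresponds to the
+// legacy ConvertedType::UINT_16 and only annotates 32-bit physical storage.
+//
+//   auto i = IntLogicalType::Make(/*bit_width=*/16, /*is_signed=*/false);
+//   schema::DecimalMetadata md;
+//   // i->ToConvertedType(&md) returns ConvertedType::UINT_16;
+//   // i->is_applicable(parquet::Type::INT32) is true, while
+//   // i->is_applicable(parquet::Type::INT64) is false (INT64 needs width 64).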
+
+class LogicalType::Impl::Null final : public LogicalType::Impl::Incompatible,
+ public LogicalType::Impl::UniversalApplicable {
+ public:
+ friend class NullLogicalType;
+
+ OVERRIDE_TOSTRING(Null)
+ OVERRIDE_TOTHRIFT(NullType, UNKNOWN)
+
+ private:
+ Null() : LogicalType::Impl(LogicalType::Type::NIL, SortOrder::UNKNOWN) {}
+};
+
+GENERATE_MAKE(Null)
+
+class LogicalType::Impl::JSON final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class JSONLogicalType;
+
+ OVERRIDE_TOSTRING(JSON)
+ OVERRIDE_TOTHRIFT(JsonType, JSON)
+
+ private:
+ JSON()
+ : LogicalType::Impl(LogicalType::Type::JSON, SortOrder::UNSIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::JSON),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
+};
+
+GENERATE_MAKE(JSON)
+
+class LogicalType::Impl::BSON final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::SimpleApplicable {
+ public:
+ friend class BSONLogicalType;
+
+ OVERRIDE_TOSTRING(BSON)
+ OVERRIDE_TOTHRIFT(BsonType, BSON)
+
+ private:
+ BSON()
+ : LogicalType::Impl(LogicalType::Type::BSON, SortOrder::UNSIGNED),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::BSON),
+ LogicalType::Impl::SimpleApplicable(parquet::Type::BYTE_ARRAY) {}
+};
+
+GENERATE_MAKE(BSON)
+
+class LogicalType::Impl::UUID final : public LogicalType::Impl::Incompatible,
+ public LogicalType::Impl::TypeLengthApplicable {
+ public:
+ friend class UUIDLogicalType;
+
+ OVERRIDE_TOSTRING(UUID)
+ OVERRIDE_TOTHRIFT(UUIDType, UUID)
+
+ private:
+ UUID()
+ : LogicalType::Impl(LogicalType::Type::UUID, SortOrder::UNSIGNED),
+ LogicalType::Impl::TypeLengthApplicable(parquet::Type::FIXED_LEN_BYTE_ARRAY, 16) {
+ }
+};
+
+GENERATE_MAKE(UUID)
+
+class LogicalType::Impl::No final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::UniversalApplicable {
+ public:
+ friend class NoLogicalType;
+
+ OVERRIDE_TOSTRING(None)
+
+ private:
+ No()
+ : LogicalType::Impl(LogicalType::Type::NONE, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::NONE) {}
+};
+
+GENERATE_MAKE(No)
+
+class LogicalType::Impl::Undefined final : public LogicalType::Impl::SimpleCompatible,
+ public LogicalType::Impl::UniversalApplicable {
+ public:
+ friend class UndefinedLogicalType;
+
+ OVERRIDE_TOSTRING(Undefined)
+
+ private:
+ Undefined()
+ : LogicalType::Impl(LogicalType::Type::UNDEFINED, SortOrder::UNKNOWN),
+ LogicalType::Impl::SimpleCompatible(ConvertedType::UNDEFINED) {}
+};
+
+GENERATE_MAKE(Undefined)
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/types.h b/contrib/libs/apache/arrow/cpp/src/parquet/types.h
index 40981d9bf1a..c25719830ec 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/types.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/types.h
@@ -1,765 +1,765 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <cstdint>
-#include <cstring>
-#include <memory>
-#include <sstream>
-#include <string>
-
-#include "arrow/util/string_view.h"
-
-#include "parquet/platform.h"
-#include "parquet/type_fwd.h"
-
-#ifdef _WIN32
-
-// Repetition::OPTIONAL conflicts with a #define, so we undefine it
-#ifdef OPTIONAL
-#undef OPTIONAL
-#endif
-
-#endif // _WIN32
-
-namespace arrow {
-namespace util {
-
-class Codec;
-
-} // namespace util
-} // namespace arrow
-
-namespace parquet {
-
-// ----------------------------------------------------------------------
-// Metadata enums to match Thrift metadata
-//
-// The reason we maintain our own enums is to avoid a transitive dependency on
-// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
-// public API. After building parquet-cpp, you should not need to include
-// Thrift headers in your application. This means some boilerplate to convert
-// between our types and Parquet's Thrift types.
-//
-// We can also add special values like NONE to distinguish between metadata
-// values being set and not set. As an example, consider ConvertedType and
-// CompressionCodec.
-
-// Mirrors parquet::Type
-struct Type {
- enum type {
- BOOLEAN = 0,
- INT32 = 1,
- INT64 = 2,
- INT96 = 3,
- FLOAT = 4,
- DOUBLE = 5,
- BYTE_ARRAY = 6,
- FIXED_LEN_BYTE_ARRAY = 7,
- // Should always be last element.
- UNDEFINED = 8
- };
-};
-
-// Mirrors parquet::ConvertedType
-struct ConvertedType {
- enum type {
- NONE, // Not a real converted type, but means no converted type is specified
- UTF8,
- MAP,
- MAP_KEY_VALUE,
- LIST,
- ENUM,
- DECIMAL,
- DATE,
- TIME_MILLIS,
- TIME_MICROS,
- TIMESTAMP_MILLIS,
- TIMESTAMP_MICROS,
- UINT_8,
- UINT_16,
- UINT_32,
- UINT_64,
- INT_8,
- INT_16,
- INT_32,
- INT_64,
- JSON,
- BSON,
- INTERVAL,
- // DEPRECATED INVALID ConvertedType for all-null data.
- // Only useful for reading legacy files written out by interim Parquet C++ releases.
- // For writing, always emit LogicalType::Null instead.
- // See PARQUET-1990.
- NA = 25,
- UNDEFINED = 26 // Not a real converted type; should always be last element
- };
-};
-
-// forward declaration
-namespace format {
-
-class LogicalType;
-
-}
-
-// Mirrors parquet::FieldRepetitionType
-struct Repetition {
- enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2, /*Always last*/ UNDEFINED = 3 };
-};
-
-// Reference:
-// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
-// format/converter/ParquetMetadataConverter.java
-// Sort order for page and column statistics. Types are associated with sort
-// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are
-// aggregated using a sort order. As of parquet-format version 2.3.1, the
-// order used to aggregate stats is always SIGNED and is not stored in the
-// Parquet file. These stats are discarded for types that need unsigned.
-// See PARQUET-686.
-struct SortOrder {
- enum type { SIGNED, UNSIGNED, UNKNOWN };
-};
-
-namespace schema {
-
-struct DecimalMetadata {
- bool isset;
- int32_t scale;
- int32_t precision;
-};
-
-} // namespace schema
-
-/// \brief Implementation of parquet.thrift LogicalType types.
-class PARQUET_EXPORT LogicalType {
- public:
- struct Type {
- enum type {
- UNDEFINED = 0, // Not a real logical type
- STRING = 1,
- MAP,
- LIST,
- ENUM,
- DECIMAL,
- DATE,
- TIME,
- TIMESTAMP,
- INTERVAL,
- INT,
- NIL, // Thrift NullType: annotates data that is always null
- JSON,
- BSON,
- UUID,
- NONE // Not a real logical type; should always be last element
- };
- };
-
- struct TimeUnit {
- enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS };
- };
-
- /// \brief If possible, return a logical type equivalent to the given legacy
- /// converted type (and decimal metadata if applicable).
- static std::shared_ptr<const LogicalType> FromConvertedType(
- const parquet::ConvertedType::type converted_type,
- const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1,
- -1});
-
- /// \brief Return the logical type represented by the Thrift intermediary object.
- static std::shared_ptr<const LogicalType> FromThrift(
- const parquet::format::LogicalType& thrift_logical_type);
-
- /// \brief Return the explicitly requested logical type.
- static std::shared_ptr<const LogicalType> String();
- static std::shared_ptr<const LogicalType> Map();
- static std::shared_ptr<const LogicalType> List();
- static std::shared_ptr<const LogicalType> Enum();
- static std::shared_ptr<const LogicalType> Decimal(int32_t precision, int32_t scale = 0);
- static std::shared_ptr<const LogicalType> Date();
- static std::shared_ptr<const LogicalType> Time(bool is_adjusted_to_utc,
- LogicalType::TimeUnit::unit time_unit);
-
- /// \brief Create a Timestamp logical type
- /// \param[in] is_adjusted_to_utc set true if the data is UTC-normalized
- /// \param[in] time_unit the resolution of the timestamp
- /// \param[in] is_from_converted_type if true, the timestamp was generated
- /// by translating a legacy converted type of TIMESTAMP_MILLIS or
- /// TIMESTAMP_MICROS. Default is false.
- /// \param[in] force_set_converted_type if true, always set the
- /// legacy ConvertedType TIMESTAMP_MICROS and TIMESTAMP_MILLIS
- /// metadata. Default is false
- static std::shared_ptr<const LogicalType> Timestamp(
- bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
- bool is_from_converted_type = false, bool force_set_converted_type = false);
-
- static std::shared_ptr<const LogicalType> Interval();
- static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);
-
- /// \brief Create a logical type for data that's always null
- ///
- /// Any physical type can be annotated with this logical type.
- static std::shared_ptr<const LogicalType> Null();
-
- static std::shared_ptr<const LogicalType> JSON();
- static std::shared_ptr<const LogicalType> BSON();
- static std::shared_ptr<const LogicalType> UUID();
-
- /// \brief Create a placeholder for when no logical type is specified
- static std::shared_ptr<const LogicalType> None();
-
- /// \brief Return true if this logical type is consistent with the given underlying
- /// physical type.
- bool is_applicable(parquet::Type::type primitive_type,
- int32_t primitive_length = -1) const;
-
- /// \brief Return true if this logical type is equivalent to the given legacy converted
- /// type (and decimal metadata if applicable).
- bool is_compatible(parquet::ConvertedType::type converted_type,
- parquet::schema::DecimalMetadata converted_decimal_metadata = {
- false, -1, -1}) const;
-
- /// \brief If possible, return the legacy converted type (and decimal metadata if
- /// applicable) equivalent to this logical type.
- parquet::ConvertedType::type ToConvertedType(
- parquet::schema::DecimalMetadata* out_decimal_metadata) const;
-
- /// \brief Return a printable representation of this logical type.
- std::string ToString() const;
-
- /// \brief Return a JSON representation of this logical type.
- std::string ToJSON() const;
-
- /// \brief Return a serializable Thrift object for this logical type.
- parquet::format::LogicalType ToThrift() const;
-
- /// \brief Return true if the given logical type is equivalent to this logical type.
- bool Equals(const LogicalType& other) const;
-
- /// \brief Return the enumerated type of this logical type.
- LogicalType::Type::type type() const;
-
- /// \brief Return the appropriate sort order for this logical type.
- SortOrder::type sort_order() const;
-
- // Type checks ...
- bool is_string() const;
- bool is_map() const;
- bool is_list() const;
- bool is_enum() const;
- bool is_decimal() const;
- bool is_date() const;
- bool is_time() const;
- bool is_timestamp() const;
- bool is_interval() const;
- bool is_int() const;
- bool is_null() const;
- bool is_JSON() const;
- bool is_BSON() const;
- bool is_UUID() const;
- bool is_none() const;
- /// \brief Return true if this logical type is of a known type.
- bool is_valid() const;
- bool is_invalid() const;
- /// \brief Return true if this logical type is suitable for a schema GroupNode.
- bool is_nested() const;
- bool is_nonnested() const;
- /// \brief Return true if this logical type is included in the Thrift output for its
- /// node.
- bool is_serialized() const;
-
- LogicalType(const LogicalType&) = delete;
- LogicalType& operator=(const LogicalType&) = delete;
- virtual ~LogicalType() noexcept;
-
- protected:
- LogicalType();
-
- class Impl;
- std::unique_ptr<const Impl> impl_;
-};
-
-/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
-class PARQUET_EXPORT StringLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- StringLogicalType() = default;
-};
-
-/// \brief Allowed for group nodes only.
-class PARQUET_EXPORT MapLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- MapLogicalType() = default;
-};
-
-/// \brief Allowed for group nodes only.
-class PARQUET_EXPORT ListLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- ListLogicalType() = default;
-};
-
-/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
-class PARQUET_EXPORT EnumLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- EnumLogicalType() = default;
-};
-
-/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY,
-/// depending on the precision.
-class PARQUET_EXPORT DecimalLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make(int32_t precision, int32_t scale = 0);
- int32_t precision() const;
- int32_t scale() const;
-
- private:
- DecimalLogicalType() = default;
-};
-
-/// \brief Allowed for physical type INT32.
-class PARQUET_EXPORT DateLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- DateLogicalType() = default;
-};
-
-/// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS).
-class PARQUET_EXPORT TimeLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
- LogicalType::TimeUnit::unit time_unit);
- bool is_adjusted_to_utc() const;
- LogicalType::TimeUnit::unit time_unit() const;
-
- private:
- TimeLogicalType() = default;
-};
-
-/// \brief Allowed for physical type INT64.
-class PARQUET_EXPORT TimestampLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
- LogicalType::TimeUnit::unit time_unit,
- bool is_from_converted_type = false,
- bool force_set_converted_type = false);
- bool is_adjusted_to_utc() const;
- LogicalType::TimeUnit::unit time_unit() const;
-
- /// \brief If true, will not set LogicalType in Thrift metadata
- bool is_from_converted_type() const;
-
- /// \brief If true, will set ConvertedType for micros and millis
- /// resolution in legacy ConvertedType Thrift metadata
- bool force_set_converted_type() const;
-
- private:
- TimestampLogicalType() = default;
-};
-
-/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12
-class PARQUET_EXPORT IntervalLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- IntervalLogicalType() = default;
-};
-
-/// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64
-/// (for bit width 64).
-class PARQUET_EXPORT IntLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make(int bit_width, bool is_signed);
- int bit_width() const;
- bool is_signed() const;
-
- private:
- IntLogicalType() = default;
-};
-
-/// \brief Allowed for any physical type.
-class PARQUET_EXPORT NullLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- NullLogicalType() = default;
-};
-
-/// \brief Allowed for physical type BYTE_ARRAY.
-class PARQUET_EXPORT JSONLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- JSONLogicalType() = default;
-};
-
-/// \brief Allowed for physical type BYTE_ARRAY.
-class PARQUET_EXPORT BSONLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- BSONLogicalType() = default;
-};
-
-/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16,
-/// must encode raw UUID bytes.
-class PARQUET_EXPORT UUIDLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- UUIDLogicalType() = default;
-};
-
-/// \brief Allowed for any physical type.
-class PARQUET_EXPORT NoLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- NoLogicalType() = default;
-};
-
-// Internal API, for unrecognized logical types
-class PARQUET_EXPORT UndefinedLogicalType : public LogicalType {
- public:
- static std::shared_ptr<const LogicalType> Make();
-
- private:
- UndefinedLogicalType() = default;
-};
-
-// Data encodings. Mirrors parquet::Encoding
-struct Encoding {
- enum type {
- PLAIN = 0,
- PLAIN_DICTIONARY = 2,
- RLE = 3,
- BIT_PACKED = 4,
- DELTA_BINARY_PACKED = 5,
- DELTA_LENGTH_BYTE_ARRAY = 6,
- DELTA_BYTE_ARRAY = 7,
- RLE_DICTIONARY = 8,
- BYTE_STREAM_SPLIT = 9,
- // Should always be last element (except UNKNOWN)
- UNDEFINED = 10,
- UNKNOWN = 999
- };
-};
-
-// Exposed data encodings. These describe the encoding of the data as it is
-// read from the file, rather than the encoding of the data stored in the
-// file. E.g., data encoded as RLE_DICTIONARY in the file can be read as
-// dictionary indices by RLE decoding, in which case the data read from the
-// file is DICTIONARY encoded.
-enum class ExposedEncoding {
- NO_ENCODING = 0, // data is not encoded, i.e. already decoded during reading
- DICTIONARY = 1
-};
-
-/// \brief Return true if Parquet supports indicated compression type
-PARQUET_EXPORT
-bool IsCodecSupported(Compression::type codec);
-
-PARQUET_EXPORT
-std::unique_ptr<Codec> GetCodec(Compression::type codec);
-
-PARQUET_EXPORT
-std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level);
-
-struct ParquetCipher {
- enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 };
-};
-
-struct AadMetadata {
- std::string aad_prefix;
- std::string aad_file_unique;
- bool supply_aad_prefix;
-};
-
-struct EncryptionAlgorithm {
- ParquetCipher::type algorithm;
- AadMetadata aad;
-};
-
-// Mirrors parquet::PageType
-struct PageType {
- enum type {
- DATA_PAGE,
- INDEX_PAGE,
- DICTIONARY_PAGE,
- DATA_PAGE_V2,
- // Should always be last element
- UNDEFINED
- };
-};
-
-class ColumnOrder {
- public:
- enum type { UNDEFINED, TYPE_DEFINED_ORDER };
- explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}
- // Default to Type Defined Order
- ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}
- ColumnOrder::type get_order() { return column_order_; }
-
- static ColumnOrder undefined_;
- static ColumnOrder type_defined_;
-
- private:
- ColumnOrder::type column_order_;
-};
-
-// ----------------------------------------------------------------------
-
-struct ByteArray {
- ByteArray() : len(0), ptr(NULLPTR) {}
- ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {}
-
- ByteArray(::arrow::util::string_view view) // NOLINT implicit conversion
- : ByteArray(static_cast<uint32_t>(view.size()),
- reinterpret_cast<const uint8_t*>(view.data())) {}
- uint32_t len;
- const uint8_t* ptr;
-};
-
-inline bool operator==(const ByteArray& left, const ByteArray& right) {
- return left.len == right.len &&
- (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0);
-}
-
-inline bool operator!=(const ByteArray& left, const ByteArray& right) {
- return !(left == right);
-}
-
-struct FixedLenByteArray {
- FixedLenByteArray() : ptr(NULLPTR) {}
- explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {}
- const uint8_t* ptr;
-};
-
-using FLBA = FixedLenByteArray;
-
-// Julian day at unix epoch.
-//
-// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
-// the Julian day count starting from noon Universal time, with Julian day
-// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
-// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian
-// calendar).
-constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588);
-constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24);
-constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000);
-constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000);
-constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000);
-
-MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; };
-STRUCT_END(Int96, 12);
-
-inline bool operator==(const Int96& left, const Int96& right) {
- return std::equal(left.value, left.value + 3, right.value);
-}
-
-inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); }
-
-static inline std::string ByteArrayToString(const ByteArray& a) {
- return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
-}
-
-static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) {
- std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds));
-}
-
-struct DecodedInt96 {
- uint64_t days_since_epoch;
- uint64_t nanoseconds;
-};
-
-static inline DecodedInt96 DecodeInt96Timestamp(const parquet::Int96& i96) {
-  // We do the computations in the unsigned domain to avoid undefined behaviour
- // on overflow.
- DecodedInt96 result;
- result.days_since_epoch = i96.value[2] - static_cast<uint64_t>(kJulianToUnixEpochDays);
- result.nanoseconds = 0;
-
- memcpy(&result.nanoseconds, &i96.value, sizeof(uint64_t));
- return result;
-}
-
-static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) {
- const auto decoded = DecodeInt96Timestamp(i96);
- return static_cast<int64_t>(decoded.days_since_epoch * kNanosecondsPerDay +
- decoded.nanoseconds);
-}
-
-static inline int64_t Int96GetMicroSeconds(const parquet::Int96& i96) {
- const auto decoded = DecodeInt96Timestamp(i96);
- uint64_t microseconds = decoded.nanoseconds / static_cast<uint64_t>(1000);
- return static_cast<int64_t>(decoded.days_since_epoch * kMicrosecondsPerDay +
- microseconds);
-}
-
-static inline int64_t Int96GetMilliSeconds(const parquet::Int96& i96) {
- const auto decoded = DecodeInt96Timestamp(i96);
- uint64_t milliseconds = decoded.nanoseconds / static_cast<uint64_t>(1000000);
- return static_cast<int64_t>(decoded.days_since_epoch * kMillisecondsPerDay +
- milliseconds);
-}
-
-static inline int64_t Int96GetSeconds(const parquet::Int96& i96) {
- const auto decoded = DecodeInt96Timestamp(i96);
- uint64_t seconds = decoded.nanoseconds / static_cast<uint64_t>(1000000000);
- return static_cast<int64_t>(decoded.days_since_epoch * kSecondsPerDay + seconds);
-}
-
-static inline std::string Int96ToString(const Int96& a) {
- std::ostringstream result;
- std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " "));
- return result.str();
-}
-
-static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
- std::ostringstream result;
- std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " "));
- return result.str();
-}
-
-template <Type::type TYPE>
-struct type_traits {};
-
-template <>
-struct type_traits<Type::BOOLEAN> {
- using value_type = bool;
-
- static constexpr int value_byte_size = 1;
- static constexpr const char* printf_code = "d";
-};
-
-template <>
-struct type_traits<Type::INT32> {
- using value_type = int32_t;
-
- static constexpr int value_byte_size = 4;
- static constexpr const char* printf_code = "d";
-};
-
-template <>
-struct type_traits<Type::INT64> {
- using value_type = int64_t;
-
- static constexpr int value_byte_size = 8;
- static constexpr const char* printf_code = "ld";
-};
-
-template <>
-struct type_traits<Type::INT96> {
- using value_type = Int96;
-
- static constexpr int value_byte_size = 12;
- static constexpr const char* printf_code = "s";
-};
-
-template <>
-struct type_traits<Type::FLOAT> {
- using value_type = float;
-
- static constexpr int value_byte_size = 4;
- static constexpr const char* printf_code = "f";
-};
-
-template <>
-struct type_traits<Type::DOUBLE> {
- using value_type = double;
-
- static constexpr int value_byte_size = 8;
- static constexpr const char* printf_code = "lf";
-};
-
-template <>
-struct type_traits<Type::BYTE_ARRAY> {
- using value_type = ByteArray;
-
- static constexpr int value_byte_size = sizeof(ByteArray);
- static constexpr const char* printf_code = "s";
-};
-
-template <>
-struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
- using value_type = FixedLenByteArray;
-
- static constexpr int value_byte_size = sizeof(FixedLenByteArray);
- static constexpr const char* printf_code = "s";
-};
-
-template <Type::type TYPE>
-struct PhysicalType {
- using c_type = typename type_traits<TYPE>::value_type;
- static constexpr Type::type type_num = TYPE;
-};
-
-using BooleanType = PhysicalType<Type::BOOLEAN>;
-using Int32Type = PhysicalType<Type::INT32>;
-using Int64Type = PhysicalType<Type::INT64>;
-using Int96Type = PhysicalType<Type::INT96>;
-using FloatType = PhysicalType<Type::FLOAT>;
-using DoubleType = PhysicalType<Type::DOUBLE>;
-using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
-using FLBAType = PhysicalType<Type::FIXED_LEN_BYTE_ARRAY>;
-
-template <typename Type>
-inline std::string format_fwf(int width) {
- std::stringstream ss;
- ss << "%-" << width << type_traits<Type::type_num>::printf_code;
- return ss.str();
-}
-
-PARQUET_EXPORT std::string EncodingToString(Encoding::type t);
-
-PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t);
-
-PARQUET_EXPORT std::string TypeToString(Type::type t);
-
-PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
- ::arrow::util::string_view val);
-
-PARQUET_EXPORT int GetTypeByteSize(Type::type t);
-
-PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive);
-
-PARQUET_EXPORT SortOrder::type GetSortOrder(ConvertedType::type converted,
- Type::type primitive);
-
-PARQUET_EXPORT SortOrder::type GetSortOrder(
- const std::shared_ptr<const LogicalType>& logical_type, Type::type primitive);
-
-} // namespace parquet
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "arrow/util/string_view.h"
+
+#include "parquet/platform.h"
+#include "parquet/type_fwd.h"
+
+#ifdef _WIN32
+
+// Repetition::OPTIONAL conflicts with a #define, so we undefine it
+#ifdef OPTIONAL
+#undef OPTIONAL
+#endif
+
+#endif // _WIN32
+
+namespace arrow {
+namespace util {
+
+class Codec;
+
+} // namespace util
+} // namespace arrow
+
+namespace parquet {
+
+// ----------------------------------------------------------------------
+// Metadata enums to match Thrift metadata
+//
+// The reason we maintain our own enums is to avoid a transitive dependency on
+// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
+// public API. After building parquet-cpp, you should not need to include
+// Thrift headers in your application. This means some boilerplate to convert
+// between our types and Parquet's Thrift types.
+//
+// We can also add special values like NONE to distinguish between metadata
+// values being set and not set. As an example, consider ConvertedType and
+// CompressionCodec.
+
+// Mirrors parquet::Type
+struct Type {
+ enum type {
+ BOOLEAN = 0,
+ INT32 = 1,
+ INT64 = 2,
+ INT96 = 3,
+ FLOAT = 4,
+ DOUBLE = 5,
+ BYTE_ARRAY = 6,
+ FIXED_LEN_BYTE_ARRAY = 7,
+ // Should always be last element.
+ UNDEFINED = 8
+ };
+};
+
+// Mirrors parquet::ConvertedType
+struct ConvertedType {
+ enum type {
+ NONE, // Not a real converted type, but means no converted type is specified
+ UTF8,
+ MAP,
+ MAP_KEY_VALUE,
+ LIST,
+ ENUM,
+ DECIMAL,
+ DATE,
+ TIME_MILLIS,
+ TIME_MICROS,
+ TIMESTAMP_MILLIS,
+ TIMESTAMP_MICROS,
+ UINT_8,
+ UINT_16,
+ UINT_32,
+ UINT_64,
+ INT_8,
+ INT_16,
+ INT_32,
+ INT_64,
+ JSON,
+ BSON,
+ INTERVAL,
+ // DEPRECATED INVALID ConvertedType for all-null data.
+ // Only useful for reading legacy files written out by interim Parquet C++ releases.
+ // For writing, always emit LogicalType::Null instead.
+ // See PARQUET-1990.
+ NA = 25,
+ UNDEFINED = 26 // Not a real converted type; should always be last element
+ };
+};
+
+// forward declaration
+namespace format {
+
+class LogicalType;
+
+}
+
+// Mirrors parquet::FieldRepetitionType
+struct Repetition {
+ enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2, /*Always last*/ UNDEFINED = 3 };
+};
+
+// Reference:
+// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
+// format/converter/ParquetMetadataConverter.java
+// Sort order for page and column statistics. Types are associated with sort
+// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are
+// aggregated using a sort order. As of parquet-format version 2.3.1, the
+// order used to aggregate stats is always SIGNED and is not stored in the
+// Parquet file. These stats are discarded for types that need unsigned.
+// See PARQUET-686.
+struct SortOrder {
+ enum type { SIGNED, UNSIGNED, UNKNOWN };
+};
+
+namespace schema {
+
+struct DecimalMetadata {
+ bool isset;
+ int32_t scale;
+ int32_t precision;
+};
+
+} // namespace schema
+
+/// \brief Implementation of parquet.thrift LogicalType types.
+class PARQUET_EXPORT LogicalType {
+ public:
+ struct Type {
+ enum type {
+ UNDEFINED = 0, // Not a real logical type
+ STRING = 1,
+ MAP,
+ LIST,
+ ENUM,
+ DECIMAL,
+ DATE,
+ TIME,
+ TIMESTAMP,
+ INTERVAL,
+ INT,
+ NIL, // Thrift NullType: annotates data that is always null
+ JSON,
+ BSON,
+ UUID,
+ NONE // Not a real logical type; should always be last element
+ };
+ };
+
+ struct TimeUnit {
+ enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS };
+ };
+
+ /// \brief If possible, return a logical type equivalent to the given legacy
+ /// converted type (and decimal metadata if applicable).
+ static std::shared_ptr<const LogicalType> FromConvertedType(
+ const parquet::ConvertedType::type converted_type,
+ const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1,
+ -1});
+
+ /// \brief Return the logical type represented by the Thrift intermediary object.
+ static std::shared_ptr<const LogicalType> FromThrift(
+ const parquet::format::LogicalType& thrift_logical_type);
+
+ /// \brief Return the explicitly requested logical type.
+ static std::shared_ptr<const LogicalType> String();
+ static std::shared_ptr<const LogicalType> Map();
+ static std::shared_ptr<const LogicalType> List();
+ static std::shared_ptr<const LogicalType> Enum();
+ static std::shared_ptr<const LogicalType> Decimal(int32_t precision, int32_t scale = 0);
+ static std::shared_ptr<const LogicalType> Date();
+ static std::shared_ptr<const LogicalType> Time(bool is_adjusted_to_utc,
+ LogicalType::TimeUnit::unit time_unit);
+
+ /// \brief Create a Timestamp logical type
+ /// \param[in] is_adjusted_to_utc set true if the data is UTC-normalized
+ /// \param[in] time_unit the resolution of the timestamp
+ /// \param[in] is_from_converted_type if true, the timestamp was generated
+ /// by translating a legacy converted type of TIMESTAMP_MILLIS or
+ /// TIMESTAMP_MICROS. Default is false.
+ /// \param[in] force_set_converted_type if true, always set the
+ /// legacy ConvertedType TIMESTAMP_MICROS and TIMESTAMP_MILLIS
+ /// metadata. Default is false
+ static std::shared_ptr<const LogicalType> Timestamp(
+ bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
+ bool is_from_converted_type = false, bool force_set_converted_type = false);
+
+ static std::shared_ptr<const LogicalType> Interval();
+ static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);
+
+ /// \brief Create a logical type for data that's always null
+ ///
+ /// Any physical type can be annotated with this logical type.
+ static std::shared_ptr<const LogicalType> Null();
+
+ static std::shared_ptr<const LogicalType> JSON();
+ static std::shared_ptr<const LogicalType> BSON();
+ static std::shared_ptr<const LogicalType> UUID();
+
+ /// \brief Create a placeholder for when no logical type is specified
+ static std::shared_ptr<const LogicalType> None();
+
+ /// \brief Return true if this logical type is consistent with the given underlying
+ /// physical type.
+ bool is_applicable(parquet::Type::type primitive_type,
+ int32_t primitive_length = -1) const;
+
+ /// \brief Return true if this logical type is equivalent to the given legacy converted
+ /// type (and decimal metadata if applicable).
+ bool is_compatible(parquet::ConvertedType::type converted_type,
+ parquet::schema::DecimalMetadata converted_decimal_metadata = {
+ false, -1, -1}) const;
+
+ /// \brief If possible, return the legacy converted type (and decimal metadata if
+ /// applicable) equivalent to this logical type.
+ parquet::ConvertedType::type ToConvertedType(
+ parquet::schema::DecimalMetadata* out_decimal_metadata) const;
+
+ /// \brief Return a printable representation of this logical type.
+ std::string ToString() const;
+
+ /// \brief Return a JSON representation of this logical type.
+ std::string ToJSON() const;
+
+ /// \brief Return a serializable Thrift object for this logical type.
+ parquet::format::LogicalType ToThrift() const;
+
+ /// \brief Return true if the given logical type is equivalent to this logical type.
+ bool Equals(const LogicalType& other) const;
+
+ /// \brief Return the enumerated type of this logical type.
+ LogicalType::Type::type type() const;
+
+ /// \brief Return the appropriate sort order for this logical type.
+ SortOrder::type sort_order() const;
+
+ // Type checks ...
+ bool is_string() const;
+ bool is_map() const;
+ bool is_list() const;
+ bool is_enum() const;
+ bool is_decimal() const;
+ bool is_date() const;
+ bool is_time() const;
+ bool is_timestamp() const;
+ bool is_interval() const;
+ bool is_int() const;
+ bool is_null() const;
+ bool is_JSON() const;
+ bool is_BSON() const;
+ bool is_UUID() const;
+ bool is_none() const;
+ /// \brief Return true if this logical type is of a known type.
+ bool is_valid() const;
+ bool is_invalid() const;
+ /// \brief Return true if this logical type is suitable for a schema GroupNode.
+ bool is_nested() const;
+ bool is_nonnested() const;
+ /// \brief Return true if this logical type is included in the Thrift output for its
+ /// node.
+ bool is_serialized() const;
+
+ LogicalType(const LogicalType&) = delete;
+ LogicalType& operator=(const LogicalType&) = delete;
+ virtual ~LogicalType() noexcept;
+
+ protected:
+ LogicalType();
+
+ class Impl;
+ std::unique_ptr<const Impl> impl_;
+};
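+
+// Example (illustrative): the logical-type API alongside the legacy converted
+// types declared above.
+//
+//   std::shared_ptr<const LogicalType> lt =
+//       LogicalType::Timestamp(/*is_adjusted_to_utc=*/true,
+//                              LogicalType::TimeUnit::MILLIS);
+//   // lt->is_timestamp() is true, and
+//   // lt->is_compatible(ConvertedType::TIMESTAMP_MILLIS) is true, so legacy
+//   // readers observe the equivalent ConvertedType annotation.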
+
+/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
+class PARQUET_EXPORT StringLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ StringLogicalType() = default;
+};
+
+/// \brief Allowed for group nodes only.
+class PARQUET_EXPORT MapLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ MapLogicalType() = default;
+};
+
+/// \brief Allowed for group nodes only.
+class PARQUET_EXPORT ListLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ ListLogicalType() = default;
+};
+
+/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
+class PARQUET_EXPORT EnumLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ EnumLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY,
+/// depending on the precision.
+class PARQUET_EXPORT DecimalLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(int32_t precision, int32_t scale = 0);
+ int32_t precision() const;
+ int32_t scale() const;
+
+ private:
+ DecimalLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32.
+class PARQUET_EXPORT DateLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ DateLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS).
+class PARQUET_EXPORT TimeLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
+ LogicalType::TimeUnit::unit time_unit);
+ bool is_adjusted_to_utc() const;
+ LogicalType::TimeUnit::unit time_unit() const;
+
+ private:
+ TimeLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT64.
+class PARQUET_EXPORT TimestampLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
+ LogicalType::TimeUnit::unit time_unit,
+ bool is_from_converted_type = false,
+ bool force_set_converted_type = false);
+ bool is_adjusted_to_utc() const;
+ LogicalType::TimeUnit::unit time_unit() const;
+
+ /// \brief If true, will not set LogicalType in Thrift metadata
+ bool is_from_converted_type() const;
+
+ /// \brief If true, will set ConvertedType for micros and millis
+ /// resolution in legacy ConvertedType Thrift metadata
+ bool force_set_converted_type() const;
+
+ private:
+ TimestampLogicalType() = default;
+};
+
+/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12
+class PARQUET_EXPORT IntervalLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ IntervalLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64
+/// (for bit width 64).
+class PARQUET_EXPORT IntLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make(int bit_width, bool is_signed);
+ int bit_width() const;
+ bool is_signed() const;
+
+ private:
+ IntLogicalType() = default;
+};
+
+/// \brief Allowed for any physical type.
+class PARQUET_EXPORT NullLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ NullLogicalType() = default;
+};
+
+/// \brief Allowed for physical type BYTE_ARRAY.
+class PARQUET_EXPORT JSONLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ JSONLogicalType() = default;
+};
+
+/// \brief Allowed for physical type BYTE_ARRAY.
+class PARQUET_EXPORT BSONLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ BSONLogicalType() = default;
+};
+
+/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16,
+/// must encode raw UUID bytes.
+class PARQUET_EXPORT UUIDLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ UUIDLogicalType() = default;
+};
+
+/// \brief Allowed for any physical type.
+class PARQUET_EXPORT NoLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ NoLogicalType() = default;
+};
+
+// Internal API, for unrecognized logical types
+class PARQUET_EXPORT UndefinedLogicalType : public LogicalType {
+ public:
+ static std::shared_ptr<const LogicalType> Make();
+
+ private:
+ UndefinedLogicalType() = default;
+};
+
+// Data encodings. Mirrors parquet::Encoding
+struct Encoding {
+ enum type {
+ PLAIN = 0,
+ PLAIN_DICTIONARY = 2,
+ RLE = 3,
+ BIT_PACKED = 4,
+ DELTA_BINARY_PACKED = 5,
+ DELTA_LENGTH_BYTE_ARRAY = 6,
+ DELTA_BYTE_ARRAY = 7,
+ RLE_DICTIONARY = 8,
+ BYTE_STREAM_SPLIT = 9,
+ // Should always be last element (except UNKNOWN)
+ UNDEFINED = 10,
+ UNKNOWN = 999
+ };
+};
+
+// Exposed data encodings. These describe the encoding of the data as it is
+// read from the file, rather than the encoding of the data stored in the
+// file. E.g., data encoded as RLE_DICTIONARY in the file can be read as
+// dictionary indices by RLE decoding, in which case the data read from the
+// file is DICTIONARY encoded.
+enum class ExposedEncoding {
+ NO_ENCODING = 0, // data is not encoded, i.e. already decoded during reading
+ DICTIONARY = 1
+};
+
+/// \brief Return true if Parquet supports indicated compression type
+PARQUET_EXPORT
+bool IsCodecSupported(Compression::type codec);
+
+PARQUET_EXPORT
+std::unique_ptr<Codec> GetCodec(Compression::type codec);
+
+PARQUET_EXPORT
+std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level);
+
+struct ParquetCipher {
+ enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 };
+};
+
+struct AadMetadata {
+ std::string aad_prefix;
+ std::string aad_file_unique;
+ bool supply_aad_prefix;
+};
+
+struct EncryptionAlgorithm {
+ ParquetCipher::type algorithm;
+ AadMetadata aad;
+};
+
+// Mirrors parquet::PageType
+struct PageType {
+ enum type {
+ DATA_PAGE,
+ INDEX_PAGE,
+ DICTIONARY_PAGE,
+ DATA_PAGE_V2,
+ // Should always be last element
+ UNDEFINED
+ };
+};
+
+class ColumnOrder {
+ public:
+ enum type { UNDEFINED, TYPE_DEFINED_ORDER };
+ explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}
+ // Default to Type Defined Order
+ ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}
+ ColumnOrder::type get_order() { return column_order_; }
+
+ static ColumnOrder undefined_;
+ static ColumnOrder type_defined_;
+
+ private:
+ ColumnOrder::type column_order_;
+};
+
+// ----------------------------------------------------------------------
+
+struct ByteArray {
+ ByteArray() : len(0), ptr(NULLPTR) {}
+ ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {}
+
+ ByteArray(::arrow::util::string_view view) // NOLINT implicit conversion
+ : ByteArray(static_cast<uint32_t>(view.size()),
+ reinterpret_cast<const uint8_t*>(view.data())) {}
+ uint32_t len;
+ const uint8_t* ptr;
+};
+
+inline bool operator==(const ByteArray& left, const ByteArray& right) {
+ return left.len == right.len &&
+ (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0);
+}
+
+inline bool operator!=(const ByteArray& left, const ByteArray& right) {
+ return !(left == right);
+}
+
+struct FixedLenByteArray {
+ FixedLenByteArray() : ptr(NULLPTR) {}
+ explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {}
+ const uint8_t* ptr;
+};
+
+using FLBA = FixedLenByteArray;
+
+// Julian day at unix epoch.
+//
+// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
+// the Julian day count starting from noon Universal time, with Julian day
+// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
+// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian
+// calendar).
+constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588);
+constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24);
+constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000);
+constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000);
+constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000);
+
+MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; };
+STRUCT_END(Int96, 12);
+
+inline bool operator==(const Int96& left, const Int96& right) {
+ return std::equal(left.value, left.value + 3, right.value);
+}
+
+inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); }
+
+static inline std::string ByteArrayToString(const ByteArray& a) {
+ return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
+}
+
+static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) {
+ std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds));
+}
+
+struct DecodedInt96 {
+ uint64_t days_since_epoch;
+ uint64_t nanoseconds;
+};
+
+static inline DecodedInt96 DecodeInt96Timestamp(const parquet::Int96& i96) {
+  // We do the computations in the unsigned domain to avoid undefined behaviour
+ // on overflow.
+ DecodedInt96 result;
+ result.days_since_epoch = i96.value[2] - static_cast<uint64_t>(kJulianToUnixEpochDays);
+ result.nanoseconds = 0;
+
+ memcpy(&result.nanoseconds, &i96.value, sizeof(uint64_t));
+ return result;
+}
+
+static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) {
+ const auto decoded = DecodeInt96Timestamp(i96);
+ return static_cast<int64_t>(decoded.days_since_epoch * kNanosecondsPerDay +
+ decoded.nanoseconds);
+}
+
+static inline int64_t Int96GetMicroSeconds(const parquet::Int96& i96) {
+ const auto decoded = DecodeInt96Timestamp(i96);
+ uint64_t microseconds = decoded.nanoseconds / static_cast<uint64_t>(1000);
+ return static_cast<int64_t>(decoded.days_since_epoch * kMicrosecondsPerDay +
+ microseconds);
+}
+
+static inline int64_t Int96GetMilliSeconds(const parquet::Int96& i96) {
+ const auto decoded = DecodeInt96Timestamp(i96);
+ uint64_t milliseconds = decoded.nanoseconds / static_cast<uint64_t>(1000000);
+ return static_cast<int64_t>(decoded.days_since_epoch * kMillisecondsPerDay +
+ milliseconds);
+}
+
+static inline int64_t Int96GetSeconds(const parquet::Int96& i96) {
+ const auto decoded = DecodeInt96Timestamp(i96);
+ uint64_t seconds = decoded.nanoseconds / static_cast<uint64_t>(1000000000);
+ return static_cast<int64_t>(decoded.days_since_epoch * kSecondsPerDay + seconds);
+}
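+
+// Worked example (illustrative): value[2] == 2440589 is one Julian day after
+// the Unix epoch (2440589 - kJulianToUnixEpochDays == 1), and value[0..1]
+// read as a little-endian uint64_t equal to 1000 places the timestamp 1000
+// nanoseconds into that day, so:
+//   Int96GetNanoSeconds(i96)  == 1 * kNanosecondsPerDay + 1000
+//   Int96GetMicroSeconds(i96) == 1 * kMicrosecondsPerDay + 1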
+
+static inline std::string Int96ToString(const Int96& a) {
+ std::ostringstream result;
+ std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " "));
+ return result.str();
+}
+
+static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
+ std::ostringstream result;
+ std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " "));
+ return result.str();
+}
+
+template <Type::type TYPE>
+struct type_traits {};
+
+template <>
+struct type_traits<Type::BOOLEAN> {
+ using value_type = bool;
+
+ static constexpr int value_byte_size = 1;
+ static constexpr const char* printf_code = "d";
+};
+
+template <>
+struct type_traits<Type::INT32> {
+ using value_type = int32_t;
+
+ static constexpr int value_byte_size = 4;
+ static constexpr const char* printf_code = "d";
+};
+
+template <>
+struct type_traits<Type::INT64> {
+ using value_type = int64_t;
+
+ static constexpr int value_byte_size = 8;
+ static constexpr const char* printf_code = "ld";
+};
+
+template <>
+struct type_traits<Type::INT96> {
+ using value_type = Int96;
+
+ static constexpr int value_byte_size = 12;
+ static constexpr const char* printf_code = "s";
+};
+
+template <>
+struct type_traits<Type::FLOAT> {
+ using value_type = float;
+
+ static constexpr int value_byte_size = 4;
+ static constexpr const char* printf_code = "f";
+};
+
+template <>
+struct type_traits<Type::DOUBLE> {
+ using value_type = double;
+
+ static constexpr int value_byte_size = 8;
+ static constexpr const char* printf_code = "lf";
+};
+
+template <>
+struct type_traits<Type::BYTE_ARRAY> {
+ using value_type = ByteArray;
+
+ static constexpr int value_byte_size = sizeof(ByteArray);
+ static constexpr const char* printf_code = "s";
+};
+
+template <>
+struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
+ using value_type = FixedLenByteArray;
+
+ static constexpr int value_byte_size = sizeof(FixedLenByteArray);
+ static constexpr const char* printf_code = "s";
+};
+
+template <Type::type TYPE>
+struct PhysicalType {
+ using c_type = typename type_traits<TYPE>::value_type;
+ static constexpr Type::type type_num = TYPE;
+};
+
+using BooleanType = PhysicalType<Type::BOOLEAN>;
+using Int32Type = PhysicalType<Type::INT32>;
+using Int64Type = PhysicalType<Type::INT64>;
+using Int96Type = PhysicalType<Type::INT96>;
+using FloatType = PhysicalType<Type::FLOAT>;
+using DoubleType = PhysicalType<Type::DOUBLE>;
+using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
+using FLBAType = PhysicalType<Type::FIXED_LEN_BYTE_ARRAY>;
+
+template <typename Type>
+inline std::string format_fwf(int width) {
+ std::stringstream ss;
+ ss << "%-" << width << type_traits<Type::type_num>::printf_code;
+ return ss.str();
+}
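+
+// Example (illustrative): format_fwf<Int64Type>(10) returns "%-10ld", a
+// printf-style format for a left-justified, ten-character-wide int64 field.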
+
+PARQUET_EXPORT std::string EncodingToString(Encoding::type t);
+
+PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t);
+
+PARQUET_EXPORT std::string TypeToString(Type::type t);
+
+PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type,
+ ::arrow::util::string_view val);
+
+PARQUET_EXPORT int GetTypeByteSize(Type::type t);
+
+PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive);
+
+PARQUET_EXPORT SortOrder::type GetSortOrder(ConvertedType::type converted,
+ Type::type primitive);
+
+PARQUET_EXPORT SortOrder::type GetSortOrder(
+ const std::shared_ptr<const LogicalType>& logical_type, Type::type primitive);
+
+} // namespace parquet
diff --git a/contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h b/contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h
index 6e5b6b330e6..31ca04c8b66 100644
--- a/contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h
+++ b/contrib/libs/apache/arrow/cpp/src/parquet/windows_compatibility.h
@@ -1,30 +1,30 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "arrow/util/windows_compatibility.h"
-
-#ifdef _WIN32
-
-// parquet.thrift's OPTIONAL RepetitionType conflicts with a #define from
-// above, so we undefine it
-#ifdef OPTIONAL
-#undef OPTIONAL
-#endif
-
-#endif
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/windows_compatibility.h"
+
+#ifdef _WIN32
+
+// parquet.thrift's OPTIONAL RepetitionType conflicts with a #define from
+// above, so we undefine it
+#ifdef OPTIONAL
+#undef OPTIONAL
+#endif
+
+#endif